aboutgitcodebugslistschat
diff options
context:
space:
mode:
-rw-r--r--Makefile5
-rw-r--r--tcp.c30
-rw-r--r--tcp_info.h120
-rw-r--r--tcp_internal.h4
4 files changed, 132 insertions, 27 deletions
diff --git a/Makefile b/Makefile
index 74a9513..6faa501 100644
--- a/Makefile
+++ b/Makefile
@@ -67,11 +67,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
udp.h udp_flow.h util.h
HEADERS = $(PASST_HEADERS) seccomp.h
-C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
-ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
- FLAGS += -DHAS_SND_WND
-endif
-
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 };
ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
FLAGS += -DHAS_BYTES_ACKED
diff --git a/tcp.c b/tcp.c
index 0d22e07..2a0b272 100644
--- a/tcp.c
+++ b/tcp.c
@@ -274,6 +274,7 @@
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
+#include <netinet/tcp.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
@@ -286,8 +287,6 @@
#include <time.h>
#include <arpa/inet.h>
-#include <linux/tcp.h> /* For struct tcp_info */
-
#include "checksum.h"
#include "util.h"
#include "iov.h"
@@ -303,6 +302,7 @@
#include "flow_table.h"
#include "tcp_internal.h"
+#include "tcp_info.h"
#include "tcp_buf.h"
/* MSS rounding: see SET_MSS() */
@@ -318,11 +318,6 @@
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
-/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
- * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
- */
-#define SOL_TCP IPPROTO_TCP
-
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define CONN_IS_CLOSING(conn) \
@@ -365,14 +360,11 @@ char tcp_buf_discard [MAX_WINDOW];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
-#ifdef HAS_SND_WND
+
/* Does the kernel report sending window in TCP_INFO (kernel commit
* 8f7baad7f035)
*/
bool snd_wnd_cap;
-#else
-#define snd_wnd_cap (false)
-#endif
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@@ -678,7 +670,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
* @tinfo: Pointer to struct tcp_info for socket
*/
static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
- const struct tcp_info *tinfo)
+ const struct tcp_info_linux *tinfo)
{
#ifdef HAS_MIN_RTT
const struct flowside *tapside = TAPFLOW(conn);
@@ -1114,13 +1106,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
* Return: 1 if sequence or window were updated, 0 otherwise
*/
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
- bool force_seq, struct tcp_info *tinfo)
+ bool force_seq, struct tcp_info_linux *tinfo)
{
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
/* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */
socklen_t sl = sizeof(*tinfo);
- struct tcp_info tinfo_new;
+ struct tcp_info_linux tinfo_new;
uint32_t new_wnd_to_tap = prev_wnd_to_tap;
int s = conn->sock;
@@ -1235,7 +1227,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
size_t *optlen)
{
- struct tcp_info tinfo = { 0 };
+ struct tcp_info_linux tinfo = { 0 };
socklen_t sl = sizeof(tinfo);
int s = conn->sock;
@@ -2578,7 +2570,6 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
return ret;
}
-#ifdef HAS_SND_WND
/**
* tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd
*
@@ -2586,7 +2577,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
*/
static bool tcp_probe_snd_wnd_cap(void)
{
- struct tcp_info tinfo;
+ struct tcp_info_linux tinfo;
socklen_t sl = sizeof(tinfo);
int s;
@@ -2604,13 +2595,12 @@ static bool tcp_probe_snd_wnd_cap(void)
close(s);
- if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) +
+ if (sl < (offsetof(struct tcp_info_linux, tcpi_snd_wnd) +
sizeof(tinfo.tcpi_snd_wnd)))
return false;
return true;
}
-#endif /* HAS_SND_WND */
/**
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
@@ -2645,9 +2635,7 @@ int tcp_init(struct ctx *c)
(!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
-#ifdef HAS_SND_WND
snd_wnd_cap = tcp_probe_snd_wnd_cap();
-#endif
debug("TCP_INFO tcpi_snd_wnd field%ssupported",
snd_wnd_cap ? " " : " not ");
diff --git a/tcp_info.h b/tcp_info.h
new file mode 100644
index 0000000..06ccb16
--- /dev/null
+++ b/tcp_info.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ *
+ * Largely derived from include/linux/tcp.h in the Linux kernel
+ */
+
+#ifndef TCP_INFO_H
+#define TCP_INFO_H
+
+/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt()
+ *
+ * Some fields returned by TCP_INFO have been there for ages and are shared with
+ * BSD. struct tcp_info from netinet/tcp.h has only those fields. There are
+ * also many Linux-specific extensions to the structure, which are only found
+ * in the linux/tcp.h version of struct tcp_info.
+ *
+ * We want to use some of those extension fields, when available. We can test
+ * for availability in the runtime kernel using the length returned from
+ * getsockopt(). However, we won't necessarily be compiled against the same
+ * kernel headers as we'll run with, so compiling directly against linux/tcp.h
+ * means wrapping every field access in an #ifdef whose #else does the same
+ * thing as when the field is missing at runtime. This rapidly gets messy.
+ *
+ * Instead we define here struct tcp_info_linux which includes all the Linux
+ * extensions that we want to use. This is taken from v6.11 of the kernel.
+ */
+struct tcp_info_linux {
+ uint8_t tcpi_state;
+ uint8_t tcpi_ca_state;
+ uint8_t tcpi_retransmits;
+ uint8_t tcpi_probes;
+ uint8_t tcpi_backoff;
+ uint8_t tcpi_options;
+ uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
+ uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
+
+ uint32_t tcpi_rto;
+ uint32_t tcpi_ato;
+ uint32_t tcpi_snd_mss;
+ uint32_t tcpi_rcv_mss;
+
+ uint32_t tcpi_unacked;
+ uint32_t tcpi_sacked;
+ uint32_t tcpi_lost;
+ uint32_t tcpi_retrans;
+ uint32_t tcpi_fackets;
+
+ /* Times. */
+ uint32_t tcpi_last_data_sent;
+ uint32_t tcpi_last_ack_sent;
+ uint32_t tcpi_last_data_recv;
+ uint32_t tcpi_last_ack_recv;
+
+ /* Metrics. */
+ uint32_t tcpi_pmtu;
+ uint32_t tcpi_rcv_ssthresh;
+ uint32_t tcpi_rtt;
+ uint32_t tcpi_rttvar;
+ uint32_t tcpi_snd_ssthresh;
+ uint32_t tcpi_snd_cwnd;
+ uint32_t tcpi_advmss;
+ uint32_t tcpi_reordering;
+
+ uint32_t tcpi_rcv_rtt;
+ uint32_t tcpi_rcv_space;
+
+ uint32_t tcpi_total_retrans;
+
+ /* Linux extensions */
+ uint64_t tcpi_pacing_rate;
+ uint64_t tcpi_max_pacing_rate;
+ uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+ uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
+ uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
+ uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
+
+ uint32_t tcpi_notsent_bytes;
+ uint32_t tcpi_min_rtt;
+ uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
+ uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
+
+ uint64_t tcpi_delivery_rate;
+
+ uint64_t tcpi_busy_time; /* Time (usec) busy sending data */
+ uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */
+ uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
+
+ uint32_t tcpi_delivered;
+ uint32_t tcpi_delivered_ce;
+
+ uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
+ uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */
+ uint32_t tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */
+ uint32_t tcpi_reord_seen; /* reordering events seen */
+
+ uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */
+
+ uint32_t tcpi_snd_wnd; /* peer's advertised receive window after
+ * scaling (bytes)
+ */
+ uint32_t tcpi_rcv_wnd; /* local advertised receive window after
+ * scaling (bytes)
+ */
+
+ uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */
+
+ uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including
+ * SYN/SYN-ACK and recurring timeouts.
+ */
+ uint16_t tcpi_total_rto_recoveries; /* Total number of RTO
+ * recoveries, including any
+ * unfinished recovery.
+ */
+ uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries
+ * in milliseconds, including any
+ * unfinished recovery.
+ */
+};
+
+#endif /* TCP_INFO_H */
diff --git a/tcp_internal.h b/tcp_internal.h
index 1ab8ce2..a5a47df 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -175,12 +175,14 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
tcp_rst_do(c, conn); \
} while (0)
+struct tcp_info_linux;
+
size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
struct iovec *iov, size_t dlen,
const uint16_t *check, uint32_t seq,
bool no_tcp_csum);
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
- bool force_seq, struct tcp_info *tinfo);
+ bool force_seq, struct tcp_info_linux *tinfo);
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
size_t *optlen);