diff options
-rw-r--r-- | Makefile | 5 | ||||
-rw-r--r-- | tcp.c | 30 | ||||
-rw-r--r-- | tcp_info.h | 120 | ||||
-rw-r--r-- | tcp_internal.h | 4 |
4 files changed, 132 insertions, 27 deletions
@@ -67,11 +67,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ udp.h udp_flow.h util.h HEADERS = $(PASST_HEADERS) seccomp.h -C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - FLAGS += -DHAS_SND_WND -endif - C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 }; ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) FLAGS += -DHAS_BYTES_ACKED @@ -274,6 +274,7 @@ #include <net/if.h> #include <netinet/in.h> #include <netinet/ip.h> +#include <netinet/tcp.h> #include <stdint.h> #include <stdbool.h> #include <stddef.h> @@ -286,8 +287,6 @@ #include <time.h> #include <arpa/inet.h> -#include <linux/tcp.h> /* For struct tcp_info */ - #include "checksum.h" #include "util.h" #include "iov.h" @@ -303,6 +302,7 @@ #include "flow_table.h" #include "tcp_internal.h" +#include "tcp_info.h" #include "tcp_buf.h" /* MSS rounding: see SET_MSS() */ @@ -318,11 +318,6 @@ #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ -/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of - * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP - */ -#define SOL_TCP IPPROTO_TCP - #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define CONN_IS_CLOSING(conn) \ @@ -365,14 +360,11 @@ char tcp_buf_discard [MAX_WINDOW]; /* Does the kernel support TCP_PEEK_OFF? */ bool peek_offset_cap; -#ifdef HAS_SND_WND + /* Does the kernel report sending window in TCP_INFO (kernel commit * 8f7baad7f035) */ bool snd_wnd_cap; -#else -#define snd_wnd_cap (false) -#endif /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -678,7 +670,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) * @tinfo: Pointer to struct tcp_info for socket */ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, - const struct tcp_info *tinfo) + const struct tcp_info_linux *tinfo) { #ifdef HAS_MIN_RTT const struct flowside *tapside = TAPFLOW(conn); @@ -1114,13 +1106,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, * Return: 1 if sequence or window were updated, 0 otherwise */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - bool force_seq, struct tcp_info *tinfo) + bool force_seq, struct tcp_info_linux *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; /* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */ socklen_t sl = sizeof(*tinfo); - struct tcp_info tinfo_new; + struct tcp_info_linux tinfo_new; uint32_t new_wnd_to_tap = prev_wnd_to_tap; int s = conn->sock; @@ -1235,7 +1227,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, size_t *optlen) { - struct tcp_info tinfo = { 0 }; + struct tcp_info_linux tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; @@ -2578,7 +2570,6 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) return ret; } -#ifdef HAS_SND_WND /** * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd * @@ -2586,7 +2577,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) */ static bool tcp_probe_snd_wnd_cap(void) { - struct tcp_info tinfo; + struct tcp_info_linux tinfo; socklen_t sl = sizeof(tinfo); int s; @@ -2604,13 +2595,12 @@ static bool tcp_probe_snd_wnd_cap(void) close(s); - if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) + + if (sl < (offsetof(struct tcp_info_linux, tcpi_snd_wnd) + sizeof(tinfo.tcpi_snd_wnd))) return false; return true; } -#endif /* HAS_SND_WND */ /** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data @@ -2645,9 +2635,7 @@ int tcp_init(struct ctx *c) (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); -#ifdef HAS_SND_WND snd_wnd_cap = tcp_probe_snd_wnd_cap(); -#endif debug("TCP_INFO tcpi_snd_wnd field%ssupported", snd_wnd_cap ? " " : " not "); diff --git a/tcp_info.h b/tcp_info.h new file mode 100644 index 0000000..06ccb16 --- /dev/null +++ b/tcp_info.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * + * Largely derived from include/linux/tcp.h in the Linux kernel + */ + +#ifndef TCP_INFO_H +#define TCP_INFO_H + +/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt() + * + * Some fields returned by TCP_INFO have been there for ages and are shared with + * BSD. struct tcp_info from netinet/tcp.h has only those fields. There are + * also a many Linux specific extensions to the structure, which are only found + * in the linux/tcp.h version of struct tcp_info. + * + * We want to use some of those extension fields, when available. We can test + * for availability in the runtime kernel using the length returned from + * getsockopt(). However, we won't necessarily be compiled against the same + * kernel headers as we'll run with, so compiling directly against linux/tcp.h + * means wrapping every field access in an #ifdef whose #else does the same + * thing as when the field is missing at runtime. This rapidly gets messy. + * + * Instead we define here struct tcp_info_linux which includes all the Linux + * extensions that we want to use. This is taken from v6.11 of the kernel. + */ +struct tcp_info_linux { + uint8_t tcpi_state; + uint8_t tcpi_ca_state; + uint8_t tcpi_retransmits; + uint8_t tcpi_probes; + uint8_t tcpi_backoff; + uint8_t tcpi_options; + uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; + uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; + + uint32_t tcpi_rto; + uint32_t tcpi_ato; + uint32_t tcpi_snd_mss; + uint32_t tcpi_rcv_mss; + + uint32_t tcpi_unacked; + uint32_t tcpi_sacked; + uint32_t tcpi_lost; + uint32_t tcpi_retrans; + uint32_t tcpi_fackets; + + /* Times. */ + uint32_t tcpi_last_data_sent; + uint32_t tcpi_last_ack_sent; + uint32_t tcpi_last_data_recv; + uint32_t tcpi_last_ack_recv; + + /* Metrics. */ + uint32_t tcpi_pmtu; + uint32_t tcpi_rcv_ssthresh; + uint32_t tcpi_rtt; + uint32_t tcpi_rttvar; + uint32_t tcpi_snd_ssthresh; + uint32_t tcpi_snd_cwnd; + uint32_t tcpi_advmss; + uint32_t tcpi_reordering; + + uint32_t tcpi_rcv_rtt; + uint32_t tcpi_rcv_space; + + uint32_t tcpi_total_retrans; + + /* Linux extensions */ + uint64_t tcpi_pacing_rate; + uint64_t tcpi_max_pacing_rate; + uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ + uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */ + uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */ + + uint32_t tcpi_notsent_bytes; + uint32_t tcpi_min_rtt; + uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ + uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ + + uint64_t tcpi_delivery_rate; + + uint64_t tcpi_busy_time; /* Time (usec) busy sending data */ + uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */ + uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + uint32_t tcpi_delivered; + uint32_t tcpi_delivered_ce; + + uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ + uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ + uint32_t tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ + uint32_t tcpi_reord_seen; /* reordering events seen */ + + uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */ + + uint32_t tcpi_snd_wnd; /* peer's advertised receive window after + * scaling (bytes) + */ + uint32_t tcpi_rcv_wnd; /* local advertised receive window after + * scaling (bytes) + */ + + uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */ + + uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including + * SYN/SYN-ACK and recurring timeouts. + */ + uint16_t tcpi_total_rto_recoveries; /* Total number of RTO + * recoveries, including any + * unfinished recovery. + */ + uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries + * in milliseconds, including any + * unfinished recovery. + */ +}; + +#endif /* TCP_INFO_H */ diff --git a/tcp_internal.h b/tcp_internal.h index 1ab8ce2..a5a47df 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -175,12 +175,14 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); tcp_rst_do(c, conn); \ } while (0) +struct tcp_info_linux; + size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, struct iovec *iov, size_t dlen, const uint16_t *check, uint32_t seq, bool no_tcp_csum); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - bool force_seq, struct tcp_info *tinfo); + bool force_seq, struct tcp_info_linux *tinfo); int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, size_t *optlen); |