diff options
Diffstat (limited to 'tcp.c')
-rw-r--r-- | tcp.c | 1203 |
1 files changed, 589 insertions, 614 deletions
@@ -274,6 +274,7 @@ #include <net/if.h> #include <netinet/in.h> #include <netinet/ip.h> +#include <netinet/tcp.h> #include <stdint.h> #include <stdbool.h> #include <stddef.h> @@ -286,10 +287,9 @@ #include <time.h> #include <arpa/inet.h> -#include <linux/tcp.h> /* For struct tcp_info */ - #include "checksum.h" #include "util.h" +#include "iov.h" #include "ip.h" #include "passt.h" #include "tap.h" @@ -299,28 +299,16 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "linux_dep.h" #include "flow_table.h" #include "tcp_internal.h" #include "tcp_buf.h" #include "tcp_vu.h" -/* Sides of a flow as we use them in "tap" connections */ -#define SOCKSIDE 0 -#define TAPSIDE 1 - -#define TCP_HASH_TABLE_LOAD 70 /* % */ -#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) - /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 - #define WINDOW_DEFAULT 14600 /* RFC 6928 */ -#ifdef HAS_SND_WND -# define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd) -#else -# define KERNEL_REPORTS_SND_WND(c) (0 && (c)) -#endif #define ACK_INTERVAL 10 /* ms */ #define SYN_TIMEOUT 10 /* s */ @@ -331,18 +319,12 @@ #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ -/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of - * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP - */ -#define SOL_TCP IPPROTO_TCP - -#define ACK_IF_NEEDED 0 /* See tcp_buf_send_flag() */ - +#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define CONN_IS_CLOSING(conn) \ - ((conn->events & ESTABLISHED) && \ - (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) -#define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) + (((conn)->events & ESTABLISHED) && \ + ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) +#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set)) static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -370,29 +352,75 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = { static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; -/* Table of guest side forwarding addresses with very low RTT (assumed - * to be local to the host), LRU +/* Table of our guest side addresses with very low RTT (assumed to be local to + * the host), LRU */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; char tcp_buf_discard [MAX_WINDOW]; -/* sendmsg() to socket */ -static struct iovec tcp_iov [UIO_MAXIOV]; +/* Does the kernel support TCP_PEEK_OFF? */ +bool peek_offset_cap; -#define CONN(idx) (&(FLOW(idx)->tcp)) +/* Size of data returned by TCP_INFO getsockopt() */ +socklen_t tcp_info_size; -/* Table for lookup from remote address, local port, remote port */ -static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE]; +#define tcp_info_cap(f_) \ + ((offsetof(struct tcp_info_linux, tcpi_##f_) + \ + sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size) -static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, - "Safe linear probing requires hash table larger than connection table"); +/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */ +#define snd_wnd_cap tcp_info_cap(snd_wnd) +/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */ +#define bytes_acked_cap tcp_info_cap(bytes_acked) +/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */ +#define min_rtt_cap tcp_info_cap(min_rtt) + +/* sendmsg() to socket */ +static struct iovec tcp_iov [UIO_MAXIOV]; /* Pools for pre-opened sockets (in init) */ int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; /** + * conn_at_sidx() - Get TCP connection specific flow at given sidx + * @sidx: Flow and side to retrieve + * + * Return: TCP connection at @sidx, or NULL of @sidx is invalid. Asserts if the + * flow at @sidx is not FLOW_TCP. + */ +static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx) +{ + union flow *flow = flow_at_sidx(sidx); + + if (!flow) + return NULL; + + ASSERT(flow->f.type == FLOW_TCP); + return &flow->tcp; +} + +/** + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported + * @s: Socket to update + * @offset: Offset in bytes + * + * Return: -1 when it fails, 0 otherwise. + */ +int tcp_set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return 0; + + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) { + err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s); + return -1; + } + return 0; +} + +/** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events * @conn_flags Connection flags @@ -417,7 +445,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) if (events == TAP_SYN_RCVD) return EPOLLOUT | EPOLLET | EPOLLRDHUP; - return EPOLLRDHUP; + return EPOLLET | EPOLLRDHUP; } /** @@ -431,7 +459,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .flowside = FLOW_SIDX(conn, SOCKSIDE) }; + .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { @@ -522,7 +550,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) (unsigned long long)it.it_value.tv_sec, (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); - timerfd_settime(conn->timer, 0, &it, NULL); + if (timerfd_settime(conn->timer, 0, &it, NULL)) + flow_err(conn, "failed to set timer: %s", strerror(errno)); } /** @@ -573,9 +602,6 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_timer_ctl(c, conn); } -static void tcp_hash_remove(const struct ctx *c, - const struct tcp_tap_conn *conn); - /** * conn_event_do() - Set and log connection events, update epoll state * @c: Execution context @@ -621,7 +647,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, num == -1 ? "CLOSED" : tcp_event_str[num]); if (event == CLOSED) - tcp_hash_remove(c, conn); + flow_hash_remove(c, TAP_SIDX(conn)); else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) conn_flag(c, conn, ACTIVE_CLOSE); else @@ -639,10 +665,11 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, */ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) { + const struct flowside *tapside = TAPFLOW(conn); int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (inany_equals(&conn->faddr, low_rtt_dst + i)) + if (inany_equals(&tapside->oaddr, low_rtt_dst + i)) return 1; return 0; @@ -654,17 +681,17 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) * @tinfo: Pointer to struct tcp_info for socket */ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, - const struct tcp_info *tinfo) + const struct tcp_info_linux *tinfo) { -#ifdef HAS_MIN_RTT + const struct flowside *tapside = TAPFLOW(conn); int i, hole = -1; - if (!tinfo->tcpi_min_rtt || + if (!min_rtt_cap || (int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD) return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (inany_equals(&conn->faddr, low_rtt_dst + i)) + if (inany_equals(&tapside->oaddr, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -676,14 +703,10 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, if (hole == -1) return; - low_rtt_dst[hole++] = conn->faddr; + low_rtt_dst[hole++] = tapside->oaddr; if (hole == LOW_RTT_TABLE_SIZE) hole = 0; inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); -#else - (void)conn; - (void)tinfo; -#endif /* HAS_MIN_RTT */ } /** @@ -730,34 +753,106 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) } /** - * tcp_update_check_tcp4() - Update TCP checksum from stored one + * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4 * @iph: IPv4 header - * @th: TCP header followed by TCP payload + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @l4offset: IPv4 payload offset in the iovec array */ -static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th) +void tcp_update_check_tcp4(const struct iphdr *iph, + const struct iovec *iov, int iov_cnt, + size_t l4offset) { - uint16_t tlen = ntohs(iph->tot_len) - sizeof(struct iphdr); + uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); struct in_addr saddr = { .s_addr = iph->saddr }; struct in_addr daddr = { .s_addr = iph->daddr }; - uint32_t sum = proto_ipv4_header_psum(tlen, IPPROTO_TCP, saddr, daddr); + size_t check_ofs; + uint16_t *check; + int check_idx; + uint32_t sum; + char *ptr; + + sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); + + check_idx = iov_skip_bytes(iov, iov_cnt, + l4offset + offsetof(struct tcphdr, check), + &check_ofs); + + if (check_idx >= iov_cnt) { + err("TCP4 buffer is too small, iov size %zd, check offset %zd", + iov_size(iov, iov_cnt), + l4offset + offsetof(struct tcphdr, check)); + return; + } + + if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) { + err("TCP4 checksum field memory is not contiguous " + "check_ofs %zd check_idx %d iov_len %zd", + check_ofs, check_idx, iov[check_idx].iov_len); + return; + } + + ptr = (char *)iov[check_idx].iov_base + check_ofs; + if ((uintptr_t)ptr & (__alignof__(*check) - 1)) { + err("TCP4 checksum field is not correctly aligned in memory"); + return; + } + + check = (uint16_t *)ptr; - th->check = 0; - th->check = csum(th, tlen, sum); + *check = 0; + *check = csum_iov(iov, iov_cnt, l4offset, sum); } /** * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 * @ip6h: IPv6 header - * @th: TCP header followed by TCP payload + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @l4offset: IPv6 payload offset in the iovec array */ -static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) +void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, + const struct iovec *iov, int iov_cnt, + size_t l4offset) { - uint16_t payload_len = ntohs(ip6h->payload_len); - uint32_t sum = proto_ipv6_header_psum(payload_len, IPPROTO_TCP, - &ip6h->saddr, &ip6h->daddr); + uint16_t l4len = ntohs(ip6h->payload_len); + size_t check_ofs; + uint16_t *check; + int check_idx; + uint32_t sum; + char *ptr; + + sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr, + &ip6h->daddr); + + check_idx = iov_skip_bytes(iov, iov_cnt, + l4offset + offsetof(struct tcphdr, check), + &check_ofs); + + if (check_idx >= iov_cnt) { + err("TCP6 buffer is too small, iov size %zd, check offset %zd", + iov_size(iov, iov_cnt), + l4offset + offsetof(struct tcphdr, check)); + return; + } + + if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) { + err("TCP6 checksum field memory is not contiguous " + "check_ofs %zd check_idx %d iov_len %zd", + check_ofs, check_idx, iov[check_idx].iov_len); + return; + } + + ptr = (char *)iov[check_idx].iov_base + check_ofs; + if ((uintptr_t)ptr & (__alignof__(*check) - 1)) { + err("TCP6 checksum field is not correctly aligned in memory"); + return; + } + + check = (uint16_t *)ptr; - th->check = 0; - th->check = csum(th, payload_len, sum); + *check = 0; + *check = csum_iov(iov, iov_cnt, l4offset, sum); } /** @@ -819,163 +914,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, } /** - * tcp_hash_match() - Check if a connection entry matches address and ports - * @conn: Connection entry to match against - * @faddr: Guest side forwarding address - * @eport: Guest side endpoint port - * @fport: Guest side forwarding port - * - * Return: 1 on match, 0 otherwise - */ -static int tcp_hash_match(const struct tcp_tap_conn *conn, - const union inany_addr *faddr, - in_port_t eport, in_port_t fport) -{ - if (inany_equals(&conn->faddr, faddr) && - conn->eport == eport && conn->fport == fport) - return 1; - - return 0; -} - -/** - * tcp_hash() - Calculate hash value for connection given address and ports - * @c: Execution context - * @faddr: Guest side forwarding address - * @eport: Guest side endpoint port - * @fport: Guest side forwarding port - * - * Return: hash value, needs to be adjusted for table size - */ -static uint64_t tcp_hash(const struct ctx *c, const union inany_addr *faddr, - in_port_t eport, in_port_t fport) -{ - struct siphash_state state = SIPHASH_INIT(c->hash_secret); - - inany_siphash_feed(&state, faddr); - return siphash_final(&state, 20, (uint64_t)eport << 16 | fport); -} - -/** - * tcp_conn_hash() - Calculate hash bucket of an existing connection - * @c: Execution context - * @conn: Connection - * - * Return: hash value, needs to be adjusted for table size - */ -static uint64_t tcp_conn_hash(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); -} - -/** - * tcp_hash_probe() - Find hash bucket for a connection - * @c: Execution context - * @conn: Connection to find bucket for - * - * Return: If @conn is in the table, its current bucket, otherwise a suitable - * free bucket for it. - */ -static inline unsigned tcp_hash_probe(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE); - unsigned b = tcp_conn_hash(c, conn) % TCP_HASH_TABLE_SIZE; - - /* Linear probing */ - while (!flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) && - !flow_sidx_eq(tc_hash[b], sidx)) - b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - - return b; -} - -/** - * tcp_hash_insert() - Insert connection into hash table, chain link - * @c: Execution context - * @conn: Connection pointer - */ -static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) -{ - unsigned b = tcp_hash_probe(c, conn); - - tc_hash[b] = FLOW_SIDX(conn, TAPSIDE); - flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); -} - -/** - * tcp_hash_remove() - Drop connection from hash table, chain unlink - * @c: Execution context - * @conn: Connection pointer - */ -static void tcp_hash_remove(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - unsigned b = tcp_hash_probe(c, conn), s; - union flow *flow = flow_at_sidx(tc_hash[b]); - - if (!flow) - return; /* Redundant remove */ - - flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); - - /* Scan the remainder of the cluster */ - for (s = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - (flow = flow_at_sidx(tc_hash[s])); - s = mod_sub(s, 1, TCP_HASH_TABLE_SIZE)) { - unsigned h = tcp_conn_hash(c, &flow->tcp) % TCP_HASH_TABLE_SIZE; - - if (!mod_between(h, s, b, TCP_HASH_TABLE_SIZE)) { - /* tc_hash[s] can live in tc_hash[b]'s slot */ - debug("hash table remove: shuffle %u -> %u", s, b); - tc_hash[b] = tc_hash[s]; - b = s; - } - } - - tc_hash[b] = FLOW_SIDX_NONE; -} - -/** - * tcp_hash_lookup() - Look up connection given remote address and ports - * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @faddr: Guest side forwarding address (guest remote address) - * @eport: Guest side endpoint port (guest local port) - * @fport: Guest side forwarding port (guest remote port) - * - * Return: connection pointer, if found, -ENOENT otherwise - */ -static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, - sa_family_t af, const void *faddr, - in_port_t eport, in_port_t fport) -{ - union inany_addr aany; - union flow *flow; - unsigned b; - - inany_from_af(&aany, af, faddr); - - b = tcp_hash(c, &aany, eport, fport) % TCP_HASH_TABLE_SIZE; - while ((flow = flow_at_sidx(tc_hash[b])) && - !tcp_hash_match(&flow->tcp, &aany, eport, fport)) - b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - - return &flow->tcp; -} - -/** * tcp_flow_defer() - Deferred per-flow handling (clean up closed connections) - * @flow: Flow table entry for this connection + * @conn: Connection to handle * - * Return: true if the flow is ready to free, false otherwise + * Return: true if the connection is ready to free, false otherwise */ -bool tcp_flow_defer(union flow *flow) +bool tcp_flow_defer(const struct tcp_tap_conn *conn) { - const struct tcp_tap_conn *conn = &flow->tcp; - - if (flow->tcp.events != CLOSED) + if (conn->events != CLOSED) return false; close(conn->sock); @@ -992,8 +938,7 @@ bool tcp_flow_defer(union flow *flow) /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ void tcp_defer_handler(struct ctx *c) { - tcp_buf_l2_flags_flush(c); - tcp_buf_l2_data_flush(c); + tcp_payload_flush(c); } /** @@ -1004,10 +949,12 @@ void tcp_defer_handler(struct ctx *c) * @seq: Sequence number */ static void tcp_fill_header(struct tcphdr *th, - const struct tcp_tap_conn *conn, uint32_t seq) + const struct tcp_tap_conn *conn, uint32_t seq) { - th->source = htons(conn->fport); - th->dest = htons(conn->eport); + const struct flowside *tapside = TAPFLOW(conn); + + th->source = htons(tapside->oport); + th->dest = htons(tapside->eport); th->seq = htonl(seq); th->ack_seq = htonl(conn->seq_ack_to_tap); if (conn->events & ESTABLISHED) { @@ -1021,68 +968,73 @@ static void tcp_fill_header(struct tcphdr *th, /** * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers - * @c: Execution context - * @conn: Connection pointer - * @iph: Pointer to IPv4 header - * @th: Pointer to TCP header - * @plen: Payload length (including TCP header options) - * @check: Checksum, if already known - * @seq: Sequence number for this segment - * - * Return: The total length of the IPv4 packet, host order - */ -size_t tcp_fill_headers4(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct iphdr *iph, struct tcphdr *th, - size_t plen, const uint16_t *check, - uint32_t seq) + * @conn: Connection pointer + * @taph: tap backend specific header + * @iph: Pointer to IPv4 header + * @bp: Pointer to TCP header followed by TCP payload + * @dlen: TCP payload length + * @check: Checksum, if already known + * @seq: Sequence number for this segment + * @no_tcp_csum: Do not set TCP checksum + */ +void tcp_fill_headers4(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, struct iphdr *iph, + struct tcp_payload_t *bp, size_t dlen, + const uint16_t *check, uint32_t seq, bool no_tcp_csum) { - size_t ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); - const struct in_addr *a4 = inany_v4(&conn->faddr); + const struct flowside *tapside = TAPFLOW(conn); + const struct in_addr *src4 = inany_v4(&tapside->oaddr); + const struct in_addr *dst4 = inany_v4(&tapside->eaddr); + size_t l4len = dlen + sizeof(bp->th); + size_t l3len = l4len + sizeof(*iph); - ASSERT(a4); + ASSERT(src4 && dst4); - iph->tot_len = htons(ip_len); - iph->saddr = a4->s_addr; - iph->daddr = c->ip4.addr_seen.s_addr; + iph->tot_len = htons(l3len); + iph->saddr = src4->s_addr; + iph->daddr = dst4->s_addr; iph->check = check ? *check : - csum_ip4_header(iph->tot_len, IPPROTO_TCP, - *a4, c->ip4.addr_seen); + csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4); + + tcp_fill_header(&bp->th, conn, seq); - tcp_fill_header(th, conn, seq); + if (no_tcp_csum) { + bp->th.check = 0; + } else { + const struct iovec iov = { + .iov_base = bp, + .iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr), + }; - if (c->mode != MODE_VU) - tcp_update_check_tcp4(iph, th); + tcp_update_check_tcp4(iph, &iov, 1, 0); + } - return ip_len; + tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); } /** * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers - * @c: Execution context - * @conn: Connection pointer - * @ip6h: Pointer to IPv6 header - * @th: Pointer to TCP header - * @plen: Payload length (including TCP header options) - * @check: Checksum, if already known - * @seq: Sequence number for this segment - * - * Return: The total length of the IPv6 packet, host order - */ -size_t tcp_fill_headers6(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct ipv6hdr *ip6h, struct tcphdr *th, - size_t plen, uint32_t seq) + * @conn: Connection pointer + * @taph: tap backend specific header + * @ip6h: Pointer to IPv6 header + * @bp: Pointer to TCP header followed by TCP payload + * @dlen: TCP payload length + * @check: Checksum, if already known + * @seq: Sequence number for this segment + * @no_tcp_csum: Do not set TCP checksum + */ +void tcp_fill_headers6(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, struct ipv6hdr *ip6h, + struct tcp_payload_t *bp, size_t dlen, + uint32_t seq, bool no_tcp_csum) { - size_t ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + const struct flowside *tapside = TAPFLOW(conn); + size_t l4len = dlen + sizeof(bp->th); - ip6h->payload_len = htons(plen + sizeof(struct tcphdr)); - ip6h->saddr = conn->faddr.a6; - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) - ip6h->daddr = c->ip6.addr_ll_seen; - else - ip6h->daddr = c->ip6.addr_seen; + ip6h->payload_len = htons(l4len); + ip6h->saddr = tapside->oaddr.a6; + ip6h->daddr = tapside->eaddr.a6; ip6h->hop_limit = 255; ip6h->version = 6; @@ -1092,12 +1044,20 @@ size_t tcp_fill_headers6(const struct ctx *c, ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; - tcp_fill_header(th, conn, seq); + tcp_fill_header(&bp->th, conn, seq); + + if (no_tcp_csum) { + bp->th.check = 0; + } else { + const struct iovec iov = { + .iov_base = bp, + .iov_len = ntohs(ip6h->payload_len) + }; - if (c->mode != MODE_VU) - tcp_update_check_tcp6(ip6h, th); + tcp_update_check_tcp6(ip6h, &iov, 1, 0); + } - return ip_len; + tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); } /** @@ -1110,42 +1070,41 @@ size_t tcp_fill_headers6(const struct ctx *c, * Return: 1 if sequence or window were updated, 0 otherwise */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - int force_seq, struct tcp_info *tinfo) + bool force_seq, struct tcp_info_linux *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; /* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */ socklen_t sl = sizeof(*tinfo); - struct tcp_info tinfo_new; + struct tcp_info_linux tinfo_new; uint32_t new_wnd_to_tap = prev_wnd_to_tap; int s = conn->sock; -#ifndef HAS_BYTES_ACKED - (void)force_seq; - - conn->seq_ack_to_tap = conn->seq_from_tap; - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) - conn->seq_ack_to_tap = prev_ack_to_tap; -#else - if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn) - || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) { + if (!bytes_acked_cap) { conn->seq_ack_to_tap = conn->seq_from_tap; - } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { - if (!tinfo) { - tinfo = &tinfo_new; - if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) - return 0; - } - - conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + - conn->seq_init_from_tap; - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) conn->seq_ack_to_tap = prev_ack_to_tap; + } else { + if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || + tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) || + (conn->flags & LOCAL) || force_seq) { + conn->seq_ack_to_tap = conn->seq_from_tap; + } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { + if (!tinfo) { + tinfo = &tinfo_new; + if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) + return 0; + } + + conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + + conn->seq_init_from_tap; + + if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) + conn->seq_ack_to_tap = prev_ack_to_tap; + } } -#endif /* !HAS_BYTES_ACKED */ - if (!KERNEL_REPORTS_SND_WND(c)) { + if (!snd_wnd_cap) { tcp_get_sndbuf(conn); new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, @@ -1156,14 +1115,13 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, if (!tinfo) { if (prev_wnd_to_tap > WINDOW_DEFAULT) { goto out; -} + } tinfo = &tinfo_new; if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) { goto out; -} + } } -#ifdef HAS_SND_WND if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { new_wnd_to_tap = tinfo->tcpi_snd_wnd; } else { @@ -1171,7 +1129,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, SNDBUF_GET(conn)); } -#endif new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); if (!(conn->events & ESTABLISHED)) @@ -1217,25 +1174,23 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, } /** - * tcp_fill_flag_header() - Prepare header for flags-only segment (no payload) + * tcp_prepare_flags() - Prepare header for flags-only segment (no payload) * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due * @th: TCP header to update - * @opts: buffer to store TCP option - * @optlen: size of the TCP option buffer + * @data: buffer to store TCP option + * @optlen: size of the TCP option buffer (output parameter) * * Return: < 0 error code on connection reset, - * 0 if there is no flag to send - * 1 otherwise + * 0 if there is no flag to send + * 1 otherwise */ -int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, - int flags, struct tcphdr *th, char *opts, - size_t *optlen) +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, + int flags, struct tcphdr *th, struct tcp_syn_opts *opts, + size_t *optlen) { - uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; - uint32_t prev_wnd_to_tap = conn->wnd_to_tap; - struct tcp_info tinfo = { 0 }; + struct tcp_info_linux tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; @@ -1248,30 +1203,24 @@ int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, return -ECONNRESET; } -#ifdef HAS_SND_WND - if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd) - c->tcp.kernel_snd_wnd = 1; -#endif - if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); - if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) + if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags) return 0; + *optlen = 0; if (flags & SYN) { int mss; - /* Options: MSS, NOP and window scale (8 bytes) */ - *optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; - - *opts++ = OPT_MSS; - *opts++ = OPT_MSS_LEN; - if (c->mtu == -1) { mss = tinfo.tcpi_snd_mss; } else { mss = c->mtu - sizeof(struct tcphdr); + if (CONN_V4(conn)) + mss -= sizeof(struct iphdr); + else + mss -= sizeof(struct ipv6hdr); if (c->low_wmem && !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) @@ -1279,26 +1228,18 @@ int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, else if (mss > PAGE_SIZE) mss = ROUND_DOWN(mss, PAGE_SIZE); } - *(uint16_t *)opts = htons(MIN(USHRT_MAX, mss)); - - opts += OPT_MSS_LEN - 2; conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale); - *opts++ = OPT_NOP; - *opts++ = OPT_WS; - *opts++ = OPT_WS_LEN; - *opts++ = conn->ws_to_tap; - - th->ack = !!(flags & ACK); - } else { - th->ack = !!(flags & (ACK | DUP_ACK)) || - conn->seq_ack_to_tap != prev_ack_to_tap || - !prev_wnd_to_tap; + *opts = TCP_SYN_OPTS(mss, conn->ws_to_tap); + *optlen = sizeof(*opts); + } else if (!(flags & RST)) { + flags |= ACK; } th->doff = (sizeof(*th) + *optlen) / 4; + th->ack = !!(flags & ACK); th->rst = !!(flags & RST); th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); @@ -1320,10 +1261,20 @@ int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, return 1; } -int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +/** + * tcp_send_flag() - Send segment with flags to tap (no payload) + * @c: Execution context + * @conn: Connection pointer + * @flags: TCP flags: if not set, send segment only if ACK is due + * + * Return: negative error code on connection reset, 0 otherwise + */ +static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, + int flags) { if (c->mode == MODE_VU) return tcp_vu_send_flag(c, conn, flags); + return tcp_buf_send_flag(c, conn, flags); } @@ -1332,7 +1283,7 @@ int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @c: Execution context * @conn: Connection pointer */ -void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) { if (conn->events == CLOSED) return; @@ -1366,6 +1317,14 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) { wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); + + /* Work-around for bug introduced in peer kernel code, commit + * e2142825c120 ("net: tcp: send zero-window ACK when no memory"). + * We don't update if window shrank to zero. + */ + if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) + return; + conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); /* FIXME: reflect the tap-side receiver's window back to the sock-side @@ -1373,33 +1332,16 @@ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) } /** - * tcp_seq_init() - Calculate initial sequence number according to RFC 6528 - * @c: Execution context - * @conn: TCP connection, with faddr, fport and eport populated + * tcp_init_seq() - Calculate initial sequence number according to RFC 6528 + * @hash: Hash of connection details * @now: Current timestamp */ -static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, - const struct timespec *now) +static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now) { - struct siphash_state state = SIPHASH_INIT(c->hash_secret); - union inany_addr aany; - uint64_t hash; - uint32_t ns; - - if (CONN_V4(conn)) - inany_from_af(&aany, AF_INET, &c->ip4.addr); - else - inany_from_af(&aany, AF_INET6, &c->ip6.addr); - - inany_siphash_feed(&state, &conn->faddr); - inany_siphash_feed(&state, &aany); - hash = siphash_final(&state, 36, - (uint64_t)conn->fport << 16 | conn->eport); - /* 32ns ticks, overflows 32 bits every 137s */ - ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; + uint32_t ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; - conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + return ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; } /** @@ -1431,7 +1373,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) { int s; - s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP); if (s > FD_REF_MAX) { close(s); @@ -1480,22 +1422,21 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af) * * Return: clamped MSS value */ -static uint16_t tcp_conn_tap_mss(const struct ctx *c, - const struct tcp_tap_conn *conn, +static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn, const char *opts, size_t optlen) { unsigned int mss; int ret; - (void)c; /* unused */ - (void)conn; /* unused */ - if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0) mss = MSS_DEFAULT; else mss = ret; - mss = MIN(MSS, mss); + if (CONN_V4(conn)) + mss = MIN(MSS4, mss); + else + mss = MIN(MSS6, mss); return MIN(mss, USHRT_MAX); } @@ -1503,53 +1444,47 @@ static uint16_t tcp_conn_tap_mss(const struct ctx *c, /** * tcp_bind_outbound() - Bind socket to outbound address and interface if given * @c: Execution context + * @conn: Connection entry for socket to bind * @s: Outbound TCP socket - * @af: Address family */ -static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af) +static void tcp_bind_outbound(const struct ctx *c, + const struct tcp_tap_conn *conn, int s) { - if (af == AF_INET) { - if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.addr_out)) { - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = 0, - .sin_addr = c->ip4.addr_out, - }; - - if (bind(s, (struct sockaddr *)&addr4, sizeof(addr4))) { - debug("Can't bind IPv4 TCP socket address: %s", - strerror(errno)); - } + const struct flowside *tgt = &conn->f.side[TGTSIDE]; + union sockaddr_inany bind_sa; + socklen_t sl; + + + pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->oaddr, tgt->oport); + if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) { + if (bind(s, &bind_sa.sa, sl)) { + char sstr[INANY_ADDRSTRLEN]; + + flow_dbg(conn, + "Can't bind TCP outbound socket to %s:%hu: %s", + inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), + tgt->oport, strerror(errno)); } + } + if (bind_sa.sa_family == AF_INET) { if (*c->ip4.ifname_out) { if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip4.ifname_out, strlen(c->ip4.ifname_out))) { - debug("Can't bind IPv4 TCP socket to interface:" - " %s", strerror(errno)); - } - } - } else if (af == AF_INET6) { - if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_out)) { - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = 0, - .sin6_addr = c->ip6.addr_out, - }; - - if (bind(s, (struct sockaddr *)&addr6, sizeof(addr6))) { - debug("Can't bind IPv6 TCP socket address: %s", - strerror(errno)); + flow_dbg(conn, "Can't bind IPv4 TCP socket to" + " interface %s: %s", c->ip4.ifname_out, + strerror(errno)); } } - + } else if (bind_sa.sa_family == AF_INET6) { if (*c->ip6.ifname_out) { if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip6.ifname_out, strlen(c->ip6.ifname_out))) { - debug("Can't bind IPv6 TCP socket to interface:" - " %s", strerror(errno)); + flow_dbg(conn, "Can't bind IPv6 TCP socket to" + " interface %s: %s", c->ip6.ifname_out, + strerror(errno)); } } } @@ -1566,92 +1501,81 @@ static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af) * @optlen: Bytes in options: caller MUST ensure available length * @now: Current timestamp */ -static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, +static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, const void *saddr, const void *daddr, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) { in_port_t srcport = ntohs(th->source); in_port_t dstport = ntohs(th->dest); - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(dstport), - .sin_addr = *(struct in_addr *)daddr, - }; - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(dstport), - .sin6_addr = *(struct in6_addr *)daddr, - }; - const struct sockaddr *sa; + const struct flowside *ini, *tgt; struct tcp_tap_conn *conn; + union sockaddr_inany sa; union flow *flow; int s = -1, mss; + uint64_t hash; socklen_t sl; if (!(flow = flow_alloc())) return; - if (af == AF_INET) { - if (IN4_IS_ADDR_UNSPECIFIED(saddr) || - IN4_IS_ADDR_BROADCAST(saddr) || - IN4_IS_ADDR_MULTICAST(saddr) || srcport == 0 || - IN4_IS_ADDR_UNSPECIFIED(daddr) || - IN4_IS_ADDR_BROADCAST(daddr) || - IN4_IS_ADDR_MULTICAST(daddr) || dstport == 0) { - char sstr[INET_ADDRSTRLEN], dstr[INET_ADDRSTRLEN]; - - debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu", - inet_ntop(AF_INET, saddr, sstr, sizeof(sstr)), - srcport, - inet_ntop(AF_INET, daddr, dstr, sizeof(dstr)), - dstport); - goto cancel; - } - } else if (af == AF_INET6) { - if (IN6_IS_ADDR_UNSPECIFIED(saddr) || - IN6_IS_ADDR_MULTICAST(saddr) || srcport == 0 || - IN6_IS_ADDR_UNSPECIFIED(daddr) || - IN6_IS_ADDR_MULTICAST(daddr) || dstport == 0) { - char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN]; - - debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu", - inet_ntop(AF_INET6, saddr, sstr, sizeof(sstr)), - srcport, - inet_ntop(AF_INET6, daddr, dstr, sizeof(dstr)), - dstport); - goto cancel; - } - } + ini = flow_initiate_af(flow, PIF_TAP, + af, saddr, srcport, daddr, dstport); - if ((s = tcp_conn_sock(c, af)) < 0) + if (!(tgt = flow_target(c, flow, IPPROTO_TCP))) goto cancel; - if (!c->no_map_gw) { - if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw)) - addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw)) - addr6.sin6_addr = in6addr_loopback; + if (flow->f.pif[TGTSIDE] != PIF_HOST) { + flow_err(flow, "No support for forwarding TCP from %s to %s", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[TGTSIDE])); + goto cancel; } - if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) { - struct sockaddr_in6 addr6_ll = { - .sin6_family = AF_INET6, - .sin6_addr = c->ip6.addr_ll, - .sin6_scope_id = c->ifi6, - }; - if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || + !inany_is_unicast(&ini->oaddr) || ini->oport == 0) { + char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN]; + + debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu", + inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport, + inany_ntop(&ini->oaddr, dstr, sizeof(dstr)), ini->oport); + goto cancel; + } + + if ((s = tcp_conn_sock(c, af)) < 0) + goto cancel; + + pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport); + + /* Use bind() to check if the target address is local (EADDRINUSE or + * similar) and already bound, and set the LOCAL flag in that case. + * + * If bind() succeeds, in general, we could infer that nobody (else) is + * listening on that address and port and reset the connection attempt + * early, but we can't rely on that if non-local binds are enabled, + * because bind() would succeed for any non-local address we can reach. + * + * So, if bind() succeeds, close the socket, get a new one, and proceed. + */ + if (bind(s, &sa.sa, sl)) { + if (errno != EADDRNOTAVAIL && errno != EACCES) + conn_flag(c, conn, LOCAL); + } else { + /* Not a local, bound destination, inconclusive test */ + close(s); + if ((s = tcp_conn_sock(c, af)) < 0) goto cancel; } - conn = FLOW_START(flow, FLOW_TCP, tcp, TAPSIDE); conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; - mss = tcp_conn_tap_mss(c, conn, opts, optlen); + mss = tcp_conn_tap_mss(conn, opts, optlen); if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss))) flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s); MSS_SET(conn, mss); @@ -1664,44 +1588,20 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) conn->wnd_from_tap = 1; - inany_from_af(&conn->faddr, af, daddr); - - if (af == AF_INET) { - sa = (struct sockaddr *)&addr4; - sl = sizeof(addr4); - } else { - sa = (struct sockaddr *)&addr6; - sl = sizeof(addr6); - } - - conn->fport = dstport; - conn->eport = srcport; - conn->seq_init_from_tap = ntohl(th->seq); conn->seq_from_tap = conn->seq_init_from_tap + 1; conn->seq_ack_to_tap = conn->seq_from_tap; - tcp_seq_init(c, conn, now); + hash = flow_hash_insert(c, TAP_SIDX(conn)); + conn->seq_to_tap = tcp_init_seq(hash, now); conn->seq_ack_from_tap = conn->seq_to_tap; - tcp_hash_insert(c, conn); - - if (!bind(s, sa, sl)) { - tcp_rst(c, conn); /* Nobody is listening then */ - return; - } - if (errno != EADDRNOTAVAIL && errno != EACCES) - conn_flag(c, conn, LOCAL); - - if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) || - (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) && - !IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr))) - tcp_bind_outbound(c, s, af); + tcp_bind_outbound(c, conn, s); - if (connect(s, sa, sl)) { + if (connect(s, &sa.sa, sl)) { if (errno != EINPROGRESS) { tcp_rst(c, conn); - return; + goto cancel; } tcp_get_sndbuf(conn); @@ -1709,12 +1609,13 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, tcp_get_sndbuf(conn); if (tcp_send_flag(c, conn, SYN | ACK)) - return; + goto cancel; conn_event(c, conn, TAP_SYN_ACK_SENT); } tcp_epoll_ctl(c, conn); + FLOW_ACTIVATE(conn); return; cancel: @@ -1756,7 +1657,16 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) return 0; } -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +/** + * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window + * @c: Execution context + * @conn: Connection pointer + * + * Return: negative on connection reset, 0 otherwise + * + * #syscalls recvmsg + */ +static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { if (c->mode == MODE_VU) return tcp_vu_data_from_sock(c, conn); @@ -1775,8 +1685,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) * * Return: count of consumed packets */ -static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, - const struct pool *p, int idx) +static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + const struct pool *p, int idx) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; @@ -1895,6 +1805,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); conn->seq_to_tap = max_ack_seq; + if (tcp_set_peek_offset(conn->sock, 0)) { + tcp_rst(c, conn); + return -1; + } tcp_data_from_sock(c, conn); } @@ -1941,7 +1855,7 @@ out: */ if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) { conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff; - tcp_send_flag(c, conn, DUP_ACK); + tcp_send_flag(c, conn, ACK | DUP_ACK); } return p->count - idx; } @@ -1969,7 +1883,8 @@ out: * @opts: Pointer to start of options * @optlen: Bytes in options: caller MUST ensure available length */ -static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_conn_from_sock_finish(const struct ctx *c, + struct tcp_tap_conn *conn, const struct tcphdr *th, const char *opts, size_t optlen) { @@ -1980,19 +1895,24 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, if (!(conn->wnd_from_tap >>= conn->ws_from_tap)) conn->wnd_from_tap = 1; - MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen)); + MSS_SET(conn, tcp_conn_tap_mss(conn, opts, optlen)); conn->seq_init_from_tap = ntohl(th->seq) + 1; conn->seq_from_tap = conn->seq_init_from_tap; conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); + if (tcp_set_peek_offset(conn->sock, 0)) { + tcp_rst(c, conn); + return; + } + + tcp_send_flag(c, conn, ACK); /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ tcp_data_from_sock(c, conn); - tcp_send_flag(c, conn, ACK); } /** @@ -2008,7 +1928,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, * * Return: count of consumed packets */ -int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, +int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now) { @@ -2016,6 +1936,8 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const struct tcphdr *th; size_t optlen, len; const char *opts; + union flow *flow; + flow_sidx_t sidx; int ack_due = 0; int count; @@ -2031,16 +1953,22 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); opts = packet_get(p, idx, sizeof(*th), optlen, NULL); - conn = tcp_hash_lookup(c, af, daddr, ntohs(th->source), ntohs(th->dest)); + sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr, + ntohs(th->source), ntohs(th->dest)); + flow = flow_at_sidx(sidx); /* New connection from tap */ - if (!conn) { + if (!flow) { if (opts && th->syn && !th->ack) tcp_conn_from_tap(c, af, saddr, daddr, th, opts, optlen, now); return 1; } + ASSERT(flow->f.type == FLOW_TCP); + ASSERT(pif_at_sidx(sidx) == PIF_TAP); + conn = &flow->tcp; + flow_trace(conn, "packet length %zu from tap", len); if (th->rst) { @@ -2067,6 +1995,8 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); + if (tcp_set_peek_offset(conn->sock, 0)) + goto reset; if (th->fin) { conn->seq_from_tap++; @@ -2136,7 +2066,7 @@ reset: * @c: Execution context * @conn: Connection pointer */ -static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn) { socklen_t sl; int so; @@ -2155,61 +2085,26 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) } /** - * tcp_snat_inbound() - Translate source address for inbound data if needed - * @c: Execution context - * @addr: Source address of inbound packet/connection - */ -static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr) -{ - struct in_addr *addr4 = inany_v4(addr); - - if (addr4) { - if (IN4_IS_ADDR_LOOPBACK(addr4) || - IN4_IS_ADDR_UNSPECIFIED(addr4) || - IN4_ARE_ADDR_EQUAL(addr4, &c->ip4.addr_seen)) - *addr4 = c->ip4.gw; - } else { - struct in6_addr *addr6 = &addr->a6; - - if (IN6_IS_ADDR_LOOPBACK(addr6) || - IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr_seen) || - IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr)) { - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - *addr6 = c->ip6.gw; - else - *addr6 = c->ip6.addr_ll; - } - } -} - -/** * tcp_tap_conn_from_sock() - Initialize state for non-spliced connection * @c: Execution context - * @dstport: Destination port for connection (host side) * @flow: flow to initialise * @s: Accepted socket * @sa: Peer socket address (from accept()) * @now: Current timestamp */ -static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, - union flow *flow, int s, - const union sockaddr_inany *sa, - const struct timespec *now) +static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, + int s, const struct timespec *now) { - struct tcp_tap_conn *conn = FLOW_START(flow, FLOW_TCP, tcp, SOCKSIDE); + struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + uint64_t hash; conn->sock = s; conn->timer = -1; conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, SOCK_ACCEPTED); - inany_from_sockaddr(&conn->faddr, &conn->fport, sa); - conn->eport = dstport + c->tcp.fwd_in.delta[dstport]; - - tcp_snat_inbound(c, &conn->faddr); - - tcp_seq_init(c, conn, now); - tcp_hash_insert(c, conn); + hash = flow_hash_insert(c, TAP_SIDX(conn)); + conn->seq_to_tap = tcp_init_seq(hash, now); conn->seq_ack_from_tap = conn->seq_to_tap; @@ -2219,6 +2114,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, conn_flag(c, conn, ACK_FROM_TAP_DUE); tcp_get_sndbuf(conn); + + FLOW_ACTIVATE(conn); } /** @@ -2227,53 +2124,58 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, * @ref: epoll reference of listening socket * @now: Current timestamp */ -void tcp_listen_handler(struct ctx *c, union epoll_ref ref, +void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { + const struct flowside *ini; union sockaddr_inany sa; socklen_t sl = sizeof(sa); union flow *flow; int s; - if (c->no_tcp || !(flow = flow_alloc())) + ASSERT(!c->no_tcp); + + if (!(flow = flow_alloc())) return; s = accept4(ref.fd, &sa.sa, &sl, SOCK_NONBLOCK); if (s < 0) goto cancel; - if (sa.sa_family == AF_INET) { - const struct in_addr *addr = &sa.sa4.sin_addr; - in_port_t port = sa.sa4.sin_port; + /* FIXME: When listening port has a specific bound address, record that + * as our address + */ + ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, + ref.tcp_listen.port); - if (IN4_IS_ADDR_UNSPECIFIED(addr) || - IN4_IS_ADDR_BROADCAST(addr) || - IN4_IS_ADDR_MULTICAST(addr) || port == 0) { - char str[INET_ADDRSTRLEN]; + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { + char sastr[SOCKADDR_STRLEN]; - err("Invalid endpoint from TCP accept(): %s:%hu", - inet_ntop(AF_INET, addr, str, sizeof(str)), port); - goto cancel; - } - } else if (sa.sa_family == AF_INET6) { - const struct in6_addr *addr = &sa.sa6.sin6_addr; - in_port_t port = sa.sa6.sin6_port; + err("Invalid endpoint from TCP accept(): %s", + sockaddr_ntop(&sa, sastr, sizeof(sastr))); + goto cancel; + } - if (IN6_IS_ADDR_UNSPECIFIED(addr) || - IN6_IS_ADDR_MULTICAST(addr) || port == 0) { - char str[INET6_ADDRSTRLEN]; + if (!flow_target(c, flow, IPPROTO_TCP)) + goto cancel; - err("Invalid endpoint from TCP accept(): %s:%hu", - inet_ntop(AF_INET6, addr, str, sizeof(str)), port); - goto cancel; - } - } + switch (flow->f.pif[TGTSIDE]) { + case PIF_SPLICE: + case PIF_HOST: + tcp_splice_conn_from_sock(c, flow, s); + break; - if (tcp_splice_conn_from_sock(c, ref.tcp_listen.pif, - ref.tcp_listen.port, flow, s, &sa)) - return; + case PIF_TAP: + tcp_tap_conn_from_sock(c, flow, s, now); + break; + + default: + flow_err(flow, "No support for forwarding TCP from %s to %s", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[TGTSIDE])); + goto cancel; + } - tcp_tap_conn_from_sock(c, ref.tcp_listen.port, flow, s, &sa, now); return; cancel: @@ -2285,21 +2187,23 @@ cancel: * @c: Execution context * @ref: epoll reference of timer (not connection) * - * #syscalls timerfd_gettime + * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 */ -void tcp_timer_handler(struct ctx *c, union epoll_ref ref) +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) { struct itimerspec check_armed = { { 0 }, { 0 } }; - struct tcp_tap_conn *conn = CONN(ref.flow); + struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp; - if (c->no_tcp) - return; + ASSERT(!c->no_tcp); + ASSERT(conn->f.type == FLOW_TCP); /* We don't reset timers on ~ACK_FROM_TAP_DUE, ~ACK_TO_TAP_DUE. If the * timer is currently armed, this event came from a previous setting, * and we just set the timer to a new point in the future: discard it. */ - timerfd_gettime(conn->timer, &check_armed); + if (timerfd_gettime(conn->timer, &check_armed)) + flow_err(conn, "failed to read timer: %s", strerror(errno)); + if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) return; @@ -2320,8 +2224,12 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn); - tcp_timer_ctl(c, conn); + if (tcp_set_peek_offset(conn->sock, 0)) { + tcp_rst(c, conn); + } else { + tcp_data_from_sock(c, conn); + tcp_timer_ctl(c, conn); + } } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -2333,7 +2241,10 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) * case. This avoids having to preemptively reset the timer on * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. */ - timerfd_settime(conn->timer, 0, &new, &old); + if (timerfd_settime(conn->timer, 0, &new, &old)) + flow_err(conn, "failed to set timer: %s", + strerror(errno)); + if (old.it_value.tv_sec == ACT_TIMEOUT) { flow_dbg(conn, "activity timeout"); tcp_rst(c, conn); @@ -2347,12 +2258,13 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) * @ref: epoll reference * @events: epoll events bitmap */ -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) +void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events) { - struct tcp_tap_conn *conn = CONN(ref.flowside.flow); + struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside); - ASSERT(conn->f.type == FLOW_TCP); - ASSERT(ref.flowside.side == SOCKSIDE); + ASSERT(!c->no_tcp); + ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP); if (conn->events == CLOSED) return; @@ -2378,7 +2290,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) tcp_data_from_sock(c, conn); if (events & EPOLLOUT) - tcp_update_seqack_wnd(c, conn, 0, NULL); + tcp_update_seqack_wnd(c, conn, false, NULL); return; } @@ -2401,17 +2313,16 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) } /** - * tcp_sock_init_af() - Initialise listening socket for a given af and port + * tcp_sock_init_one() - Initialise listening socket for address and port * @c: Execution context - * @af: Address family to listen on - * @port: Port, host order - * @addr: Pointer to address for binding, NULL if not configured + * @addr: Pointer to address for binding, NULL for dual stack any * @ifname: Name of interface to bind to, NULL if not configured + * @port: Port, host order * * Return: fd for the new listening socket, negative error code on failure */ -static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, - const void *addr, const char *ifname) +static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr, + const char *ifname, in_port_t port) { union tcp_listen_epoll_ref tref = { .port = port, @@ -2419,12 +2330,13 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, }; int s; - s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32); + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr, + ifname, port, tref.u32); if (c->tcp.fwd_in.mode == FWD_AUTO) { - if (af == AF_INET || af == AF_UNSPEC) + if (!addr || inany_v4(addr)) tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s; - if (af == AF_INET6 || af == AF_UNSPEC) + if (!addr || !inany_v4(addr)) tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s; } @@ -2438,29 +2350,32 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, /** * tcp_sock_init() - Create listening sockets for a given host ("inbound") port * @c: Execution context - * @af: Address family to select a specific IP version, or AF_UNSPEC * @addr: Pointer to address for binding, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured * @port: Port, host order * * Return: 0 on (partial) success, negative error code on (complete) failure */ -int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, +int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, const char *ifname, in_port_t port) { int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; - if (af == AF_UNSPEC && c->ifi4 && c->ifi6) + ASSERT(!c->no_tcp); + + if (!addr && c->ifi4 && c->ifi6) /* Attempt to get a dual stack socket */ - if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0) + if (tcp_sock_init_one(c, NULL, ifname, port) >= 0) return 0; /* Otherwise create a socket per IP version */ - if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) - r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname); + if ((!addr || inany_v4(addr)) && c->ifi4) + r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4, + ifname, port); - if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) - r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname); + if ((!addr || !inany_v4(addr)) && c->ifi6) + r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6, + ifname, port); if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) return 0; @@ -2483,8 +2398,8 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) ASSERT(c->mode == MODE_PASTA); - s = sock_l4(c, AF_INET, IPPROTO_TCP, &in4addr_loopback, NULL, port, - tref.u32); + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4, + NULL, port, tref.u32); if (s >= 0) tcp_sock_set_bufsize(c, s); else @@ -2509,8 +2424,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) ASSERT(c->mode == MODE_PASTA); - s = sock_l4(c, AF_INET6, IPPROTO_TCP, &in6addr_loopback, NULL, port, - tref.u32); + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6, + NULL, port, tref.u32); if (s >= 0) tcp_sock_set_bufsize(c, s); else @@ -2527,6 +2442,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) */ void tcp_ns_sock_init(const struct ctx *c, in_port_t port) { + ASSERT(!c->no_tcp); + if (c->ifi4) tcp_ns_sock_init4(c, port); if (c->ifi6) @@ -2539,6 +2456,7 @@ void tcp_ns_sock_init(const struct ctx *c, in_port_t port) * * Return: 0 */ +/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ static int tcp_ns_socks_init(void *arg) { const struct ctx *c = (const struct ctx *)arg; @@ -2604,6 +2522,57 @@ static void tcp_sock_refill_init(const struct ctx *c) } /** + * tcp_probe_peek_offset_cap() - Check if SO_PEEK_OFF is supported by kernel + * @af: Address family, IPv4 or IPv6 + * + * Return: true if supported, false otherwise + */ +static bool tcp_probe_peek_offset_cap(sa_family_t af) +{ + bool ret = false; + int s, optv = 0; + + s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn_perror("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + ret = true; + close(s); + } + + return ret; +} + +/** + * tcp_probe_tcp_info() - Check what data TCP_INFO reports + * + * Return: Number of bytes returned by TCP_INFO getsockopt() + */ +static socklen_t tcp_probe_tcp_info(void) +{ + struct tcp_info_linux tinfo; + socklen_t sl = sizeof(tinfo); + int s; + + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn_perror("Temporary TCP socket creation failed"); + return false; + } + + if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { + warn_perror("Failed to get TCP_INFO on temporary socket"); + close(s); + return false; + } + + close(s); + + return sl; +} + +/** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context * @@ -2611,16 +2580,9 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned b; - - for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) - tc_hash[b] = FLOW_SIDX_NONE; - - if (c->ifi4) - tcp_buf_sock4_iov_init(c); + ASSERT(!c->no_tcp); - if (c->ifi6) - tcp_buf_sock6_iov_init(c); + tcp_sock_iov_init(c); memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); @@ -2635,6 +2597,19 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) && + (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); + debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); + + tcp_info_size = tcp_probe_tcp_info(); + +#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \ + STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ") + dbg_tcpi(snd_wnd); + dbg_tcpi(bytes_acked); + dbg_tcpi(min_rtt); +#undef dbg_tcpi + return 0; } @@ -2676,7 +2651,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound) if (outbound) tcp_ns_sock_init(c, port); else - tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port); + tcp_sock_init(c, NULL, NULL, port); } } } |