Diffstat (limited to 'tcp.c')
| -rw-r--r-- | tcp.c | 2123 |
1 files changed, 1779 insertions, 344 deletions
@@ -179,14 +179,16 @@ * * Timeouts are implemented by means of timerfd timers, set based on flags: * - * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag - * ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the - * connection + * - RTO_INIT: if no ACK segment was received from tap/guest, either during + * handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after + * sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data + * from the socket and reset sequence to what was acknowledged. This is the + * timeout for the first retry, in seconds. Retry TCP_MAX_RETRIES times for + * established connections, or (syn_retries + syn_linear_timeouts) times + * during the handshake, then reset the connection * - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending - * data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the - * socket and reset sequence to what was acknowledged. If this persists for - * more than TCP_MAX_RETRANS times in a row, reset the connection + * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and + * RTO is less than this, re-initialise RTO to this for data retransmissions * * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE * with TAP_FIN_SENT event), and no ACK is received within this time, reset @@ -200,9 +202,13 @@ * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on * either side, the connection is reset * - * - ACK_INTERVAL elapsed after data segment received from tap without having + * - RTT / 2 elapsed after data segment received from tap without having * sent an ACK segment, or zero-sized window advertised to tap/guest (flag - * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent + * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent. + * + * RTT, here, is an approximation of the RTT value reported by the kernel via + * TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to + * RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly. * * * Summary of data flows (with ESTABLISHED event) @@ -279,7 +285,7 @@ #include <stdbool.h> #include <stddef.h> #include <string.h> -#include <sys/epoll.h> +#include <sys/ioctl.h> #include <sys/socket.h> #include <sys/timerfd.h> #include <sys/types.h> @@ -287,6 +293,8 @@ #include <time.h> #include <arpa/inet.h> +#include <linux/sockios.h> + #include "checksum.h" #include "util.h" #include "iov.h" @@ -299,26 +307,62 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "repair.h" #include "linux_dep.h" #include "flow_table.h" #include "tcp_internal.h" #include "tcp_buf.h" #include "tcp_vu.h" +#include "epoll_ctl.h" + +/* + * The size of TCP header (including options) is given by doff (Data Offset) + * that is a 4-bit value specifying the number of 32-bit words in the header. + * The maximum value of doff is 15 [(1 << 4) - 1]. + * The maximum length in bytes of options is 15 minus the number of 32-bit + * words in the minimal TCP header (5) multiplied by the length of a 32-bit + * word (4). 
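+ * That is, the longest possible options field works out to
+ * (15 - 5) * 4 = 40 bytes, the OPTLEN_MAX value defined below.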
+ */ +#define OPTLEN_MAX (((1UL << 4) - 1 - 5) * 4UL) + +#ifndef __USE_MISC +/* From Linux UAPI, missing in netinet/tcp.h provided by musl */ +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; +#endif /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ -#define ACK_INTERVAL 10 /* ms */ -#define SYN_TIMEOUT 10 /* s */ -#define ACK_TIMEOUT 2 +#define RTO_INIT 1 /* s, RFC 6298 */ +#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */ #define FIN_TIMEOUT 60 #define ACT_TIMEOUT 7200 #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ +/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */ +#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */ +/* ...examples: 5 MB sent * 500 ns RTT, 250 kB * 10 ms, 8 kB * 300 ms */ +#define SNDBUF_BOOST_FACTOR 150 /* % */ +#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */ +/* 12 MB sent * 500 ns RTT, 600 kB * 10 ms, 20 kB * 300 ms */ + +/* Ratio of buffer to bandwidth * delay product implying interactive traffic */ +#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. < 5% of buffer) */ + #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define CONN_IS_CLOSING(conn) \ @@ -326,6 +370,28 @@ ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) #define CONN_HAS(conn, set) (((conn)->events & (set)) == (set)) +/* Buffers to migrate pending data from send and receive queues. No, they don't + * use memory if we don't use them. And we're going away after this, so splurge. + */ +#define TCP_MIGRATE_SND_QUEUE_MAX (64 << 20) +#define TCP_MIGRATE_RCV_QUEUE_MAX (64 << 20) +uint8_t tcp_migrate_snd_queue [TCP_MIGRATE_SND_QUEUE_MAX]; +uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX]; + +#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */ + +#define SYN_RETRIES "/proc/sys/net/ipv4/tcp_syn_retries" +#define SYN_LINEAR_TIMEOUTS "/proc/sys/net/ipv4/tcp_syn_linear_timeouts" +#define RTO_MAX_MS "/proc/sys/net/ipv4/tcp_rto_max_ms" + +#define SYN_RETRIES_DEFAULT 6 +#define SYN_LINEAR_TIMEOUTS_DEFAULT 4 +#define RTO_MAX_DEFAULT 120 /* s */ +#define MAX_SYNCNT 127 /* derived from kernel's limit */ + +/* "Extended" data (not stored in the flow table) for TCP flow migration */ +static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX]; + static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -338,14 +404,14 @@ static const char *tcp_state_str[] __attribute((__unused__)) = { "SYN_RCVD", /* approximately maps to TAP_SYN_ACK_SENT */ /* Passive close: */ - "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK", + "CLOSE_WAIT", "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", /* Active close (+5): */ "CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT", }; static const char *tcp_flag_str[] __attribute((__unused__)) = { "STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE", - "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", + "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED", }; /* Listening sockets, used for automatic port forwarding in pasta mode only */ @@ -357,7 +423,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; -char tcp_buf_discard [MAX_WINDOW]; +char tcp_buf_discard [BUF_DISCARD_SIZE]; /* Does the kernel support TCP_PEEK_OFF? 
*/ bool peek_offset_cap; @@ -370,11 +436,13 @@ socklen_t tcp_info_size; sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size) /* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */ -#define snd_wnd_cap tcp_info_cap(snd_wnd) +#define snd_wnd_cap tcp_info_cap(snd_wnd) /* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */ -#define bytes_acked_cap tcp_info_cap(bytes_acked) +#define bytes_acked_cap tcp_info_cap(bytes_acked) /* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */ -#define min_rtt_cap tcp_info_cap(min_rtt) +#define min_rtt_cap tcp_info_cap(min_rtt) +/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */ +#define delivery_rate_cap tcp_info_cap(delivery_rate) /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -402,19 +470,20 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx) } /** - * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported - * @s: Socket to update + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported + * @conn: Pointer to the TCP connection structure * @offset: Offset in bytes * - * Return: -1 when it fails, 0 otherwise. + * Return: -1 when it fails, 0 otherwise. */ -int tcp_set_peek_offset(int s, int offset) +int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset) { if (!peek_offset_cap) return 0; - if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) { - err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s); + if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF, + &offset, sizeof(offset))) { + flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset); return -1; } return 0; @@ -423,7 +492,7 @@ int tcp_set_peek_offset(int s, int offset) /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events - * @conn_flags Connection flags + * @conn_flags: Connection flags * * Return: epoll events mask corresponding to implied connection state */ @@ -461,34 +530,37 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) */ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; struct epoll_event ev = { .data.u64 = ref.u64 }; + int epollfd = flow_in_epoll(&conn->f) ? 
flow_epollfd(&conn->f) + : c->epollfd; if (conn->events == CLOSED) { - if (conn->in_epoll) - epoll_del(c, conn->sock); + if (flow_in_epoll(&conn->f)) + epoll_del(epollfd, conn->sock); if (conn->timer != -1) - epoll_del(c, conn->timer); + epoll_del(epollfd, conn->timer); return 0; } ev.events = tcp_conn_epoll_events(conn->events, conn->flags); - if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) + if (epoll_ctl(epollfd, m, conn->sock, &ev)) return -errno; - conn->in_epoll = true; + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); if (conn->timer != -1) { union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, - .fd = conn->sock, + .fd = conn->timer, .flow = FLOW_IDX(conn) }; struct epoll_event ev_t = { .data.u64 = ref_t.u64, .events = EPOLLIN | EPOLLET }; - if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t)) + if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD, + conn->timer, &ev_t)) return -errno; } @@ -499,8 +571,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed * @c: Execution context * @conn: Connection pointer - * - * #syscalls timerfd_create timerfd_settime + * #syscalls timerfd_create timerfd_settime|timerfd_settime32 */ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { @@ -511,26 +582,25 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->timer == -1) { union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER, - .fd = conn->sock, .flow = FLOW_IDX(conn) }; struct epoll_event ev = { .data.u64 = ref.u64, .events = EPOLLIN | EPOLLET }; + int epollfd = flow_epollfd(&conn->f); int fd; fd = timerfd_create(CLOCK_MONOTONIC, 0); if (fd == -1 || fd > FD_REF_MAX) { - flow_dbg(conn, "failed to get timer: %s", - strerror_(errno)); + flow_dbg_perror(conn, "failed to get timer"); if (fd > -1) close(fd); conn->timer = -1; return; } conn->timer = fd; + ref.fd = conn->timer; - if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { - flow_dbg(conn, "failed to add timer: %s", - strerror_(errno)); + if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { + flow_dbg_perror(conn, "failed to add timer"); close(conn->timer); conn->timer = -1; return; @@ -538,24 +608,37 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) } if (conn->flags & ACK_TO_TAP_DUE) { - it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; + it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000); + it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) * + 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { + int exp = conn->retries, timeout = RTO_INIT; if (!(conn->events & ESTABLISHED)) - it.it_value.tv_sec = SYN_TIMEOUT; - else - it.it_value.tv_sec = ACK_TIMEOUT; + exp -= c->tcp.syn_linear_timeouts; + else if (conn->flags & SYN_RETRIED) + timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES); + timeout <<= MAX(exp, 0); + it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max); } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { it.it_value.tv_sec = FIN_TIMEOUT; } else { it.it_value.tv_sec = ACT_TIMEOUT; } - flow_dbg(conn, "timer expires in %llu.%03llus", - (unsigned long long)it.it_value.tv_sec, - (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); + if (conn->flags & ACK_TO_TAP_DUE) { + flow_trace(conn, "timer expires in %llu.%02llums", + (unsigned long long)it.it_value.tv_sec * 1000 + + it.it_value.tv_nsec / 1000 / 1000, + (unsigned long long)it.it_value.tv_nsec + / 1000 / 10 % 100); + } else { + flow_dbg(conn, 
"timer expires in %llu.%03llus", + (unsigned long long)it.it_value.tv_sec, + (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); + } if (timerfd_settime(conn->timer, 0, &it, NULL)) - flow_err(conn, "failed to set timer: %s", strerror_(errno)); + flow_perror(conn, "failed to set timer"); } /** @@ -650,12 +733,13 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, flow_dbg(conn, "%s", num == -1 ? "CLOSED" : tcp_event_str[num]); - if (event == CLOSED) - flow_hash_remove(c, TAP_SIDX(conn)); - else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) + if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) { conn_flag(c, conn, ACTIVE_CLOSE); - else + } else { + if (event == CLOSED) + flow_hash_remove(c, TAP_SIDX(conn)); tcp_epoll_ctl(c, conn); + } if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) tcp_timer_ctl(c, conn); @@ -714,7 +798,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, } /** - * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage) + * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.75 usage) * @conn: Connection pointer */ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) @@ -729,34 +813,12 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) return; } - v = sndbuf; - if (v >= SNDBUF_BIG) - v /= 2; - else if (v > SNDBUF_SMALL) - v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2; + v = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL, SNDBUF_BIG, 75); SNDBUF_SET(conn, MIN(INT_MAX, v)); } /** - * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values - * @s: Socket, can be -1 to avoid check in the caller - */ -static void tcp_sock_set_bufsize(const struct ctx *c, int s) -{ - int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */ - - if (s == -1) - return; - - if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_RCVBUF to %i", v); - - if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_SNDBUF to %i", v); -} - -/** * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm) * @s: Socket, can be -1 to avoid check in the caller */ @@ -775,7 +837,8 @@ static void tcp_sock_set_nodelay(int s) * @th: TCP header (updated) * @payload: TCP payload */ -void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload) +static void tcp_update_csum(uint32_t psum, struct tcphdr *th, + struct iov_tail *payload) { th->check = 0; psum = csum_unfolded(th, sizeof(*th), psum); @@ -895,8 +958,10 @@ static void tcp_fill_header(struct tcphdr *th, /** * tcp_fill_headers() - Fill 802.3, IP, TCP headers + * @c: Execution context * @conn: Connection pointer * @taph: tap backend specific header + * @eh: Pointer to Ethernet header * @ip4h: Pointer to IPv4 header, or NULL * @ip6h: Pointer to IPv6 header, or NULL * @th: Pointer to TCP header @@ -905,14 +970,15 @@ static void tcp_fill_header(struct tcphdr *th, * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum */ -void tcp_fill_headers(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, +void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, + struct tap_hdr *taph, struct ethhdr *eh, struct iphdr *ip4h, struct ipv6hdr *ip6h, struct tcphdr *th, struct iov_tail *payload, const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); size_t l4len = iov_tail_size(payload) + sizeof(*th); + uint8_t 
*omac = conn->f.tap_omac; size_t l3len = l4len; uint32_t psum = 0; @@ -938,6 +1004,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn, psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, *src4, *dst4); } + eh->h_proto = htons_constant(ETH_P_IP); } if (ip6h) { @@ -951,17 +1018,21 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn, ip6h->version = 6; ip6h->nexthdr = IPPROTO_TCP; - ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf; - ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; - ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; + ip6_set_flow_lbl(ip6h, conn->sock); if (!no_tcp_csum) { psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr, &ip6h->daddr); } + eh->h_proto = htons_constant(ETH_P_IPV6); } + /* Find if neighbour table has a recorded MAC address */ + if (MAC_IS_UNDEF(omac)) + fwd_neigh_mac_get(c, &tapside->oaddr, omac); + eth_update_mac(eh, NULL, omac); + tcp_fill_header(th, conn, seq); if (no_tcp_csum) @@ -969,7 +1040,36 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn, else tcp_update_csum(psum, th, payload); - tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); + tap_hdr_update(taph, MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN)); +} + +/** + * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning + * @conn: Connection pointer + * @tinfo: tcp_info from kernel, must be pre-fetched + * + * Return: increased sending buffer to use as a limit for advertised window + */ +static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn, + const struct tcp_info_linux *tinfo) +{ + unsigned long bytes_rtt_product; + + if (!bytes_acked_cap) + return SNDBUF_GET(conn); + + /* This is *not* a bandwidth-delay product, but it's somewhat related: + * as we send more data (usually at the beginning of a connection), we + * try to make the sending buffer progressively grow, with the RTT as a + * factor (longer delay, bigger buffer needed). + */ + bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked * + tinfo->tcpi_rtt / 1000 / 1000; + + return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product, + SNDBUF_BOOST_BYTES_RTT_LO, + SNDBUF_BOOST_BYTES_RTT_HI, + SNDBUF_BOOST_FACTOR); } /** @@ -980,6 +1080,8 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn, * @tinfo: tcp_info from kernel, can be NULL if not pre-fetched * * Return: 1 if sequence or window were updated, 0 otherwise + * + * #syscalls ioctl */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, bool force_seq, struct tcp_info_linux *tinfo) @@ -990,32 +1092,75 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, socklen_t sl = sizeof(*tinfo); struct tcp_info_linux tinfo_new; uint32_t new_wnd_to_tap = prev_wnd_to_tap; + bool ack_everything = true; int s = conn->sock; - if (!bytes_acked_cap) { - conn->seq_ack_to_tap = conn->seq_from_tap; - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) - conn->seq_ack_to_tap = prev_ack_to_tap; - } else { - if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || - tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) || - (conn->flags & LOCAL) || force_seq) { - conn->seq_ack_to_tap = conn->seq_from_tap; - } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { - if (!tinfo) { - tinfo = &tinfo_new; - if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) - return 0; - } + /* At this point we could ack all the data we've accepted for forwarding + * (seq_from_tap). When possible, however, we want to only acknowledge + * what the peer has acknowledged. 
This makes it appear to the guest + * more like a direct connection to the peer, and may improve flow + * control behaviour. + * + * For it to be possible and worth it we need: + * - The TCP_INFO Linux extensions which give us the peer acked bytes + * and the delivery rate (outbound bandwidth at receiver) + * - Not to be told not to (force_seq) + * - Not half-closed in the peer->guest direction + * With no data coming from the peer, we might not get events which + * would prompt us to recheck bytes_acked. We could poll on a + * timer, but that's more trouble than it's worth. + * - Not a host local connection + * Data goes from socket to socket, with nothing meaningfully "in + * flight". + * - Not a pseudo-local connection (e.g. to a VM on the same host) + * If it is, there's not enough in flight to bother. + * - Sending buffer significantly larger than bandwidth * delay product + * Meaning we're not bandwidth-bound and this is likely to be + * interactive traffic where we want to preserve transparent + * connection behaviour and latency. + * + * Otherwise, we probably want to maximise throughput, which needs + * sending buffer auto-tuning, triggered in turn by filling up the + * outbound socket queue. + */ + if (bytes_acked_cap && delivery_rate_cap && !force_seq && + !CONN_IS_CLOSING(conn) && + !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) { + if (!tinfo) { + tinfo = &tinfo_new; + if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) + return 0; + } - conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + - conn->seq_init_from_tap; + /* This trips a cppcheck bug in some versions, including + * cppcheck 2.18.3. + * https://trac.cppcheck.net/ticket/14191 + */ + /* cppcheck-suppress [uninitvar,unmatchedSuppression] */ + if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt * + tinfo->tcpi_delivery_rate / + 1000 / 1000 * + SNDBUF_TO_BW_DELAY_INTERACTIVE) + ack_everything = false; + } - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) - conn->seq_ack_to_tap = prev_ack_to_tap; - } + if (ack_everything) { + /* Fall back to acknowledging everything we got */ + conn->seq_ack_to_tap = conn->seq_from_tap; + } else { + /* cppcheck bug 14191 again, see above */ + /* cppcheck-suppress [uninitvar,unmatchedSuppression] */ + conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + + conn->seq_init_from_tap; } + /* It's occasionally possible for us to go from using the fallback above + * to the tcpi_bytes_acked method. In that case, we must be careful not + * to let our ACKed sequence go backwards. + */ + if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) + conn->seq_ack_to_tap = prev_ack_to_tap; + if (!snd_wnd_cap) { tcp_get_sndbuf(conn); new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); @@ -1037,9 +1182,53 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { new_wnd_to_tap = tinfo->tcpi_snd_wnd; } else { + uint32_t sendq; + int limit; + + if (ioctl(s, SIOCOUTQ, &sendq)) { + debug_perror("SIOCOUTQ on socket %i, assuming 0", s); + sendq = 0; + } tcp_get_sndbuf(conn); - new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, - SNDBUF_GET(conn)); + + if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? 
*/ + limit = 0; + else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn)) + limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq; + else + limit = SNDBUF_GET(conn) - (int)sendq; + + /* If the sender uses mechanisms to prevent Silly Window + * Syndrome (SWS, described in RFC 813 Section 3) it's critical + * that, should the window ever become less than the MSS, we + * advertise a new value once it increases again to be above it. + * + * The mechanism to avoid SWS in the kernel is, implicitly, + * implemented by Nagle's algorithm (which was proposed after + * RFC 813). + * + * To this end, for simplicity, approximate a window value below + * the MSS to zero, as we already have mechanisms in place to + * force updates after the window becomes zero. This matches the + * suggestion from RFC 813, Section 4. + * + * But don't do this if, either: + * + * - there's nothing in the outbound queue: the size of the + * sending buffer is limiting us, and it won't increase if we + * don't send data, so there's no point in waiting, or + * + * - we haven't sent data in a while (somewhat arbitrarily, ten + * times the RTT), as that might indicate that the receiver + * will only process data in batches that are large enough, + * but we won't send enough to fill one because we're stuck + * with pending data in the outbound queue + */ + if (limit < MSS_GET(conn) && sendq && + tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10) + limit = 0; + + new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit); } new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); @@ -1059,6 +1248,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, conn_flag(c, conn, ACK_TO_TAP_DUE); out: + /* Opportunistically store RTT approximation on valid TCP_INFO data */ + if (tinfo) + RTT_SET(conn, tinfo->tcpi_rtt); + return new_wnd_to_tap != prev_wnd_to_tap || conn->seq_ack_to_tap != prev_ack_to_tap; } @@ -1067,7 +1260,7 @@ out: * tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters * @c: Execution context * @conn: Connection pointer - * @seq Current ACK sequence, host order + * @seq: Current ACK sequence, host order */ static void tcp_update_seqack_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, uint32_t seq) @@ -1080,18 +1273,38 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, if (SEQ_LT(seq, conn->seq_to_tap)) conn_flag(c, conn, ACK_FROM_TAP_DUE); - conn->retrans = 0; + conn->retries = 0; conn->seq_ack_from_tap = seq; } } /** + * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK + * @c: Execution context + * @conn: Connection pointer + * + * Return: 0 on success, -1 on failure, with connection reset + */ +static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn) +{ + conn->seq_to_tap = conn->seq_ack_from_tap; + conn->events &= ~TAP_FIN_SENT; + + if (tcp_set_peek_offset(conn, 0)) { + tcp_rst(c, conn); + return -1; + } + + return 0; +} + +/** * tcp_prepare_flags() - Prepare header for flags-only segment (no payload) * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due * @th: TCP header to update - * @data: buffer to store TCP option + * @opts: TCP option buffer (output parameter) * @optlen: size of the TCP option buffer (output parameter) * * Return: < 0 error code on connection reset, @@ -1127,7 +1340,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, if (flags & SYN) { int mss; - if (c->mtu == -1) { + if (!c->mtu) { mss = tinfo.tcpi_snd_mss; } 
else { mss = c->mtu - sizeof(struct tcphdr); @@ -1154,12 +1367,14 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, th->doff = (sizeof(*th) + *optlen) / 4; th->ack = !!(flags & ACK); + th->psh = !!(flags & PSH); th->rst = !!(flags & RST); th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); if (th->ack) { - if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) + if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && + conn->wnd_to_tap) conn_flag(c, conn, ~ACK_TO_TAP_DUE); else conn_flag(c, conn, ACK_TO_TAP_DUE); @@ -1202,8 +1417,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->events == CLOSED) return; - if (!tcp_send_flag(c, conn, RST)) - conn_event(c, conn, CLOSED); + tcp_send_flag(c, conn, RST); + conn_event(c, conn, CLOSED); } /** @@ -1225,30 +1440,41 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, /** * tcp_tap_window_update() - Process an updated window from tap side + * @c: Execution context * @conn: Connection pointer - * @window: Window value, host order, unscaled + * @wnd: Window value, host order, unscaled + * + * Return: false on zero window (not stored to wnd_from_tap), true otherwise */ -static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) +static bool tcp_tap_window_update(const struct ctx *c, + struct tcp_tap_conn *conn, unsigned wnd) { wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); /* Work-around for bug introduced in peer kernel code, commit - * e2142825c120 ("net: tcp: send zero-window ACK when no memory"). - * We don't update if window shrank to zero. + * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't + * update the window if it shrank to zero, so that we'll eventually + * retry to send data, but rewind the sequence as that obviously implies + * that no data beyond the updated window will be acknowledged. */ - if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) - return; + if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) { + tcp_rewind_seq(c, conn); + return false; + } conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); /* FIXME: reflect the tap-side receiver's window back to the sock-side * sender by adjusting SO_RCVBUF? */ + return true; } /** * tcp_init_seq() - Calculate initial sequence number according to RFC 6528 * @hash: Hash of connection details * @now: Current timestamp + * + * Return: the calculated 32-bit initial sequence number. */ static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now) { @@ -1278,12 +1504,11 @@ int tcp_conn_pool_sock(int pool[]) /** * tcp_conn_new_sock() - Open and prepare new socket for connection - * @c: Execution context * @af: Address family * * Return: socket number on success, negative code if socket creation failed */ -static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) +static int tcp_conn_new_sock(sa_family_t af) { int s; @@ -1297,7 +1522,6 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) if (s < 0) return -errno; - tcp_sock_set_bufsize(c, s); tcp_sock_set_nodelay(s); return s; @@ -1305,12 +1529,11 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) /** * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace - * @c: Execution context * @af: Address family (AF_INET or AF_INET6) * - * Return: Socket fd on success, -errno on failure + * Return: socket fd on success, -errno on failure */ -int tcp_conn_sock(const struct ctx *c, sa_family_t af) +int tcp_conn_sock(sa_family_t af) { int *pool = af == AF_INET6 ? 
init_sock_pool6 : init_sock_pool4; int s; @@ -1321,7 +1544,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af) /* If the pool is empty we just open a new one without refilling the * pool to keep latency down. */ - if ((s = tcp_conn_new_sock(c, af)) >= 0) + if ((s = tcp_conn_new_sock(af)) >= 0) return s; err("TCP: Unable to open socket for new connection: %s", @@ -1367,18 +1590,17 @@ static void tcp_bind_outbound(const struct ctx *c, { const struct flowside *tgt = &conn->f.side[TGTSIDE]; union sockaddr_inany bind_sa; - socklen_t sl; - pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->oaddr, tgt->oport); + pif_sockaddr(c, &bind_sa, PIF_HOST, &tgt->oaddr, tgt->oport); if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) { - if (bind(s, &bind_sa.sa, sl)) { + if (bind(s, &bind_sa.sa, socklen_inany(&bind_sa))) { char sstr[INANY_ADDRSTRLEN]; - flow_dbg(conn, - "Can't bind TCP outbound socket to %s:%hu: %s", - inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), - tgt->oport, strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind TCP outbound socket to %s:%hu", + inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), + tgt->oport); } } @@ -1387,9 +1609,9 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip4.ifname_out, strlen(c->ip4.ifname_out))) { - flow_dbg(conn, "Can't bind IPv4 TCP socket to" - " interface %s: %s", c->ip4.ifname_out, - strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind IPv4 TCP socket to interface %s", + c->ip4.ifname_out); } } } else if (bind_sa.sa_family == AF_INET6) { @@ -1397,9 +1619,9 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip6.ifname_out, strlen(c->ip6.ifname_out))) { - flow_dbg(conn, "Can't bind IPv6 TCP socket to" - " interface %s: %s", c->ip6.ifname_out, - strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind IPv6 TCP socket to interface %s", + c->ip6.ifname_out); } } } @@ -1432,7 +1654,6 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, union flow *flow; int s = -1, mss; uint64_t hash; - socklen_t sl; if (!(flow = flow_alloc())) return; @@ -1462,10 +1683,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, goto cancel; } - if ((s = tcp_conn_sock(c, af)) < 0) + if ((s = tcp_conn_sock(af)) < 0) goto cancel; - pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport); + pif_sockaddr(c, &sa, PIF_HOST, &tgt->eaddr, tgt->eport); /* Use bind() to check if the target address is local (EADDRINUSE or * similar) and already bound, and set the LOCAL flag in that case. @@ -1477,18 +1698,19 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, * * So, if bind() succeeds, close the socket, get a new one, and proceed. 
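 *
 * That is, the three possible outcomes of the bind() below are:
 * - failure with EADDRNOTAVAIL or EACCES: non-local destination, use
 *   this socket as it is
 * - failure with any other error (e.g. EADDRINUSE): local and already
 *   bound destination, so the LOCAL flag is set
 * - success: local but unbound destination, inconclusive: the socket
 *   is closed and replaced before connecting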
*/ - if (bind(s, &sa.sa, sl)) { + if (bind(s, &sa.sa, socklen_inany(&sa))) { if (errno != EADDRNOTAVAIL && errno != EACCES) conn_flag(c, conn, LOCAL); } else { /* Not a local, bound destination, inconclusive test */ close(s); - if ((s = tcp_conn_sock(c, af)) < 0) + if ((s = tcp_conn_sock(af)) < 0) goto cancel; } conn->sock = s; conn->timer = -1; + conn->listening_sock = -1; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; @@ -1516,7 +1738,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, tcp_bind_outbound(c, conn, s); - if (connect(s, &sa.sa, sl)) { + if (connect(s, &sa.sa, socklen_inany(&sa))) { if (errno != EINPROGRESS) { tcp_rst(c, conn); goto cancel; @@ -1535,13 +1757,11 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, tcp_epoll_ctl(c, conn); if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ - sl = sizeof(sa); - if (!getsockname(s, &sa.sa, &sl)) { - inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa); - } else { - err("Failed to get local address for socket: %s", - strerror_(errno)); - } + socklen_t sl = sizeof(sa); + + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0) + err_perror("Can't get local address for socket %i", s); } FLOW_ACTIVATE(conn); @@ -1604,6 +1824,23 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } /** + * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet + * @th: Pointer to TCP header + * @l4len: TCP packet length, including TCP header + * + * Return: data length of TCP packet, -1 on invalid value of Data Offset field + */ +static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len) +{ + size_t off = th->doff * 4UL; + + if (off < sizeof(*th) || off > l4len) + return -1; + + return l4len - off; +} + +/** * tcp_data_from_tap() - tap/guest data for established connection * @c: Execution context * @conn: Connection pointer @@ -1632,16 +1869,22 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, for (i = idx, iov_i = 0; i < (int)p->count; i++) { uint32_t seq, seq_offset, ack_seq; + struct tcphdr th_storage; const struct tcphdr *th; - char *data; - size_t off; + struct iov_tail data; + size_t off, size; + int count; + + if (!packet_get(p, i, &data)) + return -1; - th = packet_get(p, i, 0, sizeof(*th), &len); + th = IOV_PEEK_HEADER(&data, th_storage); if (!th) return -1; - len += sizeof(*th); + len = iov_tail_size(&data); off = th->doff * 4UL; + if (off < sizeof(*th) || off > len) return -1; @@ -1651,9 +1894,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, } len -= off; - data = packet_get(p, i, off, len, NULL); - if (!data) - continue; + iov_drop_header(&data, off); seq = ntohl(th->seq); if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) { @@ -1664,8 +1905,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp_send_flag(c, conn, ACK); tcp_timer_ctl(c, conn); - if (p->count == 1) + if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE, + &((int){ 1 }), sizeof(int))) + flow_trace(conn, "failed to set SO_KEEPALIVE"); + + if (p->count == 1) { + tcp_tap_window_update(c, conn, + ntohs(th->window)); return 1; + } continue; } @@ -1682,12 +1930,21 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, ack_seq == max_ack_seq && ntohs(th->window) == max_ack_seq_wnd; + /* See tcp_tap_window_update() for details. 
On + * top of that, we also need to check here if a + * zero-window update is contained in a batch of + * packets that includes a non-zero window as + * well. + */ + if (!ntohs(th->window)) + tcp_rewind_seq(c, conn); + max_ack_seq_wnd = ntohs(th->window); max_ack_seq = ack_seq; } } - if (th->fin) + if (th->fin && seq == seq_from_tap) fin = 1; if (!len) @@ -1725,10 +1982,14 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, continue; } - tcp_iov[iov_i].iov_base = data + seq_offset; - tcp_iov[iov_i].iov_len = len - seq_offset; - seq_from_tap += tcp_iov[iov_i].iov_len; - iov_i++; + iov_drop_header(&data, seq_offset); + size = len - seq_offset; + count = iov_tail_clone(&tcp_iov[iov_i], UIO_MAXIOV - iov_i, + &data); + if (count < 0) + break; + seq_from_tap += size; + iov_i += count; if (keep == i) keep = -1; @@ -1741,17 +2002,16 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, if (ack && !tcp_sock_consume(conn, max_ack_seq)) tcp_update_seqack_from_tap(c, conn, max_ack_seq); - tcp_tap_window_update(conn, max_ack_seq_wnd); + tcp_tap_window_update(c, conn, max_ack_seq_wnd); if (retr) { flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", - max_ack_seq, conn->seq_to_tap); - conn->seq_to_tap = max_ack_seq; - if (tcp_set_peek_offset(conn->sock, 0)) { - tcp_rst(c, conn); + conn->seq_ack_from_tap, conn->seq_to_tap); + + if (tcp_rewind_seq(c, conn)) return -1; - } + tcp_data_from_sock(c, conn); } @@ -1775,23 +2035,20 @@ eintr: goto eintr; if (errno == EAGAIN || errno == EWOULDBLOCK) { - tcp_send_flag(c, conn, ACK_IF_NEEDED); + tcp_send_flag(c, conn, ACK | DUP_ACK); return p->count - idx; } return -1; } - if (n < (int)(seq_from_tap - conn->seq_from_tap)) { + if (n < (int)(seq_from_tap - conn->seq_from_tap)) partial_send = 1; - conn->seq_from_tap += n; - tcp_send_flag(c, conn, ACK_IF_NEEDED); - } else { - conn->seq_from_tap += n; - } + + conn->seq_from_tap += n; out: - if (keep != -1) { + if (keep != -1 || partial_send) { /* We use an 8-bit approximation here: the associated risk is * that we skip a duplicate ACK on 8-bit sequence number * collision. Fast retransmit is a SHOULD in RFC 5681, 3.2. 
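As an aside, a minimal sketch of the 8-bit approximation mentioned in that comment; the struct and field names here are illustrative only, not necessarily the flow table's actual layout:

#include <stdint.h>
#include <stdbool.h>

/* Sketch: one-byte duplicate-ACK tracker per connection */
struct dup_ack_state {
	uint8_t seq_dup_ack_approx;	/* low 8 bits of last DUP_ACK position */
};

/* Return true if a duplicate ACK should be sent for this position */
static bool dup_ack_needed(struct dup_ack_state *s, uint32_t seq_from_tap)
{
	if (s->seq_dup_ack_approx == (uint8_t)(seq_from_tap & 0xff))
		return false;	/* already signalled, or an 8-bit collision:
				 * skipping one duplicate ACK is acceptable,
				 * as the retransmission timer still covers
				 * the loss */

	s->seq_dup_ack_approx = seq_from_tap & 0xff;
	return true;		/* caller would send ACK | DUP_ACK */
}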
@@ -1831,7 +2088,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, const struct tcphdr *th, const char *opts, size_t optlen) { - tcp_tap_window_update(conn, ntohs(th->window)); + tcp_tap_window_update(c, conn, ntohs(th->window)); tcp_get_tap_ws(conn, opts, optlen); /* First value is not scaled */ @@ -1845,7 +2102,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn->sock, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); return; } @@ -1859,12 +2116,84 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, } /** + * tcp_rst_no_conn() - Send RST in response to a packet with no connection + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @saddr: Source address of the packet we're responding to + * @daddr: Destination address of the packet we're responding to + * @flow_lbl: IPv6 flow label (ignored for IPv4) + * @th: TCP header of the packet we're responding to + * @l4len: Packet length, including TCP header + */ +static void tcp_rst_no_conn(const struct ctx *c, int af, + const void *saddr, const void *daddr, + uint32_t flow_lbl, + const struct tcphdr *th, size_t l4len) +{ + struct iov_tail payload = IOV_TAIL(NULL, 0, 0); + struct tcphdr *rsth; + char buf[USHRT_MAX]; + uint32_t psum = 0; + size_t rst_l2len; + + /* Don't respond to RSTs without a connection */ + if (th->rst) + return; + + if (af == AF_INET) { + struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, + ETH_P_IP); + const struct in_addr *rst_src = daddr; + const struct in_addr *rst_dst = saddr; + + rsth = tap_push_ip4h(ip4h, *rst_src, *rst_dst, + sizeof(*rsth), IPPROTO_TCP); + psum = proto_ipv4_header_psum(sizeof(*rsth), IPPROTO_TCP, + *rst_src, *rst_dst); + + } else { + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, + ETH_P_IPV6); + const struct in6_addr *rst_src = daddr; + const struct in6_addr *rst_dst = saddr; + + rsth = tap_push_ip6h(ip6h, rst_src, rst_dst, + sizeof(*rsth), IPPROTO_TCP, flow_lbl); + psum = proto_ipv6_header_psum(sizeof(*rsth), IPPROTO_TCP, + rst_src, rst_dst); + } + + memset(rsth, 0, sizeof(*rsth)); + + rsth->source = th->dest; + rsth->dest = th->source; + rsth->rst = 1; + rsth->doff = sizeof(*rsth) / 4UL; + + /* Sequence matching logic from RFC 9293 section 3.10.7.1 */ + if (th->ack) { + rsth->seq = th->ack_seq; + } else { + size_t dlen = l4len - th->doff * 4UL; + uint32_t ack = ntohl(th->seq) + dlen; + + rsth->ack_seq = htonl(ack); + rsth->ack = 1; + } + + tcp_update_csum(psum, rsth, &payload); + rst_l2len = ((char *)rsth - buf) + sizeof(*rsth); + tap_send_single(c, buf, rst_l2len); +} + +/** * tcp_tap_handler() - Handle packets from tap and state transitions * @c: Execution context * @pif: pif on which the packet is arriving * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address + * @flow_lbl: IPv6 flow label (ignored for IPv4) * @p: Pool of TCP packets, with TCP headers * @idx: Index of first packet in pool to process * @now: Current timestamp @@ -1872,12 +2201,15 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, * Return: count of consumed packets */ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - const void *saddr, const void *daddr, + const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now) { struct tcp_tap_conn *conn; + struct tcphdr th_storage; const struct tcphdr *th; - 
size_t optlen, len; + char optsc[OPTLEN_MAX]; + struct iov_tail data; + size_t optlen, l4len; const char *opts; union flow *flow; flow_sidx_t sidx; @@ -1886,15 +2218,19 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, (void)pif; - th = packet_get(p, idx, 0, sizeof(*th), &len); + if (!packet_get(p, idx, &data)) + return 1; + + l4len = iov_tail_size(&data); + + th = IOV_REMOVE_HEADER(&data, th_storage); if (!th) return 1; - len += sizeof(*th); optlen = th->doff * 4UL - sizeof(*th); /* Static checkers might fail to see this: */ - optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); - opts = packet_get(p, idx, sizeof(*th), optlen, NULL); + optlen = MIN(optlen, OPTLEN_MAX); + opts = (char *)iov_remove_header_(&data, &optsc[0], optlen, 1); sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr, ntohs(th->source), ntohs(th->dest)); @@ -1905,6 +2241,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (opts && th->syn && !th->ack) tcp_conn_from_tap(c, af, saddr, daddr, th, opts, optlen, now); + else + tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, l4len); return 1; } @@ -1912,7 +2250,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, ASSERT(pif_at_sidx(sidx) == PIF_TAP); conn = &flow->tcp; - flow_trace(conn, "packet length %zu from tap", len); + flow_trace(conn, "packet length %zu from tap", l4len); if (th->rst) { conn_event(c, conn, CLOSED); @@ -1941,7 +2279,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn->sock, 0)) + if (tcp_set_peek_offset(conn, 0)) goto reset; if (th->fin) { @@ -1957,9 +2295,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (!th->ack) goto reset; - tcp_tap_window_update(conn, ntohs(th->window)); - - tcp_data_from_sock(c, conn); + if (tcp_tap_window_update(c, conn, ntohs(th->window))) + tcp_data_from_sock(c, conn); if (p->count - idx == 1) return 1; @@ -1967,11 +2304,44 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { - tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + size_t dlen; + bool retr; - if (conn->events & SOCK_FIN_RCVD && - conn->seq_ack_from_tap == conn->seq_to_tap) - conn_event(c, conn, CLOSED); + if ((dlen = tcp_packet_data_len(th, l4len))) { + flow_dbg(conn, "data segment in CLOSE-WAIT (%zu B)", + dlen); + } + + retr = th->ack && !th->fin && + ntohl(th->ack_seq) == conn->seq_ack_from_tap && + ntohs(th->window) == conn->wnd_from_tap; + + /* On socket flush failure, pretend there was no ACK, try again + * later + */ + if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq))) + tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + + if (retr) { + flow_trace(conn, + "fast re-transmit, ACK: %u, previous sequence: %u", + ntohl(th->ack_seq), conn->seq_to_tap); + + if (tcp_rewind_seq(c, conn)) + return -1; + } + + if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr) + tcp_data_from_sock(c, conn); + + if (conn->seq_ack_from_tap == conn->seq_to_tap) { + if (th->ack && conn->events & TAP_FIN_SENT) + conn_event(c, conn, TAP_FIN_ACKED); + + if (conn->events & SOCK_FIN_RCVD && + conn->events & TAP_FIN_ACKED) + conn_event(c, conn, CLOSED); + } return 1; } @@ -1987,10 +2357,27 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, ack_due = 1; if ((conn->events & TAP_FIN_RCVD) && 
!(conn->events & SOCK_FIN_SENT)) { + socklen_t sl; + struct tcp_info tinfo; + shutdown(conn->sock, SHUT_WR); conn_event(c, conn, SOCK_FIN_SENT); tcp_send_flag(c, conn, ACK); ack_due = 0; + + /* If we received a FIN, but the socket is in TCP_ESTABLISHED + * state, it must be a migrated socket. The kernel saw the FIN + * on the source socket, but not on the target socket. + * + * Approximate the effect of that FIN: as we're sending a FIN + * out ourselves, the socket is now in a state equivalent to + * LAST_ACK. Now that we sent the FIN out, close it with a RST. + */ + sl = sizeof(tinfo); + getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl); + if (tinfo.tcpi_state == TCP_ESTABLISHED && + conn->events & SOCK_FIN_RCVD) + goto reset; } if (ack_due) @@ -2073,9 +2460,10 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { - const struct flowside *ini; + struct tcp_tap_conn *conn; union sockaddr_inany sa; socklen_t sl = sizeof(sa); + struct flowside *ini; union flow *flow; int s; @@ -2088,14 +2476,21 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; - tcp_sock_set_bufsize(c, s); + conn = (struct tcp_tap_conn *)flow; + conn->listening_sock = ref.fd; + tcp_sock_set_nodelay(s); - /* FIXME: When listening port has a specific bound address, record that - * as our address + /* FIXME: If useful: when the listening port has a specific bound + * address, record that as our address, as implemented for vhost-user + * mode only, below. */ ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, - ref.tcp_listen.port); + NULL, ref.tcp_listen.port); + + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0) + err_perror("Can't get local address for socket %i", s); if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { char sastr[SOCKADDR_STRLEN]; @@ -2136,7 +2531,9 @@ cancel: * @c: Execution context * @ref: epoll reference of timer (not connection) * - * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 + * #syscalls timerfd_gettime|timerfd_gettime64 + * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64 + * #syscalls arm:timerfd_settime64 i686:timerfd_settime64 */ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) { @@ -2151,7 +2548,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * and we just set the timer to a new point in the future: discard it. 
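 *
 * (timerfd_gettime() reports the time remaining until the next
 * expiration: a non-zero it_value below therefore means the timer was
 * re-armed after this event was queued, and the expiration is stale.)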
*/ if (timerfd_gettime(conn->timer, &check_armed)) - flow_err(conn, "failed to read timer: %s", strerror_(errno)); + flow_perror(conn, "failed to read timer"); if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) return; @@ -2161,24 +2558,37 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) tcp_timer_ctl(c, conn); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { - flow_dbg(conn, "handshake timeout"); - tcp_rst(c, conn); + int max; + max = c->tcp.syn_retries + c->tcp.syn_linear_timeouts; + max = MIN(TCP_MAX_RETRIES, max); + if (conn->retries >= max) { + flow_dbg(conn, "handshake timeout"); + tcp_rst(c, conn); + } else { + flow_trace(conn, "SYN timeout, retry"); + tcp_send_flag(c, conn, SYN); + conn->retries++; + conn_flag(c, conn, SYN_RETRIED); + tcp_timer_ctl(c, conn); + } } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { flow_dbg(conn, "FIN timeout"); tcp_rst(c, conn); - } else if (conn->retrans == TCP_MAX_RETRANS) { + } else if (conn->retries == TCP_MAX_RETRIES) { flow_dbg(conn, "retransmissions count exceeded"); tcp_rst(c, conn); } else { flow_dbg(conn, "ACK timeout, retry"); - conn->retrans++; - conn->seq_to_tap = conn->seq_ack_from_tap; - if (tcp_set_peek_offset(conn->sock, 0)) { - tcp_rst(c, conn); - } else { - tcp_data_from_sock(c, conn); - tcp_timer_ctl(c, conn); - } + + if (!conn->wnd_from_tap) + conn->wnd_from_tap = 1; /* Zero-window probe */ + + conn->retries++; + if (tcp_rewind_seq(c, conn)) + return; + + tcp_data_from_sock(c, conn); + tcp_timer_ctl(c, conn); } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -2191,8 +2601,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. */ if (timerfd_settime(conn->timer, 0, &new, &old)) - flow_err(conn, "failed to set timer: %s", - strerror_(errno)); + flow_perror(conn, "failed to set timer"); if (old.it_value.tv_sec == ACT_TIMEOUT) { flow_dbg(conn, "activity timeout"); @@ -2223,7 +2632,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, return; } - if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) { + if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) { conn_event(c, conn, CLOSED); return; } @@ -2264,136 +2673,88 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, } /** - * tcp_sock_init_one() - Initialise listening socket for address and port + * tcp_listen() - Create listening socket * @c: Execution context - * @addr: Pointer to address for binding, NULL for dual stack any - * @ifname: Name of interface to bind to, NULL if not configured + * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE) + * @addr: Pointer to address for binding, NULL for any + * @ifname: Name of interface to bind to, NULL for any * @port: Port, host order * - * Return: fd for the new listening socket, negative error code on failure + * Return: 0 on success, negative error code on failure */ -static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr, - const char *ifname, in_port_t port) +int tcp_listen(const struct ctx *c, uint8_t pif, + const union inany_addr *addr, const char *ifname, in_port_t port) { union tcp_listen_epoll_ref tref = { .port = port, - .pif = PIF_HOST, + .pif = pif, }; + const struct fwd_ports *fwd; + int (*socks)[IP_VERSIONS]; int s; - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr, - ifname, port, tref.u32); - - if (c->tcp.fwd_in.mode == FWD_AUTO) { - if (!addr || inany_v4(addr)) - 
tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s; - if (!addr || !inany_v4(addr)) - tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s; - } - - if (s < 0) - return s; - - return s; -} - -/** - * tcp_sock_init() - Create listening sockets for a given host ("inbound") port - * @c: Execution context - * @addr: Pointer to address for binding, NULL if not configured - * @ifname: Name of interface to bind to, NULL if not configured - * @port: Port, host order - * - * Return: 0 on (partial) success, negative error code on (complete) failure - */ -int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, - const char *ifname, in_port_t port) -{ - int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; - ASSERT(!c->no_tcp); - if (!addr && c->ifi4 && c->ifi6) - /* Attempt to get a dual stack socket */ - if (tcp_sock_init_one(c, NULL, ifname, port) >= 0) + if (!c->ifi4) { + if (!addr) + /* Restrict to v6 only */ + addr = &inany_any6; + else if (inany_v4(addr)) + /* Nothing to do */ return 0; + } + if (!c->ifi6) { + if (!addr) + /* Restrict to v4 only */ + addr = &inany_any4; + else if (!inany_v4(addr)) + /* Nothing to do */ + return 0; + } - /* Otherwise create a socket per IP version */ - if ((!addr || inany_v4(addr)) && c->ifi4) - r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4, - ifname, port); - - if ((!addr || !inany_v4(addr)) && c->ifi6) - r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6, - ifname, port); - - if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) - return 0; - - return r4 < 0 ? r4 : r6; -} - -/** - * tcp_ns_sock_init4() - Init socket to listen for outbound IPv4 connections - * @c: Execution context - * @port: Port, host order - */ -static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) -{ - union tcp_listen_epoll_ref tref = { - .port = port, - .pif = PIF_SPLICE, - }; - int s; - - ASSERT(c->mode == MODE_PASTA); - - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4, - NULL, port, tref.u32); - if (s < 0) - s = -1; + if (pif == PIF_HOST) { + fwd = &c->tcp.fwd_in; + socks = tcp_sock_init_ext; + } else { + ASSERT(pif == PIF_SPLICE); + fwd = &c->tcp.fwd_out; + socks = tcp_sock_ns; + } - if (c->tcp.fwd_out.mode == FWD_AUTO) - tcp_sock_ns[port][V4] = s; -} + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname, + port, tref.u32); -/** - * tcp_ns_sock_init6() - Init socket to listen for outbound IPv6 connections - * @c: Execution context - * @port: Port, host order - */ -static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) -{ - union tcp_listen_epoll_ref tref = { - .port = port, - .pif = PIF_SPLICE, - }; - int s; - - ASSERT(c->mode == MODE_PASTA); + if (fwd->mode == FWD_AUTO) { + if (!addr || inany_v4(addr)) + socks[port][V4] = s < 0 ? -1 : s; + if (!addr || !inany_v4(addr)) + socks[port][V6] = s < 0 ? 
-1 : s; + } - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6, - NULL, port, tref.u32); if (s < 0) - s = -1; + return s; - if (c->tcp.fwd_out.mode == FWD_AUTO) - tcp_sock_ns[port][V6] = s; + return 0; } /** - * tcp_ns_sock_init() - Init socket to listen for spliced outbound connections + * tcp_ns_listen() - Init socket to listen for spliced outbound connections * @c: Execution context * @port: Port, host order */ -void tcp_ns_sock_init(const struct ctx *c, in_port_t port) +static void tcp_ns_listen(const struct ctx *c, in_port_t port) { ASSERT(!c->no_tcp); + if (!c->no_bindtodevice) { + tcp_listen(c, PIF_SPLICE, NULL, "lo", port); + return; + } + if (c->ifi4) - tcp_ns_sock_init4(c, port); + tcp_listen(c, PIF_SPLICE, &inany_loopback4, NULL, port); if (c->ifi6) - tcp_ns_sock_init6(c, port); + tcp_listen(c, PIF_SPLICE, &inany_loopback6, NULL, port); } /** @@ -2414,7 +2775,7 @@ static int tcp_ns_socks_init(void *arg) if (!bitmap_isset(c->tcp.fwd_out.map, port)) continue; - tcp_ns_sock_init(c, port); + tcp_ns_listen(c, port); } return 0; @@ -2422,13 +2783,12 @@ static int tcp_ns_socks_init(void *arg) /** * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets - * @c: Execution context * @pool: Pool of sockets to refill * @af: Address family to use * * Return: 0 on success, negative error code if there was at least one error */ -int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) +int tcp_sock_refill_pool(int pool[], sa_family_t af) { int i; @@ -2438,7 +2798,7 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) if (pool[i] >= 0) continue; - if ((fd = tcp_conn_new_sock(c, af)) < 0) + if ((fd = tcp_conn_new_sock(af)) < 0) return fd; pool[i] = fd; @@ -2454,13 +2814,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) static void tcp_sock_refill_init(const struct ctx *c) { if (c->ifi4) { - int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET); + int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 host socket pool: %s", strerror_(-rc)); } if (c->ifi6) { - int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6); + int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 host socket pool: %s", strerror_(-rc)); @@ -2493,7 +2853,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) /** * tcp_probe_tcp_info() - Check what data TCP_INFO reports * - * Return: Number of bytes returned by TCP_INFO getsockopt() + * Return: number of bytes returned by TCP_INFO getsockopt() */ static socklen_t tcp_probe_tcp_info(void) { @@ -2519,6 +2879,31 @@ static socklen_t tcp_probe_tcp_info(void) } /** + * tcp_get_rto_params() - Get host kernel RTO parameters + * @c: Execution context + */ +static void tcp_get_rto_params(struct ctx *c) +{ + intmax_t v; + + v = read_file_integer(SYN_RETRIES, SYN_RETRIES_DEFAULT); + c->tcp.syn_retries = MIN(v, MAX_SYNCNT); + + v = read_file_integer(SYN_LINEAR_TIMEOUTS, SYN_LINEAR_TIMEOUTS_DEFAULT); + c->tcp.syn_linear_timeouts = MIN(v, MAX_SYNCNT); + + v = read_file_integer(RTO_MAX_MS, (intmax_t)(RTO_MAX_DEFAULT * 1000)); + c->tcp.rto_max = MIN(DIV_ROUND_UP(v, 1000), INT_MAX); + + debug("Using TCP RTO parameters, syn_retries: %"PRIu8 + ", syn_linear_timeouts: %"PRIu8 + ", rto_max: %d", + c->tcp.syn_retries, + c->tcp.syn_linear_timeouts, + c->tcp.rto_max); +} + +/** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context * @@ -2528,6 
@@ -2528,6 +2913,8 @@ int tcp_init(struct ctx *c)
 {
 	ASSERT(!c->no_tcp);
 
+	tcp_get_rto_params(c);
+
 	tcp_sock_iov_init(c);
 
 	memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
@@ -2550,7 +2937,7 @@ int tcp_init(struct ctx *c)
 	tcp_info_size = tcp_probe_tcp_info();
 
 #define dbg_tcpi(f_)	debug("TCP_INFO tcpi_%s field%s supported",	\
-			      STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+			      STRINGIFY(f_), tcp_info_cap(f_) ? "" : " not")
 	dbg_tcpi(snd_wnd);
 	dbg_tcpi(bytes_acked);
 	dbg_tcpi(min_rtt);
@@ -2569,7 +2956,6 @@ int tcp_init(struct ctx *c)
 static void tcp_port_rebind(struct ctx *c, bool outbound)
 {
 	const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map;
-	const uint8_t *rmap = outbound ? c->tcp.fwd_in.map : c->tcp.fwd_out.map;
 	int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext;
 	unsigned port;
@@ -2588,16 +2974,12 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
 			continue;
 		}
 
-		/* Don't loop back our own ports */
-		if (bitmap_isset(rmap, port))
-			continue;
-
 		if ((c->ifi4 && socks[port][V4] == -1) ||
 		    (c->ifi6 && socks[port][V6] == -1)) {
 			if (outbound)
-				tcp_ns_sock_init(c, port);
+				tcp_ns_listen(c, port);
 			else
-				tcp_sock_init(c, NULL, NULL, port);
+				tcp_listen(c, PIF_HOST, NULL, NULL, port);
 		}
 	}
 }
@@ -2621,27 +3003,1080 @@ static int tcp_port_rebind_outbound(void *arg)
 }
 
 /**
+ * tcp_port_rebind_all() - Rebind ports to match forward maps (in host & ns)
+ * @c:		Execution context
+ */
+void tcp_port_rebind_all(struct ctx *c)
+{
+	ASSERT(c->mode == MODE_PASTA && !c->no_tcp);
+
+	if (c->tcp.fwd_out.mode == FWD_AUTO)
+		NS_CALL(tcp_port_rebind_outbound, c);
+
+	if (c->tcp.fwd_in.mode == FWD_AUTO)
+		tcp_port_rebind(c, false);
+}
+
+/**
- * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill
+ * tcp_timer() - Periodic tasks: refill socket pools
  * @c:		Execution context
  * @now:	Current timestamp
  */
-void tcp_timer(struct ctx *c, const struct timespec *now)
+void tcp_timer(const struct ctx *c, const struct timespec *now)
 {
 	(void)now;
 
-	if (c->mode == MODE_PASTA) {
-		if (c->tcp.fwd_out.mode == FWD_AUTO) {
-			fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
-			NS_CALL(tcp_port_rebind_outbound, c);
-		}
-
-		if (c->tcp.fwd_in.mode == FWD_AUTO) {
-			fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
-			tcp_port_rebind(c, false);
-		}
-	}
-
 	tcp_sock_refill_init(c);
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
 }
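The flow migration code added below drives the kernel's TCP_REPAIR facility through repair_set() and the repair helper (see repair.h in the includes). At the socket level, entering and leaving repair mode reduces to a setsockopt() toggle, roughly as in this sketch (requires CAP_NET_ADMIN; an illustration, not the project's brokered implementation):

#include <errno.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Toggle repair mode on a TCP socket.
 * @on: TCP_REPAIR_ON or TCP_REPAIR_OFF
 */
static int set_repair(int s, int on)
{
	if (setsockopt(s, SOL_TCP, TCP_REPAIR, &on, sizeof(on)))
		return -errno;

	return 0;
}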
 
+/**
+ * tcp_flow_is_established() - Was the connection established? Includes closing
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: true if the connection was established, false otherwise
+ */
+bool tcp_flow_is_established(const struct tcp_tap_conn *conn)
+{
+	return conn->events & ESTABLISHED;
+}
+
+/**
+ * tcp_flow_repair_on() - Enable repair mode for a single TCP flow
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
+
+	if (conn->sock < 0)
+		return 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON)))
+		err("Failed to set TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_off() - Clear repair mode for a single TCP flow
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
+
+	if (conn->sock < 0)
+		return 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+		err("Failed to clear TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn,
+			       struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_info tinfo;
+	socklen_t sl;
+
+	sl = sizeof(tinfo);
+	if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Querying TCP_INFO");
+		return rc;
+	}
+
+	t->snd_ws = tinfo.tcpi_snd_wscale;
+	t->rcv_ws = tinfo.tcpi_rcv_wscale;
+	t->tcpi_state = tinfo.tcpi_state;
+	t->tcpi_options = tinfo.tcpi_options;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
+{
+	int val;
+	socklen_t sl = sizeof(val);
+
+	if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Getting MSS");
+		return rc;
+	}
+
+	t->mss = (uint32_t)val;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data (tcpi_options must be populated)
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn,
+				   struct tcp_tap_transfer_ext *t)
+{
+	int val = 0;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		socklen_t sl = sizeof(val);
+
+		if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) {
+			int rc = -errno;
+			flow_perror(conn, "Getting RFC 7323 timestamp");
+			return rc;
+		}
+	}
+
+	t->timestamp = (uint32_t)val;
+	return 0;
+}
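For illustration, the TCP_INFO query pattern used by tcp_flow_dump_tinfo() above, reduced to a standalone sketch (hypothetical helper; only the window scaling fields are kept here):

#include <errno.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int dump_window_scaling(int s, unsigned *snd_ws, unsigned *rcv_ws)
{
	struct tcp_info ti;
	socklen_t sl = sizeof(ti);

	if (getsockopt(s, SOL_TCP, TCP_INFO, &ti, &sl))
		return -errno;

	/* RFC 7323 window scaling factors negotiated on this connection */
	*snd_ws = ti.tcpi_snd_wscale;
	*rcv_ws = ti.tcpi_rcv_wscale;
	return 0;
}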
+
+/**
+ * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn,
+				     const struct tcp_tap_transfer_ext *t)
+{
+	int val = (int)t->timestamp;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP,
+			       &val, sizeof(val))) {
+			int rc = -errno;
+			flow_perror(conn, "Setting RFC 7323 timestamp");
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_repair_window wnd;
+	socklen_t sl = sizeof(wnd);
+
+	if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Getting window repair data");
+		return rc;
+	}
+
+	t->snd_wl1 = wnd.snd_wl1;
+	t->snd_wnd = wnd.snd_wnd;
+	t->max_window = wnd.max_window;
+	t->rcv_wnd = wnd.rcv_wnd;
+	t->rcv_wup = wnd.rcv_wup;
+
+	/* If we received a FIN, we also need to adjust window parameters.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK) {
+		t->rcv_wup--;
+		t->rcv_wnd++;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_wnd() - Restore window parameters from extended data
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_repair_window wnd;
+
+	wnd.snd_wl1 = t->snd_wl1;
+	wnd.snd_wnd = t->snd_wnd;
+	wnd.max_window = t->max_window;
+	wnd.rcv_wnd = t->rcv_wnd;
+	wnd.rcv_wup = t->rcv_wup;
+
+	if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW,
+		       &wnd, sizeof(wnd))) {
+		int rc = -errno;
+		flow_perror(conn, "Setting window data");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_select_queue() - Select queue (receive or send) for next operation
+ * @conn:	Connection to select queue for
+ * @queue:	TCP_RECV_QUEUE or TCP_SEND_QUEUE
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue)
+{
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE,
+		       &queue, sizeof(queue))) {
+		int rc = -errno;
+		flow_perror(conn, "Selecting repair queue");
+		return rc;
+	}
+
+	return 0;
+}
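TCP_REPAIR_QUEUE makes the queue selection sticky: every subsequent TCP_QUEUE_SEQ get/set and queue read/write applies to the queue last selected. A condensed sketch of the select-then-dump pattern used by the functions below (hypothetical helper; socket assumed to be in repair mode):

#include <stdint.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int dump_seq_of(int s, int queue, uint32_t *seq)
{
	socklen_t sl = sizeof(*seq);

	/* queue: TCP_SEND_QUEUE or TCP_RECV_QUEUE */
	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)))
		return -1;

	return getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, seq, &sl);
}

/* Usage: dump_seq_of(s, TCP_SEND_QUEUE, &seq_snd); */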
+
+/**
+ * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
+ * @conn:	Connection to dump queue for
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
+{
+	int s = conn->sock;
+	ssize_t rc;
+
+	if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Getting send queue size");
+		return rc;
+	}
+
+	if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Getting not sent count");
+		return rc;
+	}
+
+	/* If we sent a FIN, SIOCOUTQ and SIOCOUTQNSD are one greater than the
+	 * actual pending queue length, because they are based on the sequence
+	 * numbers, not directly on the buffer contents.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->tcpi_state == TCP_FIN_WAIT1 || t->tcpi_state == TCP_FIN_WAIT2 ||
+	    t->tcpi_state == TCP_LAST_ACK || t->tcpi_state == TCP_CLOSING) {
+		if (t->sndq)
+			t->sndq--;
+		if (t->notsent)
+			t->notsent--;
+	}
+
+	if (t->notsent > t->sndq) {
+		flow_err(conn,
+			 "Invalid notsent count socket %i, send: %u, not sent: %u",
+			 s, t->sndq, t->notsent);
+		return -EINVAL;
+	}
+
+	if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
+		flow_err(conn,
+			 "Send queue too large to migrate socket %i: %u bytes",
+			 s, t->sndq);
+		return -ENOBUFS;
+	}
+
+	rc = recv(s, tcp_migrate_snd_queue,
+		  MIN(t->sndq, TCP_MIGRATE_SND_QUEUE_MAX), MSG_PEEK);
+	if (rc < 0) {
+		if (errno == EAGAIN) {	/* EAGAIN means empty */
+			rc = 0;
+		} else {
+			rc = -errno;
+			flow_perror(conn, "Can't read send queue");
+			return rc;
+		}
+	}
+
+	if ((uint32_t)rc < t->sndq) {
+		flow_err(conn, "Short read migrating send queue");
+		return -ENXIO;
+	}
+
+	t->notsent = MIN(t->notsent, t->sndq);
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
+ * @conn:	Connection to repair queue for
+ * @len:	Length of data to be restored
+ * @buf:	Buffer with content of pending data queue
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn,
+				 size_t len, uint8_t *buf)
+{
+	size_t chunk = len;
+	uint8_t *p = buf;
+
+	if (conn->sock < 0) {
+		flow_err(conn, "Invalid socket descriptor for repair queue");
+		return -EBADF;
+	}
+
+	while (len > 0) {
+		ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0);
+
+		if (rc < 0) {
+			if ((errno == ENOBUFS || errno == ENOMEM) &&
+			    chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) {
+				chunk /= 2;
+				continue;
+			}
+
+			rc = -errno;
+			flow_perror(conn, "Can't write queue");
+			return rc;
+		}
+
+		len -= rc;
+		p += rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
+ * @conn:	Pointer to the TCP connection structure
+ * @v:		Sequence value, set on return
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v)
+{
+	socklen_t sl = sizeof(*v);
+
+	if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Dumping sequence");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_seq() - Restore sequence for pre-selected queue
+ * @conn:	Connection to repair sequences for
+ * @v:		Sequence value to be set
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn,
+			       const uint32_t *v)
+{
+	if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+		int rc = -errno;
+		flow_perror(conn, "Setting sequence");
+		return rc;
+	}
+
+	return 0;
+}
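The queue-depth ioctl()s used by the dumps above and below also work outside repair mode; a minimal sketch (hypothetical helper):

#include <sys/ioctl.h>
#include <linux/sockios.h>

static int queue_depths(int s, int *inq, int *outq, int *notsent)
{
	if (ioctl(s, SIOCINQ, inq) ||		/* bytes readable */
	    ioctl(s, SIOCOUTQ, outq) ||		/* queued, not yet acknowledged */
	    ioctl(s, SIOCOUTQNSD, notsent))	/* queued, not yet sent */
		return -1;

	return 0;
}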
+
+/**
+ * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
+{
+	int s = conn->sock;
+	ssize_t rc;
+
+	if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Getting receive queue size");
+		return rc;
+	}
+
+	/* If we received a FIN, SIOCINQ is one greater than the actual number
+	 * of bytes on the queue, because it's based on the sequence number
+	 * rather than directly on the buffer contents.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->rcvq &&
+	    (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK))
+		t->rcvq--;
+
+	if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		flow_err(conn,
+			 "Receive queue too large to migrate socket: %u bytes",
+			 t->rcvq);
+		return -ENOBUFS;
+	}
+
+	rc = recv(s, tcp_migrate_rcv_queue, t->rcvq, MSG_PEEK);
+	if (rc < 0) {
+		if (errno == EAGAIN) {	/* EAGAIN means empty */
+			rc = 0;
+		} else {
+			rc = -errno;
+			flow_perror(conn, "Can't read receive queue");
+			return rc;
+		}
+	}
+
+	if ((uint32_t)rc < t->rcvq) {
+		flow_err(conn, "Short read migrating receive queue");
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
+{
+	const struct tcp_repair_opt opts[] = {
+		{ TCPOPT_WINDOW,	 t->snd_ws + (t->rcv_ws << 16) },
+		{ TCPOPT_MAXSEG,	 t->mss },
+		{ TCPOPT_SACK_PERMITTED, 0 },
+		{ TCPOPT_TIMESTAMP,	 0 },
+	};
+	socklen_t sl;
+
+	sl = sizeof(opts[0]) * (2 +
+				!!(t->tcpi_options & TCPI_OPT_SACK) +
+				!!(t->tcpi_options & TCPI_OPT_TIMESTAMPS));
+
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Setting repair options");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening
+ * @fd:		Descriptor for state migration
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
+{
+	struct tcp_tap_transfer t = {
+		.retries		= conn->retries,
+		.ws_from_tap		= conn->ws_from_tap,
+		.ws_to_tap		= conn->ws_to_tap,
+		.events			= conn->events,
+
+		.tap_mss		= htonl(MSS_GET(conn)),
+
+		.sndbuf			= htonl(conn->sndbuf),
+
+		.flags			= conn->flags,
+		.seq_dup_ack_approx	= conn->seq_dup_ack_approx,
+
+		.wnd_from_tap		= htons(conn->wnd_from_tap),
+		.wnd_to_tap		= htons(conn->wnd_to_tap),
+
+		.seq_to_tap		= htonl(conn->seq_to_tap),
+		.seq_ack_from_tap	= htonl(conn->seq_ack_from_tap),
+		.seq_from_tap		= htonl(conn->seq_from_tap),
+		.seq_ack_to_tap		= htonl(conn->seq_ack_to_tap),
+		.seq_init_from_tap	= htonl(conn->seq_init_from_tap),
+	};
+
+	memcpy(&t.pif, conn->f.pif, sizeof(t.pif));
+	memcpy(&t.side, conn->f.side, sizeof(t.side));
+
+	if (write_all_buf(fd, &t, sizeof(t))) {
+		int rc = -errno;
+		err_perror("Can't write migration data, socket %i", conn->sock);
+		return rc;
+	}
+
+	if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD))
+		close(conn->listening_sock);
+
+	return 0;
+}
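A worked example of the FIN off-by-one compensated for in tcp_flow_dump_sndqueue() and tcp_flow_dump_rcvqueue() above, following the premise stated in their comments that the counters are sequence-number based and a FIN occupies one sequence number:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t data_len = 5;			/* payload bytes pending */
	uint32_t counted = data_len + 1;	/* what SIOCINQ/SIOCOUTQ report
						 * once a FIN is in the queue */

	assert(counted - 1 == data_len);	/* the t->rcvq--/t->sndq-- above */
	return 0;
}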
+
+/**
+ * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @c:		Execution context
+ * @fd:		Descriptor for state migration
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
+ */
+int tcp_flow_migrate_source_ext(const struct ctx *c,
+				int fd, const struct tcp_tap_conn *conn)
+{
+	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+	struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
+	int s = conn->sock;
+	int rc;
+
+	/* Disable SO_PEEK_OFF, it will make accessing the queues in repair
+	 * mode weird.
+	 */
+	if (tcp_set_peek_offset(conn, -1)) {
+		rc = -errno;
+		goto fail;
+	}
+
+	if ((rc = tcp_flow_dump_tinfo(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_mss(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_timestamp(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_wnd(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_sndqueue(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd)))
+		goto fail;
+
+	if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_rcvqueue(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
+		goto fail;
+
+	if (c->migrate_no_linger)
+		close(s);
+	else
+		epoll_del(flow_epollfd(&conn->f), s);
+
+	/* Adjustments unrelated to FIN segments: sequence numbers we dumped
+	 * are based on the end of the queues.
+	 */
+	t->seq_rcv -= t->rcvq;
+	t->seq_snd -= t->sndq;
+
+	flow_dbg(conn,
+		 "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t->seq_snd, t->seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t->sndq, t->notsent, t->rcvq);
+	flow_dbg(conn,
+		 "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+	/* Endianness fix-ups */
+	t->seq_snd	= htonl(t->seq_snd);
+	t->seq_rcv	= htonl(t->seq_rcv);
+	t->sndq		= htonl(t->sndq);
+	t->notsent	= htonl(t->notsent);
+	t->rcvq		= htonl(t->rcvq);
+	t->mss		= htonl(t->mss);
+	t->timestamp	= htonl(t->timestamp);
+
+	t->snd_wl1	= htonl(t->snd_wl1);
+	t->snd_wnd	= htonl(t->snd_wnd);
+	t->max_window	= htonl(t->max_window);
+	t->rcv_wnd	= htonl(t->rcv_wnd);
+	t->rcv_wup	= htonl(t->rcv_wup);
+
+	if (write_all_buf(fd, t, sizeof(*t))) {
+		flow_perror(conn, "Failed to write extended data");
+		return -EIO;
+	}
+
+	if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) {
+		flow_perror(conn, "Failed to write send queue data");
+		return -EIO;
+	}
+
+	if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) {
+		flow_perror(conn, "Failed to write receive queue data");
+		return -EIO;
+	}
+
+	return 0;
+
+fail:
+	/* For any type of failure dumping data, write an invalid extended data
+	 * descriptor that allows us to keep the stream in sync, but tells the
+	 * target to skip the flow. If we fail to transfer data, that's fatal:
+	 * return -EIO in that case (and only in that case).
+	 */
+	t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */
+
+	if (write_all_buf(fd, t, sizeof(*t))) {
+		flow_perror(conn, "Failed to write extended data");
+		return -EIO;
+	}
+
+	if (rc == -EIO)	/* but not a migration data transfer failure */
+		return -ENODATA;
+
+	return rc;
+}
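The peek_offset computed in tcp_flow_migrate_source_ext() above is plain unsigned 32-bit arithmetic, so it stays correct across sequence number wraparound, e.g.:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t seq_to_tap = 5;			/* wrapped past 2^32 */
	uint32_t seq_ack_from_tap = UINT32_MAX - 9;

	/* 15 bytes sent but not yet acknowledged */
	assert(seq_to_tap - seq_ack_from_tap == 15);
	return 0;
}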
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+	int s, rc;
+
+	if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+				 IPPROTO_TCP)) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Failed to create socket for migrated flow");
+		return rc;
+	}
+	s = conn->sock;
+
+	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
+		flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i",
+				s);
+
+	tcp_sock_set_nodelay(s);
+
+	if ((rc = tcp_flow_repair_on(c, conn)))
+		goto err;
+
+	return 0;
+
+err:
+	close(s);
+	conn->sock = -1;
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_bind() - Bind socket in repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+	const struct flowside *sockside = HOSTFLOW(conn);
+	union sockaddr_inany a;
+
+	pif_sockaddr(c, &a, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+	if (bind(conn->sock, &a.sa, socklen_inany(&a))) {
+		int rc = -errno;
+		flow_perror(conn, "Failed to bind socket for migrated flow");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_connect(const struct ctx *c,
+				   struct tcp_tap_conn *conn)
+{
+	const struct flowside *tgt = HOSTFLOW(conn);
+	int rc;
+
+	rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
+	if (rc) {
+		rc = -errno;
+		flow_perror(conn, "Failed to connect migrated socket");
+		return rc;
+	}
+
+	flow_epollid_clear(&conn->f);
+	conn->timer = -1;
+	conn->listening_sock = -1;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert
+ * @c:		Execution context
+ * @fd:		Descriptor for state migration
+ *
+ * Return: 0 on success or single-flow failure, negative on fatal failure
+ */
+int tcp_flow_migrate_target(struct ctx *c, int fd)
+{
+	struct tcp_tap_transfer t;
+	struct tcp_tap_conn *conn;
+	union flow *flow;
+	int rc;
+
+	if (!(flow = flow_alloc())) {
+		err("Flow table full on migration target");
+		return 0;
+	}
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		flow_perror(flow, "Failed to receive migration data");
+		flow_alloc_cancel(flow);
+		return -errno;
+	}
+
+	flow->f.state = FLOW_STATE_TGT;
+	memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif));
+	memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
+	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
+	conn->retries			= t.retries;
+	conn->ws_from_tap		= t.ws_from_tap;
+	conn->ws_to_tap			= t.ws_to_tap;
+	conn->events			= t.events;
+
+	conn->sndbuf			= ntohl(t.sndbuf);
+
+	conn->flags			= t.flags;
+	conn->seq_dup_ack_approx	= t.seq_dup_ack_approx;
+
+	MSS_SET(conn, ntohl(t.tap_mss));
+
+	conn->wnd_from_tap		= ntohs(t.wnd_from_tap);
+	conn->wnd_to_tap		= ntohs(t.wnd_to_tap);
+
+	conn->seq_to_tap		= ntohl(t.seq_to_tap);
+	conn->seq_ack_from_tap		= ntohl(t.seq_ack_from_tap);
+	conn->seq_from_tap		= ntohl(t.seq_from_tap);
+	conn->seq_ack_to_tap		= ntohl(t.seq_ack_to_tap);
+	conn->seq_init_from_tap		= ntohl(t.seq_init_from_tap);
+
+	if ((rc = tcp_flow_repair_socket(c, conn))) {
+		flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
+		/* Can't leave the flow in an incomplete state */
+		FLOW_ACTIVATE(conn);
+		return 0;
+	}
+
+	flow_hash_insert(c, TAP_SIDX(conn));
+	FLOW_ACTIVATE(conn);
+
+	return 0;
+}
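A condensed, standalone approximation of the repair-mode bring-up split across tcp_flow_repair_socket(), tcp_flow_repair_bind() and tcp_flow_repair_connect() above: with TCP_REPAIR set, bind() may take the original 4-tuple and connect() establishes the association immediately, without emitting a SYN (requires CAP_NET_ADMIN; error reporting trimmed):

#include <errno.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

static int repair_connect(const struct sockaddr *src, socklen_t src_len,
			  const struct sockaddr *dst, socklen_t dst_len)
{
	int one = 1, on = TCP_REPAIR_ON;
	int s = socket(dst->sa_family, SOCK_STREAM | SOCK_CLOEXEC, 0);

	if (s < 0)
		return -errno;

	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	if (setsockopt(s, SOL_TCP, TCP_REPAIR, &on, sizeof(on)) ||
	    bind(s, src, src_len) || connect(s, dst, dst_len)) {
		int rc = -errno;

		close(s);
		return rc;
	}

	return s;	/* still in repair mode: caller restores the rest */
}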
+
+/**
+ * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
+ * @c:		Execution context
+ * @conn:	Connection entry to complete with extra data
+ * @fd:		Descriptor for state migration
+ *
+ * Return: 0 on success or single-flow failure, negative on fatal failure
+ */
+int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd)
+{
+	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+	struct tcp_tap_transfer_ext t;
+	int s = conn->sock, rc;
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		rc = -errno;
+		flow_perror(conn, "Failed to read extended data");
+		return rc;
+	}
+
+	if (!t.tcpi_state) {	/* Source wants us to skip this flow */
+		flow_err(conn, "Dropping as requested by source");
+		goto fail;
+	}
+
+	/* Endianness fix-ups */
+	t.seq_snd	= ntohl(t.seq_snd);
+	t.seq_rcv	= ntohl(t.seq_rcv);
+	t.sndq		= ntohl(t.sndq);
+	t.notsent	= ntohl(t.notsent);
+	t.rcvq		= ntohl(t.rcvq);
+	t.mss		= ntohl(t.mss);
+	t.timestamp	= ntohl(t.timestamp);
+
+	t.snd_wl1	= ntohl(t.snd_wl1);
+	t.snd_wnd	= ntohl(t.snd_wnd);
+	t.max_window	= ntohl(t.max_window);
+	t.rcv_wnd	= ntohl(t.rcv_wnd);
+	t.rcv_wup	= ntohl(t.rcv_wup);
+
+	flow_dbg(conn,
+		 "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t.seq_snd, t.seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t.sndq, t.notsent, t.rcvq);
+	flow_dbg(conn,
+		 "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+	if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
+	    t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		flow_err(conn,
+			 "Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+			 s, t.sndq, t.notsent, t.rcvq);
+		return -EINVAL;
+	}
+
+	if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
+		rc = -errno;
+		flow_perror(conn, "Failed to read send queue data");
+		return rc;
+	}
+
+	if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
+		rc = -errno;
+		flow_perror(conn, "Failed to read receive queue data");
+		return rc;
+	}
+
+	if (conn->sock < 0)
+		/* We weren't able to create the socket, discard flow */
+		goto fail;
+
+	if (tcp_flow_repair_bind(c, conn))
+		goto fail;
+
+	if (tcp_flow_repair_timestamp(conn, &t))
+		goto fail;
+
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_seq(conn, &t.seq_snd))
+		goto fail;
+
+	if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_seq(conn, &t.seq_rcv))
+		goto fail;
+
+	if (tcp_flow_repair_connect(c, conn))
+		goto fail;
+
+	if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue))
+		goto fail;
+
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_queue(conn, t.sndq - t.notsent,
+				  tcp_migrate_snd_queue))
+		goto fail;
+
+	if (tcp_flow_repair_opt(conn, &t))
+		goto fail;
+
+	/* If we sent a FIN and it was acknowledged (TCP_FIN_WAIT2), don't
+	 * send it out, because we already sent it for sure.
+	 *
+	 * Call shutdown(x, SHUT_WR) in repair mode, so that we move to
+	 * FIN_WAIT_1 (tcp_shutdown()) without sending anything
+	 * (goto in tcp_write_xmit()).
+	 */
+	if (t.tcpi_state == TCP_FIN_WAIT2) {
+		int v;
+
+		v = TCP_SEND_QUEUE;
+		if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
+			flow_perror(conn, "Selecting repair queue");
+		else
+			shutdown(s, SHUT_WR);
+	}
+
+	if (tcp_flow_repair_wnd(conn, &t))
+		goto fail;
+
+	tcp_flow_repair_off(c, conn);
+	repair_flush(c);
+
+	if (t.notsent) {
+		if (tcp_flow_repair_queue(conn, t.notsent,
+					  tcp_migrate_snd_queue +
+					  (t.sndq - t.notsent))) {
+			/* This sometimes seems to fail for unclear reasons.
+			 * Don't fail the whole migration, just reset the flow
+			 * and carry on to the next one.
+			 */
+			goto fail;
+		}
+	}
+
+	/* If we sent a FIN but it wasn't acknowledged yet (TCP_FIN_WAIT1),
+	 * send it out, because we don't know if we already sent it.
+	 *
+	 * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to
+	 * TCP_FIN_WAIT1.
+	 */
+	if (t.tcpi_state == TCP_FIN_WAIT1)
+		shutdown(s, SHUT_WR);
+
+	if (tcp_set_peek_offset(conn, peek_offset))
+		goto fail;
+
+	tcp_send_flag(c, conn, ACK);
+	tcp_data_from_sock(c, conn);
+
+	if ((rc = tcp_epoll_ctl(c, conn))) {
+		flow_dbg(conn,
+			 "Failed to subscribe to epoll for migrated socket: %s",
+			 strerror_(-rc));
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	if (conn->sock >= 0) {
+		tcp_flow_repair_off(c, conn);
+		repair_flush(c);
+	}
+
+	conn->flags = 0; /* Not waiting for ACK, don't schedule timer */
+	tcp_rst(c, conn);
+
+	return 0;
+}
+
+/**
+ * tcp_prepare_iov() - Prepare iov according to kernel capability
+ * @msg:		Message header to update
+ * @iov:		iovec to receive TCP payload and data to discard
+ * @already_sent:	Bytes already sent, not yet acknowledged
+ * @payload_iov_cnt:	Number of TCP payload iovec entries
+ *
+ * Return: 0 on success, -1 if already_sent cannot be discarded fully
+ */
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+		    uint32_t already_sent, int payload_iov_cnt)
+{
+	/*
+	 * IOV layout
+	 * |- tcp_buf_discard -|---------- TCP data slots ------------|
+	 *
+	 * with discarded data:
+	 * |------ddddddddddddd|ttttttttttttt-------------------------|
+	 *        ^
+	 *        |
+	 *     msg_iov
+	 *
+	 * without discarded data:
+	 * |-------------------|ttttttttttttt-------------------------|
+	 *                     ^
+	 *                     |
+	 *                  msg_iov
+	 *
+	 * d: discard data
+	 * t: TCP data
+	 */
+	if (peek_offset_cap) {
+		msg->msg_iov = iov + DISCARD_IOV_NUM;
+		msg->msg_iovlen = payload_iov_cnt;
+	} else {
+		int discard_cnt, discard_iov_rem;
+		struct iovec *iov_start;
+		int i;
+
+		discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE);
+		if (discard_cnt > DISCARD_IOV_NUM) {
+			debug("Failed to discard %u already sent bytes",
+			      already_sent);
+			return -1;
+		}
+
+		discard_iov_rem = already_sent % BUF_DISCARD_SIZE;
+
+		iov_start = iov + (DISCARD_IOV_NUM - discard_cnt);
+
+		/* Multiple iov entries pointing to the same buffer */
+		for (i = 0; i < discard_cnt; i++) {
+			iov_start[i].iov_base = tcp_buf_discard;
+			iov_start[i].iov_len = BUF_DISCARD_SIZE;
+		}
+		if (discard_iov_rem)
+			iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem;
+
+		msg->msg_iov = iov_start;
+		msg->msg_iovlen = discard_cnt + payload_iov_cnt;
+	}
+
+	return 0;
+}
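A worked example of the discard-slot arithmetic in tcp_prepare_iov() above, assuming, hypothetically, a BUF_DISCARD_SIZE of 16 KiB and 40000 bytes already sent: three discard entries are needed, and the last one is trimmed so the discarded total is exactly already_sent:

#include <assert.h>

#define BUF_DISCARD_SIZE_EX	16384	/* assumed value, for illustration */
#define DIV_ROUND_UP_EX(x, y)	(((x) + (y) - 1) / (y))

int main(void)
{
	unsigned already_sent = 40000;
	unsigned cnt = DIV_ROUND_UP_EX(already_sent, BUF_DISCARD_SIZE_EX);
	unsigned rem = already_sent % BUF_DISCARD_SIZE_EX;

	assert(cnt == 3 && rem == 7232);
	/* cnt - 1 full slots plus the trimmed last slot cover it exactly */
	assert((cnt - 1) * BUF_DISCARD_SIZE_EX + rem == already_sent);
	return 0;
}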
