diff options
Diffstat (limited to 'tcp.c')
| -rw-r--r-- | tcp.c | 696 |
1 files changed, 364 insertions, 332 deletions
@@ -190,22 +190,27 @@ * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and * RTO is less than this, re-initialise RTO to this for data retransmissions * - * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE - * with TAP_FIN_SENT event), and no ACK is received within this time, reset - * the connection + * - RTT / 2 elapsed after data segment received from tap without having + * sent an ACK segment, or zero-sized window advertised to tap/guest (flag + * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent. * - * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN - * segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and - * TAP_FIN_ACKED), but no socket activity is detected from the socket within - * this time, reset the connection + * RTT, here, is an approximation of the RTT value reported by the kernel via + * TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to + * RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly. * - * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on - * either side, the connection is reset + * We also use a global interval timer for an activity timeout which doesn't + * require precision: * - * - ACK_INTERVAL elapsed after data segment received from tap without having - * sent an ACK segment, or zero-sized window advertised to tap/guest (flag - * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent + * - INACTIVITY_INTERVAL: if a connection has had no activity for an entire + * interval, close and reset it. This means that idle connections (without + * keepalives) will be removed between INACTIVITY_INTERVAL s and + * 2*INACTIVITY_INTERVAL s after the last activity. * + * - KEEPALIVE_INTERVAL: if a connection has had no tap-side activity for an + * entire interval, send a tap-side keepalive. 
If the endpoint is no longer + * aware of the connection (due to a reboot, or a kernel timeout in FIN_WAIT_2 + * state) that should trigger an RST, so we won't keep track of connections + * that the guest endpoint no longer cares about. * * Summary of data flows (with ESTABLISHED event) * ---------------------------------------------- @@ -297,8 +302,6 @@ #include "ip.h" #include "passt.h" #include "tap.h" -#include "siphash.h" -#include "pcap.h" #include "tcp_splice.h" #include "log.h" #include "inany.h" @@ -341,15 +344,25 @@ enum { #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ -#define ACK_INTERVAL 10 /* ms */ #define RTO_INIT 1 /* s, RFC 6298 */ #define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */ -#define FIN_TIMEOUT 60 -#define ACT_TIMEOUT 7200 + +#define INACTIVITY_INTERVAL 7200 /* s */ +#define KEEPALIVE_INTERVAL 30 /* s */ #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ +/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */ +#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */ +/* ...examples: 5 MB sent * 500 ns RTT, 250 kB * 10 ms, 8 kB * 300 ms */ +#define SNDBUF_BOOST_FACTOR 150 /* % */ +#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */ +/* 12 MB sent * 500 ns RTT, 600 kB * 10 ms, 20 kB * 300 ms */ + +/* Ratio of buffer to bandwidth * delay product implying interactive traffic */ +#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. 
< 5% of buffer) */ + #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define CONN_IS_CLOSING(conn) \ @@ -401,10 +414,6 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = { "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED", }; -/* Listening sockets, used for automatic port forwarding in pasta mode only */ -static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS]; -static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; - /* Table of our guest side addresses with very low RTT (assumed to be local to * the host), LRU */ @@ -423,11 +432,13 @@ socklen_t tcp_info_size; sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size) /* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */ -#define snd_wnd_cap tcp_info_cap(snd_wnd) +#define snd_wnd_cap tcp_info_cap(snd_wnd) /* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */ -#define bytes_acked_cap tcp_info_cap(bytes_acked) +#define bytes_acked_cap tcp_info_cap(bytes_acked) /* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */ -#define min_rtt_cap tcp_info_cap(min_rtt) +#define min_rtt_cap tcp_info_cap(min_rtt) +/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */ +#define delivery_rate_cap tcp_info_cap(delivery_rate) /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -508,47 +519,30 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) /** * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events - * @c: Execution context * @conn: Connection pointer * * Return: 0 on success, negative error code on failure (not on deletion) */ -static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_epoll_ctl(struct tcp_tap_conn *conn) { - int m = flow_in_epoll(&conn->f) ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; - union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; - struct epoll_event ev = { .data.u64 = ref.u64 }; - int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) - : c->epollfd; + uint32_t events; if (conn->events == CLOSED) { - if (flow_in_epoll(&conn->f)) - epoll_del(epollfd, conn->sock); + int epollfd = flow_epollfd(&conn->f); + + epoll_del(epollfd, conn->sock); if (conn->timer != -1) epoll_del(epollfd, conn->timer); + return 0; } - ev.events = tcp_conn_epoll_events(conn->events, conn->flags); + events = tcp_conn_epoll_events(conn->events, conn->flags); - if (epoll_ctl(epollfd, m, conn->sock, &ev)) + if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, conn->sock, + !TAPSIDE(conn)) < 0) return -errno; - flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); - - if (conn->timer != -1) { - union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, - .fd = conn->sock, - .flow = FLOW_IDX(conn) }; - struct epoll_event ev_t = { .data.u64 = ref_t.u64, - .events = EPOLLIN | EPOLLET }; - - if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD, - conn->timer, &ev_t)) - return -errno; - } - return 0; } @@ -556,8 +550,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed * @c: Execution context * @conn: Connection pointer - * - * #syscalls timerfd_create timerfd_settime + * #syscalls timerfd_create timerfd_settime|timerfd_settime32 */ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { @@ -567,34 +560,38 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) return; if (conn->timer == -1) { - union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER, - .fd = conn->sock, - .flow = FLOW_IDX(conn) }; - struct epoll_event ev = { .data.u64 = ref.u64, - .events = EPOLLIN | EPOLLET }; - int epollfd = flow_epollfd(&conn->f); + union epoll_ref ref; int 
fd; fd = timerfd_create(CLOCK_MONOTONIC, 0); - if (fd == -1 || fd > FD_REF_MAX) { + if (fd == -1) { flow_dbg_perror(conn, "failed to get timer"); - if (fd > -1) - close(fd); - conn->timer = -1; return; } - conn->timer = fd; + if (fd > FD_REF_MAX) { + flow_dbg(conn, "timer fd overflow (%d > %d)", + fd, FD_REF_MAX); + close(fd); + return; + } - if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { - flow_dbg_perror(conn, "failed to add timer"); - close(conn->timer); - conn->timer = -1; + ref.type = EPOLL_TYPE_TCP_TIMER; + ref.flow = FLOW_IDX(conn); + ref.fd = fd; + if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET, + ref) < 0) { + flow_dbg(conn, "failed to add timer"); + close(fd); return; } + + conn->timer = fd; } if (conn->flags & ACK_TO_TAP_DUE) { - it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; + it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000); + it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) * + 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { int exp = conn->retries, timeout = RTO_INIT; if (!(conn->events & ESTABLISHED)) @@ -603,15 +600,23 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES); timeout <<= MAX(exp, 0); it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max); - } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { - it.it_value.tv_sec = FIN_TIMEOUT; } else { - it.it_value.tv_sec = ACT_TIMEOUT; + /* Disarm */ + it.it_value.tv_sec = 0; + it.it_value.tv_nsec = 0; } - flow_dbg(conn, "timer expires in %llu.%03llus", - (unsigned long long)it.it_value.tv_sec, - (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); + if (conn->flags & ACK_TO_TAP_DUE) { + flow_trace(conn, "timer expires in %llu.%02llums", + (unsigned long long)it.it_value.tv_sec * 1000 + + it.it_value.tv_nsec / 1000 / 1000, + (unsigned long long)it.it_value.tv_nsec + / 1000 / 10 % 100); + } else { + flow_dbg(conn, "timer expires in %llu.%03llus", + (unsigned long 
long)it.it_value.tv_sec, + (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); + } if (timerfd_settime(conn->timer, 0, &it, NULL)) flow_perror(conn, "failed to set timer"); @@ -657,7 +662,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, } if (flag == STALLED || flag == ~STALLED) - tcp_epoll_ctl(c, conn); + tcp_epoll_ctl(conn); if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || @@ -714,11 +719,8 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, } else { if (event == CLOSED) flow_hash_remove(c, TAP_SIDX(conn)); - tcp_epoll_ctl(c, conn); + tcp_epoll_ctl(conn); } - - if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) - tcp_timer_ctl(c, conn); } /** @@ -774,7 +776,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, } /** - * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage) + * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.75 usage) * @conn: Connection pointer */ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) @@ -789,11 +791,7 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) return; } - v = sndbuf; - if (v >= SNDBUF_BIG) - v /= 2; - else if (v > SNDBUF_SMALL) - v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2; + v = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL, SNDBUF_BIG, 75); SNDBUF_SET(conn, MIN(INT_MAX, v)); } @@ -940,7 +938,6 @@ static void tcp_fill_header(struct tcphdr *th, * tcp_fill_headers() - Fill 802.3, IP, TCP headers * @c: Execution context * @conn: Connection pointer - * @taph: tap backend specific header * @eh: Pointer to Ethernet header * @ip4h: Pointer to IPv4 header, or NULL * @ip6h: Pointer to IPv6 header, or NULL @@ -949,12 +946,15 @@ static void tcp_fill_header(struct tcphdr *th, * @ip4_check: IPv4 checksum, if already known * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum + * + * Return: frame length (including L2 
headers) */ -void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct ethhdr *eh, - struct iphdr *ip4h, struct ipv6hdr *ip6h, - struct tcphdr *th, struct iov_tail *payload, - const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum) +size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, + struct ethhdr *eh, + struct iphdr *ip4h, struct ipv6hdr *ip6h, + struct tcphdr *th, struct iov_tail *payload, + const uint16_t *ip4_check, uint32_t seq, + bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); size_t l4len = iov_tail_size(payload) + sizeof(*th); @@ -1020,7 +1020,36 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, else tcp_update_csum(psum, th, payload); - tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); + return MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN); +} + +/** + * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning + * @conn: Connection pointer + * @tinfo: tcp_info from kernel, must be pre-fetched + * + * Return: increased sending buffer to use as a limit for advertised window + */ +static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn, + const struct tcp_info_linux *tinfo) +{ + unsigned long bytes_rtt_product; + + if (!bytes_acked_cap) + return SNDBUF_GET(conn); + + /* This is *not* a bandwidth-delay product, but it's somewhat related: + * as we send more data (usually at the beginning of a connection), we + * try to make the sending buffer progressively grow, with the RTT as a + * factor (longer delay, bigger buffer needed). 
+ */ + bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked * + tinfo->tcpi_rtt / 1000 / 1000; + + return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product, + SNDBUF_BOOST_BYTES_RTT_LO, + SNDBUF_BOOST_BYTES_RTT_HI, + SNDBUF_BOOST_FACTOR); } /** @@ -1031,6 +1060,8 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, * @tinfo: tcp_info from kernel, can be NULL if not pre-fetched * * Return: 1 if sequence or window were updated, 0 otherwise + * + * #syscalls ioctl */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, bool force_seq, struct tcp_info_linux *tinfo) @@ -1041,6 +1072,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, socklen_t sl = sizeof(*tinfo); struct tcp_info_linux tinfo_new; uint32_t new_wnd_to_tap = prev_wnd_to_tap; + bool ack_everything = true; int s = conn->sock; /* At this point we could ack all the data we've accepted for forwarding @@ -1050,7 +1082,8 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, * control behaviour. * * For it to be possible and worth it we need: - * - The TCP_INFO Linux extension which gives us the peer acked bytes + * - The TCP_INFO Linux extensions which give us the peer acked bytes + * and the delivery rate (outbound bandwidth at receiver) * - Not to be told not to (force_seq) * - Not half-closed in the peer->guest direction * With no data coming from the peer, we might not get events which @@ -1060,13 +1093,19 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, * Data goes from socket to socket, with nothing meaningfully "in * flight". * - Not a pseudo-local connection (e.g. to a VM on the same host) - * - Large enough send buffer - * In these cases, there's not enough in flight to bother. + * If it is, there's not enough in flight to bother. 
+ * - Sending buffer significantly larger than bandwidth * delay product + * Meaning we're not bandwidth-bound and this is likely to be + * interactive traffic where we want to preserve transparent + * connection behaviour and latency. + * + * Otherwise, we probably want to maximise throughput, which needs + * sending buffer auto-tuning, triggered in turn by filling up the + * outbound socket queue. */ - if (bytes_acked_cap && !force_seq && + if (bytes_acked_cap && delivery_rate_cap && !force_seq && !CONN_IS_CLOSING(conn) && - !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn) && - (unsigned)SNDBUF_GET(conn) >= SNDBUF_SMALL) { + !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) { if (!tinfo) { tinfo = &tinfo_new; if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) @@ -1075,14 +1114,24 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, /* This trips a cppcheck bug in some versions, including * cppcheck 2.18.3. - * https://sourceforge.net/p/cppcheck/discussion/general/thread/fecde59085/ + * https://trac.cppcheck.net/ticket/14191 */ /* cppcheck-suppress [uninitvar,unmatchedSuppression] */ - conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + - conn->seq_init_from_tap; - } else { + if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt * + tinfo->tcpi_delivery_rate / + 1000 / 1000 * + SNDBUF_TO_BW_DELAY_INTERACTIVE) + ack_everything = false; + } + + if (ack_everything) { /* Fall back to acknowledging everything we got */ conn->seq_ack_to_tap = conn->seq_from_tap; + } else { + /* cppcheck bug 14191 again, see above */ + /* cppcheck-suppress [uninitvar,unmatchedSuppression] */ + conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + + conn->seq_init_from_tap; } /* It's occasionally possible for us to go from using the fallback above @@ -1113,9 +1162,54 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { new_wnd_to_tap = tinfo->tcpi_snd_wnd; } else { + unsigned rtt_ms_ceiling 
= DIV_ROUND_UP(tinfo->tcpi_rtt, 1000); + uint32_t sendq; + int limit; + + if (ioctl(s, SIOCOUTQ, &sendq)) { + debug_perror("SIOCOUTQ on socket %i, assuming 0", s); + sendq = 0; + } tcp_get_sndbuf(conn); - new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, - SNDBUF_GET(conn)); + + if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */ + limit = 0; + else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn)) + limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq; + else + limit = SNDBUF_GET(conn) - (int)sendq; + + /* If the sender uses mechanisms to prevent Silly Window + * Syndrome (SWS, described in RFC 813 Section 3) it's critical + * that, should the window ever become less than the MSS, we + * advertise a new value once it increases again to be above it. + * + * The mechanism to avoid SWS in the kernel is, implicitly, + * implemented by Nagle's algorithm (which was proposed after + * RFC 813). + * + * To this end, for simplicity, approximate a window value below + * the MSS to zero, as we already have mechanisms in place to + * force updates after the window becomes zero. This matches the + * suggestion from RFC 813, Section 4. 
+ * + * But don't do this if, either: + * + * - there's nothing in the outbound queue: the size of the + * sending buffer is limiting us, and it won't increase if we + * don't send data, so there's no point in waiting, or + * + * - we haven't sent data in a while (somewhat arbitrarily, ten + * times the RTT), as that might indicate that the receiver + * will only process data in batches that are large enough, + * but we won't send enough to fill one because we're stuck + * with pending data in the outbound queue + */ + if (limit < MSS_GET(conn) && sendq && + tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10) + limit = 0; + + new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit); } new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); @@ -1135,6 +1229,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, conn_flag(c, conn, ACK_TO_TAP_DUE); out: + /* Opportunistically store RTT approximation on valid TCP_INFO data */ + if (tinfo) + RTT_SET(conn, tinfo->tcpi_rtt); + return new_wnd_to_tap != prev_wnd_to_tap || conn->seq_ack_to_tap != prev_ack_to_tap; } @@ -1256,7 +1354,8 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, th->fin = !!(flags & FIN); if (th->ack) { - if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) + if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && + conn->wnd_to_tap) conn_flag(c, conn, ~ACK_TO_TAP_DUE); else conn_flag(c, conn, ACK_TO_TAP_DUE); @@ -1290,7 +1389,34 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, } /** - * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket + * tcp_sock_rst() - Close TCP connection forcing RST on socket side + * @c: Execution context + * @conn: Connection pointer + */ +static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn) +{ + const struct linger linger0 = { + .l_onoff = 1, + .l_linger = 0, + }; + + /* Force RST on socket to inform the peer + * + * We do this by setting SO_LINGER with 0 timeout, 
which means that + * close() will send an RST (unless the connection is already closed in + * both directions). + */ + if (setsockopt(conn->sock, SOL_SOCKET, + SO_LINGER, &linger0, sizeof(linger0)) < 0) { + flow_dbg_perror(conn, + "SO_LINGER failed, may not send RST to peer"); + } + + conn_event(c, conn, CLOSED); +} + +/** + * tcp_rst_do() - Reset a tap connection: send RST segment on both sides, close * @c: Execution context * @conn: Connection pointer */ @@ -1299,8 +1425,10 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->events == CLOSED) return; + /* Send RST on tap */ tcp_send_flag(c, conn, RST); - conn_event(c, conn, CLOSED); + + tcp_sock_rst(c, conn); } /** @@ -1543,7 +1671,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport); - if (!(tgt = flow_target(c, flow, IPPROTO_TCP))) + if (!(tgt = flow_target(c, flow, FWD_NO_HINT, IPPROTO_TCP))) goto cancel; if (flow->f.pif[TGTSIDE] != PIF_HOST) { @@ -1592,7 +1720,11 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, conn->sock = s; conn->timer = -1; - conn->listening_sock = -1; + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); + if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, TGTSIDE) < 0) { + flow_perror(flow, "Can't register with epoll"); + goto cancel; + } conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; @@ -1636,7 +1768,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, conn_event(c, conn, TAP_SYN_ACK_SENT); } - tcp_epoll_ctl(c, conn); + tcp_epoll_ctl(conn); if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ socklen_t sl = sizeof(sa); @@ -1771,7 +1903,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, return -1; if (th->rst) { - conn_event(c, conn, CLOSED); + tcp_sock_rst(c, conn); return 1; } @@ -1787,6 +1919,10 @@ static int tcp_data_from_tap(const struct ctx *c, struct 
tcp_tap_conn *conn, tcp_send_flag(c, conn, ACK); tcp_timer_ctl(c, conn); + if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE, + &((int){ 1 }), sizeof(int))) + flow_trace(conn, "failed to set SO_KEEPALIVE"); + if (p->count == 1) { tcp_tap_window_update(c, conn, ntohs(th->window)); @@ -1913,20 +2049,17 @@ eintr: goto eintr; if (errno == EAGAIN || errno == EWOULDBLOCK) { - tcp_send_flag(c, conn, ACK_IF_NEEDED); + tcp_send_flag(c, conn, ACK | DUP_ACK); return p->count - idx; } return -1; } - if (n < (int)(seq_from_tap - conn->seq_from_tap)) { + if (n < (int)(seq_from_tap - conn->seq_from_tap)) partial_send = 1; - conn->seq_from_tap += n; - tcp_send_flag(c, conn, ACK_IF_NEEDED); - } else { - conn->seq_from_tap += n; - } + + conn->seq_from_tap += n; out: if (keep != -1 || partial_send) { @@ -2134,10 +2267,13 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, flow_trace(conn, "packet length %zu from tap", l4len); if (th->rst) { - conn_event(c, conn, CLOSED); + tcp_sock_rst(c, conn); return 1; } + conn->inactive = false; + conn->tap_inactive = false; + if (th->ack && !(conn->events & ESTABLISHED)) tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); @@ -2166,7 +2302,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (th->fin) { conn->seq_from_tap++; - shutdown(conn->sock, SHUT_WR); + if (shutdown(conn->sock, SHUT_WR) < 0) { + flow_dbg_perror(conn, "shutdown() failed"); + goto reset; + } + tcp_send_flag(c, conn, ACK); conn_event(c, conn, SOCK_FIN_SENT); @@ -2241,7 +2381,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, socklen_t sl; struct tcp_info tinfo; - shutdown(conn->sock, SHUT_WR); + if (shutdown(conn->sock, SHUT_WR) < 0) { + flow_dbg_perror(conn, "shutdown() failed"); + goto reset; + } + conn_event(c, conn, SOCK_FIN_SENT); tcp_send_flag(c, conn, ACK); ack_due = 0; @@ -2315,6 +2459,15 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, conn->sock = s; conn->timer 
= -1; conn->ws_to_tap = conn->ws_from_tap = 0; + + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); + if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, INISIDE) < 0) { + flow_perror(flow, "Can't register with epoll"); + conn_flag(c, conn, CLOSING); + FLOW_ACTIVATE(conn); + return; + } + conn_event(c, conn, SOCK_ACCEPTED); hash = flow_hash_insert(c, TAP_SIDX(conn)); @@ -2341,7 +2494,6 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { - struct tcp_tap_conn *conn; union sockaddr_inany sa; socklen_t sl = sizeof(sa); struct flowside *ini; @@ -2357,17 +2509,14 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; - conn = (struct tcp_tap_conn *)flow; - conn->listening_sock = ref.fd; - tcp_sock_set_nodelay(s); /* FIXME: If useful: when the listening port has a specific bound * address, record that as our address, as implemented for vhost-user * mode only, below. 
*/ - ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, - NULL, ref.tcp_listen.port); + ini = flow_initiate_sa(flow, ref.listen.pif, &sa, + NULL, ref.listen.port); if (getsockname(s, &sa.sa, &sl) || inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0) @@ -2381,7 +2530,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, goto cancel; } - if (!flow_target(c, flow, IPPROTO_TCP)) + if (!flow_target(c, flow, ref.listen.rule, IPPROTO_TCP)) goto cancel; switch (flow->f.pif[TGTSIDE]) { @@ -2412,7 +2561,9 @@ cancel: * @c: Execution context * @ref: epoll reference of timer (not connection) * - * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 + * #syscalls timerfd_gettime|timerfd_gettime64 + * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64 + * #syscalls arm:timerfd_settime64 i686:timerfd_settime64 */ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) { @@ -2450,9 +2601,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) conn_flag(c, conn, SYN_RETRIED); tcp_timer_ctl(c, conn); } - } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { - flow_dbg(conn, "FIN timeout"); - tcp_rst(c, conn); } else if (conn->retries == TCP_MAX_RETRIES) { flow_dbg(conn, "retransmissions count exceeded"); tcp_rst(c, conn); @@ -2469,23 +2617,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) tcp_data_from_sock(c, conn); tcp_timer_ctl(c, conn); } - } else { - struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; - struct itimerspec old = { { 0 }, { 0 } }; - - /* Activity timeout: if it was already set, reset the - * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE - * or ACK_FROM_TAP_DUE, so just set the long timeout in that - * case. This avoids having to preemptively reset the timer on - * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. 
- */ - if (timerfd_settime(conn->timer, 0, &new, &old)) - flow_perror(conn, "failed to set timer"); - - if (old.it_value.tv_sec == ACT_TIMEOUT) { - flow_dbg(conn, "activity timeout"); - tcp_rst(c, conn); - } } } @@ -2511,6 +2642,8 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, return; } + conn->inactive = false; + if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) { conn_event(c, conn, CLOSED); return; @@ -2552,65 +2685,18 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, } /** - * tcp_sock_init_one() - Initialise listening socket for address and port - * @c: Execution context - * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE) - * @addr: Pointer to address for binding, NULL for dual stack any - * @ifname: Name of interface to bind to, NULL if not configured - * @port: Port, host order - * - * Return: fd for the new listening socket, negative error code on failure - * - * If pif == PIF_SPLICE, the caller must have already entered the guest ns. - */ -static int tcp_sock_init_one(const struct ctx *c, uint8_t pif, - const union inany_addr *addr, const char *ifname, - in_port_t port) -{ - union tcp_listen_epoll_ref tref = { - .port = port, - .pif = pif, - }; - const struct fwd_ports *fwd; - int s; - - if (pif == PIF_HOST) - fwd = &c->tcp.fwd_in; - else - fwd = &c->tcp.fwd_out; - - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname, - port, tref.u32); - - if (fwd->mode == FWD_AUTO) { - int (*socks)[IP_VERSIONS] = pif == PIF_SPLICE ? - tcp_sock_ns : tcp_sock_init_ext; - - if (!addr || inany_v4(addr)) - socks[port][V4] = s < 0 ? -1 : s; - if (!addr || !inany_v4(addr)) - socks[port][V6] = s < 0 ? 
-1 : s; - } - - if (s < 0) - return s; - - return s; -} - -/** - * tcp_sock_init() - Create listening socket for a given host ("inbound") port + * tcp_listen() - Create listening socket * @c: Execution context * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE) - * @addr: Pointer to address for binding, NULL if not configured - * @ifname: Name of interface to bind to, NULL if not configured + * @rule: Index of relevant forwarding rule + * @addr: Pointer to address for binding, NULL for any + * @ifname: Name of interface to bind to, NULL for any * @port: Port, host order * - * Return: 0 on success, negative error code on failure + * Return: socket fd on success, negative error code on failure */ -int tcp_sock_init(const struct ctx *c, uint8_t pif, - const union inany_addr *addr, const char *ifname, - in_port_t port) +int tcp_listen(const struct ctx *c, uint8_t pif, unsigned rule, + const union inany_addr *addr, const char *ifname, in_port_t port) { int s; @@ -2621,69 +2707,19 @@ int tcp_sock_init(const struct ctx *c, uint8_t pif, /* Restrict to v6 only */ addr = &inany_any6; else if (inany_v4(addr)) - /* Nothing to do */ - return 0; + return -EAFNOSUPPORT; } if (!c->ifi6) { if (!addr) /* Restrict to v4 only */ addr = &inany_any4; else if (!inany_v4(addr)) - /* Nothing to do */ - return 0; - } - - s = tcp_sock_init_one(c, pif, addr, ifname, port); - if (s < 0) - return s; - if (s > FD_REF_MAX) - return -EIO; - - return 0; -} - -/** - * tcp_ns_sock_init() - Init socket to listen for spliced outbound connections - * @c: Execution context - * @port: Port, host order - */ -static void tcp_ns_sock_init(const struct ctx *c, in_port_t port) -{ - ASSERT(!c->no_tcp); - - if (!c->no_bindtodevice) { - tcp_sock_init(c, PIF_SPLICE, NULL, "lo", port); - return; + return -EAFNOSUPPORT; } - if (c->ifi4) - tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback4, NULL, port); - if (c->ifi6) - tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback6, NULL, port); -} - -/** - * 
tcp_ns_socks_init() - Bind sockets in namespace for outbound connections - * @arg: Execution context - * - * Return: 0 - */ -/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ -static int tcp_ns_socks_init(void *arg) -{ - const struct ctx *c = (const struct ctx *)arg; - unsigned port; - - ns_enter(c); - - for (port = 0; port < NUM_PORTS; port++) { - if (!bitmap_isset(c->tcp.fwd_out.map, port)) - continue; + s = pif_listen(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname, port, rule); - tcp_ns_sock_init(c, port); - } - - return 0; + return s; } /** @@ -2812,7 +2848,7 @@ static void tcp_get_rto_params(struct ctx *c) * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context * - * Return: 0, doesn't return on failure + * Return: 0 on success, -1 on failure */ int tcp_init(struct ctx *c) { @@ -2824,15 +2860,16 @@ int tcp_init(struct ctx *c) memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); - memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext)); - memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns)); tcp_sock_refill_init(c); + if (fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP) < 0) + return -1; if (c->mode == MODE_PASTA) { tcp_splice_init(c); - - NS_CALL(tcp_ns_socks_init, c); + if (fwd_listen_sync(c, &c->tcp.fwd_out, + PIF_SPLICE, IPPROTO_TCP) < 0) + return -1; } peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) && @@ -2842,7 +2879,7 @@ int tcp_init(struct ctx *c) tcp_info_size = tcp_probe_tcp_info(); #define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \ - STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ") + STRINGIFY(f_), tcp_info_cap(f_) ? 
"" : " not") dbg_tcpi(snd_wnd); dbg_tcpi(bytes_acked); dbg_tcpi(min_rtt); @@ -2852,74 +2889,59 @@ int tcp_init(struct ctx *c) } /** - * tcp_port_rebind() - Rebind ports to match forward maps - * @c: Execution context - * @outbound: True to remap outbound forwards, otherwise inbound - * - * Must be called in namespace context if @outbound is true. + * tcp_keepalive() - Send keepalives for connections which need it + * @c: Execution context */ -static void tcp_port_rebind(struct ctx *c, bool outbound) +static void tcp_keepalive(struct ctx *c, const struct timespec *now) { - const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map; - int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext; - unsigned port; + union flow *flow; - for (port = 0; port < NUM_PORTS; port++) { - if (!bitmap_isset(fmap, port)) { - if (socks[port][V4] >= 0) { - close(socks[port][V4]); - socks[port][V4] = -1; - } + if (now->tv_sec - c->tcp.keepalive_run < KEEPALIVE_INTERVAL) + return; - if (socks[port][V6] >= 0) { - close(socks[port][V6]); - socks[port][V6] = -1; - } + c->tcp.keepalive_run = now->tv_sec; - continue; - } + flow_foreach_of_type(flow, FLOW_TCP) { + struct tcp_tap_conn *conn = &flow->tcp; - if ((c->ifi4 && socks[port][V4] == -1) || - (c->ifi6 && socks[port][V6] == -1)) { - if (outbound) - tcp_ns_sock_init(c, port); - else - tcp_sock_init(c, PIF_HOST, NULL, NULL, port); + if (conn->tap_inactive) { + flow_dbg(conn, "No tap activity for least %us, send keepalive", + KEEPALIVE_INTERVAL); + tcp_send_flag(c, conn, KEEPALIVE); } + + /* Ready to check for next interval */ + conn->tap_inactive = true; } } /** - * tcp_port_rebind_outbound() - Rebind ports in namespace - * @arg: Execution context - * - * Called with NS_CALL() - * - * Return: 0 + * tcp_inactivity() - Scan for and close long-inactive connections + * @c: Execution context */ -static int tcp_port_rebind_outbound(void *arg) +static void tcp_inactivity(struct ctx *c, const struct timespec *now) { - 
struct ctx *c = (struct ctx *)arg; + union flow *flow; - ns_enter(c); - tcp_port_rebind(c, true); + if (now->tv_sec - c->tcp.inactivity_run < INACTIVITY_INTERVAL) + return; - return 0; -} + debug("TCP inactivity scan"); + c->tcp.inactivity_run = now->tv_sec; -/** - * tcp_port_rebind_all() - Rebind ports to match forward maps (in host & ns) - * @c: Execution context - */ -void tcp_port_rebind_all(struct ctx *c) -{ - ASSERT(c->mode == MODE_PASTA && !c->no_tcp); + flow_foreach_of_type(flow, FLOW_TCP) { + struct tcp_tap_conn *conn = &flow->tcp; - if (c->tcp.fwd_out.mode == FWD_AUTO) - NS_CALL(tcp_port_rebind_outbound, c); + if (conn->inactive) { + /* No activity in this interval, reset */ + flow_dbg(conn, "Inactive for at least %us, resetting", + INACTIVITY_INTERVAL); + tcp_rst(c, conn); + } - if (c->tcp.fwd_in.mode == FWD_AUTO) - tcp_port_rebind(c, false); + /* Ready to check for next interval */ + conn->inactive = true; + } } /** @@ -2927,13 +2949,14 @@ void tcp_port_rebind_all(struct ctx *c) * @c: Execution context * @now: Current timestamp */ -void tcp_timer(const struct ctx *c, const struct timespec *now) +void tcp_timer(struct ctx *c, const struct timespec *now) { - (void)now; - tcp_sock_refill_init(c); if (c->mode == MODE_PASTA) tcp_splice_refill(c); + + tcp_keepalive(c, now); + tcp_inactivity(c, now); } /** @@ -3420,7 +3443,7 @@ static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn, } /** - * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening + * tcp_flow_migrate_source() - Send data (flow table) for flow * @fd: Descriptor for state migration * @conn: Pointer to the TCP connection structure * @@ -3460,9 +3483,6 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) return rc; } - if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD)) - close(conn->listening_sock); - return 0; } @@ -3671,9 +3691,7 @@ static int tcp_flow_repair_connect(const struct ctx *c, return rc; } - flow_epollid_clear(&conn->f); 
conn->timer = -1; - conn->listening_sock = -1; return 0; } @@ -3731,14 +3749,19 @@ int tcp_flow_migrate_target(struct ctx *c, int fd) if ((rc = tcp_flow_repair_socket(c, conn))) { flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc)); - /* Can't leave the flow in an incomplete state */ - FLOW_ACTIVATE(conn); - return 0; + goto out; } + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); + if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->sock, + !TAPSIDE(conn))) + goto out; /* tcp_flow_migrate_target_ext() will clean this up */ + flow_hash_insert(c, TAP_SIDX(conn)); - FLOW_ACTIVATE(conn); +out: + /* Never leave the flow in an incomplete state */ + FLOW_ACTIVATE(conn); return 0; } @@ -3862,10 +3885,15 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd int v; v = TCP_SEND_QUEUE; - if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) + if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) { flow_perror(conn, "Selecting repair queue"); - else - shutdown(s, SHUT_WR); + } else { + if (shutdown(s, SHUT_WR) < 0) { + flow_perror(conn, + "Repair mode shutdown() failed"); + goto fail; + } + } } if (tcp_flow_repair_wnd(conn, &t)) @@ -3892,8 +3920,12 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to * TCP_FIN_WAIT1. */ - if (t.tcpi_state == TCP_FIN_WAIT1) - shutdown(s, SHUT_WR); + if (t.tcpi_state == TCP_FIN_WAIT1) { + if (shutdown(s, SHUT_WR) < 0) { + flow_perror(conn, "Post-repair shutdown() failed"); + goto fail; + } + } if (tcp_set_peek_offset(conn, peek_offset)) goto fail; @@ -3901,7 +3933,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd tcp_send_flag(c, conn, ACK); tcp_data_from_sock(c, conn); - if ((rc = tcp_epoll_ctl(c, conn))) { + if ((rc = tcp_epoll_ctl(conn))) { flow_dbg(conn, "Failed to subscribe to epoll for migrated socket: %s", strerror_(-rc)); |
