1 files changed, 364 insertions, 332 deletions
diff --git a/tcp.c b/tcp.c
index fa95f6b..a0d7cd8 100644
--- a/tcp.c
+++ b/tcp.c
@@ -190,22 +190,27 @@
  * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and
  *   RTO is less than this, re-initialise RTO to this for data retransmissions
  *
- * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
- *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
- *   the connection
+ * - RTT / 2 elapsed after data segment received from tap without having
+ *   sent an ACK segment, or zero-sized window advertised to tap/guest (flag
+ *   ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent.
  *
- * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN
- *   segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and
- *   TAP_FIN_ACKED), but no socket activity is detected from the socket within
- *   this time, reset the connection
+ *   RTT, here, is an approximation of the RTT value reported by the kernel via
+ *   TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to
+ *   RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly.
  *
- * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
- *   either side, the connection is reset
+ * We also use a global interval timer for an activity timeout which doesn't
+ * require precision:
  *
- * - ACK_INTERVAL elapsed after data segment received from tap without having
- *   sent an ACK segment, or zero-sized window advertised to tap/guest (flag
- *   ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
+ * - INACTIVITY_INTERVAL: if a connection has had no activity for an entire
+ *   interval, close and reset it.  This means that idle connections (without
+ *   keepalives) will be removed between INACTIVITY_INTERVAL s and
+ *   2*INACTIVITY_INTERVAL s after the last activity.
  *
+ * - KEEPALIVE_INTERVAL: if a connection has had no tap-side activity for an
+ *   entire interval, send a tap-side keepalive.  If the endpoint is no longer
+ *   aware of the connection (due to a reboot, or a kernel timeout in FIN_WAIT_2
+ *   state) that should trigger an RST, so we won't keep track of connections
+ *   that the guest endpoint no longer cares about.
  *
  * Summary of data flows (with ESTABLISHED event)
  * ----------------------------------------------
@@ -297,8 +302,6 @@
 #include "ip.h"
 #include "passt.h"
 #include "tap.h"
-#include "siphash.h"
-#include "pcap.h"
 #include "tcp_splice.h"
 #include "log.h"
 #include "inany.h"
@@ -341,15 +344,25 @@ enum {
 #define MSS_DEFAULT			536
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
 
-#define ACK_INTERVAL			10		/* ms */
 #define RTO_INIT			1		/* s, RFC 6298 */
 #define RTO_INIT_AFTER_SYN_RETRIES	3		/* s, RFC 6298 */
-#define FIN_TIMEOUT			60
-#define ACT_TIMEOUT			7200
+
+#define INACTIVITY_INTERVAL		7200		/* s */
+#define	KEEPALIVE_INTERVAL		30		/* s */
 
 #define LOW_RTT_TABLE_SIZE		8
 #define LOW_RTT_THRESHOLD		10 /* us */
 
+/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */
+#define SNDBUF_BOOST_BYTES_RTT_LO	2500 /* B * s: no boost until here */
+/* ...examples:  5 MB sent * 500 us RTT, 250 kB * 10 ms,  8 kB * 300 ms */
+#define SNDBUF_BOOST_FACTOR		150 /* % */
+#define SNDBUF_BOOST_BYTES_RTT_HI	6000 /* apply full boost factor */
+/*		12 MB sent * 500 us RTT, 600 kB * 10 ms, 20 kB * 300 ms */
+
+/* Ratio of buffer to bandwidth * delay product implying interactive traffic */
+#define SNDBUF_TO_BW_DELAY_INTERACTIVE	/* > */ 20 /* (i.e. < 5% of buffer) */
+
 #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */
 
 #define CONN_IS_CLOSING(conn)						\
@@ -401,10 +414,6 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
 	"ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED",
 };
 
-/* Listening sockets, used for automatic port forwarding in pasta mode only */
-static int tcp_sock_init_ext	[NUM_PORTS][IP_VERSIONS];
-static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
-
 /* Table of our guest side addresses with very low RTT (assumed to be local to
  * the host), LRU
  */
@@ -423,11 +432,13 @@ socklen_t tcp_info_size;
 	  sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
 
 /* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
-#define snd_wnd_cap	tcp_info_cap(snd_wnd)
+#define snd_wnd_cap		tcp_info_cap(snd_wnd)
 /* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
-#define bytes_acked_cap	tcp_info_cap(bytes_acked)
+#define bytes_acked_cap		tcp_info_cap(bytes_acked)
 /* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
-#define min_rtt_cap	tcp_info_cap(min_rtt)
+#define min_rtt_cap		tcp_info_cap(min_rtt)
+/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */
+#define delivery_rate_cap	tcp_info_cap(delivery_rate)
 
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
@@ -508,47 +519,30 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
 
 /**
  * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
- * @c:		Execution context
  * @conn:	Connection pointer
  *
  * Return: 0 on success, negative error code on failure (not on deletion)
  */
-static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_epoll_ctl(struct tcp_tap_conn *conn)
 {
-	int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
-	union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
-		                .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
-	struct epoll_event ev = { .data.u64 = ref.u64 };
-	int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
-					      : c->epollfd;
+	uint32_t events;
 
 	if (conn->events == CLOSED) {
-		if (flow_in_epoll(&conn->f))
-			epoll_del(epollfd, conn->sock);
+		int epollfd = flow_epollfd(&conn->f);
+
+		epoll_del(epollfd, conn->sock);
 		if (conn->timer != -1)
 			epoll_del(epollfd, conn->timer);
+
 		return 0;
 	}
 
-	ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
+	events = tcp_conn_epoll_events(conn->events, conn->flags);
 
-	if (epoll_ctl(epollfd, m, conn->sock, &ev))
+	if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, conn->sock,
+			   !TAPSIDE(conn)) < 0)
 		return -errno;
 
-	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
-
-	if (conn->timer != -1) {
-		union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
-					  .fd = conn->sock,
-					  .flow = FLOW_IDX(conn) };
-		struct epoll_event ev_t = { .data.u64 = ref_t.u64,
-					    .events = EPOLLIN | EPOLLET };
-
-		if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD,
-			      conn->timer, &ev_t))
-			return -errno;
-	}
-
 	return 0;
 }
 
@@ -556,8 +550,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
  * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
  * @c:		Execution context
  * @conn:	Connection pointer
- *
- * #syscalls timerfd_create timerfd_settime
+ * #syscalls timerfd_create timerfd_settime|timerfd_settime32
  */
 static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 {
@@ -567,34 +560,38 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		return;
 
 	if (conn->timer == -1) {
-		union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER,
-					.fd = conn->sock,
-					.flow = FLOW_IDX(conn) };
-		struct epoll_event ev = { .data.u64 = ref.u64,
-					  .events = EPOLLIN | EPOLLET };
-		int epollfd = flow_epollfd(&conn->f);
+		union epoll_ref ref;
 		int fd;
 
 		fd = timerfd_create(CLOCK_MONOTONIC, 0);
-		if (fd == -1 || fd > FD_REF_MAX) {
+		if (fd == -1) {
 			flow_dbg_perror(conn, "failed to get timer");
-			if (fd > -1)
-				close(fd);
-			conn->timer = -1;
 			return;
 		}
-		conn->timer = fd;
+		if (fd > FD_REF_MAX) {
+			flow_dbg(conn, "timer fd overflow (%d > %d)",
+				 fd, FD_REF_MAX);
+			close(fd);
+			return;
+		}
 
-		if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
-			flow_dbg_perror(conn, "failed to add timer");
-			close(conn->timer);
-			conn->timer = -1;
+		ref.type = EPOLL_TYPE_TCP_TIMER;
+		ref.flow = FLOW_IDX(conn);
+		ref.fd = fd;
+		if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET,
+			      ref) < 0) {
+			flow_dbg(conn, "failed to add timer");
+			close(fd);
 			return;
 		}
+
+		conn->timer = fd;
 	}
 
 	if (conn->flags & ACK_TO_TAP_DUE) {
-		it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
+		it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000);
+		it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) *
+				      1000;
 	} else if (conn->flags & ACK_FROM_TAP_DUE) {
 		int exp = conn->retries, timeout = RTO_INIT;
 		if (!(conn->events & ESTABLISHED))
@@ -603,15 +600,23 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 			timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES);
 		timeout <<= MAX(exp, 0);
 		it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max);
-	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
-		it.it_value.tv_sec = FIN_TIMEOUT;
 	} else {
-		it.it_value.tv_sec = ACT_TIMEOUT;
+		/* Disarm */
+		it.it_value.tv_sec = 0;
+		it.it_value.tv_nsec = 0;
 	}
 
-	flow_dbg(conn, "timer expires in %llu.%03llus",
-		 (unsigned long long)it.it_value.tv_sec,
-		 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+	if (conn->flags & ACK_TO_TAP_DUE) {
+		flow_trace(conn, "timer expires in %llu.%02llums",
+			   (unsigned long long)it.it_value.tv_sec * 1000 +
+			   it.it_value.tv_nsec / 1000 / 1000,
+			   (unsigned long long)it.it_value.tv_nsec
+			   / 1000 / 10 % 100);
+	} else {
+		flow_dbg(conn, "timer expires in %llu.%03llus",
+			 (unsigned long long)it.it_value.tv_sec,
+			 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+	}
 
 	if (timerfd_settime(conn->timer, 0, &it, NULL))
 		flow_perror(conn, "failed to set timer");
@@ -657,7 +662,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
 	}
 
 	if (flag == STALLED || flag == ~STALLED)
-		tcp_epoll_ctl(c, conn);
+		tcp_epoll_ctl(conn);
 
 	if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE		  ||
 	    (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
@@ -714,11 +719,8 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
 	} else {
 		if (event == CLOSED)
 			flow_hash_remove(c, TAP_SIDX(conn));
-		tcp_epoll_ctl(c, conn);
+		tcp_epoll_ctl(conn);
 	}
-
-	if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
-		tcp_timer_ctl(c, conn);
 }
 
 /**
@@ -774,7 +776,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
 }
 
 /**
- * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
+ * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.75 usage)
  * @conn:	Connection pointer
  */
 static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
@@ -789,11 +791,7 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
 		return;
 	}
 
-	v = sndbuf;
-	if (v >= SNDBUF_BIG)
-		v /= 2;
-	else if (v > SNDBUF_SMALL)
-		v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
+	v = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL, SNDBUF_BIG, 75);
 
 	SNDBUF_SET(conn, MIN(INT_MAX, v));
 }
@@ -940,7 +938,6 @@ static void tcp_fill_header(struct tcphdr *th,
  * tcp_fill_headers() - Fill 802.3, IP, TCP headers
  * @c:			Execution context
  * @conn:		Connection pointer
- * @taph:		tap backend specific header
  * @eh:		Pointer to Ethernet header
  * @ip4h:		Pointer to IPv4 header, or NULL
  * @ip6h:		Pointer to IPv6 header, or NULL
@@ -949,12 +946,15 @@ static void tcp_fill_header(struct tcphdr *th,
  * @ip4_check:		IPv4 checksum, if already known
  * @seq:		Sequence number for this segment
  * @no_tcp_csum:	Do not set TCP checksum
+ *
+ * Return: frame length (including L2 headers), at least ETH_ZLEN
  */
-void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
-		      struct tap_hdr *taph, struct ethhdr *eh,
-		      struct iphdr *ip4h, struct ipv6hdr *ip6h,
-		      struct tcphdr *th, struct iov_tail *payload,
-		      const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
+size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+			struct ethhdr *eh,
+			struct iphdr *ip4h, struct ipv6hdr *ip6h,
+			struct tcphdr *th, struct iov_tail *payload,
+			const uint16_t *ip4_check, uint32_t seq,
+			bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 	size_t l4len = iov_tail_size(payload) + sizeof(*th);
@@ -1020,7 +1020,36 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
 	else
 		tcp_update_csum(psum, th, payload);
 
-	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
+	return MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN);
+}
+
+/**
+ * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning
+ * @conn:	Connection pointer
+ * @tinfo:	tcp_info from kernel, must be pre-fetched
+ *
+ * Return: increased sending buffer to use as a limit for advertised window
+ */
+static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn,
+				      const struct tcp_info_linux *tinfo)
+{
+	unsigned long bytes_rtt_product;
+
+	if (!bytes_acked_cap)
+		return SNDBUF_GET(conn);
+
+	/* This is *not* a bandwidth-delay product, but it's somewhat related:
+	 * as we send more data (usually at the beginning of a connection), we
+	 * try to make the sending buffer progressively grow, with the RTT as a
+	 * factor (longer delay, bigger buffer needed).
+	 */
+	bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked *
+			    tinfo->tcpi_rtt / 1000 / 1000;
+
+	return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product,
+			     SNDBUF_BOOST_BYTES_RTT_LO,
+			     SNDBUF_BOOST_BYTES_RTT_HI,
+			     SNDBUF_BOOST_FACTOR);
 }
 
 /**
@@ -1031,6 +1060,8 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
  * @tinfo:	tcp_info from kernel, can be NULL if not pre-fetched
  *
  * Return: 1 if sequence or window were updated, 0 otherwise
+ *
+ * #syscalls ioctl
  */
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 			  bool force_seq, struct tcp_info_linux *tinfo)
@@ -1041,6 +1072,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 	socklen_t sl = sizeof(*tinfo);
 	struct tcp_info_linux tinfo_new;
 	uint32_t new_wnd_to_tap = prev_wnd_to_tap;
+	bool ack_everything = true;
 	int s = conn->sock;
 
 	/* At this point we could ack all the data we've accepted for forwarding
@@ -1050,7 +1082,8 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 	 * control behaviour.
 	 *
 	 * For it to be possible and worth it we need:
-	 *  - The TCP_INFO Linux extension which gives us the peer acked bytes
+	 *  - The TCP_INFO Linux extensions which give us the peer acked bytes
+	 *    and the delivery rate (outbound bandwidth at receiver)
 	 *  - Not to be told not to (force_seq)
 	 *  - Not half-closed in the peer->guest direction
 	 *      With no data coming from the peer, we might not get events which
@@ -1060,13 +1093,19 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 	 *      Data goes from socket to socket, with nothing meaningfully "in
 	 *      flight".
 	 *  - Not a pseudo-local connection (e.g. to a VM on the same host)
-	 *  - Large enough send buffer
-	 *      In these cases, there's not enough in flight to bother.
+	 *      If it is, there's not enough in flight to bother.
+	 *  - Sending buffer significantly larger than bandwidth * delay product
+	 *      Meaning we're not bandwidth-bound and this is likely to be
+	 *      interactive traffic where we want to preserve transparent
+	 *      connection behaviour and latency.
+	 *
+	 *      Otherwise, we probably want to maximise throughput, which needs
+	 *      sending buffer auto-tuning, triggered in turn by filling up the
+	 *      outbound socket queue.
 	 */
-	if (bytes_acked_cap && !force_seq &&
+	if (bytes_acked_cap && delivery_rate_cap && !force_seq &&
 	    !CONN_IS_CLOSING(conn) &&
-	    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn) &&
-	    (unsigned)SNDBUF_GET(conn) >= SNDBUF_SMALL) {
+	    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) {
 		if (!tinfo) {
 			tinfo = &tinfo_new;
 			if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
@@ -1075,14 +1114,24 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 
 		/* This trips a cppcheck bug in some versions, including
 		 * cppcheck 2.18.3.
-		 * https://sourceforge.net/p/cppcheck/discussion/general/thread/fecde59085/
+		 * https://trac.cppcheck.net/ticket/14191
 		 */
 		/* cppcheck-suppress [uninitvar,unmatchedSuppression] */
-		conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
-		                       conn->seq_init_from_tap;
-	} else {
+		if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt *
+						 tinfo->tcpi_delivery_rate /
+						 1000 / 1000 *
+						 SNDBUF_TO_BW_DELAY_INTERACTIVE)
+			ack_everything = false;
+	}
+
+	if (ack_everything) {
 		/* Fall back to acknowledging everything we got */
 		conn->seq_ack_to_tap = conn->seq_from_tap;
+	} else {
+		/* cppcheck bug 14191 again, see above */
+		/* cppcheck-suppress [uninitvar,unmatchedSuppression] */
+		conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
+		                       conn->seq_init_from_tap;
 	}
 
 	/* It's occasionally possible for us to go from using the fallback above
@@ -1113,9 +1162,54 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 	if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
 		new_wnd_to_tap = tinfo->tcpi_snd_wnd;
 	} else {
+		unsigned rtt_ms_ceiling = DIV_ROUND_UP(tinfo->tcpi_rtt, 1000);
+		uint32_t sendq;
+		int limit;
+
+		if (ioctl(s, SIOCOUTQ, &sendq)) {
+			debug_perror("SIOCOUTQ on socket %i, assuming 0", s);
+			sendq = 0;
+		}
 		tcp_get_sndbuf(conn);
-		new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
-				     SNDBUF_GET(conn));
+
+		if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */
+			limit = 0;
+		else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn))
+			limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq;
+		else
+			limit = SNDBUF_GET(conn) - (int)sendq;
+
+		/* If the sender uses mechanisms to prevent Silly Window
+		 * Syndrome (SWS, described in RFC 813 Section 3) it's critical
+		 * that, should the window ever become less than the MSS, we
+		 * advertise a new value once it increases again to be above it.
+		 *
+		 * The mechanism to avoid SWS in the kernel is, implicitly,
+		 * implemented by Nagle's algorithm (which was proposed after
+		 * RFC 813).
+		 *
+		 * To this end, for simplicity, approximate a window value below
+		 * the MSS to zero, as we already have mechanisms in place to
+		 * force updates after the window becomes zero. This matches the
+		 * suggestion from RFC 813, Section 4.
+		 *
+		 * But don't do this if, either:
+		 *
+		 * - there's nothing in the outbound queue: the size of the
+		 *   sending buffer is limiting us, and it won't increase if we
+		 *   don't send data, so there's no point in waiting, or
+		 *
+		 * - we haven't sent data in a while (somewhat arbitrarily, ten
+		 *   times the RTT), as that might indicate that the receiver
+		 *   will only process data in batches that are large enough,
+		 *   but we won't send enough to fill one because we're stuck
+		 *   with pending data in the outbound queue
+		 */
+		if (limit < MSS_GET(conn) && sendq &&
+		    tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10)
+			limit = 0;
+
+		new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit);
 	}
 
 	new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
@@ -1135,6 +1229,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 		conn_flag(c, conn, ACK_TO_TAP_DUE);
 
 out:
+	/* Opportunistically store RTT approximation on valid TCP_INFO data */
+	if (tinfo)
+		RTT_SET(conn, tinfo->tcpi_rtt);
+
 	return new_wnd_to_tap       != prev_wnd_to_tap ||
 	       conn->seq_ack_to_tap != prev_ack_to_tap;
 }
@@ -1256,7 +1354,8 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
 	th->fin = !!(flags & FIN);
 
 	if (th->ack) {
-		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
+		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
+		    conn->wnd_to_tap)
 			conn_flag(c, conn, ~ACK_TO_TAP_DUE);
 		else
 			conn_flag(c, conn, ACK_TO_TAP_DUE);
@@ -1290,7 +1389,34 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
 }
 
 /**
- * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
+ * tcp_sock_rst() - Close TCP connection forcing RST on socket side
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ */
+static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+	const struct linger linger0 = {
+		.l_onoff = 1,
+		.l_linger = 0,
+	};
+
+	/* Force RST on socket to inform the peer
+	 *
+	 * We do this by setting SO_LINGER with 0 timeout, which means that
+	 * close() will send an RST (unless the connection is already closed in
+	 * both directions).
+	 */
+	if (setsockopt(conn->sock, SOL_SOCKET,
+		       SO_LINGER, &linger0, sizeof(linger0)) < 0) {
+		flow_dbg_perror(conn,
+				"SO_LINGER failed, may not send RST to peer");
+	}
+
+	conn_event(c, conn, CLOSED);
+}
+
+/**
+ * tcp_rst_do() - Reset a tap connection: send RST segment on both sides, close
  * @c:		Execution context
  * @conn:	Connection pointer
  */
@@ -1299,8 +1425,10 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
 	if (conn->events == CLOSED)
 		return;
 
+	/* Send RST on tap */
 	tcp_send_flag(c, conn, RST);
-	conn_event(c, conn, CLOSED);
+
+	tcp_sock_rst(c, conn);
 }
 
 /**
@@ -1543,7 +1671,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 	ini = flow_initiate_af(flow, PIF_TAP,
 			       af, saddr, srcport, daddr, dstport);
 
-	if (!(tgt = flow_target(c, flow, IPPROTO_TCP)))
+	if (!(tgt = flow_target(c, flow, FWD_NO_HINT, IPPROTO_TCP)))
 		goto cancel;
 
 	if (flow->f.pif[TGTSIDE] != PIF_HOST) {
@@ -1592,7 +1720,11 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 
 	conn->sock = s;
 	conn->timer = -1;
-	conn->listening_sock = -1;
+	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, TGTSIDE) < 0) {
+		flow_perror(flow, "Can't register with epoll");
+		goto cancel;
+	}
 	conn_event(c, conn, TAP_SYN_RCVD);
 
 	conn->wnd_to_tap = WINDOW_DEFAULT;
@@ -1636,7 +1768,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 		conn_event(c, conn, TAP_SYN_ACK_SENT);
 	}
 
-	tcp_epoll_ctl(c, conn);
+	tcp_epoll_ctl(conn);
 
 	if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
 		socklen_t sl = sizeof(sa);
@@ -1771,7 +1903,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			return -1;
 
 		if (th->rst) {
-			conn_event(c, conn, CLOSED);
+			tcp_sock_rst(c, conn);
 			return 1;
 		}
 
@@ -1787,6 +1919,10 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			tcp_send_flag(c, conn, ACK);
 			tcp_timer_ctl(c, conn);
 
+			if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
+				       &((int){ 1 }), sizeof(int)))
+				flow_trace(conn, "failed to set SO_KEEPALIVE");
+
 			if (p->count == 1) {
 				tcp_tap_window_update(c, conn,
 						      ntohs(th->window));
@@ -1913,20 +2049,17 @@ eintr:
 			goto eintr;
 
 		if (errno == EAGAIN || errno == EWOULDBLOCK) {
-			tcp_send_flag(c, conn, ACK_IF_NEEDED);
+			tcp_send_flag(c, conn, ACK | DUP_ACK);
 			return p->count - idx;
 
 		}
 		return -1;
 	}
 
-	if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
+	if (n < (int)(seq_from_tap - conn->seq_from_tap))
 		partial_send = 1;
-		conn->seq_from_tap += n;
-		tcp_send_flag(c, conn, ACK_IF_NEEDED);
-	} else {
-		conn->seq_from_tap += n;
-	}
+
+	conn->seq_from_tap += n;
 
 out:
 	if (keep != -1 || partial_send) {
@@ -2134,10 +2267,13 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 	flow_trace(conn, "packet length %zu from tap", l4len);
 
 	if (th->rst) {
-		conn_event(c, conn, CLOSED);
+		tcp_sock_rst(c, conn);
 		return 1;
 	}
 
+	conn->inactive = false;
+	conn->tap_inactive = false;
+
 	if (th->ack && !(conn->events & ESTABLISHED))
 		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
 
@@ -2166,7 +2302,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		if (th->fin) {
 			conn->seq_from_tap++;
 
-			shutdown(conn->sock, SHUT_WR);
+			if (shutdown(conn->sock, SHUT_WR) < 0) {
+				flow_dbg_perror(conn, "shutdown() failed");
+				goto reset;
+			}
+
 			tcp_send_flag(c, conn, ACK);
 			conn_event(c, conn, SOCK_FIN_SENT);
 
@@ -2241,7 +2381,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		socklen_t sl;
 		struct tcp_info tinfo;
 
-		shutdown(conn->sock, SHUT_WR);
+		if (shutdown(conn->sock, SHUT_WR) < 0) {
+			flow_dbg_perror(conn, "shutdown() failed");
+			goto reset;
+		}
+
 		conn_event(c, conn, SOCK_FIN_SENT);
 		tcp_send_flag(c, conn, ACK);
 		ack_due = 0;
@@ -2315,6 +2459,15 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
 	conn->sock = s;
 	conn->timer = -1;
 	conn->ws_to_tap = conn->ws_from_tap = 0;
+
+	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, INISIDE) < 0) {
+		flow_perror(flow, "Can't register with epoll");
+		conn_flag(c, conn, CLOSING);
+		FLOW_ACTIVATE(conn);
+		return;
+	}
+
 	conn_event(c, conn, SOCK_ACCEPTED);
 
 	hash = flow_hash_insert(c, TAP_SIDX(conn));
@@ -2341,7 +2494,6 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
 void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			const struct timespec *now)
 {
-	struct tcp_tap_conn *conn;
 	union sockaddr_inany sa;
 	socklen_t sl = sizeof(sa);
 	struct flowside *ini;
@@ -2357,17 +2509,14 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 	if (s < 0)
 		goto cancel;
 
-	conn = (struct tcp_tap_conn *)flow;
-	conn->listening_sock = ref.fd;
-
 	tcp_sock_set_nodelay(s);
 
 	/* FIXME: If useful: when the listening port has a specific bound
 	 * address, record that as our address, as implemented for vhost-user
 	 * mode only, below.
 	 */
-	ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
-			       NULL, ref.tcp_listen.port);
+	ini = flow_initiate_sa(flow, ref.listen.pif, &sa,
+			       NULL, ref.listen.port);
 
 	if (getsockname(s, &sa.sa, &sl) ||
 	    inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0)
@@ -2381,7 +2530,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 		goto cancel;
 	}
 
-	if (!flow_target(c, flow, IPPROTO_TCP))
+	if (!flow_target(c, flow, ref.listen.rule, IPPROTO_TCP))
 		goto cancel;
 
 	switch (flow->f.pif[TGTSIDE]) {
@@ -2412,7 +2561,9 @@ cancel:
  * @c:		Execution context
  * @ref:	epoll reference of timer (not connection)
  *
- * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
+ * #syscalls timerfd_gettime|timerfd_gettime64
+ * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64
+ * #syscalls arm:timerfd_settime64 i686:timerfd_settime64
  */
 void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 {
@@ -2450,9 +2601,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 				conn_flag(c, conn, SYN_RETRIED);
 				tcp_timer_ctl(c, conn);
 			}
-		} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
-			flow_dbg(conn, "FIN timeout");
-			tcp_rst(c, conn);
 		} else if (conn->retries == TCP_MAX_RETRIES) {
 			flow_dbg(conn, "retransmissions count exceeded");
 			tcp_rst(c, conn);
@@ -2469,23 +2617,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 			tcp_data_from_sock(c, conn);
 			tcp_timer_ctl(c, conn);
 		}
-	} else {
-		struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
-		struct itimerspec old = { { 0 }, { 0 } };
-
-		/* Activity timeout: if it was already set, reset the
-		 * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE
-		 * or ACK_FROM_TAP_DUE, so just set the long timeout in that
-		 * case. This avoids having to preemptively reset the timer on
-		 * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
-		 */
-		if (timerfd_settime(conn->timer, 0, &new, &old))
-			flow_perror(conn, "failed to set timer");
-
-		if (old.it_value.tv_sec == ACT_TIMEOUT) {
-			flow_dbg(conn, "activity timeout");
-			tcp_rst(c, conn);
-		}
 	}
 }
 
@@ -2511,6 +2642,8 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 		return;
 	}
 
+	conn->inactive = false;
+
 	if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
 		conn_event(c, conn, CLOSED);
 		return;
@@ -2552,65 +2685,18 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 }
 
 /**
- * tcp_sock_init_one() - Initialise listening socket for address and port
- * @c:		Execution context
- * @pif:	Interface to open the socket for (PIF_HOST or PIF_SPLICE)
- * @addr:	Pointer to address for binding, NULL for dual stack any
- * @ifname:	Name of interface to bind to, NULL if not configured
- * @port:	Port, host order
- *
- * Return: fd for the new listening socket, negative error code on failure
- *
- * If pif == PIF_SPLICE, the caller must have already entered the guest ns.
- */
-static int tcp_sock_init_one(const struct ctx *c, uint8_t pif,
-			     const union inany_addr *addr, const char *ifname,
-			     in_port_t port)
-{
-	union tcp_listen_epoll_ref tref = {
-		.port = port,
-		.pif = pif,
-	};
-	const struct fwd_ports *fwd;
-	int s;
-
-	if (pif == PIF_HOST)
-		fwd = &c->tcp.fwd_in;
-	else
-		fwd = &c->tcp.fwd_out;
-
-	s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname,
-			port, tref.u32);
-
-	if (fwd->mode == FWD_AUTO) {
-		int (*socks)[IP_VERSIONS] = pif == PIF_SPLICE ?
-			tcp_sock_ns : tcp_sock_init_ext;
-
-		if (!addr || inany_v4(addr))
-			socks[port][V4] = s < 0 ? -1 : s;
-		if (!addr || !inany_v4(addr))
-			socks[port][V6] = s < 0 ? -1 : s;
-	}
-
-	if (s < 0)
-		return s;
-
-	return s;
-}
-
-/**
- * tcp_sock_init() - Create listening socket for a given host ("inbound") port
+ * tcp_listen() - Create listening socket
  * @c:		Execution context
  * @pif:	Interface to open the socket for (PIF_HOST or PIF_SPLICE)
- * @addr:	Pointer to address for binding, NULL if not configured
- * @ifname:	Name of interface to bind to, NULL if not configured
+ * @rule:	Index of relevant forwarding rule
+ * @addr:	Pointer to address for binding, NULL for any
+ * @ifname:	Name of interface to bind to, NULL for any
  * @port:	Port, host order
  *
- * Return: 0 on success, negative error code on failure
+ * Return: socket fd on success, negative error code on failure
  */
-int tcp_sock_init(const struct ctx *c, uint8_t pif,
-		  const union inany_addr *addr, const char *ifname,
-		  in_port_t port)
+int tcp_listen(const struct ctx *c, uint8_t pif, unsigned rule,
+	       const union inany_addr *addr, const char *ifname, in_port_t port)
 {
 	int s;
 
@@ -2621,69 +2707,19 @@ int tcp_sock_init(const struct ctx *c, uint8_t pif,
 			/* Restrict to v6 only */
 			addr = &inany_any6;
 		else if (inany_v4(addr))
-			/* Nothing to do */
-			return 0;
+			return -EAFNOSUPPORT;
 	}
 	if (!c->ifi6) {
 		if (!addr)
 			/* Restrict to v4 only */
 			addr = &inany_any4;
 		else if (!inany_v4(addr))
-			/* Nothing to do */
-			return 0;
-	}
-
-	s = tcp_sock_init_one(c, pif, addr, ifname, port);
-	if (s < 0)
-		return s;
-	if (s > FD_REF_MAX)
-		return -EIO;
-
-	return 0;
-}
-
-/**
- * tcp_ns_sock_init() - Init socket to listen for spliced outbound connections
- * @c:		Execution context
- * @port:	Port, host order
- */
-static void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
-{
-	ASSERT(!c->no_tcp);
-
-	if (!c->no_bindtodevice) {
-		tcp_sock_init(c, PIF_SPLICE, NULL, "lo", port);
-		return;
+			return -EAFNOSUPPORT;
 	}
 
-	if (c->ifi4)
-		tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback4, NULL, port);
-	if (c->ifi6)
-		tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback6, NULL, port);
-}
-
-/**
- * tcp_ns_socks_init() - Bind sockets in namespace for outbound connections
- * @arg:	Execution context
- *
- * Return: 0
- */
-/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
-static int tcp_ns_socks_init(void *arg)
-{
-	const struct ctx *c = (const struct ctx *)arg;
-	unsigned port;
-
-	ns_enter(c);
-
-	for (port = 0; port < NUM_PORTS; port++) {
-		if (!bitmap_isset(c->tcp.fwd_out.map, port))
-			continue;
+	s = pif_listen(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname, port, rule);
 
-		tcp_ns_sock_init(c, port);
-	}
-
-	return 0;
+	return s;
 }
 
 /**
@@ -2812,7 +2848,7 @@ static void tcp_get_rto_params(struct ctx *c)
  * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
  * @c:		Execution context
  *
- * Return: 0, doesn't return on failure
+ * Return: 0 on success, -1 on failure
  */
 int tcp_init(struct ctx *c)
 {
@@ -2824,15 +2860,16 @@ int tcp_init(struct ctx *c)
 
 	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
 	memset(init_sock_pool6,		0xff,	sizeof(init_sock_pool6));
-	memset(tcp_sock_init_ext,	0xff,	sizeof(tcp_sock_init_ext));
-	memset(tcp_sock_ns,		0xff,	sizeof(tcp_sock_ns));
 
 	tcp_sock_refill_init(c);
 
+	if (fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP) < 0)
+		return -1;
 	if (c->mode == MODE_PASTA) {
 		tcp_splice_init(c);
-
-		NS_CALL(tcp_ns_socks_init, c);
+		if (fwd_listen_sync(c, &c->tcp.fwd_out,
+				    PIF_SPLICE, IPPROTO_TCP) < 0)
+			return -1;
 	}
 
 	peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) &&
@@ -2842,7 +2879,7 @@ int tcp_init(struct ctx *c)
 	tcp_info_size = tcp_probe_tcp_info();
 
 #define dbg_tcpi(f_)	debug("TCP_INFO tcpi_%s field%s supported",	\
-			      STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+			      STRINGIFY(f_), tcp_info_cap(f_) ? "" : " not")
 	dbg_tcpi(snd_wnd);
 	dbg_tcpi(bytes_acked);
 	dbg_tcpi(min_rtt);
@@ -2852,74 +2889,59 @@ int tcp_init(struct ctx *c)
 }
 
 /**
- * tcp_port_rebind() - Rebind ports to match forward maps
- * @c:		Execution context
- * @outbound:	True to remap outbound forwards, otherwise inbound
- *
- * Must be called in namespace context if @outbound is true.
+ * tcp_keepalive() - Send keepalives for connections which need it
+ * @c:		Execution context
  */
-static void tcp_port_rebind(struct ctx *c, bool outbound)
+static void tcp_keepalive(struct ctx *c, const struct timespec *now)
 {
-	const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map;
-	int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext;
-	unsigned port;
+	union flow *flow;
 
-	for (port = 0; port < NUM_PORTS; port++) {
-		if (!bitmap_isset(fmap, port)) {
-			if (socks[port][V4] >= 0) {
-				close(socks[port][V4]);
-				socks[port][V4] = -1;
-			}
+	if (now->tv_sec - c->tcp.keepalive_run < KEEPALIVE_INTERVAL)
+		return;
 
-			if (socks[port][V6] >= 0) {
-				close(socks[port][V6]);
-				socks[port][V6] = -1;
-			}
+	c->tcp.keepalive_run = now->tv_sec;
 
-			continue;
-		}
+	flow_foreach_of_type(flow, FLOW_TCP) {
+		struct tcp_tap_conn *conn = &flow->tcp;
 
-		if ((c->ifi4 && socks[port][V4] == -1) ||
-		    (c->ifi6 && socks[port][V6] == -1)) {
-			if (outbound)
-				tcp_ns_sock_init(c, port);
-			else
-				tcp_sock_init(c, PIF_HOST, NULL, NULL, port);
+		if (conn->tap_inactive) {
+			flow_dbg(conn, "No tap activity for least %us, send keepalive",
+				 KEEPALIVE_INTERVAL);
+			tcp_send_flag(c, conn, KEEPALIVE);
 		}
+
+		/* Ready to check for next interval */
+		conn->tap_inactive = true;
 	}
 }
 
 /**
- * tcp_port_rebind_outbound() - Rebind ports in namespace
- * @arg:	Execution context
- *
- * Called with NS_CALL()
- *
- * Return: 0
+ * tcp_inactivity() - Scan for and close long-inactive connections
+ * @c:		Execution context
  */
-static int tcp_port_rebind_outbound(void *arg)
+static void tcp_inactivity(struct ctx *c, const struct timespec *now)
 {
-	struct ctx *c = (struct ctx *)arg;
+	union flow *flow;
 
-	ns_enter(c);
-	tcp_port_rebind(c, true);
+	if (now->tv_sec - c->tcp.inactivity_run < INACTIVITY_INTERVAL)
+		return;
 
-	return 0;
-}
+	debug("TCP inactivity scan");
+	c->tcp.inactivity_run = now->tv_sec;
 
-/**
- * tcp_port_rebind_all() - Rebind ports to match forward maps (in host & ns)
- * @c:		Execution context
- */
-void tcp_port_rebind_all(struct ctx *c)
-{
-	ASSERT(c->mode == MODE_PASTA && !c->no_tcp);
+	flow_foreach_of_type(flow, FLOW_TCP) {
+		struct tcp_tap_conn *conn = &flow->tcp;
 
-	if (c->tcp.fwd_out.mode == FWD_AUTO)
-		NS_CALL(tcp_port_rebind_outbound, c);
+		if (conn->inactive) {
+			/* No activity in this interval, reset */
+			flow_dbg(conn, "Inactive for at least %us, resetting",
+				 INACTIVITY_INTERVAL);
+			tcp_rst(c, conn);
+		}
 
-	if (c->tcp.fwd_in.mode == FWD_AUTO)
-		tcp_port_rebind(c, false);
+		/* Ready to check for next interval */
+		conn->inactive = true;
+	}
 }
 
 /**
@@ -2927,13 +2949,14 @@ void tcp_port_rebind_all(struct ctx *c)
  * @c:		Execution context
  * @now:	Current timestamp
  */
-void tcp_timer(const struct ctx *c, const struct timespec *now)
+void tcp_timer(struct ctx *c, const struct timespec *now)
 {
-	(void)now;
-
 	tcp_sock_refill_init(c);
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
+
+	tcp_keepalive(c, now);
+	tcp_inactivity(c, now);
 }
 
 /**
@@ -3420,7 +3443,7 @@ static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
 }
 
 /**
- * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening
+ * tcp_flow_migrate_source() - Send data (flow table) for flow
  * @fd:		Descriptor for state migration
  * @conn:	Pointer to the TCP connection structure
  *
@@ -3460,9 +3483,6 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
 		return rc;
 	}
 
-	if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD))
-		close(conn->listening_sock);
-
 	return 0;
 }
 
@@ -3671,9 +3691,7 @@ static int tcp_flow_repair_connect(const struct ctx *c,
 		return rc;
 	}
 
-	flow_epollid_clear(&conn->f);
 	conn->timer = -1;
-	conn->listening_sock = -1;
 
 	return 0;
 }
@@ -3731,14 +3749,19 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
 
 	if ((rc = tcp_flow_repair_socket(c, conn))) {
 		flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
-		/* Can't leave the flow in an incomplete state */
-		FLOW_ACTIVATE(conn);
-		return 0;
+		goto out;
 	}
 
+	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->sock,
+			   !TAPSIDE(conn)))
+		goto out; /* tcp_flow_migrate_target_ext() will clean this up */
+
 	flow_hash_insert(c, TAP_SIDX(conn));
-	FLOW_ACTIVATE(conn);
 
+out:
+	/* Never leave the flow in an incomplete state */
+	FLOW_ACTIVATE(conn);
 	return 0;
 }
 
@@ -3862,10 +3885,15 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 		int v;
 
 		v = TCP_SEND_QUEUE;
-		if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
+		if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) {
 			flow_perror(conn, "Selecting repair queue");
-		else
-			shutdown(s, SHUT_WR);
+		} else {
+			if (shutdown(s, SHUT_WR) < 0) {
+				flow_perror(conn,
+					    "Repair mode shutdown() failed");
+				goto fail;
+			}
+		}
 	}
 
 	if (tcp_flow_repair_wnd(conn, &t))
@@ -3892,8 +3920,12 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	 * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to
 	 * TCP_FIN_WAIT1.
 	 */
-	if (t.tcpi_state == TCP_FIN_WAIT1)
-		shutdown(s, SHUT_WR);
+	if (t.tcpi_state == TCP_FIN_WAIT1) {
+		if (shutdown(s, SHUT_WR) < 0) {
+			flow_perror(conn, "Post-repair shutdown() failed");
+			goto fail;
+		}
+	}
 
 	if (tcp_set_peek_offset(conn, peek_offset))
 		goto fail;
@@ -3901,7 +3933,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	tcp_send_flag(c, conn, ACK);
 	tcp_data_from_sock(c, conn);
 
-	if ((rc = tcp_epoll_ctl(c, conn))) {
+	if ((rc = tcp_epoll_ctl(conn))) {
 		flow_dbg(conn,
 			 "Failed to subscribe to epoll for migrated socket: %s",
 			 strerror_(-rc));