1 files changed, 2136 insertions, 685 deletions
diff --git a/tcp.c b/tcp.c
index c0820ce..d6a9ba2 100644
--- a/tcp.c
+++ b/tcp.c
@@ -179,31 +179,38 @@
  *
  * Timeouts are implemented by means of timerfd timers, set based on flags:
  *
- * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag
- *   ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the
- *   connection
- *
- * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
- *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
- *   socket and reset sequence to what was acknowledged. If this persists for
- *   more than TCP_MAX_RETRANS times in a row, reset the connection
- *
- * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
- *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
- *   the connection
+ * - RTO_INIT: if no ACK segment was received from tap/guest, either during
+ *   handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
+ *   sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
+ *   from the socket and reset sequence to what was acknowledged. This is the
+ *   timeout for the first retry, in seconds. Retry TCP_MAX_RETRIES times for
+ *   established connections, or (syn_retries + syn_linear_timeouts) times
+ *   during the handshake, then reset the connection
+ *
+ * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and
+ *   RTO is less than this, re-initialise RTO to this for data retransmissions
+ *
+ * - RTT / 2 elapsed after data segment received from tap without having
+ *   sent an ACK segment, or zero-sized window advertised to tap/guest (flag
+ *   ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent.
  *
- * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN
- *   segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and
- *   TAP_FIN_ACKED), but no socket activity is detected from the socket within
- *   this time, reset the connection
+ *   RTT, here, is an approximation of the RTT value reported by the kernel via
+ *   TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to
+ *   RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly.
  *
- * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
- *   either side, the connection is reset
+ * We also use a global interval timer for an activity timeout which doesn't
+ * require precision:
  *
- * - ACK_INTERVAL elapsed after data segment received from tap without having
- *   sent an ACK segment, or zero-sized window advertised to tap/guest (flag
- *   ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
+ * - INACTIVITY_INTERVAL: if a connection has had no activity for an entire
+ *   interval, close and reset it.  This means that idle connections (without
+ *   keepalives) will be removed between INACTIVITY_INTERVAL s and
+ *   2*INACTIVITY_INTERVAL s after the last activity.
  *
+ * - KEEPALIVE_INTERVAL: if a connection has had no tap-side activity for an
+ *   entire interval, send a tap-side keepalive.  If the endpoint is no longer
+ *   aware of the connection (due to a reboot, or a kernel timeout in FIN_WAIT_2
+ *   state) that should trigger an RST, so we won't keep track of connections
+ *   that the guest endpoint no longer cares about.
  *
  * Summary of data flows (with ESTABLISHED event)
  * ----------------------------------------------
@@ -274,11 +281,12 @@
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
+#include <netinet/tcp.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <string.h>
-#include <sys/epoll.h>
+#include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/timerfd.h>
 #include <sys/types.h>
@@ -286,7 +294,8 @@
 #include <time.h>
 #include <arpa/inet.h>
 
-#include <linux/tcp.h> /* For struct tcp_info */
+#include <linux/sockios.h>
+#include <linux/sock_diag.h>
 
 #include "checksum.h"
 #include "util.h"
@@ -294,39 +303,60 @@
 #include "ip.h"
 #include "passt.h"
 #include "tap.h"
-#include "siphash.h"
-#include "pcap.h"
 #include "tcp_splice.h"
 #include "log.h"
 #include "inany.h"
 #include "flow.h"
+#include "repair.h"
+#include "linux_dep.h"
+#include "serialise.h"
 
 #include "flow_table.h"
 #include "tcp_internal.h"
 #include "tcp_buf.h"
+#include "tcp_vu.h"
+#include "epoll_ctl.h"
+
+/*
+ * The size of TCP header (including options) is given by doff (Data Offset)
+ * that is a 4-bit value specifying the number of 32-bit words in the header.
+ * The maximum value of doff is 15 [(1 << 4) - 1].
+ * The maximum length in bytes of options is 15 minus the number of 32-bit
+ * words in the minimal TCP header (5) multiplied by the length of a 32-bit
+ * word (4).
+ */
+#define OPTLEN_MAX (((1UL << 4) - 1 - 5) * 4UL)
+
+#ifndef __USE_MISC
+/* From Linux UAPI, missing in netinet/tcp.h provided by musl */
+struct tcp_repair_opt {
+	__u32	opt_code;
+	__u32	opt_val;
+};
+
+enum {
+	TCP_NO_QUEUE,
+	TCP_RECV_QUEUE,
+	TCP_SEND_QUEUE,
+	TCP_QUEUES_NR,
+};
+#endif
 
 /* MSS rounding: see SET_MSS() */
 #define MSS_DEFAULT			536
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
-#ifdef HAS_SND_WND
-# define KERNEL_REPORTS_SND_WND(c)	((c)->tcp.kernel_snd_wnd)
-#else
-# define KERNEL_REPORTS_SND_WND(c)	(0 && (c))
-#endif
 
-#define ACK_INTERVAL			10		/* ms */
-#define SYN_TIMEOUT			10		/* s */
-#define ACK_TIMEOUT			2
-#define FIN_TIMEOUT			60
-#define ACT_TIMEOUT			7200
+#define RTO_INIT			1		/* s, RFC 6298 */
+#define RTO_INIT_AFTER_SYN_RETRIES	3		/* s, RFC 6298 */
+
+#define INACTIVITY_INTERVAL		7200		/* s */
+#define	KEEPALIVE_INTERVAL		30		/* s */
 
 #define LOW_RTT_TABLE_SIZE		8
 #define LOW_RTT_THRESHOLD		10 /* us */
 
-/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
- * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
- */
-#define SOL_TCP				IPPROTO_TCP
+/* Ratio of buffer to bandwidth * delay product implying interactive traffic */
+#define SNDBUF_TO_BW_DELAY_INTERACTIVE	/* > */ 20 /* (i.e. < 5% of buffer) */
 
 #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */
 
@@ -335,6 +365,28 @@
 	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
 #define CONN_HAS(conn, set)	(((conn)->events & (set)) == (set))
 
+/* Buffers to migrate pending data from send and receive queues. No, they don't
+ * use memory if we don't use them. And we're going away after this, so splurge.
+ */
+#define TCP_MIGRATE_SND_QUEUE_MAX	(64 << 20)
+#define TCP_MIGRATE_RCV_QUEUE_MAX	(64 << 20)
+static uint8_t tcp_migrate_snd_queue	[TCP_MIGRATE_SND_QUEUE_MAX];
+static uint8_t tcp_migrate_rcv_queue	[TCP_MIGRATE_RCV_QUEUE_MAX];
+
+#define TCP_MIGRATE_RESTORE_CHUNK_MIN	1024 /* Try smaller when above this */
+
+#define SYN_RETRIES		"/proc/sys/net/ipv4/tcp_syn_retries"
+#define SYN_LINEAR_TIMEOUTS	"/proc/sys/net/ipv4/tcp_syn_linear_timeouts"
+#define RTO_MAX_MS		"/proc/sys/net/ipv4/tcp_rto_max_ms"
+
+#define SYN_RETRIES_DEFAULT		6
+#define SYN_LINEAR_TIMEOUTS_DEFAULT	4
+#define RTO_MAX_DEFAULT		120 /* s */
+#define MAX_SYNCNT			127 /* derived from kernel's limit */
+
+/* "Extended" data (not stored in the flow table) for TCP flow migration */
+static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX];
+
 static const char *tcp_event_str[] __attribute((__unused__)) = {
 	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
 
@@ -347,30 +399,42 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
 	"SYN_RCVD",	/* approximately maps to TAP_SYN_ACK_SENT */
 
 	/* Passive close: */
-	"CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK",
+	"CLOSE_WAIT", "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK",
 	/* Active close (+5): */
 	"CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT",
 };
 
 static const char *tcp_flag_str[] __attribute((__unused__)) = {
 	"STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE",
-	"ACK_FROM_TAP_DUE",
+	"ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED",
 };
 
-/* Listening sockets, used for automatic port forwarding in pasta mode only */
-static int tcp_sock_init_ext	[NUM_PORTS][IP_VERSIONS];
-static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
-
-/* Table of guest side forwarding addresses with very low RTT (assumed
- * to be local to the host), LRU
+/* Table of our guest side addresses with very low RTT (assumed to be local to
+ * the host), LRU
  */
 static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
-char		tcp_buf_discard		[MAX_WINDOW];
+char		tcp_buf_discard		[BUF_DISCARD_SIZE];
 
 /* Does the kernel support TCP_PEEK_OFF? */
 bool peek_offset_cap;
 
+/* Size of data returned by TCP_INFO getsockopt() */
+static socklen_t tcp_info_size;
+
+#define tcp_info_cap(f_)						\
+	((offsetof(struct tcp_info_linux, tcpi_##f_) +			\
+	  sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
+
+/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
+#define snd_wnd_cap		tcp_info_cap(snd_wnd)
+/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
+#define bytes_acked_cap		tcp_info_cap(bytes_acked)
+/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
+#define min_rtt_cap		tcp_info_cap(min_rtt)
+/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */
+#define delivery_rate_cap	tcp_info_cap(delivery_rate)
+
 /* sendmsg() to socket */
 static struct iovec	tcp_iov			[UIO_MAXIOV];
 
@@ -392,24 +456,25 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx)
 	if (!flow)
 		return NULL;
 
-	ASSERT(flow->f.type == FLOW_TCP);
+	assert(flow->f.type == FLOW_TCP);
 	return &flow->tcp;
 }
 
 /**
- * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
- * @s:          Socket to update
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported
+ * @conn:	Pointer to the TCP connection structure
  * @offset:     Offset in bytes
  *
- * Return:      -1 when it fails, 0 otherwise.
+ * Return: -1 when it fails, 0 otherwise.
  */
-int tcp_set_peek_offset(int s, int offset)
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset)
 {
 	if (!peek_offset_cap)
 		return 0;
 
-	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) {
-		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+	if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF,
+		       &offset, sizeof(offset))) {
+		flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset);
 		return -1;
 	}
 	return 0;
@@ -418,7 +483,7 @@ int tcp_set_peek_offset(int s, int offset)
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
  * @events:	Current connection events
- * @conn_flags	Connection flags
+ * @conn_flags:	Connection flags
  *
  * Return: epoll events mask corresponding to implied connection state
  */
@@ -431,8 +496,12 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
 		if (events & TAP_FIN_SENT)
 			return EPOLLET;
 
-		if (conn_flags & STALLED)
-			return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
+		if (conn_flags & STALLED) {
+			if (conn_flags & ACK_FROM_TAP_BLOCKS)
+				return EPOLLRDHUP | EPOLLET;
+
+			return EPOLLIN | EPOLLRDHUP | EPOLLET;
+		}
 
 		return EPOLLIN | EPOLLRDHUP;
 	}
@@ -440,49 +509,35 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
 	if (events == TAP_SYN_RCVD)
 		return EPOLLOUT | EPOLLET | EPOLLRDHUP;
 
-	return EPOLLRDHUP;
+	return EPOLLET | EPOLLRDHUP;
 }
 
 /**
  * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
- * @c:		Execution context
  * @conn:	Connection pointer
  *
  * Return: 0 on success, negative error code on failure (not on deletion)
  */
-static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_epoll_ctl(struct tcp_tap_conn *conn)
 {
-	int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
-	union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
-		                .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
-	struct epoll_event ev = { .data.u64 = ref.u64 };
+	uint32_t events;
 
 	if (conn->events == CLOSED) {
-		if (conn->in_epoll)
-			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+		int epollfd = flow_epollfd(&conn->f);
+
+		epoll_del(epollfd, conn->sock);
 		if (conn->timer != -1)
-			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
+			epoll_del(epollfd, conn->timer);
+
 		return 0;
 	}
 
-	ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
+	events = tcp_conn_epoll_events(conn->events, conn->flags);
 
-	if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
+	if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, conn->sock,
+			   !TAPSIDE(conn)) < 0)
 		return -errno;
 
-	conn->in_epoll = true;
-
-	if (conn->timer != -1) {
-		union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
-					  .fd = conn->sock,
-					  .flow = FLOW_IDX(conn) };
-		struct epoll_event ev_t = { .data.u64 = ref_t.u64,
-					    .events = EPOLLIN | EPOLLET };
-
-		if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
-			return -errno;
-	}
-
 	return 0;
 }
 
@@ -490,8 +545,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
  * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
  * @c:		Execution context
  * @conn:	Connection pointer
- *
- * #syscalls timerfd_create timerfd_settime
+ * #syscalls timerfd_create timerfd_settime|timerfd_settime32
  */
 static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 {
@@ -501,51 +555,66 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		return;
 
 	if (conn->timer == -1) {
-		union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER,
-					.fd = conn->sock,
-					.flow = FLOW_IDX(conn) };
-		struct epoll_event ev = { .data.u64 = ref.u64,
-					  .events = EPOLLIN | EPOLLET };
+		union epoll_ref ref;
 		int fd;
 
 		fd = timerfd_create(CLOCK_MONOTONIC, 0);
-		if (fd == -1 || fd > FD_REF_MAX) {
-			flow_dbg(conn, "failed to get timer: %s",
-				 strerror(errno));
-			if (fd > -1)
-				close(fd);
-			conn->timer = -1;
+		if (fd == -1) {
+			flow_dbg_perror(conn, "failed to get timer");
+			return;
+		}
+		if (fd > FD_REF_MAX) {
+			flow_dbg(conn, "timer fd overflow (%d > %d)",
+				 fd, FD_REF_MAX);
+			close(fd);
 			return;
 		}
-		conn->timer = fd;
 
-		if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
-			flow_dbg(conn, "failed to add timer: %s",
-				 strerror(errno));
-			close(conn->timer);
-			conn->timer = -1;
+		ref.type = EPOLL_TYPE_TCP_TIMER;
+		ref.flow = FLOW_IDX(conn);
+		ref.fd = fd;
+		if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET,
+			      ref) < 0) {
+			flow_dbg(conn, "failed to add timer");
+			close(fd);
 			return;
 		}
+
+		conn->timer = fd;
 	}
 
 	if (conn->flags & ACK_TO_TAP_DUE) {
-		it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
+		it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000);
+		it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) *
+				      1000;
 	} else if (conn->flags & ACK_FROM_TAP_DUE) {
+		int exp = conn->retries, timeout = RTO_INIT;
 		if (!(conn->events & ESTABLISHED))
-			it.it_value.tv_sec = SYN_TIMEOUT;
-		else
-			it.it_value.tv_sec = ACK_TIMEOUT;
-	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
-		it.it_value.tv_sec = FIN_TIMEOUT;
+			exp -= c->tcp.syn_linear_timeouts;
+		else if (conn->flags & SYN_RETRIED)
+			timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES);
+		timeout <<= MAX(exp, 0);
+		it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max);
 	} else {
-		it.it_value.tv_sec = ACT_TIMEOUT;
+		/* Disarm */
+		it.it_value.tv_sec = 0;
+		it.it_value.tv_nsec = 0;
 	}
 
-	flow_dbg(conn, "timer expires in %llu.%03llus",
-		 (unsigned long long)it.it_value.tv_sec,
-		 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+	if (conn->flags & ACK_TO_TAP_DUE) {
+		flow_trace(conn, "timer expires in %llu.%02llums",
+			   (unsigned long long)it.it_value.tv_sec * 1000 +
+			   it.it_value.tv_nsec / 1000 / 1000,
+			   (unsigned long long)it.it_value.tv_nsec
+			   / 1000 / 10 % 100);
+	} else {
+		flow_dbg(conn, "timer expires in %llu.%03llus",
+			 (unsigned long long)it.it_value.tv_sec,
+			 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+	}
 
-	timerfd_settime(conn->timer, 0, &it, NULL);
+	if (timerfd_settime(conn->timer, 0, &it, NULL))
+		flow_perror(conn, "failed to set timer");
 }
 
 /**
@@ -588,7 +657,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
 	}
 
 	if (flag == STALLED || flag == ~STALLED)
-		tcp_epoll_ctl(c, conn);
+		tcp_epoll_ctl(conn);
 
 	if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE		  ||
 	    (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
@@ -640,15 +709,13 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
 		flow_dbg(conn, "%s",
 			 num == -1 	       ? "CLOSED" : tcp_event_str[num]);
 
-	if (event == CLOSED)
-		flow_hash_remove(c, TAP_SIDX(conn));
-	else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
+	if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) {
 		conn_flag(c, conn, ACTIVE_CLOSE);
-	else
-		tcp_epoll_ctl(c, conn);
-
-	if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
-		tcp_timer_ctl(c, conn);
+	} else {
+		if (event == CLOSED)
+			flow_hash_remove(c, TAP_SIDX(conn));
+		tcp_epoll_ctl(conn);
+	}
 }
 
 /**
@@ -663,7 +730,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
 	int i;
 
 	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
-		if (inany_equals(&tapside->faddr, low_rtt_dst + i))
+		if (inany_equals(&tapside->oaddr, low_rtt_dst + i))
 			return 1;
 
 	return 0;
@@ -675,18 +742,17 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
  * @tinfo:	Pointer to struct tcp_info for socket
  */
 static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
-			      const struct tcp_info *tinfo)
+			      const struct tcp_info_linux *tinfo)
 {
-#ifdef HAS_MIN_RTT
 	const struct flowside *tapside = TAPFLOW(conn);
 	int i, hole = -1;
 
-	if (!tinfo->tcpi_min_rtt ||
+	if (!min_rtt_cap ||
 	    (int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
 		return;
 
 	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
-		if (inany_equals(&tapside->faddr, low_rtt_dst + i))
+		if (inany_equals(&tapside->oaddr, low_rtt_dst + i))
 			return;
 		if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
 			hole = i;
@@ -698,18 +764,14 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
 	if (hole == -1)
 		return;
 
-	low_rtt_dst[hole++] = tapside->faddr;
+	low_rtt_dst[hole++] = tapside->oaddr;
 	if (hole == LOW_RTT_TABLE_SIZE)
 		hole = 0;
 	inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
-#else
-	(void)conn;
-	(void)tinfo;
-#endif /* HAS_MIN_RTT */
 }
 
 /**
- * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
+ * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.75 usage)
  * @conn:	Connection pointer
  */
 static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
@@ -724,62 +786,36 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
 		return;
 	}
 
-	v = sndbuf;
-	if (v >= SNDBUF_BIG)
-		v /= 2;
-	else if (v > SNDBUF_SMALL)
-		v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
+	v = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL, SNDBUF_BIG, 75);
 
 	SNDBUF_SET(conn, MIN(INT_MAX, v));
 }
 
 /**
- * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values
+ * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm)
  * @s:		Socket, can be -1 to avoid check in the caller
  */
-static void tcp_sock_set_bufsize(const struct ctx *c, int s)
+static void tcp_sock_set_nodelay(int s)
 {
-	int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */
-
 	if (s == -1)
 		return;
 
-	if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)))
-		trace("TCP: failed to set SO_RCVBUF to %i", v);
-
-	if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
-		trace("TCP: failed to set SO_SNDBUF to %i", v);
-}
-
-/**
- * tcp_update_check_tcp4() - Update TCP checksum from stored one
- * @iph:	IPv4 header
- * @th:		TCP header followed by TCP payload
- */
-static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th)
-{
-	uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
-	struct in_addr saddr = { .s_addr = iph->saddr };
-	struct in_addr daddr = { .s_addr = iph->daddr };
-	uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
-
-	th->check = 0;
-	th->check = csum(th, l4len, sum);
+	if (setsockopt(s, SOL_TCP, TCP_NODELAY, &((int){ 1 }), sizeof(int)))
+		debug("TCP: failed to set TCP_NODELAY on socket %i", s);
 }
 
 /**
- * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
- * @ip6h:	IPv6 header
- * @th:		TCP header followed by TCP payload
+ * tcp_update_csum() - Calculate TCP checksum
+ * @psum:	Unfolded partial checksum of the IPv4 or IPv6 pseudo-header
+ * @th:		TCP header (updated)
+ * @payload:	TCP payload
  */
-static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
+static void tcp_update_csum(uint32_t psum, struct tcphdr *th,
+			    struct iov_tail *payload)
 {
-	uint16_t l4len = ntohs(ip6h->payload_len);
-	uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
-					      &ip6h->saddr, &ip6h->daddr);
-
 	th->check = 0;
-	th->check = csum(th, l4len, sum);
+	psum = csum_unfolded(th, sizeof(*th), psum);
+	th->check = csum_iov_tail(payload, psum);
 }
 
 /**
@@ -865,7 +901,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
 /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
 void tcp_defer_handler(struct ctx *c)
 {
-	tcp_flags_flush(c);
 	tcp_payload_flush(c);
 }
 
@@ -881,7 +916,7 @@ static void tcp_fill_header(struct tcphdr *th,
 {
 	const struct flowside *tapside = TAPFLOW(conn);
 
-	th->source = htons(tapside->fport);
+	th->source = htons(tapside->oport);
 	th->dest = htons(tapside->eport);
 	th->seq = htonl(seq);
 	th->ack_seq = htonl(conn->seq_ack_to_tap);
@@ -895,116 +930,176 @@ static void tcp_fill_header(struct tcphdr *th,
 }
 
 /**
- * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers
- * @conn:	Connection pointer
- * @taph:	tap backend specific header
- * @iph:	Pointer to IPv4 header
- * @th:		Pointer to TCP header
- * @dlen:	TCP payload length
- * @check:	Checksum, if already known
- * @seq:	Sequence number for this segment
- *
- * Return: The IPv4 payload length, host order
+ * tcp_fill_headers() - Fill 802.3, IP, TCP headers
+ * @c:			Execution context
+ * @conn:		Connection pointer
+ * @eh:		Pointer to Ethernet header
+ * @ip4h:		Pointer to IPv4 header, or NULL
+ * @ip6h:		Pointer to IPv6 header, or NULL
+ * @th:			Pointer to TCP header
+ * @payload:		TCP payload
+ * @ip4_check:		IPv4 checksum, if already known
+ * @seq:		Sequence number for this segment
+ * @no_tcp_csum:	Do not set TCP checksum
+ *
+ * Return: frame length (including L2 headers)
  */
-static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
-				struct tap_hdr *taph,
-				struct iphdr *iph, struct tcphdr *th,
-				size_t dlen, const uint16_t *check,
-				uint32_t seq)
+size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+			struct ethhdr *eh,
+			struct iphdr *ip4h, struct ipv6hdr *ip6h,
+			struct tcphdr *th, struct iov_tail *payload,
+			const uint16_t *ip4_check, uint32_t seq,
+			bool no_tcp_csum)
 {
 	const struct flowside *tapside = TAPFLOW(conn);
-	const struct in_addr *src4 = inany_v4(&tapside->faddr);
-	const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
-	size_t l4len = dlen + sizeof(*th);
-	size_t l3len = l4len + sizeof(*iph);
+	size_t l4len = iov_tail_size(payload) + sizeof(*th);
+	uint8_t *omac = conn->f.tap_omac;
+	size_t l3len = l4len;
+	uint32_t psum = 0;
 
-	ASSERT(src4 && dst4);
+	if (ip4h) {
+		const struct in_addr *src4 = inany_v4(&tapside->oaddr);
+		const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
 
-	iph->tot_len = htons(l3len);
-	iph->saddr = src4->s_addr;
-	iph->daddr = dst4->s_addr;
+		assert(src4 && dst4);
 
-	iph->check = check ? *check :
-			     csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);
+		l3len += + sizeof(*ip4h);
 
-	tcp_fill_header(th, conn, seq);
+		ip4h->tot_len = htons(l3len);
+		ip4h->saddr = src4->s_addr;
+		ip4h->daddr = dst4->s_addr;
 
-	tcp_update_check_tcp4(iph, th);
+		if (ip4_check)
+			ip4h->check = *ip4_check;
+		else
+			ip4h->check = csum_ip4_header(l3len, IPPROTO_TCP,
+						      *src4, *dst4);
 
-	tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
+		if (!no_tcp_csum) {
+			psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
+						      *src4, *dst4);
+		}
+		eh->h_proto = htons_constant(ETH_P_IP);
+	}
 
-	return l4len;
-}
+	if (ip6h) {
+		l3len += sizeof(*ip6h);
 
-/**
- * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers
- * @conn:	Connection pointer
- * @taph:	tap backend specific header
- * @ip6h:	Pointer to IPv6 header
- * @th:		Pointer to TCP header
- * @dlen:	TCP payload length
- * @check:	Checksum, if already known
- * @seq:	Sequence number for this segment
- *
- * Return: The IPv6 payload length, host order
- */
-static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
-				struct tap_hdr *taph,
-				struct ipv6hdr *ip6h, struct tcphdr *th,
-				size_t dlen, uint32_t seq)
-{
-	const struct flowside *tapside = TAPFLOW(conn);
-	size_t l4len = dlen + sizeof(*th);
+		ip6h->payload_len = htons(l4len);
+		ip6h->saddr = tapside->oaddr.a6;
+		ip6h->daddr = tapside->eaddr.a6;
 
-	ip6h->payload_len = htons(l4len);
-	ip6h->saddr = tapside->faddr.a6;
-	ip6h->daddr = tapside->eaddr.a6;
+		ip6h->hop_limit = 255;
+		ip6h->version = 6;
+		ip6h->nexthdr = IPPROTO_TCP;
 
-	ip6h->hop_limit = 255;
-	ip6h->version = 6;
-	ip6h->nexthdr = IPPROTO_TCP;
+		ip6_set_flow_lbl(ip6h, conn->sock);
 
-	ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf;
-	ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
-	ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
+		if (!no_tcp_csum) {
+			psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
+						      &ip6h->saddr,
+						      &ip6h->daddr);
+		}
+		eh->h_proto = htons_constant(ETH_P_IPV6);
+	}
 
-	tcp_fill_header(th, conn, seq);
+	/* Find if neighbour table has a recorded MAC address */
+	if (MAC_IS_UNDEF(omac))
+		fwd_neigh_mac_get(c, &tapside->oaddr, omac);
+	eth_update_mac(eh, NULL, omac);
 
-	tcp_update_check_tcp6(ip6h, th);
+	tcp_fill_header(th, conn, seq);
 
-	tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
+	if (no_tcp_csum)
+		th->check = 0;
+	else
+		tcp_update_csum(psum, th, payload);
 
-	return l4len;
+	return MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN);
 }
 
 /**
- * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
+ * tcp_wnd_from_sndbuf() - Calculate window from available send buffer space
+ * @s:		Socket file descriptor
  * @conn:	Connection pointer
- * @iov:	Pointer to an array of iovec of TCP pre-cooked buffers
- * @dlen:	TCP payload length
- * @check:	Checksum, if already known
- * @seq:	Sequence number for this segment
+ * @tinfo:	tcp_info from kernel
  *
- * Return: IP payload length, host order
+ * Return: window value to advertise, not scaled
  */
-size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
-			       struct iovec *iov, size_t dlen,
-			       const uint16_t *check, uint32_t seq)
+static uint32_t tcp_wnd_from_sndbuf(int s, struct tcp_tap_conn *conn,
+				     const struct tcp_info_linux *tinfo)
 {
-	const struct flowside *tapside = TAPFLOW(conn);
-	const struct in_addr *a4 = inany_v4(&tapside->faddr);
+	uint32_t rtt_ms_ceiling = DIV_ROUND_UP(tinfo->tcpi_rtt, 1000);
+	uint32_t mem[SK_MEMINFO_VARS];
+	socklen_t mem_sl = sizeof(mem);
+	int mss = MSS_GET(conn);
+	uint32_t limit, sendq;
+
+	if (ioctl(s, SIOCOUTQ, &sendq)) {
+		debug_perror("SIOCOUTQ on socket %i, assuming 0", s);
+		sendq = 0;
+	}
+
+	if (getsockopt(s, SOL_SOCKET, SO_MEMINFO, &mem, &mem_sl)) {
+		tcp_get_sndbuf(conn);
+
+		if (sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */
+			limit = 0;
+		else
+			limit = SNDBUF_GET(conn) - sendq;
+	} else {
+		uint32_t sndbuf = mem[SK_MEMINFO_SNDBUF];
+		uint32_t wmemq = mem[SK_MEMINFO_WMEM_QUEUED];
+		uint32_t scaled = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL,
+						SNDBUF_BIG, 75);
+
+		SNDBUF_SET(conn, MIN(INT_MAX, scaled));
+
+		if (wmemq > sndbuf) {
+			limit = 0;
+		} else if (!sendq || !mss || wmemq <= sendq) {
+			limit = SNDBUF_GET(conn) - wmemq;
+		} else {
+			uint32_t used_segs = MAX(sendq / mss, 1);
+			uint32_t overhead = (wmemq - sendq) / used_segs;
+			uint32_t remaining = sndbuf - wmemq;
+			uint32_t avail_segs = remaining / (mss + overhead);
 
-	if (a4) {
-		return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
-					 iov[TCP_IOV_IP].iov_base,
-					 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-					 check, seq);
+			limit = avail_segs * mss;
+		}
 	}
 
-	return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
-				 iov[TCP_IOV_IP].iov_base,
-				 iov[TCP_IOV_PAYLOAD].iov_base, dlen,
-				 seq);
+	/* If the sender uses mechanisms to prevent Silly Window
+	 * Syndrome (SWS, described in RFC 813 Section 3) it's critical
+	 * that, should the window ever become less than the MSS, we
+	 * advertise a new value once it increases again to be above it.
+	 *
+	 * The mechanism to avoid SWS in the kernel is, implicitly,
+	 * implemented by Nagle's algorithm (which was proposed after
+	 * RFC 813).
+	 *
+	 * To this end, for simplicity, approximate a window value below
+	 * the MSS to zero, as we already have mechanisms in place to
+	 * force updates after the window becomes zero. This matches the
+	 * suggestion from RFC 813, Section 4.
+	 *
+	 * But don't do this if, either:
+	 *
+	 * - there's nothing in the outbound queue: the size of the
+	 *   sending buffer is limiting us, and it won't increase if we
+	 *   don't send data, so there's no point in waiting, or
+	 *
+	 * - we haven't sent data in a while (somewhat arbitrarily, ten
+	 *   times the RTT), as that might indicate that the receiver
+	 *   will only process data in batches that are large enough,
+	 *   but we won't send enough to fill one because we're stuck
+	 *   with pending data in the outbound queue
+	 */
+	if (limit < (uint32_t)MSS_GET(conn) && sendq &&
+	    tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10)
+		limit = 0;
+
+	return MIN(tinfo->tcpi_snd_wnd, limit);
 }
 
 /**
@@ -1015,44 +1110,88 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
  * @tinfo:	tcp_info from kernel, can be NULL if not pre-fetched
  *
  * Return: 1 if sequence or window were updated, 0 otherwise
+ *
+ * #syscalls ioctl
  */
 int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
-			  int force_seq, struct tcp_info *tinfo)
+			  bool force_seq, struct tcp_info_linux *tinfo)
 {
 	uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
 	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
 	/* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */
 	socklen_t sl = sizeof(*tinfo);
-	struct tcp_info tinfo_new;
+	struct tcp_info_linux tinfo_new;
 	uint32_t new_wnd_to_tap = prev_wnd_to_tap;
+	bool ack_everything = true;
 	int s = conn->sock;
 
-#ifndef HAS_BYTES_ACKED
-	(void)force_seq;
-
-	conn->seq_ack_to_tap = conn->seq_from_tap;
-	if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
-		conn->seq_ack_to_tap = prev_ack_to_tap;
-#else
-	if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
-	    || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
-		conn->seq_ack_to_tap = conn->seq_from_tap;
-	} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
+	/* At this point we could ack all the data we've accepted for forwarding
+	 * (seq_from_tap).  When possible, however, we want to only acknowledge
+	 * what the peer has acknowledged.  This makes it appear to the guest
+	 * more like a direct connection to the peer, and may improve flow
+	 * control behaviour.
+	 *
+	 * For it to be possible and worth it we need:
+	 *  - The TCP_INFO Linux extensions which give us the peer acked bytes
+	 *    and the delivery rate (outbound bandwidth at receiver)
+	 *  - Not to be told not to (force_seq)
+	 *  - Not half-closed in the peer->guest direction
+	 *      With no data coming from the peer, we might not get events which
+	 *      would prompt us to recheck bytes_acked.  We could poll on a
+	 *      timer, but that's more trouble than it's worth.
+	 *  - Not a host local connection
+	 *      Data goes from socket to socket, with nothing meaningfully "in
+	 *      flight".
+	 *  - Not a pseudo-local connection (e.g. to a VM on the same host)
+	 *      If it is, there's not enough in flight to bother.
+	 *  - Sending buffer significantly larger than bandwidth * delay product
+	 *      Meaning we're not bandwidth-bound and this is likely to be
+	 *      interactive traffic where we want to preserve transparent
+	 *      connection behaviour and latency.
+	 *
+	 *      Otherwise, we probably want to maximise throughput, which needs
+	 *      sending buffer auto-tuning, triggered in turn by filling up the
+	 *      outbound socket queue.
+	 */
+	if (bytes_acked_cap && delivery_rate_cap && !force_seq &&
+	    !CONN_IS_CLOSING(conn) &&
+	    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) {
 		if (!tinfo) {
 			tinfo = &tinfo_new;
 			if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
 				return 0;
 		}
 
-		conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
-				       conn->seq_init_from_tap;
+		/* This trips a cppcheck bug in some versions, including
+		 * cppcheck 2.18.3.
+		 * https://trac.cppcheck.net/ticket/14191
+		 */
+		/* cppcheck-suppress [uninitvar,unmatchedSuppression] */
+		if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt *
+						 tinfo->tcpi_delivery_rate /
+						 1000 / 1000 *
+						 SNDBUF_TO_BW_DELAY_INTERACTIVE)
+			ack_everything = false;
+	}
 
-		if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
-			conn->seq_ack_to_tap = prev_ack_to_tap;
+	if (ack_everything) {
+		/* Fall back to acknowledging everything we got */
+		conn->seq_ack_to_tap = conn->seq_from_tap;
+	} else {
+		/* cppcheck bug 14191 again, see above */
+		/* cppcheck-suppress [uninitvar,unmatchedSuppression] */
+		conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
+		                       conn->seq_init_from_tap;
 	}
-#endif /* !HAS_BYTES_ACKED */
 
-	if (!KERNEL_REPORTS_SND_WND(c)) {
+	/* It's occasionally possible for us to go from using the fallback above
+	 * to the tcpi_bytes_acked method.  In that case, we must be careful not
+	 * to let our ACKed sequence go backwards.
+	 */
+	if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
+		conn->seq_ack_to_tap = prev_ack_to_tap;
+
+	if (!snd_wnd_cap) {
 		tcp_get_sndbuf(conn);
 		new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
 		conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
@@ -1063,22 +1202,17 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 	if (!tinfo) {
 		if (prev_wnd_to_tap > WINDOW_DEFAULT) {
 			goto out;
-}
+		}
 		tinfo = &tinfo_new;
 		if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
 			goto out;
-}
+		}
 	}
 
-#ifdef HAS_SND_WND
-	if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
+	if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn))
 		new_wnd_to_tap = tinfo->tcpi_snd_wnd;
-	} else {
-		tcp_get_sndbuf(conn);
-		new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
-				     SNDBUF_GET(conn));
-	}
-#endif
+	else
+		new_wnd_to_tap = tcp_wnd_from_sndbuf(s, conn, tinfo);
 
 	new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
 	if (!(conn->events & ESTABLISHED))
@@ -1097,6 +1231,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
 		conn_flag(c, conn, ACK_TO_TAP_DUE);
 
 out:
+	/* Opportunistically store RTT approximation on valid TCP_INFO data */
+	if (tinfo)
+		RTT_SET(conn, tinfo->tcpi_rtt);
+
 	return new_wnd_to_tap       != prev_wnd_to_tap ||
 	       conn->seq_ack_to_tap != prev_ack_to_tap;
 }
@@ -1105,7 +1243,7 @@ out:
  * tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters
  * @c:		Execution context
  * @conn:	Connection pointer
- * @seq		Current ACK sequence, host order
+ * @seq:	Current ACK sequence, host order
  */
 static void tcp_update_seqack_from_tap(const struct ctx *c,
 				       struct tcp_tap_conn *conn, uint32_t seq)
@@ -1118,63 +1256,74 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
 		if (SEQ_LT(seq, conn->seq_to_tap))
 			conn_flag(c, conn, ACK_FROM_TAP_DUE);
 
-		conn->retrans = 0;
+		conn->retries = 0;
 		conn->seq_ack_from_tap = seq;
 	}
 }
 
 /**
+ * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ *
+ * Return: 0 on success, -1 on failure, with connection reset
+ */
+static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+	conn->seq_to_tap = conn->seq_ack_from_tap;
+	conn->events &= ~TAP_FIN_SENT;
+
+	if (tcp_set_peek_offset(conn, 0)) {
+		tcp_rst(c, conn);
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
  * tcp_prepare_flags() - Prepare header for flags-only segment (no payload)
  * @c:		Execution context
  * @conn:	Connection pointer
  * @flags:	TCP flags: if not set, send segment only if ACK is due
  * @th:		TCP header to update
- * @data:	buffer to store TCP option
+ * @opts:	TCP option buffer (output parameter)
  * @optlen:	size of the TCP option buffer (output parameter)
  *
  * Return: < 0 error code on connection reset,
  *	     0 if there is no flag to send
  *	     1 otherwise
  */
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
-		      int flags, struct tcphdr *th, char *data,
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
+		      int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
 		      size_t *optlen)
 {
-	struct tcp_info tinfo = { 0 };
+	struct tcp_info_linux tinfo = { 0 };
 	socklen_t sl = sizeof(tinfo);
 	int s = conn->sock;
 
 	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
-	    !flags && conn->wnd_to_tap)
+	    !flags && conn->wnd_to_tap) {
+		conn_flag(c, conn, ~ACK_TO_TAP_DUE);
 		return 0;
+	}
 
 	if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
 		conn_event(c, conn, CLOSED);
 		return -ECONNRESET;
 	}
 
-#ifdef HAS_SND_WND
-	if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
-		c->tcp.kernel_snd_wnd = 1;
-#endif
-
 	if (!(conn->flags & LOCAL))
 		tcp_rtt_dst_check(conn, &tinfo);
 
-	if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
+	if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags)
 		return 0;
 
 	*optlen = 0;
 	if (flags & SYN) {
 		int mss;
 
-		/* Options: MSS, NOP and window scale (8 bytes) */
-		*optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
-
-		*data++ = OPT_MSS;
-		*data++ = OPT_MSS_LEN;
-
-		if (c->mtu == -1) {
+		if (!c->mtu) {
 			mss = tinfo.tcpi_snd_mss;
 		} else {
 			mss = c->mtu - sizeof(struct tcphdr);
@@ -1189,29 +1338,26 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
 			else if (mss > PAGE_SIZE)
 				mss = ROUND_DOWN(mss, PAGE_SIZE);
 		}
-		*(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
-
-		data += OPT_MSS_LEN - 2;
 
 		conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
 
-		*data++ = OPT_NOP;
-		*data++ = OPT_WS;
-		*data++ = OPT_WS_LEN;
-		*data++ = conn->ws_to_tap;
-	} else if (!(flags & RST)) {
+		*opts = TCP_SYN_OPTS(mss, conn->ws_to_tap);
+		*optlen = sizeof(*opts);
+	} else {
 		flags |= ACK;
 	}
 
 	th->doff = (sizeof(*th) + *optlen) / 4;
 
 	th->ack = !!(flags & ACK);
+	th->psh = !!(flags & PSH);
 	th->rst = !!(flags & RST);
 	th->syn = !!(flags & SYN);
 	th->fin = !!(flags & FIN);
 
 	if (th->ack) {
-		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
+		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
+		    conn->wnd_to_tap)
 			conn_flag(c, conn, ~ACK_TO_TAP_DUE);
 		else
 			conn_flag(c, conn, ACK_TO_TAP_DUE);
@@ -1233,25 +1379,62 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
  * @conn:      Connection pointer
  * @flags:     TCP flags: if not set, send segment only if ACK is due
  *
- * Return: negative error code on connection reset, 0 otherwise
+ * Return: negative error code on fatal connection failure, 0 otherwise
  */
-int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
+			 int flags)
 {
-	return tcp_buf_send_flag(c, conn, flags);
+	int ret;
+
+	if (c->mode == MODE_VU)
+		ret = tcp_vu_send_flag(c, conn, flags);
+	else
+		ret = tcp_buf_send_flag(c, conn, flags);
+
+	return ret == -EAGAIN ? 0 : ret;
+}
+
+/**
+ * tcp_sock_rst() - Close TCP connection forcing RST on socket side
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ */
+static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+	const struct linger linger0 = {
+		.l_onoff = 1,
+		.l_linger = 0,
+	};
+
+	/* Force RST on socket to inform the peer
+	 *
+	 * We do this by setting SO_LINGER with 0 timeout, which means that
+	 * close() will send an RST (unless the connection is already closed in
+	 * both directions).
+	 */
+	if (setsockopt(conn->sock, SOL_SOCKET,
+		       SO_LINGER, &linger0, sizeof(linger0)) < 0) {
+		flow_dbg_perror(conn,
+				"SO_LINGER failed, may not send RST to peer");
+	}
+
+	conn_event(c, conn, CLOSED);
 }
 
 /**
- * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
+ * tcp_rst_do() - Reset a tap connection: send RST segment on both sides, close
  * @c:		Execution context
  * @conn:	Connection pointer
  */
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	if (conn->events == CLOSED)
 		return;
 
-	if (!tcp_send_flag(c, conn, RST))
-		conn_event(c, conn, CLOSED);
+	/* Send RST on tap */
+	tcp_send_flag(c, conn, RST);
+
+	tcp_sock_rst(c, conn);
 }
 
 /**
@@ -1273,30 +1456,41 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
 
 /**
  * tcp_tap_window_update() - Process an updated window from tap side
+ * @c:		Execution context
  * @conn:	Connection pointer
- * @window:	Window value, host order, unscaled
+ * @wnd:	Window value, host order, unscaled
+ *
+ * Return: false on zero window (not stored to wnd_from_tap), true otherwise
  */
-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
+static bool tcp_tap_window_update(const struct ctx *c,
+				  struct tcp_tap_conn *conn, unsigned wnd)
 {
 	wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
 
 	/* Work-around for bug introduced in peer kernel code, commit
-	 * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
-	 * We don't update if window shrank to zero.
+	 * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't
+	 * update the window if it shrank to zero, so that we'll eventually
+	 * retry to send data, but rewind the sequence as that obviously implies
+	 * that no data beyond the updated window will be acknowledged.
 	 */
-	if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
-		return;
+	if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
+		tcp_rewind_seq(c, conn);
+		return false;
+	}
 
 	conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
 
 	/* FIXME: reflect the tap-side receiver's window back to the sock-side
 	 * sender by adjusting SO_RCVBUF? */
+	return true;
 }
 
 /**
  * tcp_init_seq() - Calculate initial sequence number according to RFC 6528
  * @hash:	Hash of connection details
  * @now:	Current timestamp
+ *
+ * Return: the calculated 32-bit initial sequence number.
  */
 static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now)
 {
@@ -1326,16 +1520,15 @@ int tcp_conn_pool_sock(int pool[])
 
 /**
  * tcp_conn_new_sock() - Open and prepare new socket for connection
- * @c:		Execution context
  * @af:		Address family
  *
  * Return: socket number on success, negative code if socket creation failed
  */
-static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+static int tcp_conn_new_sock(sa_family_t af)
 {
 	int s;
 
-	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP);
 
 	if (s > FD_REF_MAX) {
 		close(s);
@@ -1345,19 +1538,18 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 	if (s < 0)
 		return -errno;
 
-	tcp_sock_set_bufsize(c, s);
+	tcp_sock_set_nodelay(s);
 
 	return s;
 }
 
 /**
  * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
- * @c:		Execution context
  * @af:		Address family (AF_INET or AF_INET6)
  *
- * Return: Socket fd on success, -errno on failure
+ * Return: socket fd on success, -errno on failure
  */
-int tcp_conn_sock(const struct ctx *c, sa_family_t af)
+int tcp_conn_sock(sa_family_t af)
 {
 	int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4;
 	int s;
@@ -1368,11 +1560,11 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
 	/* If the pool is empty we just open a new one without refilling the
 	 * pool to keep latency down.
 	 */
-	if ((s = tcp_conn_new_sock(c, af)) >= 0)
+	if ((s = tcp_conn_new_sock(af)) >= 0)
 		return s;
 
 	err("TCP: Unable to open socket for new connection: %s",
-	    strerror(-s));
+	    strerror_(-s));
 	return -1;
 }
 
@@ -1414,18 +1606,17 @@ static void tcp_bind_outbound(const struct ctx *c,
 {
 	const struct flowside *tgt = &conn->f.side[TGTSIDE];
 	union sockaddr_inany bind_sa;
-	socklen_t sl;
 
 
-	pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->faddr, tgt->fport);
-	if (!inany_is_unspecified(&tgt->faddr) || tgt->fport) {
-		if (bind(s, &bind_sa.sa, sl)) {
+	pif_sockaddr(c, &bind_sa, PIF_HOST, &tgt->oaddr, tgt->oport);
+	if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) {
+		if (bind(s, &bind_sa.sa, socklen_inany(&bind_sa))) {
 			char sstr[INANY_ADDRSTRLEN];
 
-			flow_dbg(conn,
-				 "Can't bind TCP outbound socket to %s:%hu: %s",
-				 inany_ntop(&tgt->faddr, sstr, sizeof(sstr)),
-				 tgt->fport, strerror(errno));
+			flow_dbg_perror(conn,
+					"Can't bind TCP outbound socket to %s:%hu",
+					inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
+					tgt->oport);
 		}
 	}
 
@@ -1434,9 +1625,9 @@ static void tcp_bind_outbound(const struct ctx *c,
 			if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
 				       c->ip4.ifname_out,
 				       strlen(c->ip4.ifname_out))) {
-				flow_dbg(conn, "Can't bind IPv4 TCP socket to"
-					 " interface %s: %s", c->ip4.ifname_out,
-					 strerror(errno));
+				flow_dbg_perror(conn,
+						"Can't bind IPv4 TCP socket to interface %s",
+						c->ip4.ifname_out);
 			}
 		}
 	} else if (bind_sa.sa_family == AF_INET6) {
@@ -1444,9 +1635,9 @@ static void tcp_bind_outbound(const struct ctx *c,
 			if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
 				       c->ip6.ifname_out,
 				       strlen(c->ip6.ifname_out))) {
-				flow_dbg(conn, "Can't bind IPv6 TCP socket to"
-					 " interface %s: %s", c->ip6.ifname_out,
-					 strerror(errno));
+				flow_dbg_perror(conn,
+						"Can't bind IPv6 TCP socket to interface %s",
+						c->ip6.ifname_out);
 			}
 		}
 	}
@@ -1462,21 +1653,23 @@ static void tcp_bind_outbound(const struct ctx *c,
  * @opts:	Pointer to start of options
  * @optlen:	Bytes in options: caller MUST ensure available length
  * @now:	Current timestamp
+ *
+ * #syscalls:vu getsockname
  */
-static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
+static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
 			      const void *saddr, const void *daddr,
 			      const struct tcphdr *th, const char *opts,
 			      size_t optlen, const struct timespec *now)
 {
 	in_port_t srcport = ntohs(th->source);
 	in_port_t dstport = ntohs(th->dest);
-	const struct flowside *ini, *tgt;
+	const struct flowside *ini;
 	struct tcp_tap_conn *conn;
 	union sockaddr_inany sa;
+	struct flowside *tgt;
 	union flow *flow;
 	int s = -1, mss;
 	uint64_t hash;
-	socklen_t sl;
 
 	if (!(flow = flow_alloc()))
 		return;
@@ -1484,7 +1677,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 	ini = flow_initiate_af(flow, PIF_TAP,
 			       af, saddr, srcport, daddr, dstport);
 
-	if (!(tgt = flow_target(c, flow, IPPROTO_TCP)))
+	if (!(tgt = flow_target(c, flow, FWD_NO_HINT, IPPROTO_TCP)))
 		goto cancel;
 
 	if (flow->f.pif[TGTSIDE] != PIF_HOST) {
@@ -1497,19 +1690,19 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
 
 	if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 ||
-	    !inany_is_unicast(&ini->faddr) || ini->fport == 0) {
+	    !inany_is_unicast(&ini->oaddr) || ini->oport == 0) {
 		char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN];
 
-		debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu",
-		      inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport,
-		      inany_ntop(&ini->faddr, dstr, sizeof(dstr)), ini->fport);
+		warn_ratelimit(now, "Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu",
+			       inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport,
+			       inany_ntop(&ini->oaddr, dstr, sizeof(dstr)), ini->oport);
 		goto cancel;
 	}
 
-	if ((s = tcp_conn_sock(c, af)) < 0)
+	if ((s = tcp_conn_sock(af)) < 0)
 		goto cancel;
 
-	pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport);
+	pif_sockaddr(c, &sa, PIF_HOST, &tgt->eaddr, tgt->eport);
 
 	/* Use bind() to check if the target address is local (EADDRINUSE or
 	 * similar) and already bound, and set the LOCAL flag in that case.
@@ -1521,18 +1714,23 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 	 *
 	 * So, if bind() succeeds, close the socket, get a new one, and proceed.
 	 */
-	if (bind(s, &sa.sa, sl)) {
+	if (bind(s, &sa.sa, socklen_inany(&sa))) {
 		if (errno != EADDRNOTAVAIL && errno != EACCES)
 			conn_flag(c, conn, LOCAL);
 	} else {
 		/* Not a local, bound destination, inconclusive test */
 		close(s);
-		if ((s = tcp_conn_sock(c, af)) < 0)
+		if ((s = tcp_conn_sock(af)) < 0)
 			goto cancel;
 	}
 
 	conn->sock = s;
 	conn->timer = -1;
+	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, TGTSIDE) < 0) {
+		flow_perror(flow, "Can't register with epoll");
+		goto cancel;
+	}
 	conn_event(c, conn, TAP_SYN_RCVD);
 
 	conn->wnd_to_tap = WINDOW_DEFAULT;
@@ -1560,7 +1758,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 
 	tcp_bind_outbound(c, conn, s);
 
-	if (connect(s, &sa.sa, sl)) {
+	if (connect(s, &sa.sa, socklen_inany(&sa))) {
 		if (errno != EINPROGRESS) {
 			tcp_rst(c, conn);
 			goto cancel;
@@ -1576,7 +1774,16 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 		conn_event(c, conn, TAP_SYN_ACK_SENT);
 	}
 
-	tcp_epoll_ctl(c, conn);
+	tcp_epoll_ctl(conn);
+
+	if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
+		socklen_t sl = sizeof(sa);
+
+		if (getsockname(s, &sa.sa, &sl) ||
+		    inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0)
+			err_perror("Can't get local address for socket %i", s);
+	}
+
 	FLOW_ACTIVATE(conn);
 	return;
 
@@ -1628,12 +1835,32 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
  *
  * #syscalls recvmsg
  */
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 {
+	if (c->mode == MODE_VU)
+		return tcp_vu_data_from_sock(c, conn);
+
 	return tcp_buf_data_from_sock(c, conn);
 }
 
 /**
+ * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet
+ * @th:		Pointer to TCP header
+ * @l4len:	TCP packet length, including TCP header
+ *
+ * Return: data length of TCP packet, -1 on invalid value of Data Offset field
+ */
+static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len)
+{
+	size_t off = th->doff * 4UL;
+
+	if (off < sizeof(*th) || off > l4len)
+		return -1;
+
+	return l4len - off;
+}
+
+/**
  * tcp_data_from_tap() - tap/guest data for established connection
  * @c:		Execution context
  * @conn:	Connection pointer
@@ -1644,8 +1871,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
  *
  * Return: count of consumed packets
  */
-static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
-			      const struct pool *p, int idx)
+static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+			     const struct pool *p, int idx)
 {
 	int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
 	uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
@@ -1658,34 +1885,61 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 	if (conn->events == CLOSED)
 		return p->count - idx;
 
-	ASSERT(conn->events & ESTABLISHED);
+	assert(conn->events & ESTABLISHED);
 
 	for (i = idx, iov_i = 0; i < (int)p->count; i++) {
 		uint32_t seq, seq_offset, ack_seq;
+		struct tcphdr th_storage;
 		const struct tcphdr *th;
-		char *data;
-		size_t off;
+		struct iov_tail data;
+		size_t off, size;
+		int count;
+
+		if (!packet_get(p, i, &data))
+			return -1;
 
-		th = packet_get(p, i, 0, sizeof(*th), &len);
+		th = IOV_PEEK_HEADER(&data, th_storage);
 		if (!th)
 			return -1;
-		len += sizeof(*th);
+		len = iov_tail_size(&data);
 
 		off = th->doff * 4UL;
+
 		if (off < sizeof(*th) || off > len)
 			return -1;
 
 		if (th->rst) {
-			conn_event(c, conn, CLOSED);
+			tcp_sock_rst(c, conn);
 			return 1;
 		}
 
 		len -= off;
-		data = packet_get(p, i, off, len, NULL);
-		if (!data)
-			continue;
+		iov_drop_header(&data, off);
 
 		seq = ntohl(th->seq);
+		if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
+			flow_trace(conn,
+				   "keep-alive sequence: %u, previous: %u",
+				   seq, conn->seq_from_tap);
+
+			if (tcp_send_flag(c, conn, ACK))
+				return -1;
+
+			tcp_timer_ctl(c, conn);
+
+			if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
+				       &((int){ 1 }), sizeof(int)))
+				flow_trace(conn, "failed to set SO_KEEPALIVE");
+
+			if (p->count == 1) {
+				tcp_tap_window_update(c, conn,
+						      ntohs(th->window));
+				return 1;
+			}
+
+			continue;
+		}
+
 		ack_seq = ntohl(th->ack_seq);
 
 		if (th->ack) {
@@ -1698,12 +1952,21 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 				       ack_seq == max_ack_seq &&
 				       ntohs(th->window) == max_ack_seq_wnd;
 
+				/* See tcp_tap_window_update() for details. On
+				 * top of that, we also need to check here if a
+				 * zero-window update is contained in a batch of
+				 * packets that includes a non-zero window as
+				 * well.
+				 */
+				if (!ntohs(th->window))
+					tcp_rewind_seq(c, conn);
+
 				max_ack_seq_wnd = ntohs(th->window);
 				max_ack_seq = ack_seq;
 			}
 		}
 
-		if (th->fin)
+		if (th->fin && seq == seq_from_tap)
 			fin = 1;
 
 		if (!len)
@@ -1741,10 +2004,14 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			continue;
 		}
 
-		tcp_iov[iov_i].iov_base = data + seq_offset;
-		tcp_iov[iov_i].iov_len = len - seq_offset;
-		seq_from_tap += tcp_iov[iov_i].iov_len;
-		iov_i++;
+		iov_drop_header(&data, seq_offset);
+		size = len - seq_offset;
+		count = iov_tail_clone(&tcp_iov[iov_i], UIO_MAXIOV - iov_i,
+				       &data);
+		if (count < 0)
+			break;
+		seq_from_tap += size;
+		iov_i += count;
 
 		if (keep == i)
 			keep = -1;
@@ -1757,17 +2024,16 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 	if (ack && !tcp_sock_consume(conn, max_ack_seq))
 		tcp_update_seqack_from_tap(c, conn, max_ack_seq);
 
-	tcp_tap_window_update(conn, max_ack_seq_wnd);
+	tcp_tap_window_update(c, conn, max_ack_seq_wnd);
 
 	if (retr) {
 		flow_trace(conn,
 			   "fast re-transmit, ACK: %u, previous sequence: %u",
-			   max_ack_seq, conn->seq_to_tap);
-		conn->seq_to_tap = max_ack_seq;
-		if (tcp_set_peek_offset(conn->sock, 0)) {
-			tcp_rst(c, conn);
+			   conn->seq_ack_from_tap, conn->seq_to_tap);
+
+		if (tcp_rewind_seq(c, conn))
 			return -1;
-		}
+
 		tcp_data_from_sock(c, conn);
 	}
 
@@ -1784,37 +2050,52 @@ eintr:
 			 *   Then swiftly looked away and left.
 			 */
 			conn->seq_from_tap = seq_from_tap;
-			tcp_send_flag(c, conn, ACK);
+			if (tcp_send_flag(c, conn, ACK))
+				return -1;
 		}
 
 		if (errno == EINTR)
 			goto eintr;
 
 		if (errno == EAGAIN || errno == EWOULDBLOCK) {
-			tcp_send_flag(c, conn, ACK_IF_NEEDED);
-			return p->count - idx;
+			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
+				return -1;
 
+			uint32_t events = tcp_conn_epoll_events(conn->events,
+								conn->flags);
+			events |= EPOLLOUT;
+			if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events,
+			    conn->sock, !TAPSIDE(conn)) < 0)
+				debug("Failed to add EPOLLOUT");
+			return p->count - idx;
 		}
 		return -1;
 	}
 
 	if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
 		partial_send = 1;
-		conn->seq_from_tap += n;
-		tcp_send_flag(c, conn, ACK_IF_NEEDED);
-	} else {
-		conn->seq_from_tap += n;
-	}
+		uint32_t events = tcp_conn_epoll_events(conn->events,
+							conn->flags);
+		events |= EPOLLOUT;
+		if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, conn->sock,
+		    !TAPSIDE(conn)) < 0)
+			debug("Failed to add EPOLLOUT");
+	 } else {
+		tcp_epoll_ctl(conn);
+	 }
+
+	conn->seq_from_tap += n;
 
 out:
-	if (keep != -1) {
+	if (keep != -1 || partial_send) {
 		/* We use an 8-bit approximation here: the associated risk is
 		 * that we skip a duplicate ACK on 8-bit sequence number
 		 * collision. Fast retransmit is a SHOULD in RFC 5681, 3.2.
 		 */
 		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
 			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
-			tcp_send_flag(c, conn, ACK | DUP_ACK);
+			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
+				return -1;
 		}
 		return p->count - idx;
 	}
@@ -1828,7 +2109,8 @@ out:
 
 		conn_event(c, conn, TAP_FIN_RCVD);
 	} else {
-		tcp_send_flag(c, conn, ACK_IF_NEEDED);
+		if (tcp_send_flag(c, conn, ACK_IF_NEEDED))
+			return -1;
 	}
 
 	return p->count - idx;
@@ -1842,11 +2124,12 @@ out:
  * @opts:	Pointer to start of options
  * @optlen:	Bytes in options: caller MUST ensure available length
  */
-static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_conn_from_sock_finish(const struct ctx *c,
+				      struct tcp_tap_conn *conn,
 				      const struct tcphdr *th,
 				      const char *opts, size_t optlen)
 {
-	tcp_tap_window_update(conn, ntohs(th->window));
+	tcp_tap_window_update(c, conn, ntohs(th->window));
 	tcp_get_tap_ws(conn, opts, optlen);
 
 	/* First value is not scaled */
@@ -1860,7 +2143,12 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
 	conn_event(c, conn, ESTABLISHED);
-	if (tcp_set_peek_offset(conn->sock, 0)) {
+	if (tcp_set_peek_offset(conn, 0)) {
+		tcp_rst(c, conn);
+		return;
+	}
+
+	if (tcp_send_flag(c, conn, ACK)) {
 		tcp_rst(c, conn);
 		return;
 	}
@@ -1869,7 +2157,77 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	 * dequeue waiting for SYN,ACK from tap -- check now.
 	 */
 	tcp_data_from_sock(c, conn);
-	tcp_send_flag(c, conn, ACK);
+}
+
+/**
+ * tcp_rst_no_conn() - Send RST in response to a packet with no connection
+ * @c:		Execution context
+ * @af:		Address family, AF_INET or AF_INET6
+ * @saddr:	Source address of the packet we're responding to
+ * @daddr:	Destination address of the packet we're responding to
+ * @flow_lbl:	IPv6 flow label (ignored for IPv4)
+ * @th:		TCP header of the packet we're responding to
+ * @l4len:	Packet length, including TCP header
+ */
+static void tcp_rst_no_conn(const struct ctx *c, int af,
+			    const void *saddr, const void *daddr,
+			    uint32_t flow_lbl,
+			    const struct tcphdr *th, size_t l4len)
+{
+	struct iov_tail payload = IOV_TAIL(NULL, 0, 0);
+	struct tcphdr *rsth;
+	char buf[USHRT_MAX];
+	uint32_t psum = 0;
+	size_t rst_l2len;
+
+	/* Don't respond to RSTs without a connection */
+	if (th->rst)
+		return;
+
+	if (af == AF_INET) {
+		struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
+						  ETH_P_IP);
+		const struct in_addr *rst_src = daddr;
+		const struct in_addr *rst_dst = saddr;
+
+		rsth = tap_push_ip4h(ip4h, *rst_src, *rst_dst,
+				     sizeof(*rsth), IPPROTO_TCP);
+		psum = proto_ipv4_header_psum(sizeof(*rsth), IPPROTO_TCP,
+					      *rst_src, *rst_dst);
+
+	} else {
+		struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac,
+						    ETH_P_IPV6);
+		const struct in6_addr *rst_src = daddr;
+		const struct in6_addr *rst_dst = saddr;
+
+		rsth = tap_push_ip6h(ip6h, rst_src, rst_dst,
+				     sizeof(*rsth), IPPROTO_TCP, flow_lbl);
+		psum = proto_ipv6_header_psum(sizeof(*rsth), IPPROTO_TCP,
+					      rst_src, rst_dst);
+	}
+
+	memset(rsth, 0, sizeof(*rsth));
+
+	rsth->source = th->dest;
+	rsth->dest = th->source;
+	rsth->rst = 1;
+	rsth->doff = sizeof(*rsth) / 4UL;
+
+	/* Sequence matching logic from RFC 9293 section 3.10.7.1 */
+	if (th->ack) {
+		rsth->seq = th->ack_seq;
+	} else {
+		size_t dlen = l4len - th->doff * 4UL;
+		uint32_t ack = ntohl(th->seq) + dlen;
+
+		rsth->ack_seq = htonl(ack);
+		rsth->ack = 1;
+	}
+
+	tcp_update_csum(psum, rsth, &payload);
+	rst_l2len = ((char *)rsth - buf) + sizeof(*rsth);
+	tap_send_single(c, buf, rst_l2len);
 }
 
 /**
@@ -1879,19 +2237,23 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
  * @af:		Address family, AF_INET or AF_INET6
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @flow_lbl:	IPv6 flow label (ignored for IPv4)
  * @p:		Pool of TCP packets, with TCP headers
  * @idx:	Index of first packet in pool to process
  * @now:	Current timestamp
  *
  * Return: count of consumed packets
  */
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
-		    const void *saddr, const void *daddr,
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
+		    const void *saddr, const void *daddr, uint32_t flow_lbl,
 		    const struct pool *p, int idx, const struct timespec *now)
 {
 	struct tcp_tap_conn *conn;
+	struct tcphdr th_storage;
 	const struct tcphdr *th;
-	size_t optlen, len;
+	char optsc[OPTLEN_MAX];
+	struct iov_tail data;
+	size_t optlen, l4len;
 	const char *opts;
 	union flow *flow;
 	flow_sidx_t sidx;
@@ -1900,15 +2262,19 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 
 	(void)pif;
 
-	th = packet_get(p, idx, 0, sizeof(*th), &len);
+	if (!packet_get(p, idx, &data))
+		return 1;
+
+	l4len = iov_tail_size(&data);
+
+	th = IOV_REMOVE_HEADER(&data, th_storage);
 	if (!th)
 		return 1;
-	len += sizeof(*th);
 
 	optlen = th->doff * 4UL - sizeof(*th);
 	/* Static checkers might fail to see this: */
-	optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL);
-	opts = packet_get(p, idx, sizeof(*th), optlen, NULL);
+	optlen = MIN(optlen, OPTLEN_MAX);
+	opts = (char *)iov_remove_header_(&data, &optsc[0], optlen, 1);
 
 	sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr,
 			      ntohs(th->source), ntohs(th->dest));
@@ -1919,20 +2285,25 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 		if (opts && th->syn && !th->ack)
 			tcp_conn_from_tap(c, af, saddr, daddr, th,
 					  opts, optlen, now);
+		else
+			tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, l4len);
 		return 1;
 	}
 
-	ASSERT(flow->f.type == FLOW_TCP);
-	ASSERT(pif_at_sidx(sidx) == PIF_TAP);
+	assert(flow->f.type == FLOW_TCP);
+	assert(pif_at_sidx(sidx) == PIF_TAP);
 	conn = &flow->tcp;
 
-	flow_trace(conn, "packet length %zu from tap", len);
+	flow_trace(conn, "packet length %zu from tap", l4len);
 
 	if (th->rst) {
-		conn_event(c, conn, CLOSED);
+		tcp_sock_rst(c, conn);
 		return 1;
 	}
 
+	conn->inactive = false;
+	conn->tap_inactive = false;
+
 	if (th->ack && !(conn->events & ESTABLISHED))
 		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
 
@@ -1948,18 +2319,27 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 
 	/* Establishing connection from tap */
 	if (conn->events & TAP_SYN_RCVD) {
+		if (th->syn && !th->ack && !th->fin)
+			return 1;	/* SYN retry: ignore and keep waiting */
+
 		if (!(conn->events & TAP_SYN_ACK_SENT))
 			goto reset;
 
 		conn_event(c, conn, ESTABLISHED);
-		if (tcp_set_peek_offset(conn->sock, 0))
+		if (tcp_set_peek_offset(conn, 0))
 			goto reset;
 
 		if (th->fin) {
 			conn->seq_from_tap++;
 
-			shutdown(conn->sock, SHUT_WR);
-			tcp_send_flag(c, conn, ACK);
+			if (shutdown(conn->sock, SHUT_WR) < 0) {
+				flow_dbg_perror(conn, "shutdown() failed");
+				goto reset;
+			}
+
+			if (tcp_send_flag(c, conn, ACK))
+				goto reset;
+
 			conn_event(c, conn, SOCK_FIN_SENT);
 
 			return 1;
@@ -1968,9 +2348,8 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 		if (!th->ack)
 			goto reset;
 
-		tcp_tap_window_update(conn, ntohs(th->window));
-
-		tcp_data_from_sock(c, conn);
+		if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+			tcp_data_from_sock(c, conn);
 
 		if (p->count - idx == 1)
 			return 1;
@@ -1978,11 +2357,44 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 
 	/* Established connections not accepting data from tap */
 	if (conn->events & TAP_FIN_RCVD) {
-		tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+		size_t dlen;
+		bool retr;
 
-		if (conn->events & SOCK_FIN_RCVD &&
-		    conn->seq_ack_from_tap == conn->seq_to_tap)
-			conn_event(c, conn, CLOSED);
+		if ((dlen = tcp_packet_data_len(th, l4len))) {
+			flow_dbg(conn, "data segment in CLOSE-WAIT (%zu B)",
+				 dlen);
+		}
+
+		retr = th->ack && !th->fin &&
+		       ntohl(th->ack_seq) == conn->seq_ack_from_tap &&
+		       ntohs(th->window) == conn->wnd_from_tap;
+
+		/* On socket flush failure, pretend there was no ACK, try again
+		 * later
+		 */
+		if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq)))
+			tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+
+		if (retr) {
+			flow_trace(conn,
+				   "fast re-transmit, ACK: %u, previous sequence: %u",
+				   ntohl(th->ack_seq), conn->seq_to_tap);
+
+			if (tcp_rewind_seq(c, conn))
+				return -1;
+		}
+
+		if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr)
+			tcp_data_from_sock(c, conn);
+
+		if (conn->seq_ack_from_tap == conn->seq_to_tap) {
+			if (th->ack && conn->events & TAP_FIN_SENT)
+				conn_event(c, conn, TAP_FIN_ACKED);
+
+			if (conn->events & SOCK_FIN_RCVD &&
+			    conn->events & TAP_FIN_ACKED)
+				conn_event(c, conn, CLOSED);
+		}
 
 		return 1;
 	}
@@ -1998,10 +2410,33 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
 		ack_due = 1;
 
 	if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
-		shutdown(conn->sock, SHUT_WR);
+		socklen_t sl;
+		struct tcp_info tinfo;
+
+		if (shutdown(conn->sock, SHUT_WR) < 0) {
+			flow_dbg_perror(conn, "shutdown() failed");
+			goto reset;
+		}
+
 		conn_event(c, conn, SOCK_FIN_SENT);
-		tcp_send_flag(c, conn, ACK);
+		if (tcp_send_flag(c, conn, ACK))
+			goto reset;
+
 		ack_due = 0;
+
+		/* If we received a FIN, but the socket is in TCP_ESTABLISHED
+		 * state, it must be a migrated socket. The kernel saw the FIN
+		 * on the source socket, but not on the target socket.
+		 *
+		 * Approximate the effect of that FIN: as we're sending a FIN
+		 * out ourselves, the socket is now in a state equivalent to
+		 * LAST_ACK. Now that we sent the FIN out, close it with a RST.
+		 */
+		sl = sizeof(tinfo);
+		getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl);
+		if (tinfo.tcpi_state == TCP_ESTABLISHED &&
+		    conn->events & SOCK_FIN_RCVD)
+			goto reset;
 	}
 
 	if (ack_due)
@@ -2023,7 +2458,7 @@ reset:
  * @c:		Execution context
  * @conn:	Connection pointer
  */
-static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
 {
 	socklen_t sl;
 	int so;
@@ -2034,8 +2469,10 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
 		return;
 	}
 
-	if (tcp_send_flag(c, conn, SYN | ACK))
+	if (tcp_send_flag(c, conn, SYN | ACK)) {
+		tcp_rst(c, conn);
 		return;
+	}
 
 	conn_event(c, conn, TAP_SYN_ACK_SENT);
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);
@@ -2049,8 +2486,8 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
  * @sa:		Peer socket address (from accept())
  * @now:	Current timestamp
  */
-static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
-				   const struct timespec *now)
+static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
+				   int s, const struct timespec *now)
 {
 	struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
 	uint64_t hash;
@@ -2058,6 +2495,15 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
 	conn->sock = s;
 	conn->timer = -1;
 	conn->ws_to_tap = conn->ws_from_tap = 0;
+
+	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, INISIDE) < 0) {
+		flow_perror(flow, "Can't register with epoll");
+		conn_flag(c, conn, CLOSING);
+		FLOW_ACTIVATE(conn);
+		return;
+	}
+
 	conn_event(c, conn, SOCK_ACCEPTED);
 
 	hash = flow_hash_insert(c, TAP_SIDX(conn));
@@ -2067,7 +2513,12 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
 
 	conn->wnd_from_tap = WINDOW_DEFAULT;
 
-	tcp_send_flag(c, conn, SYN);
+	if (tcp_send_flag(c, conn, SYN)) {
+		conn_flag(c, conn, CLOSING);
+		FLOW_ACTIVATE(conn);
+		return;
+	}
+
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);
 
 	tcp_get_sndbuf(conn);
@@ -2081,16 +2532,16 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
  * @ref:	epoll reference of listening socket
  * @now:	Current timestamp
  */
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
 			const struct timespec *now)
 {
-	const struct flowside *ini;
 	union sockaddr_inany sa;
 	socklen_t sl = sizeof(sa);
+	struct flowside *ini;
 	union flow *flow;
 	int s;
 
-	ASSERT(!c->no_tcp);
+	assert(!c->no_tcp);
 
 	if (!(flow = flow_alloc()))
 		return;
@@ -2099,10 +2550,18 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
 	if (s < 0)
 		goto cancel;
 
-	/* FIXME: When listening port has a specific bound address, record that
-	 * as the forwarding address */
-	ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
-			       ref.tcp_listen.port);
+	tcp_sock_set_nodelay(s);
+
+	/* FIXME: If useful: when the listening port has a specific bound
+	 * address, record that as our address, as implemented for vhost-user
+	 * mode only, below.
+	 */
+	ini = flow_initiate_sa(flow, ref.listen.pif, &sa,
+			       NULL, ref.listen.port);
+
+	if (getsockname(s, &sa.sa, &sl) ||
+	    inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0)
+		err_perror("Can't get local address for socket %i", s);
 
 	if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
 		char sastr[SOCKADDR_STRLEN];
@@ -2112,7 +2571,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
 		goto cancel;
 	}
 
-	if (!flow_target(c, flow, IPPROTO_TCP))
+	if (!flow_target(c, flow, ref.listen.rule, IPPROTO_TCP))
 		goto cancel;
 
 	switch (flow->f.pif[TGTSIDE]) {
@@ -2143,62 +2602,68 @@ cancel:
  * @c:		Execution context
  * @ref:	epoll reference of timer (not connection)
  *
- * #syscalls timerfd_gettime
+ * #syscalls timerfd_gettime|timerfd_gettime64
+ * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64
+ * #syscalls arm:timerfd_settime64 i686:timerfd_settime64
  */
-void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 {
 	struct itimerspec check_armed = { { 0 }, { 0 } };
 	struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
 
-	ASSERT(!c->no_tcp);
-	ASSERT(conn->f.type == FLOW_TCP);
+	assert(!c->no_tcp);
+	assert(conn->f.type == FLOW_TCP);
 
 	/* We don't reset timers on ~ACK_FROM_TAP_DUE, ~ACK_TO_TAP_DUE. If the
 	 * timer is currently armed, this event came from a previous setting,
 	 * and we just set the timer to a new point in the future: discard it.
 	 */
-	timerfd_gettime(conn->timer, &check_armed);
+	if (timerfd_gettime(conn->timer, &check_armed))
+		flow_perror(conn, "failed to read timer");
+
 	if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
 		return;
 
 	if (conn->flags & ACK_TO_TAP_DUE) {
-		tcp_send_flag(c, conn, ACK_IF_NEEDED);
+		if (tcp_send_flag(c, conn, ACK_IF_NEEDED)) {
+			tcp_rst(c, conn);
+			return;
+		}
 		tcp_timer_ctl(c, conn);
 	} else if (conn->flags & ACK_FROM_TAP_DUE) {
 		if (!(conn->events & ESTABLISHED)) {
-			flow_dbg(conn, "handshake timeout");
-			tcp_rst(c, conn);
-		} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
-			flow_dbg(conn, "FIN timeout");
-			tcp_rst(c, conn);
-		} else if (conn->retrans == TCP_MAX_RETRANS) {
-			flow_dbg(conn, "retransmissions count exceeded");
-			tcp_rst(c, conn);
-		} else {
-			flow_dbg(conn, "ACK timeout, retry");
-			conn->retrans++;
-			conn->seq_to_tap = conn->seq_ack_from_tap;
-			if (tcp_set_peek_offset(conn->sock, 0)) {
+			unsigned int max;
+
+			max = c->tcp.syn_retries + c->tcp.syn_linear_timeouts;
+			max = MIN(TCP_MAX_RETRIES, max);
+			if (conn->retries >= max) {
+				flow_dbg(conn, "handshake timeout");
 				tcp_rst(c, conn);
 			} else {
-				tcp_data_from_sock(c, conn);
+				flow_trace(conn, "SYN timeout, retry");
+				if (tcp_send_flag(c, conn, SYN)) {
+					tcp_rst(c, conn);
+					return;
+				}
+				conn->retries++;
+				conn_flag(c, conn, SYN_RETRIED);
 				tcp_timer_ctl(c, conn);
 			}
-		}
-	} else {
-		struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
-		struct itimerspec old = { { 0 }, { 0 } };
-
-		/* Activity timeout: if it was already set, reset the
-		 * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE
-		 * or ACK_FROM_TAP_DUE, so just set the long timeout in that
-		 * case. This avoids having to preemptively reset the timer on
-		 * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
-		 */
-		timerfd_settime(conn->timer, 0, &new, &old);
-		if (old.it_value.tv_sec == ACT_TIMEOUT) {
-			flow_dbg(conn, "activity timeout");
+		} else if (conn->retries == TCP_MAX_RETRIES) {
+			flow_dbg(conn, "retransmissions count exceeded");
 			tcp_rst(c, conn);
+		} else {
+			flow_dbg(conn, "ACK timeout, retry");
+
+			if (!conn->wnd_from_tap)
+				conn->wnd_from_tap = 1; /* Zero-window probe */
+
+			conn->retries++;
+			if (tcp_rewind_seq(c, conn))
+				return;
+
+			tcp_data_from_sock(c, conn);
+			tcp_timer_ctl(c, conn);
 		}
 	}
 }
@@ -2209,12 +2674,13 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
  * @ref:	epoll reference
  * @events:	epoll events bitmap
  */
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+		      uint32_t events)
 {
 	struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);
 
-	ASSERT(!c->no_tcp);
-	ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP);
+	assert(!c->no_tcp);
+	assert(pif_at_sidx(ref.flowside) != PIF_TAP);
 
 	if (conn->events == CLOSED)
 		return;
@@ -2224,7 +2690,9 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 		return;
 	}
 
-	if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
+	conn->inactive = false;
+
+	if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
 		conn_event(c, conn, CLOSED);
 		return;
 	}
@@ -2239,8 +2707,14 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 		if (events & EPOLLIN)
 			tcp_data_from_sock(c, conn);
 
-		if (events & EPOLLOUT)
-			tcp_update_seqack_wnd(c, conn, 0, NULL);
+		if (events & EPOLLOUT) {
+			tcp_epoll_ctl(conn);
+			if (tcp_update_seqack_wnd(c, conn, false, NULL) &&
+			    tcp_send_flag(c, conn, ACK)) {
+				tcp_rst(c, conn);
+				return;
+			}
+		}
 
 		return;
 	}
@@ -2263,355 +2737,1332 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 }
 
 /**
- * tcp_sock_init_af() - Initialise listening socket for a given af and port
+ * tcp_listen() - Create listening socket
  * @c:		Execution context
- * @af:		Address family to listen on
+ * @pif:	Interface to open the socket for (PIF_HOST or PIF_SPLICE)
+ * @rule:	Index of relevant forwarding rule
+ * @addr:	Pointer to address for binding, NULL for any
+ * @ifname:	Name of interface to bind to, NULL for any
  * @port:	Port, host order
- * @addr:	Pointer to address for binding, NULL if not configured
- * @ifname:	Name of interface to bind to, NULL if not configured
  *
- * Return: fd for the new listening socket, negative error code on failure
+ * Return: socket fd on success, negative error code on failure
  */
-static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
-			    const void *addr, const char *ifname)
+int tcp_listen(const struct ctx *c, uint8_t pif, unsigned rule,
+	       const union inany_addr *addr, const char *ifname, in_port_t port)
 {
-	union tcp_listen_epoll_ref tref = {
-		.port = port,
-		.pif = PIF_HOST,
-	};
 	int s;
 
-	s = sock_l4(c, af, EPOLL_TYPE_TCP_LISTEN, addr, ifname, port, tref.u32);
+	assert(!c->no_tcp);
 
-	if (c->tcp.fwd_in.mode == FWD_AUTO) {
-		if (af == AF_INET  || af == AF_UNSPEC)
-			tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
-		if (af == AF_INET6 || af == AF_UNSPEC)
-			tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
+	if (!c->ifi4) {
+		if (!addr)
+			/* Restrict to v6 only */
+			addr = &inany_any6;
+		else if (inany_v4(addr))
+			return -EAFNOSUPPORT;
+	}
+	if (!c->ifi6) {
+		if (!addr)
+			/* Restrict to v4 only */
+			addr = &inany_any4;
+		else if (!inany_v4(addr))
+			return -EAFNOSUPPORT;
 	}
 
-	if (s < 0)
-		return s;
+	s = pif_listen(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname, port, rule);
 
-	tcp_sock_set_bufsize(c, s);
 	return s;
 }
 
 /**
- * tcp_sock_init() - Create listening sockets for a given host ("inbound") port
+ * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets
+ * @pool:	Pool of sockets to refill
+ * @af:		Address family to use
+ *
+ * Return: 0 on success, negative error code if there was at least one error
+ */
+int tcp_sock_refill_pool(int pool[], sa_family_t af)
+{
+	int i;
+
+	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
+		int fd;
+
+		if (pool[i] >= 0)
+			continue;
+
+		if ((fd = tcp_conn_new_sock(af)) < 0)
+			return fd;
+
+		pool[i] = fd;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_sock_refill_init() - Refill pools of pre-opened sockets in init ns
  * @c:		Execution context
- * @af:		Address family to select a specific IP version, or AF_UNSPEC
- * @addr:	Pointer to address for binding, NULL if not configured
- * @ifname:	Name of interface to bind to, NULL if not configured
- * @port:	Port, host order
+ */
+static void tcp_sock_refill_init(const struct ctx *c)
+{
+	if (c->ifi4) {
+		int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET);
+		if (rc < 0)
+			warn("TCP: Error refilling IPv4 host socket pool: %s",
+			     strerror_(-rc));
+	}
+	if (c->ifi6) {
+		int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6);
+		if (rc < 0)
+			warn("TCP: Error refilling IPv6 host socket pool: %s",
+			     strerror_(-rc));
+	}
+}
+
+/**
+ * tcp_probe_peek_offset_cap() - Check if SO_PEEK_OFF is supported by kernel
+ * @af:		Address family, IPv4 or IPv6
  *
- * Return: 0 on (partial) success, negative error code on (complete) failure
+ * Return: true if supported, false otherwise
  */
-int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
-		  const char *ifname, in_port_t port)
+static bool tcp_probe_peek_offset_cap(sa_family_t af)
 {
-	int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
+	bool ret = false;
+	int s, optv = 0;
+
+	s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
+		warn_perror("Temporary TCP socket creation failed");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
+			ret = true;
+		close(s);
+	}
 
-	ASSERT(!c->no_tcp);
+	return ret;
+}
 
-	if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
-		/* Attempt to get a dual stack socket */
-		if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0)
-			return 0;
+/**
+ * tcp_probe_tcp_info() - Check what data TCP_INFO reports
+ *
+ * Return: number of bytes returned by TCP_INFO getsockopt()
+ */
+static socklen_t tcp_probe_tcp_info(void)
+{
+	struct tcp_info_linux tinfo;
+	socklen_t sl = sizeof(tinfo);
+	int s;
 
-	/* Otherwise create a socket per IP version */
-	if ((af == AF_INET  || af == AF_UNSPEC) && c->ifi4)
-		r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname);
+	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (s < 0) {
+		warn_perror("Temporary TCP socket creation failed");
+		return false;
+	}
 
-	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
-		r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
+	if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+		warn_perror("Failed to get TCP_INFO on temporary socket");
+		close(s);
+		return false;
+	}
 
-	if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
-		return 0;
+	close(s);
 
-	return r4 < 0 ? r4 : r6;
+	return sl;
 }
 
 /**
- * tcp_ns_sock_init4() - Init socket to listen for outbound IPv4 connections
+ * tcp_get_rto_params() - Get host kernel RTO parameters
  * @c:		Execution context
- * @port:	Port, host order
  */
-static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
+static void tcp_get_rto_params(struct ctx *c)
 {
-	union tcp_listen_epoll_ref tref = {
-		.port = port,
-		.pif = PIF_SPLICE,
-	};
-	int s;
+	intmax_t v;
 
-	ASSERT(c->mode == MODE_PASTA);
+	v = read_file_integer(SYN_RETRIES, SYN_RETRIES_DEFAULT);
+	c->tcp.syn_retries = MIN(v, MAX_SYNCNT);
 
-	s = sock_l4(c, AF_INET, EPOLL_TYPE_TCP_LISTEN, &in4addr_loopback,
-		    NULL, port, tref.u32);
-	if (s >= 0)
-		tcp_sock_set_bufsize(c, s);
-	else
-		s = -1;
+	v = read_file_integer(SYN_LINEAR_TIMEOUTS, SYN_LINEAR_TIMEOUTS_DEFAULT);
+	c->tcp.syn_linear_timeouts = MIN(v, MAX_SYNCNT);
+
+	v = read_file_integer(RTO_MAX_MS, (intmax_t)(RTO_MAX_DEFAULT * 1000));
+	c->tcp.rto_max = MIN(DIV_ROUND_UP(v, 1000), INT_MAX);
 
-	if (c->tcp.fwd_out.mode == FWD_AUTO)
-		tcp_sock_ns[port][V4] = s;
+	debug("Using TCP RTO parameters, syn_retries: %"PRIu8
+	      ", syn_linear_timeouts: %"PRIu8
+	      ", rto_max: %d",
+	      c->tcp.syn_retries,
+	      c->tcp.syn_linear_timeouts,
+	      c->tcp.rto_max);
 }
 
 /**
- * tcp_ns_sock_init6() - Init socket to listen for outbound IPv6 connections
+ * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
  * @c:		Execution context
- * @port:	Port, host order
+ *
+ * Return: 0 on success, -1 on failure
  */
-static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
+int tcp_init(struct ctx *c)
 {
-	union tcp_listen_epoll_ref tref = {
-		.port = port,
-		.pif = PIF_SPLICE,
-	};
-	int s;
+	assert(!c->no_tcp);
 
-	ASSERT(c->mode == MODE_PASTA);
+	tcp_get_rto_params(c);
 
-	s = sock_l4(c, AF_INET6, EPOLL_TYPE_TCP_LISTEN, &in6addr_loopback,
-		    NULL, port, tref.u32);
-	if (s >= 0)
-		tcp_sock_set_bufsize(c, s);
-	else
-		s = -1;
+	tcp_sock_iov_init(c);
+
+	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
+	memset(init_sock_pool6,		0xff,	sizeof(init_sock_pool6));
+
+	tcp_sock_refill_init(c);
+
+	if (c->mode == MODE_PASTA)
+		tcp_splice_init(c);
+
+	peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) &&
+			  (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
+	debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+
+	tcp_info_size = tcp_probe_tcp_info();
+
+#define dbg_tcpi(f_)	debug("TCP_INFO tcpi_%s field%s supported",	\
+			      STRINGIFY(f_), tcp_info_cap(f_) ? "" : " not")
+	dbg_tcpi(snd_wnd);
+	dbg_tcpi(bytes_acked);
+	dbg_tcpi(min_rtt);
+#undef dbg_tcpi
+
+	return 0;
+}
 
-	if (c->tcp.fwd_out.mode == FWD_AUTO)
-		tcp_sock_ns[port][V6] = s;
+/**
+ * tcp_keepalive() - Send keepalives for connections which need it
+ * @:	Execution context
+ */
+static void tcp_keepalive(struct ctx *c, const struct timespec *now)
+{
+	union flow *flow;
+
+	if (now->tv_sec - c->tcp.keepalive_run < KEEPALIVE_INTERVAL)
+		return;
+
+	c->tcp.keepalive_run = now->tv_sec;
+
+	flow_foreach_of_type(flow, FLOW_TCP) {
+		struct tcp_tap_conn *conn = &flow->tcp;
+
+		if (conn->tap_inactive) {
+			flow_dbg(conn, "No tap activity for least %us, send keepalive",
+				 KEEPALIVE_INTERVAL);
+			if (tcp_send_flag(c, conn, KEEPALIVE))
+				tcp_rst(c, conn);
+		}
+
+		/* Ready to check fot next interval */
+		conn->tap_inactive = true;
+	}
+}
+
+/**
+ * tcp_inactivity() - Scan for and close long-inactive connections
+ * @:	Execution context
+ */
+static void tcp_inactivity(struct ctx *c, const struct timespec *now)
+{
+	union flow *flow;
+
+	if (now->tv_sec - c->tcp.inactivity_run < INACTIVITY_INTERVAL)
+		return;
+
+	debug("TCP inactivity scan");
+	c->tcp.inactivity_run = now->tv_sec;
+
+	flow_foreach_of_type(flow, FLOW_TCP) {
+		struct tcp_tap_conn *conn = &flow->tcp;
+
+		if (conn->inactive) {
+			/* No activity in this interval, reset */
+			flow_dbg(conn, "Inactive for at least %us, resetting",
+				 INACTIVITY_INTERVAL);
+			tcp_rst(c, conn);
+		}
+
+		/* Ready to check fot next interval */
+		conn->inactive = true;
+	}
 }
 
 /**
- * tcp_ns_sock_init() - Init socket to listen for spliced outbound connections
+ * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill
  * @c:		Execution context
- * @port:	Port, host order
+ * @now:	Current timestamp
  */
-void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
+void tcp_timer(struct ctx *c, const struct timespec *now)
 {
-	ASSERT(!c->no_tcp);
+	tcp_sock_refill_init(c);
+	if (c->mode == MODE_PASTA)
+		tcp_splice_refill(c);
 
-	if (c->ifi4)
-		tcp_ns_sock_init4(c, port);
-	if (c->ifi6)
-		tcp_ns_sock_init6(c, port);
+	tcp_keepalive(c, now);
+	tcp_inactivity(c, now);
 }
 
 /**
- * tcp_ns_socks_init() - Bind sockets in namespace for outbound connections
- * @arg:	Execution context
+ * tcp_flow_is_established() - Was the connection established? Includes closing
+ * @conn:	Pointer to the TCP connection structure
  *
- * Return: 0
+ * Return: true if the connection was established, false otherwise
  */
-/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
-static int tcp_ns_socks_init(void *arg)
+bool tcp_flow_is_established(const struct tcp_tap_conn *conn)
 {
-	const struct ctx *c = (const struct ctx *)arg;
-	unsigned port;
+	return conn->events & ESTABLISHED;
+}
 
-	ns_enter(c);
+/**
+ * tcp_flow_repair_on() - Enable repair mode for a single TCP flow
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
 
-	for (port = 0; port < NUM_PORTS; port++) {
-		if (!bitmap_isset(c->tcp.fwd_out.map, port))
-			continue;
+	if (conn->sock < 0)
+		return 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON)))
+		err("Failed to set TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_off() - Clear repair mode for a single TCP flow
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
+
+	if (conn->sock < 0)
+		return 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+		err("Failed to clear TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn,
+			       struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_info tinfo;
+	socklen_t sl;
 
-		tcp_ns_sock_init(c, port);
+	sl = sizeof(tinfo);
+	if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Querying TCP_INFO");
+		return rc;
 	}
 
+	t->snd_ws		= tinfo.tcpi_snd_wscale;
+	t->rcv_ws		= tinfo.tcpi_rcv_wscale;
+	t->tcpi_state		= tinfo.tcpi_state;
+	t->tcpi_options		= tinfo.tcpi_options;
+
 	return 0;
 }
 
 /**
- * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets
- * @c:		Execution context
- * @pool:	Pool of sockets to refill
- * @af:		Address family to use
+ * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
  *
- * Return: 0 on success, negative error code if there was at least one error
+ * Return: 0 on success, negative error code on failure
  */
-int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
+static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
 {
-	int i;
+	socklen_t sl = sizeof(t->mss);
+	int val;
 
-	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
-		int fd;
+	if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Getting MSS");
+		return rc;
+	}
 
-		if (pool[i] >= 0)
-			continue;
+	t->mss = (uint32_t)val;
 
-		if ((fd = tcp_conn_new_sock(c, af)) < 0)
-			return fd;
+	return 0;
+}
 
-		pool[i] = fd;
+
+/**
+ * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data (tcpi_options must be populated)
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn,
+				   struct tcp_tap_transfer_ext *t)
+{
+	int val = 0;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		socklen_t sl = sizeof(val);
+
+		if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) {
+			int rc = -errno;
+			flow_perror(conn, "Getting RFC 7323 timestamp");
+			return rc;
+		}
 	}
 
+	t->timestamp = (uint32_t)val;
 	return 0;
 }
 
 /**
- * tcp_sock_refill_init() - Refill pools of pre-opened sockets in init ns
- * @c:		Execution context
+ * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
  */
-static void tcp_sock_refill_init(const struct ctx *c)
+static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn,
+				   const struct tcp_tap_transfer_ext *t)
 {
-	if (c->ifi4) {
-		int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
-		if (rc < 0)
-			warn("TCP: Error refilling IPv4 host socket pool: %s",
-			     strerror(-rc));
+	int val = (int)t->timestamp;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP,
+			       &val, sizeof(val))) {
+			int rc = -errno;
+			flow_perror(conn, "Setting RFC 7323 timestamp");
+			return rc;
+		}
 	}
-	if (c->ifi6) {
-		int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
-		if (rc < 0)
-			warn("TCP: Error refilling IPv6 host socket pool: %s",
-			     strerror(-rc));
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_repair_window wnd;
+	socklen_t sl = sizeof(wnd);
+
+	if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Getting window repair data");
+		return rc;
 	}
+
+	t->snd_wl1	= wnd.snd_wl1;
+	t->snd_wnd	= wnd.snd_wnd;
+	t->max_window	= wnd.max_window;
+	t->rcv_wnd	= wnd.rcv_wnd;
+	t->rcv_wup	= wnd.rcv_wup;
+
+	/* If we received a FIN, we also need to adjust window parameters.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK) {
+		t->rcv_wup--;
+		t->rcv_wnd++;
+	}
+
+	return 0;
 }
 
 /**
- * tcp_probe_peek_offset_cap() - Check if SO_PEEK_OFF is supported by kernel
- * @af:		Address family, IPv4 or IPv6
+ * tcp_flow_repair_wnd() - Restore window parameters from extended data
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
  *
- * Return: true if supported, false otherwise
+ * Return: 0 on success, negative error code on failure
  */
-bool tcp_probe_peek_offset_cap(sa_family_t af)
+static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
 {
-	bool ret = false;
-	int s, optv = 0;
+	struct tcp_repair_window wnd;
+
+	wnd.snd_wl1	= t->snd_wl1;
+	wnd.snd_wnd	= t->snd_wnd;
+	wnd.max_window	= t->max_window;
+	wnd.rcv_wnd	= t->rcv_wnd;
+	wnd.rcv_wup	= t->rcv_wup;
+
+	if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW,
+		       &wnd, sizeof(wnd))) {
+		int rc = -errno;
+		flow_perror(conn, "Setting window data");
+		return rc;
+	}
 
-	s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
-	if (s < 0) {
-		warn_perror("Temporary TCP socket creation failed");
-	} else {
-		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
-			ret = true;
-		close(s);
+	return 0;
+}
+
+/**
+ * tcp_flow_select_queue() - Select queue (receive or send) for next operation
+ * @conn:	Connection to select queue for
+ * @queue:	TCP_RECV_QUEUE or TCP_SEND_QUEUE
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue)
+{
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE,
+		       &queue, sizeof(queue))) {
+		int rc = -errno;
+		flow_perror(conn, "Selecting TCP_SEND_QUEUE");
+		return rc;
 	}
 
-	return ret;
+	return 0;
 }
 
 /**
- * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
- * @c:		Execution context
+ * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
+ * @conn:	Connection to dump queue for
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
  *
- * Return: 0, doesn't return on failure
+ * #syscalls:vu ioctl
  */
-int tcp_init(struct ctx *c)
+static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
 {
-	ASSERT(!c->no_tcp);
+	int s = conn->sock;
+	ssize_t rc;
 
-	if (c->ifi4)
-		tcp_sock4_iov_init(c);
+	if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Getting send queue size");
+		return rc;
+	}
 
-	if (c->ifi6)
-		tcp_sock6_iov_init(c);
+	if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Getting not sent count");
+		return rc;
+	}
 
-	memset(init_sock_pool4,		0xff,	sizeof(init_sock_pool4));
-	memset(init_sock_pool6,		0xff,	sizeof(init_sock_pool6));
-	memset(tcp_sock_init_ext,	0xff,	sizeof(tcp_sock_init_ext));
-	memset(tcp_sock_ns,		0xff,	sizeof(tcp_sock_ns));
+	/* If we sent a FIN, SIOCOUTQ and SIOCOUTQNSD are one greater than the
+	 * actual pending queue length, because they are based on the sequence
+	 * numbers, not directly on the buffer contents.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->tcpi_state == TCP_FIN_WAIT1 || t->tcpi_state == TCP_FIN_WAIT2 ||
+	    t->tcpi_state == TCP_LAST_ACK  || t->tcpi_state == TCP_CLOSING) {
+		if (t->sndq)
+			t->sndq--;
+		if (t->notsent)
+			t->notsent--;
+	}
 
-	tcp_sock_refill_init(c);
+	if (t->notsent > t->sndq) {
+		flow_err(conn,
+			 "Invalid notsent count socket %i, send: %u, not sent: %u",
+			 s, t->sndq, t->notsent);
+		return -EINVAL;
+	}
 
-	if (c->mode == MODE_PASTA) {
-		tcp_splice_init(c);
+	if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
+		flow_err(conn,
+			 "Send queue too large to migrate socket %i: %u bytes",
+			 s, t->sndq);
+		return -ENOBUFS;
+	}
+
+	rc = recv(s, tcp_migrate_snd_queue,
+		  MIN(t->sndq, TCP_MIGRATE_SND_QUEUE_MAX), MSG_PEEK);
+	if (rc < 0) {
+		if (errno == EAGAIN)  { /* EAGAIN means empty */
+			rc = 0;
+		} else {
+			rc = -errno;
+			flow_perror(conn, "Can't read send queue");
+			return rc;
+		}
+	}
 
-		NS_CALL(tcp_ns_socks_init, c);
+	if ((uint32_t)rc < t->sndq) {
+		flow_err(conn, "Short read migrating send queue");
+		return -ENXIO;
 	}
 
-	peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) &&
-			  (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
-	debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+	t->notsent = MIN(t->notsent, t->sndq);
 
 	return 0;
 }
 
 /**
- * tcp_port_rebind() - Rebind ports to match forward maps
- * @c:		Execution context
- * @outbound:	True to remap outbound forwards, otherwise inbound
+ * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
+ * @conn:	Connection to repair queue for
+ * @len:	Length of data to be restored
+ * @buf:	Buffer with content of pending data queue
  *
- * Must be called in namespace context if @outbound is true.
+ * Return: 0 on success, negative error code on failure
  */
-static void tcp_port_rebind(struct ctx *c, bool outbound)
+static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn,
+				 size_t len, uint8_t *buf)
 {
-	const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map;
-	const uint8_t *rmap = outbound ? c->tcp.fwd_in.map : c->tcp.fwd_out.map;
-	int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext;
-	unsigned port;
+	size_t chunk = len;
+	uint8_t *p = buf;
 
-	for (port = 0; port < NUM_PORTS; port++) {
-		if (!bitmap_isset(fmap, port)) {
-			if (socks[port][V4] >= 0) {
-				close(socks[port][V4]);
-				socks[port][V4] = -1;
-			}
+	if (conn->sock < 0) {
+		flow_err(conn, "Invalid socket descriptor for repair queue");
+		return -EBADF;
+	}
 
-			if (socks[port][V6] >= 0) {
-				close(socks[port][V6]);
-				socks[port][V6] = -1;
+	while (len > 0) {
+		ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0);
+
+		if (rc < 0) {
+			if ((errno == ENOBUFS || errno == ENOMEM) &&
+			    chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) {
+				chunk /= 2;
+				continue;
 			}
 
-			continue;
+			rc = -errno;
+			flow_perror(conn, "Can't write queue");
+			return rc;
 		}
 
-		/* Don't loop back our own ports */
-		if (bitmap_isset(rmap, port))
-			continue;
+		len -= rc;
+		p += rc;
+	}
 
-		if ((c->ifi4 && socks[port][V4] == -1) ||
-		    (c->ifi6 && socks[port][V6] == -1)) {
-			if (outbound)
-				tcp_ns_sock_init(c, port);
-			else
-				tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port);
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
+ * @conn:	Pointer to the TCP connection structure
+ * @v:		Sequence value, set on return
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v)
+{
+	socklen_t sl = sizeof(*v);
+
+	if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Dumping sequence");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_seq() - Restore sequence for pre-selected queue
+ * @conn:	Connection to repair sequences for
+ * @v:		Sequence value to be set
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn,
+			       const uint32_t *v)
+{
+	if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+		int rc = -errno;
+		flow_perror(conn, "Setting sequence");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
+{
+	int s = conn->sock;
+	ssize_t rc;
+
+	if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
+		rc = -errno;
+		err_perror("Get receive queue size, socket %i", s);
+		return rc;
+	}
+
+	/* If we received a FIN, SIOCINQ is one greater than the actual number
+	 * of bytes on the queue, because it's based on the sequence number
+	 * rather than directly on the buffer contents.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->rcvq &&
+	    (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK))
+		t->rcvq--;
+
+	if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		flow_err(conn,
+			 "Receive queue too large to migrate socket: %u bytes",
+			 t->rcvq);
+		return -ENOBUFS;
+	}
+
+	rc = recv(s, tcp_migrate_rcv_queue, t->rcvq, MSG_PEEK);
+	if (rc < 0) {
+		if (errno == EAGAIN)  { /* EAGAIN means empty */
+			rc = 0;
+		} else {
+			rc = -errno;
+			flow_perror(conn, "Can't read receive queue");
+			return rc;
 		}
 	}
+
+	if ((uint32_t)rc < t->rcvq) {
+		flow_err(conn, "Short read migrating receive queue");
+		return -ENXIO;
+	}
+
+	return 0;
 }
 
 /**
- * tcp_port_rebind_outbound() - Rebind ports in namespace
- * @arg:	Execution context
+ * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
  *
- * Called with NS_CALL()
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
+{
+	const struct tcp_repair_opt opts[] = {
+		{ TCPOPT_WINDOW,		t->snd_ws + (t->rcv_ws << 16) },
+		{ TCPOPT_MAXSEG,		t->mss },
+		{ TCPOPT_SACK_PERMITTED,	0 },
+		{ TCPOPT_TIMESTAMP,		0 },
+	};
+	socklen_t sl;
+
+	sl = sizeof(opts[0]) * (2 +
+				!!(t->tcpi_options & TCPI_OPT_SACK) +
+				!!(t->tcpi_options & TCPI_OPT_TIMESTAMPS));
+
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Setting repair options");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_source() - Send data (flow table) for flow
+ * @fd:		Descriptor for state migration
+ * @conn:	Pointer to the TCP connection structure
  *
- * Return: 0
+ * Return: 0 on success, negative error code on failure
  */
-static int tcp_port_rebind_outbound(void *arg)
+int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
 {
-	struct ctx *c = (struct ctx *)arg;
+	struct tcp_tap_transfer t = {
+		.retries		= conn->retries,
+		.ws_from_tap		= conn->ws_from_tap,
+		.ws_to_tap		= conn->ws_to_tap,
+		.events			= conn->events,
+
+		.tap_mss		= htonl(MSS_GET(conn)),
+
+		.sndbuf			= htonl(conn->sndbuf),
 
-	ns_enter(c);
-	tcp_port_rebind(c, true);
+		.flags			= conn->flags,
+		.seq_dup_ack_approx	= conn->seq_dup_ack_approx,
+
+		.wnd_from_tap		= htons(conn->wnd_from_tap),
+		.wnd_to_tap		= htons(conn->wnd_to_tap),
+
+		.seq_to_tap		= htonl(conn->seq_to_tap),
+		.seq_ack_from_tap	= htonl(conn->seq_ack_from_tap),
+		.seq_from_tap		= htonl(conn->seq_from_tap),
+		.seq_ack_to_tap		= htonl(conn->seq_ack_to_tap),
+		.seq_init_from_tap	= htonl(conn->seq_init_from_tap),
+	};
+
+	memcpy(&t.pif, conn->f.pif, sizeof(t.pif));
+	memcpy(&t.side, conn->f.side, sizeof(t.side));
+
+	if (write_all_buf(fd, &t, sizeof(t))) {
+		int rc = -errno;
+		err_perror("Can't write migration data, socket %i", conn->sock);
+		return rc;
+	}
 
 	return 0;
 }
 
 /**
- * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill
+ * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
  * @c:		Execution context
- * @now:	Current timestamp
+ * @fd:		Descriptor for state migration
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
  */
-void tcp_timer(struct ctx *c, const struct timespec *now)
+int tcp_flow_migrate_source_ext(const struct ctx *c,
+				int fd, const struct tcp_tap_conn *conn)
+{
+	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+	struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
+	int s = conn->sock;
+	int rc;
+
+	/* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode
+	 * weird.
+	 */
+	if (tcp_set_peek_offset(conn, -1)) {
+		rc = -errno;
+		goto fail;
+	}
+
+	if ((rc = tcp_flow_dump_tinfo(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_mss(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_timestamp(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_wnd(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_sndqueue(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd)))
+		goto fail;
+
+	if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_rcvqueue(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
+		goto fail;
+
+	if (c->migrate_no_linger)
+		close(s);
+	else
+		epoll_del(flow_epollfd(&conn->f), s);
+
+	/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
+	 * based on the end of the queues.
+	 */
+	t->seq_rcv	-= t->rcvq;
+	t->seq_snd	-= t->sndq;
+
+	flow_dbg(conn, "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t->seq_snd, t->seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t->sndq, t->notsent, t->rcvq);
+	flow_dbg(conn, "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s  offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+	/* Endianness fix-ups */
+	t->seq_snd	= htonl(t->seq_snd);
+	t->seq_rcv 	= htonl(t->seq_rcv);
+	t->sndq		= htonl(t->sndq);
+	t->notsent	= htonl(t->notsent);
+	t->rcvq		= htonl(t->rcvq);
+	t->mss		= htonl(t->mss);
+	t->timestamp	= htonl(t->timestamp);
+
+	t->snd_wl1	= htonl(t->snd_wl1);
+	t->snd_wnd	= htonl(t->snd_wnd);
+	t->max_window	= htonl(t->max_window);
+	t->rcv_wnd	= htonl(t->rcv_wnd);
+	t->rcv_wup	= htonl(t->rcv_wup);
+
+	if (write_all_buf(fd, t, sizeof(*t))) {
+		flow_perror(conn, "Failed to write extended data");
+		return -EIO;
+	}
+
+	if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) {
+		flow_perror(conn, "Failed to write send queue data");
+		return -EIO;
+	}
+
+	if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) {
+		flow_perror(conn, "Failed to write receive queue data");
+		return -EIO;
+	}
+
+	return 0;
+
+fail:
+	/* For any type of failure dumping data, write an invalid extended data
+	 * descriptor that allows us to keep the stream in sync, but tells the
+	 * target to skip the flow. If we fail to transfer data, that's fatal:
+	 * return -EIO in that case (and only in that case).
+	 */
+	t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */
+
+	if (write_all_buf(fd, t, sizeof(*t))) {
+		flow_perror(conn, "Failed to write extended data");
+		return -EIO;
+	}
+
+	if (rc == -EIO) /* but not a migration data transfer failure */
+		return -ENODATA;
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+	int s, rc;
+
+	if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+				 IPPROTO_TCP)) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Failed to create socket for migrated flow");
+		return rc;
+	}
+	s = conn->sock;
+
+	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
+		flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i",
+				s);
+
+	tcp_sock_set_nodelay(s);
+
+	if ((rc = tcp_flow_repair_on(c, conn)))
+		goto err;
+
+	return 0;
+
+err:
+	close(s);
+	conn->sock = -1;
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_bind() - Bind socket in repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+	const struct flowside *sockside = HOSTFLOW(conn);
+	union sockaddr_inany a;
+
+	pif_sockaddr(c, &a, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+	if (bind(conn->sock, &a.sa, socklen_inany(&a))) {
+		int rc = -errno;
+		flow_perror(conn, "Failed to bind socket for migrated flow");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_connect(const struct ctx *c,
+				   struct tcp_tap_conn *conn)
+{
+	const struct flowside *tgt = HOSTFLOW(conn);
+	int rc;
+
+	rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
+	if (rc) {
+		rc = -errno;
+		flow_perror(conn, "Failed to connect migrated socket");
+		return rc;
+	}
+
+	conn->timer = -1;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert
+ * @c:		Execution context
+ * @fd:		Descriptor for state migration
+ *
+ * Return: 0 on success, negative on fatal failure, but 0 on single flow failure
+ */
+int tcp_flow_migrate_target(struct ctx *c, int fd)
+{
+	struct tcp_tap_transfer t;
+	struct tcp_tap_conn *conn;
+	union flow *flow;
+	int rc;
+
+	if (!(flow = flow_alloc())) {
+		err("Flow table full on migration target");
+		return 0;
+	}
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		flow_perror(flow, "Failed to receive migration data");
+		flow_alloc_cancel(flow);
+		return -errno;
+	}
+
+	flow->f.state = FLOW_STATE_TGT;
+	memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif));
+	memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
+	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
+	conn->retries			= t.retries;
+	conn->ws_from_tap		= t.ws_from_tap;
+	conn->ws_to_tap			= t.ws_to_tap;
+	conn->events			= t.events;
+
+	conn->sndbuf			= htonl(t.sndbuf);
+
+	conn->flags			= t.flags;
+	conn->seq_dup_ack_approx	= t.seq_dup_ack_approx;
+
+	MSS_SET(conn,			  ntohl(t.tap_mss));
+
+	conn->wnd_from_tap		= ntohs(t.wnd_from_tap);
+	conn->wnd_to_tap		= ntohs(t.wnd_to_tap);
+
+	conn->seq_to_tap		= ntohl(t.seq_to_tap);
+	conn->seq_ack_from_tap		= ntohl(t.seq_ack_from_tap);
+	conn->seq_from_tap		= ntohl(t.seq_from_tap);
+	conn->seq_ack_to_tap		= ntohl(t.seq_ack_to_tap);
+	conn->seq_init_from_tap		= ntohl(t.seq_init_from_tap);
+
+	if ((rc = tcp_flow_repair_socket(c, conn))) {
+		flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
+		goto out;
+	}
+
+	flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+	if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->sock,
+			   !TAPSIDE(conn)))
+		goto out; /* tcp_flow_migrate_target_ext() will clean this up */
+
+	flow_hash_insert(c, TAP_SIDX(conn));
+
+out:
+	/* Never leave the flow in an incomplete state */
+	FLOW_ACTIVATE(conn);
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
+ * @c:		Execution context
+ * @conn:	Connection entry to complete with extra data
+ * @fd:		Descriptor for state migration
+ *
+ * Return: 0 on success, negative on fatal failure, but 0 on single flow failure
+ */
+int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd)
 {
-	(void)now;
+	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+	struct tcp_tap_transfer_ext t;
+	int s = conn->sock, rc;
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		rc = -errno;
+		flow_perror(conn, "Failed to read extended data");
+		return rc;
+	}
+
+	if (!t.tcpi_state) { /* Source wants us to skip this flow */
+		flow_err(conn, "Dropping as requested by source");
+		goto fail;
+	}
+
+	/* Endianness fix-ups */
+	t.seq_snd	= ntohl(t.seq_snd);
+	t.seq_rcv 	= ntohl(t.seq_rcv);
+	t.sndq		= ntohl(t.sndq);
+	t.notsent	= ntohl(t.notsent);
+	t.rcvq		= ntohl(t.rcvq);
+	t.mss		= ntohl(t.mss);
+	t.timestamp	= ntohl(t.timestamp);
+
+	t.snd_wl1	= ntohl(t.snd_wl1);
+	t.snd_wnd	= ntohl(t.snd_wnd);
+	t.max_window	= ntohl(t.max_window);
+	t.rcv_wnd	= ntohl(t.rcv_wnd);
+	t.rcv_wup	= ntohl(t.rcv_wup);
+
+	flow_dbg(conn,
+		 "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t.seq_snd, t.seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t.sndq, t.notsent, t.rcvq);
+	flow_dbg(conn,
+		 "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s  offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+	if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
+	    t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		flow_err(conn,
+			 "Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+			 s, t.sndq, t.notsent, t.rcvq);
+		return -EINVAL;
+	}
+
+	if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
+		rc = -errno;
+		flow_perror(conn, "Failed to read send queue data");
+		return rc;
+	}
+
+	if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
+		rc = -errno;
+		flow_perror(conn, "Failed to read receive queue data");
+		return rc;
+	}
+
+	if (conn->sock < 0)
+		/* We weren't able to create the socket, discard flow */
+		goto fail;
+
+	if (tcp_flow_repair_bind(c, conn))
+		goto fail;
+
+	if (tcp_flow_repair_timestamp(conn, &t))
+		goto fail;
+
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_seq(conn, &t.seq_snd))
+		goto fail;
+
+	if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_seq(conn, &t.seq_rcv))
+		goto fail;
+
+	if (tcp_flow_repair_connect(c, conn))
+		goto fail;
 
-	if (c->mode == MODE_PASTA) {
-		if (c->tcp.fwd_out.mode == FWD_AUTO) {
-			fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
-			NS_CALL(tcp_port_rebind_outbound, c);
+	if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue))
+		goto fail;
+
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_queue(conn, t.sndq - t.notsent,
+				  tcp_migrate_snd_queue))
+		goto fail;
+
+	if (tcp_flow_repair_opt(conn, &t))
+		goto fail;
+
+	/* If we sent a FIN sent and it was acknowledged (TCP_FIN_WAIT2), don't
+	 * send it out, because we already sent it for sure.
+	 *
+	 * Call shutdown(x, SHUT_WR) in repair mode, so that we move to
+	 * FIN_WAIT_1 (tcp_shutdown()) without sending anything
+	 * (goto in tcp_write_xmit()).
+	 */
+	if (t.tcpi_state == TCP_FIN_WAIT2) {
+		int v;
+
+		v = TCP_SEND_QUEUE;
+		if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) {
+			flow_perror(conn, "Selecting repair queue");
+		} else {
+			if (shutdown(s, SHUT_WR) < 0) {
+				flow_perror(conn,
+					    "Repair mode shutdown() failed");
+				goto fail;
+			}
 		}
+	}
+
+	if (tcp_flow_repair_wnd(conn, &t))
+		goto fail;
 
-		if (c->tcp.fwd_in.mode == FWD_AUTO) {
-			fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
-			tcp_port_rebind(c, false);
+	tcp_flow_repair_off(c, conn);
+	repair_flush(c);
+
+	if (t.notsent) {
+		if (tcp_flow_repair_queue(conn, t.notsent,
+					  tcp_migrate_snd_queue +
+					  (t.sndq - t.notsent))) {
+			/* This sometimes seems to fail for unclear reasons.
+			 * Don't fail the whole migration, just reset the flow
+			 * and carry on to the next one.
+			 */
+			goto fail;
 		}
 	}
 
-	tcp_sock_refill_init(c);
-	if (c->mode == MODE_PASTA)
-		tcp_splice_refill(c);
+	/* If we sent a FIN but it wasn't acknowledged yet (TCP_FIN_WAIT1), send
+	 * it out, because we don't know if we already sent it.
+	 *
+	 * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to
+	 * TCP_FIN_WAIT1.
+	 */
+	if (t.tcpi_state == TCP_FIN_WAIT1) {
+		if (shutdown(s, SHUT_WR) < 0) {
+			flow_perror(conn, "Post-repair shutdown() failed");
+			goto fail;
+		}
+	}
+
+	if (tcp_set_peek_offset(conn, peek_offset))
+		goto fail;
+
+	if (tcp_send_flag(c, conn, ACK))
+		goto fail;
+
+	tcp_data_from_sock(c, conn);
+
+	if ((rc = tcp_epoll_ctl(conn))) {
+		flow_dbg(conn,
+			 "Failed to subscribe to epoll for migrated socket: %s",
+			 strerror_(-rc));
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	if (conn->sock >= 0) {
+		tcp_flow_repair_off(c, conn);
+		repair_flush(c);
+	}
+
+	conn->flags = 0; /* Not waiting for ACK, don't schedule timer */
+	tcp_rst(c, conn);
+
+	return 0;
+}
+
+/**
+ * tcp_prepare_iov() - Prepare iov according to kernel capability
+ * @msg:		Message header to update
+ * @iov:		iovec to receive TCP payload and data to discard
+ * @already_sent:	Bytes sent after the last acknowledged one
+ * @payload_iov_cnt:	Number of TCP payload iovec entries
+ *
+ * Return: 0 on success, -1 if already_sent cannot be discarded fully
+ */
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+		    uint32_t already_sent, int payload_iov_cnt)
+{
+	/*
+	 * IOV layout
+	 * |- tcp_buf_discard -|---------- TCP data slots ------------|
+	 *
+	 * with discarded data:
+	 * |------ddddddddddddd|ttttttttttttt-------------------------|
+	 *        ^
+	 *        |
+	 *     msg_iov
+	 *
+	 * without discarded data:
+	 * |-------------------|ttttttttttttt-------------------------|
+	 *                      ^
+	 *                      |
+	 *                   msg_iov
+	 * d: discard data
+	 * t: TCP data
+	 */
+	if (peek_offset_cap) {
+		msg->msg_iov = iov + DISCARD_IOV_NUM;
+		msg->msg_iovlen = payload_iov_cnt;
+	} else {
+		int discard_cnt, discard_iov_rem;
+		struct iovec *iov_start;
+		int i;
+
+		discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE);
+		if (discard_cnt > DISCARD_IOV_NUM) {
+			debug("Failed to discard %u already sent bytes",
+				already_sent);
+			return -1;
+		}
+
+		discard_iov_rem = already_sent % BUF_DISCARD_SIZE;
+
+		iov_start = iov + (DISCARD_IOV_NUM - discard_cnt);
+
+		/* Multiple iov entries pointing to the same buffer */
+		for (i = 0; i < discard_cnt; i++) {
+			iov_start[i].iov_base = tcp_buf_discard;
+			iov_start[i].iov_len = BUF_DISCARD_SIZE;
+		}
+		if (discard_iov_rem)
+			iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem;
+
+		msg->msg_iov = iov_start;
+		msg->msg_iovlen = discard_cnt + payload_iov_cnt;
+	}
+
+	return 0;
 }