Diffstat (limited to 'tcp.c')
| -rw-r--r-- | tcp.c | 2123 |
1 files changed, 1779 insertions, 344 deletions
@@ -179,14 +179,16 @@ * * Timeouts are implemented by means of timerfd timers, set based on flags: * - * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag - * ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the - * connection + * - RTO_INIT: if no ACK segment was received from tap/guest, either during + * handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after + * sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data + * from the socket and reset sequence to what was acknowledged. This is the + * timeout for the first retry, in seconds. Retry TCP_MAX_RETRIES times for + * established connections, or (syn_retries + syn_linear_timeouts) times + * during the handshake, then reset the connection * - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending - * data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the - * socket and reset sequence to what was acknowledged. If this persists for - * more than TCP_MAX_RETRANS times in a row, reset the connection + * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and + * RTO is less than this, re-initialise RTO to this for data retransmissions * * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE * with TAP_FIN_SENT event), and no ACK is received within this time, reset @@ -200,9 +202,13 @@ * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on * either side, the connection is reset * - * - ACK_INTERVAL elapsed after data segment received from tap without having + * - RTT / 2 elapsed after data segment received from tap without having * sent an ACK segment, or zero-sized window advertised to tap/guest (flag - * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent + * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent. + * + * RTT, here, is an approximation of the RTT value reported by the kernel via + * TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to + * RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly. * * * Summary of data flows (with ESTABLISHED event) @@ -279,7 +285,7 @@ #include <stdbool.h> #include <stddef.h> #include <string.h> -#include <sys/epoll.h> +#include <sys/ioctl.h> #include <sys/socket.h> #include <sys/timerfd.h> #include <sys/types.h> @@ -287,6 +293,8 @@ #include <time.h> #include <arpa/inet.h> +#include <linux/sockios.h> + #include "checksum.h" #include "util.h" #include "iov.h" @@ -299,26 +307,62 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "repair.h" #include "linux_dep.h" #include "flow_table.h" #include "tcp_internal.h" #include "tcp_buf.h" #include "tcp_vu.h" +#include "epoll_ctl.h" + +/* + * The size of TCP header (including options) is given by doff (Data Offset) + * that is a 4-bit value specifying the number of 32-bit words in the header. + * The maximum value of doff is 15 [(1 << 4) - 1]. + * The maximum length in bytes of options is 15 minus the number of 32-bit + * words in the minimal TCP header (5) multiplied by the length of a 32-bit + * word (4). 
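+ * That is, the longest possible options field works out to
+ * (15 - 5) * 4 = 40 bytes, the OPTLEN_MAX value defined below.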
+ */ +#define OPTLEN_MAX (((1UL << 4) - 1 - 5) * 4UL) + +#ifndef __USE_MISC +/* From Linux UAPI, missing in netinet/tcp.h provided by musl */ +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; +#endif /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ -#define ACK_INTERVAL 10 /* ms */ -#define SYN_TIMEOUT 10 /* s */ -#define ACK_TIMEOUT 2 +#define RTO_INIT 1 /* s, RFC 6298 */ +#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */ #define FIN_TIMEOUT 60 #define ACT_TIMEOUT 7200 #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ +/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */ +#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */ +/* ...examples: 5 MB sent * 500 ns RTT, 250 kB * 10 ms, 8 kB * 300 ms */ +#define SNDBUF_BOOST_FACTOR 150 /* % */ +#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */ +/* 12 MB sent * 500 ns RTT, 600 kB * 10 ms, 20 kB * 300 ms */ + +/* Ratio of buffer to bandwidth * delay product implying interactive traffic */ +#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. < 5% of buffer) */ + #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define CONN_IS_CLOSING(conn) \ @@ -326,6 +370,28 @@ ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) #define CONN_HAS(conn, set) (((conn)->events & (set)) == (set)) +/* Buffers to migrate pending data from send and receive queues. No, they don't + * use memory if we don't use them. And we're going away after this, so splurge. + */ +#define TCP_MIGRATE_SND_QUEUE_MAX (64 << 20) +#define TCP_MIGRATE_RCV_QUEUE_MAX (64 << 20) +uint8_t tcp_migrate_snd_queue [TCP_MIGRATE_SND_QUEUE_MAX]; +uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX]; + +#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */ + +#define SYN_RETRIES "/proc/sys/net/ipv4/tcp_syn_retries" +#define SYN_LINEAR_TIMEOUTS "/proc/sys/net/ipv4/tcp_syn_linear_timeouts" +#define RTO_MAX_MS "/proc/sys/net/ipv4/tcp_rto_max_ms" + +#define SYN_RETRIES_DEFAULT 6 +#define SYN_LINEAR_TIMEOUTS_DEFAULT 4 +#define RTO_MAX_DEFAULT 120 /* s */ +#define MAX_SYNCNT 127 /* derived from kernel's limit */ + +/* "Extended" data (not stored in the flow table) for TCP flow migration */ +static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX]; + static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -338,14 +404,14 @@ static const char *tcp_state_str[] __attribute((__unused__)) = { "SYN_RCVD", /* approximately maps to TAP_SYN_ACK_SENT */ /* Passive close: */ - "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK", + "CLOSE_WAIT", "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", /* Active close (+5): */ "CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT", }; static const char *tcp_flag_str[] __attribute((__unused__)) = { "STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE", - "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", + "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED", }; /* Listening sockets, used for automatic port forwarding in pasta mode only */ @@ -357,7 +423,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; -char tcp_buf_discard [MAX_WINDOW]; +char tcp_buf_discard [BUF_DISCARD_SIZE]; /* Does the kernel support TCP_PEEK_OFF? 
*/ bool peek_offset_cap; @@ -370,11 +436,13 @@ socklen_t tcp_info_size; sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size) /* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */ -#define snd_wnd_cap tcp_info_cap(snd_wnd) +#define snd_wnd_cap tcp_info_cap(snd_wnd) /* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */ -#define bytes_acked_cap tcp_info_cap(bytes_acked) +#define bytes_acked_cap tcp_info_cap(bytes_acked) /* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */ -#define min_rtt_cap tcp_info_cap(min_rtt) +#define min_rtt_cap tcp_info_cap(min_rtt) +/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */ +#define delivery_rate_cap tcp_info_cap(delivery_rate) /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -402,19 +470,20 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx) } /** - * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported - * @s: Socket to update + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported + * @conn: Pointer to the TCP connection structure * @offset: Offset in bytes * - * Return: -1 when it fails, 0 otherwise. + * Return: -1 when it fails, 0 otherwise. */ -int tcp_set_peek_offset(int s, int offset) +int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset) { if (!peek_offset_cap) return 0; - if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) { - err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s); + if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF, + &offset, sizeof(offset))) { + flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset); return -1; } return 0; @@ -423,7 +492,7 @@ int tcp_set_peek_offset(int s, int offset) /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events - * @conn_flags Connection flags + * @conn_flags: Connection flags * * Return: epoll events mask corresponding to implied connection state */ @@ -461,34 +530,37 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) */ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; struct epoll_event ev = { .data.u64 = ref.u64 }; + int epollfd = flow_in_epoll(&conn->f) ? 
flow_epollfd(&conn->f) + : c->epollfd; if (conn->events == CLOSED) { - if (conn->in_epoll) - epoll_del(c, conn->sock); + if (flow_in_epoll(&conn->f)) + epoll_del(epollfd, conn->sock); if (conn->timer != -1) - epoll_del(c, conn->timer); + epoll_del(epollfd, conn->timer); return 0; } ev.events = tcp_conn_epoll_events(conn->events, conn->flags); - if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) + if (epoll_ctl(epollfd, m, conn->sock, &ev)) return -errno; - conn->in_epoll = true; + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); if (conn->timer != -1) { union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, - .fd = conn->sock, + .fd = conn->timer, .flow = FLOW_IDX(conn) }; struct epoll_event ev_t = { .data.u64 = ref_t.u64, .events = EPOLLIN | EPOLLET }; - if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t)) + if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD, + conn->timer, &ev_t)) return -errno; } @@ -499,8 +571,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed * @c: Execution context * @conn: Connection pointer - * - * #syscalls timerfd_create timerfd_settime + * #syscalls timerfd_create timerfd_settime|timerfd_settime32 */ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { @@ -511,26 +582,25 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->timer == -1) { union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER, - .fd = conn->sock, .flow = FLOW_IDX(conn) }; struct epoll_event ev = { .data.u64 = ref.u64, .events = EPOLLIN | EPOLLET }; + int epollfd = flow_epollfd(&conn->f); int fd; fd = timerfd_create(CLOCK_MONOTONIC, 0); if (fd == -1 || fd > FD_REF_MAX) { - flow_dbg(conn, "failed to get timer: %s", - strerror_(errno)); + flow_dbg_perror(conn, "failed to get timer"); if (fd > -1) close(fd); conn->timer = -1; return; } conn->timer = fd; + ref.fd = conn->timer; - if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { - flow_dbg(conn, "failed to add timer: %s", - strerror_(errno)); + if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { + flow_dbg_perror(conn, "failed to add timer"); close(conn->timer); conn->timer = -1; return; @@ -538,24 +608,37 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) } if (conn->flags & ACK_TO_TAP_DUE) { - it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; + it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000); + it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) * + 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { + int exp = conn->retries, timeout = RTO_INIT; if (!(conn->events & ESTABLISHED)) - it.it_value.tv_sec = SYN_TIMEOUT; - else - it.it_value.tv_sec = ACK_TIMEOUT; + exp -= c->tcp.syn_linear_timeouts; + else if (conn->flags & SYN_RETRIED) + timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES); + timeout <<= MAX(exp, 0); + it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max); } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { it.it_value.tv_sec = FIN_TIMEOUT; } else { it.it_value.tv_sec = ACT_TIMEOUT; } - flow_dbg(conn, "timer expires in %llu.%03llus", - (unsigned long long)it.it_value.tv_sec, - (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); + if (conn->flags & ACK_TO_TAP_DUE) { + flow_trace(conn, "timer expires in %llu.%02llums", + (unsigned long long)it.it_value.tv_sec * 1000 + + it.it_value.tv_nsec / 1000 / 1000, + (unsigned long long)it.it_value.tv_nsec + / 1000 / 10 % 100); + } else { + flow_dbg(conn, 
"timer expires in %llu.%03llus", + (unsigned long long)it.it_value.tv_sec, + (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); + } if (timerfd_settime(conn->timer, 0, &it, NULL)) - flow_err(conn, "failed to set timer: %s", strerror_(errno)); + flow_perror(conn, "failed to set timer"); } /** @@ -650,12 +733,13 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, flow_dbg(conn, "%s", num == -1 ? "CLOSED" : tcp_event_str[num]); - if (event == CLOSED) - flow_hash_remove(c, TAP_SIDX(conn)); - else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) + if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) { conn_flag(c, conn, ACTIVE_CLOSE); - else + } else { + if (event == CLOSED) + flow_hash_remove(c, TAP_SIDX(conn)); tcp_epoll_ctl(c, conn); + } if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) tcp_timer_ctl(c, conn); @@ -714,7 +798,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, } /** - * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage) + * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.75 usage) * @conn: Connection pointer */ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) @@ -729,34 +813,12 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) return; } - v = sndbuf; - if (v >= SNDBUF_BIG) - v /= 2; - else if (v > SNDBUF_SMALL) - v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2; + v = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL, SNDBUF_BIG, 75); SNDBUF_SET(conn, MIN(INT_MAX, v)); } /** - * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values - * @s: Socket, can be -1 to avoid check in the caller - */ -static void tcp_sock_set_bufsize(const struct ctx *c, int s) -{ - int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */ - - if (s == -1) - return; - - if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_RCVBUF to %i", v); - - if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_SNDBUF to %i", v); -} - -/** * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm) * @s: Socket, can be -1 to avoid check in the caller */ @@ -775,7 +837,8 @@ static void tcp_sock_set_nodelay(int s) * @th: TCP header (updated) * @payload: TCP payload */ -void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload) +static void tcp_update_csum(uint32_t psum, struct tcphdr *th, + struct iov_tail *payload) { th->check = 0; psum = csum_unfolded(th, sizeof(*th), psum); @@ -895,8 +958,10 @@ static void tcp_fill_header(struct tcphdr *th, /** * tcp_fill_headers() - Fill 802.3, IP, TCP headers + * @c: Execution context * @conn: Connection pointer * @taph: tap backend specific header + * @eh: Pointer to Ethernet header * @ip4h: Pointer to IPv4 header, or NULL * @ip6h: Pointer to IPv6 header, or NULL * @th: Pointer to TCP header @@ -905,14 +970,15 @@ static void tcp_fill_header(struct tcphdr *th, * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum */ -void tcp_fill_headers(const struct tcp_tap_conn *conn, - struct tap_hdr *taph, +void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, + struct tap_hdr *taph, struct ethhdr *eh, struct iphdr *ip4h, struct ipv6hdr *ip6h, struct tcphdr *th, struct iov_tail *payload, const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); size_t l4len = iov_tail_size(payload) + sizeof(*th); + uint8_t 
*omac = conn->f.tap_omac; size_t l3len = l4len; uint32_t psum = 0; @@ -938,6 +1004,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn, psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, *src4, *dst4); } + eh->h_proto = htons_constant(ETH_P_IP); } if (ip6h) { @@ -951,17 +1018,21 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn, ip6h->version = 6; ip6h->nexthdr = IPPROTO_TCP; - ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf; - ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; - ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; + ip6_set_flow_lbl(ip6h, conn->sock); if (!no_tcp_csum) { psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr, &ip6h->daddr); } + eh->h_proto = htons_constant(ETH_P_IPV6); } + /* Find if neighbour table has a recorded MAC address */ + if (MAC_IS_UNDEF(omac)) + fwd_neigh_mac_get(c, &tapside->oaddr, omac); + eth_update_mac(eh, NULL, omac); + tcp_fill_header(th, conn, seq); if (no_tcp_csum) @@ -969,7 +1040,36 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn, else tcp_update_csum(psum, th, payload); - tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); + tap_hdr_update(taph, MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN)); +} + +/** + * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning + * @conn: Connection pointer + * @tinfo: tcp_info from kernel, must be pre-fetched + * + * Return: increased sending buffer to use as a limit for advertised window + */ +static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn, + const struct tcp_info_linux *tinfo) +{ + unsigned long bytes_rtt_product; + + if (!bytes_acked_cap) + return SNDBUF_GET(conn); + + /* This is *not* a bandwidth-delay product, but it's somewhat related: + * as we send more data (usually at the beginning of a connection), we + * try to make the sending buffer progressively grow, with the RTT as a + * factor (longer delay, bigger buffer needed). + */ + bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked * + tinfo->tcpi_rtt / 1000 / 1000; + + return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product, + SNDBUF_BOOST_BYTES_RTT_LO, + SNDBUF_BOOST_BYTES_RTT_HI, + SNDBUF_BOOST_FACTOR); } /** @@ -980,6 +1080,8 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn, * @tinfo: tcp_info from kernel, can be NULL if not pre-fetched * * Return: 1 if sequence or window were updated, 0 otherwise + * + * #syscalls ioctl */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, bool force_seq, struct tcp_info_linux *tinfo) @@ -990,32 +1092,75 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, socklen_t sl = sizeof(*tinfo); struct tcp_info_linux tinfo_new; uint32_t new_wnd_to_tap = prev_wnd_to_tap; + bool ack_everything = true; int s = conn->sock; - if (!bytes_acked_cap) { - conn->seq_ack_to_tap = conn->seq_from_tap; - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) - conn->seq_ack_to_tap = prev_ack_to_tap; - } else { - if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || - tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) || - (conn->flags & LOCAL) || force_seq) { - conn->seq_ack_to_tap = conn->seq_from_tap; - } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { - if (!tinfo) { - tinfo = &tinfo_new; - if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) - return 0; - } + /* At this point we could ack all the data we've accepted for forwarding + * (seq_from_tap). When possible, however, we want to only acknowledge + * what the peer has acknowledged. 
This makes it appear to the guest + * more like a direct connection to the peer, and may improve flow + * control behaviour. + * + * For it to be possible and worth it we need: + * - The TCP_INFO Linux extensions which give us the peer acked bytes + * and the delivery rate (outbound bandwidth at receiver) + * - Not to be told not to (force_seq) + * - Not half-closed in the peer->guest direction + * With no data coming from the peer, we might not get events which + * would prompt us to recheck bytes_acked. We could poll on a + * timer, but that's more trouble than it's worth. + * - Not a host local connection + * Data goes from socket to socket, with nothing meaningfully "in + * flight". + * - Not a pseudo-local connection (e.g. to a VM on the same host) + * If it is, there's not enough in flight to bother. + * - Sending buffer significantly larger than bandwidth * delay product + * Meaning we're not bandwidth-bound and this is likely to be + * interactive traffic where we want to preserve transparent + * connection behaviour and latency. + * + * Otherwise, we probably want to maximise throughput, which needs + * sending buffer auto-tuning, triggered in turn by filling up the + * outbound socket queue. + */ + if (bytes_acked_cap && delivery_rate_cap && !force_seq && + !CONN_IS_CLOSING(conn) && + !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) { + if (!tinfo) { + tinfo = &tinfo_new; + if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) + return 0; + } - conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + - conn->seq_init_from_tap; + /* This trips a cppcheck bug in some versions, including + * cppcheck 2.18.3. + * https://trac.cppcheck.net/ticket/14191 + */ + /* cppcheck-suppress [uninitvar,unmatchedSuppression] */ + if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt * + tinfo->tcpi_delivery_rate / + 1000 / 1000 * + SNDBUF_TO_BW_DELAY_INTERACTIVE) + ack_everything = false; + } - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) - conn->seq_ack_to_tap = prev_ack_to_tap; - } + if (ack_everything) { + /* Fall back to acknowledging everything we got */ + conn->seq_ack_to_tap = conn->seq_from_tap; + } else { + /* cppcheck bug 14191 again, see above */ + /* cppcheck-suppress [uninitvar,unmatchedSuppression] */ + conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + + conn->seq_init_from_tap; } + /* It's occasionally possible for us to go from using the fallback above + * to the tcpi_bytes_acked method. In that case, we must be careful not + * to let our ACKed sequence go backwards. + */ + if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) + conn->seq_ack_to_tap = prev_ack_to_tap; + if (!snd_wnd_cap) { tcp_get_sndbuf(conn); new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); @@ -1037,9 +1182,53 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { new_wnd_to_tap = tinfo->tcpi_snd_wnd; } else { + uint32_t sendq; + int limit; + + if (ioctl(s, SIOCOUTQ, &sendq)) { + debug_perror("SIOCOUTQ on socket %i, assuming 0", s); + sendq = 0; + } tcp_get_sndbuf(conn); - new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, - SNDBUF_GET(conn)); + + if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? 
*/ + limit = 0; + else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn)) + limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq; + else + limit = SNDBUF_GET(conn) - (int)sendq; + + /* If the sender uses mechanisms to prevent Silly Window + * Syndrome (SWS, described in RFC 813 Section 3) it's critical + * that, should the window ever become less than the MSS, we + * advertise a new value once it increases again to be above it. + * + * The mechanism to avoid SWS in the kernel is, implicitly, + * implemented by Nagle's algorithm (which was proposed after + * RFC 813). + * + * To this end, for simplicity, approximate a window value below + * the MSS to zero, as we already have mechanisms in place to + * force updates after the window becomes zero. This matches the + * suggestion from RFC 813, Section 4. + * + * But don't do this if, either: + * + * - there's nothing in the outbound queue: the size of the + * sending buffer is limiting us, and it won't increase if we + * don't send data, so there's no point in waiting, or + * + * - we haven't sent data in a while (somewhat arbitrarily, ten + * times the RTT), as that might indicate that the receiver + * will only process data in batches that are large enough, + * but we won't send enough to fill one because we're stuck + * with pending data in the outbound queue + */ + if (limit < MSS_GET(conn) && sendq && + tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10) + limit = 0; + + new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit); } new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); @@ -1059,6 +1248,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, conn_flag(c, conn, ACK_TO_TAP_DUE); out: + /* Opportunistically store RTT approximation on valid TCP_INFO data */ + if (tinfo) + RTT_SET(conn, tinfo->tcpi_rtt); + return new_wnd_to_tap != prev_wnd_to_tap || conn->seq_ack_to_tap != prev_ack_to_tap; } @@ -1067,7 +1260,7 @@ out: * tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters * @c: Execution context * @conn: Connection pointer - * @seq Current ACK sequence, host order + * @seq: Current ACK sequence, host order */ static void tcp_update_seqack_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, uint32_t seq) @@ -1080,18 +1273,38 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, if (SEQ_LT(seq, conn->seq_to_tap)) conn_flag(c, conn, ACK_FROM_TAP_DUE); - conn->retrans = 0; + conn->retries = 0; conn->seq_ack_from_tap = seq; } } /** + * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK + * @c: Execution context + * @conn: Connection pointer + * + * Return: 0 on success, -1 on failure, with connection reset + */ +static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn) +{ + conn->seq_to_tap = conn->seq_ack_from_tap; + conn->events &= ~TAP_FIN_SENT; + + if (tcp_set_peek_offset(conn, 0)) { + tcp_rst(c, conn); + return -1; + } + + return 0; +} + +/** * tcp_prepare_flags() - Prepare header for flags-only segment (no payload) * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due * @th: TCP header to update - * @data: buffer to store TCP option + * @opts: TCP option buffer (output parameter) * @optlen: size of the TCP option buffer (output parameter) * * Return: < 0 error code on connection reset, @@ -1127,7 +1340,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, if (flags & SYN) { int mss; - if (c->mtu == -1) { + if (!c->mtu) { mss = tinfo.tcpi_snd_mss; } 
else { mss = c->mtu - sizeof(struct tcphdr); @@ -1154,12 +1367,14 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, th->doff = (sizeof(*th) + *optlen) / 4; th->ack = !!(flags & ACK); + th->psh = !!(flags & PSH); th->rst = !!(flags & RST); th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); if (th->ack) { - if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) + if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && + conn->wnd_to_tap) conn_flag(c, conn, ~ACK_TO_TAP_DUE); else conn_flag(c, conn, ACK_TO_TAP_DUE); @@ -1202,8 +1417,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->events == CLOSED) return; - if (!tcp_send_flag(c, conn, RST)) - conn_event(c, conn, CLOSED); + tcp_send_flag(c, conn, RST); + conn_event(c, conn, CLOSED); } /** @@ -1225,30 +1440,41 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, /** * tcp_tap_window_update() - Process an updated window from tap side + * @c: Execution context * @conn: Connection pointer - * @window: Window value, host order, unscaled + * @wnd: Window value, host order, unscaled + * + * Return: false on zero window (not stored to wnd_from_tap), true otherwise */ -static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) +static bool tcp_tap_window_update(const struct ctx *c, + struct tcp_tap_conn *conn, unsigned wnd) { wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); /* Work-around for bug introduced in peer kernel code, commit - * e2142825c120 ("net: tcp: send zero-window ACK when no memory"). - * We don't update if window shrank to zero. + * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't + * update the window if it shrank to zero, so that we'll eventually + * retry to send data, but rewind the sequence as that obviously implies + * that no data beyond the updated window will be acknowledged. */ - if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) - return; + if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) { + tcp_rewind_seq(c, conn); + return false; + } conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); /* FIXME: reflect the tap-side receiver's window back to the sock-side * sender by adjusting SO_RCVBUF? */ + return true; } /** * tcp_init_seq() - Calculate initial sequence number according to RFC 6528 * @hash: Hash of connection details * @now: Current timestamp + * + * Return: the calculated 32-bit initial sequence number. */ static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now) { @@ -1278,12 +1504,11 @@ int tcp_conn_pool_sock(int pool[]) /** * tcp_conn_new_sock() - Open and prepare new socket for connection - * @c: Execution context * @af: Address family * * Return: socket number on success, negative code if socket creation failed */ -static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) +static int tcp_conn_new_sock(sa_family_t af) { int s; @@ -1297,7 +1522,6 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) if (s < 0) return -errno; - tcp_sock_set_bufsize(c, s); tcp_sock_set_nodelay(s); return s; @@ -1305,12 +1529,11 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) /** * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace - * @c: Execution context * @af: Address family (AF_INET or AF_INET6) * - * Return: Socket fd on success, -errno on failure + * Return: socket fd on success, -errno on failure */ -int tcp_conn_sock(const struct ctx *c, sa_family_t af) +int tcp_conn_sock(sa_family_t af) { int *pool = af == AF_INET6 ? 
init_sock_pool6 : init_sock_pool4; int s; @@ -1321,7 +1544,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af) /* If the pool is empty we just open a new one without refilling the * pool to keep latency down. */ - if ((s = tcp_conn_new_sock(c, af)) >= 0) + if ((s = tcp_conn_new_sock(af)) >= 0) return s; err("TCP: Unable to open socket for new connection: %s", @@ -1367,18 +1590,17 @@ static void tcp_bind_outbound(const struct ctx *c, { const struct flowside *tgt = &conn->f.side[TGTSIDE]; union sockaddr_inany bind_sa; - socklen_t sl; - pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->oaddr, tgt->oport); + pif_sockaddr(c, &bind_sa, PIF_HOST, &tgt->oaddr, tgt->oport); if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) { - if (bind(s, &bind_sa.sa, sl)) { + if (bind(s, &bind_sa.sa, socklen_inany(&bind_sa))) { char sstr[INANY_ADDRSTRLEN]; - flow_dbg(conn, - "Can't bind TCP outbound socket to %s:%hu: %s", - inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), - tgt->oport, strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind TCP outbound socket to %s:%hu", + inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), + tgt->oport); } } @@ -1387,9 +1609,9 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip4.ifname_out, strlen(c->ip4.ifname_out))) { - flow_dbg(conn, "Can't bind IPv4 TCP socket to" - " interface %s: %s", c->ip4.ifname_out, - strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind IPv4 TCP socket to interface %s", + c->ip4.ifname_out); } } } else if (bind_sa.sa_family == AF_INET6) { @@ -1397,9 +1619,9 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip6.ifname_out, strlen(c->ip6.ifname_out))) { - flow_dbg(conn, "Can't bind IPv6 TCP socket to" - " interface %s: %s", c->ip6.ifname_out, - strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind IPv6 TCP socket to interface %s", + c->ip6.ifname_out); } } } @@ -1432,7 +1654,6 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, union flow *flow; int s = -1, mss; uint64_t hash; - socklen_t sl; if (!(flow = flow_alloc())) return; @@ -1462,10 +1683,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, goto cancel; } - if ((s = tcp_conn_sock(c, af)) < 0) + if ((s = tcp_conn_sock(af)) < 0) goto cancel; - pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport); + pif_sockaddr(c, &sa, PIF_HOST, &tgt->eaddr, tgt->eport); /* Use bind() to check if the target address is local (EADDRINUSE or * similar) and already bound, and set the LOCAL flag in that case. @@ -1477,18 +1698,19 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, * * So, if bind() succeeds, close the socket, get a new one, and proceed. 
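 *
 * That is, the three possible outcomes of the bind() below are:
 * - failure with EADDRNOTAVAIL or EACCES: non-local destination, use
 *   this socket as it is
 * - failure with any other error (e.g. EADDRINUSE): local and already
 *   bound destination, so the LOCAL flag is set
 * - success: local but unbound destination, inconclusive: the socket
 *   is closed and replaced before connecting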
*/ - if (bind(s, &sa.sa, sl)) { + if (bind(s, &sa.sa, socklen_inany(&sa))) { if (errno != EADDRNOTAVAIL && errno != EACCES) conn_flag(c, conn, LOCAL); } else { /* Not a local, bound destination, inconclusive test */ close(s); - if ((s = tcp_conn_sock(c, af)) < 0) + if ((s = tcp_conn_sock(af)) < 0) goto cancel; } conn->sock = s; conn->timer = -1; + conn->listening_sock = -1; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; @@ -1516,7 +1738,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, tcp_bind_outbound(c, conn, s); - if (connect(s, &sa.sa, sl)) { + if (connect(s, &sa.sa, socklen_inany(&sa))) { if (errno != EINPROGRESS) { tcp_rst(c, conn); goto cancel; @@ -1535,13 +1757,11 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, tcp_epoll_ctl(c, conn); if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ - sl = sizeof(sa); - if (!getsockname(s, &sa.sa, &sl)) { - inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa); - } else { - err("Failed to get local address for socket: %s", - strerror_(errno)); - } + socklen_t sl = sizeof(sa); + + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0) + err_perror("Can't get local address for socket %i", s); } FLOW_ACTIVATE(conn); @@ -1604,6 +1824,23 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } /** + * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet + * @th: Pointer to TCP header + * @l4len: TCP packet length, including TCP header + * + * Return: data length of TCP packet, -1 on invalid value of Data Offset field + */ +static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len) +{ + size_t off = th->doff * 4UL; + + if (off < sizeof(*th) || off > l4len) + return -1; + + return l4len - off; +} + +/** * tcp_data_from_tap() - tap/guest data for established connection * @c: Execution context * @conn: Connection pointer @@ -1632,16 +1869,22 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, for (i = idx, iov_i = 0; i < (int)p->count; i++) { uint32_t seq, seq_offset, ack_seq; + struct tcphdr th_storage; const struct tcphdr *th; - char *data; - size_t off; + struct iov_tail data; + size_t off, size; + int count; + + if (!packet_get(p, i, &data)) + return -1; - th = packet_get(p, i, 0, sizeof(*th), &len); + th = IOV_PEEK_HEADER(&data, th_storage); if (!th) return -1; - len += sizeof(*th); + len = iov_tail_size(&data); off = th->doff * 4UL; + if (off < sizeof(*th) || off > len) return -1; @@ -1651,9 +1894,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, } len -= off; - data = packet_get(p, i, off, len, NULL); - if (!data) - continue; + iov_drop_header(&data, off); seq = ntohl(th->seq); if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) { @@ -1664,8 +1905,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp_send_flag(c, conn, ACK); tcp_timer_ctl(c, conn); - if (p->count == 1) + if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE, + &((int){ 1 }), sizeof(int))) + flow_trace(conn, "failed to set SO_KEEPALIVE"); + + if (p->count == 1) { + tcp_tap_window_update(c, conn, + ntohs(th->window)); return 1; + } continue; } @@ -1682,12 +1930,21 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, ack_seq == max_ack_seq && ntohs(th->window) == max_ack_seq_wnd; + /* See tcp_tap_window_update() for details. 
On + * top of that, we also need to check here if a + * zero-window update is contained in a batch of + * packets that includes a non-zero window as + * well. + */ + if (!ntohs(th->window)) + tcp_rewind_seq(c, conn); + max_ack_seq_wnd = ntohs(th->window); max_ack_seq = ack_seq; } } - if (th->fin) + if (th->fin && seq == seq_from_tap) fin = 1; if (!len) @@ -1725,10 +1982,14 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, continue; } - tcp_iov[iov_i].iov_base = data + seq_offset; - tcp_iov[iov_i].iov_len = len - seq_offset; - seq_from_tap += tcp_iov[iov_i].iov_len; - iov_i++; + iov_drop_header(&data, seq_offset); + size = len - seq_offset; + count = iov_tail_clone(&tcp_iov[iov_i], UIO_MAXIOV - iov_i, + &data); + if (count < 0) + break; + seq_from_tap += size; + iov_i += count; if (keep == i) keep = -1; @@ -1741,17 +2002,16 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, if (ack && !tcp_sock_consume(conn, max_ack_seq)) tcp_update_seqack_from_tap(c, conn, max_ack_seq); - tcp_tap_window_update(conn, max_ack_seq_wnd); + tcp_tap_window_update(c, conn, max_ack_seq_wnd); if (retr) { flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", - max_ack_seq, conn->seq_to_tap); - conn->seq_to_tap = max_ack_seq; - if (tcp_set_peek_offset(conn->sock, 0)) { - tcp_rst(c, conn); + conn->seq_ack_from_tap, conn->seq_to_tap); + + if (tcp_rewind_seq(c, conn)) return -1; - } + tcp_data_from_sock(c, conn); } @@ -1775,23 +2035,20 @@ eintr: goto eintr; if (errno == EAGAIN || errno == EWOULDBLOCK) { - tcp_send_flag(c, conn, ACK_IF_NEEDED); + tcp_send_flag(c, conn, ACK | DUP_ACK); return p->count - idx; } return -1; } - if (n < (int)(seq_from_tap - conn->seq_from_tap)) { + if (n < (int)(seq_from_tap - conn->seq_from_tap)) partial_send = 1; - conn->seq_from_tap += n; - tcp_send_flag(c, conn, ACK_IF_NEEDED); - } else { - conn->seq_from_tap += n; - } + + conn->seq_from_tap += n; out: - if (keep != -1) { + if (keep != -1 || partial_send) { /* We use an 8-bit approximation here: the associated risk is * that we skip a duplicate ACK on 8-bit sequence number * collision. Fast retransmit is a SHOULD in RFC 5681, 3.2. 
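As an aside, a minimal sketch of the 8-bit approximation mentioned in that comment; the struct and field names here are illustrative only, not necessarily the flow table's actual layout:

#include <stdint.h>
#include <stdbool.h>

/* Sketch: one-byte duplicate-ACK tracker per connection */
struct dup_ack_state {
	uint8_t seq_dup_ack_approx;	/* low 8 bits of last DUP_ACK position */
};

/* Return true if a duplicate ACK should be sent for this position */
static bool dup_ack_needed(struct dup_ack_state *s, uint32_t seq_from_tap)
{
	if (s->seq_dup_ack_approx == (uint8_t)(seq_from_tap & 0xff))
		return false;	/* already signalled, or an 8-bit collision:
				 * skipping one duplicate ACK is acceptable,
				 * as the retransmission timer still covers
				 * the loss */

	s->seq_dup_ack_approx = seq_from_tap & 0xff;
	return true;		/* caller would send ACK | DUP_ACK */
}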
@@ -1831,7 +2088,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, const struct tcphdr *th, const char *opts, size_t optlen) { - tcp_tap_window_update(conn, ntohs(th->window)); + tcp_tap_window_update(c, conn, ntohs(th->window)); tcp_get_tap_ws(conn, opts, optlen); /* First value is not scaled */ @@ -1845,7 +2102,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn->sock, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); return; } @@ -1859,12 +2116,84 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, } /** + * tcp_rst_no_conn() - Send RST in response to a packet with no connection + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @saddr: Source address of the packet we're responding to + * @daddr: Destination address of the packet we're responding to + * @flow_lbl: IPv6 flow label (ignored for IPv4) + * @th: TCP header of the packet we're responding to + * @l4len: Packet length, including TCP header + */ +static void tcp_rst_no_conn(const struct ctx *c, int af, + const void *saddr, const void *daddr, + uint32_t flow_lbl, + const struct tcphdr *th, size_t l4len) +{ + struct iov_tail payload = IOV_TAIL(NULL, 0, 0); + struct tcphdr *rsth; + char buf[USHRT_MAX]; + uint32_t psum = 0; + size_t rst_l2len; + + /* Don't respond to RSTs without a connection */ + if (th->rst) + return; + + if (af == AF_INET) { + struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, + ETH_P_IP); + const struct in_addr *rst_src = daddr; + const struct in_addr *rst_dst = saddr; + + rsth = tap_push_ip4h(ip4h, *rst_src, *rst_dst, + sizeof(*rsth), IPPROTO_TCP); + psum = proto_ipv4_header_psum(sizeof(*rsth), IPPROTO_TCP, + *rst_src, *rst_dst); + + } else { + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, + ETH_P_IPV6); + const struct in6_addr *rst_src = daddr; + const struct in6_addr *rst_dst = saddr; + + rsth = tap_push_ip6h(ip6h, rst_src, rst_dst, + sizeof(*rsth), IPPROTO_TCP, flow_lbl); + psum = proto_ipv6_header_psum(sizeof(*rsth), IPPROTO_TCP, + rst_src, rst_dst); + } + + memset(rsth, 0, sizeof(*rsth)); + + rsth->source = th->dest; + rsth->dest = th->source; + rsth->rst = 1; + rsth->doff = sizeof(*rsth) / 4UL; + + /* Sequence matching logic from RFC 9293 section 3.10.7.1 */ + if (th->ack) { + rsth->seq = th->ack_seq; + } else { + size_t dlen = l4len - th->doff * 4UL; + uint32_t ack = ntohl(th->seq) + dlen; + + rsth->ack_seq = htonl(ack); + rsth->ack = 1; + } + + tcp_update_csum(psum, rsth, &payload); + rst_l2len = ((char *)rsth - buf) + sizeof(*rsth); + tap_send_single(c, buf, rst_l2len); +} + +/** * tcp_tap_handler() - Handle packets from tap and state transitions * @c: Execution context * @pif: pif on which the packet is arriving * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address + * @flow_lbl: IPv6 flow label (ignored for IPv4) * @p: Pool of TCP packets, with TCP headers * @idx: Index of first packet in pool to process * @now: Current timestamp @@ -1872,12 +2201,15 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, * Return: count of consumed packets */ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - const void *saddr, const void *daddr, + const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now) { struct tcp_tap_conn *conn; + struct tcphdr th_storage; const struct tcphdr *th; - 
size_t optlen, len; + char optsc[OPTLEN_MAX]; + struct iov_tail data; + size_t optlen, l4len; const char *opts; union flow *flow; flow_sidx_t sidx; @@ -1886,15 +2218,19 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, (void)pif; - th = packet_get(p, idx, 0, sizeof(*th), &len); + if (!packet_get(p, idx, &data)) + return 1; + + l4len = iov_tail_size(&data); + + th = IOV_REMOVE_HEADER(&data, th_storage); if (!th) return 1; - len += sizeof(*th); optlen = th->doff * 4UL - sizeof(*th); /* Static checkers might fail to see this: */ - optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); - opts = packet_get(p, idx, sizeof(*th), optlen, NULL); + optlen = MIN(optlen, OPTLEN_MAX); + opts = (char *)iov_remove_header_(&data, &optsc[0], optlen, 1); sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr, ntohs(th->source), ntohs(th->dest)); @@ -1905,6 +2241,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (opts && th->syn && !th->ack) tcp_conn_from_tap(c, af, saddr, daddr, th, opts, optlen, now); + else + tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, l4len); return 1; } @@ -1912,7 +2250,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, ASSERT(pif_at_sidx(sidx) == PIF_TAP); conn = &flow->tcp; - flow_trace(conn, "packet length %zu from tap", len); + flow_trace(conn, "packet length %zu from tap", l4len); if (th->rst) { conn_event(c, conn, CLOSED); @@ -1941,7 +2279,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn->sock, 0)) + if (tcp_set_peek_offset(conn, 0)) goto reset; if (th->fin) { @@ -1957,9 +2295,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (!th->ack) goto reset; - tcp_tap_window_update(conn, ntohs(th->window)); - - tcp_data_from_sock(c, conn); + if (tcp_tap_window_update(c, conn, ntohs(th->window))) + tcp_data_from_sock(c, conn); if (p->count - idx == 1) return 1; @@ -1967,11 +2304,44 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { - tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + size_t dlen; + bool retr; - if (conn->events & SOCK_FIN_RCVD && - conn->seq_ack_from_tap == conn->seq_to_tap) - conn_event(c, conn, CLOSED); + if ((dlen = tcp_packet_data_len(th, l4len))) { + flow_dbg(conn, "data segment in CLOSE-WAIT (%zu B)", + dlen); + } + + retr = th->ack && !th->fin && + ntohl(th->ack_seq) == conn->seq_ack_from_tap && + ntohs(th->window) == conn->wnd_from_tap; + + /* On socket flush failure, pretend there was no ACK, try again + * later + */ + if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq))) + tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + + if (retr) { + flow_trace(conn, + "fast re-transmit, ACK: %u, previous sequence: %u", + ntohl(th->ack_seq), conn->seq_to_tap); + + if (tcp_rewind_seq(c, conn)) + return -1; + } + + if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr) + tcp_data_from_sock(c, conn); + + if (conn->seq_ack_from_tap == conn->seq_to_tap) { + if (th->ack && conn->events & TAP_FIN_SENT) + conn_event(c, conn, TAP_FIN_ACKED); + + if (conn->events & SOCK_FIN_RCVD && + conn->events & TAP_FIN_ACKED) + conn_event(c, conn, CLOSED); + } return 1; } @@ -1987,10 +2357,27 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, ack_due = 1; if ((conn->events & TAP_FIN_RCVD) && 
!(conn->events & SOCK_FIN_SENT)) { + socklen_t sl; + struct tcp_info tinfo; + shutdown(conn->sock, SHUT_WR); conn_event(c, conn, SOCK_FIN_SENT); tcp_send_flag(c, conn, ACK); ack_due = 0; + + /* If we received a FIN, but the socket is in TCP_ESTABLISHED + * state, it must be a migrated socket. The kernel saw the FIN + * on the source socket, but not on the target socket. + * + * Approximate the effect of that FIN: as we're sending a FIN + * out ourselves, the socket is now in a state equivalent to + * LAST_ACK. Now that we sent the FIN out, close it with a RST. + */ + sl = sizeof(tinfo); + getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl); + if (tinfo.tcpi_state == TCP_ESTABLISHED && + conn->events & SOCK_FIN_RCVD) + goto reset; } if (ack_due) @@ -2073,9 +2460,10 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { - const struct flowside *ini; + struct tcp_tap_conn *conn; union sockaddr_inany sa; socklen_t sl = sizeof(sa); + struct flowside *ini; union flow *flow; int s; @@ -2088,14 +2476,21 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; - tcp_sock_set_bufsize(c, s); + conn = (struct tcp_tap_conn *)flow; + conn->listening_sock = ref.fd; + tcp_sock_set_nodelay(s); - /* FIXME: When listening port has a specific bound address, record that - * as our address + /* FIXME: If useful: when the listening port has a specific bound + * address, record that as our address, as implemented for vhost-user + * mode only, below. */ ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, - ref.tcp_listen.port); + NULL, ref.tcp_listen.port); + + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0) + err_perror("Can't get local address for socket %i", s); if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { char sastr[SOCKADDR_STRLEN]; @@ -2136,7 +2531,9 @@ cancel: * @c: Execution context * @ref: epoll reference of timer (not connection) * - * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 + * #syscalls timerfd_gettime|timerfd_gettime64 + * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64 + * #syscalls arm:timerfd_settime64 i686:timerfd_settime64 */ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) { @@ -2151,7 +2548,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * and we just set the timer to a new point in the future: discard it. 
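 *
 * (timerfd_gettime() reports the time remaining until the next
 * expiration: a non-zero it_value below therefore means the timer was
 * re-armed after this event was queued, and the expiration is stale.)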
*/ if (timerfd_gettime(conn->timer, &check_armed)) - flow_err(conn, "failed to read timer: %s", strerror_(errno)); + flow_perror(conn, "failed to read timer"); if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) return; @@ -2161,24 +2558,37 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) tcp_timer_ctl(c, conn); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { - flow_dbg(conn, "handshake timeout"); - tcp_rst(c, conn); + int max; + max = c->tcp.syn_retries + c->tcp.syn_linear_timeouts; + max = MIN(TCP_MAX_RETRIES, max); + if (conn->retries >= max) { + flow_dbg(conn, "handshake timeout"); + tcp_rst(c, conn); + } else { + flow_trace(conn, "SYN timeout, retry"); + tcp_send_flag(c, conn, SYN); + conn->retries++; + conn_flag(c, conn, SYN_RETRIED); + tcp_timer_ctl(c, conn); + } } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { flow_dbg(conn, "FIN timeout"); tcp_rst(c, conn); - } else if (conn->retrans == TCP_MAX_RETRANS) { + } else if (conn->retries == TCP_MAX_RETRIES) { flow_dbg(conn, "retransmissions count exceeded"); tcp_rst(c, conn); } else { flow_dbg(conn, "ACK timeout, retry"); - conn->retrans++; - conn->seq_to_tap = conn->seq_ack_from_tap; - if (tcp_set_peek_offset(conn->sock, 0)) { - tcp_rst(c, conn); - } else { - tcp_data_from_sock(c, conn); - tcp_timer_ctl(c, conn); - } + + if (!conn->wnd_from_tap) + conn->wnd_from_tap = 1; /* Zero-window probe */ + + conn->retries++; + if (tcp_rewind_seq(c, conn)) + return; + + tcp_data_from_sock(c, conn); + tcp_timer_ctl(c, conn); } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -2191,8 +2601,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. */ if (timerfd_settime(conn->timer, 0, &new, &old)) - flow_err(conn, "failed to set timer: %s", - strerror_(errno)); + flow_perror(conn, "failed to set timer"); if (old.it_value.tv_sec == ACT_TIMEOUT) { flow_dbg(conn, "activity timeout"); @@ -2223,7 +2632,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, return; } - if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) { + if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) { conn_event(c, conn, CLOSED); return; } @@ -2264,136 +2673,88 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, } /** - * tcp_sock_init_one() - Initialise listening socket for address and port + * tcp_listen() - Create listening socket * @c: Execution context - * @addr: Pointer to address for binding, NULL for dual stack any - * @ifname: Name of interface to bind to, NULL if not configured + * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE) + * @addr: Pointer to address for binding, NULL for any + * @ifname: Name of interface to bind to, NULL for any * @port: Port, host order * - * Return: fd for the new listening socket, negative error code on failure + * Return: 0 on success, negative error code on failure */ -static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr, - const char *ifname, in_port_t port) +int tcp_listen(const struct ctx *c, uint8_t pif, + const union inany_addr *addr, const char *ifname, in_port_t port) { union tcp_listen_epoll_ref tref = { .port = port, - .pif = PIF_HOST, + .pif = pif, }; + const struct fwd_ports *fwd; + int (*socks)[IP_VERSIONS]; int s; - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr, - ifname, port, tref.u32); - - if (c->tcp.fwd_in.mode == FWD_AUTO) { - if (!addr || inany_v4(addr)) - 
tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s; - if (!addr || !inany_v4(addr)) - tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s; - } - - if (s < 0) - return s; - - return s; -} - -/** - * tcp_sock_init() - Create listening sockets for a given host ("inbound") port - * @c: Execution context - * @addr: Pointer to address for binding, NULL if not configured - * @ifname: Name of interface to bind to, NULL if not configured - * @port: Port, host order - * - * Return: 0 on (partial) success, negative error code on (complete) failure - */ -int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, - const char *ifname, in_port_t port) -{ - int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; - ASSERT(!c->no_tcp); - if (!addr && c->ifi4 && c->ifi6) - /* Attempt to get a dual stack socket */ - if (tcp_sock_init_one(c, NULL, ifname, port) >= 0) + if (!c->ifi4) { + if (!addr) + /* Restrict to v6 only */ + addr = &inany_any6; + else if (inany_v4(addr)) + /* Nothing to do */ return 0; + } + if (!c->ifi6) { + if (!addr) + /* Restrict to v4 only */ + addr = &inany_any4; + else if (!inany_v4(addr)) + /* Nothing to do */ + return 0; + } - /* Otherwise create a socket per IP version */ - if ((!addr || inany_v4(addr)) && c->ifi4) - r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4, - ifname, port); - - if ((!addr || !inany_v4(addr)) && c->ifi6) - r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6, - ifname, port); - - if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) - return 0; - - return r4 < 0 ? r4 : r6; -} - -/** - * tcp_ns_sock_init4() - Init socket to listen for outbound IPv4 connections - * @c: Execution context - * @port: Port, host order - */ -static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) -{ - union tcp_listen_epoll_ref tref = { - .port = port, - .pif = PIF_SPLICE, - }; - int s; - - ASSERT(c->mode == MODE_PASTA); - - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4, - NULL, port, tref.u32); - if (s < 0) - s = -1; + if (pif == PIF_HOST) { + fwd = &c->tcp.fwd_in; + socks = tcp_sock_init_ext; + } else { + ASSERT(pif == PIF_SPLICE); + fwd = &c->tcp.fwd_out; + socks = tcp_sock_ns; + } - if (c->tcp.fwd_out.mode == FWD_AUTO) - tcp_sock_ns[port][V4] = s; -} + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname, + port, tref.u32); -/** - * tcp_ns_sock_init6() - Init socket to listen for outbound IPv6 connections - * @c: Execution context - * @port: Port, host order - */ -static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) -{ - union tcp_listen_epoll_ref tref = { - .port = port, - .pif = PIF_SPLICE, - }; - int s; - - ASSERT(c->mode == MODE_PASTA); + if (fwd->mode == FWD_AUTO) { + if (!addr || inany_v4(addr)) + socks[port][V4] = s < 0 ? -1 : s; + if (!addr || !inany_v4(addr)) + socks[port][V6] = s < 0 ? 
-1 : s; + } - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6, - NULL, port, tref.u32); if (s < 0) - s = -1; + return s; - if (c->tcp.fwd_out.mode == FWD_AUTO) - tcp_sock_ns[port][V6] = s; + return 0; } /** - * tcp_ns_sock_init() - Init socket to listen for spliced outbound connections + * tcp_ns_listen() - Init socket to listen for spliced outbound connections * @c: Execution context * @port: Port, host order */ -void tcp_ns_sock_init(const struct ctx *c, in_port_t port) +static void tcp_ns_listen(const struct ctx *c, in_port_t port) { ASSERT(!c->no_tcp); + if (!c->no_bindtodevice) { + tcp_listen(c, PIF_SPLICE, NULL, "lo", port); + return; + } + if (c->ifi4) - tcp_ns_sock_init4(c, port); + tcp_listen(c, PIF_SPLICE, &inany_loopback4, NULL, port); if (c->ifi6) - tcp_ns_sock_init6(c, port); + tcp_listen(c, PIF_SPLICE, &inany_loopback6, NULL, port); } /** @@ -2414,7 +2775,7 @@ static int tcp_ns_socks_init(void *arg) if (!bitmap_isset(c->tcp.fwd_out.map, port)) continue; - tcp_ns_sock_init(c, port); + tcp_ns_listen(c, port); } return 0; @@ -2422,13 +2783,12 @@ static int tcp_ns_socks_init(void *arg) /** * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets - * @c: Execution context * @pool: Pool of sockets to refill * @af: Address family to use * * Return: 0 on success, negative error code if there was at least one error */ -int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) +int tcp_sock_refill_pool(int pool[], sa_family_t af) { int i; @@ -2438,7 +2798,7 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) if (pool[i] >= 0) continue; - if ((fd = tcp_conn_new_sock(c, af)) < 0) + if ((fd = tcp_conn_new_sock(af)) < 0) return fd; pool[i] = fd; @@ -2454,13 +2814,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) static void tcp_sock_refill_init(const struct ctx *c) { if (c->ifi4) { - int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET); + int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 host socket pool: %s", strerror_(-rc)); } if (c->ifi6) { - int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6); + int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 host socket pool: %s", strerror_(-rc)); @@ -2493,7 +2853,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) /** * tcp_probe_tcp_info() - Check what data TCP_INFO reports * - * Return: Number of bytes returned by TCP_INFO getsockopt() + * Return: number of bytes returned by TCP_INFO getsockopt() */ static socklen_t tcp_probe_tcp_info(void) { @@ -2519,6 +2879,31 @@ static socklen_t tcp_probe_tcp_info(void) } /** + * tcp_get_rto_params() - Get host kernel RTO parameters + * @c: Execution context + */ +static void tcp_get_rto_params(struct ctx *c) +{ + intmax_t v; + + v = read_file_integer(SYN_RETRIES, SYN_RETRIES_DEFAULT); + c->tcp.syn_retries = MIN(v, MAX_SYNCNT); + + v = read_file_integer(SYN_LINEAR_TIMEOUTS, SYN_LINEAR_TIMEOUTS_DEFAULT); + c->tcp.syn_linear_timeouts = MIN(v, MAX_SYNCNT); + + v = read_file_integer(RTO_MAX_MS, (intmax_t)(RTO_MAX_DEFAULT * 1000)); + c->tcp.rto_max = MIN(DIV_ROUND_UP(v, 1000), INT_MAX); + + debug("Using TCP RTO parameters, syn_retries: %"PRIu8 + ", syn_linear_timeouts: %"PRIu8 + ", rto_max: %d", + c->tcp.syn_retries, + c->tcp.syn_linear_timeouts, + c->tcp.rto_max); +} + +/** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context * @@ -2528,6 
@@ -2528,6 +2913,8 @@ int tcp_init(struct ctx *c)
 {
 	ASSERT(!c->no_tcp);
 
+	tcp_get_rto_params(c);
+
 	tcp_sock_iov_init(c);
 
 	memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
@@ -2550,7 +2937,7 @@ int tcp_init(struct ctx *c)
 	tcp_info_size = tcp_probe_tcp_info();
 
 #define dbg_tcpi(f_)	debug("TCP_INFO tcpi_%s field%s supported",	\
-			      STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+			      STRINGIFY(f_), tcp_info_cap(f_) ? "" : " not")
 	dbg_tcpi(snd_wnd);
 	dbg_tcpi(bytes_acked);
 	dbg_tcpi(min_rtt);
@@ -2569,7 +2956,6 @@ int tcp_init(struct ctx *c)
 static void tcp_port_rebind(struct ctx *c, bool outbound)
 {
 	const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map;
-	const uint8_t *rmap = outbound ? c->tcp.fwd_in.map : c->tcp.fwd_out.map;
 	int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext;
 	unsigned port;
@@ -2588,16 +2974,12 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
 			continue;
 		}
 
-		/* Don't loop back our own ports */
-		if (bitmap_isset(rmap, port))
-			continue;
-
 		if ((c->ifi4 && socks[port][V4] == -1) ||
 		    (c->ifi6 && socks[port][V6] == -1)) {
 			if (outbound)
-				tcp_ns_sock_init(c, port);
+				tcp_ns_listen(c, port);
 			else
-				tcp_sock_init(c, NULL, NULL, port);
+				tcp_listen(c, PIF_HOST, NULL, NULL, port);
 		}
 	}
 }
@@ -2621,27 +3003,1080 @@ static int tcp_port_rebind_outbound(void *arg)
 }
 
 /**
+ * tcp_port_rebind_all() - Rebind ports to match forward maps (in host & ns)
+ * @c:		Execution context
+ */
+void tcp_port_rebind_all(struct ctx *c)
+{
+	ASSERT(c->mode == MODE_PASTA && !c->no_tcp);
+
+	if (c->tcp.fwd_out.mode == FWD_AUTO)
+		NS_CALL(tcp_port_rebind_outbound, c);
+
+	if (c->tcp.fwd_in.mode == FWD_AUTO)
+		tcp_port_rebind(c, false);
+}
+
+/**
- * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill
+ * tcp_timer() - Periodic tasks: refill socket pools
  * @c:		Execution context
  * @now:	Current timestamp
  */
-void tcp_timer(struct ctx *c, const struct timespec *now)
+void tcp_timer(const struct ctx *c, const struct timespec *now)
 {
 	(void)now;
 
-	if (c->mode == MODE_PASTA) {
-		if (c->tcp.fwd_out.mode == FWD_AUTO) {
-			fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
-			NS_CALL(tcp_port_rebind_outbound, c);
-		}
-
-		if (c->tcp.fwd_in.mode == FWD_AUTO) {
-			fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
-			tcp_port_rebind(c, false);
-		}
-	}
-
 	tcp_sock_refill_init(c);
 	if (c->mode == MODE_PASTA)
 		tcp_splice_refill(c);
 }
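The flow migration code added below drives the kernel's TCP_REPAIR facility through repair_set() and the repair helper (see repair.h in the includes). At the socket level, entering and leaving repair mode reduces to a setsockopt() toggle, roughly as in this sketch (requires CAP_NET_ADMIN; an illustration, not the project's brokered implementation):

#include <errno.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Toggle repair mode on a TCP socket.
 * @on: TCP_REPAIR_ON or TCP_REPAIR_OFF
 */
static int set_repair(int s, int on)
{
	if (setsockopt(s, SOL_TCP, TCP_REPAIR, &on, sizeof(on)))
		return -errno;

	return 0;
}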
 
+/**
+ * tcp_flow_is_established() - Was the connection established? Includes closing
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: true if the connection was established, false otherwise
+ */
+bool tcp_flow_is_established(const struct tcp_tap_conn *conn)
+{
+	return conn->events & ESTABLISHED;
+}
+
+/**
+ * tcp_flow_repair_on() - Enable repair mode for a single TCP flow
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
+
+	if (conn->sock < 0)
+		return 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON)))
+		err("Failed to set TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_off() - Clear repair mode for a single TCP flow
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
+
+	if (conn->sock < 0)
+		return 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+		err("Failed to clear TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn,
+			       struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_info tinfo;
+	socklen_t sl;
+
+	sl = sizeof(tinfo);
+	if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Querying TCP_INFO");
+		return rc;
+	}
+
+	t->snd_ws = tinfo.tcpi_snd_wscale;
+	t->rcv_ws = tinfo.tcpi_rcv_wscale;
+	t->tcpi_state = tinfo.tcpi_state;
+	t->tcpi_options = tinfo.tcpi_options;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
+{
+	int val;
+	socklen_t sl = sizeof(val);
+
+	if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Getting MSS");
+		return rc;
+	}
+
+	t->mss = (uint32_t)val;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data (tcpi_options must be populated)
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn,
+				   struct tcp_tap_transfer_ext *t)
+{
+	int val = 0;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		socklen_t sl = sizeof(val);
+
+		if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) {
+			int rc = -errno;
+			flow_perror(conn, "Getting RFC 7323 timestamp");
+			return rc;
+		}
+	}
+
+	t->timestamp = (uint32_t)val;
+	return 0;
+}
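For illustration, the TCP_INFO query pattern used by tcp_flow_dump_tinfo() above, reduced to a standalone sketch (hypothetical helper; only the window scaling fields are kept here):

#include <errno.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int dump_window_scaling(int s, unsigned *snd_ws, unsigned *rcv_ws)
{
	struct tcp_info ti;
	socklen_t sl = sizeof(ti);

	if (getsockopt(s, SOL_TCP, TCP_INFO, &ti, &sl))
		return -errno;

	/* RFC 7323 window scaling factors negotiated on this connection */
	*snd_ws = ti.tcpi_snd_wscale;
	*rcv_ws = ti.tcpi_rcv_wscale;
	return 0;
}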
+
+/**
+ * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn,
+				     const struct tcp_tap_transfer_ext *t)
+{
+	int val = (int)t->timestamp;
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+		if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP,
+			       &val, sizeof(val))) {
+			int rc = -errno;
+			flow_perror(conn, "Setting RFC 7323 timestamp");
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn,
+			     struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_repair_window wnd;
+	socklen_t sl = sizeof(wnd);
+
+	if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Getting window repair data");
+		return rc;
+	}
+
+	t->snd_wl1 = wnd.snd_wl1;
+	t->snd_wnd = wnd.snd_wnd;
+	t->max_window = wnd.max_window;
+	t->rcv_wnd = wnd.rcv_wnd;
+	t->rcv_wup = wnd.rcv_wup;
+
+	/* If we received a FIN, we also need to adjust window parameters.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK) {
+		t->rcv_wup--;
+		t->rcv_wnd++;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_wnd() - Restore window parameters from extended data
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_repair_window wnd;
+
+	wnd.snd_wl1 = t->snd_wl1;
+	wnd.snd_wnd = t->snd_wnd;
+	wnd.max_window = t->max_window;
+	wnd.rcv_wnd = t->rcv_wnd;
+	wnd.rcv_wup = t->rcv_wup;
+
+	if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW,
+		       &wnd, sizeof(wnd))) {
+		int rc = -errno;
+		flow_perror(conn, "Setting window data");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_select_queue() - Select queue (receive or send) for next operation
+ * @conn:	Connection to select queue for
+ * @queue:	TCP_RECV_QUEUE or TCP_SEND_QUEUE
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue)
+{
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE,
+		       &queue, sizeof(queue))) {
+		int rc = -errno;
+		flow_perror(conn, "Selecting repair queue");
+		return rc;
+	}
+
+	return 0;
+}
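TCP_REPAIR_QUEUE makes the queue selection sticky: every subsequent TCP_QUEUE_SEQ get/set and queue read/write applies to the queue last selected. A condensed sketch of the select-then-dump pattern used by the functions below (hypothetical helper; socket assumed to be in repair mode):

#include <stdint.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int dump_seq_of(int s, int queue, uint32_t *seq)
{
	socklen_t sl = sizeof(*seq);

	/* queue: TCP_SEND_QUEUE or TCP_RECV_QUEUE */
	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)))
		return -1;

	return getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, seq, &sl);
}

/* Usage: dump_seq_of(s, TCP_SEND_QUEUE, &seq_snd); */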
+
+/**
+ * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
+ * @conn:	Connection to dump queue for
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
+{
+	int s = conn->sock;
+	ssize_t rc;
+
+	if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Getting send queue size");
+		return rc;
+	}
+
+	if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Getting not sent count");
+		return rc;
+	}
+
+	/* If we sent a FIN, SIOCOUTQ and SIOCOUTQNSD are one greater than the
+	 * actual pending queue length, because they are based on the sequence
+	 * numbers, not directly on the buffer contents.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->tcpi_state == TCP_FIN_WAIT1 || t->tcpi_state == TCP_FIN_WAIT2 ||
+	    t->tcpi_state == TCP_LAST_ACK || t->tcpi_state == TCP_CLOSING) {
+		if (t->sndq)
+			t->sndq--;
+		if (t->notsent)
+			t->notsent--;
+	}
+
+	if (t->notsent > t->sndq) {
+		flow_err(conn,
+			 "Invalid notsent count socket %i, send: %u, not sent: %u",
+			 s, t->sndq, t->notsent);
+		return -EINVAL;
+	}
+
+	if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
+		flow_err(conn,
+			 "Send queue too large to migrate socket %i: %u bytes",
+			 s, t->sndq);
+		return -ENOBUFS;
+	}
+
+	rc = recv(s, tcp_migrate_snd_queue,
+		  MIN(t->sndq, TCP_MIGRATE_SND_QUEUE_MAX), MSG_PEEK);
+	if (rc < 0) {
+		if (errno == EAGAIN) {	/* EAGAIN means empty */
+			rc = 0;
+		} else {
+			rc = -errno;
+			flow_perror(conn, "Can't read send queue");
+			return rc;
+		}
+	}
+
+	if ((uint32_t)rc < t->sndq) {
+		flow_err(conn, "Short read migrating send queue");
+		return -ENXIO;
+	}
+
+	t->notsent = MIN(t->notsent, t->sndq);
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
+ * @conn:	Connection to repair queue for
+ * @len:	Length of data to be restored
+ * @buf:	Buffer with content of pending data queue
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn,
+				 size_t len, uint8_t *buf)
+{
+	size_t chunk = len;
+	uint8_t *p = buf;
+
+	if (conn->sock < 0) {
+		flow_err(conn, "Invalid socket descriptor for repair queue");
+		return -EBADF;
+	}
+
+	while (len > 0) {
+		ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0);
+
+		if (rc < 0) {
+			if ((errno == ENOBUFS || errno == ENOMEM) &&
+			    chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) {
+				chunk /= 2;
+				continue;
+			}
+
+			rc = -errno;
+			flow_perror(conn, "Can't write queue");
+			return rc;
+		}
+
+		len -= rc;
+		p += rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
+ * @conn:	Pointer to the TCP connection structure
+ * @v:		Sequence value, set on return
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v)
+{
+	socklen_t sl = sizeof(*v);
+
+	if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Dumping sequence");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_seq() - Restore sequence for pre-selected queue
+ * @conn:	Connection to repair sequences for
+ * @v:		Sequence value to be set
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn,
+			       const uint32_t *v)
+{
+	if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+		int rc = -errno;
+		flow_perror(conn, "Setting sequence");
+		return rc;
+	}
+
+	return 0;
+}
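The queue-depth ioctl()s used by the dumps above and below also work outside repair mode; a minimal sketch (hypothetical helper):

#include <sys/ioctl.h>
#include <linux/sockios.h>

static int queue_depths(int s, int *inq, int *outq, int *notsent)
{
	if (ioctl(s, SIOCINQ, inq) ||		/* bytes readable */
	    ioctl(s, SIOCOUTQ, outq) ||		/* queued, not yet acknowledged */
	    ioctl(s, SIOCOUTQNSD, notsent))	/* queued, not yet sent */
		return -1;

	return 0;
}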
+
+/**
+ * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn,
+				  struct tcp_tap_transfer_ext *t)
+{
+	int s = conn->sock;
+	ssize_t rc;
+
+	if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Getting receive queue size");
+		return rc;
+	}
+
+	/* If we received a FIN, SIOCINQ is one greater than the actual number
+	 * of bytes on the queue, because it's based on the sequence number
+	 * rather than directly on the buffer contents.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 */
+	if (t->rcvq &&
+	    (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK))
+		t->rcvq--;
+
+	if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		flow_err(conn,
+			 "Receive queue too large to migrate socket: %u bytes",
+			 t->rcvq);
+		return -ENOBUFS;
+	}
+
+	rc = recv(s, tcp_migrate_rcv_queue, t->rcvq, MSG_PEEK);
+	if (rc < 0) {
+		if (errno == EAGAIN) {	/* EAGAIN means empty */
+			rc = 0;
+		} else {
+			rc = -errno;
+			flow_perror(conn, "Can't read receive queue");
+			return rc;
+		}
+	}
+
+	if ((uint32_t)rc < t->rcvq) {
+		flow_err(conn, "Short read migrating receive queue");
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
+ * @conn:	Pointer to the TCP connection structure
+ * @t:		Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
+			       const struct tcp_tap_transfer_ext *t)
+{
+	const struct tcp_repair_opt opts[] = {
+		{ TCPOPT_WINDOW,	 t->snd_ws + (t->rcv_ws << 16) },
+		{ TCPOPT_MAXSEG,	 t->mss },
+		{ TCPOPT_SACK_PERMITTED, 0 },
+		{ TCPOPT_TIMESTAMP,	 0 },
+	};
+	socklen_t sl;
+
+	sl = sizeof(opts[0]) * (2 +
+				!!(t->tcpi_options & TCPI_OPT_SACK) +
+				!!(t->tcpi_options & TCPI_OPT_TIMESTAMPS));
+
+	if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+		int rc = -errno;
+		flow_perror(conn, "Setting repair options");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening
+ * @fd:		Descriptor for state migration
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
+{
+	struct tcp_tap_transfer t = {
+		.retries		= conn->retries,
+		.ws_from_tap		= conn->ws_from_tap,
+		.ws_to_tap		= conn->ws_to_tap,
+		.events			= conn->events,
+
+		.tap_mss		= htonl(MSS_GET(conn)),
+
+		.sndbuf			= htonl(conn->sndbuf),
+
+		.flags			= conn->flags,
+		.seq_dup_ack_approx	= conn->seq_dup_ack_approx,
+
+		.wnd_from_tap		= htons(conn->wnd_from_tap),
+		.wnd_to_tap		= htons(conn->wnd_to_tap),
+
+		.seq_to_tap		= htonl(conn->seq_to_tap),
+		.seq_ack_from_tap	= htonl(conn->seq_ack_from_tap),
+		.seq_from_tap		= htonl(conn->seq_from_tap),
+		.seq_ack_to_tap		= htonl(conn->seq_ack_to_tap),
+		.seq_init_from_tap	= htonl(conn->seq_init_from_tap),
+	};
+
+	memcpy(&t.pif, conn->f.pif, sizeof(t.pif));
+	memcpy(&t.side, conn->f.side, sizeof(t.side));
+
+	if (write_all_buf(fd, &t, sizeof(t))) {
+		int rc = -errno;
+		err_perror("Can't write migration data, socket %i", conn->sock);
+		return rc;
+	}
+
+	if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD))
+		close(conn->listening_sock);
+
+	return 0;
+}
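A worked example of the FIN off-by-one compensated for in tcp_flow_dump_sndqueue() and tcp_flow_dump_rcvqueue() above, following the premise stated in their comments that the counters are sequence-number based and a FIN occupies one sequence number:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t data_len = 5;			/* payload bytes pending */
	uint32_t counted = data_len + 1;	/* what SIOCINQ/SIOCOUTQ report
						 * once a FIN is in the queue */

	assert(counted - 1 == data_len);	/* the t->rcvq--/t->sndq-- above */
	return 0;
}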
+
+/**
+ * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @c:		Execution context
+ * @fd:		Descriptor for state migration
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
+ */
+int tcp_flow_migrate_source_ext(const struct ctx *c,
+				int fd, const struct tcp_tap_conn *conn)
+{
+	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+	struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
+	int s = conn->sock;
+	int rc;
+
+	/* Disable SO_PEEK_OFF, it will make accessing the queues in repair
+	 * mode weird.
+	 */
+	if (tcp_set_peek_offset(conn, -1)) {
+		rc = -errno;
+		goto fail;
+	}
+
+	if ((rc = tcp_flow_dump_tinfo(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_mss(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_timestamp(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_wnd(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_sndqueue(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd)))
+		goto fail;
+
+	if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_rcvqueue(conn, t)))
+		goto fail;
+
+	if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
+		goto fail;
+
+	if (c->migrate_no_linger)
+		close(s);
+	else
+		epoll_del(flow_epollfd(&conn->f), s);
+
+	/* Adjustments unrelated to FIN segments: sequence numbers we dumped
+	 * are based on the end of the queues.
+	 */
+	t->seq_rcv -= t->rcvq;
+	t->seq_snd -= t->sndq;
+
+	flow_dbg(conn,
+		 "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t->seq_snd, t->seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t->sndq, t->notsent, t->rcvq);
+	flow_dbg(conn,
+		 "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+	/* Endianness fix-ups */
+	t->seq_snd	= htonl(t->seq_snd);
+	t->seq_rcv	= htonl(t->seq_rcv);
+	t->sndq		= htonl(t->sndq);
+	t->notsent	= htonl(t->notsent);
+	t->rcvq		= htonl(t->rcvq);
+	t->mss		= htonl(t->mss);
+	t->timestamp	= htonl(t->timestamp);
+
+	t->snd_wl1	= htonl(t->snd_wl1);
+	t->snd_wnd	= htonl(t->snd_wnd);
+	t->max_window	= htonl(t->max_window);
+	t->rcv_wnd	= htonl(t->rcv_wnd);
+	t->rcv_wup	= htonl(t->rcv_wup);
+
+	if (write_all_buf(fd, t, sizeof(*t))) {
+		flow_perror(conn, "Failed to write extended data");
+		return -EIO;
+	}
+
+	if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) {
+		flow_perror(conn, "Failed to write send queue data");
+		return -EIO;
+	}
+
+	if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) {
+		flow_perror(conn, "Failed to write receive queue data");
+		return -EIO;
+	}
+
+	return 0;
+
+fail:
+	/* For any type of failure dumping data, write an invalid extended data
+	 * descriptor that allows us to keep the stream in sync, but tells the
+	 * target to skip the flow. If we fail to transfer data, that's fatal:
+	 * return -EIO in that case (and only in that case).
+	 */
+	t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */
+
+	if (write_all_buf(fd, t, sizeof(*t))) {
+		flow_perror(conn, "Failed to write extended data");
+		return -EIO;
+	}
+
+	if (rc == -EIO)	/* but not a migration data transfer failure */
+		return -ENODATA;
+
+	return rc;
+}
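The peek_offset computed in tcp_flow_migrate_source_ext() above is plain unsigned 32-bit arithmetic, so it stays correct across sequence number wraparound, e.g.:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t seq_to_tap = 5;			/* wrapped past 2^32 */
	uint32_t seq_ack_from_tap = UINT32_MAX - 9;

	/* 15 bytes sent but not yet acknowledged */
	assert(seq_to_tap - seq_ack_from_tap == 15);
	return 0;
}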
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+	int s, rc;
+
+	if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+				 IPPROTO_TCP)) < 0) {
+		rc = -errno;
+		flow_perror(conn, "Failed to create socket for migrated flow");
+		return rc;
+	}
+	s = conn->sock;
+
+	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
+		flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i",
+				s);
+
+	tcp_sock_set_nodelay(s);
+
+	if ((rc = tcp_flow_repair_on(c, conn)))
+		goto err;
+
+	return 0;
+
+err:
+	close(s);
+	conn->sock = -1;
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_bind() - Bind socket in repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+	const struct flowside *sockside = HOSTFLOW(conn);
+	union sockaddr_inany a;
+
+	pif_sockaddr(c, &a, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+	if (bind(conn->sock, &a.sa, socklen_inany(&a))) {
+		int rc = -errno;
+		flow_perror(conn, "Failed to bind socket for migrated flow");
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_connect(const struct ctx *c,
+				   struct tcp_tap_conn *conn)
+{
+	const struct flowside *tgt = HOSTFLOW(conn);
+	int rc;
+
+	rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
+	if (rc) {
+		rc = -errno;
+		flow_perror(conn, "Failed to connect migrated socket");
+		return rc;
+	}
+
+	flow_epollid_clear(&conn->f);
+	conn->timer = -1;
+	conn->listening_sock = -1;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert
+ * @c:		Execution context
+ * @fd:		Descriptor for state migration
+ *
+ * Return: 0 on success or single-flow failure, negative on fatal failure
+ */
+int tcp_flow_migrate_target(struct ctx *c, int fd)
+{
+	struct tcp_tap_transfer t;
+	struct tcp_tap_conn *conn;
+	union flow *flow;
+	int rc;
+
+	if (!(flow = flow_alloc())) {
+		err("Flow table full on migration target");
+		return 0;
+	}
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		flow_perror(flow, "Failed to receive migration data");
+		flow_alloc_cancel(flow);
+		return -errno;
+	}
+
+	flow->f.state = FLOW_STATE_TGT;
+	memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif));
+	memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
+	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
+	conn->retries			= t.retries;
+	conn->ws_from_tap		= t.ws_from_tap;
+	conn->ws_to_tap			= t.ws_to_tap;
+	conn->events			= t.events;
+
+	conn->sndbuf			= ntohl(t.sndbuf);
+
+	conn->flags			= t.flags;
+	conn->seq_dup_ack_approx	= t.seq_dup_ack_approx;
+
+	MSS_SET(conn, ntohl(t.tap_mss));
+
+	conn->wnd_from_tap		= ntohs(t.wnd_from_tap);
+	conn->wnd_to_tap		= ntohs(t.wnd_to_tap);
+
+	conn->seq_to_tap		= ntohl(t.seq_to_tap);
+	conn->seq_ack_from_tap		= ntohl(t.seq_ack_from_tap);
+	conn->seq_from_tap		= ntohl(t.seq_from_tap);
+	conn->seq_ack_to_tap		= ntohl(t.seq_ack_to_tap);
+	conn->seq_init_from_tap		= ntohl(t.seq_init_from_tap);
+
+	if ((rc = tcp_flow_repair_socket(c, conn))) {
+		flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
+		/* Can't leave the flow in an incomplete state */
+		FLOW_ACTIVATE(conn);
+		return 0;
+	}
+
+	flow_hash_insert(c, TAP_SIDX(conn));
+	FLOW_ACTIVATE(conn);
+
+	return 0;
+}
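A condensed, standalone approximation of the repair-mode bring-up split across tcp_flow_repair_socket(), tcp_flow_repair_bind() and tcp_flow_repair_connect() above: with TCP_REPAIR set, bind() may take the original 4-tuple and connect() establishes the association immediately, without emitting a SYN (requires CAP_NET_ADMIN; error reporting trimmed):

#include <errno.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

static int repair_connect(const struct sockaddr *src, socklen_t src_len,
			  const struct sockaddr *dst, socklen_t dst_len)
{
	int one = 1, on = TCP_REPAIR_ON;
	int s = socket(dst->sa_family, SOCK_STREAM | SOCK_CLOEXEC, 0);

	if (s < 0)
		return -errno;

	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	if (setsockopt(s, SOL_TCP, TCP_REPAIR, &on, sizeof(on)) ||
	    bind(s, src, src_len) || connect(s, dst, dst_len)) {
		int rc = -errno;

		close(s);
		return rc;
	}

	return s;	/* still in repair mode: caller restores the rest */
}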
+
+/**
+ * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
+ * @c:		Execution context
+ * @conn:	Connection entry to complete with extra data
+ * @fd:		Descriptor for state migration
+ *
+ * Return: 0 on success or single-flow failure, negative on fatal failure
+ */
+int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd)
+{
+	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+	struct tcp_tap_transfer_ext t;
+	int s = conn->sock, rc;
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		rc = -errno;
+		flow_perror(conn, "Failed to read extended data");
+		return rc;
+	}
+
+	if (!t.tcpi_state) {	/* Source wants us to skip this flow */
+		flow_err(conn, "Dropping as requested by source");
+		goto fail;
+	}
+
+	/* Endianness fix-ups */
+	t.seq_snd	= ntohl(t.seq_snd);
+	t.seq_rcv	= ntohl(t.seq_rcv);
+	t.sndq		= ntohl(t.sndq);
+	t.notsent	= ntohl(t.notsent);
+	t.rcvq		= ntohl(t.rcvq);
+	t.mss		= ntohl(t.mss);
+	t.timestamp	= ntohl(t.timestamp);
+
+	t.snd_wl1	= ntohl(t.snd_wl1);
+	t.snd_wnd	= ntohl(t.snd_wnd);
+	t.max_window	= ntohl(t.max_window);
+	t.rcv_wnd	= ntohl(t.rcv_wnd);
+	t.rcv_wup	= ntohl(t.rcv_wup);
+
+	flow_dbg(conn,
+		 "Extended migration data, socket %i sequences send %u receive %u",
+		 s, t.seq_snd, t.seq_rcv);
+	flow_dbg(conn, "  pending queues: send %u not sent %u receive %u",
+		 t.sndq, t.notsent, t.rcvq);
+	flow_dbg(conn,
+		 "  window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+		 t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+	flow_dbg(conn, "  SO_PEEK_OFF %s offset=%"PRIu32,
+		 peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+	if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
+	    t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		flow_err(conn,
+			 "Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+			 s, t.sndq, t.notsent, t.rcvq);
+		return -EINVAL;
+	}
+
+	if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
+		rc = -errno;
+		flow_perror(conn, "Failed to read send queue data");
+		return rc;
+	}
+
+	if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
+		rc = -errno;
+		flow_perror(conn, "Failed to read receive queue data");
+		return rc;
+	}
+
+	if (conn->sock < 0)
+		/* We weren't able to create the socket, discard flow */
+		goto fail;
+
+	if (tcp_flow_repair_bind(c, conn))
+		goto fail;
+
+	if (tcp_flow_repair_timestamp(conn, &t))
+		goto fail;
+
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_seq(conn, &t.seq_snd))
+		goto fail;
+
+	if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_seq(conn, &t.seq_rcv))
+		goto fail;
+
+	if (tcp_flow_repair_connect(c, conn))
+		goto fail;
+
+	if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue))
+		goto fail;
+
+	if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
+		goto fail;
+
+	if (tcp_flow_repair_queue(conn, t.sndq - t.notsent,
+				  tcp_migrate_snd_queue))
+		goto fail;
+
+	if (tcp_flow_repair_opt(conn, &t))
+		goto fail;
+
+	/* If we sent a FIN and it was acknowledged (TCP_FIN_WAIT2), don't
+	 * send it out, because we already sent it for sure.
+	 *
+	 * Call shutdown(x, SHUT_WR) in repair mode, so that we move to
+	 * FIN_WAIT_1 (tcp_shutdown()) without sending anything
+	 * (goto in tcp_write_xmit()).
+	 */
+	if (t.tcpi_state == TCP_FIN_WAIT2) {
+		int v;
+
+		v = TCP_SEND_QUEUE;
+		if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
+			flow_perror(conn, "Selecting repair queue");
+		else
+			shutdown(s, SHUT_WR);
+	}
+
+	if (tcp_flow_repair_wnd(conn, &t))
+		goto fail;
+
+	tcp_flow_repair_off(c, conn);
+	repair_flush(c);
+
+	if (t.notsent) {
+		if (tcp_flow_repair_queue(conn, t.notsent,
+					  tcp_migrate_snd_queue +
+					  (t.sndq - t.notsent))) {
+			/* This sometimes seems to fail for unclear reasons.
+			 * Don't fail the whole migration, just reset the flow
+			 * and carry on to the next one.
+			 */
+			goto fail;
+		}
+	}
+
+	/* If we sent a FIN but it wasn't acknowledged yet (TCP_FIN_WAIT1),
+	 * send it out, because we don't know if we already sent it.
+	 *
+	 * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to
+	 * TCP_FIN_WAIT1.
+	 */
+	if (t.tcpi_state == TCP_FIN_WAIT1)
+		shutdown(s, SHUT_WR);
+
+	if (tcp_set_peek_offset(conn, peek_offset))
+		goto fail;
+
+	tcp_send_flag(c, conn, ACK);
+	tcp_data_from_sock(c, conn);
+
+	if ((rc = tcp_epoll_ctl(c, conn))) {
+		flow_dbg(conn,
+			 "Failed to subscribe to epoll for migrated socket: %s",
+			 strerror_(-rc));
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	if (conn->sock >= 0) {
+		tcp_flow_repair_off(c, conn);
+		repair_flush(c);
+	}
+
+	conn->flags = 0; /* Not waiting for ACK, don't schedule timer */
+	tcp_rst(c, conn);
+
+	return 0;
+}
+
+/**
+ * tcp_prepare_iov() - Prepare iov according to kernel capability
+ * @msg:		Message header to update
+ * @iov:		iovec to receive TCP payload and data to discard
+ * @already_sent:	Bytes already sent, not yet acknowledged
+ * @payload_iov_cnt:	Number of TCP payload iovec entries
+ *
+ * Return: 0 on success, -1 if already_sent cannot be discarded fully
+ */
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+		    uint32_t already_sent, int payload_iov_cnt)
+{
+	/*
+	 * IOV layout
+	 * |- tcp_buf_discard -|---------- TCP data slots ------------|
+	 *
+	 * with discarded data:
+	 * |------ddddddddddddd|ttttttttttttt-------------------------|
+	 *        ^
+	 *        |
+	 *     msg_iov
+	 *
+	 * without discarded data:
+	 * |-------------------|ttttttttttttt-------------------------|
+	 *                     ^
+	 *                     |
+	 *                  msg_iov
+	 *
+	 * d: discard data
+	 * t: TCP data
+	 */
+	if (peek_offset_cap) {
+		msg->msg_iov = iov + DISCARD_IOV_NUM;
+		msg->msg_iovlen = payload_iov_cnt;
+	} else {
+		int discard_cnt, discard_iov_rem;
+		struct iovec *iov_start;
+		int i;
+
+		discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE);
+		if (discard_cnt > DISCARD_IOV_NUM) {
+			debug("Failed to discard %u already sent bytes",
+			      already_sent);
+			return -1;
+		}
+
+		discard_iov_rem = already_sent % BUF_DISCARD_SIZE;
+
+		iov_start = iov + (DISCARD_IOV_NUM - discard_cnt);
+
+		/* Multiple iov entries pointing to the same buffer */
+		for (i = 0; i < discard_cnt; i++) {
+			iov_start[i].iov_base = tcp_buf_discard;
+			iov_start[i].iov_len = BUF_DISCARD_SIZE;
+		}
+		if (discard_iov_rem)
+			iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem;
+
+		msg->msg_iov = iov_start;
+		msg->msg_iovlen = discard_cnt + payload_iov_cnt;
+	}
+
+	return 0;
+}
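A worked example of the discard-slot arithmetic in tcp_prepare_iov() above, assuming, hypothetically, a BUF_DISCARD_SIZE of 16 KiB and 40000 bytes already sent: three discard entries are needed, and the last one is trimmed so the discarded total is exactly already_sent:

#include <assert.h>

#define BUF_DISCARD_SIZE_EX	16384	/* assumed value, for illustration */
#define DIV_ROUND_UP_EX(x, y)	(((x) + (y) - 1) / (y))

int main(void)
{
	unsigned already_sent = 40000;
	unsigned cnt = DIV_ROUND_UP_EX(already_sent, BUF_DISCARD_SIZE_EX);
	unsigned rem = already_sent % BUF_DISCARD_SIZE_EX;

	assert(cnt == 3 && rem == 7232);
	/* cnt - 1 full slots plus the trimmed last slot cover it exactly */
	assert((cnt - 1) * BUF_DISCARD_SIZE_EX + rem == already_sent);
	return 0;
}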
