From d2272f74f72469c3d4c2368439f36bb3b348db7c Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 26 Aug 2021 14:37:48 +0200
Subject: tcp: Proper error handling for sendmmsg() to UNIX domain socket

As data from socket is forwarded to the guest, sendmmsg() might send
fewer bytes than requested in three different ways:

- failing altogether with a negative error code -- ignore that,
  we'll get an error on the UNIX domain socket later if there's
  really an issue with it and reset the connection to the guest

- sending less than 'vlen' messages -- instead of assuming success
  in that case and waiting for the guest to send a duplicate ACK
  indicating missing data, update the sequence number according to
  what was actually sent and spare some retransmissions

- somewhat unexpectedly to me, sending 'vlen' or less than 'vlen'
  messages, returning up to 'vlen', with the last message being
  partially sent, and no further indication of errors other than
  the returned msg_len for the last partially sent message being
  less than iov_len.

  In this case, we would assume success and proceed as nothing
  happened. However, qemu would fail to parse any further message,
  having received a partial descriptor, and eventually close the
  connection, logging:

	serious error: oversized packet received,connection terminated.

  as the length descriptor for the next message would be sourced
  from the middle of the next successfully sent message, not from
  its header.

  Handle this by checking the msg_len returned for the last (even
  partially) sent message, and force re-sending the missing bytes,
  if any, with a blocking sendmsg() -- qemu must not receive
  anything else than that anyway.

While at it, allow to send up to 64KiB for each message, the
previous 32KiB limit isn't actually required, and just switch to a
new message at each iteration on sending buffers, they are already
MSS-sized anyway, so the check in the loop isn't really needed.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tcp.c | 101 ++++++++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 64 insertions(+), 37 deletions(-)

diff --git a/tcp.c b/tcp.c
index 093f95f..1aca1ac 100644
--- a/tcp.c
+++ b/tcp.c
@@ -601,7 +601,12 @@ static struct iovec	tcp6_l2_iov_tap		[TCP_TAP_FRAMES];
 static struct msghdr	tcp4_l2_mh_sock;
 static struct msghdr	tcp6_l2_mh_sock;
 
-static struct mmsghdr	tcp_l2_mh_tap		[TCP_TAP_FRAMES];
+__extension__
+static struct mmsghdr	tcp_l2_mh_tap		[TCP_TAP_FRAMES] = {
+	[ 0 ... TCP_TAP_FRAMES - 1 ] = {
+		.msg_hdr.msg_iovlen = 1,
+	},
+};
 
 /* sendmsg() to socket */
 static struct iovec	tcp_tap_iov		[TAP_MSGS];
@@ -1358,8 +1363,15 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
 	if (conn->mss_guest < 0)
 		conn->mss_guest = MSS_DEFAULT;
 
-	if (c->mode == MODE_PASST && c->v6 && conn->mss_guest > SHRT_MAX)
-		conn->mss_guest = SHRT_MAX;
+	if (c->mode == MODE_PASST) {
+		/* Don't upset qemu */
+		conn->mss_guest = MIN(USHRT_MAX -
+				      sizeof(uint32_t) -
+				      sizeof(struct ethhdr) -
+				      sizeof(struct ipv6hdr) -
+				      sizeof(struct tcphdr),
+				      conn->mss_guest);
+	}
 
 	sl = sizeof(conn->mss_guest);
 	setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl);
@@ -1534,15 +1546,16 @@ static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
 static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 			      struct timespec *now)
 {
-	int mss_tap, fill_bufs, send_bufs = 0, last_len, msg_len, iov_rem = 0;
 	int *buf_mss, *buf_mss_nr_set, *buf_mss_tap, *buf_mss_tap_nr_set;
+	int mss_tap, fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
 	int send, len, plen, v4 = IN6_IS_ADDR_V4MAPPED(&conn->a.a6);
+	uint32_t seq_to_tap = conn->seq_to_tap;
 	socklen_t sl = sizeof(struct tcp_info);
-	struct mmsghdr *mh = tcp_l2_mh_tap;
 	int s = conn->sock, i, ret = 0;
 	struct iovec *iov, *iov_tap;
 	uint32_t already_sent;
 	struct tcp_info info;
+	struct mmsghdr *mh;
 
 	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
 
@@ -1644,15 +1657,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 		info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
 	}
 
-	if (v4)
-		mh->msg_hdr.msg_iov = tcp4_l2_iov_tap;
-	else
-		mh->msg_hdr.msg_iov = tcp6_l2_iov_tap;
-	mh->msg_hdr.msg_iovlen = 0;
 	plen = conn->mss_guest;
-	msg_len = 0;
-	for (i = 0; i < send_bufs; i++) {
-		int iov_len, ip_len;
+	for (i = 0, mh = tcp_l2_mh_tap; i < send_bufs; i++, mh++) {
+		int ip_len;
 
 		if (i == send_bufs - 1)
 			plen = last_len;
@@ -1673,7 +1680,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 
 			b->th.source = htons(conn->sock_port);
 			b->th.dest = htons(conn->tap_port);
-			b->th.seq = htonl(conn->seq_to_tap);
+			b->th.seq = htonl(seq_to_tap);
 			b->th.ack_seq = htonl(conn->seq_ack_to_tap);
 
 			if (conn->no_snd_wnd) {
@@ -1696,6 +1703,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 			}
 
 			b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);
+
+			mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i];
 		} else {
 			struct tcp6_l2_buf_t *b = &tcp6_l2_buf[i];
 			uint32_t flow = conn->seq_init_to_tap;
@@ -1713,7 +1722,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 
 			b->th.source = htons(conn->sock_port);
 			b->th.dest = htons(conn->tap_port);
-			b->th.seq = htonl(conn->seq_to_tap);
+			b->th.seq = htonl(seq_to_tap);
 			b->th.ack_seq = htonl(conn->seq_ack_to_tap);
 
 			if (conn->no_snd_wnd) {
@@ -1741,32 +1750,45 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 			}
 
 			b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);
-		}
 
-		iov_len = sizeof(uint32_t) + sizeof(struct ethhdr) + ip_len;
-
-		/* Switch to a new message if this one is too long for qemu. */
-		if (msg_len && msg_len + iov_len > SHRT_MAX) {
-			mh++;
-			mh->msg_hdr.msg_iovlen = 0;
-			msg_len = 0;
-			if (v4)
-				mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i];
-			else
-				mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i];
+			mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i];
 		}
-		mh->msg_hdr.msg_iovlen++;
-		msg_len += iov_len;
 
-		conn->seq_to_tap += plen;
+		seq_to_tap += plen;
 	}
 
 	if (c->mode == MODE_PASTA)
 		goto out;
 
-	sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1,
-		 MSG_NOSIGNAL | MSG_DONTWAIT);
-	pcapmm(tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1);
+	ret = sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap,
+		       MSG_NOSIGNAL | MSG_DONTWAIT);
+	if (ret <= 0)
+		goto out;
+
+	conn->seq_to_tap += conn->mss_guest * (ret - 1) + last_len;
+
+	/* sendmmsg() indicates how many messages were sent at least partially.
+	 * Kernel commit 3023898b7d4a ("sock: fix sendmmsg for partial sendmsg")
+	 * gives us the guarantee that at most one message, namely the last sent
+	 * one, might have been sent partially. Check how many bytes of that
+	 * message were sent, and re-send any missing bytes with a blocking
+	 * sendmsg(), otherwise qemu will fail to parse any subsequent message.
+	 */
+	mh = &tcp_l2_mh_tap[ret - 1];
+	if (mh->msg_len < mh->msg_hdr.msg_iov->iov_len) {
+		uint8_t **iov_base = (uint8_t **)&mh->msg_hdr.msg_iov->iov_base;
+		int part_sent = mh->msg_len;
+
+		mh->msg_hdr.msg_iov->iov_len -= part_sent;
+		*iov_base += part_sent;
+
+		sendmsg(c->fd_tap, &mh->msg_hdr, MSG_NOSIGNAL);
+
+		mh->msg_hdr.msg_iov->iov_len += part_sent;
+		*iov_base -= part_sent;
+	}
+
+	pcapmm(tcp_l2_mh_tap, ret);
 
 	goto out;
 
@@ -2027,10 +2049,15 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
 		if (conn->mss_guest < 0)
 			conn->mss_guest = MSS_DEFAULT;
 
-		/* Don't upset qemu */
-		if (c->mode == MODE_PASST && c->v6 &&
-		    conn->mss_guest > SHRT_MAX)
-			conn->mss_guest = SHRT_MAX;
+		if (c->mode == MODE_PASST) {
+			/* Don't upset qemu */
+			conn->mss_guest = MIN(USHRT_MAX -
+					      sizeof(uint32_t) -
+					      sizeof(struct ethhdr) -
+					      sizeof(struct ipv6hdr) -
+					      sizeof(struct tcphdr),
+					      conn->mss_guest);
+		}
 
 		ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
 		if (ws > MAX_WS) {
-- 
cgit v1.2.3