tcp: Proper error handling for sendmmsg() to UNIX domain socket

As data from the socket is forwarded to the guest, sendmmsg() might send fewer bytes than requested in three different ways:

- failing altogether with a negative error code -- ignore that, we'll get an error on the UNIX domain socket later if there's really an issue with it and reset the connection to the guest

- sending fewer than 'vlen' messages -- instead of assuming success in that case and waiting for the guest to send a duplicate ACK indicating missing data, update the sequence number according to what was actually sent and spare some retransmissions

- somewhat unexpectedly to me, sending 'vlen' or fewer than 'vlen' messages, returning up to 'vlen', with the last message being partially sent, and no further indication of errors other than the returned msg_len for the last partially sent message being less than iov_len.

  In this case, we would assume success and proceed as if nothing happened. However, qemu would fail to parse any further message, having received a partial descriptor, and eventually close the connection, logging:

      serious error: oversized packet received,connection terminated.

  as the length descriptor for the next message would be sourced from the middle of the next successfully sent message, not from its header.

  Handle this by checking the msg_len returned for the last (even partially) sent message, and force re-sending the missing bytes, if any, with a blocking sendmsg() -- qemu must not receive anything other than that anyway.

While at it, allow sending up to 64KiB for each message: the previous 32KiB limit isn't actually required. Also, just switch to a new message at each iteration on sending buffers: they are already MSS-sized anyway, so the check in the loop isn't really needed.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
author: Stefano Brivio <sbrivio@redhat.com> 2021-08-26 14:37:48 +0200
committer: Stefano Brivio <sbrivio@redhat.com> 2021-08-26 23:30:22 +0200
commit: d2272f74f72469c3d4c2368439f36bb3b348db7c (patch)
tree: 29422c60ae49604c790134a0fea63a4171a05452
parent: cc2ebfd5f2c73b61590a28ff7d088520ce2c1502 (diff)
download: passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar
passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.gz
passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.bz2
passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.lz
passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.xz
passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.zst
passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.zip
1 files changed, 64 insertions, 37 deletions
diff --git a/tcp.c b/tcp.c
index 093f95f..1aca1ac 100644
--- a/tcp.c
+++ b/tcp.c
@@ -601,7 +601,12 @@ static struct iovec	tcp6_l2_iov_tap		[TCP_TAP_FRAMES];
 static struct msghdr	tcp4_l2_mh_sock;
 static struct msghdr	tcp6_l2_mh_sock;
 
-static struct mmsghdr	tcp_l2_mh_tap		[TCP_TAP_FRAMES];
+__extension__
+static struct mmsghdr	tcp_l2_mh_tap		[TCP_TAP_FRAMES] = {
+	[ 0 ... TCP_TAP_FRAMES - 1 ] = {
+		.msg_hdr.msg_iovlen = 1,
+	},
+};
 
 /* sendmsg() to socket */
 static struct iovec	tcp_tap_iov		[TAP_MSGS];
@@ -1358,8 +1363,15 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
 	if (conn->mss_guest < 0)
 		conn->mss_guest = MSS_DEFAULT;
 
-	if (c->mode == MODE_PASST && c->v6 && conn->mss_guest > SHRT_MAX)
-		conn->mss_guest = SHRT_MAX;
+	if (c->mode == MODE_PASST) {
+		/* Don't upset qemu */
+		conn->mss_guest = MIN(USHRT_MAX -
+				      sizeof(uint32_t) -
+				      sizeof(struct ethhdr) -
+				      sizeof(struct ipv6hdr) -
+				      sizeof(struct tcphdr),
+				      conn->mss_guest);
+	}
 
 	sl = sizeof(conn->mss_guest);
 	setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl);
@@ -1534,15 +1546,16 @@ static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
 static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 			      struct timespec *now)
 {
-	int mss_tap, fill_bufs, send_bufs = 0, last_len, msg_len, iov_rem = 0;
 	int *buf_mss, *buf_mss_nr_set, *buf_mss_tap, *buf_mss_tap_nr_set;
+	int mss_tap, fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
 	int send, len, plen, v4 = IN6_IS_ADDR_V4MAPPED(&conn->a.a6);
+	uint32_t seq_to_tap = conn->seq_to_tap;
 	socklen_t sl = sizeof(struct tcp_info);
-	struct mmsghdr *mh = tcp_l2_mh_tap;
 	int s = conn->sock, i, ret = 0;
 	struct iovec *iov, *iov_tap;
 	uint32_t already_sent;
 	struct tcp_info info;
+	struct mmsghdr *mh;
 
 	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
 
@@ -1644,15 +1657,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 		info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
 	}
 
-	if (v4)
-		mh->msg_hdr.msg_iov = tcp4_l2_iov_tap;
-	else
-		mh->msg_hdr.msg_iov = tcp6_l2_iov_tap;
-	mh->msg_hdr.msg_iovlen = 0;
 	plen = conn->mss_guest;
-	msg_len = 0;
-	for (i = 0; i < send_bufs; i++) {
-		int iov_len, ip_len;
+	for (i = 0, mh = tcp_l2_mh_tap; i < send_bufs; i++, mh++) {
+		int ip_len;
 
 		if (i == send_bufs - 1)
 			plen = last_len;
@@ -1673,7 +1680,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 
 			b->th.source = htons(conn->sock_port);
 			b->th.dest = htons(conn->tap_port);
-			b->th.seq = htonl(conn->seq_to_tap);
+			b->th.seq = htonl(seq_to_tap);
 			b->th.ack_seq = htonl(conn->seq_ack_to_tap);
 
 			if (conn->no_snd_wnd) {
@@ -1696,6 +1703,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 			}
 
 			b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);
+
+			mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i];
 		} else {
 			struct tcp6_l2_buf_t *b = &tcp6_l2_buf[i];
 			uint32_t flow = conn->seq_init_to_tap;
@@ -1713,7 +1722,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 
 			b->th.source = htons(conn->sock_port);
 			b->th.dest = htons(conn->tap_port);
-			b->th.seq = htonl(conn->seq_to_tap);
+			b->th.seq = htonl(seq_to_tap);
 			b->th.ack_seq = htonl(conn->seq_ack_to_tap);
 
 			if (conn->no_snd_wnd) {
@@ -1741,32 +1750,45 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
 			}
 
 			b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);
-		}
 
-		iov_len = sizeof(uint32_t) + sizeof(struct ethhdr) + ip_len;
-
-		/* Switch to a new message if this one is too long for qemu. */
-		if (msg_len && msg_len + iov_len > SHRT_MAX) {
-			mh++;
-			mh->msg_hdr.msg_iovlen = 0;
-			msg_len = 0;
-			if (v4)
-				mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i];
-			else
-				mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i];
+			mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i];
 		}
-		mh->msg_hdr.msg_iovlen++;
-		msg_len += iov_len;
 
-		conn->seq_to_tap += plen;
+		seq_to_tap += plen;
 	}
 
 	if (c->mode == MODE_PASTA)
 		goto out;
 
-	sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1,
-		 MSG_NOSIGNAL | MSG_DONTWAIT);
-	pcapmm(tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1);
+	ret = sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap,
+		       MSG_NOSIGNAL | MSG_DONTWAIT);
+	if (ret <= 0)
+		goto out;
+
+	conn->seq_to_tap += conn->mss_guest * (ret - 1) + last_len;
+
+	/* sendmmsg() indicates how many messages were sent at least partially.
+	 * Kernel commit 3023898b7d4a ("sock: fix sendmmsg for partial sendmsg")
+	 * gives us the guarantee that at most one message, namely the last sent
+	 * one, might have been sent partially. Check how many bytes of that
+	 * message were sent, and re-send any missing bytes with a blocking
+	 * sendmsg(), otherwise qemu will fail to parse any subsequent message.
+	 */
+	mh = &tcp_l2_mh_tap[ret - 1];
+	if (mh->msg_len < mh->msg_hdr.msg_iov->iov_len) {
+		uint8_t **iov_base = (uint8_t **)&mh->msg_hdr.msg_iov->iov_base;
+		int part_sent = mh->msg_len;
+
+		mh->msg_hdr.msg_iov->iov_len -= part_sent;
+		*iov_base += part_sent;
+
+		sendmsg(c->fd_tap, &mh->msg_hdr, MSG_NOSIGNAL);
+
+		mh->msg_hdr.msg_iov->iov_len += part_sent;
+		*iov_base -= part_sent;
+	}
+
+	pcapmm(tcp_l2_mh_tap, ret);
 
 	goto out;
 
@@ -2027,10 +2049,15 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
 		if (conn->mss_guest < 0)
 			conn->mss_guest = MSS_DEFAULT;
 
-		/* Don't upset qemu */
-		if (c->mode == MODE_PASST && c->v6 &&
-		    conn->mss_guest > SHRT_MAX)
-			conn->mss_guest = SHRT_MAX;
+		if (c->mode == MODE_PASST) {
+			/* Don't upset qemu */
+			conn->mss_guest = MIN(USHRT_MAX -
+					      sizeof(uint32_t) -
+					      sizeof(struct ethhdr) -
+					      sizeof(struct ipv6hdr) -
+					      sizeof(struct tcphdr),
+					      conn->mss_guest);
+		}
 
 		ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
 		if (ws > MAX_WS) {
author	Stefano Brivio <sbrivio@redhat.com>	2021-08-26 14:37:48 +0200
committer	Stefano Brivio <sbrivio@redhat.com>	2021-08-26 23:30:22 +0200
commit	d2272f74f72469c3d4c2368439f36bb3b348db7c (patch)
tree	29422c60ae49604c790134a0fea63a4171a05452
parent	cc2ebfd5f2c73b61590a28ff7d088520ce2c1502 (diff)
download	passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.gz passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.bz2 passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.lz passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.xz passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.zst passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.zip