From d2272f74f72469c3d4c2368439f36bb3b348db7c Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 26 Aug 2021 14:37:48 +0200 Subject: tcp: Proper error handling for sendmmsg() to UNIX domain socket As data from socket is forwarded to the guest, sendmmsg() might send fewer bytes than requested in three different ways: - failing altogether with a negative error code -- ignore that, we'll get an error on the UNIX domain socket later if there's really an issue with it and reset the connection to the guest - sending less than 'vlen' messages -- instead of assuming success in that case and waiting for the guest to send a duplicate ACK indicating missing data, update the sequence number according to what was actually sent and spare some retransmissions - somewhat unexpectedly to me, sending 'vlen' or less than 'vlen' messages, returning up to 'vlen', with the last message being partially sent, and no further indication of errors other than the returned msg_len for the last partially sent message being less than iov_len. In this case, we would assume success and proceed as nothing happened. However, qemu would fail to parse any further message, having received a partial descriptor, and eventually close the connection, logging: serious error: oversized packet received,connection terminated. as the length descriptor for the next message would be sourced from the middle of the next successfully sent message, not from its header. Handle this by checking the msg_len returned for the last (even partially) sent message, and force re-sending the missing bytes, if any, with a blocking sendmsg() -- qemu must not receive anything else than that anyway. While at it, allow to send up to 64KiB for each message, the previous 32KiB limit isn't actually required, and just switch to a new message at each iteration on sending buffers, they are already MSS-sized anyway, so the check in the loop isn't really needed. Signed-off-by: Stefano Brivio --- tcp.c | 101 ++++++++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/tcp.c b/tcp.c index 093f95f..1aca1ac 100644 --- a/tcp.c +++ b/tcp.c @@ -601,7 +601,12 @@ static struct iovec tcp6_l2_iov_tap [TCP_TAP_FRAMES]; static struct msghdr tcp4_l2_mh_sock; static struct msghdr tcp6_l2_mh_sock; -static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES]; +__extension__ +static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES] = { + [ 0 ... TCP_TAP_FRAMES - 1 ] = { + .msg_hdr.msg_iovlen = 1, + }, +}; /* sendmsg() to socket */ static struct iovec tcp_tap_iov [TAP_MSGS]; @@ -1358,8 +1363,15 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, if (conn->mss_guest < 0) conn->mss_guest = MSS_DEFAULT; - if (c->mode == MODE_PASST && c->v6 && conn->mss_guest > SHRT_MAX) - conn->mss_guest = SHRT_MAX; + if (c->mode == MODE_PASST) { + /* Don't upset qemu */ + conn->mss_guest = MIN(USHRT_MAX - + sizeof(uint32_t) - + sizeof(struct ethhdr) - + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr), + conn->mss_guest); + } sl = sizeof(conn->mss_guest); setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl); @@ -1534,15 +1546,16 @@ static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq) static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, struct timespec *now) { - int mss_tap, fill_bufs, send_bufs = 0, last_len, msg_len, iov_rem = 0; int *buf_mss, *buf_mss_nr_set, *buf_mss_tap, *buf_mss_tap_nr_set; + int mss_tap, fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int send, len, plen, v4 = IN6_IS_ADDR_V4MAPPED(&conn->a.a6); + uint32_t seq_to_tap = conn->seq_to_tap; socklen_t sl = sizeof(struct tcp_info); - struct mmsghdr *mh = tcp_l2_mh_tap; int s = conn->sock, i, ret = 0; struct iovec *iov, *iov_tap; uint32_t already_sent; struct tcp_info info; + struct mmsghdr *mh; already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; @@ -1644,15 +1657,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, info.tcpi_snd_wnd = conn->tcpi_snd_wnd; } - if (v4) - mh->msg_hdr.msg_iov = tcp4_l2_iov_tap; - else - mh->msg_hdr.msg_iov = tcp6_l2_iov_tap; - mh->msg_hdr.msg_iovlen = 0; plen = conn->mss_guest; - msg_len = 0; - for (i = 0; i < send_bufs; i++) { - int iov_len, ip_len; + for (i = 0, mh = tcp_l2_mh_tap; i < send_bufs; i++, mh++) { + int ip_len; if (i == send_bufs - 1) plen = last_len; @@ -1673,7 +1680,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, b->th.source = htons(conn->sock_port); b->th.dest = htons(conn->tap_port); - b->th.seq = htonl(conn->seq_to_tap); + b->th.seq = htonl(seq_to_tap); b->th.ack_seq = htonl(conn->seq_ack_to_tap); if (conn->no_snd_wnd) { @@ -1696,6 +1703,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, } b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len); + + mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i]; } else { struct tcp6_l2_buf_t *b = &tcp6_l2_buf[i]; uint32_t flow = conn->seq_init_to_tap; @@ -1713,7 +1722,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, b->th.source = htons(conn->sock_port); b->th.dest = htons(conn->tap_port); - b->th.seq = htonl(conn->seq_to_tap); + b->th.seq = htonl(seq_to_tap); b->th.ack_seq = htonl(conn->seq_ack_to_tap); if (conn->no_snd_wnd) { @@ -1741,32 +1750,45 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, } b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len); - } - iov_len = sizeof(uint32_t) + sizeof(struct ethhdr) + ip_len; - - /* Switch to a new message if this one is too long for qemu. */ - if (msg_len && msg_len + iov_len > SHRT_MAX) { - mh++; - mh->msg_hdr.msg_iovlen = 0; - msg_len = 0; - if (v4) - mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i]; - else - mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i]; + mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i]; } - mh->msg_hdr.msg_iovlen++; - msg_len += iov_len; - conn->seq_to_tap += plen; + seq_to_tap += plen; } if (c->mode == MODE_PASTA) goto out; - sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1, - MSG_NOSIGNAL | MSG_DONTWAIT); - pcapmm(tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1); + ret = sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap, + MSG_NOSIGNAL | MSG_DONTWAIT); + if (ret <= 0) + goto out; + + conn->seq_to_tap += conn->mss_guest * (ret - 1) + last_len; + + /* sendmmsg() indicates how many messages were sent at least partially. + * Kernel commit 3023898b7d4a ("sock: fix sendmmsg for partial sendmsg") + * gives us the guarantee that at most one message, namely the last sent + * one, might have been sent partially. Check how many bytes of that + * message were sent, and re-send any missing bytes with a blocking + * sendmsg(), otherwise qemu will fail to parse any subsequent message. + */ + mh = &tcp_l2_mh_tap[ret - 1]; + if (mh->msg_len < mh->msg_hdr.msg_iov->iov_len) { + uint8_t **iov_base = (uint8_t **)&mh->msg_hdr.msg_iov->iov_base; + int part_sent = mh->msg_len; + + mh->msg_hdr.msg_iov->iov_len -= part_sent; + *iov_base += part_sent; + + sendmsg(c->fd_tap, &mh->msg_hdr, MSG_NOSIGNAL); + + mh->msg_hdr.msg_iov->iov_len += part_sent; + *iov_base -= part_sent; + } + + pcapmm(tcp_l2_mh_tap, ret); goto out; @@ -2027,10 +2049,15 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, if (conn->mss_guest < 0) conn->mss_guest = MSS_DEFAULT; - /* Don't upset qemu */ - if (c->mode == MODE_PASST && c->v6 && - conn->mss_guest > SHRT_MAX) - conn->mss_guest = SHRT_MAX; + if (c->mode == MODE_PASST) { + /* Don't upset qemu */ + conn->mss_guest = MIN(USHRT_MAX - + sizeof(uint32_t) - + sizeof(struct ethhdr) - + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr), + conn->mss_guest); + } ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); if (ws > MAX_WS) { -- cgit v1.2.3