diff options
author | Stefano Brivio <sbrivio@redhat.com> | 2021-08-26 14:37:48 +0200 |
---|---|---|
committer | Stefano Brivio <sbrivio@redhat.com> | 2021-08-26 23:30:22 +0200 |
commit | d2272f74f72469c3d4c2368439f36bb3b348db7c (patch) | |
tree | 29422c60ae49604c790134a0fea63a4171a05452 /tcp.c | |
parent | cc2ebfd5f2c73b61590a28ff7d088520ce2c1502 (diff) | |
download | passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.gz passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.bz2 passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.lz passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.xz passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.tar.zst passt-d2272f74f72469c3d4c2368439f36bb3b348db7c.zip |
tcp: Proper error handling for sendmmsg() to UNIX domain socket
As data from socket is forwarded to the guest, sendmmsg() might send
fewer bytes than requested in three different ways:
- failing altogether with a negative error code -- ignore that,
we'll get an error on the UNIX domain socket later if there's
really an issue with it and reset the connection to the guest
- sending less than 'vlen' messages -- instead of assuming success
in that case and waiting for the guest to send a duplicate ACK
indicating missing data, update the sequence number according to
what was actually sent and spare some retransmissions
- somewhat unexpectedly to me, sending 'vlen' or less than 'vlen'
messages, returning up to 'vlen', with the last message being
partially sent, and no further indication of errors other than
the returned msg_len for the last partially sent message being
less than iov_len.
In this case, we would assume success and proceed as nothing
happened. However, qemu would fail to parse any further message,
having received a partial descriptor, and eventually close the
connection, logging:
serious error: oversized packet received,connection terminated.
as the length descriptor for the next message would be sourced
from the middle of the next successfully sent message, not from
its header.
Handle this by checking the msg_len returned for the last (even
partially) sent message, and force re-sending the missing bytes,
if any, with a blocking sendmsg() -- qemu must not receive
anything else than that anyway.
While at it, allow to send up to 64KiB for each message, the
previous 32KiB limit isn't actually required, and just switch to a
new message at each iteration on sending buffers, they are already
MSS-sized anyway, so the check in the loop isn't really needed.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Diffstat (limited to 'tcp.c')
-rw-r--r-- | tcp.c | 101 |
1 files changed, 64 insertions, 37 deletions
@@ -601,7 +601,12 @@ static struct iovec tcp6_l2_iov_tap [TCP_TAP_FRAMES]; static struct msghdr tcp4_l2_mh_sock; static struct msghdr tcp6_l2_mh_sock; -static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES]; +__extension__ +static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES] = { + [ 0 ... TCP_TAP_FRAMES - 1 ] = { + .msg_hdr.msg_iovlen = 1, + }, +}; /* sendmsg() to socket */ static struct iovec tcp_tap_iov [TAP_MSGS]; @@ -1358,8 +1363,15 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, if (conn->mss_guest < 0) conn->mss_guest = MSS_DEFAULT; - if (c->mode == MODE_PASST && c->v6 && conn->mss_guest > SHRT_MAX) - conn->mss_guest = SHRT_MAX; + if (c->mode == MODE_PASST) { + /* Don't upset qemu */ + conn->mss_guest = MIN(USHRT_MAX - + sizeof(uint32_t) - + sizeof(struct ethhdr) - + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr), + conn->mss_guest); + } sl = sizeof(conn->mss_guest); setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl); @@ -1534,15 +1546,16 @@ static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq) static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, struct timespec *now) { - int mss_tap, fill_bufs, send_bufs = 0, last_len, msg_len, iov_rem = 0; int *buf_mss, *buf_mss_nr_set, *buf_mss_tap, *buf_mss_tap_nr_set; + int mss_tap, fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int send, len, plen, v4 = IN6_IS_ADDR_V4MAPPED(&conn->a.a6); + uint32_t seq_to_tap = conn->seq_to_tap; socklen_t sl = sizeof(struct tcp_info); - struct mmsghdr *mh = tcp_l2_mh_tap; int s = conn->sock, i, ret = 0; struct iovec *iov, *iov_tap; uint32_t already_sent; struct tcp_info info; + struct mmsghdr *mh; already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; @@ -1644,15 +1657,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, info.tcpi_snd_wnd = conn->tcpi_snd_wnd; } - if (v4) - mh->msg_hdr.msg_iov = tcp4_l2_iov_tap; - else - mh->msg_hdr.msg_iov = tcp6_l2_iov_tap; - mh->msg_hdr.msg_iovlen = 0; plen = conn->mss_guest; - msg_len = 0; - for (i = 0; i < send_bufs; i++) { - int iov_len, ip_len; + for (i = 0, mh = tcp_l2_mh_tap; i < send_bufs; i++, mh++) { + int ip_len; if (i == send_bufs - 1) plen = last_len; @@ -1673,7 +1680,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, b->th.source = htons(conn->sock_port); b->th.dest = htons(conn->tap_port); - b->th.seq = htonl(conn->seq_to_tap); + b->th.seq = htonl(seq_to_tap); b->th.ack_seq = htonl(conn->seq_ack_to_tap); if (conn->no_snd_wnd) { @@ -1696,6 +1703,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, } b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len); + + mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i]; } else { struct tcp6_l2_buf_t *b = &tcp6_l2_buf[i]; uint32_t flow = conn->seq_init_to_tap; @@ -1713,7 +1722,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, b->th.source = htons(conn->sock_port); b->th.dest = htons(conn->tap_port); - b->th.seq = htonl(conn->seq_to_tap); + b->th.seq = htonl(seq_to_tap); b->th.ack_seq = htonl(conn->seq_ack_to_tap); if (conn->no_snd_wnd) { @@ -1741,32 +1750,45 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, } b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len); - } - iov_len = sizeof(uint32_t) + sizeof(struct ethhdr) + ip_len; - - /* Switch to a new message if this one is too long for qemu. */ - if (msg_len && msg_len + iov_len > SHRT_MAX) { - mh++; - mh->msg_hdr.msg_iovlen = 0; - msg_len = 0; - if (v4) - mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i]; - else - mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i]; + mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i]; } - mh->msg_hdr.msg_iovlen++; - msg_len += iov_len; - conn->seq_to_tap += plen; + seq_to_tap += plen; } if (c->mode == MODE_PASTA) goto out; - sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1, - MSG_NOSIGNAL | MSG_DONTWAIT); - pcapmm(tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1); + ret = sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap, + MSG_NOSIGNAL | MSG_DONTWAIT); + if (ret <= 0) + goto out; + + conn->seq_to_tap += conn->mss_guest * (ret - 1) + last_len; + + /* sendmmsg() indicates how many messages were sent at least partially. + * Kernel commit 3023898b7d4a ("sock: fix sendmmsg for partial sendmsg") + * gives us the guarantee that at most one message, namely the last sent + * one, might have been sent partially. Check how many bytes of that + * message were sent, and re-send any missing bytes with a blocking + * sendmsg(), otherwise qemu will fail to parse any subsequent message. + */ + mh = &tcp_l2_mh_tap[ret - 1]; + if (mh->msg_len < mh->msg_hdr.msg_iov->iov_len) { + uint8_t **iov_base = (uint8_t **)&mh->msg_hdr.msg_iov->iov_base; + int part_sent = mh->msg_len; + + mh->msg_hdr.msg_iov->iov_len -= part_sent; + *iov_base += part_sent; + + sendmsg(c->fd_tap, &mh->msg_hdr, MSG_NOSIGNAL); + + mh->msg_hdr.msg_iov->iov_len += part_sent; + *iov_base -= part_sent; + } + + pcapmm(tcp_l2_mh_tap, ret); goto out; @@ -2027,10 +2049,15 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, if (conn->mss_guest < 0) conn->mss_guest = MSS_DEFAULT; - /* Don't upset qemu */ - if (c->mode == MODE_PASST && c->v6 && - conn->mss_guest > SHRT_MAX) - conn->mss_guest = SHRT_MAX; + if (c->mode == MODE_PASST) { + /* Don't upset qemu */ + conn->mss_guest = MIN(USHRT_MAX - + sizeof(uint32_t) - + sizeof(struct ethhdr) - + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr), + conn->mss_guest); + } ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); if (ws > MAX_WS) { |