aboutgitcodebugslistschat
diff options
context:
space:
mode:
authorXun Gu <xugu@redhat.com>2025-09-08 20:04:39 +0900
committerStefano Brivio <sbrivio@redhat.com>2025-09-11 17:09:03 +0200
commitcd2e8863d4d3e98e81e50b3e713aa9c64a6cd023 (patch)
treeece518e7c5766065f4963e0499527215b97adb16
parent8d2f8c4d0fb58d6b2011e614bc7d7ff9dab406b3 (diff)
downloadpasst-cd2e8863d4d3e98e81e50b3e713aa9c64a6cd023.tar
passt-cd2e8863d4d3e98e81e50b3e713aa9c64a6cd023.tar.gz
passt-cd2e8863d4d3e98e81e50b3e713aa9c64a6cd023.tar.bz2
passt-cd2e8863d4d3e98e81e50b3e713aa9c64a6cd023.tar.lz
passt-cd2e8863d4d3e98e81e50b3e713aa9c64a6cd023.tar.xz
passt-cd2e8863d4d3e98e81e50b3e713aa9c64a6cd023.tar.zst
passt-cd2e8863d4d3e98e81e50b3e713aa9c64a6cd023.zip
Reduce tcp_buf_discard size
On kernels without SO_PEEK_OFF, a 16MB static buffer is used to discard sent data. This patch reduces the buffer to 1MB. Larger discards are now handled by using multiple iovec entries pointing to the same 1MB buffer.

Signed-off-by: Xun Gu <xugu@redhat.com>
[sbrivio: Drop stray whitespace after BUF_DISCARD_SIZE define]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
-rw-r--r-- tcp.c 66
-rw-r--r-- tcp_buf.c 18
-rw-r--r-- tcp_internal.h 7
-rw-r--r-- tcp_vu.c 17
4 files changed, 82 insertions, 26 deletions
diff --git a/tcp.c b/tcp.c
index b35f220..48b1ef2 100644
--- a/tcp.c
+++ b/tcp.c
@@ -399,7 +399,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
-char tcp_buf_discard [MAX_WINDOW];
+char tcp_buf_discard [BUF_DISCARD_SIZE];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
@@ -3844,3 +3844,67 @@ fail:
return 0;
}
+
+/**
+ * tcp_prepare_iov() - Set msg_iov/msg_iovlen, adding discard slots if needed
+ * @msg: Message header whose msg_iov and msg_iovlen are set
+ * @iov: DISCARD_IOV_NUM discard slots followed by TCP payload slots
+ * @already_sent: Sent but unacknowledged bytes, discarded if !peek_offset_cap
+ * @payload_iov_cnt: Number of TCP payload iovec entries
+ *
+ * Return: 0 on success, -1 if already_sent doesn't fit in the discard slots
+ */
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+ uint32_t already_sent, int payload_iov_cnt)
+{
+ /*
+ * IOV layout: discard slots are used from the end, last may be shorter
+ * |- tcp_buf_discard -|---------- TCP data slots ------------|
+ *
+ * with discarded data:
+ * |------ddddddddddddd|ttttttttttttt-------------------------|
+ *        ^
+ *        |
+ *        msg_iov
+ *
+ * without discarded data:
+ * |-------------------|ttttttttttttt-------------------------|
+ *                      ^
+ *                      |
+ *                      msg_iov
+ * d: discard data
+ * t: TCP data
+ */
+ if (peek_offset_cap) {
+ msg->msg_iov = iov + DISCARD_IOV_NUM;
+ msg->msg_iovlen = payload_iov_cnt;
+ } else {
+ int discard_cnt, discard_iov_rem;
+ struct iovec *iov_start;
+ int i;
+
+ discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE);
+ if (discard_cnt > DISCARD_IOV_NUM) {
+ debug("Failed to discard %u already sent bytes",
+ already_sent);
+ return -1;
+ }
+
+ discard_iov_rem = already_sent % BUF_DISCARD_SIZE;
+
+ iov_start = iov + (DISCARD_IOV_NUM - discard_cnt);
+
+ /* Contents are discarded, so every slot can alias one buffer */
+ for (i = 0; i < discard_cnt; i++) {
+ iov_start[i].iov_base = tcp_buf_discard;
+ iov_start[i].iov_len = BUF_DISCARD_SIZE;
+ }
+ if (discard_iov_rem)
+ iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem;
+
+ msg->msg_iov = iov_start;
+ msg->msg_iovlen = discard_cnt + payload_iov_cnt;
+ }
+
+ return 0;
+}
diff --git a/tcp_buf.c b/tcp_buf.c
index b02d986..49bddbe 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -60,7 +60,7 @@ static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp_payload_used;
/* recvmsg()/sendmsg() data for tap */
-static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
+static struct iovec iov_sock [TCP_FRAMES_MEM + DISCARD_IOV_NUM];
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
@@ -326,15 +326,9 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
iov_rem = (wnd_scaled - already_sent) % mss;
}
- /* Prepare iov according to kernel capability */
- if (!peek_offset_cap) {
- mh_sock.msg_iov = iov_sock;
- iov_sock[0].iov_base = tcp_buf_discard;
- iov_sock[0].iov_len = already_sent;
- mh_sock.msg_iovlen = fill_bufs + 1;
- } else {
- mh_sock.msg_iov = &iov_sock[1];
- mh_sock.msg_iovlen = fill_bufs;
+ if (tcp_prepare_iov(&mh_sock, iov_sock, already_sent, fill_bufs)) {
+ tcp_rst(c, conn);
+ return -1;
}
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
@@ -344,12 +338,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
tcp_payload_used = 0;
}
- for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
+ for (i = 0, iov = iov_sock + DISCARD_IOV_NUM; i < fill_bufs; i++, iov++) {
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
- iov_sock[fill_bufs].iov_len = iov_rem;
+ iov_sock[fill_bufs + DISCARD_IOV_NUM - 1].iov_len = iov_rem;
/* Receive into buffers, don't dequeue until acknowledged by guest. */
do
diff --git a/tcp_internal.h b/tcp_internal.h
index 65144a8..5cb6cba 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -9,6 +9,9 @@
#define MAX_WS 8
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
+#define BUF_DISCARD_SIZE (1 << 20)
+#define DISCARD_IOV_NUM DIV_ROUND_UP(MAX_WINDOW, BUF_DISCARD_SIZE)
+
#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
sizeof(struct tcphdr) - \
sizeof(struct iphdr), \
@@ -143,7 +146,7 @@ struct tcp_syn_opts {
.ws = TCP_OPT_WS(ws_), \
})
-extern char tcp_buf_discard [MAX_WINDOW];
+extern char tcp_buf_discard [BUF_DISCARD_SIZE];
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag);
@@ -184,4 +187,6 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
size_t *optlen);
int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+ uint32_t already_sent, int payload_iov_cnt);
#endif /* TCP_INTERNAL_H */
diff --git a/tcp_vu.c b/tcp_vu.c
index c6b5b91..ebd3a1e 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -35,7 +35,7 @@
#include "vu_common.h"
#include <time.h>
-static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
+static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM];
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
static int head[VIRTQUEUE_MAX_SIZE + 1];
@@ -200,7 +200,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
hdrlen = tcp_vu_hdrlen(v6);
- vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
+ vu_init_elem(elem, &iov_vu[DISCARD_IOV_NUM], VIRTQUEUE_MAX_SIZE);
elem_cnt = 0;
*head_cnt = 0;
@@ -228,16 +228,9 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
elem_cnt += cnt;
}
- if (peek_offset_cap) {
- mh_sock.msg_iov = iov_vu + 1;
- mh_sock.msg_iovlen = elem_cnt;
- } else {
- iov_vu[0].iov_base = tcp_buf_discard;
- iov_vu[0].iov_len = already_sent;
-
- mh_sock.msg_iov = iov_vu;
- mh_sock.msg_iovlen = elem_cnt + 1;
- }
+ if (tcp_prepare_iov(&mh_sock, iov_vu, already_sent, elem_cnt))
+ /* Expect caller to do a TCP reset */
+ return -1;
do
ret = recvmsg(s, &mh_sock, MSG_PEEK);