aboutgitcodebugslistschat
diff options
context:
space:
mode:
authorStefano Brivio <sbrivio@redhat.com>2025-11-03 11:16:11 +0100
committerStefano Brivio <sbrivio@redhat.com>2025-12-08 04:47:22 +0100
commit68b0a36d6a40aece53df9d2a0a89addd07eda515 (patch)
tree6d581684967a8077c7a9c8e684ea31cd9e762e0b
parentb9cd36fa1f306ed2a1cc5c1a0c38ce05202afaaa (diff)
downloadpasst-68b0a36d6a40aece53df9d2a0a89addd07eda515.tar
passt-68b0a36d6a40aece53df9d2a0a89addd07eda515.tar.gz
passt-68b0a36d6a40aece53df9d2a0a89addd07eda515.tar.bz2
passt-68b0a36d6a40aece53df9d2a0a89addd07eda515.tar.lz
passt-68b0a36d6a40aece53df9d2a0a89addd07eda515.tar.xz
passt-68b0a36d6a40aece53df9d2a0a89addd07eda515.tar.zst
passt-68b0a36d6a40aece53df9d2a0a89addd07eda515.zip
tcp, udp: Pad batched frames to 60 bytes (802.3 minimum) in non-vhost-user modes
Add a further iovec frame part after the payload, TCP_IOV_ETH_PAD for TCP and UDP_IOV_ETH_PAD for UDP, and make it point to a zero-filled buffer. If a frame is shorter than the minimum frame length given by 802.3, that is, 60 bytes altogether, send out as much of that buffer as needed to reach it.

The frames we might need to pad are IPv4 only (the IPv6 header is larger, so IPv6 frames always exceed the minimum), and are typically TCP ACK segments, but can also be small data segments or datagrams.

Link: https://bugs.passt.top/show_bug.cgi?id=166
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
-rw-r--r--tcp.c2
-rw-r--r--tcp_buf.c23
-rw-r--r--tcp_internal.h2
-rw-r--r--udp.c33
-rw-r--r--util.c3
-rw-r--r--util.h3
6 files changed, 62 insertions, 4 deletions
diff --git a/tcp.c b/tcp.c
index 9827302..bb661ee 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1019,7 +1019,7 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
else
tcp_update_csum(psum, th, payload);
- tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
+ tap_hdr_update(taph, MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN));
}
/**
diff --git a/tcp_buf.c b/tcp_buf.c
index 2058225..5d419d3 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -96,6 +96,7 @@ void tcp_sock_iov_init(const struct ctx *c)
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
+ iov[TCP_IOV_ETH_PAD].iov_base = eth_pad;
}
}
@@ -145,6 +146,22 @@ void tcp_payload_flush(const struct ctx *c)
}
/**
+ * tcp_l2_buf_pad() - Set padding length so that frame reaches ETH_ZLEN bytes
+ * @iov: iovec of frame parts we're about to send, padding length is updated
+ */
+static void tcp_l2_buf_pad(struct iovec *iov)
+{
+ size_t l2len = iov[TCP_IOV_ETH].iov_len +
+ iov[TCP_IOV_IP].iov_len +
+ iov[TCP_IOV_PAYLOAD].iov_len;
+
+ if (l2len < ETH_ZLEN)
+ iov[TCP_IOV_ETH_PAD].iov_len = ETH_ZLEN - l2len;
+ else
+ iov[TCP_IOV_ETH_PAD].iov_len = 0;
+}
+
+/**
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
* @c: Execution context
* @conn: Connection pointer
@@ -212,6 +229,8 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false);
+ tcp_l2_buf_pad(iov);
+
if (flags & DUP_ACK) {
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used];
tcp_frame_conns[tcp_payload_used++] = conn;
@@ -223,6 +242,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+ dup_iov[TCP_IOV_ETH_PAD].iov_len = iov[TCP_IOV_ETH_PAD].iov_len;
}
if (tcp_payload_used > TCP_FRAMES_MEM - 2)
@@ -270,6 +290,9 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
payload->th.psh = push;
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
tcp_l2_buf_fill_headers(c, conn, iov, check, seq, false);
+
+ tcp_l2_buf_pad(iov);
+
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
}
diff --git a/tcp_internal.h b/tcp_internal.h
index 19e8922..5f8fb35 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -63,6 +63,7 @@
* @TCP_IOV_ETH Ethernet header
* @TCP_IOV_IP IP (v4/v6) header
* @TCP_IOV_PAYLOAD IP payload (TCP header + data)
+ * @TCP_IOV_ETH_PAD Ethernet (802.3) padding to 60 bytes
* @TCP_NUM_IOVS the number of entries in the iovec array
*/
enum tcp_iov_parts {
@@ -70,6 +71,7 @@ enum tcp_iov_parts {
TCP_IOV_ETH = 1,
TCP_IOV_IP = 2,
TCP_IOV_PAYLOAD = 3,
+ TCP_IOV_ETH_PAD = 4,
TCP_NUM_IOVS
};
diff --git a/udp.c b/udp.c
index b93c18b..08bec50 100644
--- a/udp.c
+++ b/udp.c
@@ -168,6 +168,7 @@ udp_meta[UDP_MAX_FRAMES];
* @UDP_IOV_ETH Ethernet header
* @UDP_IOV_IP IP (v4/v6) header
* @UDP_IOV_PAYLOAD IP payload (UDP header + data)
+ * @UDP_IOV_ETH_PAD Ethernet (802.3) padding to 60 bytes
* @UDP_NUM_IOVS the number of entries in the iovec array
*/
enum udp_iov_idx {
@@ -175,6 +176,7 @@ enum udp_iov_idx {
UDP_IOV_ETH,
UDP_IOV_IP,
UDP_IOV_PAYLOAD,
+ UDP_IOV_ETH_PAD,
UDP_NUM_IOVS,
};
@@ -239,6 +241,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp_eth_hdr[i]);
tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
tiov[UDP_IOV_PAYLOAD].iov_base = payload;
+ tiov[UDP_IOV_ETH_PAD].iov_base = eth_pad;
mh->msg_iov = siov;
mh->msg_iovlen = 1;
@@ -345,6 +348,22 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
}
/**
+ * udp_tap_pad() - Set padding length so that frame reaches ETH_ZLEN bytes
+ * @iov: iovec of frame parts we're about to send, padding length is updated
+ */
+static void udp_tap_pad(struct iovec *iov)
+{
+ size_t l2len = iov[UDP_IOV_ETH].iov_len +
+ iov[UDP_IOV_IP].iov_len +
+ iov[UDP_IOV_PAYLOAD].iov_len;
+
+ if (l2len < ETH_ZLEN)
+ iov[UDP_IOV_ETH_PAD].iov_len = ETH_ZLEN - l2len;
+ else
+ iov[UDP_IOV_ETH_PAD].iov_len = 0;
+}
+
+/**
* udp_tap_prepare() - Convert one datagram into a tap frame
* @mmh: Receiving mmsghdr array
* @idx: Index of the datagram to prepare
@@ -362,23 +381,31 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
struct ethhdr *eh = (*tap_iov)[UDP_IOV_ETH].iov_base;
struct udp_payload_t *bp = &udp_payload[idx];
struct udp_meta_t *bm = &udp_meta[idx];
- size_t l4len;
+ size_t l4len, l2len;
eth_update_mac(eh, NULL, tap_omac);
if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + ETH_HLEN);
+
+ l2len = MAX(l4len + sizeof(bm->ip6h) + ETH_HLEN, ETH_ZLEN);
+ tap_hdr_update(&bm->taph, l2len);
+
eh->h_proto = htons_constant(ETH_P_IPV6);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
} else {
l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + ETH_HLEN);
+
+ l2len = MAX(l4len + sizeof(bm->ip4h) + ETH_HLEN, ETH_ZLEN);
+ tap_hdr_update(&bm->taph, l2len);
+
eh->h_proto = htons_constant(ETH_P_IP);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h);
}
(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
+
+ udp_tap_pad(*tap_iov);
}
/**
diff --git a/util.c b/util.c
index 4beb7c2..f32c9cb 100644
--- a/util.c
+++ b/util.c
@@ -39,6 +39,9 @@
#include <sys/random.h>
#endif
+/* Zero-filled buffer to pad 802.3 frames, up to 60 (ETH_ZLEN) bytes */
+uint8_t eth_pad[ETH_ZLEN] = { 0 };
+
/**
* sock_l4_() - Create and bind socket to socket address
* @c: Execution context
diff --git a/util.h b/util.h
index 7bf0701..17f5ae0 100644
--- a/util.h
+++ b/util.h
@@ -17,6 +17,7 @@
#include <arpa/inet.h>
#include <unistd.h>
#include <sys/syscall.h>
+#include <net/ethernet.h>
#include "log.h"
@@ -152,6 +153,8 @@ void abort_with_msg(const char *fmt, ...)
#define ntohll(x) (be64toh((x)))
#define htonll(x) (htobe64((x)))
+extern uint8_t eth_pad[ETH_ZLEN];
+
/**
* ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address
* @p: Pointer to the BE value in memory