aboutgitcodebugslistschat
diff options
context:
space:
mode:
author Stefano Brivio <sbrivio@redhat.com> 2025-12-04 06:43:16 +0100
committer Stefano Brivio <sbrivio@redhat.com> 2025-12-08 09:15:36 +0100
commit 9139e60fd455fafb753c838e554732aed5ecbcd3 (patch)
tree 6ab076ee3d75cc812abd457e4d0e23bcb8802369
parent 28f413d0332c923f1a4a7a05359d90116cbcb4a3 (diff)
download passt-9139e60fd455fafb753c838e554732aed5ecbcd3.tar
passt-9139e60fd455fafb753c838e554732aed5ecbcd3.tar.gz
passt-9139e60fd455fafb753c838e554732aed5ecbcd3.tar.bz2
passt-9139e60fd455fafb753c838e554732aed5ecbcd3.tar.lz
passt-9139e60fd455fafb753c838e554732aed5ecbcd3.tar.xz
passt-9139e60fd455fafb753c838e554732aed5ecbcd3.tar.zst
passt-9139e60fd455fafb753c838e554732aed5ecbcd3.zip
tcp: Acknowledge everything if it looks like bulk traffic, not interactive
...instead of checking if the current sending buffer is less than SNDBUF_SMALL, because this isn't simply an optimisation to coalesce ACK segments: we rely on having enough data at once from the sender to make the buffer grow by means of TCP buffer size tuning implemented in the Linux kernel. This is important if we're trying to maximise throughput, but not desirable for interactive traffic, where we want to be as transparent as possible and avoid introducing unnecessary latency. Use the tcpi_delivery_rate field reported by the Linux kernel, if available, to calculate the current bandwidth-delay product: if it's significantly smaller than the available sending buffer, conclude that we're not bandwidth-bound and this is likely to be interactive traffic, so acknowledge data only as it's acknowledged by the peer. Conversely, if the bandwidth-delay product is comparable to the size of the sending buffer (more than 5%), we're probably bandwidth-bound or... bound to be: acknowledge everything in that case. Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
-rw-r--r-- tcp.c 45
1 files changed, 33 insertions, 12 deletions
diff --git a/tcp.c b/tcp.c
index d91db22..4080a1e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -353,6 +353,9 @@ enum {
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
+/* Ratio of buffer to bandwidth * delay product implying interactive traffic */
+#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. < 5% of buffer) */
+
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define CONN_IS_CLOSING(conn) \
@@ -426,11 +429,13 @@ socklen_t tcp_info_size;
sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
-#define snd_wnd_cap tcp_info_cap(snd_wnd)
+#define snd_wnd_cap tcp_info_cap(snd_wnd)
/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
-#define bytes_acked_cap tcp_info_cap(bytes_acked)
+#define bytes_acked_cap tcp_info_cap(bytes_acked)
/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
-#define min_rtt_cap tcp_info_cap(min_rtt)
+#define min_rtt_cap tcp_info_cap(min_rtt)
+/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */
+#define delivery_rate_cap tcp_info_cap(delivery_rate)
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@@ -1051,6 +1056,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
socklen_t sl = sizeof(*tinfo);
struct tcp_info_linux tinfo_new;
uint32_t new_wnd_to_tap = prev_wnd_to_tap;
+ bool ack_everything = true;
int s = conn->sock;
/* At this point we could ack all the data we've accepted for forwarding
@@ -1060,7 +1066,8 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
* control behaviour.
*
* For it to be possible and worth it we need:
- * - The TCP_INFO Linux extension which gives us the peer acked bytes
+ * - The TCP_INFO Linux extensions which give us the peer acked bytes
+ * and the delivery rate (outbound bandwidth at receiver)
* - Not to be told not to (force_seq)
* - Not half-closed in the peer->guest direction
* With no data coming from the peer, we might not get events which
@@ -1070,19 +1077,36 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
* Data goes from socket to socket, with nothing meaningfully "in
* flight".
* - Not a pseudo-local connection (e.g. to a VM on the same host)
- * - Large enough send buffer
- * In these cases, there's not enough in flight to bother.
+ * If it is, there's not enough in flight to bother.
+ * - Sending buffer significantly larger than bandwidth * delay product
+ * Meaning we're not bandwidth-bound and this is likely to be
+ * interactive traffic where we want to preserve transparent
+ * connection behaviour and latency.
+ *
+ * Otherwise, we probably want to maximise throughput, which needs
+ * sending buffer auto-tuning, triggered in turn by filling up the
+ * outbound socket queue.
*/
- if (bytes_acked_cap && !force_seq &&
+ if (bytes_acked_cap && delivery_rate_cap && !force_seq &&
!CONN_IS_CLOSING(conn) &&
- !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn) &&
- (unsigned)SNDBUF_GET(conn) >= SNDBUF_SMALL) {
+ !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) {
if (!tinfo) {
tinfo = &tinfo_new;
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
return 0;
}
+ if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt *
+ tinfo->tcpi_delivery_rate /
+ 1000 / 1000 *
+ SNDBUF_TO_BW_DELAY_INTERACTIVE)
+ ack_everything = false;
+ }
+
+ if (ack_everything) {
+ /* Fall back to acknowledging everything we got */
+ conn->seq_ack_to_tap = conn->seq_from_tap;
+ } else {
/* This trips a cppcheck bug in some versions, including
* cppcheck 2.18.3.
* https://sourceforge.net/p/cppcheck/discussion/general/thread/fecde59085/
@@ -1090,9 +1114,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
/* cppcheck-suppress [uninitvar,unmatchedSuppression] */
conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
conn->seq_init_from_tap;
- } else {
- /* Fall back to acknowledging everything we got */
- conn->seq_ack_to_tap = conn->seq_from_tap;
}
/* It's occasionally possible for us to go from using the fallback above