aboutgitcodebugslistschat
diff options
context:
space:
mode:
authorStefano Brivio <sbrivio@redhat.com>2025-12-03 20:04:21 +0100
committerStefano Brivio <sbrivio@redhat.com>2025-12-08 09:15:36 +0100
commit000601ba86da0d876fc91e0813a1e752540666f1 (patch)
treeb82da8e30fec32e1eda175ae6a232de9d02ac999
parent920a479de40b58a81178e5d6e96c0eed30b992d5 (diff)
downloadpasst-000601ba86da0d876fc91e0813a1e752540666f1.tar
passt-000601ba86da0d876fc91e0813a1e752540666f1.tar.gz
passt-000601ba86da0d876fc91e0813a1e752540666f1.tar.bz2
passt-000601ba86da0d876fc91e0813a1e752540666f1.tar.lz
passt-000601ba86da0d876fc91e0813a1e752540666f1.tar.xz
passt-000601ba86da0d876fc91e0813a1e752540666f1.tar.zst
passt-000601ba86da0d876fc91e0813a1e752540666f1.zip
tcp: Adaptive interval based on RTT for socket-side acknowledgement checks
A fixed 10 ms ACK_INTERVAL timer value served us relatively well until the previous change, because we would generally cause retransmissions for non-local outbound transfers with relatively high (> 100 Mbps) bandwidth and non-local but low (< 5 ms) RTT. Now that retransmissions are less frequent, we don't have a proper trigger to check for acknowledged bytes on the socket, and will generally block the sender for a significant amount of time while we could acknowledge more data, instead. Store the RTT reported by the kernel using an approximation (exponent), to keep flow storage size within two (typical) cachelines. Check for socket updates when half of this time elapses: it should be a good indication of the one-way delay we're interested in (peer to us). Representable values are between 100 us and 3.2768 s, and any value outside this range is clamped to these bounds. This choice appears to be a good trade-off between additional overhead and throughput. This mechanism partially overlaps with the "low RTT" destinations, which we use to infer that a socket is connected to an endpoint on the same machine (while possibly in a different namespace) if the RTT is reported as 10 us or less. This change doesn't, however, conflict with it: we are reading TCP_INFO parameters for local connections anyway, so we can always store the RTT approximation opportunistically. Then, if the RTT is "low", we don't really need a timer to acknowledge data as we'll always acknowledge everything to the sender right away. However, we have limited space in the array where we store addresses of local destinations, so the low RTT property of a connection might toggle frequently. Because of this, it's actually helpful to always have the RTT approximation stored. This could probably benefit from a future rework, though, introducing a more integrated approach between these two mechanisms. Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
-rw-r--r--tcp.c31
-rw-r--r--tcp_conn.h9
-rw-r--r--util.c14
-rw-r--r--util.h1
4 files changed, 48 insertions, 7 deletions
diff --git a/tcp.c b/tcp.c
index 28d3304..4a886ae 100644
--- a/tcp.c
+++ b/tcp.c
@@ -202,9 +202,13 @@
* - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
* either side, the connection is reset
*
- * - ACK_INTERVAL elapsed after data segment received from tap without having
+ * - RTT / 2 elapsed after data segment received from tap without having
* sent an ACK segment, or zero-sized window advertised to tap/guest (flag
- * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
+ * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent.
+ *
+ * RTT, here, is an approximation of the RTT value reported by the kernel via
+ * TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to
+ * RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly.
*
*
* Summary of data flows (with ESTABLISHED event)
@@ -341,7 +345,6 @@ enum {
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
-#define ACK_INTERVAL 10 /* ms */
#define RTO_INIT 1 /* s, RFC 6298 */
#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */
#define FIN_TIMEOUT 60
@@ -593,7 +596,9 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
}
if (conn->flags & ACK_TO_TAP_DUE) {
- it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
+ it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000);
+ it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) *
+ 1000;
} else if (conn->flags & ACK_FROM_TAP_DUE) {
int exp = conn->retries, timeout = RTO_INIT;
if (!(conn->events & ESTABLISHED))
@@ -608,9 +613,17 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
it.it_value.tv_sec = ACT_TIMEOUT;
}
- flow_dbg(conn, "timer expires in %llu.%03llus",
- (unsigned long long)it.it_value.tv_sec,
- (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+	if (conn->flags & ACK_TO_TAP_DUE) {
+		/* Print as milliseconds: whole ms, then remaining us */
+		flow_trace(conn, "timer expires in %llu.%03llums",
+			   (unsigned long long)it.it_value.tv_sec * 1000 +
+			   (unsigned long long)it.it_value.tv_nsec / 1000 / 1000,
+			   (unsigned long long)it.it_value.tv_nsec / 1000 % 1000);
+	} else {
+		flow_dbg(conn, "timer expires in %llu.%03llus",
+			 (unsigned long long)it.it_value.tv_sec,
+			 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+	}
if (timerfd_settime(conn->timer, 0, &it, NULL))
flow_perror(conn, "failed to set timer");
@@ -1144,6 +1157,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
conn_flag(c, conn, ACK_TO_TAP_DUE);
out:
+ /* Opportunistically store RTT approximation on valid TCP_INFO data */
+ if (tinfo)
+ RTT_SET(conn, tinfo->tcpi_rtt);
+
return new_wnd_to_tap != prev_wnd_to_tap ||
conn->seq_ack_to_tap != prev_ack_to_tap;
}
diff --git a/tcp_conn.h b/tcp_conn.h
index e36910c..9c6ff9e 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -49,6 +49,15 @@ struct tcp_tap_conn {
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
+#define RTT_EXP_BITS 4
+ unsigned int rtt_exp :RTT_EXP_BITS;
+#define RTT_EXP_MAX MAX_FROM_BITS(RTT_EXP_BITS)
+#define RTT_STORE_MIN 100 /* us, minimum representable */
+#define RTT_STORE_MAX ((long)(RTT_STORE_MIN << RTT_EXP_MAX))
+#define RTT_SET(conn, rtt) \
+ (conn->rtt_exp = MIN(RTT_EXP_MAX, ilog2(MAX(1, rtt / RTT_STORE_MIN))))
+#define RTT_GET(conn) (RTT_STORE_MIN << conn->rtt_exp)
+
int sock :FD_REF_BITS;
uint8_t events;
diff --git a/util.c b/util.c
index 2232a24..bfeb619 100644
--- a/util.c
+++ b/util.c
@@ -614,6 +614,9 @@ int __daemon(int pidfile_fd, int devnull_fd)
* fls() - Find last (most significant) bit set in word
* @x: Word
*
+ * Note: unlike ffs() and other implementations of fls(), notably the one from
+ * the Linux kernel, the starting position is 0 and not 1, that is, fls(1) = 0.
+ *
* Return: position of most significant bit set, starting from 0, -1 if none
*/
int fls(unsigned long x)
@@ -630,6 +633,17 @@ int fls(unsigned long x)
}
/**
+ * ilog2() - Integral part (floor) of binary logarithm (logarithm to the base 2)
+ * @x: Argument
+ *
+ * Implemented as a direct call to fls(): the result is whatever fls() returns
+ * for @x.
+ *
+ * Return: integral part of binary logarithm of @x, -1 if undefined (if @x is 0)
+ */
+int ilog2(unsigned long x)
+{
+ return fls(x);
+}
+
+/**
* write_file() - Replace contents of file with a string
* @path: File to write
* @buf: String to write
diff --git a/util.h b/util.h
index 744880b..f7a941f 100644
--- a/util.h
+++ b/util.h
@@ -233,6 +233,7 @@ int output_file_open(const char *path, int flags);
void pidfile_write(int fd, pid_t pid);
int __daemon(int pidfile_fd, int devnull_fd);
int fls(unsigned long x);
+int ilog2(unsigned long x);
int write_file(const char *path, const char *buf);
intmax_t read_file_integer(const char *path, intmax_t fallback);
int write_all_buf(int fd, const void *buf, size_t len);