aboutgitcodebugslistschat
path: root/tcp.c
diff options
context:
space:
mode:
authorDavid Gibson <david@gibson.dropbear.id.au>2024-09-18 11:53:05 +1000
committerStefano Brivio <sbrivio@redhat.com>2024-09-18 17:14:47 +0200
commit4aff6f93923327cb875ceacf12ef0ffc2e613174 (patch)
tree9142414b32751a7d2027a2a2efe1fe0e02009006 /tcp.c
parent7d8804beb8ecbd07b51dbbeaf14289d37f4f8107 (diff)
downloadpasst-4aff6f93923327cb875ceacf12ef0ffc2e613174.tar
passt-4aff6f93923327cb875ceacf12ef0ffc2e613174.tar.gz
passt-4aff6f93923327cb875ceacf12ef0ffc2e613174.tar.bz2
passt-4aff6f93923327cb875ceacf12ef0ffc2e613174.tar.lz
passt-4aff6f93923327cb875ceacf12ef0ffc2e613174.tar.xz
passt-4aff6f93923327cb875ceacf12ef0ffc2e613174.tar.zst
passt-4aff6f93923327cb875ceacf12ef0ffc2e613174.zip
tcp: Clean up tcpi_snd_wnd probing
When available, we want to retrieve our socket peer's advertised window and forward that to the guest. That information has been available from the kernel via the TCP_INFO getsockopt() since kernel commit 8f7baad7f035. Currently our probing for this is a bit odd. The HAS_SND_WND define determines if our headers include the tcpi_snd_wnd field, but that doesn't necessarily mean the running kernel supports it. Currently we start by assuming it's _not_ available, but mark it as available if we ever see a non-zero value in the field. This is a bit hit and miss in two ways: * Zero is a perfectly possible window for the peer to report, so we can get false negatives * We're reading TCP_INFO into a local variable, which might not be zero initialised, so if the kernel _doesn't_ write it, it could have non-zero garbage, giving us false positives. We can use a more direct way of probing for this: getsockopt() reports the length of the information retrieved. So, check whether that's long enough to include the field. This lets us probe the availability of the field once and for all during initialisation. That in turn allows ctx to become a const pointer in tcp_prepare_flags(), which cascades through many other functions. We also move the flag for the probe result from the ctx structure to a global, to match peek_offset_cap. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Diffstat (limited to 'tcp.c')
-rw-r--r--tcp.c93
1 files changed, 67 insertions, 26 deletions
diff --git a/tcp.c b/tcp.c
index 14b48a8..cba3f3b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -308,11 +308,6 @@
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
-#ifdef HAS_SND_WND
-# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd)
-#else
-# define KERNEL_REPORTS_SND_WND(c) (0 && (c))
-#endif
#define ACK_INTERVAL 10 /* ms */
#define SYN_TIMEOUT 10 /* s */
@@ -370,6 +365,14 @@ char tcp_buf_discard [MAX_WINDOW];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
+#ifdef HAS_SND_WND
+/* Does the kernel report sending window in TCP_INFO (kernel commit
+ * 8f7baad7f035)
+ */
+bool snd_wnd_cap;
+#else
+#define snd_wnd_cap (false)
+#endif
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@@ -1052,7 +1055,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
}
#endif /* !HAS_BYTES_ACKED */
- if (!KERNEL_REPORTS_SND_WND(c)) {
+ if (!snd_wnd_cap) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
@@ -1136,7 +1139,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
* 0 if there is no flag to send
* 1 otherwise
*/
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, char *data,
size_t *optlen)
{
@@ -1153,11 +1156,6 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
return -ECONNRESET;
}
-#ifdef HAS_SND_WND
- if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
- c->tcp.kernel_snd_wnd = 1;
-#endif
-
if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo);
@@ -1235,7 +1233,8 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
*
* Return: negative error code on connection reset, 0 otherwise
*/
-static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
+ int flags)
{
return tcp_buf_send_flag(c, conn, flags);
}
@@ -1245,7 +1244,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @c: Execution context
* @conn: Connection pointer
*/
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
{
if (conn->events == CLOSED)
return;
@@ -1463,7 +1462,7 @@ static void tcp_bind_outbound(const struct ctx *c,
* @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
*/
-static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
+static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
const void *saddr, const void *daddr,
const struct tcphdr *th, const char *opts,
size_t optlen, const struct timespec *now)
@@ -1628,7 +1627,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
*
* #syscalls recvmsg
*/
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
return tcp_buf_data_from_sock(c, conn);
}
@@ -1644,8 +1643,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
*
* Return: count of consumed packets
*/
-static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
- const struct pool *p, int idx)
+static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+ const struct pool *p, int idx)
{
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
@@ -1842,7 +1841,8 @@ out:
* @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
*/
-static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_conn_from_sock_finish(const struct ctx *c,
+ struct tcp_tap_conn *conn,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
@@ -1885,7 +1885,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
*
* Return: count of consumed packets
*/
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now)
{
@@ -2023,7 +2023,7 @@ reset:
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
{
socklen_t sl;
int so;
@@ -2049,8 +2049,8 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
* @sa: Peer socket address (from accept())
* @now: Current timestamp
*/
-static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
- const struct timespec *now)
+static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
+ int s, const struct timespec *now)
{
struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
uint64_t hash;
@@ -2081,7 +2081,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
* @ref: epoll reference of listening socket
* @now: Current timestamp
*/
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
const struct flowside *ini;
@@ -2146,7 +2146,7 @@ cancel:
*
* #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
*/
-void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
{
struct itimerspec check_armed = { { 0 }, { 0 } };
struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
@@ -2210,7 +2210,8 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
* @ref: epoll reference
* @events: epoll events bitmap
*/
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events)
{
struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);
@@ -2494,6 +2495,40 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
return ret;
}
+#ifdef HAS_SND_WND
+/**
+ * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd
+ *
+ * Return: true if supported, false otherwise
+ */
+static bool tcp_probe_snd_wnd_cap(void)
+{
+ struct tcp_info tinfo;
+ socklen_t sl = sizeof(tinfo);
+ int s;
+
+ s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ if (s < 0) {
+ warn_perror("Temporary TCP socket creation failed");
+ return false;
+ }
+
+ if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+ warn_perror("Failed to get TCP_INFO on temporary socket");
+ close(s);
+ return false;
+ }
+
+ close(s);
+
+ if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) +
+ sizeof(tinfo.tcpi_snd_wnd)))
+ return false;
+
+ return true;
+}
+#endif /* HAS_SND_WND */
+
/**
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
* @c: Execution context
@@ -2527,6 +2562,12 @@ int tcp_init(struct ctx *c)
(!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+#ifdef HAS_SND_WND
+ snd_wnd_cap = tcp_probe_snd_wnd_cap();
+#endif
+ debug("TCP_INFO tcpi_snd_wnd field%ssupported",
+ snd_wnd_cap ? " " : " not ");
+
return 0;
}