aboutgitcodebugslistschat
diff options
context:
space:
mode:
-rw-r--r--passt.c4
-rw-r--r--tcp.c2346
-rw-r--r--tcp.h10
-rw-r--r--tcp_splice.c859
-rw-r--r--tcp_splice.h14
-rw-r--r--util.c19
-rw-r--r--util.h4
7 files changed, 1755 insertions, 1501 deletions
diff --git a/passt.c b/passt.c
index 5cd8f3b..6c04266 100644
--- a/passt.c
+++ b/passt.c
@@ -119,12 +119,12 @@ static void post_handler(struct ctx *c, struct timespec *now)
#define CALL_PROTO_HANDLER(c, now, lc, uc) \
do { \
extern void \
- lc ## _defer_handler (struct ctx *c) \
+ lc ## _defer_handler (struct ctx *, struct timespec *) \
__attribute__ ((weak)); \
\
if (!c->no_ ## lc) { \
if (lc ## _defer_handler) \
- lc ## _defer_handler(c); \
+ lc ## _defer_handler(c, now); \
\
if (timespec_diff_ms((now), &c->lc.timer_run) \
>= uc ## _TIMER_INTERVAL) { \
diff --git a/tcp.c b/tcp.c
index 4dc9750..323dee3 100644
--- a/tcp.c
+++ b/tcp.c
@@ -8,7 +8,7 @@
*
* tcp.c - TCP L2-L4 translation state machine
*
- * Copyright (c) 2020-2021 Red Hat GmbH
+ * Copyright (c) 2020-2022 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
@@ -52,7 +52,7 @@
* delegated as much as possible to the TCP implementations of guest and host
* kernel. This is achieved by:
* - avoiding a complete TCP stack reimplementation, with a modified TCP state
- * machine focused on the translation of observed states instead
+ * machine focused on the translation of observed events instead
* - mirroring TCP dynamics as described above and hence avoiding the need for
* segmentation, explicit queueing, and reassembly of segments
* - security:
@@ -98,14 +98,14 @@
* Connection tracking and storage
* -------------------------------
*
- * Connections are tracked by the @tt array of struct tcp_tap_conn, containing
+ * Connections are tracked by the @tc array of struct tcp_conn, containing
* addresses, ports, TCP states and parameters. This is statically allocated and
* indexed by an arbitrary connection number. The array is compacted whenever a
* connection is closed, by remapping the highest connection index in use to the
* one freed up.
*
* References used for the epoll interface report the connection index used for
- * the @tt array.
+ * the @tc array.
*
* IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for
* separate data structures depending on the protocol version.
@@ -127,64 +127,40 @@
* added to the epoll list, with no separate storage.
*
*
- * States and events
+ * Events and states
* -----------------
*
- * These states apply to connected sockets only, listening sockets are always
- * open after initialisation, in LISTEN state. A single state is maintained for
- * both sides of the connection, and some states are omitted as they are already
- * handled by host kernel and guest.
- *
- * - CLOSED no connection
- * No associated events: this is always a final state, new connections
- * directly start from TAP_SYN_SENT or SOCK_SYN_SENT described below.
- *
- * - TAP_SYN_SENT connect() in progress, triggered from tap
- * - connect() completes SYN,ACK to tap > TAP_SYN_RCVD
- * - connect() aborts RST to tap, close socket > CLOSED
- *
- * - SOCK_SYN_SENT new connected socket, SYN sent to tap
- * - SYN,ACK from tap ACK to tap > ESTABLISHED
- * - SYN,ACK timeout RST to tap, close socket > CLOSED
- *
- * - TAP_SYN_RCVD connect() completed, SYN,ACK sent to tap
- * - FIN from tap write shutdown > FIN_WAIT_1
- * - ACK from tap > ESTABLISHED
- * - ACK timeout RST to tap, close socket > CLOSED
- *
- * - ESTABLISHED connection established, ready for data
- * - EPOLLRDHUP read shutdown > ESTABLISHED_SOCK_FIN
- * - FIN from tap write shutdown > FIN_WAIT_1
- * - EPOLLHUP RST to tap, close socket > CLOSED
- * - data timeout read shutdown, FIN to tap >
- * ESTABLISHED_SOCK_FIN_SENT
- *
- * - ESTABLISHED_SOCK_FIN socket closing connection, reading half closed
- * - zero-sized socket read FIN,ACK to tap > ESTABLISHED_SOCK_FIN_SENT
- *
- * - ESTABLISHED_SOCK_FIN_SENT socket closing connection, FIN sent to tap
- * - ACK (for FIN) from tap > CLOSE_WAIT
- * - tap ACK timeout RST to tap, close socket > CLOSED
- *
- * - CLOSE_WAIT socket closing connection, ACK from tap
- * - FIN from tap write shutdown > LAST_ACK
- * - data timeout RST to tap, close socket > CLOSED
- *
- * - LAST_ACK socket started close, tap completed it
- * - any event from socket ACK to tap, close socket > CLOSED
- * - ACK timeout RST to tap, close socket > CLOSED
+ * Instead of tracking connection states using a state machine, connection
+ * events are used to determine state and actions for a given connection. This
+ * makes the implementation simpler as most of the relevant tasks deal with
+ * reactions to events, rather than state-associated actions. For user
+ * convenience, approximate states are mapped in logs from events by
+ * @tcp_state_str.
+ *
+ * The events are:
+ *
+ * - SOCK_ACCEPTED connection accepted from socket, SYN sent to tap/guest
+ *
+ * - TAP_SYN_RCVD tap/guest initiated connection, SYN received
+ *
+ * - TAP_SYN_ACK_SENT SYN, ACK sent to tap/guest, valid for TAP_SYN_RCVD only
+ *
+ * - ESTABLISHED connection established, the following events are valid:
*
- * - FIN_WAIT_1 tap closing connection, FIN sent to socket
- * - EPOLLRDHUP FIN,ACK to tap, shutdown > FIN_WAIT_1_SOCK_FIN
- * - socket timeout RST to tap, close socket > CLOSED
+ * - SOCK_FIN_RCVD FIN (EPOLLRDHUP) received from socket
*
- * - FIN_WAIT_1_SOCK_FIN tap closing connection, FIN received from socket
- * - ACK from tap close socket > CLOSED
- * - tap ACK timeout RST to tap, close socket > CLOSED
+ * - SOCK_FIN_SENT FIN (write shutdown) sent to socket
*
- * - from any state
- * - RST from tap close socket > CLOSED
- * - socket error RST to tap, close socket > CLOSED
+ * - TAP_FIN_RCVD FIN received from tap/guest
+ *
+ * - TAP_FIN_SENT FIN sent to tap/guest
+ *
+ * - TAP_FIN_ACKED ACK to FIN seen from tap/guest
+ *
+ * Setting any event in CONN_STATE_BITS (SOCK_ACCEPTED, TAP_SYN_RCVD,
+ * ESTABLISHED) clears all the other events, as those represent the fundamental
+ * connection states. No events (events == CLOSED) means the connection is
+ * closed.
*
* Connection setup
* ----------------
@@ -201,76 +177,75 @@
* Aging and timeout
* -----------------
*
- * A bitmap of TCP_MAX_CONNS bits indicate the connections subject to timed
- * events based on states:
- * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment
- * from tap expires, connection is reset (RST to tap, socket closed)
- * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from
- * tap expires, connection is reset (RST to tap, socket closed)
- * - TAP_SYN_SENT: connect() is pending, timeout is handled implicitly by
- * connect() timeout, connection will be reset in case
- * - ESTABLISHED, ESTABLISHED_SOCK_FIN: if an ACK segment to tap is pending,
- * bytes acknowledged by socket endpoint are checked every 50ms (one quarter
- * of current TCP_DELACK_MAX on Linux)
- * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a timeout of 3s (TODO: implement
- * requirements from RFC 6298) waiting for an ACK segment from tap expires,
- * data from socket queue is retransmitted starting from the last ACK sequence
- * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a two hours (current
- * TCP_KEEPALIVE_TIME on Linux) timeout waiting for any activity expires,
- * connection is reset (RST to tap, socket closed)
- * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK
- * segment from tap expires, connection is reset (RST to tap, socket closed)
- * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from tap
- * expires, connection is reset (RST to tap, socket closed)
- * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from
- * socet expires, connection is reset (RST to tap, socket closed)
- * - FIN_WAIT_1_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK segment
- * from tap expires, connection is reset (RST to tap, socket closed)
- * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from
- * socket expires, connection is reset (RST to tap, socket closed)
- *
- *
- * Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states)
- * ----------------------------------------------------------
- *
- * @seq_to_tap: next sequence for packets to tap
- * @seq_ack_from_tap: last ACK number received from tap
- * @seq_from_tap: next sequence for packets from tap (not actually sent)
- * @seq_ack_to_tap: last ACK number sent to tap
- *
- * @seq_init_from_tap: initial sequence number from tap
+ * Open connections are checked periodically against a number of timeouts. Those
+ * are:
+ *
+ * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake within
+ * this time, reset the connection
+ *
+ * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
+ * either side, the connection is reset
+ *
+ * - ACK_INTERVAL, or zero-sized window advertised to tap/guest: forcibly check
+ * if an ACK segment can be sent
+ *
+ * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
+ * data, re-send data from the socket and reset sequence to what was
+ * acknowledged. If this persists for longer than LAST_ACK_TIMEOUT, reset the
+ * connection
+ *
+ * - FIN_TIMEOUT, on TAP_FIN_SENT: if no ACK is received for the FIN segment
+ * within this time, the connection is reset
+ *
+ * - FIN_TIMEOUT, on SOCK_FIN_SENT: if no activity is detected on the socket
+ * after sending a FIN segment (write shutdown), reset the connection
+ *
+ * - LAST_ACK_TIMEOUT on SOCK_FIN_SENT *and* SOCK_FIN_RCVD: reset the connection
+ * if no activity was detected on any of the two sides after sending a FIN
+ * segment
+ *
+ *
+ * Summary of data flows (with ESTABLISHED event)
+ * ----------------------------------------------
+ *
+ * @seq_to_tap: next sequence for packets to tap/guest
+ * @seq_ack_from_tap: last ACK number received from tap/guest
+ * @seq_from_tap: next sequence for packets from tap/guest (expected)
+ * @seq_ack_to_tap: last ACK number sent to tap/guest
+ *
+ * @seq_init_from_tap: initial sequence number from tap/guest
+ * @seq_init_to_tap: initial sequence number to tap/guest
*
* @wnd_from_tap: last window size received from tap, scaled
- *
- * - from socket to tap:
+ * @wnd_to_tap: last window size advertised to tap/guest, scaled
+ *
+ * - from socket to tap/guest:
* - on new data from socket:
* - peek into buffer
- * - send data to tap:
+ * - send data to tap/guest:
* - starting at offset (@seq_to_tap - @seq_ack_from_tap)
* - in MSS-sized segments
* - increasing @seq_to_tap at each segment
* - up to window (until @seq_to_tap - @seq_ack_from_tap <= @wnd_from_tap)
- * - mark socket in bitmap for periodic ACK check, set @last_ts_to_tap
- * - on read error, send RST to tap, close socket
- * - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN
- * - on ACK from tap:
- * - set @ts_ack_tap
+ * - on read error, send RST to tap/guest, close socket
+ * - on zero read, send FIN to tap/guest, set TAP_FIN_SENT
+ * - on ACK from tap/guest:
+ * - set @ts_ack_from_tap
* - check if it's the second duplicated ACK
* - consume buffer by difference between new ack_seq and @seq_ack_from_tap
* - update @seq_ack_from_tap from ack_seq in header
* - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and
* resend with steps listed above
* - set TCP_WINDOW_CLAMP from TCP header from tap
- * - on @seq_ack_from_tap == @seq_to_tap, mark in bitmap, umark otherwise
* - periodically:
* - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer
* (TODO: implement requirements from RFC 6298, currently 3s fixed) from
- * @ts_tap_from_ack elapsed, reset @seq_to_tap to @seq_ack_from_tap, and
+ * @ts_ack_from_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and
* resend data with the steps listed above
*
- * - from tap to socket:
- * - on packet from tap:
- * - set @ts_tap_ack
+ * - from tap/guest to socket:
+ * - on packet from tap/guest:
+ * - set @ts_tap_act
* - set TCP_WINDOW_CLAMP from TCP header from tap
* - check seq from header against @seq_from_tap, if data is missing, send
* two ACKs with number @seq_ack_to_tap, discard packet
@@ -279,7 +254,7 @@
* - in ESTABLISHED state, send ACK to tap as soon as we queue to the
* socket. In other states, query socket for TCP_INFO, set
* @seq_ack_to_tap to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
- * send ACK to tap
+ * send ACK to tap/guest
*
*
* PASTA mode
@@ -291,20 +266,7 @@
* section.
*
* For local traffic directed to TCP ports configured for direct mapping between
- * namespaces, the implementation is substantially simpler: packets are directly
- * translated between L4 sockets using a pair of splice() syscalls. These
- * connections are tracked in the @ts array of struct tcp_splice_conn, using
- * these states:
- *
- * - CLOSED: no connection
- * - SPLICE_ACCEPTED: accept() on the listening socket succeeded
- * - SPLICE_CONNECT: connect() issued in the destination namespace
- * - SPLICE_ESTABLISHED: connect() succeeded, packets are transferred
- * - SPLICE_FIN_FROM: FIN (EPOLLRDHUP) seen from originating socket
- * - SPLICE_FIN_TO: FIN (EPOLLRDHUP) seen from connected socket
- * - SPLICE_FIN_BOTH: FIN (EPOLLRDHUP) seen from both sides
- *
- * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
+ * namespaces, see the implementation in tcp_splice.c.
*/
#include <sched.h>
@@ -339,15 +301,13 @@
#include "siphash.h"
#include "pcap.h"
#include "conf.h"
+#include "tcp_splice.h"
#define MAX_TAP_CONNS (128 * 1024)
-#define MAX_SPLICE_CONNS (128 * 1024)
-
-#define TCP_TAP_FRAMES_MEM 256
-#define TCP_TAP_FRAMES \
- (c->mode == MODE_PASST ? TCP_TAP_FRAMES_MEM : 1)
-#define MAX_PIPE_SIZE (2UL * 1024 * 1024)
+#define TCP_FRAMES_MEM 256
+#define TCP_FRAMES \
+ (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (MAX_TAP_CONNS * 100 / \
@@ -375,9 +335,7 @@
#define FIN_TIMEOUT 240000
#define LAST_ACK_TIMEOUT 240000
-#define TCP_SOCK_POOL_SIZE 32
#define TCP_SOCK_POOL_TSH 16 /* Refill in ns if > x used */
-#define TCP_SPLICE_PIPE_POOL_SIZE 16
#define REFILL_INTERVAL 1000
#define PORT_DETECT_INTERVAL 1000
@@ -395,45 +353,13 @@
#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
-#define CONN_V4(conn) (IN6_IS_ADDR_V4MAPPED(&conn->a.a6))
-#define CONN_V6(conn) (!CONN_V4(conn))
-
-enum tcp_state {
- CLOSED = 0,
- TAP_SYN_SENT,
- SOCK_SYN_SENT,
- TAP_SYN_RCVD,
- ESTABLISHED,
- ESTABLISHED_SOCK_FIN,
- ESTABLISHED_SOCK_FIN_SENT,
- CLOSE_WAIT,
- LAST_ACK,
- FIN_WAIT_1,
- FIN_WAIT_1_SOCK_FIN,
- SPLICE_ACCEPTED,
- SPLICE_CONNECT,
- SPLICE_ESTABLISHED,
- SPLICE_FIN_FROM,
- SPLICE_FIN_TO,
- SPLICE_FIN_BOTH,
-};
-#define TCP_STATE_STR_SIZE (SPLICE_FIN_BOTH + 1)
-
-static char *tcp_state_str[TCP_STATE_STR_SIZE] __attribute((__unused__)) = {
- "CLOSED", "TAP_SYN_SENT", "SOCK_SYN_SENT", "TAP_SYN_RCVD",
- "ESTABLISHED", "ESTABLISHED_SOCK_FIN", "ESTABLISHED_SOCK_FIN_SENT",
- "CLOSE_WAIT", "LAST_ACK", "FIN_WAIT_1", "FIN_WAIT_1_SOCK_FIN",
- "SPLICE_ACCEPTED", "SPLICE_CONNECT", "SPLICE_ESTABLISHED",
- "SPLICE_FIN_FROM", "SPLICE_FIN_TO", "SPLICE_FIN_BOTH",
-};
-
#define FIN (1 << 0)
#define SYN (1 << 1)
#define RST (1 << 2)
#define ACK (1 << 4)
/* Flags for internal usage */
#define DUP_ACK (1 << 5)
-#define FORCE_ACK (1 << 6)
+#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define OPT_EOL 0
#define OPT_NOP 1
@@ -445,10 +371,10 @@ static char *tcp_state_str[TCP_STATE_STR_SIZE] __attribute((__unused__)) = {
#define OPT_SACK 5
#define OPT_TS 8
-struct tcp_tap_conn;
+struct tcp_conn;
/**
- * struct tcp_tap_conn - Descriptor for a TCP connection via tap (not spliced)
+ * struct tcp_conn - Descriptor for a TCP connection (not spliced)
* @next: Pointer to next item in hash chain, if any
* @sock: Socket descriptor number
* @hash_bucket: Bucket index in connection lookup hash table
@@ -458,8 +384,9 @@ struct tcp_tap_conn;
* @a.a4.a: IPv4 address
* @tap_port: Guest-facing tap port
* @sock_port: Remote, socket-facing port
- * @local: Destination is local
- * @state: TCP connection state
+ * @events: Connection events, implying connection states
+ * @flags: Connection flags representing internal attributes
+ * @tap_mss: Maximum segment size advertised by guest
* @seq_to_tap: Next sequence for packets to tap
* @seq_ack_from_tap: Last ACK number received from tap
* @seq_from_tap: Next sequence for packets from tap (not actually sent)
@@ -471,17 +398,15 @@ struct tcp_tap_conn;
* @ws: Window scaling factor
* @wnd_from_tap: Last window size received from tap, scaled
* @wnd_to_tap: Socket-side sending window, advertised to tap
- * @window_clamped: Window was clamped on socket at least once
+ * @snd_buf: Socket sending buffer reported by kernel, in bytes
* @ts_sock_act: Last activity timestamp from socket for timeout purposes
* @ts_tap_act: Last activity timestamp from tap for timeout purposes
* @ts_ack_from_tap: Last ACK segment timestamp from tap
* @ts_ack_to_tap: Last ACK segment timestamp to tap
* @tap_data_noack: Last unacked data to tap, set to { 0, 0 } on ACK
- * @mss_guest: Maximum segment size advertised by guest
- * @events: epoll events currently enabled for socket
*/
-struct tcp_tap_conn {
- struct tcp_tap_conn *next;
+struct tcp_conn {
+ struct tcp_conn *next;
int sock;
int hash_bucket;
@@ -493,10 +418,35 @@ struct tcp_tap_conn {
struct in_addr a;
} a4;
} a;
+#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6)
+#define CONN_V6(conn) (!CONN_V4(conn))
+
in_port_t tap_port;
in_port_t sock_port;
- int local;
- enum tcp_state state;
+
+ uint8_t events;
+#define CLOSED 0
+#define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */
+#define TAP_SYN_RCVD BIT(1) /* implies socket connecting */
+#define TAP_SYN_ACK_SENT BIT( 3) /* implies socket connected */
+#define ESTABLISHED BIT(2)
+#define SOCK_FIN_RCVD BIT( 3)
+#define SOCK_FIN_SENT BIT( 4)
+#define TAP_FIN_RCVD BIT( 5)
+#define TAP_FIN_SENT BIT( 6)
+#define TAP_FIN_ACKED BIT( 7)
+
+#define CONN_STATE_BITS /* Setting these clears other flags */ \
+ (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
+
+ uint8_t flags;
+#define CONN_STALLED BIT(0)
+#define CONN_LOCAL BIT(1)
+#define CONN_WND_CLAMPED BIT(2)
+#define CONN_IN_EPOLL BIT(3)
+#define CONN_ACTIVE_CLOSE BIT(4)
+
+ uint16_t tap_mss;
uint32_t seq_to_tap;
uint32_t seq_ack_from_tap;
@@ -508,9 +458,10 @@ struct tcp_tap_conn {
uint16_t ws_tap;
uint16_t ws;
+
uint32_t wnd_from_tap;
uint32_t wnd_to_tap;
- int window_clamped;
+
int snd_buf;
struct timespec ts_sock_act;
@@ -518,33 +469,35 @@ struct tcp_tap_conn {
struct timespec ts_ack_from_tap;
struct timespec ts_ack_to_tap;
struct timespec tap_data_noack;
+};
- unsigned int mss_guest;
+#define CONN_IS_CLOSED(conn) (conn->events == CLOSED)
+#define CONN_IS_CLOSING(conn) \
+ ((conn->events & ESTABLISHED) && \
+ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
+#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
- uint32_t events;
+#define CONN(index) (tc + (index))
+
+static const char *tcp_event_str[] __attribute((__unused__)) = {
+ "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
+
+ "SOCK_FIN_RCVD", "SOCK_FIN_SENT", "TAP_FIN_RCVD", "TAP_FIN_SENT",
+ "TAP_FIN_ACKED",
};
-/**
- * struct tcp_splice_conn - Descriptor for a spliced TCP connection
- * @from: File descriptor number of socket for accepted connection
- * @pipe_from_to: Pipe ends for splice() from @from to @to
- * @to: File descriptor number of peer connected socket
- * @pipe_to_from: Pipe ends for splice() from @to to @from
- * @state: TCP connection state
-*/
-struct tcp_splice_conn {
- int from;
- int pipe_from_to[2];
- int to;
- int pipe_to_from[2];
- enum tcp_state state;
- int from_fin_sent;
- int to_fin_sent;
- int v6;
- uint64_t from_read;
- uint64_t from_written;
- uint64_t to_read;
- uint64_t to_written;
+static const char *tcp_state_str[] __attribute((__unused__)) = {
+ "SYN_RCVD", "SYN_SENT", "ESTABLISHED",
+ "SYN_RCVD", /* approximately maps to TAP_SYN_ACK_SENT */
+
+ /* Passive close: */
+ "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK",
+ /* Active close (+5): */
+ "CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT",
+};
+
+static const char *tcp_flag_str[] __attribute((__unused__)) = {
+ "STALLED", "LOCAL", "WND_CLAMPED", "IN_EPOLL", "ACTIVE_CLOSE",
};
/* Port re-mappings as delta, indexed by original destination port */
@@ -559,26 +512,6 @@ static int tcp_sock_ns [USHRT_MAX][IP_VERSIONS];
/* Table of destinations with very low RTT (assumed to be local), LRU */
static struct in6_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
-/**
- * tcp_remap_to_tap() - Set delta for port translation toward guest/tap
- * @port: Original destination port, host order
- * @delta: Delta to be added to original destination port
- */
-void tcp_remap_to_tap(in_port_t port, in_port_t delta)
-{
- tcp_port_delta_to_tap[port] = delta;
-}
-
-/**
- * tcp_remap_to_tap() - Set delta for port translation toward init namespace
- * @port: Original destination port, host order
- * @delta: Delta to be added to original destination port
- */
-void tcp_remap_to_init(in_port_t port, in_port_t delta)
-{
- tcp_port_delta_to_init[port] = delta;
-}
-
/* Static buffers */
/**
@@ -611,7 +544,7 @@ static struct tcp4_l2_buf_t {
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
-tcp4_l2_buf[TCP_TAP_FRAMES_MEM];
+tcp4_l2_buf[TCP_FRAMES_MEM];
static unsigned int tcp4_l2_buf_used;
static size_t tcp4_l2_buf_bytes;
@@ -642,24 +575,24 @@ struct tcp6_l2_buf_t {
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
-tcp6_l2_buf[TCP_TAP_FRAMES_MEM];
+tcp6_l2_buf[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_buf_used;
static size_t tcp6_l2_buf_bytes;
/* recvmsg()/sendmsg() data for tap */
static char tcp_buf_discard [MAX_WINDOW];
-static struct iovec iov_sock [TCP_TAP_FRAMES_MEM + 1];
+static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
-static struct iovec tcp4_l2_iov_tap [TCP_TAP_FRAMES_MEM];
-static struct iovec tcp6_l2_iov_tap [TCP_TAP_FRAMES_MEM];
-static struct iovec tcp4_l2_flags_iov_tap [TCP_TAP_FRAMES_MEM];
-static struct iovec tcp6_l2_flags_iov_tap [TCP_TAP_FRAMES_MEM];
+static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM];
+static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM];
+static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM];
+static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM];
-static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES_MEM];
+static struct mmsghdr tcp_l2_mh [TCP_FRAMES_MEM];
/* sendmsg() to socket */
-static struct iovec tcp_tap_iov [UIO_MAXIOV];
+static struct iovec tcp_iov [UIO_MAXIOV];
/**
* tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
@@ -690,9 +623,10 @@ static struct tcp4_l2_flags_buf_t {
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
-tcp4_l2_flags_buf[TCP_TAP_FRAMES_MEM];
+tcp4_l2_flags_buf[TCP_FRAMES_MEM];
-static int tcp4_l2_flags_buf_used;
+static unsigned int tcp4_l2_flags_buf_used;
+static size_t tcp4_l2_flags_buf_bytes;
/**
* tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
@@ -719,34 +653,202 @@ static struct tcp6_l2_flags_buf_t {
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
-tcp6_l2_flags_buf[TCP_TAP_FRAMES_MEM];
-
-static int tcp6_l2_flags_buf_used;
+tcp6_l2_flags_buf[TCP_FRAMES_MEM];
-/* SO_RCVLOWAT set on source ([0]) or destination ([1]) socket, and activity */
-static uint8_t splice_rcvlowat_set[MAX_SPLICE_CONNS / 8][2];
-static uint8_t splice_rcvlowat_act[MAX_SPLICE_CONNS / 8][2];
+static unsigned int tcp6_l2_flags_buf_used;
+static size_t tcp6_l2_flags_buf_bytes;
/* TCP connections */
-static struct tcp_tap_conn tt[MAX_TAP_CONNS];
-static struct tcp_splice_conn ts[MAX_SPLICE_CONNS];
+static struct tcp_conn tc[MAX_TAP_CONNS];
/* Table for lookup from remote address, local port, remote port */
-static struct tcp_tap_conn *tt_hash[TCP_HASH_TABLE_SIZE];
+static struct tcp_conn *tc_hash[TCP_HASH_TABLE_SIZE];
+
+/* Pools for pre-opened sockets */
+int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
+int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
+int ns_sock_pool4 [TCP_SOCK_POOL_SIZE];
+int ns_sock_pool6 [TCP_SOCK_POOL_SIZE];
+
+/**
+ * tcp_conn_epoll_events() - epoll events mask for given connection state
+ * @events: Current connection events
+ * @conn_flags Connection flags
+ *
+ * Return: epoll events mask corresponding to implied connection state
+ */
+static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
+{
+ if (!events)
+ return 0;
+
+ if (events & ESTABLISHED) {
+ if (events & TAP_FIN_SENT)
+ return EPOLLET;
+
+ if (conn_flags & CONN_STALLED)
+ return EPOLLIN | EPOLLRDHUP | EPOLLET;
-/* Pools for pre-opened sockets and pipes */
-static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2];
-static int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
-static int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
-static int ns_sock_pool4 [TCP_SOCK_POOL_SIZE];
-static int ns_sock_pool6 [TCP_SOCK_POOL_SIZE];
+ return EPOLLIN | EPOLLRDHUP;
+ }
+
+ if (events == TAP_SYN_RCVD)
+ return EPOLLOUT | EPOLLET | EPOLLRDHUP;
+
+ return EPOLLRDHUP;
+}
+
+static void conn_flag_do(struct ctx *c, struct tcp_conn *conn,
+ unsigned long flag);
+#define conn_flag(c, conn, flag) \
+ do { \
+ trace("TCP: flag at %s:%i", __func__, __LINE__); \
+ conn_flag_do(c, conn, flag); \
+ } while (0)
+
+/**
+ * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: 0 on success, negative error code on failure (not on deletion)
+ */
+static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
+{
+ int m = (conn->flags & CONN_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
+ .r.p.tcp.tcp.index = conn - tc,
+ .r.p.tcp.tcp.v6 = CONN_V6(conn) };
+ struct epoll_event ev = { .data.u64 = ref.u64 };
+
+ if (CONN_IS_CLOSED(conn)) {
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+ return 0;
+ }
+
+ ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
+
+ if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
+ return -errno;
+
+ conn->flags |= CONN_IN_EPOLL; /* No need to log this */
+
+ return 0;
+}
+
+/**
+ * conn_flag_do() - Set/unset given flag, log, update epoll on CONN_STALLED
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @flag: Flag to set, or ~flag to unset
+ */
+static void conn_flag_do(struct ctx *c, struct tcp_conn *conn,
+ unsigned long flag)
+{
+ if (flag & (flag - 1)) {
+ if (!(conn->flags & ~flag))
+ return;
+
+ conn->flags &= flag;
+ debug("TCP: index %i: %s dropped", (conn) - tc,
+ tcp_flag_str[fls(~flag)]);
+ } else {
+ if (conn->flags & flag)
+ return;
+
+ conn->flags |= flag;
+ debug("TCP: index %i: %s", (conn) - tc,
+ tcp_flag_str[fls(flag)]);
+ }
+
+ if (flag == CONN_STALLED || flag == ~CONN_STALLED)
+ tcp_epoll_ctl(c, conn);
+}
+
+/**
+ * conn_event_do() - Set and log connection events, update epoll state
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @event: Connection event
+ */
+static void conn_event_do(struct ctx *c, struct tcp_conn *conn,
+ unsigned long event)
+{
+ int prev, new, num = fls(event);
+
+ if (conn->events & event)
+ return;
+
+ prev = fls(conn->events);
+ if (conn->flags & CONN_ACTIVE_CLOSE)
+ prev += 5;
+
+ if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED))
+ prev++; /* i.e. SOCK_FIN_RCVD, not TAP_SYN_ACK_SENT */
+
+ if (event == CLOSED || (event & CONN_STATE_BITS))
+ conn->events = event;
+ else
+ conn->events |= event;
+
+ if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
+ conn_flag(c, conn, CONN_ACTIVE_CLOSE);
+ else
+ tcp_epoll_ctl(c, conn);
+
+ new = fls(conn->events);
+
+ if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) {
+ num++;
+ new++;
+ }
+ if (conn->flags & CONN_ACTIVE_CLOSE)
+ new += 5;
+
+ if (prev != new) {
+ debug("TCP: index %i, %s: %s -> %s", (conn) - tc,
+ num == -1 ? "CLOSED" : tcp_event_str[num],
+ prev == -1 ? "CLOSED" : tcp_state_str[prev],
+ (new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]);
+ } else {
+ debug("TCP: index %i, %s", (conn) - tc,
+ num == -1 ? "CLOSED" : tcp_event_str[num]);
+ }
+}
+
+#define conn_event(c, conn, event) \
+ do { \
+ trace("TCP: event at %s:%i", __func__, __LINE__); \
+ conn_event_do(c, conn, event); \
+ } while (0)
+
+/**
+ * tcp_remap_to_tap() - Set delta for port translation toward guest/tap
+ * @port: Original destination port, host order
+ * @delta: Delta to be added to original destination port
+ */
+void tcp_remap_to_tap(in_port_t port, in_port_t delta)
+{
+ tcp_port_delta_to_tap[port] = delta;
+}
+
+/**
+ * tcp_remap_to_tap() - Set delta for port translation toward init namespace
+ * @port: Original destination port, host order
+ * @delta: Delta to be added to original destination port
+ */
+void tcp_remap_to_init(in_port_t port, in_port_t delta)
+{
+ tcp_port_delta_to_init[port] = delta;
+}
/**
* tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
* @conn: Connection pointer
+ *
* Return: 1 if destination is in low RTT table, 0 otherwise
*/
-static int tcp_rtt_dst_low(struct tcp_tap_conn *conn)
+static int tcp_rtt_dst_low(struct tcp_conn *conn)
{
int i;
@@ -762,7 +864,7 @@ static int tcp_rtt_dst_low(struct tcp_tap_conn *conn)
* @conn: Connection pointer
* @tinfo: Pointer to struct tcp_info for socket
*/
-static void tcp_rtt_dst_check(struct tcp_tap_conn *conn, struct tcp_info *tinfo)
+static void tcp_rtt_dst_check(struct tcp_conn *conn, struct tcp_info *tinfo)
{
#ifdef HAS_MIN_RTT
int i, hole = -1;
@@ -789,34 +891,10 @@ static void tcp_rtt_dst_check(struct tcp_tap_conn *conn, struct tcp_info *tinfo)
}
/**
- * tcp_tap_state() - Set given TCP state for tap connection, report to stderr
- * @conn: Connection pointer
- * @state: New TCP state to be set
- */
-static void tcp_tap_state(struct tcp_tap_conn *conn, enum tcp_state state)
-{
- debug("TCP: socket %i: %s -> %s",
- conn->sock, tcp_state_str[conn->state], tcp_state_str[state]);
- conn->state = state;
-}
-
-/**
- * tcp_splice_state() - Set state for spliced connection, report to stderr
- * @conn: Connection pointer
- * @state: New TCP state to be set
- */
-static void tcp_splice_state(struct tcp_splice_conn *conn, enum tcp_state state)
-{
- debug("TCP: index %i: %s -> %s",
- conn - ts, tcp_state_str[conn->state], tcp_state_str[state]);
- conn->state = state;
-}
-
-/**
* tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
* @conn: Connection pointer
*/
-static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
+static void tcp_get_sndbuf(struct tcp_conn *conn)
{
int s = conn->sock, sndbuf;
socklen_t sl;
@@ -841,7 +919,7 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
* tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values
* @s: Socket, can be -1 to avoid check in the caller
*/
-static void tcp_sock_set_bufsize(struct ctx *c, int s)
+void tcp_sock_set_bufsize(struct ctx *c, int s)
{
int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */
@@ -918,7 +996,7 @@ void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
{
int i;
- for (i = 0; i < TCP_TAP_FRAMES_MEM; i++) {
+ for (i = 0; i < TCP_FRAMES_MEM; i++) {
struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
@@ -984,13 +1062,12 @@ static void tcp_sock4_iov_init(void)
};
}
- for (i = 0, iov = tcp4_l2_iov_tap; i < TCP_TAP_FRAMES_MEM; i++, iov++) {
+ for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) {
iov->iov_base = &tcp4_l2_buf[i].vnet_len;
iov->iov_len = MSS_DEFAULT;
}
- for (i = 0, iov = tcp4_l2_flags_iov_tap; i < TCP_TAP_FRAMES_MEM;
- i++, iov++)
+ for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = &tcp4_l2_flags_buf[i].vnet_len;
}
@@ -1018,13 +1095,12 @@ static void tcp_sock6_iov_init(void)
};
}
- for (i = 0, iov = tcp6_l2_iov_tap; i < TCP_TAP_FRAMES_MEM; i++, iov++) {
+ for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) {
iov->iov_base = &tcp6_l2_buf[i].vnet_len;
iov->iov_len = MSS_DEFAULT;
}
- for (i = 0, iov = tcp6_l2_flags_iov_tap; i < TCP_TAP_FRAMES_MEM;
- i++, iov++)
+ for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
iov->iov_base = &tcp6_l2_flags_buf[i].vnet_len;
}
@@ -1032,13 +1108,13 @@ static void tcp_sock6_iov_init(void)
* tcp_opt_get() - Get option, and value if any, from TCP header
* @th: Pointer to TCP header
* @len: Length of buffer, including TCP header
- * @type: Option type to look for
+ * @type_find: Option type to look for
* @optlen_set: Optional, filled with option length if passed
* @value_set: Optional, set to start of option value if passed
*
- * Return: Option value, meaningful for up to 4 bytes, -1 if not found
+ * Return: option value, meaningful for up to 4 bytes, -1 if not found
*/
-static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_search,
+static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find,
uint8_t *optlen_set, char **value_set)
{
uint8_t type, optlen;
@@ -1062,7 +1138,7 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_search,
optlen = *(p++) - 2;
len -= 2;
- if (type != type_search)
+ if (type != type_find)
break;
if (optlen_set)
@@ -1096,7 +1172,7 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_search,
*
* Return: 1 on match, 0 otherwise
*/
-static int tcp_hash_match(struct tcp_tap_conn *conn, int af, void *addr,
+static int tcp_hash_match(struct tcp_conn *conn, int af, void *addr,
in_port_t tap_port, in_port_t sock_port)
{
if (af == AF_INET && CONN_V4(conn) &&
@@ -1136,9 +1212,7 @@ static unsigned int tcp_hash(struct ctx *c, int af, void *addr,
in_port_t tap_port;
in_port_t sock_port;
} __attribute__((__packed__)) in = {
- .addr = *(struct in_addr *)addr,
- .tap_port = tap_port,
- .sock_port = sock_port,
+ *(struct in_addr *)addr, tap_port, sock_port,
};
b = siphash_8b((uint8_t *)&in, c->tcp.hash_secret);
@@ -1148,9 +1222,7 @@ static unsigned int tcp_hash(struct ctx *c, int af, void *addr,
in_port_t tap_port;
in_port_t sock_port;
} __attribute__((__packed__)) in = {
- .addr = *(struct in6_addr *)addr,
- .tap_port = tap_port,
- .sock_port = sock_port,
+ *(struct in6_addr *)addr, tap_port, sock_port,
};
b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret);
@@ -1166,41 +1238,41 @@ static unsigned int tcp_hash(struct ctx *c, int af, void *addr,
* @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to sin_addr or sin6_addr
*/
-static void tcp_hash_insert(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_hash_insert(struct ctx *c, struct tcp_conn *conn,
int af, void *addr)
{
int b;
b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port);
- conn->next = tt_hash[b];
- tt_hash[b] = conn;
+ conn->next = tc_hash[b];
+ tc_hash[b] = conn;
conn->hash_bucket = b;
debug("TCP: hash table insert: index %i, sock %i, bucket: %i, next: %p",
- conn - tt, conn->sock, b, conn->next);
+ conn - tc, conn->sock, b, conn->next);
}
/**
* tcp_hash_remove() - Drop connection from hash table, chain unlink
* @conn: Connection pointer
*/
-static void tcp_hash_remove(struct tcp_tap_conn *conn)
+static void tcp_hash_remove(struct tcp_conn *conn)
{
- struct tcp_tap_conn *entry, *prev = NULL;
+ struct tcp_conn *entry, *prev = NULL;
int b = conn->hash_bucket;
- for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) {
+ for (entry = tc_hash[b]; entry; prev = entry, entry = entry->next) {
if (entry == conn) {
if (prev)
prev->next = conn->next;
else
- tt_hash[b] = conn->next;
+ tc_hash[b] = conn->next;
break;
}
}
debug("TCP: hash table remove: index %i, sock %i, bucket: %i, new: %p",
- conn - tt, conn->sock, b, prev ? prev->next : tt_hash[b]);
+ conn - tc, conn->sock, b, prev ? prev->next : tc_hash[b]);
}
/**
@@ -1208,24 +1280,24 @@ static void tcp_hash_remove(struct tcp_tap_conn *conn)
* @old: Old connection pointer
* @new: New connection pointer
*/
-static void tcp_hash_update(struct tcp_tap_conn *old, struct tcp_tap_conn *new)
+static void tcp_hash_update(struct tcp_conn *old, struct tcp_conn *new)
{
- struct tcp_tap_conn *entry, *prev = NULL;
+ struct tcp_conn *entry, *prev = NULL;
int b = old->hash_bucket;
- for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) {
+ for (entry = tc_hash[b]; entry; prev = entry, entry = entry->next) {
if (entry == old) {
if (prev)
prev->next = new;
else
- tt_hash[b] = new;
+ tc_hash[b] = new;
break;
}
}
debug("TCP: hash table update: old index %i, new index %i, sock %i, "
"bucket: %i, old: %p, new: %p",
- old - tt, new - tt, new->sock, b, old, new);
+ old - tc, new - tc, new->sock, b, old, new);
}
/**
@@ -1238,14 +1310,13 @@ static void tcp_hash_update(struct tcp_tap_conn *old, struct tcp_tap_conn *new)
*
* Return: connection pointer, if found, -ENOENT otherwise
*/
-static struct tcp_tap_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr,
- in_port_t tap_port,
- in_port_t sock_port)
+static struct tcp_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr,
+ in_port_t tap_port, in_port_t sock_port)
{
int b = tcp_hash(c, af, addr, tap_port, sock_port);
- struct tcp_tap_conn *conn;
+ struct tcp_conn *conn;
- for (conn = tt_hash[b]; conn; conn = conn->next) {
+ for (conn = tc_hash[b]; conn; conn = conn->next) {
if (tcp_hash_match(conn, af, addr, tap_port, sock_port))
return conn;
}
@@ -1254,70 +1325,46 @@ static struct tcp_tap_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr,
}
/**
- * tcp_tap_epoll_mask() - Set new epoll event mask given a connection
- * @c: Execution context
- * @conn: Connection pointer
- * @events: New epoll event bitmap
- */
-static void tcp_tap_epoll_mask(struct ctx *c, struct tcp_tap_conn *conn,
- uint32_t events)
-{
- union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
- .r.p.tcp.tcp.index = conn - tt,
- .r.p.tcp.tcp.v6 = CONN_V6(conn) };
- struct epoll_event ev = { .data.u64 = ref.u64, .events = events };
-
- if (conn->events == events)
- return;
-
- conn->events = events;
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->sock, &ev);
-}
-
-/**
- * tcp_table_tap_compact() - Perform compaction on tap connection table
+ * tcp_table_compact() - Perform compaction on connection table
* @c: Execution context
* @hole: Pointer to recently closed connection
*/
-static void tcp_table_tap_compact(struct ctx *c, struct tcp_tap_conn *hole)
+static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
{
- struct tcp_tap_conn *from, *to;
- uint32_t events;
+ struct tcp_conn *from, *to;
- if ((hole - tt) == --c->tcp.tap_conn_count) {
+ if ((hole - tc) == --c->tcp.conn_count) {
debug("TCP: hash table compaction: index %i (%p) was max index",
- hole - tt, hole);
+ hole - tc, hole);
return;
}
- from = &tt[c->tcp.tap_conn_count];
+ from = CONN(c->tcp.conn_count);
memcpy(hole, from, sizeof(*hole));
- from->state = CLOSED;
+ from->flags = from->events = 0;
to = hole;
tcp_hash_update(from, to);
- events = hole->events;
- hole->events = UINT_MAX;
- tcp_tap_epoll_mask(c, hole, events);
+ tcp_epoll_ctl(c, to);
debug("TCP: hash table compaction: old index %i, new index %i, "
"sock %i, from: %p, to: %p",
- from - tt, to - tt, from->sock, from, to);
+ from - tc, to - tc, from->sock, from, to);
}
/**
- * tcp_tap_destroy() - Close tap connection, drop from hash table and epoll
+ * tcp_conn_destroy() - Close connection, drop from epoll file descriptor
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_tap_destroy(struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn)
{
- if (conn->state == CLOSED)
+ if (CONN_IS_CLOSED(conn))
return;
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL);
- tcp_tap_state(conn, CLOSED);
+ conn_event(c, conn, CLOSED);
+ conn->flags = 0;
close(conn->sock);
/* Removal from hash table and connection table compaction deferred to
@@ -1325,50 +1372,33 @@ static void tcp_tap_destroy(struct ctx *c, struct tcp_tap_conn *conn)
*/
}
-static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn);
+static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn);
+#define tcp_rst(c, conn) \
+ do { \
+ debug("TCP: index %i, reset at %s:%i", conn - tc, \
+ __func__, __LINE__); \
+ tcp_rst_do(c, conn); \
+ } while (0)
/**
- * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags)
+ * tcp_l2_buf_write_one() - Write a single buffer to tap file descriptor
* @c: Execution context
+ * @iov: struct iovec item pointing to buffer
+ * @ts: Current timestamp
+ *
+ * Return: 0 on success, negative error code on failure (tap reset possible)
*/
-static void tcp_l2_flags_buf_flush(struct ctx *c)
+static int tcp_l2_buf_write_one(struct ctx *c, struct iovec *iov,
+ struct timespec *ts)
{
- struct msghdr mh = { 0 };
- size_t i;
-
- mh.msg_iov = tcp6_l2_flags_iov_tap;
- if ((mh.msg_iovlen = tcp6_l2_flags_buf_used)) {
- if (c->mode == MODE_PASST) {
- sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
- } else {
- for (i = 0; i < mh.msg_iovlen; i++) {
- struct iovec *iov = &mh.msg_iov[i];
-
- if (write(c->fd_tap, (char *)iov->iov_base + 4,
- iov->iov_len - 4) < 0)
- debug("tap write: %s", strerror(errno));
- }
- }
- tcp6_l2_flags_buf_used = 0;
- pcapm(&mh);
+ if (write(c->fd_tap, (char *)iov->iov_base + 4, iov->iov_len - 4) < 0) {
+ debug("tap write: %s", strerror(errno));
+ if (errno != EAGAIN && errno != EWOULDBLOCK)
+ tap_handler(c, c->fd_tap, EPOLLERR, ts);
+ return -errno;
}
- mh.msg_iov = tcp4_l2_flags_iov_tap;
- if ((mh.msg_iovlen = tcp4_l2_flags_buf_used)) {
- if (c->mode == MODE_PASST) {
- sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
- } else {
- for (i = 0; i < mh.msg_iovlen; i++) {
- struct iovec *iov = &mh.msg_iov[i];
-
- if (write(c->fd_tap, (char *)iov->iov_base + 4,
- iov->iov_len - 4) < 0)
- debug("tap write: %s", strerror(errno));
- }
- }
- tcp4_l2_flags_buf_used = 0;
- pcapm(&mh);
- }
+ return 0;
}
/**
@@ -1396,65 +1426,91 @@ static void tcp_l2_buf_flush_part(struct ctx *c, struct msghdr *mh, size_t sent)
}
/**
- * tcp_l2_flags_buf() - Send out buffers for segments with data
+ * tcp_l2_buf_flush() - Send out buffers for segments with or without data
* @c: Execution context
- */
-static void tcp_l2_buf_flush(struct ctx *c)
+ * @mh: Message header pointing to buffers, msg_iovlen not set
+ * @buf_used: Pointer to count of used buffers, set to 0 on return
+ * @buf_bytes: Pointer to count of buffer bytes, set to 0 on return
+ * @ts: Current timestamp
+ */
+static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh,
+ unsigned int *buf_used, size_t *buf_bytes,
+ struct timespec *ts)
{
- struct msghdr mh = { 0 };
- size_t i, n;
-
- mh.msg_iov = tcp6_l2_iov_tap;
- if (!(mh.msg_iovlen = tcp6_l2_buf_used))
- goto v4;
+ if (!(mh->msg_iovlen = *buf_used))
+ return;
if (c->mode == MODE_PASST) {
- n = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
- if (n > 0 && n < tcp6_l2_buf_bytes)
- tcp_l2_buf_flush_part(c, &mh, n);
+ size_t n = sendmsg(c->fd_tap, mh, MSG_NOSIGNAL | MSG_DONTWAIT);
+ if (n > 0 && n < *buf_bytes)
+ tcp_l2_buf_flush_part(c, mh, n);
} else {
- for (i = 0; i < mh.msg_iovlen; i++) {
- struct iovec *iov = &mh.msg_iov[i];
+ size_t i;
+
+ for (i = 0; i < mh->msg_iovlen; i++) {
+ struct iovec *iov = &mh->msg_iov[i];
- if (write(c->fd_tap, (char *)iov->iov_base + 4,
- iov->iov_len - 4) < 0)
- debug("tap write: %s", strerror(errno));
+ if (tcp_l2_buf_write_one(c, iov, ts))
+ i--;
}
}
- tcp6_l2_buf_used = tcp6_l2_buf_bytes = 0;
- pcapm(&mh);
-
-v4:
- mh.msg_iov = tcp4_l2_iov_tap;
- if (!(mh.msg_iovlen = tcp4_l2_buf_used))
- return;
-
- if (c->mode == MODE_PASST) {
- n = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
+ *buf_used = *buf_bytes = 0;
+ pcapm(mh);
+}
- if (n > 0 && n < tcp4_l2_buf_bytes)
- tcp_l2_buf_flush_part(c, &mh, n);
- } else {
- for (i = 0; i < mh.msg_iovlen; i++) {
- struct iovec *iov = &mh.msg_iov[i];
+/**
+ * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags)
+ * @c: Execution context
+ * @ts: Current timestamp (not packet timestamp)
+ */
+static void tcp_l2_flags_buf_flush(struct ctx *c, struct timespec *ts)
+{
+ struct msghdr mh = { 0 };
+ unsigned int *buf_used;
+ size_t *buf_bytes;
+
+ mh.msg_iov = tcp6_l2_flags_iov;
+ buf_used = &tcp6_l2_flags_buf_used;
+ buf_bytes = &tcp6_l2_flags_buf_bytes;
+ tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts);
+
+ mh.msg_iov = tcp4_l2_flags_iov;
+ buf_used = &tcp4_l2_flags_buf_used;
+ buf_bytes = &tcp4_l2_flags_buf_bytes;
+ tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts);
+}
- if (write(c->fd_tap, (char *)iov->iov_base + 4,
- iov->iov_len - 4) < 0)
- debug("tap write: %s", strerror(errno));
- }
- }
- tcp4_l2_buf_used = tcp4_l2_buf_bytes = 0;
- pcapm(&mh);
+/**
+ * tcp_l2_data_buf_flush() - Send out buffers for segments with data
+ * @c: Execution context
+ * @ts: Current timestamp (not packet timestamp)
+ */
+static void tcp_l2_data_buf_flush(struct ctx *c, struct timespec *ts)
+{
+ struct msghdr mh = { 0 };
+ unsigned int *buf_used;
+ size_t *buf_bytes;
+
+ mh.msg_iov = tcp6_l2_iov;
+ buf_used = &tcp6_l2_buf_used;
+ buf_bytes = &tcp6_l2_buf_bytes;
+ tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts);
+
+ mh.msg_iov = tcp4_l2_iov;
+ buf_used = &tcp4_l2_buf_used;
+ buf_bytes = &tcp4_l2_buf_bytes;
+ tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts);
}
/**
* tcp_defer_handler() - Handler for TCP deferred tasks
* @c: Execution context
+ * @now: Current timestamp
*/
-void tcp_defer_handler(struct ctx *c)
+void tcp_defer_handler(struct ctx *c, struct timespec *now)
{
- tcp_l2_flags_buf_flush(c);
- tcp_l2_buf_flush(c);
+ tcp_l2_flags_buf_flush(c, now);
+ tcp_l2_data_buf_flush(c, now);
}
/**
@@ -1466,9 +1522,9 @@ void tcp_defer_handler(struct ctx *c)
* @check: Checksum, if already known
* @seq: Sequence number for this segment
*
- * Return: 802.3 length, host order.
+ * Return: 802.3 length, host order
*/
-static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_tap_conn *conn,
+static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn,
void *p, size_t plen,
const uint16_t *check, uint32_t seq)
{
@@ -1549,13 +1605,13 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_tap_conn *conn,
* tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
* @c: Execution context
* @conn: Connection pointer
- * @flags: TCP header flags we are about to send, if any
+ * @force_seq: Force ACK sequence to latest segment, instead of checking socket
* @tinfo: tcp_info from kernel, can be NULL if not pre-fetched
*
* Return: 1 if sequence or window were updated, 0 otherwise
*/
-static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_tap_conn *conn,
- int flags, struct tcp_info *tinfo)
+static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
+ int force_seq, struct tcp_info *tinfo)
{
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
@@ -1564,15 +1620,14 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_tap_conn *conn,
int s = conn->sock;
#ifndef HAS_BYTES_ACKED
- (void)flags;
+ (void)force_seq;
conn->seq_ack_to_tap = conn->seq_from_tap;
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
conn->seq_ack_to_tap = prev_ack_to_tap;
#else
- if (conn->state > ESTABLISHED || (flags & (DUP_ACK | FORCE_ACK)) ||
- conn->local || tcp_rtt_dst_low(conn) ||
- (unsigned long)conn->snd_buf < SNDBUF_SMALL) {
+ if ((unsigned long)conn->snd_buf < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
+ || CONN_IS_CLOSING(conn) || conn->flags & CONN_LOCAL || force_seq) {
conn->seq_ack_to_tap = conn->seq_from_tap;
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
if (!tinfo) {
@@ -1605,7 +1660,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_tap_conn *conn,
}
#ifdef HAS_SND_WND
- if (conn->local || tcp_rtt_dst_low(conn)) {
+ if ((conn->flags & CONN_LOCAL) || tcp_rtt_dst_low(conn)) {
conn->wnd_to_tap = tinfo->tcpi_snd_wnd;
} else {
tcp_get_sndbuf(conn);
@@ -1621,16 +1676,16 @@ out:
}
/**
- * tcp_send_to_tap() - Send segment to tap, with options and values from socket
+ * tcp_send_flag() - Send segment with flags to tap (no payload)
* @c: Execution context
* @conn: Connection pointer
- * @flags: TCP flags to set
- * @now: Current timestamp, can be NULL
+ * @flags: TCP flags: if not set, send segment only if ACK is due
+ * @now: Current timestamp
*
* Return: negative error code on connection reset, 0 otherwise
*/
-static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags,
- struct timespec *now)
+static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags,
+ struct timespec *now)
{
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
@@ -1650,26 +1705,26 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags,
return 0;
if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
- tcp_tap_destroy(c, conn);
+ tcp_conn_destroy(c, conn);
return -ECONNRESET;
}
- if (!conn->local)
+ if (!(conn->flags & CONN_LOCAL))
tcp_rtt_dst_check(conn, &tinfo);
if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
return 0;
if (CONN_V4(conn)) {
- iov = tcp4_l2_flags_iov_tap + tcp4_l2_flags_buf_used;
- p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
+ iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used;
+ p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
th = &b4->th;
/* gcc 11.2 would complain on data = (char *)(th + 1); */
data = b4->opts;
} else {
- iov = tcp6_l2_flags_iov_tap + tcp6_l2_flags_buf_used;
- p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
+ iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used;
+ p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
th = &b6->th;
data = b6->opts;
}
@@ -1693,7 +1748,8 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags,
mss -= sizeof(struct ipv6hdr);
if (c->low_wmem &&
- !conn->local && !tcp_rtt_dst_low(conn))
+ !(conn->flags & CONN_LOCAL) &&
+ !tcp_rtt_dst_low(conn))
mss = MIN(mss, PAGE_SIZE);
else if (mss > PAGE_SIZE)
mss = ROUND_DOWN(mss, PAGE_SIZE);
@@ -1719,7 +1775,7 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags,
conn->wnd_to_tap = WINDOW_DEFAULT;
} else {
- th->ack = !!(flags & (ACK | FORCE_ACK | DUP_ACK)) ||
+ th->ack = !!(flags & (ACK | DUP_ACK)) ||
conn->seq_ack_to_tap != prev_ack_to_tap ||
!prev_wnd_to_tap;
}
@@ -1734,6 +1790,11 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags,
NULL, conn->seq_to_tap);
iov->iov_len = eth_len + sizeof(uint32_t);
+ if (CONN_V4(conn))
+ tcp4_l2_flags_buf_bytes += iov->iov_len;
+ else
+ tcp6_l2_flags_buf_bytes += iov->iov_len;
+
if (th->ack && now)
conn->ts_ack_to_tap = *now;
@@ -1749,35 +1810,38 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags,
memcpy(b4 + 1, b4, sizeof(*b4));
(iov + 1)->iov_len = iov->iov_len;
tcp4_l2_flags_buf_used++;
+ tcp4_l2_flags_buf_bytes += iov->iov_len;
}
if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
- tcp_l2_flags_buf_flush(c);
+ tcp_l2_flags_buf_flush(c, now);
} else {
if (flags & DUP_ACK) {
memcpy(b6 + 1, b6, sizeof(*b6));
(iov + 1)->iov_len = iov->iov_len;
tcp6_l2_flags_buf_used++;
+ tcp6_l2_flags_buf_bytes += iov->iov_len;
}
+
if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
- tcp_l2_flags_buf_flush(c);
+ tcp_l2_flags_buf_flush(c, now);
}
return 0;
}
/**
- * tcp_rst() - Reset a tap connection: send RST segment to tap, close socket
+ * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn)
{
- if (conn->state == CLOSED)
+ if (CONN_IS_CLOSED(conn))
return;
- tcp_send_to_tap(c, conn, RST, NULL);
- tcp_tap_destroy(c, conn);
+ if (!tcp_send_flag(c, conn, RST, NULL))
+ tcp_conn_destroy(c, conn);
}
/**
@@ -1788,8 +1852,9 @@ static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn)
* @window: Window value, host order, unscaled, if no header is passed
* @init: Set if this is the very first segment from tap
*/
-static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th,
- int len, unsigned int window, int init)
+static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn,
+ struct tcphdr *th, int len, unsigned int window,
+ int init)
{
if (init && th) {
int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
@@ -1801,7 +1866,6 @@ static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th,
* small window now.
*/
conn->wnd_from_tap = ntohs(th->window);
- conn->window_clamped = 0;
} else {
if (th)
window = ntohs(th->window) << conn->ws_tap;
@@ -1810,7 +1874,7 @@ static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th,
window = MIN(MAX_WINDOW, window);
- if (conn->window_clamped) {
+ if (conn->flags & CONN_WND_CLAMPED) {
if (conn->wnd_from_tap == window)
return;
@@ -1829,7 +1893,7 @@ static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th,
window = 256;
setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP,
&window, sizeof(window));
- conn->window_clamped = 1;
+ conn_flag(c, conn, CONN_WND_CLAMPED);
}
}
@@ -1887,6 +1951,66 @@ static uint32_t tcp_seq_init(struct ctx *c, int af, void *addr,
}
/**
+ * tcp_conn_new_sock() - Get socket for new connection from pool or make new one
+ * @c: Execution context
+ * @af: Address family
+ *
+ * Return: socket number if available, negative code if socket creation failed
+ */
+static int tcp_conn_new_sock(struct ctx *c, sa_family_t af)
+{
+ int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4, i, s;
+
+ for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, pool++) {
+ if ((s = *pool) >= 0) {
+ *pool = -1;
+ break;
+ }
+ }
+
+ if (s < 0)
+ s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+
+ if (s < 0)
+ return -errno;
+
+ tcp_sock_set_bufsize(c, s);
+
+ return s;
+}
+
+/**
+ * tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @th: TCP header sent by tap/guest
+ * @len: L4 packet length, host order
+ *
+ * Return: clamped MSS value
+ */
+static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
+ struct tcphdr *th, size_t len)
+{
+ unsigned int mss;
+ int ret;
+
+ if ((ret = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0)
+ mss = MSS_DEFAULT;
+ else
+ mss = ret;
+
+ /* Don't upset qemu */
+ if (c->mode == MODE_PASST) {
+ if (CONN_V4(conn))
+ mss = MIN(MSS4, mss);
+ else
+ mss = MIN(MSS6, mss);
+ }
+
+ return MIN(mss, USHRT_MAX);
+}
+
+/**
* tcp_conn_from_tap() - Handle connection request (SYN segment) from tap
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
@@ -1899,7 +2023,6 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
struct tcphdr *th, size_t len,
struct timespec *now)
{
- union epoll_ref ref = { .r.proto = IPPROTO_TCP };
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
.sin_port = th->dest,
@@ -1910,41 +2033,23 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
.sin6_port = th->dest,
.sin6_addr = *(struct in6_addr *)addr,
};
- int i, s, *sock_pool_p, mss;
const struct sockaddr *sa;
- struct tcp_tap_conn *conn;
- struct epoll_event ev;
+ struct tcp_conn *conn;
socklen_t sl;
+ int s;
- if (c->tcp.tap_conn_count >= MAX_TAP_CONNS)
+ if (c->tcp.conn_count >= TCP_MAX_CONNS)
return;
- for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
- if (af == AF_INET6)
- sock_pool_p = &init_sock_pool6[i];
- else
- sock_pool_p = &init_sock_pool4[i];
- if ((ref.r.s = s = (*sock_pool_p)) >= 0) {
- *sock_pool_p = -1;
- break;
- }
- }
-
- if (s < 0) {
- s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
- ref.r.s = s;
- }
-
- if (s < 0)
+ if ((s = tcp_conn_new_sock(c, af)) < 0)
return;
- tcp_sock_set_bufsize(c, s);
-
- if (af == AF_INET && addr4.sin_addr.s_addr == c->gw4 && !c->no_map_gw)
- addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- else if (af == AF_INET6 && !memcmp(addr, &c->gw6, sizeof(c->gw6)) &&
- !c->no_map_gw)
- addr6.sin6_addr = in6addr_loopback;
+ if (!c->no_map_gw) {
+ if (af == AF_INET && addr4.sin_addr.s_addr == c->gw4)
+ addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ if (af == AF_INET6 && !memcmp(addr, &c->gw6, sizeof(c->gw6)))
+ addr6.sin6_addr = in6addr_loopback;
+ }
if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) {
struct sockaddr_in6 addr6_ll = {
@@ -1958,29 +2063,18 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
}
}
- conn = &tt[c->tcp.tap_conn_count++];
+ conn = CONN(c->tcp.conn_count++);
conn->sock = s;
- conn->events = 0;
+ conn_event(c, conn, TAP_SYN_RCVD);
conn->wnd_to_tap = WINDOW_DEFAULT;
- if ((mss = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0)
- conn->mss_guest = MSS_DEFAULT;
- else
- conn->mss_guest = mss;
-
- /* Don't upset qemu */
- if (c->mode == MODE_PASST) {
- if (af == AF_INET)
- conn->mss_guest = MIN(MSS4, conn->mss_guest);
- else
- conn->mss_guest = MIN(MSS6, conn->mss_guest);
- }
+ conn->tap_mss = tcp_conn_tap_mss(c, conn, th, len);
- sl = sizeof(conn->mss_guest);
- setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl);
+ sl = sizeof(conn->tap_mss);
+ setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->tap_mss, sl);
- tcp_clamp_window(conn, th, len, 0, 1);
+ tcp_clamp_window(c, conn, th, len, 0, 1);
if (af == AF_INET) {
sa = (struct sockaddr *)&addr4;
@@ -2015,162 +2109,86 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
if (!bind(s, sa, sl))
tcp_rst(c, conn); /* Nobody is listening then */
if (errno != EADDRNOTAVAIL)
- conn->local = 1;
+ conn_flag(c, conn, CONN_LOCAL);
if (connect(s, sa, sl)) {
- tcp_tap_state(conn, TAP_SYN_SENT);
-
if (errno != EINPROGRESS) {
tcp_rst(c, conn);
return;
}
- ev.events = EPOLLOUT | EPOLLRDHUP;
-
tcp_get_sndbuf(conn);
} else {
- tcp_tap_state(conn, TAP_SYN_RCVD);
-
tcp_get_sndbuf(conn);
- if (tcp_send_to_tap(c, conn, SYN | ACK, now))
+ if (tcp_send_flag(c, conn, SYN | ACK, now))
return;
- ev.events = EPOLLIN | EPOLLRDHUP;
+ conn_event(c, conn, TAP_SYN_ACK_SENT);
}
- conn->events = ev.events;
- ref.r.p.tcp.tcp.index = conn - tt;
- ev.data.u64 = ref.u64;
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
+ tcp_epoll_ctl(c, conn);
}
/**
- * tcp_table_splice_compact - Compact spliced connection table
- * @c: Execution context
- * @hole: Pointer to recently closed connection
+ * tcp_sock_consume() - Consume (discard) data from buffer, update ACK sequence
+ * @conn: Connection pointer
+ * @ack_seq: ACK sequence, host order
+ *
+ * Return: 0 on success, negative error code from recv() on failure
*/
-static void tcp_table_splice_compact(struct ctx *c,
- struct tcp_splice_conn *hole)
+static int tcp_sock_consume(struct tcp_conn *conn, uint32_t ack_seq)
{
- union epoll_ref ref_from = { .r.proto = IPPROTO_TCP,
- .r.p.tcp.tcp.splice = 1,
- .r.p.tcp.tcp.index = hole - ts };
- union epoll_ref ref_to = { .r.proto = IPPROTO_TCP,
- .r.p.tcp.tcp.splice = 1,
- .r.p.tcp.tcp.index = hole - ts };
- struct tcp_splice_conn *move;
- struct epoll_event ev_from;
- struct epoll_event ev_to;
-
- hole->from_fin_sent = hole->to_fin_sent = 0;
- hole->from_read = hole->from_written = 0;
- hole->to_read = hole->to_written = 0;
-
- bitmap_clear(splice_rcvlowat_set[0], hole - ts);
- bitmap_clear(splice_rcvlowat_set[1], hole - ts);
- bitmap_clear(splice_rcvlowat_act[0], hole - ts);
- bitmap_clear(splice_rcvlowat_act[1], hole - ts);
-
- if ((hole - ts) == --c->tcp.splice_conn_count)
- return;
-
- move = &ts[c->tcp.splice_conn_count];
- if (move->state == CLOSED)
- return;
-
- memcpy(hole, move, sizeof(*hole));
- move->state = CLOSED;
- move = hole;
-
- ref_from.r.s = move->from;
- ref_from.r.p.tcp.tcp.v6 = move->v6;
- ref_to.r.s = move->to;
- ref_to.r.p.tcp.tcp.v6 = move->v6;
-
- if (move->state == SPLICE_ACCEPTED) {
- ev_from.events = ev_to.events = 0;
- } else if (move->state == SPLICE_CONNECT) {
- ev_from.events = 0;
- ev_to.events = EPOLLOUT;
- } else {
- ev_from.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP;
- ev_to.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP;
- }
+ /* Simply ignore out-of-order ACKs: we already consumed the data we
+ * needed from the buffer, and we won't rewind back to a lower ACK
+ * sequence.
+ */
+ if (SEQ_LE(ack_seq, conn->seq_ack_from_tap))
+ return 0;
- ev_from.data.u64 = ref_from.u64;
- ev_to.data.u64 = ref_to.u64;
+ if (recv(conn->sock, NULL, ack_seq - conn->seq_ack_from_tap,
+ MSG_DONTWAIT | MSG_TRUNC) < 0)
+ return -errno;
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->from, &ev_from);
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->to, &ev_to);
+ conn->seq_ack_from_tap = ack_seq;
+ return 0;
}
/**
- * tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll
+ * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
* @c: Execution context
* @conn: Connection pointer
+ * @plen: Payload length at L4
+ * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
+ * @seq: Sequence number to be sent
+ * @now: Current timestamp
*/
-static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
+static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, ssize_t plen,
+ int no_csum, uint32_t seq, struct timespec *now)
{
- int epoll_del_done = 0;
-
- switch (conn->state) {
- case CLOSED:
- epoll_del_done = 1;
- /* Falls through */
- case SPLICE_FIN_BOTH:
- case SPLICE_FIN_FROM:
- case SPLICE_FIN_TO:
- case SPLICE_ESTABLISHED:
- /* Flushing might need to block: don't recycle them. */
- if (conn->pipe_from_to[0] != -1) {
- close(conn->pipe_from_to[0]);
- conn->pipe_from_to[0] = -1;
- close(conn->pipe_from_to[1]);
- conn->pipe_from_to[1] = -1;
- }
- if (conn->pipe_to_from[0] != -1) {
- close(conn->pipe_to_from[0]);
- conn->pipe_to_from[0] = -1;
- close(conn->pipe_to_from[1]);
- conn->pipe_to_from[1] = -1;
- }
- /* Falls through */
- case SPLICE_CONNECT:
- if (!epoll_del_done) {
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->from, NULL);
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->to, NULL);
- }
- close(conn->to);
- /* Falls through */
- case SPLICE_ACCEPTED:
- close(conn->from);
- tcp_splice_state(conn, CLOSED);
- tcp_table_splice_compact(c, conn);
- break;
- default:
- return;
- }
-}
+ struct iovec *iov;
+ size_t len;
-/**
- * tcp_sock_consume() - Consume (discard) data from buffer, update ACK sequence
- * @conn: Connection pointer
- * @ack_seq: ACK sequence, host order
- */
-static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
-{
- /* Simply ignore out-of-order ACKs: we already consumed the data we
- * needed from the buffer, and we won't rewind back to a lower ACK
- * sequence.
- */
- if (SEQ_LE(ack_seq, conn->seq_ack_from_tap))
- return;
+ if (CONN_V4(conn)) {
+ struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
+ uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
- recv(conn->sock, NULL, ack_seq - conn->seq_ack_from_tap,
- MSG_DONTWAIT | MSG_TRUNC);
+ len = tcp_l2_buf_fill_headers(c, conn, b, plen, check, seq);
- conn->seq_ack_from_tap = ack_seq;
+ iov = tcp4_l2_iov + tcp4_l2_buf_used++;
+ tcp4_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len);
+ if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
+ tcp_l2_data_buf_flush(c, now);
+ } else if (CONN_V6(conn)) {
+ struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
+
+ len = tcp_l2_buf_fill_headers(c, conn, b, plen, NULL, seq);
+
+ iov = tcp6_l2_iov + tcp6_l2_buf_used++;
+ tcp6_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len);
+ if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
+ tcp_l2_data_buf_flush(c, now);
+ }
}
/**
@@ -2183,12 +2201,11 @@ static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
*
* #syscalls recvmsg
*/
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
+static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn,
struct timespec *now)
{
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
int sendlen, len, plen, v4 = CONN_V4(conn);
- uint32_t seq_to_tap = conn->seq_to_tap;
int s = conn->sock, i, ret = 0;
struct msghdr mh_sock = { 0 };
uint32_t already_sent;
@@ -2198,23 +2215,26 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
if (SEQ_LT(already_sent, 0)) {
/* RFC 761, section 2.1. */
- seq_to_tap = conn->seq_to_tap = conn->seq_ack_from_tap;
+ trace("TCP: ACK sequence gap: ACK for %lu, sent: %lu",
+ conn->seq_ack_from_tap, conn->seq_to_tap);
+ conn->seq_to_tap = conn->seq_ack_from_tap;
already_sent = 0;
}
if (!conn->wnd_from_tap || already_sent >= conn->wnd_from_tap) {
- tcp_tap_epoll_mask(c, conn, conn->events | EPOLLET);
+ conn_flag(c, conn, CONN_STALLED);
conn->tap_data_noack = *now;
return 0;
}
+ /* Set up buffer descriptors we'll fill completely and partially. */
fill_bufs = DIV_ROUND_UP(conn->wnd_from_tap - already_sent,
- conn->mss_guest);
- if (fill_bufs > TCP_TAP_FRAMES) {
- fill_bufs = TCP_TAP_FRAMES;
+ conn->tap_mss);
+ if (fill_bufs > TCP_FRAMES) {
+ fill_bufs = TCP_FRAMES;
iov_rem = 0;
} else {
- iov_rem = (conn->wnd_from_tap - already_sent) % conn->mss_guest;
+ iov_rem = (conn->wnd_from_tap - already_sent) % conn->tap_mss;
}
mh_sock.msg_iov = iov_sock;
@@ -2225,19 +2245,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
(!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf)))
- tcp_l2_buf_flush(c);
+ tcp_l2_data_buf_flush(c, now);
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
if (v4)
iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
else
iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
- iov->iov_len = conn->mss_guest;
+ iov->iov_len = conn->tap_mss;
}
if (iov_rem)
iov_sock[fill_bufs].iov_len = iov_rem;
- /* Don't dequeue until acknowledged by guest. */
+ /* Receive into buffers, don't dequeue until acknowledged by guest. */
recvmsg:
len = recvmsg(s, &mh_sock, MSG_PEEK);
if (len < 0) {
@@ -2251,117 +2271,57 @@ recvmsg:
sendlen = len - already_sent;
if (sendlen <= 0) {
- tcp_tap_epoll_mask(c, conn, conn->events | EPOLLET);
+ conn_flag(c, conn, CONN_STALLED);
return 0;
}
- tcp_tap_epoll_mask(c, conn, conn->events & ~EPOLLET);
+ conn_flag(c, conn, ~CONN_STALLED);
- send_bufs = DIV_ROUND_UP(sendlen, conn->mss_guest);
- last_len = sendlen - (send_bufs - 1) * conn->mss_guest;
+ send_bufs = DIV_ROUND_UP(sendlen, conn->tap_mss);
+ last_len = sendlen - (send_bufs - 1) * conn->tap_mss;
/* Likely, some new data was acked too. */
tcp_update_seqack_wnd(c, conn, 0, NULL);
- plen = conn->mss_guest;
+ /* Finally, queue to tap */
+ plen = conn->tap_mss;
for (i = 0; i < send_bufs; i++) {
- ssize_t eth_len;
+ int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
if (i == send_bufs - 1)
plen = last_len;
- if (v4) {
- struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
- uint16_t *check = NULL;
-
- if (i && i != send_bufs - 1 && tcp4_l2_buf_used)
- check = &(b - 1)->iph.check;
-
- eth_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
- check, seq_to_tap);
-
- if (c->mode == MODE_PASST) {
- iov = tcp4_l2_iov_tap + tcp4_l2_buf_used++;
- iov->iov_len = eth_len + sizeof(uint32_t);
- tcp4_l2_buf_bytes += iov->iov_len;
-
- if (tcp4_l2_buf_used >
- ARRAY_SIZE(tcp4_l2_buf) - 1)
- tcp_l2_buf_flush(c);
-
- seq_to_tap += plen;
- continue;
- }
-
- pcap((char *)&b->eh, eth_len);
- ret = write(c->fd_tap, &b->eh, eth_len);
- } else {
- struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
-
- eth_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
- NULL, seq_to_tap);
-
- if (c->mode == MODE_PASST) {
- iov = tcp6_l2_iov_tap + tcp6_l2_buf_used++;
- iov->iov_len = eth_len + sizeof(uint32_t);
- tcp6_l2_buf_bytes += iov->iov_len;
-
- if (tcp6_l2_buf_used >
- ARRAY_SIZE(tcp6_l2_buf) - 1)
- tcp_l2_buf_flush(c);
-
- seq_to_tap += plen;
- continue;
- }
-
- pcap((char *)&b->eh, eth_len);
- ret = write(c->fd_tap, &b->eh, eth_len);
- }
-
- if (ret < eth_len) {
- if (ret < 0) {
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- return 0;
-
- tap_handler(c, c->fd_tap, EPOLLERR, now);
- }
-
- i--;
- continue;
- }
-
+ tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap, now);
conn->seq_to_tap += plen;
}
- if (c->mode == MODE_PASTA)
- return ret;
-
- conn->tap_data_noack = *now;
- conn->seq_to_tap += conn->mss_guest * (send_bufs - 1) + last_len;
-
- conn->ts_ack_to_tap = *now;
+ conn->tap_data_noack = conn->ts_ack_to_tap = *now;
return 0;
err:
if (errno != EAGAIN && errno != EWOULDBLOCK) {
- tcp_rst(c, conn);
ret = -errno;
+ tcp_rst(c, conn);
}
+
return ret;
zero_len:
- if (conn->state == ESTABLISHED_SOCK_FIN) {
- tcp_tap_epoll_mask(c, conn, EPOLLET);
- tcp_send_to_tap(c, conn, FIN | ACK, now);
- tcp_tap_state(conn, ESTABLISHED_SOCK_FIN_SENT);
+ if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
+ if ((ret = tcp_send_flag(c, conn, FIN | ACK, now))) {
+ tcp_rst(c, conn);
+ return ret;
+ }
+
+ conn_event(c, conn, TAP_FIN_SENT);
}
return 0;
}
/**
- * tcp_data_from_tap() - tap data in ESTABLISHED{,SOCK_FIN}, CLOSE_WAIT states
+ * tcp_data_from_tap() - tap data for established connection
* @c: Execution context
* @conn: Connection pointer
* @msg: Array of messages from tap
@@ -2370,15 +2330,15 @@ zero_len:
*
* #syscalls sendmsg
*/
-static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
struct tap_l4_msg *msg, int count,
struct timespec *now)
{
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1;
- struct msghdr mh = { .msg_iov = tcp_tap_iov };
uint32_t max_ack_seq = conn->seq_ack_from_tap;
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
uint32_t seq_from_tap = conn->seq_from_tap;
+ struct msghdr mh = { .msg_iov = tcp_iov };
int partial_send = 0;
uint16_t len;
ssize_t n;
@@ -2404,7 +2364,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
}
if (th->rst) {
- tcp_tap_destroy(c, conn);
+ tcp_conn_destroy(c, conn);
return;
}
@@ -2467,9 +2427,9 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
continue;
}
- tcp_tap_iov[iov_i].iov_base = data + seq_offset;
- tcp_tap_iov[iov_i].iov_len = len - seq_offset;
- seq_from_tap += tcp_tap_iov[iov_i].iov_len;
+ tcp_iov[iov_i].iov_base = data + seq_offset;
+ tcp_iov[iov_i].iov_len = len - seq_offset;
+ seq_from_tap += tcp_iov[iov_i].iov_len;
iov_i++;
if (keep == i)
@@ -2479,7 +2439,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
i = keep - 1;
}
- tcp_clamp_window(conn, NULL, 0, max_ack_seq_wnd, 0);
+ tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0);
if (ack) {
conn->ts_ack_from_tap = *now;
@@ -2489,6 +2449,8 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
}
if (retr) {
+ trace("TCP: fast re-transmit, ACK: %lu, previous sequence: %lu",
+ max_ack_seq, conn->seq_to_tap);
conn->seq_ack_from_tap = max_ack_seq;
conn->seq_to_tap = max_ack_seq;
tcp_data_from_sock(c, conn, now);
@@ -2507,25 +2469,24 @@ eintr:
* Then swiftly looked away and left.
*/
conn->seq_from_tap = seq_from_tap;
- tcp_send_to_tap(c, conn, FORCE_ACK, now);
+ tcp_send_flag(c, conn, ACK, now);
}
if (errno == EINTR)
goto eintr;
if (errno == EAGAIN || errno == EWOULDBLOCK) {
- tcp_send_to_tap(c, conn, 0, now);
+ tcp_send_flag(c, conn, ACK_IF_NEEDED, now);
return;
}
tcp_rst(c, conn);
return;
}
-
if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
partial_send = 1;
conn->seq_from_tap += n;
- tcp_send_to_tap(c, conn, 0, now);
+ tcp_send_flag(c, conn, ACK_IF_NEEDED, now);
} else {
conn->seq_from_tap += n;
}
@@ -2534,35 +2495,53 @@ out:
if (keep != -1) {
if (conn->seq_dup_ack != conn->seq_from_tap) {
conn->seq_dup_ack = conn->seq_from_tap;
- tcp_send_to_tap(c, conn, DUP_ACK, now);
+ tcp_send_flag(c, conn, DUP_ACK, now);
}
return;
}
- if (ack) {
- if (conn->state == ESTABLISHED_SOCK_FIN_SENT &&
- conn->seq_ack_from_tap == conn->seq_to_tap)
- tcp_tap_state(conn, CLOSE_WAIT);
- }
+ if (ack && conn->events & TAP_FIN_SENT &&
+ conn->seq_ack_from_tap == conn->seq_to_tap)
+ conn_event(c, conn, TAP_FIN_ACKED);
if (fin && !partial_send) {
conn->seq_from_tap++;
- if (conn->state == ESTABLISHED) {
- shutdown(conn->sock, SHUT_WR);
- tcp_tap_state(conn, FIN_WAIT_1);
- tcp_send_to_tap(c, conn, ACK, now);
- } else if (conn->state == CLOSE_WAIT) {
- shutdown(conn->sock, SHUT_WR);
- tcp_tap_state(conn, LAST_ACK);
- tcp_send_to_tap(c, conn, ACK, now);
- }
+ conn_event(c, conn, TAP_FIN_RCVD);
} else {
- tcp_send_to_tap(c, conn, 0, now);
+ tcp_send_flag(c, conn, ACK_IF_NEEDED, now);
}
}
/**
+ * tcp_conn_from_sock_finish() - Complete connection setup after connect()
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @th: TCP header of SYN, ACK segment from tap/guest
+ * @len: Packet length of SYN, ACK segment at L4, host order
+ * @now: Current timestamp
+ */
+static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
+ struct tcphdr *th, size_t len,
+ struct timespec *now)
+{
+ tcp_clamp_window(c, conn, th, len, 0, 1);
+ conn->tap_mss = tcp_conn_tap_mss(c, conn, th, len);
+
+ conn->seq_init_from_tap = ntohl(th->seq) + 1;
+ conn->seq_from_tap = conn->seq_init_from_tap;
+ conn->seq_ack_to_tap = conn->seq_from_tap;
+
+ conn_event(c, conn, ESTABLISHED);
+
+ /* The client might have sent data already, which we didn't
+ * dequeue waiting for SYN,ACK from tap -- check now.
+ */
+ tcp_data_from_sock(c, conn, now);
+ tcp_send_flag(c, conn, ACK_IF_NEEDED, now);
+}
+
+/**
* tcp_tap_handler() - Handle packets from tap and state transitions
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
@@ -2578,10 +2557,11 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
{
struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset);
uint16_t len = msg[0].l4_len;
- struct tcp_tap_conn *conn;
- int mss;
+ struct tcp_conn *conn;
conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
+
+ /* New connection from tap */
if (!conn) {
if (th->syn && !th->ack)
tcp_conn_from_tap(c, af, addr, th, len, now);
@@ -2589,59 +2569,40 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
}
if (th->rst) {
- tcp_tap_destroy(c, conn);
+ tcp_conn_destroy(c, conn);
return count;
}
conn->ts_tap_act = *now;
+ conn_flag(c, conn, ~CONN_STALLED);
- switch (conn->state) {
- case SOCK_SYN_SENT:
- if (!th->syn || !th->ack) {
+ /* Establishing connection from socket */
+ if (conn->events & SOCK_ACCEPTED) {
+ if (th->syn && th->ack && !th->fin)
+ tcp_conn_from_sock_finish(c, conn, th, len, now);
+ else
tcp_rst(c, conn);
- return count;
- }
-
- tcp_clamp_window(conn, th, len, 0, 1);
- if ((mss = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0)
- conn->mss_guest = MSS_DEFAULT;
- else
- conn->mss_guest = mss;
+ return 1;
+ }
- /* Don't upset qemu */
- if (c->mode == MODE_PASST) {
- if (af == AF_INET)
- conn->mss_guest = MIN(MSS4, conn->mss_guest);
- else
- conn->mss_guest = MIN(MSS6, conn->mss_guest);
+ /* Establishing connection from tap */
+ if (conn->events & TAP_SYN_RCVD) {
+ if (!(conn->events & TAP_SYN_ACK_SENT)) {
+ tcp_rst(c, conn);
+ return count;
}
- /* tinfo.tcpi_bytes_acked already includes one byte for SYN, but
- * not for incoming connections.
- */
- conn->seq_init_from_tap = ntohl(th->seq) + 1;
- conn->seq_from_tap = conn->seq_init_from_tap;
- conn->seq_ack_to_tap = conn->seq_from_tap;
-
- tcp_tap_state(conn, ESTABLISHED);
+ conn_event(c, conn, ESTABLISHED);
- /* The client might have sent data already, which we didn't
- * dequeue waiting for SYN,ACK from tap -- check now.
- */
- tcp_data_from_sock(c, conn, now);
- tcp_send_to_tap(c, conn, 0, now);
-
- tcp_tap_epoll_mask(c, conn, EPOLLIN | EPOLLRDHUP);
- break;
- case TAP_SYN_RCVD:
if (th->fin) {
conn->seq_from_tap++;
shutdown(conn->sock, SHUT_WR);
- tcp_send_to_tap(c, conn, ACK, now);
- tcp_tap_state(conn, FIN_WAIT_1);
- break;
+ tcp_send_flag(c, conn, ACK, now);
+ conn_event(c, conn, SOCK_FIN_SENT);
+
+ return count;
}
if (!th->ack) {
@@ -2649,275 +2610,60 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
return count;
}
- tcp_clamp_window(conn, th, len, 0, 0);
+ tcp_clamp_window(c, conn, th, len, 0, 0);
- tcp_tap_state(conn, ESTABLISHED);
if (count == 1)
- break;
+ return 1;
+ }
- /* Falls through */
- case ESTABLISHED:
- case ESTABLISHED_SOCK_FIN:
- case ESTABLISHED_SOCK_FIN_SENT:
- tcp_tap_epoll_mask(c, conn, conn->events & ~EPOLLET);
- tcp_data_from_tap(c, conn, msg, count, now);
- return count;
- case CLOSE_WAIT:
- case FIN_WAIT_1_SOCK_FIN:
- case FIN_WAIT_1:
+ /* Established connections not accepting data from tap */
+ if (conn->events & TAP_FIN_RCVD) {
if (th->ack) {
conn->tap_data_noack = ((struct timespec) { 0, 0 });
conn->ts_ack_from_tap = *now;
}
- tcp_sock_consume(conn, ntohl(th->ack_seq));
- if (conn->state == FIN_WAIT_1_SOCK_FIN &&
- conn->seq_ack_from_tap == conn->seq_to_tap) {
- tcp_tap_destroy(c, conn);
- return count;
- }
+ if (conn->events & SOCK_FIN_RCVD &&
+ conn->seq_ack_from_tap == conn->seq_to_tap)
+ tcp_conn_destroy(c, conn);
- tcp_tap_epoll_mask(c, conn, conn->events & ~EPOLLET);
- return count;
- case TAP_SYN_SENT:
- case LAST_ACK:
- case SPLICE_ACCEPTED:
- case SPLICE_CONNECT:
- case SPLICE_ESTABLISHED:
- case SPLICE_FIN_FROM:
- case SPLICE_FIN_TO:
- case SPLICE_FIN_BOTH:
- case CLOSED: /* ;) */
- break;
- }
-
- return 1;
+ return 1;
+ }
+
+ /* Established connections accepting data from tap */
+ tcp_data_from_tap(c, conn, msg, count, now);
+
+ if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
+ shutdown(conn->sock, SHUT_WR);
+ conn_event(c, conn, SOCK_FIN_SENT);
+ tcp_send_flag(c, conn, ACK, now);
+ }
+
+ return count;
}
/**
* tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event
* @c: Execution context
- * @s: File descriptor number for socket
+ * @conn: Connection pointer
* @now: Current timestamp
*/
-static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn,
struct timespec *now)
{
socklen_t sl;
int so;
- /* Drop EPOLLOUT, only used to wait for connect() to complete */
- tcp_tap_epoll_mask(c, conn, EPOLLIN | EPOLLRDHUP);
-
sl = sizeof(so);
if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR, &so, &sl) || so) {
tcp_rst(c, conn);
return;
}
- if (tcp_send_to_tap(c, conn, SYN | ACK, now))
+ if (tcp_send_flag(c, conn, SYN | ACK, now))
return;
- tcp_tap_state(conn, TAP_SYN_RCVD);
-}
-
-/**
- * tcp_splice_connect_finish() - Completion of connect() or call on success
- * @c: Execution context
- * @conn: Connection pointer
- * @v6: Set on IPv6 connection
- */
-static void tcp_splice_connect_finish(struct ctx *c,
- struct tcp_splice_conn *conn, int v6)
-{
- union epoll_ref ref_from = { .r.proto = IPPROTO_TCP, .r.s = conn->from,
- .r.p.tcp.tcp = { .splice = 1, .v6 = v6,
- .index = conn - ts } };
- union epoll_ref ref_to = { .r.proto = IPPROTO_TCP, .r.s = conn->to,
- .r.p.tcp.tcp = { .splice = 1, .v6 = v6,
- .index = conn - ts } };
- struct epoll_event ev_from, ev_to;
- int i;
-
- conn->pipe_from_to[0] = conn->pipe_to_from[0] = -1;
- conn->pipe_from_to[1] = conn->pipe_to_from[1] = -1;
- for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
- if (splice_pipe_pool[i][0][0] > 0) {
- SWAP(conn->pipe_from_to[0], splice_pipe_pool[i][0][0]);
- SWAP(conn->pipe_from_to[1], splice_pipe_pool[i][0][1]);
-
- SWAP(conn->pipe_to_from[0], splice_pipe_pool[i][1][0]);
- SWAP(conn->pipe_to_from[1], splice_pipe_pool[i][1][1]);
- break;
- }
- }
-
- if (conn->pipe_from_to[0] < 0) {
- if (pipe2(conn->pipe_to_from, O_NONBLOCK) ||
- pipe2(conn->pipe_from_to, O_NONBLOCK)) {
- tcp_splice_destroy(c, conn);
- return;
- }
-
- fcntl(conn->pipe_from_to[0], F_SETPIPE_SZ, c->tcp.pipe_size);
- fcntl(conn->pipe_to_from[0], F_SETPIPE_SZ, c->tcp.pipe_size);
- }
-
- if (conn->state == SPLICE_CONNECT) {
- tcp_splice_state(conn, SPLICE_ESTABLISHED);
-
- ev_from.events = ev_to.events = EPOLLIN | EPOLLRDHUP;
- ev_from.data.u64 = ref_from.u64;
- ev_to.data.u64 = ref_to.u64;
-
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->from, &ev_from);
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->to, &ev_to);
- }
-}
-
-/**
- * tcp_splice_connect() - Create and connect socket for new spliced connection
- * @c: Execution context
- * @conn: Connection pointer
- * @v6: Set on IPv6 connection
- * @port: Destination port, host order
- *
- * Return: 0 for connect() succeeded or in progress, negative value on error
- */
-static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn,
- int s, int v6, in_port_t port)
-{
- int sock_conn = (s >= 0) ? s : socket(v6 ? AF_INET6 : AF_INET,
- SOCK_STREAM | SOCK_NONBLOCK,
- IPPROTO_TCP);
- union epoll_ref ref_accept = { .r.proto = IPPROTO_TCP,
- .r.s = conn->from,
- .r.p.tcp.tcp = { .splice = 1, .v6 = v6,
- .index = conn - ts } };
- union epoll_ref ref_conn = { .r.proto = IPPROTO_TCP, .r.s = sock_conn,
- .r.p.tcp.tcp = { .splice = 1, .v6 = v6,
- .index = conn - ts } };
- struct epoll_event ev_accept = { .data.u64 = ref_accept.u64 };
- struct epoll_event ev_conn = { .data.u64 = ref_conn.u64 };
- struct sockaddr_in6 addr6 = {
- .sin6_family = AF_INET6,
- .sin6_port = htons(port),
- .sin6_addr = IN6ADDR_LOOPBACK_INIT,
- };
- struct sockaddr_in addr4 = {
- .sin_family = AF_INET,
- .sin_port = htons(port),
- .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
- };
- const struct sockaddr *sa;
- socklen_t sl;
- int one = 1;
-
- conn->to = sock_conn;
-
- if (s < 0)
- tcp_sock_set_bufsize(c, conn->to);
-
- setsockopt(conn->to, SOL_TCP, TCP_QUICKACK, &one, sizeof(one));
-
- if (v6) {
- sa = (struct sockaddr *)&addr6;
- sl = sizeof(addr6);
- } else {
- sa = (struct sockaddr *)&addr4;
- sl = sizeof(addr4);
- }
-
- if (connect(conn->to, sa, sl)) {
- if (errno != EINPROGRESS) {
- int ret = -errno;
-
- close(sock_conn);
- return ret;
- }
-
- tcp_splice_state(conn, SPLICE_CONNECT);
- ev_conn.events = EPOLLOUT;
- } else {
- tcp_splice_state(conn, SPLICE_ESTABLISHED);
- tcp_splice_connect_finish(c, conn, v6);
-
- ev_accept.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP;
- ev_conn.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP;
-
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->from, &ev_accept);
- }
-
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->to, &ev_conn);
-
- return 0;
-}
-
-/**
- * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
- * @c: Execution context
- * @conn: Accepted inbound connection
- * @v6: Set for inbound IPv6 connection
- * @port: Destination port, host order
- * @ret: Return value of tcp_splice_connect_ns()
- */
-struct tcp_splice_connect_ns_arg {
- struct ctx *c;
- struct tcp_splice_conn *conn;
- int v6;
- in_port_t port;
- int ret;
-};
-
-/**
- * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
- * @arg: See struct tcp_splice_connect_ns_arg
- *
- * Return: 0
- */
-static int tcp_splice_connect_ns(void *arg)
-{
- struct tcp_splice_connect_ns_arg *a;
-
- a = (struct tcp_splice_connect_ns_arg *)arg;
- ns_enter(a->c);
- a->ret = tcp_splice_connect(a->c, a->conn, -1, a->v6, a->port);
- return 0;
-}
-
-/**
- * tcp_splice_new() - Handle new inbound, spliced connection
- * @c: Execution context
- * @conn: Connection pointer
- * @v6: Set for IPv6 connection
- * @port: Destination port, host order
- *
- * Return: return code from connect()
- */
-static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn,
- int v6, in_port_t port)
-{
- struct tcp_splice_connect_ns_arg ns_arg = { c, conn, v6, port, 0 };
- int *sock_pool_p, i, s = -1;
-
- if (bitmap_isset(c->tcp.port_to_tap, port))
- sock_pool_p = v6 ? ns_sock_pool6 : ns_sock_pool4;
- else
- sock_pool_p = v6 ? init_sock_pool6 : init_sock_pool4;
-
- for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, sock_pool_p++) {
- if ((s = *sock_pool_p) >= 0) {
- *sock_pool_p = -1;
- break;
- }
- }
-
- if (s < 0 && bitmap_isset(c->tcp.port_to_tap, port)) {
- NS_CALL(tcp_splice_connect_ns, &ns_arg);
- return ns_arg.ret;
- }
-
- return tcp_splice_connect(c, conn, s, v6, port);
+ conn_event(c, conn, TAP_SYN_ACK_SENT);
}
/**
@@ -2929,15 +2675,12 @@ static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn,
static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
struct timespec *now)
{
- union epoll_ref ref_conn = { .r.proto = IPPROTO_TCP,
- .r.p.tcp.tcp.v6 = ref.r.p.tcp.tcp.v6 };
struct sockaddr_storage sa;
- struct tcp_tap_conn *conn;
- struct epoll_event ev;
+ struct tcp_conn *conn;
socklen_t sl;
int s;
- if (c->tcp.tap_conn_count >= MAX_TAP_CONNS)
+ if (c->tcp.conn_count >= TCP_MAX_CONNS)
return;
sl = sizeof(sa);
@@ -2945,9 +2688,10 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
if (s < 0)
return;
- conn = &tt[c->tcp.tap_conn_count++];
- ref_conn.r.p.tcp.tcp.index = conn - tt;
- ref_conn.r.s = conn->sock = s;
+ conn = CONN(c->tcp.conn_count++);
+ conn->sock = s;
+
+ conn_event(c, conn, SOCK_ACCEPTED);
if (ref.r.p.tcp.tcp.v6) {
struct sockaddr_in6 sa6;
@@ -3015,267 +2759,12 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
conn->ts_sock_act = conn->ts_tap_act = *now;
conn->ts_ack_from_tap = conn->ts_ack_to_tap = *now;
- tcp_send_to_tap(c, conn, SYN, now);
-
- conn->events = ev.events = EPOLLRDHUP;
- ev.data.u64 = ref_conn.u64;
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->sock, &ev);
-
- tcp_tap_state(conn, SOCK_SYN_SENT);
+ tcp_send_flag(c, conn, SYN, now);
tcp_get_sndbuf(conn);
}
/**
- * tcp_sock_handler_splice() - Handler for socket mapped to spliced connection
- * @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- *
- * #syscalls:pasta splice
- */
-void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
- uint32_t events)
-{
- int move_from, move_to, *pipes, eof, never_read;
- uint8_t *rcvlowat_set, *rcvlowat_act;
- uint64_t *seq_read, *seq_write;
- struct tcp_splice_conn *conn;
- struct epoll_event ev;
-
- if (ref.r.p.tcp.tcp.listen) {
- int s, one = 1;
-
- if (c->tcp.splice_conn_count >= MAX_SPLICE_CONNS)
- return;
-
- if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
- return;
-
- setsockopt(s, SOL_TCP, TCP_QUICKACK, &one, sizeof(one));
-
- conn = &ts[c->tcp.splice_conn_count++];
- conn->from = s;
- tcp_splice_state(conn, SPLICE_ACCEPTED);
-
- if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.v6,
- ref.r.p.tcp.tcp.index))
- tcp_splice_destroy(c, conn);
-
- return;
- }
-
- conn = &ts[ref.r.p.tcp.tcp.index];
-
- if (events & EPOLLERR)
- goto close;
-
- if (conn->state == SPLICE_CONNECT && (events & EPOLLHUP))
- goto close;
-
- if (events & EPOLLOUT) {
- ev.events = EPOLLIN | EPOLLRDHUP;
- ev.data.u64 = ref.u64;
-
- if (conn->state == SPLICE_CONNECT)
- tcp_splice_connect_finish(c, conn, ref.r.p.tcp.tcp.v6);
- else if (conn->state == SPLICE_ESTABLISHED)
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD, ref.r.s, &ev);
-
- move_to = ref.r.s;
- if (ref.r.s == conn->to) {
- move_from = conn->from;
- pipes = conn->pipe_from_to;
- } else {
- move_from = conn->to;
- pipes = conn->pipe_to_from;
- }
- } else {
- move_from = ref.r.s;
- if (ref.r.s == conn->from) {
- move_to = conn->to;
- pipes = conn->pipe_from_to;
- } else {
- move_to = conn->from;
- pipes = conn->pipe_to_from;
- }
- }
-
- if (events & EPOLLRDHUP) {
- if (ref.r.s == conn->from) {
- if (conn->state == SPLICE_ESTABLISHED)
- tcp_splice_state(conn, SPLICE_FIN_FROM);
- else if (conn->state == SPLICE_FIN_TO)
- tcp_splice_state(conn, SPLICE_FIN_BOTH);
- } else {
- if (conn->state == SPLICE_ESTABLISHED)
- tcp_splice_state(conn, SPLICE_FIN_TO);
- else if (conn->state == SPLICE_FIN_FROM)
- tcp_splice_state(conn, SPLICE_FIN_BOTH);
- }
- }
-
-swap:
- eof = 0;
- never_read = 1;
-
- if (move_from == conn->from) {
- seq_read = &conn->from_read;
- seq_write = &conn->from_written;
- rcvlowat_set = splice_rcvlowat_set[0];
- rcvlowat_act = splice_rcvlowat_act[0];
- } else {
- seq_read = &conn->to_read;
- seq_write = &conn->to_written;
- rcvlowat_set = splice_rcvlowat_set[1];
- rcvlowat_act = splice_rcvlowat_act[1];
- }
-
-
- while (1) {
- int retry_write = 0, more = 0;
- ssize_t readlen, to_write = 0, written;
-
-retry:
- readlen = splice(move_from, NULL, pipes[1], NULL,
- c->tcp.pipe_size,
- SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
- if (readlen < 0) {
- if (errno == EINTR)
- goto retry;
-
- if (errno != EAGAIN)
- goto close;
-
- to_write = c->tcp.pipe_size;
- } else if (!readlen) {
- eof = 1;
- to_write = c->tcp.pipe_size;
- } else {
- never_read = 0;
- to_write += readlen;
- if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
- more = SPLICE_F_MORE;
-
- if (bitmap_isset(rcvlowat_set, conn - ts))
- bitmap_set(rcvlowat_act, conn - ts);
- }
-
-eintr:
- written = splice(pipes[0], NULL, move_to, NULL, to_write,
- SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
-
- /* Most common case: skip updating counters. */
- if (readlen > 0 && readlen == written) {
- if (readlen >= (long)c->tcp.pipe_size * 10 / 100)
- continue;
-
- if (!bitmap_isset(rcvlowat_set, conn - ts) &&
- readlen > (long)c->tcp.pipe_size / 10) {
- int lowat = c->tcp.pipe_size / 4;
-
- setsockopt(move_from, SOL_SOCKET, SO_RCVLOWAT,
- &lowat, sizeof(lowat));
-
- bitmap_set(rcvlowat_set, conn - ts);
- bitmap_set(rcvlowat_act, conn - ts);
- }
-
- break;
- }
-
- *seq_read += readlen > 0 ? readlen : 0;
- *seq_write += written > 0 ? written : 0;
-
- if (written < 0) {
- if (errno == EINTR)
- goto eintr;
-
- if (errno != EAGAIN)
- goto close;
-
- if (never_read)
- break;
-
- if (retry_write--)
- goto retry;
-
- ev.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP;
- ref.r.s = move_to;
- ev.data.u64 = ref.u64,
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move_to, &ev);
- break;
- }
-
- if (never_read && written == (long)(c->tcp.pipe_size))
- goto retry;
-
- if (!never_read && written < to_write) {
- to_write -= written;
- goto retry;
- }
-
- if (eof)
- break;
- }
-
- if (*seq_read == *seq_write) {
- if (move_from == conn->from &&
- (conn->state == SPLICE_FIN_FROM ||
- conn->state == SPLICE_FIN_BOTH)) {
- if (!conn->from_fin_sent) {
- shutdown(conn->to, SHUT_WR);
- conn->from_fin_sent = 1;
-
- ev.events = 0;
- ref.r.s = move_from;
- ev.data.u64 = ref.u64,
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD,
- move_from, &ev);
- }
-
- if (conn->to_fin_sent)
- goto close;
- } else if (move_from == conn->to &&
- (conn->state == SPLICE_FIN_TO ||
- conn->state == SPLICE_FIN_BOTH)) {
- if (!conn->to_fin_sent) {
- shutdown(conn->from, SHUT_WR);
- conn->to_fin_sent = 1;
-
- ev.events = 0;
- ref.r.s = move_from;
- ev.data.u64 = ref.u64,
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD,
- move_from, &ev);
- }
-
- if (conn->from_fin_sent)
- goto close;
- }
- }
-
- if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
- events = EPOLLIN;
-
- SWAP(move_from, move_to);
- if (pipes == conn->pipe_from_to)
- pipes = conn->pipe_to_from;
- else
- pipes = conn->pipe_from_to;
-
- goto swap;
- }
-
- return;
-
-close:
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->from, NULL);
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->to, NULL);
- conn->state = CLOSED;
-}
-
-/**
* tcp_sock_handler() - Handle new data from socket
* @c: Execution context
* @ref: epoll reference
@@ -3285,7 +2774,7 @@ close:
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now)
{
- struct tcp_tap_conn *conn;
+ struct tcp_conn *conn;
if (ref.r.p.tcp.tcp.splice) {
tcp_sock_handler_splice(c, ref, events);
@@ -3297,110 +2786,52 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
return;
}
- conn = &tt[ref.r.p.tcp.tcp.index];
+ if (!(conn = CONN(ref.r.p.tcp.tcp.index)))
+ return;
conn->ts_sock_act = *now;
if (events & EPOLLERR) {
- if (conn->state != CLOSED)
- tcp_rst(c, conn);
-
+ tcp_rst(c, conn);
return;
}
- switch (conn->state) {
- case TAP_SYN_SENT:
- if (events & EPOLLOUT)
- tcp_connect_finish(c, conn, now);
- else
- tcp_rst(c, conn);
- return;
- case ESTABLISHED_SOCK_FIN:
- case ESTABLISHED_SOCK_FIN_SENT:
- case ESTABLISHED:
- if (events & EPOLLRDHUP) {
- if (conn->state == ESTABLISHED)
- tcp_tap_state(conn, ESTABLISHED_SOCK_FIN);
- }
- tcp_data_from_sock(c, conn, now);
- return;
- case LAST_ACK:
- tcp_send_to_tap(c, conn, 0, now);
- if (conn->seq_ack_to_tap == conn->seq_from_tap + 1 ||
- conn->seq_ack_to_tap == conn->seq_from_tap)
- tcp_tap_destroy(c, conn);
- return;
- case FIN_WAIT_1:
- if (events & EPOLLIN)
- tcp_data_from_sock(c, conn, now);
- if (events & EPOLLRDHUP) {
- tcp_send_to_tap(c, conn, FIN | ACK, now);
- tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN);
- }
- return;
- case CLOSE_WAIT:
- case FIN_WAIT_1_SOCK_FIN:
- if (events & EPOLLIN)
- tcp_data_from_sock(c, conn, now);
- if (events & EPOLLHUP) {
- if ((conn->seq_ack_to_tap == conn->seq_from_tap + 1 ||
- conn->seq_ack_to_tap == conn->seq_from_tap) &&
- (conn->seq_ack_from_tap == conn->seq_to_tap - 1 ||
- conn->seq_ack_from_tap == conn->seq_to_tap)) {
- tcp_tap_destroy(c, conn);
- } else {
- tcp_send_to_tap(c, conn, ACK, now);
- }
- }
+ if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
+ tcp_conn_destroy(c, conn);
return;
- case TAP_SYN_RCVD:
- case SOCK_SYN_SENT:
- case SPLICE_ACCEPTED:
- case SPLICE_CONNECT:
- case SPLICE_ESTABLISHED:
- case SPLICE_FIN_FROM:
- case SPLICE_FIN_TO:
- case SPLICE_FIN_BOTH:
- case CLOSED:
- break;
}
-}
-/**
- * tcp_set_pipe_size() - Set usable pipe size, probe starting from MAX_PIPE_SIZE
- * @c: Execution context
- */
-static void tcp_set_pipe_size(struct ctx *c)
-{
- int probe_pipe[TCP_SPLICE_PIPE_POOL_SIZE * 2][2], i, j;
+ if (conn->events & ESTABLISHED) {
+ if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
+ tcp_conn_destroy(c, conn);
- c->tcp.pipe_size = MAX_PIPE_SIZE;
+ if (events & (EPOLLRDHUP | EPOLLHUP))
+ conn_event(c, conn, SOCK_FIN_RCVD);
-smaller:
- for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE * 2; i++) {
- if (pipe2(probe_pipe[i], 0)) {
- i++;
- break;
- }
+ if (events & EPOLLIN)
+ tcp_data_from_sock(c, conn, now);
- if (fcntl(probe_pipe[i][0], F_SETPIPE_SZ, c->tcp.pipe_size) < 0)
- break;
- }
+ if (events & EPOLLOUT)
+ tcp_update_seqack_wnd(c, conn, 0, NULL);
- for (j = i - 1; j >= 0; j--) {
- close(probe_pipe[j][0]);
- close(probe_pipe[j][1]);
+ return;
}
- if (i == TCP_SPLICE_PIPE_POOL_SIZE * 2)
+ /* EPOLLHUP during handshake: reset */
+ if (events & EPOLLHUP) {
+ tcp_rst(c, conn);
return;
+ }
- if (!(c->tcp.pipe_size /= 2)) {
- c->tcp.pipe_size = MAX_PIPE_SIZE;
+ /* Data during handshake tap-side: check later */
+ if (conn->events & SOCK_ACCEPTED)
return;
- }
- goto smaller;
+ if (conn->events == TAP_SYN_RCVD) {
+ if (events & EPOLLOUT)
+ tcp_connect_finish(c, conn, now);
+ /* Data? Check later */
+ }
}
/**
@@ -3517,32 +2948,6 @@ static int tcp_sock_init_ns(void *arg)
}
/**
- * tcp_splice_pipe_refill() - Refill pool of pre-opened pipes
- * @c: Execution context
- */
-static void tcp_splice_pipe_refill(struct ctx *c)
-{
- int i;
-
- for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
- if (splice_pipe_pool[i][0][0] >= 0)
- break;
- if (pipe2(splice_pipe_pool[i][0], O_NONBLOCK))
- continue;
- if (pipe2(splice_pipe_pool[i][1], O_NONBLOCK)) {
- close(splice_pipe_pool[i][1][0]);
- close(splice_pipe_pool[i][1][1]);
- continue;
- }
-
- fcntl(splice_pipe_pool[i][0][0], F_SETPIPE_SZ,
- c->tcp.pipe_size);
- fcntl(splice_pipe_pool[i][1][0], F_SETPIPE_SZ,
- c->tcp.pipe_size);
- }
-}
-
-/**
* struct tcp_sock_refill_arg - Arguments for tcp_sock_refill()
* @c: Execution context
* @ns: Set to refill pool of sockets created in namespace
@@ -3637,8 +3042,8 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
tcp_sock_init_one(c, 0, port);
}
- for (i = 0; i < ARRAY_SIZE(tcp_l2_mh_tap); i++)
- tcp_l2_mh_tap[i] = (struct mmsghdr) { .msg_hdr.msg_iovlen = 1 };
+ for (i = 0; i < ARRAY_SIZE(tcp_l2_mh); i++)
+ tcp_l2_mh[i] = (struct mmsghdr) { .msg_hdr.msg_iovlen = 1 };
if (c->v4)
tcp_sock4_iov_init();
@@ -3646,7 +3051,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
if (c->v6)
tcp_sock6_iov_init();
- memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool));
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4));
@@ -3659,12 +3063,12 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
tcp_sock_refill(&refill_arg);
if (c->mode == MODE_PASTA) {
- tcp_set_pipe_size(c);
+ tcp_splice_init(c);
+
NS_CALL(tcp_sock_init_ns, c);
refill_arg.ns = 1;
NS_CALL(tcp_sock_refill, &refill_arg);
- tcp_splice_pipe_refill(c);
c->tcp.port_detect_ts = *now;
}
@@ -3678,7 +3082,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
* @conn: Connection pointer
* @ts: Timestamp from caller
*/
-static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_timer_one(struct ctx *c, struct tcp_conn *conn,
struct timespec *ts)
{
int ack_from_tap = timespec_diff_ms(ts, &conn->ts_ack_from_tap);
@@ -3693,67 +3097,49 @@ static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn,
else
tap_data_noack = timespec_diff_ms(ts, &conn->tap_data_noack);
- switch (conn->state) {
- case CLOSED:
+ if (CONN_IS_CLOSED(conn)) {
tcp_hash_remove(conn);
- tcp_table_tap_compact(c, conn);
- break;
- case SOCK_SYN_SENT:
- case TAP_SYN_RCVD:
+ tcp_table_compact(c, conn);
+ return;
+ }
+
+ if (!(conn->events & ESTABLISHED)) {
if (ack_from_tap > SYN_TIMEOUT)
tcp_rst(c, conn);
+ return;
+ }
- break;
- case ESTABLISHED_SOCK_FIN_SENT:
- if (tap_data_noack > FIN_TIMEOUT) {
- tcp_rst(c, conn);
- break;
- }
- /* Falls through */
- case ESTABLISHED:
- case ESTABLISHED_SOCK_FIN:
- if (tap_act > ACT_TIMEOUT && sock_act > ACT_TIMEOUT) {
- tcp_rst(c, conn);
- break;
- }
+ if (tap_act > ACT_TIMEOUT && sock_act > ACT_TIMEOUT)
+ goto rst;
- if (!conn->wnd_to_tap || ack_to_tap > ACK_INTERVAL)
- tcp_send_to_tap(c, conn, 0, ts);
+ if (!conn->wnd_to_tap || ack_to_tap > ACK_INTERVAL)
+ tcp_send_flag(c, conn, ACK_IF_NEEDED, ts);
- if (tap_data_noack > ACK_TIMEOUT) {
- if (conn->seq_ack_from_tap < conn->seq_to_tap) {
- if (tap_data_noack > LAST_ACK_TIMEOUT) {
- tcp_rst(c, conn);
- break;
- }
+ if (tap_data_noack > ACK_TIMEOUT) {
+ if (conn->seq_ack_from_tap < conn->seq_to_tap) {
+ if (tap_data_noack > LAST_ACK_TIMEOUT)
+ goto rst;
- conn->seq_to_tap = conn->seq_ack_from_tap;
- tcp_data_from_sock(c, conn, ts);
- }
+ conn->seq_to_tap = conn->seq_ack_from_tap;
+ tcp_data_from_sock(c, conn, ts);
}
- break;
- case CLOSE_WAIT:
- case FIN_WAIT_1_SOCK_FIN:
- if (tap_data_noack > FIN_TIMEOUT)
- tcp_rst(c, conn);
- break;
- case FIN_WAIT_1:
- if (sock_act > FIN_TIMEOUT)
- tcp_rst(c, conn);
- break;
- case LAST_ACK:
+ return;
+ }
+
+ if (conn->events & TAP_FIN_SENT && tap_data_noack > FIN_TIMEOUT)
+ goto rst;
+
+ if (conn->events & SOCK_FIN_SENT && sock_act > FIN_TIMEOUT)
+ goto rst;
+
+ if (conn->events & SOCK_FIN_SENT && conn->events & SOCK_FIN_RCVD) {
if (sock_act > LAST_ACK_TIMEOUT || tap_act > LAST_ACK_TIMEOUT)
- tcp_rst(c, conn);
- break;
- case TAP_SYN_SENT:
- case SPLICE_ACCEPTED:
- case SPLICE_CONNECT:
- case SPLICE_ESTABLISHED:
- case SPLICE_FIN_FROM:
- case SPLICE_FIN_TO:
- case SPLICE_FIN_BOTH:
- break;
+ goto rst;
}
+
+ return;
+rst:
+ tcp_rst(c, conn);
}
/**
@@ -3904,6 +3290,8 @@ void tcp_timer(struct ctx *c, struct timespec *now)
c->tcp.port_detect_ts = *now;
}
+
+ tcp_splice_timer(c, now);
}
if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) {
@@ -3913,41 +3301,9 @@ void tcp_timer(struct ctx *c, struct timespec *now)
if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
(c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
NS_CALL(tcp_sock_refill, &refill_arg);
-
- tcp_splice_pipe_refill(c);
}
}
- for (i = c->tcp.tap_conn_count - 1; i >= 0; i--)
- tcp_timer_one(c, tt + i, now);
-
- if (c->mode == MODE_PASTA) {
- for (i = c->tcp.splice_conn_count - 1; i >= 0; i--) {
- if ((ts + i)->state == CLOSED) {
- tcp_splice_destroy(c, ts + i);
- continue;
- }
-
- if (bitmap_isset(splice_rcvlowat_set[0], i) &&
- !bitmap_isset(splice_rcvlowat_act[0], i)) {
- int lowat = 1;
-
- setsockopt((ts + i)->from, SOL_SOCKET,
- SO_RCVLOWAT, &lowat, sizeof(lowat));
- bitmap_clear(splice_rcvlowat_set[0], i);
- }
-
- if (bitmap_isset(splice_rcvlowat_set[1], i) &&
- !bitmap_isset(splice_rcvlowat_act[1], i)) {
- int lowat = 1;
-
- setsockopt((ts + i)->to, SOL_SOCKET,
- SO_RCVLOWAT, &lowat, sizeof(lowat));
- bitmap_clear(splice_rcvlowat_set[1], i);
- }
-
- bitmap_clear(splice_rcvlowat_act[0], i);
- bitmap_clear(splice_rcvlowat_act[1], i);
- }
- }
+ for (i = c->tcp.conn_count - 1; i >= 0; i--)
+ tcp_timer_one(c, CONN(i), now);
}
diff --git a/tcp.h b/tcp.h
index 512ee76..b4e3fde 100644
--- a/tcp.h
+++ b/tcp.h
@@ -11,6 +11,8 @@
#define TCP_MAX_CONNS (128 * 1024)
#define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2)
+#define TCP_SOCK_POOL_SIZE 32
+
struct ctx;
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
@@ -19,7 +21,9 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_l4_msg *msg, int count, struct timespec *now);
int tcp_sock_init(struct ctx *c, struct timespec *now);
void tcp_timer(struct ctx *c, struct timespec *now);
-void tcp_defer_handler(struct ctx *c);
+void tcp_defer_handler(struct ctx *c, struct timespec *now);
+
+void tcp_sock_set_bufsize(struct ctx *c, int s);
void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
const uint32_t *ip_da);
void tcp_remap_to_tap(in_port_t port, in_port_t delta);
@@ -46,7 +50,7 @@ union tcp_epoll_ref {
/**
* struct tcp_ctx - Execution context for TCP routines
* @hash_secret: 128-bit secret for hash functions, ISN and hash table
- * @tap_conn_count: Count of tap connections in connection table
+ * @conn_count: Count of connections (not spliced) in connection table
* @splice_conn_count: Count of spliced connections in connection table
* @port_to_tap: Ports bound host-side, packets to tap or spliced
* @init_detect_ports: If set, periodically detect ports bound in init
@@ -60,7 +64,7 @@ union tcp_epoll_ref {
*/
struct tcp_ctx {
uint64_t hash_secret[2];
- int tap_conn_count;
+ int conn_count;
int splice_conn_count;
uint8_t port_to_tap [USHRT_MAX / 8];
int init_detect_ports;
diff --git a/tcp_splice.c b/tcp_splice.c
new file mode 100644
index 0000000..cb8df7b
--- /dev/null
+++ b/tcp_splice.c
@@ -0,0 +1,859 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+/* PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * tcp_splice.c - direct namespace forwarding for local connections
+ *
+ * Copyright (c) 2020-2022 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+/**
+ * DOC: Theory of Operation
+ *
+ *
+ * For local traffic directed to TCP ports configured for direct mapping between
+ * namespaces, packets are directly translated between L4 sockets using a pair
+ * of splice() syscalls. These connections are tracked in the @tc array of
+ * struct tcp_splice_conn, using these events:
+ *
+ * - SPLICE_CONNECT: connection accepted, connecting to target
+ * - SPLICE_ESTABLISHED: connection to target established
+ * - SPLICE_A_OUT_WAIT: pipe to accepted socket full, wait for EPOLLOUT
+ * - SPLICE_B_OUT_WAIT: pipe to target socket full, wait for EPOLLOUT
+ * - SPLICE_A_FIN_RCVD: FIN (EPOLLRDHUP) seen from accepted socket
+ * - SPLICE_B_FIN_RCVD: FIN (EPOLLRDHUP) seen from target socket
+ * - SPLICE_A_FIN_SENT: FIN (write shutdown) sent to accepted socket
+ * - SPLICE_B_FIN_SENT: FIN (write shutdown) sent to target socket
+ *
+ * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
+ */
+
+#include <sched.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include "util.h"
+#include "passt.h"
+
+#define MAX_PIPE_SIZE (2UL * 1024 * 1024)
+#define TCP_SPLICE_MAX_CONNS (128 * 1024)
+#define TCP_SPLICE_PIPE_POOL_SIZE 16
+#define REFILL_INTERVAL 1000 /* ms, refill pool of pipes */
+
+/* From tcp.c */
+extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
+extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
+extern int ns_sock_pool4 [TCP_SOCK_POOL_SIZE];
+extern int ns_sock_pool6 [TCP_SOCK_POOL_SIZE];
+
+/* Pool of pre-opened pipes */
+static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2];
+
+/**
+ * struct tcp_splice_conn - Descriptor for a spliced TCP connection
+ * @a: File descriptor number of socket for accepted connection
+ * @pipe_a_b: Pipe ends for splice() from @a to @b
+ * @b: File descriptor number of peer connected socket
+ * @pipe_b_a: Pipe ends for splice() from @b to @a
+ * @events: Events observed/actions performed on connection
+ * @flags: Connection flags (attributes, not events)
+ * @a_read: Bytes read from @a (not fully written to @b in one shot)
+ * @a_written: Bytes written to @a (not fully written from one @b read)
+ * @b_read: Bytes read from @b (not fully written to @a in one shot)
+ * @b_written: Bytes written to @b (not fully written from one @a read)
+*/
+struct tcp_splice_conn {
+ int a;
+ int pipe_a_b[2];
+ int b;
+ int pipe_b_a[2];
+
+ uint8_t events;
+#define SPLICE_CLOSED 0
+#define SPLICE_CONNECT BIT(0)
+#define SPLICE_ESTABLISHED BIT(1)
+#define SPLICE_A_OUT_WAIT BIT(2)
+#define SPLICE_B_OUT_WAIT BIT(3)
+#define SPLICE_A_FIN_RCVD BIT(4)
+#define SPLICE_B_FIN_RCVD BIT(5)
+#define SPLICE_A_FIN_SENT BIT(6)
+#define SPLICE_B_FIN_SENT BIT(7)
+
+ uint8_t flags;
+#define SPLICE_V6 BIT(0)
+#define SPLICE_IN_EPOLL BIT(1)
+#define SPLICE_RCVLOWAT_SET_A BIT(2)
+#define SPLICE_RCVLOWAT_SET_B BIT(3)
+#define SPLICE_RCVLOWAT_ACT_A BIT(4)
+#define SPLICE_RCVLOWAT_ACT_B BIT(5)
+#define SPLICE_CLOSING BIT(6)
+
+ uint64_t a_read;
+ uint64_t a_written;
+ uint64_t b_read;
+ uint64_t b_written;
+};
+
+#define CONN_V6(x) (x->flags & SPLICE_V6)
+#define CONN_V4(x) (!CONN_V6(x))
+#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
+#define CONN(index) (tc + (index))
+
+/* Spliced connections */
+static struct tcp_splice_conn tc[TCP_SPLICE_MAX_CONNS];
+
+/* Display strings for connection events */
+static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
+ "SPLICE_CONNECT", "SPLICE_ESTABLISHED",
+ "SPLICE_A_OUT_WAIT", "SPLICE_B_OUT_WAIT",
+ "SPLICE_A_FIN_RCVD", "SPLICE_B_FIN_RCVD",
+ "SPLICE_A_FIN_SENT", "SPLICE_B_FIN_SENT",
+};
+
+/* Display strings for connection flags */
+static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
+ "V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B",
+ "RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING",
+};
+
+/**
+ * tcp_splice_conn_epoll_events() - epoll events masks for given state
+ * @events: Connection event flags
+ * @a: Event mask for socket with accepted connection, set on return
+ * @b: Event mask for connection target socket, set on return
+ */
+static void tcp_splice_conn_epoll_events(uint16_t events,
+ uint32_t *a, uint32_t *b)
+{
+ *a = *b = 0;
+
+ if (events & SPLICE_CLOSED)
+ return;
+
+ if (events & SPLICE_ESTABLISHED)
+ *a = *b = EPOLLIN | EPOLLRDHUP;
+ else if (events & SPLICE_CONNECT)
+ *b = EPOLLOUT;
+
+ *a |= (events & SPLICE_A_OUT_WAIT) ? EPOLLOUT : 0;
+ *b |= (events & SPLICE_B_OUT_WAIT) ? EPOLLOUT : 0;
+}
+
+static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn);
+
+/**
+ * conn_flag_do() - Set/unset given flag, log, update epoll on SPLICE_CLOSING
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @flag: Flag to set, or ~flag to unset
+ */
+static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn,
+ unsigned long flag)
+{
+ if (flag & (flag - 1)) {
+ if (!(conn->flags & ~flag))
+ return;
+
+ conn->flags &= flag;
+ debug("TCP (spliced): index %i: %s dropped", (conn) - tc,
+ tcp_splice_flag_str[fls(~flag)]);
+ } else {
+ if (conn->flags & flag)
+ return;
+
+ conn->flags |= flag;
+ debug("TCP (spliced): index %i: %s", (conn) - tc,
+ tcp_splice_flag_str[fls(flag)]);
+ }
+
+ if (flag == SPLICE_CLOSING)
+ tcp_splice_epoll_ctl(c, conn);
+}
+
+#define conn_flag(c, conn, flag) \
+ do { \
+ trace("TCP (spliced): flag at %s:%i", \
+ __func__, __LINE__); \
+ conn_flag_do(c, conn, flag); \
+ } while (0)
+
+/**
+ * tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: 0 on success, negative error code on failure (not on deletion)
+ */
+static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn)
+{
+ int m = (conn->flags & SPLICE_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
+ .r.p.tcp.tcp.splice = 1,
+ .r.p.tcp.tcp.index = conn - tc,
+ .r.p.tcp.tcp.v6 = CONN_V6(conn) };
+ union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b,
+ .r.p.tcp.tcp.splice = 1,
+ .r.p.tcp.tcp.index = conn - tc,
+ .r.p.tcp.tcp.v6 = CONN_V6(conn) };
+ struct epoll_event ev_a = { .data.u64 = ref_a.u64 };
+ struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
+ uint32_t events_a, events_b;
+
+ if (conn->flags & SPLICE_CLOSING) {
+ if (conn->flags & SPLICE_IN_EPOLL)
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a);
+
+ if (conn->events & SPLICE_CONNECT)
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b);
+
+ return 0;
+ }
+
+ tcp_splice_conn_epoll_events(conn->events, &events_a, &events_b);
+ ev_a.events = events_a;
+ ev_b.events = events_b;
+
+ if (epoll_ctl(c->epollfd, m, conn->a, &ev_a) ||
+ epoll_ctl(c->epollfd, m, conn->b, &ev_b))
+ goto err;
+
+ conn->flags |= SPLICE_IN_EPOLL; /* No need to log this */
+
+ return 0;
+
+err:
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a);
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b);
+ return -errno;
+}
+
+/**
+ * conn_event_do() - Set and log connection events, update epoll state
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @event: Connection event
+ */
+static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn,
+ unsigned long event)
+{
+ if (event == SPLICE_CLOSED) {
+ conn->events = SPLICE_CLOSED;
+ debug("TCP (spliced): index %i, CLOSED", conn - tc);
+ return;
+ }
+
+ if (event & (event - 1)) {
+ if (!(conn->events & ~event))
+ return;
+
+ conn->events &= event;
+ debug("TCP (spliced): index %i, ~%s", conn - tc,
+ tcp_splice_event_str[fls(~event)]);
+ } else {
+ if (conn->events & event)
+ return;
+
+ conn->events |= event;
+ debug("TCP (spliced): index %i, %s", conn - tc,
+ tcp_splice_event_str[fls(event)]);
+ }
+
+ if (tcp_splice_epoll_ctl(c, conn))
+ conn_flag(c, conn, SPLICE_CLOSING);
+}
+
+#define conn_event(c, conn, event) \
+ do { \
+ trace("TCP (spliced): event at %s:%i", \
+ __func__, __LINE__); \
+ conn_event_do(c, conn, event); \
+ } while (0)
+
+/**
+ * tcp_table_splice_compact() - Compact spliced connection table
+ * @c: Execution context
+ * @hole: Pointer to recently closed connection
+ */
+static void tcp_table_splice_compact(struct ctx *c,
+ struct tcp_splice_conn *hole)
+{
+ struct tcp_splice_conn *move;
+
+ if ((hole - tc) == --c->tcp.splice_conn_count) {
+ debug("TCP (spliced): index %i (max) removed", hole - tc);
+ return;
+ }
+
+ move = CONN(c->tcp.splice_conn_count);
+
+ memcpy(hole, move, sizeof(*hole));
+
+ move->a = move->b = -1;
+ move->flags = move->events = 0;
+ move->a_read = move->a_written = move->b_read = move->b_written = 0;
+
+ debug("TCP (spliced): index %i moved to %i", move - tc, hole - tc);
+ if (tcp_splice_epoll_ctl(c, hole))
+ conn_flag(c, hole, SPLICE_CLOSING);
+}
+
+/**
+ * tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll
+ * @c: Execution context
+ * @conn: Connection pointer
+ */
+static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
+{
+ if (conn->events & SPLICE_ESTABLISHED) {
+ /* Flushing might need to block: don't recycle them. */
+ if (conn->pipe_a_b[0] != -1) {
+ close(conn->pipe_a_b[0]);
+ close(conn->pipe_a_b[1]);
+ conn->pipe_a_b[0] = conn->pipe_a_b[1] = -1;
+ }
+ if (conn->pipe_b_a[0] != -1) {
+ close(conn->pipe_b_a[0]);
+ close(conn->pipe_b_a[1]);
+ conn->pipe_b_a[0] = conn->pipe_b_a[1] = -1;
+ }
+ }
+
+ if (conn->events & SPLICE_CONNECT) {
+ close(conn->b);
+ conn->b = -1;
+ }
+
+ conn_event(c, conn, SPLICE_CLOSED);
+
+ close(conn->a);
+ conn->a = -1;
+ conn->flags = 0;
+ conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0;
+
+ tcp_table_splice_compact(c, conn);
+}
+
+/**
+ * tcp_splice_connect_finish() - Completion of connect() or call on success
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: 0 on success, -EIO on failure
+ */
+static int tcp_splice_connect_finish(struct ctx *c,
+ struct tcp_splice_conn *conn)
+{
+ int i;
+
+ conn->pipe_a_b[0] = conn->pipe_b_a[0] = -1;
+ conn->pipe_a_b[1] = conn->pipe_b_a[1] = -1;
+
+ for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
+ if (splice_pipe_pool[i][0][0] > 0) {
+ SWAP(conn->pipe_a_b[0], splice_pipe_pool[i][0][0]);
+ SWAP(conn->pipe_a_b[1], splice_pipe_pool[i][0][1]);
+
+ SWAP(conn->pipe_b_a[0], splice_pipe_pool[i][1][0]);
+ SWAP(conn->pipe_b_a[1], splice_pipe_pool[i][1][1]);
+ break;
+ }
+ }
+
+ if (conn->pipe_a_b[0] < 0) {
+ if (pipe2(conn->pipe_a_b, O_NONBLOCK) ||
+ pipe2(conn->pipe_b_a, O_NONBLOCK)) {
+ conn_flag(c, conn, SPLICE_CLOSING);
+ return -EIO;
+ }
+
+ fcntl(conn->pipe_a_b[0], F_SETPIPE_SZ, c->tcp.pipe_size);
+ fcntl(conn->pipe_b_a[0], F_SETPIPE_SZ, c->tcp.pipe_size);
+ }
+
+ if (!(conn->events & SPLICE_ESTABLISHED))
+ conn_event(c, conn, SPLICE_ESTABLISHED);
+
+ return 0;
+}
+
+/**
+ * tcp_splice_connect() - Create and connect socket for new spliced connection
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @s: Accepted socket
+ * @port: Destination port, host order
+ *
+ * Return: 0 for connect() succeeded or in progress, negative value on error
+ */
+static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn,
+ int s, in_port_t port)
+{
+ int sock_conn = (s >= 0) ? s : socket(CONN_V6(conn) ? AF_INET6 :
+ AF_INET,
+ SOCK_STREAM | SOCK_NONBLOCK,
+ IPPROTO_TCP);
+ struct sockaddr_in6 addr6 = {
+ .sin6_family = AF_INET6,
+ .sin6_port = htons(port),
+ .sin6_addr = IN6ADDR_LOOPBACK_INIT,
+ };
+ struct sockaddr_in addr4 = {
+ .sin_family = AF_INET,
+ .sin_port = htons(port),
+ .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
+ };
+ const struct sockaddr *sa;
+ socklen_t sl;
+
+ conn->b = sock_conn;
+
+ if (s < 0)
+ tcp_sock_set_bufsize(c, conn->b);
+
+ setsockopt(conn->b, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int));
+
+ if (CONN_V6(conn)) {
+ sa = (struct sockaddr *)&addr6;
+ sl = sizeof(addr6);
+ } else {
+ sa = (struct sockaddr *)&addr4;
+ sl = sizeof(addr4);
+ }
+
+ if (connect(conn->b, sa, sl)) {
+ if (errno != EINPROGRESS) {
+ int ret = -errno;
+
+ close(sock_conn);
+ return ret;
+ }
+ conn_event(c, conn, SPLICE_CONNECT);
+ } else {
+ conn_event(c, conn, SPLICE_ESTABLISHED);
+ return tcp_splice_connect_finish(c, conn);
+ }
+
+ return 0;
+}
+
+/**
+ * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
+ * @c: Execution context
+ * @conn: Accepted inbound connection
+ * @port: Destination port, host order
+ * @ret: Return value of tcp_splice_connect_ns()
+ */
+struct tcp_splice_connect_ns_arg {
+ struct ctx *c;
+ struct tcp_splice_conn *conn;
+ in_port_t port;
+ int ret;
+};
+
+/**
+ * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
+ * @arg: See struct tcp_splice_connect_ns_arg
+ *
+ * Return: 0
+ */
+static int tcp_splice_connect_ns(void *arg)
+{
+ struct tcp_splice_connect_ns_arg *a;
+
+ a = (struct tcp_splice_connect_ns_arg *)arg;
+ ns_enter(a->c);
+ a->ret = tcp_splice_connect(a->c, a->conn, -1, a->port);
+ return 0;
+}
+
+/**
+ * tcp_splice_new() - Handle new inbound, spliced connection
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @port: Destination port, host order
+ *
+ * Return: return code from connect()
+ */
+static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn,
+ in_port_t port)
+{
+ struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 };
+ int *sock_pool_p, i, s = -1;
+
+ if (bitmap_isset(c->tcp.port_to_tap, port))
+ sock_pool_p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
+ else
+ sock_pool_p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
+
+ for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, sock_pool_p++) {
+ if ((s = *sock_pool_p) >= 0) {
+ *sock_pool_p = -1;
+ break;
+ }
+ }
+
+ if (s < 0 && bitmap_isset(c->tcp.port_to_tap, port)) {
+ NS_CALL(tcp_splice_connect_ns, &ns_arg);
+ return ns_arg.ret;
+ }
+
+ return tcp_splice_connect(c, conn, s, port);
+}
+
+/**
+ * tcp_splice_dir() - Set sockets/pipe pointers reflecting flow direction
+ * @conn: Connection pointers
+ * @ref_sock: Socket returned as reference from epoll
+ * @reverse: Reverse direction: @ref_sock is used as destination
+ * @from: Destination socket pointer to set
+ * @to: Source socket pointer to set
+ * @pipes: Pipe set, assigned on return
+ */
+static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock,
+ int reverse, int *from, int *to, int **pipes)
+{
+ if (!reverse) {
+ *from = ref_sock;
+ *to = (*from == conn->a) ? conn->b : conn->a;
+ } else {
+ *to = ref_sock;
+ *from = (*to == conn->a) ? conn->b : conn->a;
+ }
+
+ *pipes = *from == conn->a ? conn->pipe_a_b : conn->pipe_b_a;
+}
+
+/**
+ * tcp_sock_handler_splice() - Handler for socket mapped to spliced connection
+ * @c: Execution context
+ * @ref: epoll reference
+ * @events: epoll events bitmap
+ *
+ * #syscalls:pasta splice
+ */
+void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
+ uint32_t events)
+{
+ uint8_t lowat_set_flag, lowat_act_flag;
+ int from, to, *pipes, eof, never_read;
+ uint64_t *seq_read, *seq_write;
+ struct tcp_splice_conn *conn;
+
+ if (ref.r.p.tcp.tcp.listen) {
+ int s;
+
+ if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS)
+ return;
+
+ if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
+ return;
+
+ setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }),
+ sizeof(int));
+
+ conn = CONN(c->tcp.splice_conn_count++);
+ conn->a = s;
+ conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0;
+
+ if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index))
+ conn_flag(c, conn, SPLICE_CLOSING);
+
+ return;
+ }
+
+ conn = CONN(ref.r.p.tcp.tcp.index);
+
+ if (events & EPOLLERR || events & EPOLLHUP)
+ goto close;
+
+ if (conn->events == SPLICE_CONNECT) {
+ if (!(events & EPOLLOUT))
+ goto close;
+ if (tcp_splice_connect_finish(c, conn))
+ goto close;
+ }
+
+ if (events & EPOLLOUT) {
+ if (ref.r.s == conn->a)
+ conn_event(c, conn, ~SPLICE_A_OUT_WAIT);
+ else
+ conn_event(c, conn, ~SPLICE_B_OUT_WAIT);
+
+ tcp_splice_dir(conn, ref.r.s, 1, &from, &to, &pipes);
+ } else {
+ tcp_splice_dir(conn, ref.r.s, 0, &from, &to, &pipes);
+ }
+
+ if (events & EPOLLRDHUP) {
+ if (ref.r.s == conn->a)
+ conn_event(c, conn, SPLICE_A_FIN_RCVD);
+ else
+ conn_event(c, conn, SPLICE_B_FIN_RCVD);
+ }
+
+swap:
+ eof = 0;
+ never_read = 1;
+
+ if (from == conn->a) {
+ seq_read = &conn->a_read;
+ seq_write = &conn->a_written;
+ lowat_set_flag = SPLICE_RCVLOWAT_SET_A;
+ lowat_act_flag = SPLICE_RCVLOWAT_ACT_A;
+ } else {
+ seq_read = &conn->b_read;
+ seq_write = &conn->b_written;
+ lowat_set_flag = SPLICE_RCVLOWAT_SET_B;
+ lowat_act_flag = SPLICE_RCVLOWAT_ACT_B;
+ }
+
+ while (1) {
+ int retry_write = 0, more = 0;
+ ssize_t readlen, to_write = 0, written;
+
+retry:
+ readlen = splice(from, NULL, pipes[1], NULL, c->tcp.pipe_size,
+ SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+ if (readlen < 0) {
+ if (errno == EINTR)
+ goto retry;
+
+ if (errno != EAGAIN)
+ goto close;
+
+ to_write = c->tcp.pipe_size;
+ } else if (!readlen) {
+ eof = 1;
+ to_write = c->tcp.pipe_size;
+ } else {
+ never_read = 0;
+ to_write += readlen;
+ if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
+ more = SPLICE_F_MORE;
+
+ if (conn->flags & lowat_set_flag)
+ conn_flag(c, conn, lowat_act_flag);
+ }
+
+eintr:
+ written = splice(pipes[0], NULL, to, NULL, to_write,
+ SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+
+ /* Most common case: skip updating counters. */
+ if (readlen > 0 && readlen == written) {
+ if (readlen >= (long)c->tcp.pipe_size * 10 / 100)
+ continue;
+
+ if (conn->flags & lowat_set_flag &&
+ readlen > (long)c->tcp.pipe_size / 10) {
+ int lowat = c->tcp.pipe_size / 4;
+
+ setsockopt(from, SOL_SOCKET, SO_RCVLOWAT,
+ &lowat, sizeof(lowat));
+
+ conn_flag(c, conn, lowat_set_flag);
+ conn_flag(c, conn, lowat_act_flag);
+ }
+
+ break;
+ }
+
+ *seq_read += readlen > 0 ? readlen : 0;
+ *seq_write += written > 0 ? written : 0;
+
+ if (written < 0) {
+ if (errno == EINTR)
+ goto eintr;
+
+ if (errno != EAGAIN)
+ goto close;
+
+ if (never_read)
+ break;
+
+ if (retry_write--)
+ goto retry;
+
+ if (to == conn->a)
+ conn_event(c, conn, SPLICE_A_OUT_WAIT);
+ else
+ conn_event(c, conn, SPLICE_B_OUT_WAIT);
+ break;
+ }
+
+ if (never_read && written == (long)(c->tcp.pipe_size))
+ goto retry;
+
+ if (!never_read && written < to_write) {
+ to_write -= written;
+ goto retry;
+ }
+
+ if (eof)
+ break;
+ }
+
+ if ( (conn->events & SPLICE_A_FIN_RCVD) &&
+ !(conn->events & SPLICE_B_FIN_SENT)) {
+ if (*seq_read == *seq_write) {
+ shutdown(conn->b, SHUT_WR);
+ conn_event(c, conn, SPLICE_B_FIN_SENT);
+ }
+ }
+
+ if ( (conn->events & SPLICE_B_FIN_RCVD) &&
+ !(conn->events & SPLICE_A_FIN_SENT)) {
+ if (*seq_read == *seq_write) {
+ shutdown(conn->a, SHUT_WR);
+ conn_event(c, conn, SPLICE_A_FIN_SENT);
+ }
+ }
+
+ if (CONN_HAS(conn, SPLICE_A_FIN_SENT | SPLICE_B_FIN_SENT))
+ goto close;
+
+ if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
+ events = EPOLLIN;
+
+ SWAP(from, to);
+ if (pipes == conn->pipe_a_b)
+ pipes = conn->pipe_b_a;
+ else
+ pipes = conn->pipe_a_b;
+
+ goto swap;
+ }
+
+ return;
+
+close:
+ conn_flag(c, conn, SPLICE_CLOSING);
+}
+
+/**
+ * tcp_set_pipe_size() - Set usable pipe size, probe starting from MAX_PIPE_SIZE
+ * @c: Execution context
+ */
+static void tcp_set_pipe_size(struct ctx *c)
+{
+ int probe_pipe[TCP_SPLICE_PIPE_POOL_SIZE * 2][2], i, j;
+
+ c->tcp.pipe_size = MAX_PIPE_SIZE;
+
+smaller:
+ for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE * 2; i++) {
+ if (pipe2(probe_pipe[i], 0)) {
+ i++;
+ break;
+ }
+
+ if (fcntl(probe_pipe[i][0], F_SETPIPE_SZ, c->tcp.pipe_size) < 0)
+ break;
+ }
+
+ for (j = i - 1; j >= 0; j--) {
+ close(probe_pipe[j][0]);
+ close(probe_pipe[j][1]);
+ }
+
+ if (i == TCP_SPLICE_PIPE_POOL_SIZE * 2)
+ return;
+
+ if (!(c->tcp.pipe_size /= 2)) {
+ c->tcp.pipe_size = MAX_PIPE_SIZE;
+ return;
+ }
+
+ goto smaller;
+}
+
+/**
+ * tcp_splice_pipe_refill() - Refill pool of pre-opened pipes
+ * @c: Execution context
+ */
+static void tcp_splice_pipe_refill(struct ctx *c)
+{
+ int i;
+
+ for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
+ if (splice_pipe_pool[i][0][0] >= 0)
+ break;
+ if (pipe2(splice_pipe_pool[i][0], O_NONBLOCK))
+ continue;
+ if (pipe2(splice_pipe_pool[i][1], O_NONBLOCK)) {
+ close(splice_pipe_pool[i][1][0]);
+ close(splice_pipe_pool[i][1][1]);
+ continue;
+ }
+
+ fcntl(splice_pipe_pool[i][0][0], F_SETPIPE_SZ,
+ c->tcp.pipe_size);
+ fcntl(splice_pipe_pool[i][1][0], F_SETPIPE_SZ,
+ c->tcp.pipe_size);
+ }
+}
+
+/**
+ * tcp_splice_init() - Initialise pipe pool and size
+ * @c: Execution context
+ */
+void tcp_splice_init(struct ctx *c)
+{
+ memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool));
+ tcp_set_pipe_size(c);
+ tcp_splice_pipe_refill(c);
+}
+
+/**
+ * tcp_splice_timer() - Timer for spliced connections
+ * @c: Execution context
+ * @now: Current timestamp
+ */
+void tcp_splice_timer(struct ctx *c, struct timespec *now)
+{
+ int i;
+
+ for (i = c->tcp.splice_conn_count - 1; i >= 0; i--) {
+ struct tcp_splice_conn *conn;
+
+ conn = CONN(i);
+
+ if (conn->flags & SPLICE_CLOSING) {
+ tcp_splice_destroy(c, conn);
+ continue;
+ }
+
+ if ( (conn->flags & SPLICE_RCVLOWAT_SET_A) &&
+ !(conn->flags & SPLICE_RCVLOWAT_ACT_A)) {
+ setsockopt(conn->a, SOL_SOCKET, SO_RCVLOWAT,
+ &((int){ 1 }), sizeof(int));
+ conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_A);
+ }
+
+ if ( (conn->flags & SPLICE_RCVLOWAT_SET_B) &&
+ !(conn->flags & SPLICE_RCVLOWAT_ACT_B)) {
+ setsockopt(conn->b, SOL_SOCKET, SO_RCVLOWAT,
+ &((int){ 1 }), sizeof(int));
+ conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_B);
+ }
+
+ conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_A);
+ conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_B);
+ }
+
+ if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL)
+ tcp_splice_pipe_refill(c);
+}
diff --git a/tcp_splice.h b/tcp_splice.h
new file mode 100644
index 0000000..45ab1ca
--- /dev/null
+++ b/tcp_splice.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: AGPL-3.0-or-later
+ * Copyright (c) 2022 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#define TCP_SPLICE_MAX_CONNS (128 * 1024)
+
+struct tcp_splice_conn;
+
+void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
+ uint32_t events);
+void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn);
+void tcp_splice_init(struct ctx *c);
+void tcp_splice_timer(struct ctx *c, struct timespec *now);
diff --git a/util.c b/util.c
index 50b83db..2d8952a 100644
--- a/util.c
+++ b/util.c
@@ -589,3 +589,22 @@ int __daemon(int pidfile_fd, int devnull_fd)
return 0;
}
+
+/**
+ * fls() - Find last (most significant) bit set in word
+ * @x: Word
+ *
+ * Return: position of most significant bit set, starting from 0, -1 if none
+ */
+int fls(unsigned long x)
+{
+ int y = 0;
+
+ if (!x)
+ return -1;
+
+ while (x >>= 1)
+ y++;
+
+ return y;
+}
diff --git a/util.h b/util.h
index e314c71..3073f58 100644
--- a/util.h
+++ b/util.h
@@ -37,7 +37,8 @@ void trace_init(int enable);
#define ROUND_DOWN(x, y) ((x) & ~((y) - 1))
#define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1))
-#define BITMAP_BIT(n) (1UL << (n) % (sizeof(long) * 8))
+#define BIT(n) (1UL << (n))
+#define BITMAP_BIT(n) (BIT((n) % (sizeof(long) * 8)))
#define BITMAP_WORD(n) (n / (sizeof(long) * 8))
#define SWAP(a, b) \
@@ -208,3 +209,4 @@ void drop_caps(void);
int ns_enter(struct ctx *c);
void write_pidfile(int fd, pid_t pid);
int __daemon(int pidfile_fd, int devnull_fd);
+int fls(unsigned long x);