aboutgitcodebugslistschat
diff options
context:
space:
mode:
-rw-r--r--Makefile3
-rw-r--r--tcp.c206
-rw-r--r--tcp_conn.h168
-rw-r--r--tcp_splice.c93
4 files changed, 245 insertions, 225 deletions
diff --git a/Makefile b/Makefile
index cc4f014..a9b4f79 100644
--- a/Makefile
+++ b/Makefile
@@ -47,7 +47,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h icmp.h \
isolation.h lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h \
- pcap.h port_fwd.h siphash.h tap.h tcp.h tcp_splice.h udp.h util.h
+ pcap.h port_fwd.h siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h \
+ util.h
HEADERS = $(PASST_HEADERS) seccomp.h
# On gcc 11 and 12, with -O2 and -flto, tcp_hash() and siphash_20b(), if
diff --git a/tcp.c b/tcp.c
index 34d7d45..189041c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -98,7 +98,7 @@
* Connection tracking and storage
* -------------------------------
*
- * Connections are tracked by the @tc array of struct tcp_conn, containing
+ * Connections are tracked by the @tc array of struct tcp_tap_conn, containing
* addresses, ports, TCP states and parameters. This is statically allocated and
* indexed by an arbitrary connection number. The array is compacted whenever a
* connection is closed, by remapping the highest connection index in use to the
@@ -301,6 +301,8 @@
#include "tcp_splice.h"
#include "log.h"
+#include "tcp_conn.h"
+
#define TCP_FRAMES_MEM 128
#define TCP_FRAMES \
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
@@ -308,7 +310,6 @@
#define TCP_FILE_PRESSURE 30 /* % of c->nofile */
#define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */
-#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1)
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \
TCP_HASH_TABLE_LOAD)
@@ -402,117 +403,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#define OPT_SACK 5
#define OPT_TS 8
-/**
- * struct tcp_conn - Descriptor for a TCP connection (not spliced)
- * @next_index: Connection index of next item in hash chain, -1 for none
- * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
- * @sock: Socket descriptor number
- * @events: Connection events, implying connection states
- * @timer: timerfd descriptor for timeout events
- * @flags: Connection flags representing internal attributes
- * @hash_bucket: Bucket index in connection lookup hash table
- * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
- * @ws_from_tap: Window scaling factor advertised from tap/guest
- * @ws_to_tap: Window scaling factor advertised to tap/guest
- * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
- * @seq_dup_ack_approx: Last duplicate ACK number sent to tap
- * @a.a6: IPv6 remote address, can be IPv4-mapped
- * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20
- * @a.a4.one: Ones prefix for IPv4-mapped
- * @a.a4.a: IPv4 address
- * @tap_port: Guest-facing tap port
- * @sock_port: Remote, socket-facing port
- * @wnd_from_tap: Last window size from tap, unscaled (as received)
- * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
- * @seq_to_tap: Next sequence for packets to tap
- * @seq_ack_from_tap: Last ACK number received from tap
- * @seq_from_tap: Next sequence for packets from tap (not actually sent)
- * @seq_ack_to_tap: Last ACK number sent to tap
- * @seq_init_from_tap: Initial sequence number from tap
- */
-struct tcp_conn {
- int next_index :TCP_CONN_INDEX_BITS + 2;
-
-#define TCP_RETRANS_BITS 3
- unsigned int retrans :TCP_RETRANS_BITS;
-#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1)
-
-#define TCP_WS_BITS 4 /* RFC 7323 */
-#define TCP_WS_MAX 14
- unsigned int ws_from_tap :TCP_WS_BITS;
- unsigned int ws_to_tap :TCP_WS_BITS;
-
-
- int sock :SOCKET_REF_BITS;
-
- uint8_t events;
-#define CLOSED 0
-#define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */
-#define TAP_SYN_RCVD BIT(1) /* implies socket connecting */
-#define TAP_SYN_ACK_SENT BIT( 3) /* implies socket connected */
-#define ESTABLISHED BIT(2)
-#define SOCK_FIN_RCVD BIT( 3)
-#define SOCK_FIN_SENT BIT( 4)
-#define TAP_FIN_RCVD BIT( 5)
-#define TAP_FIN_SENT BIT( 6)
-#define TAP_FIN_ACKED BIT( 7)
-
-#define CONN_STATE_BITS /* Setting these clears other flags */ \
- (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
-
-
- int timer :SOCKET_REF_BITS;
-
- uint8_t flags;
-#define STALLED BIT(0)
-#define LOCAL BIT(1)
-#define WND_CLAMPED BIT(2)
-#define IN_EPOLL BIT(3)
-#define ACTIVE_CLOSE BIT(4)
-#define ACK_TO_TAP_DUE BIT(5)
-#define ACK_FROM_TAP_DUE BIT(6)
-
-
- unsigned int hash_bucket :TCP_HASH_BUCKET_BITS;
-
-#define TCP_MSS_BITS 14
- unsigned int tap_mss :TCP_MSS_BITS;
-#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
-#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
-
-
-#define SNDBUF_BITS 24
- unsigned int sndbuf :SNDBUF_BITS;
-#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
-#define SNDBUF_GET(conn) (conn->sndbuf << (32 - SNDBUF_BITS))
-
- uint8_t seq_dup_ack_approx;
-
-
- union {
- struct in6_addr a6;
- struct {
- uint8_t zero[10];
- uint8_t one[2];
- struct in_addr a;
- } a4;
- } a;
#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6)
#define CONN_V6(conn) (!CONN_V4(conn))
-
- in_port_t tap_port;
- in_port_t sock_port;
-
- uint16_t wnd_from_tap;
- uint16_t wnd_to_tap;
-
- uint32_t seq_to_tap;
- uint32_t seq_ack_from_tap;
- uint32_t seq_from_tap;
- uint32_t seq_ack_to_tap;
- uint32_t seq_init_from_tap;
-};
-
#define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \
(conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@@ -695,7 +587,7 @@ static unsigned int tcp6_l2_flags_buf_used;
static size_t tcp6_l2_flags_buf_bytes;
/* TCP connections */
-static struct tcp_conn tc[TCP_MAX_CONNS];
+static struct tcp_tap_conn tc[TCP_MAX_CONNS];
#define CONN(index) (tc + (index))
#define CONN_IDX(conn) ((conn) - tc)
@@ -705,7 +597,7 @@ static struct tcp_conn tc[TCP_MAX_CONNS];
*
* Return: pointer to connection, or NULL if @index is out of bounds
*/
-static inline struct tcp_conn *conn_at_idx(int index)
+static inline struct tcp_tap_conn *conn_at_idx(int index)
{
if ((index < 0) || (index >= TCP_MAX_CONNS))
return NULL;
@@ -713,7 +605,7 @@ static inline struct tcp_conn *conn_at_idx(int index)
}
/* Table for lookup from remote address, local port, remote port */
-static struct tcp_conn *tc_hash[TCP_HASH_TABLE_SIZE];
+static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE];
/* Pools for pre-opened sockets */
int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
@@ -749,7 +641,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
return EPOLLRDHUP;
}
-static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn,
+static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag);
#define conn_flag(c, conn, flag) \
do { \
@@ -764,7 +656,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn,
*
* Return: 0 on success, negative error code on failure (not on deletion)
*/
-static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn)
+static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
@@ -809,7 +701,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn)
*
* #syscalls timerfd_create timerfd_settime
*/
-static void tcp_timer_ctl(const struct ctx *c, struct tcp_conn *conn)
+static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
struct itimerspec it = { { 0 }, { 0 } };
@@ -865,7 +757,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_conn *conn)
* @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset
*/
-static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn,
+static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag)
{
if (flag & (flag - 1)) {
@@ -903,7 +795,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn,
* @conn: Connection pointer
* @event: Connection event
*/
-static void conn_event_do(const struct ctx *c, struct tcp_conn *conn,
+static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long event)
{
int prev, new, num = fls(event);
@@ -963,7 +855,7 @@ static void conn_event_do(const struct ctx *c, struct tcp_conn *conn,
*
* Return: 1 if destination is in low RTT table, 0 otherwise
*/
-static int tcp_rtt_dst_low(const struct tcp_conn *conn)
+static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
{
int i;
@@ -979,7 +871,7 @@ static int tcp_rtt_dst_low(const struct tcp_conn *conn)
* @conn: Connection pointer
* @tinfo: Pointer to struct tcp_info for socket
*/
-static void tcp_rtt_dst_check(const struct tcp_conn *conn,
+static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
const struct tcp_info *tinfo)
{
#ifdef HAS_MIN_RTT
@@ -1016,7 +908,7 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn,
* tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
* @conn: Connection pointer
*/
-static void tcp_get_sndbuf(struct tcp_conn *conn)
+static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
{
int s = conn->sock, sndbuf;
socklen_t sl;
@@ -1290,7 +1182,8 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
*
* Return: 1 on match, 0 otherwise
*/
-static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
+static int tcp_hash_match(const struct tcp_tap_conn *conn,
+ int af, const void *addr,
in_port_t tap_port, in_port_t sock_port)
{
if (af == AF_INET && CONN_V4(conn) &&
@@ -1356,7 +1249,7 @@ static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr,
* @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to in_addr or in6_addr
*/
-static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn,
+static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn,
int af, const void *addr)
{
int b;
@@ -1374,9 +1267,9 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn,
* tcp_hash_remove() - Drop connection from hash table, chain unlink
* @conn: Connection pointer
*/
-static void tcp_hash_remove(const struct tcp_conn *conn)
+static void tcp_hash_remove(const struct tcp_tap_conn *conn)
{
- struct tcp_conn *entry, *prev = NULL;
+ struct tcp_tap_conn *entry, *prev = NULL;
int b = conn->hash_bucket;
for (entry = tc_hash[b]; entry;
@@ -1400,9 +1293,9 @@ static void tcp_hash_remove(const struct tcp_conn *conn)
* @old: Old connection pointer
* @new: New connection pointer
*/
-static void tcp_hash_update(struct tcp_conn *old, struct tcp_conn *new)
+static void tcp_hash_update(struct tcp_tap_conn *old, struct tcp_tap_conn *new)
{
- struct tcp_conn *entry, *prev = NULL;
+ struct tcp_tap_conn *entry, *prev = NULL;
int b = old->hash_bucket;
for (entry = tc_hash[b]; entry;
@@ -1431,12 +1324,13 @@ static void tcp_hash_update(struct tcp_conn *old, struct tcp_conn *new)
*
* Return: connection pointer, if found, -ENOENT otherwise
*/
-static struct tcp_conn *tcp_hash_lookup(const struct ctx *c, int af,
- const void *addr,
- in_port_t tap_port, in_port_t sock_port)
+static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c,
+ int af, const void *addr,
+ in_port_t tap_port,
+ in_port_t sock_port)
{
int b = tcp_hash(c, af, addr, tap_port, sock_port);
- struct tcp_conn *conn;
+ struct tcp_tap_conn *conn;
for (conn = tc_hash[b]; conn; conn = conn_at_idx(conn->next_index)) {
if (tcp_hash_match(conn, af, addr, tap_port, sock_port))
@@ -1451,9 +1345,9 @@ static struct tcp_conn *tcp_hash_lookup(const struct ctx *c, int af,
* @c: Execution context
* @hole: Pointer to recently closed connection
*/
-static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
+static void tcp_table_compact(struct ctx *c, struct tcp_tap_conn *hole)
{
- struct tcp_conn *from, *to;
+ struct tcp_tap_conn *from, *to;
if (CONN_IDX(hole) == --c->tcp.conn_count) {
debug("TCP: hash table compaction: maximum index was %li (%p)",
@@ -1482,7 +1376,7 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn)
+static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn)
{
close(conn->sock);
if (conn->timer != -1)
@@ -1492,7 +1386,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn)
tcp_table_compact(c, conn);
}
-static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn);
+static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
#define tcp_rst(c, conn) \
do { \
debug("TCP: index %li, reset at %s:%i", CONN_IDX(conn), \
@@ -1627,7 +1521,7 @@ void tcp_defer_handler(struct ctx *c)
{
int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE;
int max_files = c->nofile / 100 * TCP_FILE_PRESSURE;
- struct tcp_conn *conn;
+ struct tcp_tap_conn *conn;
tcp_l2_flags_buf_flush(c);
tcp_l2_data_buf_flush(c);
@@ -1656,7 +1550,7 @@ void tcp_defer_handler(struct ctx *c)
* Return: 802.3 length, host order
*/
static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
- const struct tcp_conn *conn,
+ const struct tcp_tap_conn *conn,
void *p, size_t plen,
const uint16_t *check, uint32_t seq)
{
@@ -1738,7 +1632,7 @@ do { \
*
* Return: 1 if sequence or window were updated, 0 otherwise
*/
-static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_conn *conn,
+static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
int force_seq, struct tcp_info *tinfo)
{
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
@@ -1824,7 +1718,7 @@ out:
*
* Return: negative error code on connection reset, 0 otherwise
*/
-static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
+static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
@@ -1971,7 +1865,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn)
+static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
{
if (conn->events == CLOSED)
return;
@@ -1986,7 +1880,7 @@ static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn)
* @opts: Pointer to start of TCP options
* @optlen: Bytes in options: caller MUST ensure available length
*/
-static void tcp_get_tap_ws(struct tcp_conn *conn,
+static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
const char *opts, size_t optlen)
{
int ws = tcp_opt_get(opts, optlen, OPT_WS, NULL, NULL);
@@ -2003,7 +1897,7 @@ static void tcp_get_tap_ws(struct tcp_conn *conn,
* @conn: Connection pointer
* @window: Window value, host order, unscaled
*/
-static void tcp_clamp_window(const struct ctx *c, struct tcp_conn *conn,
+static void tcp_clamp_window(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned wnd)
{
uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap;
@@ -2125,7 +2019,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
* Return: clamped MSS value
*/
static uint16_t tcp_conn_tap_mss(const struct ctx *c,
- const struct tcp_conn *conn,
+ const struct tcp_tap_conn *conn,
const char *opts, size_t optlen)
{
unsigned int mss;
@@ -2172,7 +2066,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
.sin6_addr = *(struct in6_addr *)addr,
};
const struct sockaddr *sa;
- struct tcp_conn *conn;
+ struct tcp_tap_conn *conn;
socklen_t sl;
int s, mss;
@@ -2280,7 +2174,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
*
* Return: 0 on success, negative error code from recv() on failure
*/
-static int tcp_sock_consume(struct tcp_conn *conn, uint32_t ack_seq)
+static int tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
{
/* Simply ignore out-of-order ACKs: we already consumed the data we
* needed from the buffer, and we won't rewind back to a lower ACK
@@ -2307,7 +2201,7 @@ static int tcp_sock_consume(struct tcp_conn *conn, uint32_t ack_seq)
* @seq: Sequence number to be sent
* @now: Current timestamp
*/
-static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn,
+static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
ssize_t plen, int no_csum, uint32_t seq)
{
struct iovec *iov;
@@ -2344,7 +2238,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn,
*
* #syscalls recvmsg
*/
-static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn)
+static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
@@ -2475,7 +2369,7 @@ zero_len:
*
* #syscalls sendmsg
*/
-static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
+static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
const struct pool *p)
{
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
@@ -2675,7 +2569,7 @@ out:
* @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
*/
-static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
+static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
@@ -2714,7 +2608,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
int tcp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now)
{
- struct tcp_conn *conn;
+ struct tcp_tap_conn *conn;
size_t optlen, len;
struct tcphdr *th;
int ack_due = 0;
@@ -2829,7 +2723,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *addr,
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn)
+static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
{
socklen_t sl;
int so;
@@ -2857,7 +2751,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
struct sockaddr_storage sa;
- struct tcp_conn *conn;
+ struct tcp_tap_conn *conn;
socklen_t sl;
int s;
@@ -2949,7 +2843,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
*/
static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
{
- struct tcp_conn *conn = conn_at_idx(ref.r.p.tcp.tcp.index);
+ struct tcp_tap_conn *conn = conn_at_idx(ref.r.p.tcp.tcp.index);
struct itimerspec check_armed = { { 0 }, { 0 } };
if (!conn)
@@ -3012,7 +2906,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now)
{
- struct tcp_conn *conn;
+ struct tcp_tap_conn *conn;
if (ref.r.p.tcp.tcp.timer) {
tcp_timer_handler(c, ref);
@@ -3510,7 +3404,7 @@ static int tcp_port_rebind(void *arg)
void tcp_timer(struct ctx *c, const struct timespec *ts)
{
struct tcp_sock_refill_arg refill_arg = { c, 0 };
- struct tcp_conn *conn;
+ struct tcp_tap_conn *conn;
(void)ts;
diff --git a/tcp_conn.h b/tcp_conn.h
new file mode 100644
index 0000000..db4c2d9
--- /dev/null
+++ b/tcp_conn.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: AGPL-3.0-or-later
+ * Copyright Red Hat
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ *
+ * TCP connection tracking data structures, used by tcp.c and
+ * tcp_splice.c. Shouldn't be included in non-TCP code.
+ */
+#ifndef TCP_CONN_H
+#define TCP_CONN_H
+
+#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1)
+
+/**
+ * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
+ * @next_index: Connection index of next item in hash chain, -1 for none
+ * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
+ * @sock: Socket descriptor number
+ * @events: Connection events, implying connection states
+ * @timer: timerfd descriptor for timeout events
+ * @flags: Connection flags representing internal attributes
+ * @hash_bucket: Bucket index in connection lookup hash table
+ * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
+ * @ws_from_tap: Window scaling factor advertised from tap/guest
+ * @ws_to_tap: Window scaling factor advertised to tap/guest
+ * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
+ * @seq_dup_ack_approx: Last duplicate ACK number sent to tap
+ * @a.a6: IPv6 remote address, can be IPv4-mapped
+ * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20
+ * @a.a4.one: Ones prefix for IPv4-mapped
+ * @a.a4.a: IPv4 address
+ * @tap_port: Guest-facing tap port
+ * @sock_port: Remote, socket-facing port
+ * @wnd_from_tap: Last window size from tap, unscaled (as received)
+ * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
+ * @seq_to_tap: Next sequence for packets to tap
+ * @seq_ack_from_tap: Last ACK number received from tap
+ * @seq_from_tap: Next sequence for packets from tap (not actually sent)
+ * @seq_ack_to_tap: Last ACK number sent to tap
+ * @seq_init_from_tap: Initial sequence number from tap
+ */
+struct tcp_tap_conn {
+ int next_index :TCP_CONN_INDEX_BITS + 2;
+
+#define TCP_RETRANS_BITS 3
+ unsigned int retrans :TCP_RETRANS_BITS;
+#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1)
+
+#define TCP_WS_BITS 4 /* RFC 7323 */
+#define TCP_WS_MAX 14
+ unsigned int ws_from_tap :TCP_WS_BITS;
+ unsigned int ws_to_tap :TCP_WS_BITS;
+
+
+ int sock :SOCKET_REF_BITS;
+
+ uint8_t events;
+#define CLOSED 0
+#define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */
+#define TAP_SYN_RCVD BIT(1) /* implies socket connecting */
+#define TAP_SYN_ACK_SENT BIT( 3) /* implies socket connected */
+#define ESTABLISHED BIT(2)
+#define SOCK_FIN_RCVD BIT( 3)
+#define SOCK_FIN_SENT BIT( 4)
+#define TAP_FIN_RCVD BIT( 5)
+#define TAP_FIN_SENT BIT( 6)
+#define TAP_FIN_ACKED BIT( 7)
+
+#define CONN_STATE_BITS /* Setting these clears other flags */ \
+ (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
+
+
+ int timer :SOCKET_REF_BITS;
+
+ uint8_t flags;
+#define STALLED BIT(0)
+#define LOCAL BIT(1)
+#define WND_CLAMPED BIT(2)
+#define IN_EPOLL BIT(3)
+#define ACTIVE_CLOSE BIT(4)
+#define ACK_TO_TAP_DUE BIT(5)
+#define ACK_FROM_TAP_DUE BIT(6)
+
+
+ unsigned int hash_bucket :TCP_HASH_BUCKET_BITS;
+
+#define TCP_MSS_BITS 14
+ unsigned int tap_mss :TCP_MSS_BITS;
+#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
+#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
+
+
+#define SNDBUF_BITS 24
+ unsigned int sndbuf :SNDBUF_BITS;
+#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
+#define SNDBUF_GET(conn) (conn->sndbuf << (32 - SNDBUF_BITS))
+
+ uint8_t seq_dup_ack_approx;
+
+
+ union {
+ struct in6_addr a6;
+ struct {
+ uint8_t zero[10];
+ uint8_t one[2];
+ struct in_addr a;
+ } a4;
+ } a;
+
+ in_port_t tap_port;
+ in_port_t sock_port;
+
+ uint16_t wnd_from_tap;
+ uint16_t wnd_to_tap;
+
+ uint32_t seq_to_tap;
+ uint32_t seq_ack_from_tap;
+ uint32_t seq_from_tap;
+ uint32_t seq_ack_to_tap;
+ uint32_t seq_init_from_tap;
+};
+
+/**
+ * struct tcp_splice_conn - Descriptor for a spliced TCP connection
+ * @a: File descriptor number of socket for accepted connection
+ * @pipe_a_b: Pipe ends for splice() from @a to @b
+ * @b: File descriptor number of peer connected socket
+ * @pipe_b_a: Pipe ends for splice() from @b to @a
+ * @events: Events observed/actions performed on connection
+ * @flags: Connection flags (attributes, not events)
+ * @a_read: Bytes read from @a (not fully written to @b in one shot)
+ * @a_written: Bytes written to @a (not fully written from one @b read)
+ * @b_read: Bytes read from @b (not fully written to @a in one shot)
+ * @b_written: Bytes written to @b (not fully written from one @a read)
+*/
+struct tcp_splice_conn {
+ int a;
+ int pipe_a_b[2];
+ int b;
+ int pipe_b_a[2];
+
+ uint8_t events;
+#define SPLICE_CLOSED 0
+#define SPLICE_CONNECT BIT(0)
+#define SPLICE_ESTABLISHED BIT(1)
+#define A_OUT_WAIT BIT(2)
+#define B_OUT_WAIT BIT(3)
+#define A_FIN_RCVD BIT(4)
+#define B_FIN_RCVD BIT(5)
+#define A_FIN_SENT BIT(6)
+#define B_FIN_SENT BIT(7)
+
+ uint8_t flags;
+#define SPLICE_V6 BIT(0)
+#define SPLICE_IN_EPOLL BIT(1)
+#define RCVLOWAT_SET_A BIT(2)
+#define RCVLOWAT_SET_B BIT(3)
+#define RCVLOWAT_ACT_A BIT(4)
+#define RCVLOWAT_ACT_B BIT(5)
+#define CLOSING BIT(6)
+
+ uint32_t a_read;
+ uint32_t a_written;
+ uint32_t b_read;
+ uint32_t b_written;
+};
+
+#endif /* TCP_CONN_H */
diff --git a/tcp_splice.c b/tcp_splice.c
index 4cc4ad2..cbfab01 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -21,12 +21,12 @@
*
* - SPLICE_CONNECT: connection accepted, connecting to target
* - SPLICE_ESTABLISHED: connection to target established
- * - SPLICE_A_OUT_WAIT: pipe to accepted socket full, wait for EPOLLOUT
- * - SPLICE_B_OUT_WAIT: pipe to target socket full, wait for EPOLLOUT
- * - SPLICE_A_FIN_RCVD: FIN (EPOLLRDHUP) seen from accepted socket
- * - SPLICE_B_FIN_RCVD: FIN (EPOLLRDHUP) seen from target socket
- * - SPLICE_A_FIN_RCVD: FIN (write shutdown) sent to accepted socket
- * - SPLICE_B_FIN_RCVD: FIN (write shutdown) sent to target socket
+ * - A_OUT_WAIT: pipe to accepted socket full, wait for EPOLLOUT
+ * - B_OUT_WAIT: pipe to target socket full, wait for EPOLLOUT
+ * - A_FIN_RCVD: FIN (EPOLLRDHUP) seen from accepted socket
+ * - B_FIN_RCVD: FIN (EPOLLRDHUP) seen from target socket
+ * - A_FIN_RCVD: FIN (write shutdown) sent to accepted socket
+ * - B_FIN_RCVD: FIN (write shutdown) sent to target socket
*
* #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
*/
@@ -52,6 +52,8 @@
#include "log.h"
#include "tcp_splice.h"
+#include "tcp_conn.h"
+
#define MAX_PIPE_SIZE (8UL * 1024 * 1024)
#define TCP_SPLICE_PIPE_POOL_SIZE 16
#define TCP_SPLICE_CONN_PRESSURE 30 /* % of splice_conn_count */
@@ -66,52 +68,7 @@ extern int ns_sock_pool6 [TCP_SOCK_POOL_SIZE];
/* Pool of pre-opened pipes */
static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2];
-/**
- * struct tcp_splice_conn - Descriptor for a spliced TCP connection
- * @a: File descriptor number of socket for accepted connection
- * @pipe_a_b: Pipe ends for splice() from @a to @b
- * @b: File descriptor number of peer connected socket
- * @pipe_b_a: Pipe ends for splice() from @b to @a
- * @events: Events observed/actions performed on connection
- * @flags: Connection flags (attributes, not events)
- * @a_read: Bytes read from @a (not fully written to @b in one shot)
- * @a_written: Bytes written to @a (not fully written from one @b read)
- * @b_read: Bytes read from @b (not fully written to @a in one shot)
- * @b_written: Bytes written to @b (not fully written from one @a read)
-*/
-struct tcp_splice_conn {
- int a;
- int pipe_a_b[2];
- int b;
- int pipe_b_a[2];
-
- uint8_t events;
-#define CLOSED 0
-#define CONNECT BIT(0)
-#define ESTABLISHED BIT(1)
-#define A_OUT_WAIT BIT(2)
-#define B_OUT_WAIT BIT(3)
-#define A_FIN_RCVD BIT(4)
-#define B_FIN_RCVD BIT(5)
-#define A_FIN_SENT BIT(6)
-#define B_FIN_SENT BIT(7)
-
- uint8_t flags;
-#define SOCK_V6 BIT(0)
-#define IN_EPOLL BIT(1)
-#define RCVLOWAT_SET_A BIT(2)
-#define RCVLOWAT_SET_B BIT(3)
-#define RCVLOWAT_ACT_A BIT(4)
-#define RCVLOWAT_ACT_B BIT(5)
-#define CLOSING BIT(6)
-
- uint32_t a_read;
- uint32_t a_written;
- uint32_t b_read;
- uint32_t b_written;
-};
-
-#define CONN_V6(x) (x->flags & SOCK_V6)
+#define CONN_V6(x) (x->flags & SPLICE_V6)
#define CONN_V4(x) (!CONN_V6(x))
#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
#define CONN(index) (tc_splice + (index))
@@ -122,13 +79,13 @@ static struct tcp_splice_conn tc_splice[TCP_SPLICE_MAX_CONNS];
/* Display strings for connection events */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
- "CONNECT", "ESTABLISHED", "A_OUT_WAIT", "B_OUT_WAIT",
+ "SPLICE_CONNECT", "SPLICE_ESTABLISHED", "A_OUT_WAIT", "B_OUT_WAIT",
"A_FIN_RCVD", "B_FIN_RCVD", "A_FIN_SENT", "B_FIN_SENT",
};
/* Display strings for connection flags */
static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
- "SOCK_V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B",
+ "SPLICE_V6", "SPLICE_IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B",
"RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING",
};
@@ -143,12 +100,12 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
{
*a = *b = 0;
- if (events & ESTABLISHED) {
+ if (events & SPLICE_ESTABLISHED) {
if (!(events & B_FIN_SENT))
*a = EPOLLIN | EPOLLRDHUP;
if (!(events & A_FIN_SENT))
*b = EPOLLIN | EPOLLRDHUP;
- } else if (events & CONNECT) {
+ } else if (events & SPLICE_CONNECT) {
*b = EPOLLOUT;
}
@@ -210,7 +167,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
static int tcp_splice_epoll_ctl(const struct ctx *c,
struct tcp_splice_conn *conn)
{
- int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int m = (conn->flags & SPLICE_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
.r.p.tcp.tcp.splice = 1,
.r.p.tcp.tcp.index = CONN_IDX(conn),
@@ -234,7 +191,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
epoll_ctl(c->epollfd, m, conn->b, &ev_b))
goto delete;
- conn->flags |= IN_EPOLL; /* No need to log this */
+ conn->flags |= SPLICE_IN_EPOLL; /* No need to log this */
return 0;
@@ -323,7 +280,7 @@ static void tcp_table_splice_compact(struct ctx *c,
*/
static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
{
- if (conn->events & ESTABLISHED) {
+ if (conn->events & SPLICE_ESTABLISHED) {
/* Flushing might need to block: don't recycle them. */
if (conn->pipe_a_b[0] != -1) {
close(conn->pipe_a_b[0]);
@@ -337,7 +294,7 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
}
}
- if (conn->events & CONNECT) {
+ if (conn->events & SPLICE_CONNECT) {
close(conn->b);
conn->b = -1;
}
@@ -346,7 +303,7 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
conn->a = -1;
conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0;
- conn->events = CLOSED;
+ conn->events = SPLICE_CLOSED;
conn->flags = 0;
debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));
@@ -397,8 +354,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
}
}
- if (!(conn->events & ESTABLISHED))
- conn_event(c, conn, ESTABLISHED);
+ if (!(conn->events & SPLICE_ESTABLISHED))
+ conn_event(c, conn, SPLICE_ESTABLISHED);
return 0;
}
@@ -466,9 +423,9 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
close(sock_conn);
return ret;
}
- conn_event(c, conn, CONNECT);
+ conn_event(c, conn, SPLICE_CONNECT);
} else {
- conn_event(c, conn, ESTABLISHED);
+ conn_event(c, conn, SPLICE_ESTABLISHED);
return tcp_splice_connect_finish(c, conn);
}
@@ -598,7 +555,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
conn = CONN(c->tcp.splice_conn_count++);
conn->a = s;
- conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0;
+ conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0;
if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index,
ref.r.p.tcp.tcp.outbound))
@@ -609,13 +566,13 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
conn = CONN(ref.r.p.tcp.tcp.index);
- if (conn->events == CLOSED)
+ if (conn->events == SPLICE_CLOSED)
return;
if (events & EPOLLERR)
goto close;
- if (conn->events == CONNECT) {
+ if (conn->events == SPLICE_CONNECT) {
if (!(events & EPOLLOUT))
goto close;
if (tcp_splice_connect_finish(c, conn))