// SPDX-License-Identifier: GPL-2.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* tcp.c - TCP L2-L4 translation state machine
*
* Copyright (c) 2020-2022 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
/**
* DOC: Theory of Operation
*
*
* PASST mode
* ==========
*
* This implementation maps TCP traffic between a single L2 interface (tap) and
* native TCP (L4) sockets, mimicking and reproducing as closely as possible the
* inferred behaviour of applications running on a guest, connected via said L2
* interface. Four connection flows are supported:
* - from the local host to the guest behind the tap interface:
* - this is the main use case for proxies in service meshes
* - we bind to configured local ports, and relay traffic between L4 sockets
* with local endpoints and the L2 interface
* - from remote hosts to the guest behind the tap interface:
* - this might be needed for services that need to be addressed directly,
* and typically configured with special port forwarding rules (which are
* not needed here)
* - we also relay traffic between L4 sockets with remote endpoints and the L2
* interface
* - from the guest to the local host:
* - this is not observed in practice, but implemented for completeness and
* transparency
* - from the guest to external hosts:
* - this might be needed for applications running on the guest that need to
* directly access internet services (e.g. NTP)
*
* Relevant goals are:
* - transparency: sockets need to behave as if guest applications were running
* directly on the host. This is achieved by:
* - avoiding port and address translations whenever possible
* - mirroring TCP dynamics by observation of socket parameters (TCP_INFO
* socket option) and TCP headers of packets coming from the tap interface,
* reapplying those parameters in both flow directions (including TCP_MSS
* socket option)
* - simplicity: only a small subset of TCP logic is implemented here and
* delegated as much as possible to the TCP implementations of guest and host
* kernel. This is achieved by:
* - avoiding a complete TCP stack reimplementation, with a modified TCP state
* machine focused on the translation of observed events instead
* - mirroring TCP dynamics as described above and hence avoiding the need for
* segmentation, explicit queueing, and reassembly of segments
* - security:
* - no dynamic memory allocation is performed
* - TODO: synflood protection
*
* Portability is limited by usage of Linux-specific socket options.
*
*
* Limits
* ------
*
* To avoid the need for dynamic memory allocation, a maximum, reasonable amount
* of connections is defined by TCP_MAX_CONNS (currently 128k).
*
* Data needs to linger on sockets as long as it's not acknowledged by the
* guest, and is read using MSG_PEEK into preallocated static buffers sized
* to the maximum supported window, 16 MiB ("discard" buffer, for already-sent
* data) plus a number of maximum-MSS-sized buffers. This imposes a practical
* limitation on window scaling, that is, the maximum factor is 256. Larger
* factors will be accepted, but resulting, larger values are never advertised
* to the other side, and not used while queueing data.
*
*
* Ports
* -----
*
* To avoid the need for ad-hoc configuration of port forwarding or allowed
* ports, listening sockets can be opened and bound to all unbound ports on the
* host, as far as process capabilities allow. This service needs to be started
* after any application proxy that needs to bind to local ports. Mapped ports
* can also be configured explicitly.
*
* No port translation is needed for connections initiated remotely or by the
* local host: source port from socket is reused while establishing connections
* to the guest.
*
* For connections initiated by the guest, it's not possible to force the same
* source port as connections are established by the host kernel: that's the
* only port translation needed.
*
*
* Connection tracking and storage
* -------------------------------
*
* Connections are tracked by struct tcp_tap_conn entries in the @tc
* array, containing addresses, ports, TCP states and parameters. This
* is statically allocated and indexed by an arbitrary connection
* number. The array is compacted whenever a connection is closed, by
* remapping the highest connection index in use to the one freed up.
*
* References used for the epoll interface report the connection index used for
* the @tc array.
*
* IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for
* separate data structures depending on the protocol version.
*
* - Inbound connection requests (to the guest) are mapped using the triple
* < source IP address, source port, destination port >
* - Outbound connection requests (from the guest) are mapped using the triple
* < destination IP address, destination port, source port >
* where the source port is the one used by the guest, not the one used by the
* corresponding host socket
*
*
* Initialisation
* --------------
*
* Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for
* IPv4 and IPv6) can be opened and bound to wildcard addresses. Some will fail
* to bind (for low ports, or ports already bound, e.g. by a proxy). These are
* added to the epoll list, with no separate storage.
*
*
* Events and states
* -----------------
*
* Instead of tracking connection states using a state machine, connection
* events are used to determine state and actions for a given connection. This
* makes the implementation simpler as most of the relevant tasks deal with
* reactions to events, rather than state-associated actions. For user
* convenience, approximate states are mapped in logs from events by
* @tcp_state_str.
*
* The events are:
*
* - SOCK_ACCEPTED connection accepted from socket, SYN sent to tap/guest
*
* - TAP_SYN_RCVD tap/guest initiated connection, SYN received
*
* - TAP_SYN_ACK_SENT SYN, ACK sent to tap/guest, valid for TAP_SYN_RCVD only
*
* - ESTABLISHED connection established, the following events are valid:
*
* - SOCK_FIN_RCVD FIN (EPOLLRDHUP) received from socket
*
* - SOCK_FIN_SENT FIN (write shutdown) sent to socket
*
* - TAP_FIN_RCVD FIN received from tap/guest
*
* - TAP_FIN_SENT FIN sent to tap/guest
*
* - TAP_FIN_ACKED ACK to FIN seen from tap/guest
*
* Setting any event in CONN_STATE_BITS (SOCK_ACCEPTED, TAP_SYN_RCVD,
* ESTABLISHED) clears all the other events, as those represent the fundamental
* connection states. No events (events == CLOSED) means the connection is
* closed.
*
* Connection setup
* ----------------
*
* - inbound connection (from socket to guest): on accept() from listening
* socket, the new socket is mapped in connection tracking table, and
* three-way handshake initiated towards the guest, advertising MSS and window
* size and scaling from socket parameters
* - outbound connection (from guest to socket): on SYN segment from guest, a
* new socket is created and mapped in connection tracking table, setting
* MSS and window clamping from header and option of the observed SYN segment
*
*
* Aging and timeout
* -----------------
*
* Timeouts are implemented by means of timerfd timers, set based on flags:
*
* - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag
* ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the
* connection
*
* - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
* data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
* socket and reset sequence to what was acknowledged. If this persists for
* more than TCP_MAX_RETRANS times in a row, reset the connection
*
* - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
* with TAP_FIN_SENT event), and no ACK is received within this time, reset
* the connection
*
* - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN
* segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and
* TAP_FIN_ACKED), but no socket activity is detected from the socket within
* this time, reset the connection
*
* - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
* either side, the connection is reset
*
* - ACK_INTERVAL elapsed after data segment received from tap without having
* sent an ACK segment, or zero-sized window advertised to tap/guest (flag
* ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
*
*
* Summary of data flows (with ESTABLISHED event)
* ----------------------------------------------
*
* @seq_to_tap: next sequence for packets to tap/guest
* @seq_ack_from_tap: last ACK number received from tap/guest
* @seq_from_tap: next sequence for packets from tap/guest (expected)
* @seq_ack_to_tap: last ACK number sent to tap/guest
*
* @seq_init_from_tap: initial sequence number from tap/guest
* @seq_init_to_tap: initial sequence number to tap/guest
*
* @wnd_from_tap: last window size received from tap, never scaled
* @wnd_to_tap: last window size advertised to tap, never scaled
*
* - from socket to tap/guest:
* - on new data from socket:
* - peek into buffer
* - send data to tap/guest:
* - starting at offset (@seq_to_tap - @seq_ack_from_tap)
* - in MSS-sized segments
* - increasing @seq_to_tap at each segment
* - up to window (until @seq_to_tap - @seq_ack_from_tap <= @wnd_from_tap)
* - on read error, send RST to tap/guest, close socket
* - on zero read, send FIN to tap/guest, set TAP_FIN_SENT
* - on ACK from tap/guest:
* - set @ts_ack_from_tap
* - check if it's the second duplicated ACK
* - consume buffer by difference between new ack_seq and @seq_ack_from_tap
* - update @seq_ack_from_tap from ack_seq in header
* - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and
* resend with steps listed above
*
* - from tap/guest to socket:
* - on packet from tap/guest:
* - set @ts_tap_act
* - check seq from header against @seq_from_tap, if data is missing, send
* two ACKs with number @seq_ack_to_tap, discard packet
* - otherwise queue data to socket, set @seq_from_tap to seq from header
* plus payload length
* - in ESTABLISHED state, send ACK to tap as soon as we queue to the
* socket. In other states, query socket for TCP_INFO, set
* @seq_ack_to_tap to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
* send ACK to tap/guest
*
*
* PASTA mode
* ==========
*
* For traffic directed to TCP ports configured for mapping to the tuntap device
* in the namespace, and for non-local traffic coming from the tuntap device,
* the implementation is identical to the PASST mode described in the previous
* section.
*
* For local traffic directed to TCP ports configured for direct mapping between
* namespaces, see the implementation in tcp_splice.c.
*/
#include <sched.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <sys/timerfd.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <time.h>
#include <arpa/inet.h>
#include <linux/tcp.h> /* For struct tcp_info */
#include "checksum.h"
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "tap.h"
#include "siphash.h"
#include "pcap.h"
#include "tcp_splice.h"
#include "log.h"
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
/* Sides of a flow as we use them in "tap" connections */
#define SOCKSIDE 0
#define TAPSIDE 1
#define TCP_FRAMES_MEM 128
#define TCP_FRAMES \
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
#define MAX_WS 8
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
/**
 * struct tcp4_l2_head - Headers preceding the TCP payload in IPv4 buffers
 * @pad:	Leading padding: 26 bytes with AVX2, 2 bytes otherwise
 * @taph:	Tap-level headers
 * @iph:	IPv4 header
 * @th:		TCP header
 *
 * The size of this structure is what the MSS4 macro subtracts from USHRT_MAX.
 * Padding and alignment attributes mirror those of the actual buffers.
 */
struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
#ifdef __AVX2__
uint8_t pad[26];
#else
uint8_t pad[2];
#endif
struct tap_hdr taph;
struct iphdr iph;
struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/**
 * struct tcp6_l2_head - Headers preceding the TCP payload in IPv6 buffers
 * @pad:	Leading padding: 14 bytes with AVX2, 2 bytes otherwise
 * @taph:	Tap-level headers
 * @ip6h:	IPv6 header
 * @th:		TCP header
 *
 * The size of this structure is what the MSS6 macro subtracts from USHRT_MAX.
 * Padding and alignment attributes mirror those of the actual buffers.
 */
struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#ifdef __AVX2__
uint8_t pad[14];
#else
uint8_t pad[2];
#endif
struct tap_hdr taph;
struct ipv6hdr ip6h;
struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/* Maximum payload sizes: what is left of USHRT_MAX bytes once the headers
 * described by struct tcp4_l2_head and struct tcp6_l2_head are accounted for,
 * passed through ROUND_DOWN(..., 4)
 */
#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
/* KERNEL_REPORTS_SND_WND() - Context flag telling if the kernel reports the
 * sending window, constant zero if the build lacks HAS_SND_WND.
 *
 * The argument is parenthesised in both variants so that any pointer
 * expression (not just a plain identifier) can be passed safely.
 */
#ifdef HAS_SND_WND
# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd)
#else
# define KERNEL_REPORTS_SND_WND(c) (0 && (c))
#endif
/* Timer values, see "Aging and timeout" in the theory of operation. In
 * tcp_timer_ctl(), ACK_INTERVAL is converted from milliseconds to
 * nanoseconds, and the *_TIMEOUT values are assigned to tv_sec, hence seconds.
 */
#define ACK_INTERVAL 10 /* ms */
#define SYN_TIMEOUT 10 /* s */
#define ACK_TIMEOUT 2
#define FIN_TIMEOUT 60
#define ACT_TIMEOUT 7200
/* Number of entries in the low_rtt_dst table */
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
 * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
 */
#define SOL_TCP IPPROTO_TCP
/* Sequence number comparisons: each one holds if the difference between the
 * operands (minus one for the strict variants, so that equal operands wrap
 * around and fail the check) is smaller than MAX_WINDOW.
 *
 * NOTE(review): this assumes 32-bit unsigned operands, so that the
 * subtraction wraps modulo 2^32 -- confirm against callers.
 */
#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
/* TCP flags, using the bit positions of the flags field in the TCP header */
#define FIN (1 << 0)
#define SYN (1 << 1)
#define RST (1 << 2)
#define ACK (1 << 4)
/* Flags for internal usage */
#define DUP_ACK (1 << 5)
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
/* TCP option kinds, and lengths in bytes (kind and length fields included)
 * for the MSS and window scaling options
 */
#define OPT_EOL 0
#define OPT_NOP 1
#define OPT_MSS 2
#define OPT_MSS_LEN 4
#define OPT_WS 3
#define OPT_WS_LEN 3
#define OPT_SACKP 4
#define OPT_SACK 5
#define OPT_TS 8
/* Connection helpers. The conn argument is always parenthesised in the
 * expansion, so that expressions such as "conns + i" or "&table[i]" can be
 * passed, not only plain identifiers.
 *
 * CONN_V4():		non-zero if inany_v4() returns an address for @faddr
 * CONN_V6():		negation of CONN_V4()
 * CONN_IS_CLOSING():	ESTABLISHED is set together with SOCK_FIN_RCVD or
 *			TAP_FIN_RCVD
 * CONN_HAS():		all the events in @set are set for the connection
 */
#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
#define CONN_V6(conn)		(!CONN_V4(conn))
#define CONN_IS_CLOSING(conn)						\
	(((conn)->events & ESTABLISHED) &&				\
	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
#define CONN_HAS(conn, set)	(((conn)->events & (set)) == (set))
/* Event names for logging, indexed as computed by conn_event_do() */
static const char *tcp_event_str[] __attribute((__unused__)) = {
	[0] = "SOCK_ACCEPTED",
	[1] = "TAP_SYN_RCVD",
	[2] = "ESTABLISHED",
	[3] = "TAP_SYN_ACK_SENT",
	[4] = "SOCK_FIN_RCVD",
	[5] = "SOCK_FIN_SENT",
	[6] = "TAP_FIN_RCVD",
	[7] = "TAP_FIN_SENT",
	[8] = "TAP_FIN_ACKED",
};

/* Approximate state names for logging, indexed as computed by conn_event_do():
 * entries from 9 on are reached by adding 5 to the index for active close
 */
static const char *tcp_state_str[] __attribute((__unused__)) = {
	[0]  = "SYN_RCVD",
	[1]  = "SYN_SENT",
	[2]  = "ESTABLISHED",
	[3]  = "SYN_RCVD",	/* approximately maps to TAP_SYN_ACK_SENT */

	/* Passive close: */
	[4]  = "CLOSE_WAIT",
	[5]  = "CLOSE_WAIT",
	[6]  = "LAST_ACK",
	[7]  = "LAST_ACK",
	[8]  = "LAST_ACK",

	/* Active close (+5): */
	[9]  = "CLOSING",
	[10] = "FIN_WAIT_1",
	[11] = "FIN_WAIT_1",
	[12] = "FIN_WAIT_2",
	[13] = "TIME_WAIT",
};

/* Flag names for logging, indexed by flag bit as done in conn_flag_do() */
static const char *tcp_flag_str[] __attribute((__unused__)) = {
	[0] = "STALLED",
	[1] = "LOCAL",
	[2] = "ACTIVE_CLOSE",
	[3] = "ACK_TO_TAP_DUE",
	[4] = "ACK_FROM_TAP_DUE",
};
/* Listening sockets, used for automatic port forwarding in pasta mode only.
 * Both tables have one entry per port and IP version.
 */
static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
/* Table of guest side forwarding addresses with very low RTT (assumed
 * to be local to the host), LRU. Looked up by tcp_rtt_dst_low().
 */
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
/**
 * struct tcp_buf_seq_update - Sequences to update with length of frames once
 *			       sent
 * @seq:	Pointer to sequence number sent to tap-side, to be updated
 * @len:	TCP payload length
 */
struct tcp_buf_seq_update {
uint32_t *seq;
uint16_t len;
};
/* Static buffers */
/**
 * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
 * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
 * @taph:	Tap-level headers (partially pre-filled)
 * @iph:	Pre-filled IP header (except for tot_len and saddr)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload
 *
 * Trailing comments on members give byte offsets: first with AVX2, then
 * without. The last pair is the total size of the structure.
 */
static struct tcp4_l2_buf_t {
#ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */
#else
uint8_t pad[2]; /* align iph to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */
struct tcphdr th; /* 64 40 */
uint8_t data[MSS4]; /* 84 60 */
/* 65536 65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_FRAMES_MEM];
/* Sequence update descriptors, same number of entries as tcp4_l2_buf */
static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
/* NOTE(review): presumably the count of tcp4_l2_buf entries currently filled,
 * confirm against users
 */
static unsigned int tcp4_l2_buf_used;
/**
 * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @taph:	Tap-level headers (partially pre-filled)
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload
 *
 * Trailing comments on members give byte offsets: first with AVX2, then
 * without. The last pair is the total size of the structure.
 *
 * The array has internal linkage, like its IPv4 counterpart and the buffers
 * for flags: it's private to this file.
 */
static struct tcp6_l2_buf_t {
#ifdef __AVX2__
	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
#else
	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
#endif
	struct tap_hdr taph;	/* 14				2 */
	struct ipv6hdr ip6h;	/* 32				20 */
	struct tcphdr th;	/* 72				60 */
	uint8_t data[MSS6];	/* 92				80 */
				/* 65536			65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_FRAMES_MEM];

/* Sequence update descriptors, same number of entries as tcp6_l2_buf */
static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];

/* NOTE(review): presumably the count of tcp6_l2_buf entries currently filled,
 * confirm against users
 */
static unsigned int tcp6_l2_buf_used;
/* recvmsg()/sendmsg() data for tap */
/* Buffer of the maximum window size; the theory of operation describes it as
 * the "discard" buffer for already-sent data
 */
static char tcp_buf_discard [MAX_WINDOW];
/* NOTE(review): one entry more than the frame buffers, presumably for the
 * discard buffer -- confirm against the code filling it
 */
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
/* One I/O vector entry per data frame buffer and per flags frame buffer */
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM];
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM];
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM];
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM];
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
/**
 * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
 * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
 * @taph:	Tap-level headers (partially pre-filled)
 * @iph:	Pre-filled IP header (except for tot_len and saddr)
 * @th:		Headroom for TCP header
 * @opts:	Headroom for TCP options: size of the MSS option, plus size of
 *		the window scaling option, plus one byte
 *
 * Trailing comments on members give byte offsets: first with AVX2, then
 * without.
 */
static struct tcp4_l2_flags_buf_t {
#ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */
#else
uint8_t pad[2]; /* align iph to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */
struct tcphdr th; /* 64 40 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_flags_buf[TCP_FRAMES_MEM];
/* NOTE(review): presumably the count of tcp4_l2_flags_buf entries currently
 * filled, confirm against users
 */
static unsigned int tcp4_l2_flags_buf_used;
/**
 * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @taph:	Tap-level headers (partially pre-filled)
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header, explicitly aligned to 4 bytes
 * @opts:	Headroom for TCP options: size of the MSS option, plus size of
 *		the window scaling option, plus one byte
 *
 * Trailing comments on members give byte offsets: first with AVX2, then
 * without.
 */
static struct tcp6_l2_flags_buf_t {
#ifdef __AVX2__
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
#else
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 14 2 */
struct ipv6hdr ip6h; /* 32 20 */
struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_flags_buf[TCP_FRAMES_MEM];
/* NOTE(review): presumably the count of tcp6_l2_flags_buf entries currently
 * filled, confirm against users
 */
static unsigned int tcp6_l2_flags_buf_used;
/* CONN() - Pointer to the tcp member of the flow entry given by FLOW(idx) */
#define CONN(idx) (&(FLOW(idx)->tcp))
/* Table for lookup from remote address, local port, remote port */
static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE];
/* Checked at build time: the hash table has at least FLOW_MAX entries */
static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
"Safe linear probing requires hash table larger than connection table");
/* Pools for pre-opened sockets (in init). These are not static: they have
 * external linkage.
 */
int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
/**
 * tcp_conn_epoll_events() - epoll events mask for given connection state
 * @events:	Current connection events
 * @conn_flags:	Connection flags
 *
 * Return: epoll events mask corresponding to implied connection state
 */
static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
{
	uint32_t mask;

	if (events == 0) {
		/* No events at all: nothing to watch */
		mask = 0;
	} else if (!(events & ESTABLISHED)) {
		/* Handshake: always watch for hangup, and, if TAP_SYN_RCVD is
		 * the only event set, also for writability, edge-triggered
		 */
		mask = EPOLLRDHUP;
		if (events == TAP_SYN_RCVD)
			mask |= EPOLLOUT | EPOLLET;
	} else if (events & TAP_FIN_SENT) {
		/* FIN already sent to tap: no further input wanted */
		mask = EPOLLET;
	} else {
		/* Established: input and hangup, plus edge-triggered
		 * writability while the connection is stalled
		 */
		mask = EPOLLIN | EPOLLRDHUP;
		if (conn_flags & STALLED)
			mask |= EPOLLOUT | EPOLLET;
	}

	return mask;
}
/* Forward declaration: the definition follows tcp_timer_ctl() below */
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag);
/* conn_flag() - Trace calling function and line, then call conn_flag_do().
 * Wrapped in do { } while (0) so that it can be used as a single statement.
 */
#define conn_flag(c, conn, flag) \
do { \
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
conn_flag_do(c, conn, flag); \
} while (0)
/**
* tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
* @c: Execution context
* @conn: Connection pointer
*
* Return: 0 on success, negative error code on failure (not on deletion)
*/
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
.flowside = FLOW_SIDX(conn, SOCKSIDE) };
struct epoll_event ev = { .data.u64 = ref.u64 };
if (conn->events == CLOSED) {
if (conn->in_epoll)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
if (conn->timer != -1)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
return 0;
}
ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
return -errno;
conn->in_epoll = true;
if (conn->timer != -1) {
union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
.fd = conn->sock,
.flow = FLOW_IDX(conn) };
struct epoll_event ev_t = { .data.u64 = ref_t.u64,
.events = EPOLLIN | EPOLLET };
if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
return -errno;
}
return 0;
}
/**
 * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Nothing is done for closed connections. If the connection has no timer yet,
 * one is created and added to the epoll list first; on failure, the connection
 * is left without a timer and the function returns.
 *
 * The timeout is picked in this order: ACK_INTERVAL if ACK_TO_TAP_DUE is set,
 * SYN_TIMEOUT or ACK_TIMEOUT if ACK_FROM_TAP_DUE is set (depending on whether
 * ESTABLISHED is set), FIN_TIMEOUT if both SOCK_FIN_SENT and TAP_FIN_ACKED are
 * set, ACT_TIMEOUT otherwise.
 *
 * #syscalls timerfd_create timerfd_settime
 */
static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
	struct itimerspec it = { { 0 }, { 0 } };

	if (conn->events == CLOSED)
		return;

	if (conn->timer == -1) {
		union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER,
					.fd = conn->sock,
					.flow = FLOW_IDX(conn) };
		struct epoll_event ev = { .data.u64 = ref.u64,
					  .events = EPOLLIN | EPOLLET };
		int fd;

		fd = timerfd_create(CLOCK_MONOTONIC, 0);
		if (fd == -1) {
			flow_dbg(conn, "failed to get timer: %s",
				 strerror(errno));
			return;
		}

		if (fd > FD_REF_MAX) {
			/* timerfd_create() succeeded, so errno doesn't
			 * describe this failure: don't report it
			 */
			flow_dbg(conn,
				 "failed to get timer: descriptor %i too high",
				 fd);
			close(fd);
			return;
		}

		conn->timer = fd;

		if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
			/* Log before close(), which might change errno */
			flow_dbg(conn, "failed to add timer: %s",
				 strerror(errno));
			close(conn->timer);
			conn->timer = -1;
			return;
		}
	}

	if (conn->flags & ACK_TO_TAP_DUE) {
		/* ACK_INTERVAL is in milliseconds */
		it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
	} else if (conn->flags & ACK_FROM_TAP_DUE) {
		if (!(conn->events & ESTABLISHED))
			it.it_value.tv_sec = SYN_TIMEOUT;
		else
			it.it_value.tv_sec = ACK_TIMEOUT;
	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
		it.it_value.tv_sec = FIN_TIMEOUT;
	} else {
		it.it_value.tv_sec = ACT_TIMEOUT;
	}

	flow_dbg(conn, "timer expires in %llu.%03llus",
		 (unsigned long long)it.it_value.tv_sec,
		 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);

	/* A timer we failed to arm would never fire: at least report it */
	if (timerfd_settime(conn->timer, 0, &it, NULL))
		flow_dbg(conn, "failed to set timer: %s", strerror(errno));
}
/**
 * conn_flag_do() - Set/unset given flag, log, update epoll on STALLED flag
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flag:	Flag to set, or ~flag to unset
 *
 * A value with more than one bit set is taken as the complement of a flag,
 * that is, as a request to clear it.
 */
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
			 unsigned long flag)
{
	int clearing = (flag & (flag - 1)) != 0;
	int name_idx = fls(clearing ? ~flag : flag);
	int rearm;

	if (clearing) {
		/* Flag not set to begin with: nothing to do */
		if ((conn->flags & ~flag) == 0)
			return;

		conn->flags &= flag;
		if (name_idx >= 0)
			flow_dbg(conn, "%s dropped", tcp_flag_str[name_idx]);
	} else if (conn->flags & flag) {
		/* Special case: setting ACK_FROM_TAP_DUE on a
		 * connection where it's already set is used to
		 * re-schedule the existing timer.
		 * TODO: define clearer semantics for timer-related
		 * flags and factor this into the logic below.
		 */
		if (flag == ACK_FROM_TAP_DUE)
			tcp_timer_ctl(c, conn);

		return;
	} else {
		conn->flags |= flag;
		if (name_idx >= 0)
			flow_dbg(conn, "%s", tcp_flag_str[name_idx]);
	}

	if (flag == STALLED || flag == ~STALLED)
		tcp_epoll_ctl(c, conn);

	/* The timer is set again whenever one of the two ACK-related flags is
	 * set, or when one is cleared while the other one is still pending
	 */
	if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE)
		rearm = 1;
	else if (flag == ~ACK_FROM_TAP_DUE)
		rearm = (conn->flags & ACK_TO_TAP_DUE) != 0;
	else if (flag == ~ACK_TO_TAP_DUE)
		rearm = (conn->flags & ACK_FROM_TAP_DUE) != 0;
	else
		rearm = 0;

	if (rearm)
		tcp_timer_ctl(c, conn);
}
/* Forward declaration: called by conn_event_do() below for the CLOSED event */
static void tcp_hash_remove(const struct ctx *c,
const struct tcp_tap_conn *conn);
/**
 * conn_event_do() - Set and log connection events, update epoll state
 * @c:		Execution context
 * @conn:	Connection pointer
 * @event:	Connection event
 *
 * Nothing is done if @event is already set. Otherwise, @event replaces all
 * the current events if it's CLOSED or one of CONN_STATE_BITS, and is added to
 * them in all the other cases. The transition is logged, then:
 * - on CLOSED, the connection is removed from the hash table
 * - on TAP_FIN_RCVD without SOCK_FIN_RCVD, the ACTIVE_CLOSE flag is set
 * - in all other cases, epoll state is updated
 * Finally, the timer is set again if both SOCK_FIN_SENT and TAP_FIN_ACKED are
 * set.
 */
static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long event)
{
/* prev, new: indices in tcp_state_str before and after the change, num: index
 * of the event in tcp_event_str. NOTE(review): the checks against -1 below
 * imply that fls() returns -1 if no bits are set -- confirm in its definition.
 */
int prev, new, num = fls(event);
if (conn->events & event)
return;
prev = fls(conn->events);
/* Names for active close are five entries further in tcp_state_str */
if (conn->flags & ACTIVE_CLOSE)
prev += 5;
if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED))
prev++; /* i.e. SOCK_FIN_RCVD, not TAP_SYN_ACK_SENT */
if (event == CLOSED || (event & CONN_STATE_BITS))
conn->events = event;
else
conn->events |= event;
new = fls(conn->events);
/* Same adjustment as for prev above, applied to the event name as well */
if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) {
num++;
new++;
}
if (conn->flags & ACTIVE_CLOSE)
new += 5;
/* Log the state transition only if the approximate state changes */
if (prev != new)
flow_dbg(conn, "%s: %s -> %s",
num == -1 ? "CLOSED" : tcp_event_str[num],
prev == -1 ? "CLOSED" : tcp_state_str[prev],
(new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]);
else
flow_dbg(conn, "%s",
num == -1 ? "CLOSED" : tcp_event_str[num]);
if (event == CLOSED)
tcp_hash_remove(c, conn);
/* FIN from tap with no FIN from the socket yet: mark as active close */
else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
conn_flag(c, conn, ACTIVE_CLOSE);
else
tcp_epoll_ctl(c, conn);
/* With both events set, tcp_timer_ctl() may select FIN_TIMEOUT */
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
tcp_timer_ctl(c, conn);
}
/* conn_event() - Trace calling function and line, then call conn_event_do().
 * Wrapped in do { } while (0) so that it can be used as a single statement.
 */
#define conn_event(c, conn, event) \
do { \
flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
conn_event_do(c, conn, event); \
} while (0)
/**
* tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
* @conn: Connection pointer
*
* Return: 1 if destination is in low RTT table, 0 otherwise
*/
static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
{
int i;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
if (inany_equals(&conn->faddr, low_rtt_dst + i))
return 1;
return 0;
}
/**
* tcp_rtt_dst_check() - Check tcpi_min_rtt, insert endpoint in table if low
* @conn: Connection pointer
* @tinfo: Pointer to struct tcp_info for socket
*/
static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
const struct tcp_info *tinfo)
{
#ifdef HAS_MIN_RTT
int i, hole = -1;
if (!tinfo->tcpi_min_rtt ||
(int)tinfo->tcpi_min_rtt >