Diffstat (limited to 'tcp.c')
-rw-r--r-- | tcp.c | 1367
1 file changed, 1367 insertions, 0 deletions
@@ -0,0 +1,1367 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + +/* PASST - Plug A Simple Socket Transport + * + * tcp.c - TCP L2-L4 translation state machine + * + * Copyright (c) 2020-2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + * + */ + +/** + * DOC: Theory of Operation + * + * + * Overview + * -------- + * + * This implementation maps TCP traffic between a single L2 interface (tap) and + * native TCP (L4) sockets, mimicking and reproducing as closely as possible the + * inferred behaviour of applications running on a guest, connected via said L2 + * interface. Four connection flows are supported: + * - from the local host to the guest behind the tap interface: + * - this is the main use case for proxies in service meshes + * - we bind to all unbound local ports, and relay traffic between L4 sockets + * with local endpoints and the L2 interface + * - from remote hosts to the guest behind the tap interface: + * - this might be needed for services that need to be addressed directly, + * and typically configured with special port forwarding rules (which are + * not needed here) + * - we also relay traffic between L4 sockets with remote endpoints and the L2 + * interface + * - from the guest to the local host: + * - this is not observed in practice, but implemented for completeness and + * transparency + * - from the guest to external hosts: + * - this might be needed for applications running on the guest that need to + * directly access internet services (e.g. NTP) + * + * Relevant goals are: + * - transparency: sockets need to behave as if guest applications were running + * directly on the host. This is achieved by: + * - avoiding port and address translations whenever possible + * - mirroring TCP dynamics by observation of socket parameters (TCP_INFO + * socket option) and TCP headers of packets coming from the tap interface, + * reapplying those parameters in both flow directions (including TCP_MSS, + * TCP_WINDOW_CLAMP socket options) + * - simplicity: only a small subset of TCP logic is implemented here and + * delegated as much as possible to the TCP implementations of guest and host + * kernel. This is achieved by: + * - avoiding a complete TCP stack reimplementation, with a modified TCP state + * machine focused on the translation of observed states instead + * - mirroring TCP dynamics as described above and hence avoiding the need for + * segmentation, explicit queueing, and reassembly of segments + * - security: + * - no dynamic memory allocation is performed + * - TODO: synflood protection + * - TODO: sequence collision attacks + * + * Portability is limited by usage of Linux-specific socket options. + * + * + * Limits + * ------ + * + * To avoid the need for dynamic memory allocation, a maximum, reasonable amount + * of connections is defined by TCP_MAX_CONNS below (currently 256k, close to + * the maximum amount of file descriptors typically available to a process on + * Linux). + * + * While fragmentation and reassembly are not implemented, tracking of missing + * segments and retransmissions needs to be, thus data needs to linger on + * sockets as long as it's not acknowledged by the guest, and read using + * MSG_PEEK into a single, preallocated static buffer sized to the maximum + * supported window, 64MiB. This imposes a practical limitation on window + * scaling, that is, the maximum factor is 1024. 
If a bigger window scaling + * factor is observed during connection establishment, connection is reset and + * reestablished by omitting the scaling factor in the SYN segment. This + * limitation only applies to the window scaling advertised by the guest, but + * if exceeded, no window scaling will be allowed at all toward either endpoint. + * + * + * Ports + * ----- + * + * To avoid the need for ad-hoc configuration of port forwarding or allowed + * ports, listening sockets are opened and bound to all unbound ports on the + * host, as far as process capabilities allow. This service needs to be started + * after any application proxy that needs to bind to local ports. + * + * No port translation is needed for connections initiated remotely or by the + * local host: source port from socket is reused while establishing connections + * to the guest. + * + * For connections initiated by the guest, it's not possible to force the same + * source port as connections are established by the host kernel: that's the + * only port translation needed. + * + * + * Connection tracking and storage + * ------------------------------- + * + * Connection are tracked by the @tc array of struct tcp_conn, containing + * addresses, ports, TCP states and parameters. This is statically allocated and + * indices are the file descriptor numbers associated to inbound or outbound + * sockets. + * + * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for + * separate data structures depending on the protocol version. + * + * - Inbound connection requests (to the guest) are mapped using the triple + * < source IP address, source port, destination port > + * - Outbound connection requests (from the guest) are mapped using the triple + * < destination IP address, destination port, source port > + * where the source port is the one used by the guest, not the one used by the + * corresponding host socket + * + * + * Initialisation + * -------------- + * + * Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for + * IPv4 and IPv6) are opened and bound to wildcard addresses. Some will fail to + * bind (for low ports, or ports already bound, e.g. by a proxy). These are + * added to the epoll list, with no separate storage. + * + * + * States and events + * ----------------- + * + * These states apply to connected sockets only, listening sockets are always + * open after initialisation, in LISTEN state. A single state is maintained for + * both sides of the connection, and most states are omitted as they are already + * handled by host kernel and guest. + * + * - CLOSED no connection + * No associated events: this is always a final state, new connections + * directly start from TAP_SYN_SENT or SOCK_SYN_SENT described below. 
+ * + * - TAP_SYN_SENT connect() in progress, triggered from tap + * - connect() completes SYN,ACK to tap > TAP_SYN_RCVD + * - connect() aborts RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - SOCK_SYN_SENT new connected socket, SYN sent to tap + * - SYN,ACK from tap ACK to tap > ESTABLISHED + * - SYN,ACK timeout RST to tap, close socket > CLOSED + * - socket error RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - TAP_SYN_RCVD connect() completed, SYN,ACK sent to tap + * - ACK from tap > ESTABLISHED + * - ACK timeout RST to tap, close socket > CLOSED + * - socket error RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - ESTABLISHED connection established, ready for data + * - zero-sized socket read FIN to tap > ESTABLISHED_SOCK_FIN + * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN + * - socket error RST to tap, close socket > CLOSED + * - FIN from tap FIN,ACK to tap, close socket > FIN_WAIT_1 + * - RST from tap close socket > CLOSED + * + * - ESTABLISHED_SOCK_FIN socket wants to close connection, data allowed + * - ACK from tap > CLOSE_WAIT + * - ACK timeout RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - CLOSE_WAIT socket wants to close connection, seen by tap + * - socket error RST to tap, close socket > CLOSED + * - FIN from tap ACK to tap, close socket > LAST_ACK + * - FIN timeout RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - LAST_ACK socket started close, tap completed it + * - anything from socket close socket > CLOSED + * - socket error RST to tap, close socket > CLOSED + * - ACK timeout RST to tap, close socket > CLOSED + * + * - FIN_WAIT_1 tap wants to close connection, _FIN,ACK sent_ + * - ACK from tap close socket > CLOSED + * - socket error RST to tap, close socket > CLOSED + * - ACK timeout RST to tap, close socket > CLOSED + * + * + * Connection setup + * ---------------- + * + * - inbound connection (from socket to guest): on accept() from listening + * socket, the new socket is mapped in connection tracking table, and + * three-way handshake initiated towards the guest, advertising MSS and window + * size and scaling from socket parameters + * - outbound connection (from guest to socket): on SYN segment from guest, a + * new socket is created and mapped in connection tracking table, setting + * MSS and window clamping from header and option of the observed SYN segment + * + * + * Aging and timeout + * ----------------- + * + * Two bitmaps of TCP_MAX_CONNS bits indicate which connections need scheduled + * actions: + * - @tcp_act_fast is used to send ACK segments to the tap once TCP_INFO reports + * an increased number of acknowledged bytes sent on a socket, and examined + * every 20ms (one tenth of current TCP_DELACK_MAX on Linux): for each marked + * connection, a TCP_INFO query is performed and ACK segments are sent right + * away as needed + * - @tcp_act_slow is used for state and retransmission timeouts, and examined + * every 2s: for each marked connection with an expired @timeout timestamp + * specific actions are taken depending on the connection state: + * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment + * from tap expires, connection is reset (RST to tap, socket closed) + * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from + * tap expires, connection is reset (RST to tap, socket closed) + * - ESTABLISHED: after a timeout of 1s 
(TODO: implement requirements from + * RFC 6298) waiting for an ACK segment from tap expires, data from socket + * queue is retransmitted starting from the last ACK sequence + * - ESTABLISHED: after a two hours (current TCP_KEEPALIVE_TIME on Linux) + * timeout waiting for any activity expires, connection is reset (RST to + * tap, socket closed) + * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK + * segment from tap expires, connection is reset (RST to tap, socket closed) + * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from + * tap expires, connection is reset (RST to tap, socket closed) + * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from + * socket expires, connection is reset (RST to tap, socket closed) + * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from + * tap expires, connection is reset (RST to tap, socket closed) + * + * + * Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states) + * ---------------------------------------------------------- + * + * @seq_to_tap: next sequence for packets to tap + * @seq_ack_from_tap: last ACK number received from tap + * @seq_from_tap: next sequence for packets from tap (not actually sent) + * @seq_ack_to_tap: last ACK number sent to tap + * + * @seq_init_from_tap: initial sequence number from tap + * + * @tap_window: last window size received from tap, scaled + * @tcpi_acked_last: most recent value of tcpi_bytes_acked (TCP_INFO) + * + * - from socket to tap: + * - on new data from socket: + * - peek into buffer + * - send data to tap: + * - starting at offset (@seq_to_tap - @seq_ack_from_tap) + * - in MSS-sized segments + * - increasing @seq_to_tap at each segment + * - up to window (until @seq_to_tap - @seq_ack_from_tap <= @tap_window) + * - mark socket in bitmap for periodic ACK check, set @last_ts_to_tap + * - on read error, send RST to tap, close socket + * - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN + * - on ACK from tap: + * - check if it's the second duplicated ACK + * - consume buffer by difference between new ack_seq and @seq_ack_from_tap + * - update @seq_ack_from_tap from ack_seq in header + * - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and + * resend with steps listed above + * - set TCP_WINDOW_CLAMP from TCP header from tap + * - on @seq_ack_from_tap == @seq_to_tap, mark in bitmap, umark otherwise + * - periodically: + * - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer + * (TODO: implement requirements from RFC 6298, currently 3s fixed) from + * @last_ts_to_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and + * resend data with the steps listed above + * + * - from tap to socket: + * - on packet from tap: + * - set TCP_WINDOW_CLAMP from TCP header from tap + * - check seq from header against @seq_from_tap, if data is missing, send + * two ACKs with number @seq_ack_to_tap, discard packet + * - otherwise queue data to socket, set @seq_from_tap to seq from header + * plus payload length + * - query socket for TCP_INFO, on tcpi_bytes_acked > @tcpi_acked_last, + * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap + * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and + * send ACK to tap + * - set @last_ts_sock + * - on @seq_ack_to_tap < @seq_from_tap, mark socket for later ACK in bitmap + * - periodically: + * - if socket is marked in bitmap, query socket for TCP_INFO, on + * tcpi_bytes_acked > @tcpi_acked_last, + * set @tcpi_acked_last to tcpi_bytes_acked, set 
@seq_ack_to_tap + * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and + * send ACK to tap + * - on @seq_ack_to_tap == @seq_from_tap, unmark socket from bitmap + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <limits.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <unistd.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <time.h> + +#include "passt.h" +#include "tap.h" +#include "util.h" + +/* Approximately maximum number of open descriptors per process */ +#define MAX_CONNS (256 * 1024) + +#define MAX_WS 10 +#define MAX_WINDOW (1 << (16 + (MAX_WS))) +#define MSS_DEFAULT 536 +#define WINDOW_DEFAULT 4380 + +#define SYN_TIMEOUT 240000 /* ms */ +#define ACK_TIMEOUT 3000 +#define ACT_TIMEOUT 7200000 +#define FIN_TIMEOUT 240000 +#define LAST_ACK_TIMEOUT 240000 + +#define SOCK_ACK_INTERVAL 20 + +/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of + * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP + */ +#define SOL_TCP IPPROTO_TCP + +static char tcp_in_buf[MAX_WINDOW]; + +static uint8_t tcp_act_fast[MAX_CONNS / 8] = { 0 }; +static uint8_t tcp_act_slow[MAX_CONNS / 8] = { 0 }; + +enum tcp_state { + CLOSED = 0, + TAP_SYN_SENT, + SOCK_SYN_SENT, + TAP_SYN_RCVD, + ESTABLISHED, + ESTABLISHED_SOCK_FIN, + CLOSE_WAIT, + LAST_ACK, + FIN_WAIT_1, +}; + +#define FIN (1 << 0) +#define SYN (1 << 1) +#define RST (1 << 2) +#define ACK (1 << 4) + +#define OPT_EOL 0 +#define OPT_NOP 1 +#define OPT_MSS 2 +#define OPT_WS 3 +#define OPT_SACKP 4 +#define OPT_SACK 5 +#define OPT_TS 8 + +/** + * struct tcp_conn - Descriptor for a TCP connection + * @a.a6: IPv6 remote address, can be IPv4-mapped + * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 + * @a.a4.one: Ones prefix for IPv4-mapped + * @a.a4.a: IPv4 address + * @tap_port: Guest-facing tap port + * @sock_port: Remote, socket-facing port + * @s: TCP connection state + * @seq_to_tap: Next sequence for packets to tap + * @seq_ack_from_tap: Last ACK number received from tap + * @seq_from_tap: Next sequence for packets from tap (not actually sent) + * @seq_ack_to_tap: Last ACK number sent to tap + * @seq_init_from_tap: Initial sequence number from tap + * @tcpi_acked_last: Most recent value of tcpi_bytes_acked (TCP_INFO query) + * @dup_acks: Count of currently duplicated ACKs from tap + * @ws_allowed: Window scaling allowed + * @ws: Window scaling factor + * @tap_window: Last window size received from tap, scaled + * @last_ts_sock: Last activity timestamp from socket for timeout purposes + * @last_ts_tap: Last activity timestamp from tap for timeout purposes + * @mss_guest: Maximum segment size advertised by guest + */ +struct tcp_conn { + union { + struct in6_addr a6; + struct { + uint8_t zero[10]; + uint8_t one[2]; + struct in_addr a; + } a4; + } a; + in_port_t tap_port; + in_port_t sock_port; + enum tcp_state s; + + uint32_t seq_to_tap; + uint32_t seq_ack_from_tap; + uint32_t seq_from_tap; + uint32_t seq_ack_to_tap; + uint32_t seq_init_from_tap; + uint64_t tcpi_acked_last; + int dup_acks; + + int ws_allowed; + int ws; + int tap_window; + + struct timespec last_ts_sock; + struct timespec last_ts_tap; + + int mss_guest; +}; + +static struct tcp_conn tc[MAX_CONNS]; + +static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len); + +/** + * tcp_act_fast_set() - Set 
socket in bitmap for "fast" timeout events + * @s: Socket file descriptor number + */ +static void tcp_act_fast_set(int s) +{ + tcp_act_fast[s / 8] |= 1 << (s % 8); +} + +/** + * tcp_act_fast_clear() - Clear socket from bitmap for "fast" timeout events + * @s: Socket file descriptor number + */ +static void tcp_act_fast_clear(int s) +{ + tcp_act_fast[s / 8] &= ~(1 << (s % 8)); +} + +/** + * tcp_act_slow_set() - Set socket in bitmap for "slow" timeout events + * @s: Socket file descriptor number + */ +static void tcp_act_slow_set(int s) +{ + tcp_act_slow[s / 8] |= 1 << (s % 8); +} + +/** + * tcp_act_slow_clear() - Clear socket from bitmap for "slow" timeout events + * @s: Socket file descriptor number + */ +static void tcp_act_slow_clear(int s) +{ + tcp_act_slow[s / 8] &= ~(1 << (s % 8)); +} + +/** + * tcp_opt_get() - Get option, and value if any, from TCP header + * @th: Pointer to TCP header + * @len: Length of buffer, including TCP header + * @type: Option type to look for + * @optlen: Optional, filled with option length if passed + * @value: Optional, set to start of option value if passed + * + * Return: Option value, meaningful for up to 4 bytes, -1 if not found + */ +static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type, + uint8_t *optlen, void *value) +{ + uint8_t *p, __type, __optlen; + + len -= sizeof(*th); + p = (uint8_t *)(th + 1); + + if (len > th->doff * 4 - sizeof(*th)) + len = th->doff * 4 - sizeof(*th); + + while (len >= 2) { + switch (*p) { + case OPT_EOL: + return -1; + case OPT_NOP: + p++; + len--; + break; + default: + __type = *(p++); + __optlen = *(p++); + len -= 2; + + if (type == __type) { + if (optlen) + *optlen = __optlen; + if (value) + value = p; + + if (__optlen - 2 == 0) + return 0; + + if (__optlen - 2 == 1) + return *p; + + if (__optlen - 2 == 2) + return ntohs(*(uint16_t *)p); + + return ntohl(*(uint32_t *)p); + } + + p += __optlen - 2; + len -= __optlen - 2; + } + } + + return -1; +} + +/** + * tcp_close_and_epoll_del() - Close socket and remove from epoll descriptor + * @c: Execution context + * @s: File descriptor number for socket + */ +static void tcp_close_and_epoll_del(struct ctx *c, int s) +{ + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); + close(s); + tcp_act_fast_clear(s); + tcp_act_slow_clear(s); +} + +/** + * tcp_rst() - Reset a connection: send RST segment to tap, close socket + * @c: Execution context + * @s: File descriptor number for socket + */ +static void tcp_rst(struct ctx *c, int s) +{ + if (s < 0) + return; + + tcp_send_to_tap(c, s, RST, NULL, 0); + tcp_close_and_epoll_del(c, s); + tc[s].s = CLOSED; +} + +/** + * tcp_send_to_tap() - Send segment to tap, with options and values from socket + * @c: Execution context + * @s: File descriptor number for socket + * @flags: TCP flags to set + * @in: Input buffer, L4 header + * @len: Buffer length, at L4 + * + * Return: -1 on error with connection reset, 0 otherwise + */ +static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) +{ + char buf[USHRT_MAX] = { 0 }, *data; + struct tcp_info info = { 0 }; + socklen_t sl = sizeof(info); + int ws = 0, have_info = 1; + struct tcphdr *th; + + if (getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) { + if (!(flags & RST)) { + tcp_rst(c, s); + return -1; + } + + have_info = 0; + } + + th = (struct tcphdr *)buf; + data = (char *)(th + 1); + + if (flags & SYN && have_info) { + if (tc[s].ws_allowed) + ws = info.tcpi_snd_wscale; + + /* Options: MSS, NOP and window scale if allowed (4-8 bytes) */ + *data++ = 2; + *data++ = 
4; + *(uint16_t *)data = htons(info.tcpi_snd_mss); + data += 2; + + if (ws) { + *data++ = 1; + + *data++ = 3; + *data++ = 3; + *data++ = ws; + + th->doff = (20 + 8) / 4; + } else { + th->doff = (20 + 4) / 4; + } + + th->seq = htonl(tc[s].seq_to_tap++); + } else { + th->doff = 20 / 4; + + th->seq = htonl(tc[s].seq_to_tap); + tc[s].seq_to_tap += len; + } + + if ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last || (flags & ACK) || + len) && + have_info) { + uint64_t ack_seq; + + th->ack = 1; + /* info.tcpi_bytes_acked already includes one byte for SYN, but + * not for incoming connections. + */ + ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap; + if (!info.tcpi_bytes_acked) + ack_seq++; + ack_seq &= (uint32_t)~0U; + + tc[s].seq_ack_to_tap = ack_seq; + th->ack_seq = htonl(tc[s].seq_ack_to_tap); + + tc[s].tcpi_acked_last = info.tcpi_bytes_acked; + } else { + if (!len && !flags) + return 0; + + th->ack = th->ack_seq = 0; + } + + th->rst = !!(flags & RST); + th->syn = !!(flags & SYN); + th->fin = !!(flags & FIN); + + th->source = tc[s].sock_port; + th->dest = tc[s].tap_port; + + if (have_info) + th->window = htons(info.tcpi_snd_wnd >> info.tcpi_snd_wscale); + else + th->window = WINDOW_DEFAULT; + + th->urg_ptr = 0; + th->check = 0; + + memcpy(data, in, len); + + tap_ip_send(c, &tc[s].a.a6, IPPROTO_TCP, buf, th->doff * 4 + len); + + return 0; +} + +/** + * tcp_clamp_window() - Set window and scaling from option, clamp on socket + * @s: File descriptor number for socket + * @th: TCP header, from tap + * @len: Buffer length, at L4 + */ +static void tcp_clamp_window(int s, struct tcphdr *th, int len) +{ + int ws; + + if (!tc[s].tap_window) { + ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); + if (ws >= 0 && ws <= MAX_WS) { + tc[s].ws_allowed = 1; + tc[s].ws = ws; + } else { + tc[s].ws_allowed = 0; + tc[s].ws = 0; + } + + /* First value is not scaled. Also, don't clamp yet, to avoid + * getting a zero scale just because we set a small window now. 
+ */ + tc[s].tap_window = ntohs(th->window); + } else { + tc[s].tap_window = ntohs(th->window) << tc[s].ws; + setsockopt(s, SOL_TCP, TCP_WINDOW_CLAMP, + &tc[s].tap_window, sizeof(tc[s].tap_window)); + } +} + +/** + * tcp_conn_from_tap() - Handle connection request (SYN segment) from tap + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @addr: Remote address, pointer to sin_addr or sin6_addr + * @th: TCP header from tap + * @len: Packet length at L4 + */ +static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, + struct tcphdr *th, size_t len) +{ + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = th->dest, + .sin_addr = *(struct in_addr *)addr, + }; + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = th->dest, + .sin6_addr = *(struct in6_addr *)addr, + }; + struct epoll_event ev = { 0 }; + const struct sockaddr *sa; + socklen_t sl; + int s; + + s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (s < 0) + return; + + tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); + if (tc[s].mss_guest < 0) + tc[s].mss_guest = MSS_DEFAULT; + sl = sizeof(tc[s].mss_guest); + setsockopt(s, SOL_TCP, TCP_MAXSEG, &tc[s].mss_guest, sl); + + tcp_clamp_window(s, th, len); + + if (af == AF_INET) { + sa = (const struct sockaddr *)&addr4; + sl = sizeof(addr4); + + memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); + memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); + memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a)); + } else { + sa = (const struct sockaddr *)&addr6; + sl = sizeof(addr6); + + memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6)); + } + + tc[s].sock_port = th->dest; + tc[s].tap_port = th->source; + + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; + ev.data.fd = s; + + tc[s].seq_init_from_tap = ntohl(th->seq); + tc[s].seq_from_tap = tc[s].seq_init_from_tap + 1; + tc[s].seq_ack_to_tap = tc[s].seq_from_tap; + + /* TODO: RFC 6528 with SipHash, worth it? */ + tc[s].seq_ack_from_tap = tc[s].seq_to_tap = 0; + + if (connect(s, sa, sl)) { + if (errno != EINPROGRESS) { + tcp_rst(c, s); + return; + } + + ev.events |= EPOLLOUT; + tc[s].s = TAP_SYN_SENT; + } else { + if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0)) + return; + + tc[s].s = TAP_SYN_RCVD; + } + + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); + + return; +} + +/** + * tcp_sock_lookup() - Look up socket given remote address and pair of ports + * @af: Address family, AF_INET or AF_INET6 + * @tap_port: tap-facing port + * @sock_port: Socket-facing port + * + * Return: file descriptor number for socket, if found, -1 otherwise + */ +static int tcp_sock_lookup(int af, void *addr, + in_port_t tap_port, in_port_t sock_port) +{ + int i; + + /* TODO: hash table and lookup. This is just a dummy implementation. 
*/ + for (i = 0; i < MAX_CONNS; i++) { + if (af == AF_INET && IN6_IS_ADDR_V4MAPPED(&tc[i].a.a6) && + !memcmp(&tc[i].a.a4.a, addr, sizeof(tc[i].a.a4.a)) && + tc[i].tap_port == tap_port && + tc[i].sock_port == sock_port && + tc[i].s) + return i; + + if (af == AF_INET6 && + !memcmp(&tc[i].a.a6, addr, sizeof(tc[i].a.a6)) && + tc[i].tap_port == tap_port && + tc[i].sock_port == sock_port && + tc[i].s) + return i; + } + + return -1; +} + +/** + * tcp_conn_from_sock() - Handle new connection request from listening socket + * @c: Execution context + * @fd: File descriptor number for listening socket + */ +static void tcp_conn_from_sock(struct ctx *c, int fd) +{ + struct sockaddr_storage sa_r, sa_l; + socklen_t sa_len = sizeof(sa_r); + struct epoll_event ev = { 0 }; + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa4; + int s; + + if (getsockname(fd, (struct sockaddr *)&sa_l, &sa_len)) + return; + + s = accept4(fd, (struct sockaddr *)&sa_r, &sa_len, SOCK_NONBLOCK); + if (s == -1) + return; + + if (sa_l.ss_family == AF_INET) { + sa4 = (struct sockaddr_in *)&sa_r; + + memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); + memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); + memcpy(&tc[s].a.a4.a, &sa4->sin_addr, sizeof(tc[s].a.a4.a)); + + tc[s].sock_port = sa4->sin_port; + + sa4 = (struct sockaddr_in *)&sa_l; + tc[s].tap_port = sa4->sin_port; + + } else if (sa_l.ss_family == AF_INET6) { + sa6 = (struct sockaddr_in6 *)&sa_r; + + memcpy(&tc[s].a.a6, &sa6->sin6_addr, sizeof(tc[s].a.a6)); + + tc[s].sock_port = sa6->sin6_port; + + sa6 = (struct sockaddr_in6 *)&sa_l; + tc[s].tap_port = sa6->sin6_port; + } + + /* TODO: RFC 6528 with SipHash, worth it? */ + tc[s].seq_to_tap = 0; + + tc[s].ws_allowed = 1; + + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock); + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); + + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; + ev.data.fd = s; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); + + tc[s].s = SOCK_SYN_SENT; + tcp_send_to_tap(c, s, SYN, NULL, 0); +} + +/** + * tcp_send_to_sock() - Send buffer to socket, update timestamp and sequence + * @c: Execution context + * @s: File descriptor number for socket + * @seq: Previous TCP sequence, host order + * @data: Data buffer + * @len: Length at L4 + * @extra_flags: Additional flags for send(), if any + * + * Return: -1 on socket error with connection reset, 0 otherwise + */ +static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len, + int extra_flags) +{ + int err = send(s, data, len, MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags); + + if (err < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + /* If we can't queue right now, do nothing, sender has + * to retransmit. 
+ */ + return 0; + } + + tcp_rst(c, s); + return -1; + } + + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock); + tc[s].seq_from_tap = seq + len; + + return 0; +} + +/** + * tcp_check_dupack() - Check if given ACK number is duplicated, update counter + * @s: File descriptor number for socket + * @ack_seq: ACK sequence, host order + * + * Return: 1 on two duplicated ACKs observed, with counter reset, 0 otherwise + */ +static int tcp_check_dupack(int s, uint32_t ack_seq) +{ + if (ack_seq == tc[s].seq_ack_from_tap && ++tc[s].dup_acks == 2) { + tc[s].dup_acks = 0; + return 1; + } + + return 0; +} + +/** + * tcp_sock_consume() - Consume (discard) data from socket buffer + * @s: File descriptor number for socket + * @ack_seq: ACK sequence, host order + * + * Return: -1 on invalid sequence, 0 otherwise + */ +static int tcp_sock_consume(int s, uint32_t ack_seq) +{ + int to_ack; + + /* Implicitly take care of wrap-arounds */ + to_ack = ack_seq - tc[s].seq_ack_from_tap; + + if (to_ack < 0) + return -1; + + recv(s, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC); + tc[s].seq_ack_from_tap = ack_seq; + + return 0; +} + +/** + * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window + * @c: Execution context + * @s: File descriptor number for socket + * + * Return: non-zero on socket error or pending data, 0 otherwise + */ +static int tcp_data_from_sock(struct ctx *c, int s) +{ + int len, offset, left, send; + + /* Don't dequeue until acknowledged by guest */ + len = recv(s, tcp_in_buf, sizeof(tcp_in_buf), MSG_DONTWAIT | MSG_PEEK); + if (len < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) + tcp_rst(c, s); + return 1; + } + + if (len == 0) { + if (tc[s].s >= ESTABLISHED_SOCK_FIN) + return 0; + + tc[s].s = ESTABLISHED_SOCK_FIN; + if (tcp_send_to_tap(c, s, FIN | ACK, NULL, 0)) + return 0; + + left = 0; + goto out; + } + + offset = tc[s].seq_to_tap - tc[s].seq_ack_from_tap; + left = len - offset; + while (left && offset + tc[s].mss_guest <= tc[s].tap_window) { + if (left < tc[s].mss_guest) + send = left; + else + send = tc[s].mss_guest; + + if (tcp_send_to_tap(c, s, 0, tcp_in_buf + offset, send)) + return 0; + + offset += send; + left -= send; + } + +out: + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); + tcp_act_slow_set(s); + + return !!left; +} + +/** + * tcp_tap_handler() - Handle packets from tap and state transitions + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @in: Input buffer + * @len: Length, including TCP header + */ +void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) +{ + struct tcphdr *th = (struct tcphdr *)in; + size_t off; + int s, ws; + + if (len < sizeof(*th)) + return; + + off = th->doff * 4; + if (off < sizeof(*th) || off > len) + return; + + s = tcp_sock_lookup(af, addr, th->source, th->dest); + + if (s < 0) { + if (th->syn) + tcp_conn_from_tap(c, af, addr, th, len); + return; + } + + if (th->rst) { + tcp_close_and_epoll_del(c, s); + return; + } + + tcp_clamp_window(s, th, len); + + if (th->ack) + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); + + switch (tc[s].s) { + case SOCK_SYN_SENT: + if (!th->syn || !th->ack) + return; + + tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); + if (tc[s].mss_guest < 0) + tc[s].mss_guest = MSS_DEFAULT; + + ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); + if (ws > MAX_WS) { + if (tcp_send_to_tap(c, s, RST, NULL, 0)) + return; + + tc[s].seq_to_tap = 0; + tc[s].ws_allowed = 0; + tcp_send_to_tap(c, s, SYN, NULL, 0); + return; + } + + tc[s].seq_from_tap = 
tc[s].seq_init_from_tap = ntohl(th->seq); + tc[s].seq_ack_to_tap = tc[s].seq_from_tap; + + tc[s].s = ESTABLISHED; + tcp_send_to_tap(c, s, ACK, NULL, 0); + break; + case TAP_SYN_SENT: + break; + case TAP_SYN_RCVD: + if (th->fin) { + shutdown(s, SHUT_WR); + tc[s].s = FIN_WAIT_1; + + break; + } + + if (!th->ack) { + tcp_rst(c, s); + return; + } + + tc[s].seq_ack_from_tap = ntohl(th->ack_seq); + + tc[s].s = ESTABLISHED; + break; + case ESTABLISHED: + if (th->ack) { + int retrans = 0; + + if (len == th->doff) + retrans = tcp_check_dupack(s, th->ack_seq); + + if (tcp_sock_consume(s, ntohl(th->ack_seq))) { + tcp_rst(c, s); + return; + } + + if (retrans) { + tc[s].seq_to_tap = tc[s].seq_ack_from_tap; + tcp_data_from_sock(c, s); + } + } + + if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off, + th->psh ? 0 : MSG_MORE)) + break; + + if (th->fin) { + shutdown(s, SHUT_WR); + tc[s].s = FIN_WAIT_1; + } + + break; + case ESTABLISHED_SOCK_FIN: + if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off, + th->psh ? 0 : MSG_MORE) < 0) + break; + + if (th->ack) { + shutdown(s, SHUT_RD); + if (!tcp_data_from_sock(c, s)) + tc[s].s = CLOSE_WAIT; + + if (tcp_sock_consume(s, ntohl(th->ack_seq))) { + tcp_rst(c, s); + return; + } + } + + break; + + case CLOSE_WAIT: + if (tcp_sock_consume(s, ntohl(th->ack_seq))) { + tcp_rst(c, s); + return; + } + + if (th->fin) { + shutdown(s, SHUT_WR); + tc[s].s = LAST_ACK; + } + + break; + case FIN_WAIT_1: + case LAST_ACK: + case CLOSED: /* ;) */ + break; + } + + if (tc[s].seq_to_tap > tc[s].seq_ack_from_tap) + tcp_act_slow_set(s); + else + tcp_act_slow_clear(s); + + if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap) + tcp_act_fast_set(s); + else + tcp_act_fast_clear(s); +} + +/** + * tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event + * @c: Execution context + * @s: File descriptor number for socket + */ +static void tcp_connect_finish(struct ctx *c, int s) +{ + struct epoll_event ev = { 0 }; + socklen_t sl; + int so; + + sl = sizeof(so); + if (getsockopt(s, SOL_SOCKET, SO_ERROR, &so, &sl) || so) { + tcp_rst(c, s); + return; + } + + if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0) < 0) + return; + + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; + ev.data.fd = s; + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev); + + tc[s].s = TAP_SYN_RCVD; +} + +/** + * tcp_sock_handler() - Handle new data from socket + * @c: Execution context + * @s: File descriptor number for socket + * @events: epoll events bitmap + */ +void tcp_sock_handler(struct ctx *c, int s, uint32_t events) +{ + socklen_t sl; + int so; + + if (tc[s].s == LAST_ACK) { + tcp_close_and_epoll_del(c, s); + return; + } + + sl = sizeof(so); + if ((events & EPOLLERR) || + getsockopt(s, SOL_SOCKET, SO_ACCEPTCONN, &so, &sl)) { + if (tc[s].s != CLOSED) + tcp_rst(c, s); + return; + } + + if (so) { + tcp_conn_from_sock(c, s); + return; + } + + if (events & EPOLLOUT) { /* Implies TAP_SYN_SENT */ + tcp_connect_finish(c, s); + return; + } + + if (tc[s].s == ESTABLISHED) + tcp_data_from_sock(c, s); + + if (events & EPOLLRDHUP || events & EPOLLHUP) { + if (tc[s].s == ESTABLISHED) + tc[s].s = ESTABLISHED_SOCK_FIN; + + tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); + + if (tc[s].s == FIN_WAIT_1) { + shutdown(s, SHUT_RD); + + if (tcp_sock_consume(s, ntohl(tc[s].seq_ack_from_tap))) { + tcp_rst(c, s); + return; + } + + tcp_close_and_epoll_del(c, s); + tc[s].s = CLOSED; + } + } +} + +/** + * tcp_sock_init() - Create and bind listening sockets for inbound connections + * @c: Execution context + * + * 
Return: 0 on success, -1 on failure + */ +int tcp_sock_init(struct ctx *c) +{ + in_port_t port; + + for (port = 0; port < (1 << 15) + (1 << 14); port++) { + if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, htons(port)) < 0) + return -1; + if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, htons(port)) < 0) + return -1; + } + + return 0; +} + +/** + * tcp_periodic_fast_one() - Handler for "fast" timeout events on one socket + * @c: Execution context + * @s: File descriptor number for socket + * @ts: Timestamp from caller + * + * Return: 0 if socket needs to be monitored further, non-zero otherwise + */ +int tcp_periodic_fast_one(struct ctx *c, int s, struct timespec *ts) +{ + if (timespec_diff_ms(ts, &tc[s].last_ts_sock) < SOCK_ACK_INTERVAL) + return 0; + + tc[s].last_ts_sock = *ts; + + tcp_send_to_tap(c, s, 0, NULL, 0); + + return tc[s].seq_from_tap == tc[s].seq_ack_to_tap; +} + +/** + * tcp_periodic_fast() - Handle sockets in "fast" event bitmap, clear as needed + * @c: Execution context + */ +void tcp_periodic_fast(struct ctx *c) +{ + long *word = (long *)tcp_act_fast, tmp; + struct timespec now; + unsigned int i; + int n, s; + + clock_gettime(CLOCK_MONOTONIC, &now); + + for (i = 0; i < sizeof(tcp_act_fast) / sizeof(long); i++, word++) { + tmp = *word; + while ((n = ffsl(tmp))) { + tmp &= ~(1UL << (n - 1)); + + s = i * sizeof(long) * 8 + n - 1; + + if (tcp_periodic_fast_one(c, s, &now)) + *word &= ~(1UL << (n - 1)); + } + } +} + +/** + * tcp_periodic_slow_one() - Handler for "slow" timeout events on one socket + * @c: Execution context + * @s: File descriptor number for socket + * @ts: Timestamp from caller + */ +void tcp_periodic_slow_one(struct ctx *c, int s, struct timespec *ts) +{ + switch (tc[s].s) { + case SOCK_SYN_SENT: + case TAP_SYN_SENT: + case TAP_SYN_RCVD: + if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > SYN_TIMEOUT) + tcp_rst(c, s); + break; + case ESTABLISHED_SOCK_FIN: + if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) { + tcp_rst(c, s); + break; + } + /* Falls through */ + case ESTABLISHED: + if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap && + timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACK_TIMEOUT) { + tc[s].seq_to_tap = tc[s].seq_ack_from_tap; + tcp_data_from_sock(c, s); + } + + if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACT_TIMEOUT && + timespec_diff_ms(ts, &tc[s].last_ts_sock) > ACT_TIMEOUT) + tcp_rst(c, s); + + break; + case CLOSE_WAIT: + case FIN_WAIT_1: + if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) + tcp_rst(c, s); + break; + case LAST_ACK: + if (timespec_diff_ms(ts, &tc[s].last_ts_sock) > + LAST_ACK_TIMEOUT) + tcp_rst(c, s); + break; + case CLOSED: + break; + } +} + +/** + * tcp_periodic_slow() - Handle sockets in "slow" event bitmap + * @c: Execution context + */ +void tcp_periodic_slow(struct ctx *c) +{ + long *word = (long *)tcp_act_slow, tmp; + struct timespec now; + unsigned int i; + int n; + + clock_gettime(CLOCK_MONOTONIC, &now); + + for (i = 0; i < sizeof(tcp_act_slow) / sizeof(long); i++, word++) { + tmp = *word; + while ((n = ffsl(tmp))) { + tmp &= ~(1UL << (n - 1)); + tcp_periodic_slow_one(c, i * sizeof(long) * 8 + n - 1, + &now); + } + } +}