// SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport * * tcp.c - TCP L2-L4 translation state machine * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio * */ /** * DOC: Theory of Operation * * * Overview * -------- * * This implementation maps TCP traffic between a single L2 interface (tap) and * native TCP (L4) sockets, mimicking and reproducing as closely as possible the * inferred behaviour of applications running on a guest, connected via said L2 * interface. Four connection flows are supported: * - from the local host to the guest behind the tap interface: * - this is the main use case for proxies in service meshes * - we bind to all unbound local ports, and relay traffic between L4 sockets * with local endpoints and the L2 interface * - from remote hosts to the guest behind the tap interface: * - this might be needed for services that need to be addressed directly, * and typically configured with special port forwarding rules (which are * not needed here) * - we also relay traffic between L4 sockets with remote endpoints and the L2 * interface * - from the guest to the local host: * - this is not observed in practice, but implemented for completeness and * transparency * - from the guest to external hosts: * - this might be needed for applications running on the guest that need to * directly access internet services (e.g. NTP) * * Relevant goals are: * - transparency: sockets need to behave as if guest applications were running * directly on the host. This is achieved by: * - avoiding port and address translations whenever possible * - mirroring TCP dynamics by observation of socket parameters (TCP_INFO * socket option) and TCP headers of packets coming from the tap interface, * reapplying those parameters in both flow directions (including TCP_MSS, * TCP_WINDOW_CLAMP socket options) * - simplicity: only a small subset of TCP logic is implemented here and * delegated as much as possible to the TCP implementations of guest and host * kernel. This is achieved by: * - avoiding a complete TCP stack reimplementation, with a modified TCP state * machine focused on the translation of observed states instead * - mirroring TCP dynamics as described above and hence avoiding the need for * segmentation, explicit queueing, and reassembly of segments * - security: * - no dynamic memory allocation is performed * - TODO: synflood protection * - TODO: sequence collision attacks * * Portability is limited by usage of Linux-specific socket options. * * * Limits * ------ * * To avoid the need for dynamic memory allocation, a maximum, reasonable amount * of connections is defined by TCP_MAX_CONNS below (currently 256k, close to * the maximum amount of file descriptors typically available to a process on * Linux). * * While fragmentation and reassembly are not implemented, tracking of missing * segments and retransmissions needs to be, thus data needs to linger on * sockets as long as it's not acknowledged by the guest, and read using * MSG_PEEK into a single, preallocated static buffer sized to the maximum * supported window, 64MiB. This imposes a practical limitation on window * scaling, that is, the maximum factor is 1024. If a bigger window scaling * factor is observed during connection establishment, connection is reset and * reestablished by omitting the scaling factor in the SYN segment. This * limitation only applies to the window scaling advertised by the guest, but * if exceeded, no window scaling will be allowed at all toward either endpoint. * * * Ports * ----- * * To avoid the need for ad-hoc configuration of port forwarding or allowed * ports, listening sockets are opened and bound to all unbound ports on the * host, as far as process capabilities allow. This service needs to be started * after any application proxy that needs to bind to local ports. * * No port translation is needed for connections initiated remotely or by the * local host: source port from socket is reused while establishing connections * to the guest. * * For connections initiated by the guest, it's not possible to force the same * source port as connections are established by the host kernel: that's the * only port translation needed. * * * Connection tracking and storage * ------------------------------- * * Connection are tracked by the @tc array of struct tcp_conn, containing * addresses, ports, TCP states and parameters. This is statically allocated and * indices are the file descriptor numbers associated to inbound or outbound * sockets. * * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for * separate data structures depending on the protocol version. * * - Inbound connection requests (to the guest) are mapped using the triple * < source IP address, source port, destination port > * - Outbound connection requests (from the guest) are mapped using the triple * < destination IP address, destination port, source port > * where the source port is the one used by the guest, not the one used by the * corresponding host socket * * * Initialisation * -------------- * * Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for * IPv4 and IPv6) are opened and bound to wildcard addresses. Some will fail to * bind (for low ports, or ports already bound, e.g. by a proxy). These are * added to the epoll list, with no separate storage. * * * States and events * ----------------- * * These states apply to connected sockets only, listening sockets are always * open after initialisation, in LISTEN state. A single state is maintained for * both sides of the connection, and most states are omitted as they are already * handled by host kernel and guest. * * - CLOSED no connection * No associated events: this is always a final state, new connections * directly start from TAP_SYN_SENT or SOCK_SYN_SENT described below. * * - TAP_SYN_SENT connect() in progress, triggered from tap * - connect() completes SYN,ACK to tap > TAP_SYN_RCVD * - connect() aborts RST to tap, close socket > CLOSED * - RST from tap close socket > CLOSED * * - SOCK_SYN_SENT new connected socket, SYN sent to tap * - SYN,ACK from tap ACK to tap > ESTABLISHED * - SYN,ACK timeout RST to tap, close socket > CLOSED * - socket error RST to tap, close socket > CLOSED * - RST from tap close socket > CLOSED * * - TAP_SYN_RCVD connect() completed, SYN,ACK sent to tap * - ACK from tap > ESTABLISHED * - ACK timeout RST to tap, close socket > CLOSED * - socket error RST to tap, close socket > CLOSED * - RST from tap close socket > CLOSED * * - ESTABLISHED connection established, ready for data * - zero-sized socket read FIN to tap > ESTABLISHED_SOCK_FIN * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN * - socket error RST to tap, close socket > CLOSED * - FIN from tap FIN,ACK to tap, close socket > FIN_WAIT_1 * - RST from tap close socket > CLOSED * * - ESTABLISHED_SOCK_FIN socket wants to close connection, data allowed * - ACK from tap > CLOSE_WAIT * - ACK timeout RST to tap, close socket > CLOSED * - RST from tap close socket > CLOSED * * - CLOSE_WAIT socket wants to close connection, seen by tap * - socket error RST to tap, close socket > CLOSED * - FIN from tap ACK to tap, close socket > LAST_ACK * - FIN timeout RST to tap, close socket > CLOSED * - RST from tap close socket > CLOSED * * - LAST_ACK socket started close, tap completed it * - anything from socket close socket > CLOSED * - socket error RST to tap, close socket > CLOSED * - ACK timeout RST to tap, close socket > CLOSED * * - FIN_WAIT_1 tap wants to close connection, _FIN,ACK sent_ * - ACK from tap close socket > CLOSED * - socket error RST to tap, close socket > CLOSED * - ACK timeout RST to tap, close socket > CLOSED * * * Connection setup * ---------------- * * - inbound connection (from socket to guest): on accept() from listening * socket, the new socket is mapped in connection tracking table, and * three-way handshake initiated towards the guest, advertising MSS and window * size and scaling from socket parameters * - outbound connection (from guest to socket): on SYN segment from guest, a * new socket is created and mapped in connection tracking table, setting * MSS and window clamping from header and option of the observed SYN segment * * * Aging and timeout * ----------------- * * Two bitmaps of TCP_MAX_CONNS bits indicate which connections need scheduled * actions: * - @tcp_act_fast is used to send ACK segments to the tap once TCP_INFO reports * an increased number of acknowledged bytes sent on a socket, and examined * every 20ms (one tenth of current TCP_DELACK_MAX on Linux): for each marked * connection, a TCP_INFO query is performed and ACK segments are sent right * away as needed * - @tcp_act_slow is used for state and retransmission timeouts, and examined * every 2s: for each marked connection with an expired @timeout timestamp * specific actions are taken depending on the connection state: * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment * from tap expires, connection is reset (RST to tap, socket closed) * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from * tap expires, connection is reset (RST to tap, socket closed) * - ESTABLISHED: after a timeout of 1s (TODO: implement requirements from * RFC 6298) waiting for an ACK segment from tap expires, data from socket * queue is retransmitted starting from the last ACK sequence * - ESTABLISHED: after a two hours (current TCP_KEEPALIVE_TIME on Linux) * timeout waiting for any activity expires, connection is reset (RST to * tap, socket closed) * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK * segment from tap expires, connection is reset (RST to tap, socket closed) * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from * tap expires, connection is reset (RST to tap, socket closed) * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from * socket expires, connection is reset (RST to tap, socket closed) * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from * tap expires, connection is reset (RST to tap, socket closed) * * * Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states) * ---------------------------------------------------------- * * @seq_to_tap: next sequence for packets to tap * @seq_ack_from_tap: last ACK number received from tap * @seq_from_tap: next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: last ACK number sent to tap * * @seq_init_from_tap: initial sequence number from tap * * @tap_window: last window size received from tap, scaled * @tcpi_acked_last: most recent value of tcpi_bytes_acked (TCP_INFO) * * - from socket to tap: * - on new data from socket: * - peek into buffer * - send data to tap: * - starting at offset (@seq_to_tap - @seq_ack_from_tap) * - in MSS-sized segments * - increasing @seq_to_tap at each segment * - up to window (until @seq_to_tap - @seq_ack_from_tap <= @tap_window) * - mark socket in bitmap for periodic ACK check, set @last_ts_to_tap * - on read error, send RST to tap, close socket * - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN * - on ACK from tap: * - check if it's the second duplicated ACK * - consume buffer by difference between new ack_seq and @seq_ack_from_tap * - update @seq_ack_from_tap from ack_seq in header * - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and * resend with steps listed above * - set TCP_WINDOW_CLAMP from TCP header from tap * - on @seq_ack_from_tap == @seq_to_tap, mark in bitmap, umark otherwise * - periodically: * - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer * (TODO: implement requirements from RFC 6298, currently 3s fixed) from * @last_ts_to_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and * resend data with the steps listed above * * - from tap to socket: * - on packet from tap: * - set TCP_WINDOW_CLAMP from TCP header from tap * - check seq from header against @seq_from_tap, if data is missing, send * two ACKs with number @seq_ack_to_tap, discard packet * - otherwise queue data to socket, set @seq_from_tap to seq from header * plus payload length * - query socket for TCP_INFO, on tcpi_bytes_acked > @tcpi_acked_last, * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and * send ACK to tap * - set @last_ts_sock * - on @seq_ack_to_tap < @seq_from_tap, mark socket for later ACK in bitmap * - periodically: * - if socket is marked in bitmap, query socket for TCP_INFO, on * tcpi_bytes_acked > @tcpi_acked_last, * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and * send ACK to tap * - on @seq_ack_to_tap == @seq_from_tap, unmark socket from bitmap */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "passt.h" #include "tap.h" #include "util.h" /* Approximately maximum number of open descriptors per process */ #define MAX_CONNS (256 * 1024) #define MAX_WS 10 #define MAX_WINDOW (1 << (16 + (MAX_WS))) #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 4380 #define SYN_TIMEOUT 240000 /* ms */ #define ACK_TIMEOUT 3000 #define ACT_TIMEOUT 7200000 #define FIN_TIMEOUT 240000 #define LAST_ACK_TIMEOUT 240000 #define SOCK_ACK_INTERVAL 20 /* We need to include for tcpi_bytes_acked, instead of * , but that doesn't include a definition for SOL_TCP */ #define SOL_TCP IPPROTO_TCP static char tcp_in_buf[MAX_WINDOW]; static uint8_t tcp_act_fast[MAX_CONNS / 8] = { 0 }; static uint8_t tcp_act_slow[MAX_CONNS / 8] = { 0 }; enum tcp_state { CLOSED = 0, TAP_SYN_SENT, SOCK_SYN_SENT, TAP_SYN_RCVD, ESTABLISHED, ESTABLISHED_SOCK_FIN, CLOSE_WAIT, LAST_ACK, FIN_WAIT_1, }; #define FIN (1 << 0) #define SYN (1 << 1) #define RST (1 << 2) #define ACK (1 << 4) #define OPT_EOL 0 #define OPT_NOP 1 #define OPT_MSS 2 #define OPT_WS 3 #define OPT_SACKP 4 #define OPT_SACK 5 #define OPT_TS 8 /** * struct tcp_conn - Descriptor for a TCP connection * @a.a6: IPv6 remote address, can be IPv4-mapped * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 * @a.a4.one: Ones prefix for IPv4-mapped * @a.a4.a: IPv4 address * @tap_port: Guest-facing tap port * @sock_port: Remote, socket-facing port * @s: TCP connection state * @seq_to_tap: Next sequence for packets to tap * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap * @seq_init_from_tap: Initial sequence number from tap * @tcpi_acked_last: Most recent value of tcpi_bytes_acked (TCP_INFO query) * @dup_acks: Count of currently duplicated ACKs from tap * @ws_allowed: Window scaling allowed * @ws: Window scaling factor * @tap_window: Last window size received from tap, scaled * @last_ts_sock: Last activity timestamp from socket for timeout purposes * @last_ts_tap: Last activity timestamp from tap for timeout purposes * @mss_guest: Maximum segment size advertised by guest */ struct tcp_conn { union { struct in6_addr a6; struct { uint8_t zero[10]; uint8_t one[2]; struct in_addr a; } a4; } a; in_port_t tap_port; in_port_t sock_port; enum tcp_state s; uint32_t seq_to_tap; uint32_t seq_ack_from_tap; uint32_t seq_from_tap; uint32_t seq_ack_to_tap; uint32_t seq_init_from_tap; uint64_t tcpi_acked_last; int dup_acks; int ws_allowed; int ws; int tap_window; struct timespec last_ts_sock; struct timespec last_ts_tap; int mss_guest; }; static struct tcp_conn tc[MAX_CONNS]; static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len); /** * tcp_act_fast_set() - Set socket in bitmap for "fast" timeout events * @s: Socket file descriptor number */ static void tcp_act_fast_set(int s) { tcp_act_fast[s / 8] |= 1 << (s % 8); } /** * tcp_act_fast_clear() - Clear socket from bitmap for "fast" timeout events * @s: Socket file descriptor number */ static void tcp_act_fast_clear(int s) { tcp_act_fast[s / 8] &= ~(1 << (s % 8)); } /** * tcp_act_slow_set() - Set socket in bitmap for "slow" timeout events * @s: Socket file descriptor number */ static void tcp_act_slow_set(int s) { tcp_act_slow[s / 8] |= 1 << (s % 8); } /** * tcp_act_slow_clear() - Clear socket from bitmap for "slow" timeout events * @s: Socket file descriptor number */ static void tcp_act_slow_clear(int s) { tcp_act_slow[s / 8] &= ~(1 << (s % 8)); } /** * tcp_opt_get() - Get option, and value if any, from TCP header * @th: Pointer to TCP header * @len: Length of buffer, including TCP header * @type: Option type to look for * @optlen: Optional, filled with option length if passed * @value: Optional, set to start of option value if passed * * Return: Option value, meaningful for up to 4 bytes, -1 if not found */ static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type, uint8_t *optlen, void *value) { uint8_t *p, __type, __optlen; len -= sizeof(*th); p = (uint8_t *)(th + 1); if (len > th->doff * 4 - sizeof(*th)) len = th->doff * 4 - sizeof(*th); while (len >= 2) { switch (*p) { case OPT_EOL: return -1; case OPT_NOP: p++; len--; break; default: __type = *(p++); __optlen = *(p++); len -= 2; if (type == __type) { if (optlen) *optlen = __optlen; if (value) value = p; if (__optlen - 2 == 0) return 0; if (__optlen - 2 == 1) return *p; if (__optlen - 2 == 2) return ntohs(*(uint16_t *)p); return ntohl(*(uint32_t *)p); } p += __optlen - 2; len -= __optlen - 2; } } return -1; } /** * tcp_close_and_epoll_del() - Close socket and remove from epoll descriptor * @c: Execution context * @s: File descriptor number for socket */ static void tcp_close_and_epoll_del(struct ctx *c, int s) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); close(s); tcp_act_fast_clear(s); tcp_act_slow_clear(s); } /** * tcp_rst() - Reset a connection: send RST segment to tap, close socket * @c: Execution context * @s: File descriptor number for socket */ static void tcp_rst(struct ctx *c, int s) { if (s < 0) return; tcp_send_to_tap(c, s, RST, NULL, 0); tcp_close_and_epoll_del(c, s); tc[s].s = CLOSED; } /** * tcp_send_to_tap() - Send segment to tap, with options and values from socket * @c: Execution context * @s: File descriptor number for socket * @flags: TCP flags to set * @in: Input buffer, L4 header * @len: Buffer length, at L4 * * Return: -1 on error with connection reset, 0 otherwise */ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) { char buf[USHRT_MAX] = { 0 }, *data; struct tcp_info info = { 0 }; socklen_t sl = sizeof(info); int ws = 0, have_info = 1; struct tcphdr *th; if (getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) { if (!(flags & RST)) { tcp_rst(c, s); return -1; } have_info = 0; } th = (struct tcphdr *)buf; data = (char *)(th + 1); if (flags & SYN && have_info) { if (tc[s].ws_allowed) ws = info.tcpi_snd_wscale; /* Options: MSS, NOP and window scale if allowed (4-8 bytes) */ *data++ = 2; *data++ = 4; *(uint16_t *)data = htons(info.tcpi_snd_mss); data += 2; if (ws) { *data++ = 1; *data++ = 3; *data++ = 3; *data++ = ws; th->doff = (20 + 8) / 4; } else { th->doff = (20 + 4) / 4; } th->seq = htonl(tc[s].seq_to_tap++); } else { th->doff = 20 / 4; th->seq = htonl(tc[s].seq_to_tap); tc[s].seq_to_tap += len; } if ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last || (flags & ACK) || len) && have_info) { uint64_t ack_seq; th->ack = 1; /* info.tcpi_bytes_acked already includes one byte for SYN, but * not for incoming connections. */ ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap; if (!info.tcpi_bytes_acked) ack_seq++; ack_seq &= (uint32_t)~0U; tc[s].seq_ack_to_tap = ack_seq; th->ack_seq = htonl(tc[s].seq_ack_to_tap); tc[s].tcpi_acked_last = info.tcpi_bytes_acked; } else { if (!len && !flags) return 0; th->ack = th->ack_seq = 0; } th->rst = !!(flags & RST); th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); th->source = tc[s].sock_port; th->dest = tc[s].tap_port; if (have_info) th->window = htons(info.tcpi_snd_wnd >> info.tcpi_snd_wscale); else th->window = WINDOW_DEFAULT; th->urg_ptr = 0; th->check = 0; memcpy(data, in, len); tap_ip_send(c, &tc[s].a.a6, IPPROTO_TCP, buf, th->doff * 4 + len); return 0; } /** * tcp_clamp_window() - Set window and scaling from option, clamp on socket * @s: File descriptor number for socket * @th: TCP header, from tap * @len: Buffer length, at L4 */ static void tcp_clamp_window(int s, struct tcphdr *th, int len) { int ws; if (!tc[s].tap_window) { ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); if (ws >= 0 && ws <= MAX_WS) { tc[s].ws_allowed = 1; tc[s].ws = ws; } else { tc[s].ws_allowed = 0; tc[s].ws = 0; } /* First value is not scaled. Also, don't clamp yet, to avoid * getting a zero scale just because we set a small window now. */ tc[s].tap_window = ntohs(th->window); } else { tc[s].tap_window = ntohs(th->window) << tc[s].ws; setsockopt(s, SOL_TCP, TCP_WINDOW_CLAMP, &tc[s].tap_window, sizeof(tc[s].tap_window)); } } /** * tcp_conn_from_tap() - Handle connection request (SYN segment) from tap * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @addr: Remote address, pointer to sin_addr or sin6_addr * @th: TCP header from tap * @len: Packet length at L4 */ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, struct tcphdr *th, size_t len) { struct sockaddr_in addr4 = { .sin_family = AF_INET, .sin_port = th->dest, .sin_addr = *(struct in_addr *)addr, }; struct sockaddr_in6 addr6 = { .sin6_family = AF_INET6, .sin6_port = th->dest, .sin6_addr = *(struct in6_addr *)addr, }; struct epoll_event ev = { 0 }; const struct sockaddr *sa; socklen_t sl; int s; s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); if (s < 0) return; tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); if (tc[s].mss_guest < 0) tc[s].mss_guest = MSS_DEFAULT; sl = sizeof(tc[s].mss_guest); setsockopt(s, SOL_TCP, TCP_MAXSEG, &tc[s].mss_guest, sl); tcp_clamp_window(s, th, len); if (af == AF_INET) { sa = (const struct sockaddr *)&addr4; sl = sizeof(addr4); memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a)); } else { sa = (const struct sockaddr *)&addr6; sl = sizeof(addr6); memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6)); } tc[s].sock_port = th->dest; tc[s].tap_port = th->source; ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; ev.data.fd = s; tc[s].seq_init_from_tap = ntohl(th->seq); tc[s].seq_from_tap = tc[s].seq_init_from_tap + 1; tc[s].seq_ack_to_tap = tc[s].seq_from_tap; /* TODO: RFC 6528 with SipHash, worth it? */ tc[s].seq_ack_from_tap = tc[s].seq_to_tap = 0; if (connect(s, sa, sl)) { if (errno != EINPROGRESS) { tcp_rst(c, s); return; } ev.events |= EPOLLOUT; tc[s].s = TAP_SYN_SENT; } else { if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0)) return; tc[s].s = TAP_SYN_RCVD; } epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); return; } /** * tcp_sock_lookup() - Look up socket given remote address and pair of ports * @af: Address family, AF_INET or AF_INET6 * @tap_port: tap-facing port * @sock_port: Socket-facing port * * Return: file descriptor number for socket, if found, -1 otherwise */ static int tcp_sock_lookup(int af, void *addr, in_port_t tap_port, in_port_t sock_port) { int i; /* TODO: hash table and lookup. This is just a dummy implementation. */ for (i = 0; i < MAX_CONNS; i++) { if (af == AF_INET && IN6_IS_ADDR_V4MAPPED(&tc[i].a.a6) && !memcmp(&tc[i].a.a4.a, addr, sizeof(tc[i].a.a4.a)) && tc[i].tap_port == tap_port && tc[i].sock_port == sock_port && tc[i].s) return i; if (af == AF_INET6 && !memcmp(&tc[i].a.a6, addr, sizeof(tc[i].a.a6)) && tc[i].tap_port == tap_port && tc[i].sock_port == sock_port && tc[i].s) return i; } return -1; } /** * tcp_conn_from_sock() - Handle new connection request from listening socket * @c: Execution context * @fd: File descriptor number for listening socket */ static void tcp_conn_from_sock(struct ctx *c, int fd) { struct sockaddr_storage sa_r, sa_l; socklen_t sa_len = sizeof(sa_r); struct epoll_event ev = { 0 }; struct sockaddr_in6 *sa6; struct sockaddr_in *sa4; int s; if (getsockname(fd, (struct sockaddr *)&sa_l, &sa_len)) return; s = accept4(fd, (struct sockaddr *)&sa_r, &sa_len, SOCK_NONBLOCK); if (s == -1) return; if (sa_l.ss_family == AF_INET) { sa4 = (struct sockaddr_in *)&sa_r; memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); memcpy(&tc[s].a.a4.a, &sa4->sin_addr, sizeof(tc[s].a.a4.a)); tc[s].sock_port = sa4->sin_port; sa4 = (struct sockaddr_in *)&sa_l; tc[s].tap_port = sa4->sin_port; } else if (sa_l.ss_family == AF_INET6) { sa6 = (struct sockaddr_in6 *)&sa_r; memcpy(&tc[s].a.a6, &sa6->sin6_addr, sizeof(tc[s].a.a6)); tc[s].sock_port = sa6->sin6_port; sa6 = (struct sockaddr_in6 *)&sa_l; tc[s].tap_port = sa6->sin6_port; } /* TODO: RFC 6528 with SipHash, worth it? */ tc[s].seq_to_tap = 0; tc[s].ws_allowed = 1; clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock); clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; ev.data.fd = s; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); tc[s].s = SOCK_SYN_SENT; tcp_send_to_tap(c, s, SYN, NULL, 0); } /** * tcp_send_to_sock() - Send buffer to socket, update timestamp and sequence * @c: Execution context * @s: File descriptor number for socket * @seq: Previous TCP sequence, host order * @data: Data buffer * @len: Length at L4 * @extra_flags: Additional flags for send(), if any * * Return: -1 on socket error with connection reset, 0 otherwise */ static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len, int extra_flags) { int err = send(s, data, len, MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags); if (err < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK) { /* If we can't queue right now, do nothing, sender has * to retransmit. */ return 0; } tcp_rst(c, s); return -1; } clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock); tc[s].seq_from_tap = seq + len; return 0; } /** * tcp_check_dupack() - Check if given ACK number is duplicated, update counter * @s: File descriptor number for socket * @ack_seq: ACK sequence, host order * * Return: 1 on two duplicated ACKs observed, with counter reset, 0 otherwise */ static int tcp_check_dupack(int s, uint32_t ack_seq) { if (ack_seq == tc[s].seq_ack_from_tap && ++tc[s].dup_acks == 2) { tc[s].dup_acks = 0; return 1; } return 0; } /** * tcp_sock_consume() - Consume (discard) data from socket buffer * @s: File descriptor number for socket * @ack_seq: ACK sequence, host order * * Return: -1 on invalid sequence, 0 otherwise */ static int tcp_sock_consume(int s, uint32_t ack_seq) { int to_ack; /* Implicitly take care of wrap-arounds */ to_ack = ack_seq - tc[s].seq_ack_from_tap; if (to_ack < 0) return -1; recv(s, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC); tc[s].seq_ack_from_tap = ack_seq; return 0; } /** * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window * @c: Execution context * @s: File descriptor number for socket * * Return: non-zero on socket error or pending data, 0 otherwise */ static int tcp_data_from_sock(struct ctx *c, int s) { int len, offset, left, send; /* Don't dequeue until acknowledged by guest */ len = recv(s, tcp_in_buf, sizeof(tcp_in_buf), MSG_DONTWAIT | MSG_PEEK); if (len < 0) { if (errno != EAGAIN && errno != EWOULDBLOCK) tcp_rst(c, s); return 1; } if (len == 0) { if (tc[s].s >= ESTABLISHED_SOCK_FIN) return 0; tc[s].s = ESTABLISHED_SOCK_FIN; if (tcp_send_to_tap(c, s, FIN | ACK, NULL, 0)) return 0; left = 0; goto out; } offset = tc[s].seq_to_tap - tc[s].seq_ack_from_tap; left = len - offset; while (left && offset + tc[s].mss_guest <= tc[s].tap_window) { if (left < tc[s].mss_guest) send = left; else send = tc[s].mss_guest; if (tcp_send_to_tap(c, s, 0, tcp_in_buf + offset, send)) return 0; offset += send; left -= send; } out: clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); tcp_act_slow_set(s); return !!left; } /** * tcp_tap_handler() - Handle packets from tap and state transitions * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @in: Input buffer * @len: Length, including TCP header */ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) { struct tcphdr *th = (struct tcphdr *)in; size_t off; int s, ws; if (len < sizeof(*th)) return; off = th->doff * 4; if (off < sizeof(*th) || off > len) return; s = tcp_sock_lookup(af, addr, th->source, th->dest); if (s < 0) { if (th->syn) tcp_conn_from_tap(c, af, addr, th, len); return; } if (th->rst) { tcp_close_and_epoll_del(c, s); return; } tcp_clamp_window(s, th, len); if (th->ack) clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); switch (tc[s].s) { case SOCK_SYN_SENT: if (!th->syn || !th->ack) return; tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); if (tc[s].mss_guest < 0) tc[s].mss_guest = MSS_DEFAULT; ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); if (ws > MAX_WS) { if (tcp_send_to_tap(c, s, RST, NULL, 0)) return; tc[s].seq_to_tap = 0; tc[s].ws_allowed = 0; tcp_send_to_tap(c, s, SYN, NULL, 0); return; } tc[s].seq_from_tap = tc[s].seq_init_from_tap = ntohl(th->seq); tc[s].seq_ack_to_tap = tc[s].seq_from_tap; tc[s].s = ESTABLISHED; tcp_send_to_tap(c, s, ACK, NULL, 0); break; case TAP_SYN_SENT: break; case TAP_SYN_RCVD: if (th->fin) { shutdown(s, SHUT_WR); tc[s].s = FIN_WAIT_1; break; } if (!th->ack) { tcp_rst(c, s); return; } tc[s].seq_ack_from_tap = ntohl(th->ack_seq); tc[s].s = ESTABLISHED; break; case ESTABLISHED: if (th->ack) { int retrans = 0; if (len == th->doff) retrans = tcp_check_dupack(s, th->ack_seq); if (tcp_sock_consume(s, ntohl(th->ack_seq))) { tcp_rst(c, s); return; } if (retrans) { tc[s].seq_to_tap = tc[s].seq_ack_from_tap; tcp_data_from_sock(c, s); } } if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off, th->psh ? 0 : MSG_MORE)) break; if (th->fin) { shutdown(s, SHUT_WR); tc[s].s = FIN_WAIT_1; } break; case ESTABLISHED_SOCK_FIN: if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off, th->psh ? 0 : MSG_MORE) < 0) break; if (th->ack) { shutdown(s, SHUT_RD); if (!tcp_data_from_sock(c, s)) tc[s].s = CLOSE_WAIT; if (tcp_sock_consume(s, ntohl(th->ack_seq))) { tcp_rst(c, s); return; } } break; case CLOSE_WAIT: if (tcp_sock_consume(s, ntohl(th->ack_seq))) { tcp_rst(c, s); return; } if (th->fin) { shutdown(s, SHUT_WR); tc[s].s = LAST_ACK; } break; case FIN_WAIT_1: case LAST_ACK: case CLOSED: /* ;) */ break; } if (tc[s].seq_to_tap > tc[s].seq_ack_from_tap) tcp_act_slow_set(s); else tcp_act_slow_clear(s); if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap) tcp_act_fast_set(s); else tcp_act_fast_clear(s); } /** * tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event * @c: Execution context * @s: File descriptor number for socket */ static void tcp_connect_finish(struct ctx *c, int s) { struct epoll_event ev = { 0 }; socklen_t sl; int so; sl = sizeof(so); if (getsockopt(s, SOL_SOCKET, SO_ERROR, &so, &sl) || so) { tcp_rst(c, s); return; } if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0) < 0) return; ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; ev.data.fd = s; epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev); tc[s].s = TAP_SYN_RCVD; } /** * tcp_sock_handler() - Handle new data from socket * @c: Execution context * @s: File descriptor number for socket * @events: epoll events bitmap */ void tcp_sock_handler(struct ctx *c, int s, uint32_t events) { socklen_t sl; int so; if (tc[s].s == LAST_ACK) { tcp_close_and_epoll_del(c, s); return; } sl = sizeof(so); if ((events & EPOLLERR) || getsockopt(s, SOL_SOCKET, SO_ACCEPTCONN, &so, &sl)) { if (tc[s].s != CLOSED) tcp_rst(c, s); return; } if (so) { tcp_conn_from_sock(c, s); return; } if (events & EPOLLOUT) { /* Implies TAP_SYN_SENT */ tcp_connect_finish(c, s); return; } if (tc[s].s == ESTABLISHED) tcp_data_from_sock(c, s); if (events & EPOLLRDHUP || events & EPOLLHUP) { if (tc[s].s == ESTABLISHED) tc[s].s = ESTABLISHED_SOCK_FIN; tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); if (tc[s].s == FIN_WAIT_1) { shutdown(s, SHUT_RD); if (tcp_sock_consume(s, ntohl(tc[s].seq_ack_from_tap))) { tcp_rst(c, s); return; } tcp_close_and_epoll_del(c, s); tc[s].s = CLOSED; } } } /** * tcp_sock_init() - Create and bind listening sockets for inbound connections * @c: Execution context * * Return: 0 on success, -1 on failure */ int tcp_sock_init(struct ctx *c) { in_port_t port; for (port = 0; port < (1 << 15) + (1 << 14); port++) { if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, htons(port)) < 0) return -1; if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, htons(port)) < 0) return -1; } return 0; } /** * tcp_periodic_fast_one() - Handler for "fast" timeout events on one socket * @c: Execution context * @s: File descriptor number for socket * @ts: Timestamp from caller * * Return: 0 if socket needs to be monitored further, non-zero otherwise */ int tcp_periodic_fast_one(struct ctx *c, int s, struct timespec *ts) { if (timespec_diff_ms(ts, &tc[s].last_ts_sock) < SOCK_ACK_INTERVAL) return 0; tc[s].last_ts_sock = *ts; tcp_send_to_tap(c, s, 0, NULL, 0); return tc[s].seq_from_tap == tc[s].seq_ack_to_tap; } /** * tcp_periodic_fast() - Handle sockets in "fast" event bitmap, clear as needed * @c: Execution context */ void tcp_periodic_fast(struct ctx *c) { long *word = (long *)tcp_act_fast, tmp; struct timespec now; unsigned int i; int n, s; clock_gettime(CLOCK_MONOTONIC, &now); for (i = 0; i < sizeof(tcp_act_fast) / sizeof(long); i++, word++) { tmp = *word; while ((n = ffsl(tmp))) { tmp &= ~(1UL << (n - 1)); s = i * sizeof(long) * 8 + n - 1; if (tcp_periodic_fast_one(c, s, &now)) *word &= ~(1UL << (n - 1)); } } } /** * tcp_periodic_fast_one() - Handler for "slow" timeout events on one socket * @c: Execution context * @s: File descriptor number for socket * @ts: Timestamp from caller */ void tcp_periodic_slow_one(struct ctx *c, int s, struct timespec *ts) { switch (tc[s].s) { case SOCK_SYN_SENT: case TAP_SYN_SENT: case TAP_SYN_RCVD: if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > SYN_TIMEOUT) tcp_rst(c, s); break; case ESTABLISHED_SOCK_FIN: if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) { tcp_rst(c, s); break; } /* Falls through */ case ESTABLISHED: if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap && timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACK_TIMEOUT) { tc[s].seq_to_tap = tc[s].seq_ack_from_tap; tcp_data_from_sock(c, s); } if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACT_TIMEOUT && timespec_diff_ms(ts, &tc[s].last_ts_sock) > ACT_TIMEOUT) tcp_rst(c, s); break; case CLOSE_WAIT: case FIN_WAIT_1: if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) tcp_rst(c, s); break; case LAST_ACK: if (timespec_diff_ms(ts, &tc[s].last_ts_sock) > LAST_ACK_TIMEOUT) tcp_rst(c, s); break; case CLOSED: break; } } /** * tcp_periodic_slow() - Handle sockets in "slow" event bitmap * @c: Execution context */ void tcp_periodic_slow(struct ctx *c) { long *word = (long *)tcp_act_slow, tmp; struct timespec now; unsigned int i; int n; clock_gettime(CLOCK_MONOTONIC, &now); for (i = 0; i < sizeof(tcp_act_slow) / sizeof(long); i++, word++) { tmp = *word; while ((n = ffsl(tmp))) { tmp &= ~(1UL << (n - 1)); tcp_periodic_slow_one(c, i * sizeof(long) * 8 + n - 1, &now); } } }