aboutgitcodebugslistschat
diff options
context:
space:
mode:
-rw-r--r--Makefile11
-rw-r--r--arp.c9
-rw-r--r--dhcp.c9
-rw-r--r--dhcpv6.c7
-rw-r--r--icmp.c185
-rw-r--r--icmp.h24
-rw-r--r--ndp.c8
-rw-r--r--passt.c547
-rw-r--r--passt.h68
-rw-r--r--pcap.c34
-rw-r--r--pcap.h2
-rw-r--r--siphash.c4
-rw-r--r--tap.c590
-rw-r--r--tap.h4
-rw-r--r--tcp.c1543
-rw-r--r--tcp.h45
-rw-r--r--udp.c814
-rw-r--r--udp.h38
-rw-r--r--util.c162
-rw-r--r--util.h25
20 files changed, 2815 insertions, 1314 deletions
diff --git a/Makefile b/Makefile
index fb4494a..2f48c35 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,20 @@
CFLAGS += -Wall -Wextra -pedantic
+CFLAGS += -DRLIMIT_STACK_VAL=$(shell ulimit -s)
-all: passt qrap
+all: passt pasta passt4netns qrap
passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h dhcpv6.c dhcpv6.h pcap.c pcap.h ndp.c ndp.h siphash.c siphash.h tap.c tap.h icmp.c icmp.h tcp.c tcp.h udp.c udp.h util.c util.h
$(CC) $(CFLAGS) passt.c arp.c dhcp.c dhcpv6.c pcap.c ndp.c siphash.c tap.c icmp.c tcp.c udp.c util.c -o passt
+pasta: passt
+ ln -s passt pasta
+
+passt4netns: passt
+ ln -s passt passt4netns
+
qrap: qrap.c passt.h
$(CC) $(CFLAGS) -DARCH=\"$(shell uname -m)\" qrap.c -o qrap
.PHONY: clean
clean:
- -${RM} passt *.o qrap
+ -${RM} passt *.o qrap pasta passt4netns
diff --git a/arp.c b/arp.c
index 20f08b2..547057c 100644
--- a/arp.c
+++ b/arp.c
@@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* arp.c - ARP implementation
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
#include <stdio.h>
@@ -22,9 +25,9 @@
#include <net/if_arp.h>
#include <arpa/inet.h>
+#include "util.h"
#include "passt.h"
#include "dhcp.h"
-#include "util.h"
#include "tap.h"
#include "arp.h"
@@ -66,7 +69,7 @@ int arp(struct ctx *c, struct ethhdr *eh, size_t len)
memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
memcpy(eh->h_source, c->mac, ETH_ALEN);
- if (tap_send(c->fd_unix, eh, len, 0) < 0)
+ if (tap_send(c, eh, len, 0) < 0)
perror("ARP: send");
return 1;
diff --git a/dhcp.c b/dhcp.c
index 6448e51..337463a 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* dhcp.c - Minimalistic DHCP server for PASST
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
#include <stdio.h>
@@ -21,9 +24,9 @@
#include <net/if.h>
#include <arpa/inet.h>
+#include "util.h"
#include "passt.h"
#include "dhcp.h"
-#include "util.h"
#include "tap.h"
/**
@@ -322,7 +325,7 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
memcpy(eh->h_source, c->mac, ETH_ALEN);
- if (tap_send(c->fd_unix, eh, len, 0) < 0)
+ if (tap_send(c, eh, len, 0) < 0)
perror("DHCP: send");
return 1;
diff --git a/dhcpv6.c b/dhcpv6.c
index 4ce7a87..be6d9b1 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* dhcpv6.c - Minimalistic DHCPv6 server for PASST
*
* Copyright (c) 2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
#include <stdio.h>
@@ -23,9 +26,9 @@
#include <net/if.h>
#include <net/if_arp.h>
+#include "util.h"
#include "passt.h"
#include "tap.h"
-#include "util.h"
/**
* struct opt_hdr - DHCPv6 option header
diff --git a/icmp.c b/icmp.c
index 378e787..8f2fdb2 100644
--- a/icmp.c
+++ b/icmp.c
@@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* icmp.c - ICMP/ICMPv6 echo proxy
*
* Copyright (c) 2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
#include <stdio.h>
@@ -28,57 +31,91 @@
#include <linux/icmpv6.h>
#include <time.h>
+#include "util.h"
#include "passt.h"
#include "tap.h"
-#include "util.h"
#include "icmp.h"
+#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */
+
+/**
+ * struct icmp_id - Tracking information for single ICMP echo identifier
+ * @sock: Bound socket for identifier, 0 if none (reset to 0 on timeout)
+ * @ts: Last associated activity from tap, seconds
+ * @seq: Last sequence number sent to tap, host order; only tracked in
+ *       PASTA mode, where it's used to discard repeated sequence numbers
+ */
+struct icmp_id {
+ int sock;
+ time_t ts;
+ uint16_t seq;
+};
+
/* Indexed by ICMP echo identifier */
-static int icmp_s_v4[USHRT_MAX];
-static int icmp_s_v6[USHRT_MAX];
+/* Echo identifiers span 0..USHRT_MAX inclusive: USHRT_MAX + 1 entries needed */
+static struct icmp_id icmp_id_map [IP_VERSIONS][USHRT_MAX + 1];
+
+/* Bitmaps, activity monitoring needed for identifier (one bit for each) */
+static uint8_t icmp_act [IP_VERSIONS][(USHRT_MAX + 1) / 8];
/**
* icmp_sock_handler() - Handle new data from socket
* @c: Execution context
- * @s: File descriptor number for socket
+ * @ref: epoll reference
* @events: epoll events bitmap
- * @pkt_buf: Buffer to receive packets, currently unused
* @now: Current timestamp, unused
*/
-void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
+void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now)
{
struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0xff, 0xff,
0, 0, 0, 0 } };
- struct sockaddr_storage sr, sl;
- socklen_t slen = sizeof(sr);
+ struct sockaddr_storage sr;
+ socklen_t sl = sizeof(sr);
char buf[USHRT_MAX];
+ uint16_t seq, id;
ssize_t n;
(void)events;
- (void)pkt_buf;
(void)now;
- n = recvfrom(s, buf, sizeof(buf), MSG_DONTWAIT,
- (struct sockaddr *)&sr, &slen);
+ n = recvfrom(ref.s, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl);
if (n < 0)
return;
- if (getsockname(s, (struct sockaddr *)&sl, &slen))
- return;
+ if (ref.icmp.v6) {
+ struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr;
+ struct icmp6hdr *ih = (struct icmp6hdr *)buf;
- if (sl.ss_family == AF_INET) {
+ /* In PASTA mode, we'll get any reply we send, discard them. */
+ if (c->mode == MODE_PASTA) {
+ seq = ntohs(ih->icmp6_sequence);
+ id = ntohs(ih->icmp6_identifier);
+
+ if (icmp_id_map[V6][id].seq == seq)
+ return;
+
+ icmp_id_map[V6][id].seq = seq;
+ }
+
+ tap_ip_send(c, &sr6->sin6_addr, IPPROTO_ICMPV6, buf, n);
+ } else {
struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr;
+ struct icmphdr *ih = (struct icmphdr *)buf;
+
+ if (c->mode == MODE_PASTA) {
+ seq = ntohs(ih->un.echo.sequence);
+ id = ntohs(ih->un.echo.id);
+
+ if (icmp_id_map[V4][id].seq == seq)
+ return;
+
+ icmp_id_map[V4][id].seq = seq;
+ }
memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr));
tap_ip_send(c, &a6, IPPROTO_ICMP, buf, n);
- } else if (sl.ss_family == AF_INET6) {
- struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr;
-
- tap_ip_send(c, &sr6->sin6_addr, IPPROTO_ICMPV6, buf, n);
}
}
@@ -86,101 +123,131 @@ void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
* icmp_tap_handler() - Handle packets from tap
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
+ * @addr: Destination address
* @msg: Input message
* @count: Message count (always 1 for ICMP)
- * @now: Current timestamp, unused
+ * @now: Current timestamp
*
* Return: count of consumed packets (always 1, even if malformed)
*/
int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now)
{
- int s;
-
(void)count;
- (void)now;
- (void)c;
if (af == AF_INET) {
struct icmphdr *ih = (struct icmphdr *)msg[0].l4h;
+ union icmp_epoll_ref iref = { .v6 = 0 };
struct sockaddr_in sa = {
.sin_family = AF_INET,
.sin_addr = { .s_addr = INADDR_ANY },
.sin_port = ih->un.echo.id,
};
+ int id, s;
if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO)
return 1;
- if ((s = icmp_s_v4[ntohs(ih->un.echo.id)]) < 0)
- return 1;
+ id = ntohs(ih->un.echo.id);
- bind(s, (struct sockaddr *)&sa, sizeof(sa));
+ if ((s = icmp_id_map[V4][id].sock) <= 0) {
+ s = sock_l4(c, AF_INET, IPPROTO_ICMP, id, 0, iref.u32);
+ if (s < 0)
+ goto fail_sock;
+
+ icmp_id_map[V4][id].sock = s;
+ }
+ icmp_id_map[V4][id].ts = now->tv_sec;
+ bitmap_set(icmp_act[V4], id);
sa.sin_addr = *(struct in_addr *)addr;
- sendto(s, msg[0].l4h, msg[0].l4_len,
- MSG_DONTWAIT | MSG_NOSIGNAL,
+ sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
} else if (af == AF_INET6) {
struct icmp6hdr *ih = (struct icmp6hdr *)msg[0].l4h;
+ union icmp_epoll_ref iref = { .v6 = 1 };
struct sockaddr_in6 sa = {
.sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_ANY_INIT,
.sin6_port = ih->icmp6_identifier,
};
+ int id, s;
if (msg[0].l4_len < sizeof(*ih) ||
(ih->icmp6_type != 128 && ih->icmp6_type != 129))
return 1;
- if ((s = icmp_s_v6[ntohs(ih->icmp6_identifier)]) < 0)
- return 1;
+ id = ntohs(ih->icmp6_identifier);
+ if ((s = icmp_id_map[V6][id].sock) <= 0) {
+ s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, id, 0,
+ iref.u32);
+ if (s < 0)
+ goto fail_sock;
- bind(s, (struct sockaddr *)&sa, sizeof(sa));
+ icmp_id_map[V6][id].sock = s;
+ }
+ icmp_id_map[V6][id].ts = now->tv_sec;
+ bitmap_set(icmp_act[V6], id);
sa.sin6_addr = *(struct in6_addr *)addr;
- sendto(s, msg[0].l4h, msg[0].l4_len,
- MSG_DONTWAIT | MSG_NOSIGNAL,
+ sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
}
return 1;
+
+fail_sock:
+ warn("Cannot open \"ping\" socket. You might need to:");
+ warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\"");
+ warn("...echo requests/replies will fail.");
+ return 1;
}
/**
- * icmp_sock_init() - Create ICMP, ICMPv6 sockets for echo requests and replies
+ * icmp_timer_one() - Handler for timed events related to a given identifier
* @c: Execution context
- *
- * Return: 0 on success, -1 on failure
+ * @v6: Set for IPv6 echo identifier bindings
+ * @id: Echo identifier, host order
+ * @ts: Timestamp from caller
*/
-int icmp_sock_init(struct ctx *c)
+static void icmp_timer_one(struct ctx *c, int v6, uint16_t id,
+ struct timespec *ts)
{
- int i, fail = 0;
+ struct icmp_id *id_map = &icmp_id_map[v6 ? V6 : V4][id];
- c->icmp.fd_min = INT_MAX;
- c->icmp.fd_max = 0;
+ if (ts->tv_sec - id_map->ts <= ICMP_ECHO_TIMEOUT)
+ return;
- if (c->v4) {
- for (i = 0; i < USHRT_MAX; i++) {
- icmp_s_v4[i] = sock_l4(c, AF_INET, IPPROTO_ICMP, i);
- if (icmp_s_v4[i] < 0)
- fail = 1;
- }
- }
+ bitmap_clear(icmp_act[v6 ? V6 : V4], id);
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_map->sock, NULL);
+ close(id_map->sock);
+ id_map->sock = 0;
+}
- if (c->v6) {
- for (i = 0; i < USHRT_MAX; i++) {
- icmp_s_v6[i] = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, i);
- if (icmp_s_v6[i] < 0)
- fail = 1;
+/**
+ * icmp_timer() - Scan activity bitmap for identifiers with timed events
+ * @c: Execution context
+ * @ts: Timestamp from caller
+ */
+void icmp_timer(struct ctx *c, struct timespec *ts)
+{
+ long *word, tmp;
+ unsigned int i;
+ int n, v6 = 0;
+
+v6:
+ word = (long *)icmp_act[v6 ? V6 : V4];
+ for (i = 0; i < sizeof(icmp_act[0]) / sizeof(long); i++, word++) {
+ tmp = *word;
+ while ((n = ffsl(tmp))) {
+ tmp &= ~(1UL << (n - 1));
+ icmp_timer_one(c, v6, i * sizeof(long) * 8 + n - 1, ts);
}
}
- if (fail) {
- warn("Cannot open some \"ping\" sockets. You might need to:");
- warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\"");
- warn("...echo requests/replies might fail.");
+ if (!v6) {
+ v6 = 1;
+ goto v6;
}
-
- return 0;
}
diff --git a/icmp.h b/icmp.h
index d04eb8c..12547b7 100644
--- a/icmp.h
+++ b/icmp.h
@@ -1,22 +1,34 @@
#ifndef ICMP_H
#define ICMP_H
+#define ICMP_TIMER_INTERVAL 1000 /* ms */
+
struct ctx;
-void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
+void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now);
-int icmp_sock_init(struct ctx *c);
+void icmp_timer(struct ctx *c, struct timespec *ts);
+
+/**
+ * union icmp_epoll_ref - epoll reference portion for ICMP tracking
+ * @v6: Set for IPv6 sockets or connections, clear for IPv4
+ * @u32: Opaque u32 value of reference, overlaying the bit-field above
+ */
+union icmp_epoll_ref {
+ struct {
+ uint32_t v6:1;
+ };
+ uint32_t u32;
+};
/**
* struct icmp_ctx - Execution context for ICMP routines
- * @fd_min: Lowest file descriptor number for ICMP/ICMPv6 ever used
- * @fd_max: Highest file descriptor number for ICMP/ICMPv6 ever used
+ * @timer_run: Timestamp of most recent timer run
*/
struct icmp_ctx {
- int fd_min;
- int fd_max;
+ struct timespec timer_run;
};
#endif /* ICMP_H */
diff --git a/ndp.c b/ndp.c
index 1d2a2d3..a7360aa 100644
--- a/ndp.c
+++ b/ndp.c
@@ -1,6 +1,10 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* ndp.c - NDP support for PASST
*
@@ -23,8 +27,8 @@
#include <net/if.h>
#include <net/if_arp.h>
-#include "passt.h"
#include "util.h"
+#include "passt.h"
#include "tap.h"
#define RS 133
@@ -175,7 +179,7 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
memcpy(ehr->h_source, c->mac, ETH_ALEN);
ehr->h_proto = htons(ETH_P_IPV6);
- if (tap_send(c->fd_unix, ehr, len, 0) < 0)
+ if (tap_send(c, ehr, len, 0) < 0)
perror("NDP: send");
return 1;
diff --git a/passt.c b/passt.c
index 46eb5f6..ee721df 100644
--- a/passt.c
+++ b/passt.c
@@ -1,18 +1,26 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* passt.c - Daemon implementation
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
- * Grab Ethernet frames via AF_UNIX socket, build SOCK_DGRAM/SOCK_STREAM sockets
- * for each 5-tuple from TCP, UDP packets, perform connection tracking and
- * forward them. Forward packets received on sockets back to the UNIX domain
- * socket (typically, a socket virtio_net file descriptor from qemu).
+ * Grab Ethernet frames from AF_UNIX socket (in "passt" mode) or tap device (in
+ * "pasta" mode), build SOCK_DGRAM/SOCK_STREAM sockets for each 5-tuple from
+ * TCP, UDP packets, perform connection tracking and forward them. Forward
+ * packets received on sockets back to the UNIX domain socket (typically, a
+ * socket virtio_net file descriptor from qemu) or to the tap device (typically,
+ * created in a separate network namespace).
*/
+#define _GNU_SOURCE
+#include <sched.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/socket.h>
@@ -44,92 +52,33 @@
#include <syslog.h>
#include <sys/stat.h>
+#include "util.h"
#include "passt.h"
-#include "arp.h"
-#include "dhcp.h"
-#include "ndp.h"
#include "dhcpv6.h"
-#include "util.h"
#include "icmp.h"
#include "tcp.h"
#include "udp.h"
#include "pcap.h"
+#include "tap.h"
#define EPOLL_EVENTS 10
-#define TAP_BUF_BYTES (ETH_MAX_MTU * 8)
-#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t))
-#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1)
-
-#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, SOCK_BUF_BYTES)
-static char pkt_buf [PKT_BUF_BYTES];
+#define __TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL)
+#define TIMER_INTERVAL MIN(__TIMER_INTERVAL, ICMP_TIMER_INTERVAL)
-#define TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL)
+char pkt_buf [PKT_BUF_BYTES];
#ifdef DEBUG
-static char *ip_proto_str[IPPROTO_SCTP + 1] = {
+char *ip_proto_str[IPPROTO_SCTP + 1] = {
[IPPROTO_ICMP] = "ICMP",
[IPPROTO_TCP] = "TCP",
[IPPROTO_UDP] = "UDP",
[IPPROTO_ICMPV6] = "ICMPV6",
[IPPROTO_SCTP] = "SCTP",
};
-
-#define IP_PROTO_STR(n) \
- (((n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ? ip_proto_str[(n)] : "?")
-
#endif
/**
- * sock_unix() - Create and bind AF_UNIX socket, add to epoll list
- * @index: Index used in socket path, filled on success
- *
- * Return: newly created socket, doesn't return on error
- */
-static int sock_unix(int *index)
-{
- int fd = socket(AF_UNIX, SOCK_STREAM, 0), ex;
- struct sockaddr_un addr = {
- .sun_family = AF_UNIX,
- };
- int i, ret;
-
- if (fd < 0) {
- perror("UNIX socket");
- exit(EXIT_FAILURE);
- }
-
- for (i = 1; i < UNIX_SOCK_MAX; i++) {
- snprintf(addr.sun_path, UNIX_PATH_MAX, UNIX_SOCK_PATH, i);
-
- ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
- ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
- if (!ret || (errno != ENOENT && errno != ECONNREFUSED)) {
- close(ex);
- continue;
- }
- close(ex);
-
- unlink(addr.sun_path);
- if (!bind(fd, (const struct sockaddr *)&addr, sizeof(addr)))
- break;
- }
-
- if (i == UNIX_SOCK_MAX) {
- perror("UNIX socket bind");
- exit(EXIT_FAILURE);
- }
-
- info("UNIX domain socket bound at %s\n", addr.sun_path);
- chmod(addr.sun_path,
- S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
-
- *index = i;
-
- return fd;
-}
-
-/**
* struct nl_request - Netlink request filled and sent by get_routes()
* @nlh: Netlink message header
* @rtm: Routing Netlink message
@@ -365,362 +314,76 @@ static void get_dns(struct ctx *c)
}
/**
- * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
- * @c: Execution context
- * @msg: Array of messages with the same L3 protocol
- * @count: Count of messages with the same L3 protocol
- * @now: Current timestamp
+ * get_bound_ports_ns() - Get TCP and UDP ports bound in namespace
+ * @arg: Execution context
*
- * Return: count of packets consumed by handlers
+ * Return: 0
*/
-static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count,
- struct timespec *now)
+static int get_bound_ports_ns(void *arg)
{
- char buf_s[INET_ADDRSTRLEN] __attribute((__unused__));
- char buf_d[INET_ADDRSTRLEN] __attribute((__unused__));
- struct ethhdr *eh = (struct ethhdr *)msg[0].start;
- struct iphdr *iph, *prev_iph = NULL;
- struct udphdr *uh, *prev_uh = NULL;
- size_t len = msg[0].len;
- unsigned int i;
- char *l4h;
+ struct ctx *c = (struct ctx *)arg;
- if (!c->v4)
- return count;
+ ns_enter(c->pasta_pid);
- if (len < sizeof(*eh) + sizeof(*iph))
- return 1;
-
- if (arp(c, eh, len) || dhcp(c, eh, len))
- return 1;
-
- for (i = 0; i < count; i++) {
- len = msg[i].len;
- if (len < sizeof(*eh) + sizeof(*iph))
- return 1;
-
- eh = (struct ethhdr *)msg[i].start;
- iph = (struct iphdr *)(eh + 1);
- l4h = (char *)iph + iph->ihl * 4;
-
- c->addr4_seen = iph->saddr;
-
- msg[i].l4h = l4h;
- msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
-
- if (iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
- break;
-
- if (len < sizeof(*uh))
- break;
-
- uh = (struct udphdr *)l4h;
-
- if (!i) {
- prev_iph = iph;
- prev_uh = uh;
- continue;
- }
-
- if (iph->tos != prev_iph->tos ||
- iph->frag_off != prev_iph->frag_off ||
- iph->protocol != prev_iph->protocol ||
- iph->saddr != prev_iph->saddr ||
- iph->daddr != prev_iph->daddr ||
- uh->source != prev_uh->source ||
- uh->dest != prev_uh->dest)
- break;
-
- prev_iph = iph;
- prev_uh = uh;
+ if (c->v4) {
+ procfs_scan_listen("tcp", c->tcp.port_to_ns);
+ procfs_scan_listen("udp", c->udp.port_to_ns);
}
- eh = (struct ethhdr *)msg[0].start;
- iph = (struct iphdr *)(eh + 1);
-
- if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP ||
- iph->protocol == IPPROTO_SCTP) {
- uh = (struct udphdr *)msg[0].l4h;
-
- if (msg[0].len < sizeof(*uh))
- return 1;
-
- debug("%s (%i) from tap: %s:%i -> %s:%i (%i packet%s)",
- IP_PROTO_STR(iph->protocol), iph->protocol,
- inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
- ntohs(uh->source),
- inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)),
- ntohs(uh->dest),
- i, i > 1 ? "s" : "");
- } else if (iph->protocol == IPPROTO_ICMP) {
- debug("icmp from tap: %s -> %s",
- inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)));
+ if (c->v6) {
+ procfs_scan_listen("tcp6", c->tcp.port_to_ns);
+ procfs_scan_listen("udp6", c->udp.port_to_ns);
}
- if (iph->protocol == IPPROTO_TCP)
- return tcp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
-
- if (iph->protocol == IPPROTO_UDP)
- return udp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
-
- if (iph->protocol == IPPROTO_ICMP)
- icmp_tap_handler(c, AF_INET, &iph->daddr, msg, 1, now);
-
- return 1;
+ return 0;
}
/**
- * tap6_handler() - IPv6 packet handler for tap file descriptor
+ * get_bound_ports() - Get maps of ports that should have bound sockets
* @c: Execution context
- * @msg: Array of messages with the same L3 protocol
- * @count: Count of messages with the same L3 protocol
- * @now: Current timestamp
*/
-static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
- struct timespec *now)
+static void get_bound_ports(struct ctx *c)
{
- char buf_s[INET6_ADDRSTRLEN] __attribute((__unused__));
- char buf_d[INET6_ADDRSTRLEN] __attribute((__unused__));
- struct ethhdr *eh = (struct ethhdr *)msg[0].start;
- struct udphdr *uh, *prev_uh = NULL;
- uint8_t proto = 0, prev_proto = 0;
- size_t len = msg[0].len;
- struct ipv6hdr *ip6h;
- unsigned int i;
- char *l4h;
-
- if (!c->v6)
- return count;
-
- if (len < sizeof(*eh) + sizeof(*ip6h))
- return 1;
-
- if (ndp(c, eh, len) || dhcpv6(c, eh, len))
- return 1;
-
- for (i = 0; i < count; i++) {
- struct ipv6hdr *p_ip6h;
-
- len = msg[i].len;
- if (len < sizeof(*eh) + sizeof(*ip6h))
- return 1;
-
- eh = (struct ethhdr *)msg[i].start;
- ip6h = (struct ipv6hdr *)(eh + 1);
- l4h = ipv6_l4hdr(ip6h, &proto);
-
- msg[i].l4h = l4h;
- msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
-
- if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
- c->addr6_ll_seen = ip6h->saddr;
- else
- c->addr6_seen = ip6h->saddr;
-
- ip6h->saddr = c->addr6;
+ char ns_fn_stack[NS_FN_STACK_SIZE];
- if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
- break;
-
- if (len < sizeof(*uh))
- break;
-
- uh = (struct udphdr *)l4h;
-
- if (!i) {
- p_ip6h = ip6h;
- prev_proto = proto;
- prev_uh = uh;
- continue;
- }
-
- if (proto != prev_proto ||
- memcmp(&ip6h->saddr, &p_ip6h->saddr, sizeof(ip6h->saddr)) ||
- memcmp(&ip6h->daddr, &p_ip6h->daddr, sizeof(ip6h->daddr)) ||
- uh->source != prev_uh->source ||
- uh->dest != prev_uh->dest)
- break;
-
- p_ip6h = ip6h;
- prev_proto = proto;
- prev_uh = uh;
- }
-
- if (prev_proto)
- proto = prev_proto;
-
- eh = (struct ethhdr *)msg[0].start;
- ip6h = (struct ipv6hdr *)(eh + 1);
-
- if (proto == IPPROTO_ICMPV6) {
- debug("icmpv6 from tap: %s ->\n\t%s",
- inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)));
- } else if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
- proto == IPPROTO_SCTP) {
- uh = (struct udphdr *)msg[0].l4h;
-
- if (msg[0].len < sizeof(*uh))
- return 1;
-
- debug("%s (%i) from tap: [%s]:%i\n\t-> [%s]:%i (%i packet%s)",
- IP_PROTO_STR(proto), proto,
- inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
- ntohs(uh->source),
- inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)),
- ntohs(uh->dest),
- i, i > 1 ? "s" : "");
+ if (c->mode == MODE_PASST) {
+ memset(c->tcp.port_to_tap, 0xff, PORT_EPHEMERAL_MIN / 8);
+ memset(c->udp.port_to_tap, 0xff, PORT_EPHEMERAL_MIN / 8);
+ return;
}
- if (proto == IPPROTO_TCP)
- return tcp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
-
- if (proto == IPPROTO_UDP)
- return udp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
-
- if (proto == IPPROTO_ICMPV6)
- icmp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, 1, now);
-
- return 1;
-}
-
-/**
- * tap_handler() - Packet handler for tap file descriptor
- * @c: Execution context
- * @now: Current timestamp
- *
- * Return: -ECONNRESET if tap connection was lost, 0 otherwise
- */
-static int tap_handler(struct ctx *c, struct timespec *now)
-{
- struct tap_msg msg[TAP_MSGS];
- int msg_count, same, i;
- struct ethhdr *eh;
- char *p = pkt_buf;
- ssize_t n, rem;
-
- while ((n = recv(c->fd_unix, p, TAP_BUF_FILL, MSG_DONTWAIT)) > 0) {
- msg_count = 0;
-
- while (n > (ssize_t)sizeof(uint32_t)) {
- ssize_t len = ntohl(*(uint32_t *)p);
-
- p += sizeof(uint32_t);
- n -= sizeof(uint32_t);
-
- if (len < (ssize_t)sizeof(*eh))
- return 0;
-
- /* At most one packet might not fit in a single read */
- if (len > n) {
- rem = recv(c->fd_unix, p + n, len - n,
- MSG_DONTWAIT);
- if ((n += rem) != len)
- return 0;
- }
-
- pcap(p, len);
-
- msg[msg_count].start = p;
- msg[msg_count++].len = len;
-
- n -= len;
- p += len;
- }
-
- i = 0;
- while (i < msg_count) {
- eh = (struct ethhdr *)msg[i].start;
-
- memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
-
- switch (ntohs(eh->h_proto)) {
- case ETH_P_ARP:
- tap4_handler(c, msg + i, 1, now);
- i++;
- break;
- case ETH_P_IP:
- for (same = 1; i + same < msg_count &&
- same < UIO_MAXIOV; same++) {
- struct tap_msg *next = &msg[i + same];
-
- eh = (struct ethhdr *)next->start;
- if (ntohs(eh->h_proto) != ETH_P_IP)
- break;
- }
-
- i += tap4_handler(c, msg + i, same, now);
- break;
- case ETH_P_IPV6:
- for (same = 1; i + same < msg_count &&
- same < UIO_MAXIOV; same++) {
- struct tap_msg *next = &msg[i + same];
-
- eh = (struct ethhdr *)next->start;
- if (ntohs(eh->h_proto) != ETH_P_IPV6)
- break;
- }
-
- i += tap6_handler(c, msg + i, same, now);
- break;
- default:
- i++;
- break;
- }
- }
+ clone(get_bound_ports_ns, ns_fn_stack + sizeof(ns_fn_stack) / 2,
+ CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, (void *)c);
- p = pkt_buf;
+ if (c->v4) {
+ procfs_scan_listen("tcp", c->tcp.port_to_init);
+ procfs_scan_listen("udp", c->udp.port_to_init);
}
- if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
- return 0;
-
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL);
- close(c->fd_unix);
-
- return -ECONNRESET;
+ if (c->v6) {
+ procfs_scan_listen("tcp6", c->tcp.port_to_init);
+ procfs_scan_listen("udp6", c->udp.port_to_init);
+ }
}
/**
* sock_handler() - Event handler for L4 sockets
* @c: Execution context
- * @s: Socket associated to event
+ * @ref: epoll reference
* @events: epoll events
* @now: Current timestamp
*/
-static void sock_handler(struct ctx *c, int s, uint32_t events,
+static void sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now)
{
- socklen_t sl;
- int proto;
-
- sl = sizeof(proto);
-
- if ( FD_PROTO(s, udp) && !FD_PROTO(s, icmp) && !FD_PROTO(s, tcp))
- proto = IPPROTO_UDP;
- else if (FD_PROTO(s, tcp) && !FD_PROTO(s, icmp) && !FD_PROTO(s, udp))
- proto = IPPROTO_TCP;
- else if (FD_PROTO(s, icmp) && !FD_PROTO(s, udp) && !FD_PROTO(s, tcp))
- proto = IPPROTO_ICMP; /* Fits ICMPv6 below, too */
- else if (getsockopt(s, SOL_SOCKET, SO_PROTOCOL, &proto, &sl))
- proto = -1;
-
- if (proto == -1) {
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
- close(s);
- return;
- }
-
- debug("%s (%i): packet from socket %i", IP_PROTO_STR(proto), proto, s);
-
- if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
- icmp_sock_handler(c, s, events, pkt_buf, now);
- else if (proto == IPPROTO_TCP)
- tcp_sock_handler( c, s, events, pkt_buf, now);
- else if (proto == IPPROTO_UDP)
- udp_sock_handler( c, s, events, pkt_buf, now);
+ debug("%s packet from socket %i", IP_PROTO_STR(ref.proto), ref.s);
+
+ if (ref.proto == IPPROTO_TCP)
+ tcp_sock_handler( c, ref, events, now);
+ else if (ref.proto == IPPROTO_UDP)
+ udp_sock_handler( c, ref, events, now);
+ else if (ref.proto == IPPROTO_ICMP || ref.proto == IPPROTO_ICMPV6)
+ icmp_sock_handler(c, ref, events, now);
}
/**
@@ -739,13 +402,18 @@ static void timer_handler(struct ctx *c, struct timespec *now)
udp_timer(c, now);
c->udp.timer_run = *now;
}
+
+ if (timespec_diff_ms(now, &c->icmp.timer_run) >= ICMP_TIMER_INTERVAL) {
+ icmp_timer(c, now);
+ c->icmp.timer_run = *now;
+ }
}
/**
- * usage() - Print usage and exit
+ * usage_passt() - Print usage for "passt" mode and exit
* @name: Executable name
*/
-void usage(const char *name)
+void usage_passt(const char *name)
{
fprintf(stderr, "Usage: %s\n", name);
@@ -753,25 +421,51 @@ void usage(const char *name)
}
/**
+ * usage_pasta() - Print usage for "pasta" mode to stderr and exit
+ * @name: Executable name
+ *
+ * Doesn't return: terminates the process with EXIT_FAILURE.
+ */
+void usage_pasta(const char *name)
+{
+ fprintf(stderr, "Usage: %s TARGET_PID\n", name);
+
+ exit(EXIT_FAILURE);
+}
+
+/**
* main() - Entry point and main loop
* @argc: Argument count
- * @argv: Interface names
+ * @argv: Target PID for pasta mode
*
* Return: 0 once interrupted, non-zero on failure
*/
int main(int argc, char **argv)
{
+ char buf6[INET6_ADDRSTRLEN], buf4[INET_ADDRSTRLEN], *log_name;
struct epoll_event events[EPOLL_EVENTS];
- int nfds, i, fd_unix, sock_index;
- char buf6[INET6_ADDRSTRLEN];
- char buf4[INET_ADDRSTRLEN];
- struct epoll_event ev = { 0 };
struct ctx c = { 0 };
struct rlimit limit;
struct timespec now;
+ int nfds, i;
+
+ if (strstr(argv[0], "pasta") || strstr(argv[0], "passt4netns")) {
+ if (argc != 2)
+ usage_pasta(argv[0]);
+
+ errno = 0;
+ c.pasta_pid = strtol(argv[1], NULL, 0);
+ if (c.pasta_pid < 0 || errno)
+ usage_pasta(argv[0]);
- if (argc != 1)
- usage(argv[0]);
+ c.mode = MODE_PASTA;
+ log_name = "pasta";
+ } else {
+ if (argc != 1)
+ usage_passt(argv[0]);
+
+ c.mode = MODE_PASST;
+ log_name = "passt";
+ memset(&c.mac_guest, 0xff, sizeof(c.mac_guest));
+ }
if (clock_gettime(CLOCK_MONOTONIC, &now)) {
perror("clock_gettime");
@@ -795,27 +489,22 @@ int main(int argc, char **argv)
}
#if DEBUG
- openlog("passt", 0, LOG_DAEMON);
+ openlog(log_name, 0, LOG_DAEMON);
#else
- openlog("passt", isatty(fileno(stdout)) ? 0 : LOG_PERROR, LOG_DAEMON);
+ openlog(log_name, isatty(fileno(stdout)) ? 0 : LOG_PERROR, LOG_DAEMON);
#endif
get_routes(&c);
get_addrs(&c);
get_dns(&c);
+ get_bound_ports(&c);
- fd_unix = sock_unix(&sock_index);
-
- if (icmp_sock_init(&c) || udp_sock_init(&c) || tcp_sock_init(&c))
+ if (udp_sock_init(&c) || tcp_sock_init(&c))
exit(EXIT_FAILURE);
if (c.v6)
dhcpv6_init(&c);
- memset(&c.mac_guest, 0xff, sizeof(c.mac_guest));
-
- pcap_init(sock_index);
-
if (c.v4) {
info("ARP:");
info(" address: %02x:%02x:%02x:%02x:%02x:%02x from %s",
@@ -859,15 +548,7 @@ int main(int argc, char **argv)
}
}
-listen:
- listen(fd_unix, 0);
- info("You can now start qrap:");
- info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
- info("or directly qemu, patched with:");
- info(" qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch");
- info("as follows:");
- info(" kvm ... -net socket,connect=" UNIX_SOCK_PATH
- " -net nic,model=virtio", sock_index);
+ tap_sock_init(&c);
#ifndef DEBUG
if (isatty(fileno(stdout)) && daemon(0, 0)) {
@@ -876,12 +557,6 @@ listen:
}
#endif
- c.fd_unix = accept(fd_unix, NULL, NULL);
-
- ev.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
- ev.data.fd = c.fd_unix;
- epoll_ctl(c.epollfd, EPOLL_CTL_ADD, c.fd_unix, &ev);
-
loop:
nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
if (nfds == -1 && errno != EINTR) {
@@ -892,18 +567,12 @@ loop:
clock_gettime(CLOCK_MONOTONIC, &now);
for (i = 0; i < nfds; i++) {
- if (events[i].data.fd == c.fd_unix) {
- if (events[i].events & EPOLLRDHUP ||
- events[i].events & EPOLLHUP ||
- events[i].events & EPOLLERR ||
- tap_handler(&c, &now)) {
- close(c.fd_unix);
- goto listen;
- }
- } else {
- sock_handler(&c, events[i].data.fd, events[i].events,
- &now);
- }
+ union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
+
+ if (events[i].data.fd == c.fd_tap)
+ tap_handler(&c, events[i].events, &now);
+ else
+ sock_handler(&c, ref, events[i].events, &now);
}
timer_handler(&c, &now);
diff --git a/passt.h b/passt.h
index 28840fc..1a708fd 100644
--- a/passt.h
+++ b/passt.h
@@ -15,27 +15,76 @@ struct tap_msg {
size_t l4_len;
};
-#define SOCK_BUF_BYTES (ETH_MAX_MTU * 4)
+union epoll_ref;
#include "icmp.h"
#include "tcp.h"
#include "udp.h"
+/**
+ * union epoll_ref - Breakdown of reference for epoll socket bookkeeping
+ * @proto: IP protocol number
+ * @s: Socket number (implies 2^24 limit on number of descriptors)
+ * @tcp: TCP-specific reference part
+ * @udp: UDP-specific reference part
+ * @icmp: ICMP-specific reference part
+ * @data: Data handled by protocol handlers
+ * @u64: Opaque reference for epoll_ctl() and epoll_wait()
+ */
+union epoll_ref {
+ struct {
+ uint32_t proto:8,
+ s:24;
+ union {
+ union tcp_epoll_ref tcp;
+ union udp_epoll_ref udp;
+ union icmp_epoll_ref icmp;
+ uint32_t data;
+ };
+ };
+ uint64_t u64;
+};
+
+#define TAP_BUF_BYTES (ETH_MAX_MTU * 3)
+#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t))
+#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1)
+
+#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0)
+extern char pkt_buf [PKT_BUF_BYTES];
+
+#ifdef DEBUG
+extern char *ip_proto_str[];
+#define IP_PROTO_STR(n) \
+ (((n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ? ip_proto_str[(n)] : "?")
+#endif
+
#include <resolv.h> /* For MAXNS below */
+/**
+ * struct fqdn - Representation of fully-qualified domain name
+ * @n: Domain name string
+ */
struct fqdn {
char n[NS_MAXDNAME];
};
#include <net/if.h>
+enum passt_modes {
+ MODE_PASST,
+ MODE_PASTA,
+};
+
/**
* struct ctx - Execution context
- * @epollfd: file descriptor for epoll instance
- * @fd_unix: AF_UNIX socket for tap file descriptor
- * @v4: Enable IPv4 transport
+ * @mode: Operation mode, qemu/UNIX domain socket or namespace/tap
+ * @pasta_pid: Target PID of namespace for pasta mode
+ * @epollfd: File descriptor for epoll instance
+ * @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
+ * @fd_tap: File descriptor for AF_UNIX socket or tuntap device
* @mac: Host MAC address
* @mac_guest: Guest MAC address
+ * @v4: Enable IPv4 transport
* @addr4: IPv4 address for external, routable interface
* @addr4_seen: Latest IPv4 address seen as source from tap
* @mask4: IPv4 netmask, network order
@@ -49,10 +98,17 @@ struct fqdn {
* @gw6: Default IPv6 gateway
* @dns4: IPv4 DNS addresses, zero-terminated
* @ifn: Name of routable interface
+ * @tcp: Context for TCP protocol handler
+ * @udp: Context for UDP protocol handler
+ * @icmp: Context for ICMP protocol handler
*/
struct ctx {
+ enum passt_modes mode;
+ int pasta_pid;
+
int epollfd;
- int fd_unix;
+ int fd_tap_listen;
+ int fd_tap;
unsigned char mac[ETH_ALEN];
unsigned char mac_guest[ETH_ALEN];
@@ -74,7 +130,7 @@ struct ctx {
char ifn[IF_NAMESIZE];
- struct icmp_ctx icmp;
struct tcp_ctx tcp;
struct udp_ctx udp;
+ struct icmp_ctx icmp;
};
diff --git a/pcap.c b/pcap.c
index 8dd647a..c728b8a 100644
--- a/pcap.c
+++ b/pcap.c
@@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
*
- * pcap.c - Packet capture for PASST
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * pcap.c - Packet capture for PASST/PASTA
*
* Copyright (c) 2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
#include <stdio.h>
@@ -22,18 +25,19 @@
#include <unistd.h>
#include <net/if.h>
-#include "passt.h"
#include "util.h"
+#include "passt.h"
#ifdef DEBUG
#define PCAP_PREFIX "/tmp/passt_"
+#define PCAP_PREFIX_PASTA "/tmp/pasta_"
#define PCAP_ISO8601_FORMAT "%FT%H:%M:%SZ"
#define PCAP_ISO8601_STR "YYYY-MM-ddTHH:mm:ssZ"
#define PCAP_VERSION_MINOR 4
-static int pcap_fd = 1;
+static int pcap_fd = -1;
/* See pcap.h from libpcap, or pcap-savefile(5) */
static struct {
@@ -64,6 +68,11 @@ struct pcap_pkthdr {
uint32_t len;
};
+/**
+ * pcap() - Capture a single frame to pcap file
+ * @pkt: Pointer to data buffer, including L2 headers
+ * @len: L2 packet length
+ */
void pcap(char *pkt, size_t len)
{
struct pcap_pkthdr h;
@@ -81,12 +90,23 @@ void pcap(char *pkt, size_t len)
write(pcap_fd, pkt, len);
}
-void pcap_init(int sock_index)
+/**
+ * pcap_init() - Initialise pcap file
+ * @c: Execution context
+ * @index: pcap name index: passt instance number or pasta target pid
+ */
+void pcap_init(struct ctx *c, int index)
{
- char name[] = PCAP_PREFIX PCAP_ISO8601_STR STR(UNIX_SOCK_MAX) ".pcap";
+ char name[] = PCAP_PREFIX PCAP_ISO8601_STR STR(UINT_MAX) ".pcap";
struct timeval tv;
struct tm *tm;
+ if (pcap_fd != -1)
+ close(pcap_fd);
+
+ if (c->mode == MODE_PASTA)
+ memcpy(name, PCAP_PREFIX_PASTA, sizeof(PCAP_PREFIX_PASTA));
+
gettimeofday(&tv, NULL);
tm = localtime(&tv.tv_sec);
strftime(name + strlen(PCAP_PREFIX), sizeof(PCAP_ISO8601_STR) - 1,
@@ -94,7 +114,7 @@ void pcap_init(int sock_index)
snprintf(name + strlen(PCAP_PREFIX) + strlen(PCAP_ISO8601_STR),
sizeof(name) - strlen(PCAP_PREFIX) - strlen(PCAP_ISO8601_STR),
- "_%i.pcap", sock_index);
+ "_%i.pcap", index);
pcap_fd = open(name, O_WRONLY | O_CREAT | O_APPEND | O_DSYNC,
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
diff --git a/pcap.h b/pcap.h
index abca097..c69c3b0 100644
--- a/pcap.h
+++ b/pcap.h
@@ -1,2 +1,2 @@
void pcap(char *pkt, size_t len);
-void pcap_init(int sock_index);
+void pcap_init(struct ctx *c, int sock_index);
diff --git a/siphash.c b/siphash.c
index 910c718..b18f640 100644
--- a/siphash.c
+++ b/siphash.c
@@ -1,6 +1,10 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* siphash.c - SipHash routines
*
diff --git a/tap.c b/tap.c
index 70e4774..583344d 100644
--- a/tap.c
+++ b/tap.c
@@ -1,21 +1,39 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
*
- * tap.c - Functions to communicate with guest-facing tap interface
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * tap.c - Functions to communicate with guest- or namespace-facing interface
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
+#define _GNU_SOURCE
+#include <sched.h>
#include <stdio.h>
+#include <errno.h>
#include <limits.h>
#include <string.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
+#include <arpa/inet.h>
#include <stdint.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <linux/un.h>
+#include <linux/if.h>
+#include <linux/if_tun.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
@@ -23,26 +41,46 @@
#include <linux/icmp.h>
#include <linux/icmpv6.h>
-#include "passt.h"
#include "util.h"
+#include "passt.h"
+#include "arp.h"
+#include "dhcp.h"
+#include "ndp.h"
+#include "dhcpv6.h"
#include "pcap.h"
/**
- * tap_send() - Send frame and qemu socket header with indication of length
- * @fd: tap file descriptor
+ * tap_send() - Send frame, with qemu socket header if needed
+ * @c: Execution context
+ * @data: Packet buffer
* @len: Total L2 packet length
- * @flags: Flags for send(), if any
+ * @vnet_pre: Buffer has four-byte headroom
*
- * Return: return code from send()
+ * Return: return code from send() or write()
*/
-int tap_send(int fd, void *data, size_t len, int flags)
+int tap_send(struct ctx *c, void *data, size_t len, int vnet_pre)
{
- uint32_t vnet_len = htonl(len);
- send(fd, &vnet_len, 4, MSG_DONTWAIT | MSG_NOSIGNAL);
+ if (vnet_pre)
+ pcap((char *)data + 4, len);
+ else
+ pcap(data, len);
+
+ if (c->mode == MODE_PASST) {
+ int flags = MSG_NOSIGNAL | MSG_DONTWAIT;
- pcap(data, len);
+ if (vnet_pre) {
+ *((uint32_t *)data) = htonl(len);
+ len += 4;
+ } else {
+ uint32_t vnet_len = htonl(len);
- return send(fd, data, len, flags | MSG_DONTWAIT | MSG_NOSIGNAL);
+ send(c->fd_tap, &vnet_len, 4, flags);
+ }
+
+ return send(c->fd_tap, data, len, flags);
+ }
+
+ return write(c->fd_tap, (char *)data + (vnet_pre ? 4 : 0), len);
}
/**
@@ -56,7 +94,8 @@ int tap_send(int fd, void *data, size_t len, int flags)
void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
char *in, size_t len)
{
- char pkt[USHRT_MAX];
+ char buf[USHRT_MAX];
+ char *pkt = buf + 4;
struct ethhdr *eh;
eh = (struct ethhdr *)pkt;
@@ -95,7 +134,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
uh->check = 0;
}
- tap_send(c->fd_unix, pkt, len + sizeof(*iph) + sizeof(*eh), 0);
+ tap_send(c, buf, len + sizeof(*iph) + sizeof(*eh), 1);
} else {
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
char *data = (char *)(ip6h + 1);
@@ -137,6 +176,527 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
ip6h->nexthdr = proto;
ip6h->hop_limit = 255;
- tap_send(c->fd_unix, pkt, len + sizeof(*ip6h) + sizeof(*eh), 0);
+ tap_send(c, buf, len + sizeof(*ip6h) + sizeof(*eh), 1);
+ }
+}
+
+/**
+ * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
+ * @c: Execution context
+ * @msg: Array of messages with the same L3 protocol
+ * @count: Count of messages with the same L3 protocol
+ * @now: Current timestamp
+ *
+ * Return: count of packets consumed by handlers
+ */
+static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count,
+ struct timespec *now)
+{
+ char buf_s[INET_ADDRSTRLEN] __attribute((__unused__));
+ char buf_d[INET_ADDRSTRLEN] __attribute((__unused__));
+ struct ethhdr *eh = (struct ethhdr *)msg[0].start;
+ struct iphdr *iph, *prev_iph = NULL;
+ struct udphdr *uh, *prev_uh = NULL;
+ size_t len = msg[0].len;
+ unsigned int i;
+ char *l4h;
+
+ if (!c->v4)
+ return count;
+
+ if (len < sizeof(*eh) + sizeof(*iph))
+ return 1;
+
+ if (arp(c, eh, len) || dhcp(c, eh, len))
+ return 1;
+
+ for (i = 0; i < count; i++) {
+ len = msg[i].len;
+ if (len < sizeof(*eh) + sizeof(*iph))
+ return 1;
+
+ eh = (struct ethhdr *)msg[i].start;
+ iph = (struct iphdr *)(eh + 1);
+ l4h = (char *)iph + iph->ihl * 4;
+
+ c->addr4_seen = iph->saddr;
+
+ msg[i].l4h = l4h;
+ msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
+
+ if (iph->protocol != IPPROTO_TCP &&
+ iph->protocol != IPPROTO_UDP)
+ break;
+
+ if (len < sizeof(*uh))
+ break;
+
+ uh = (struct udphdr *)l4h;
+
+ if (!i) {
+ prev_iph = iph;
+ prev_uh = uh;
+ continue;
+ }
+
+ if (iph->tos != prev_iph->tos ||
+ iph->frag_off != prev_iph->frag_off ||
+ iph->protocol != prev_iph->protocol ||
+ iph->saddr != prev_iph->saddr ||
+ iph->daddr != prev_iph->daddr ||
+ uh->source != prev_uh->source ||
+ uh->dest != prev_uh->dest)
+ break;
+
+ prev_iph = iph;
+ prev_uh = uh;
+ }
+
+ eh = (struct ethhdr *)msg[0].start;
+ iph = (struct iphdr *)(eh + 1);
+
+ if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP ||
+ iph->protocol == IPPROTO_SCTP) {
+ uh = (struct udphdr *)msg[0].l4h;
+
+ if (msg[0].len < sizeof(*uh))
+ return 1;
+
+ debug("%s (%i) from tap: %s:%i -> %s:%i (%i packet%s)",
+ IP_PROTO_STR(iph->protocol), iph->protocol,
+ inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
+ ntohs(uh->source),
+ inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)),
+ ntohs(uh->dest),
+ i, i > 1 ? "s" : "");
+ } else if (iph->protocol == IPPROTO_ICMP) {
+ debug("icmp from tap: %s -> %s",
+ inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
+ inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)));
+ }
+
+ if (iph->protocol == IPPROTO_TCP)
+ return tcp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
+
+ if (iph->protocol == IPPROTO_UDP)
+ return udp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
+
+ if (iph->protocol == IPPROTO_ICMP)
+ icmp_tap_handler(c, AF_INET, &iph->daddr, msg, 1, now);
+
+ return 1;
+}
+
+/**
+ * tap6_handler() - IPv6 packet handler for tap file descriptor
+ * @c: Execution context
+ * @msg: Array of messages with the same L3 protocol
+ * @count: Count of messages with the same L3 protocol
+ * @now: Current timestamp
+ *
+ * Return: count of packets consumed by handlers
+ */
+static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
+ struct timespec *now)
+{
+ char buf_s[INET6_ADDRSTRLEN] __attribute((__unused__));
+ char buf_d[INET6_ADDRSTRLEN] __attribute((__unused__));
+ struct ethhdr *eh = (struct ethhdr *)msg[0].start;
+ struct udphdr *uh, *prev_uh = NULL;
+ uint8_t proto = 0, prev_proto = 0;
+ size_t len = msg[0].len;
+ struct ipv6hdr *ip6h;
+ unsigned int i;
+ char *l4h;
+
+ if (!c->v6)
+ return count;
+
+ if (len < sizeof(*eh) + sizeof(*ip6h))
+ return 1;
+
+ if (ndp(c, eh, len) || dhcpv6(c, eh, len))
+ return 1;
+
+ for (i = 0; i < count; i++) {
+ struct ipv6hdr *p_ip6h;
+
+ len = msg[i].len;
+ if (len < sizeof(*eh) + sizeof(*ip6h))
+ return 1;
+
+ eh = (struct ethhdr *)msg[i].start;
+ ip6h = (struct ipv6hdr *)(eh + 1);
+ l4h = ipv6_l4hdr(ip6h, &proto);
+
+ msg[i].l4h = l4h;
+ msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
+
+ if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
+ c->addr6_ll_seen = ip6h->saddr;
+ else
+ c->addr6_seen = ip6h->saddr;
+
+ ip6h->saddr = c->addr6;
+
+ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
+ break;
+
+ if (len < sizeof(*uh))
+ break;
+
+ uh = (struct udphdr *)l4h;
+
+ if (!i) {
+ p_ip6h = ip6h;
+ prev_proto = proto;
+ prev_uh = uh;
+ continue;
+ }
+
+ if (proto != prev_proto ||
+ memcmp(&ip6h->saddr, &p_ip6h->saddr, sizeof(ip6h->saddr)) ||
+ memcmp(&ip6h->daddr, &p_ip6h->daddr, sizeof(ip6h->daddr)) ||
+ uh->source != prev_uh->source ||
+ uh->dest != prev_uh->dest)
+ break;
+
+ p_ip6h = ip6h;
+ prev_proto = proto;
+ prev_uh = uh;
+ }
+
+ if (prev_proto)
+ proto = prev_proto;
+
+ eh = (struct ethhdr *)msg[0].start;
+ ip6h = (struct ipv6hdr *)(eh + 1);
+
+ if (proto == IPPROTO_ICMPV6) {
+ debug("icmpv6 from tap: %s ->\n\t%s",
+ inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
+ inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)));
+ } else if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
+ proto == IPPROTO_SCTP) {
+ uh = (struct udphdr *)msg[0].l4h;
+
+ if (msg[0].len < sizeof(*uh))
+ return 1;
+
+ debug("%s (%i) from tap: [%s]:%i\n\t-> [%s]:%i (%i packet%s)",
+ IP_PROTO_STR(proto), proto,
+ inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
+ ntohs(uh->source),
+ inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)),
+ ntohs(uh->dest),
+ i, i > 1 ? "s" : "");
+ }
+
+ if (proto == IPPROTO_TCP)
+ return tcp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
+
+ if (proto == IPPROTO_UDP)
+ return udp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
+
+ if (proto == IPPROTO_ICMPV6)
+ icmp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, 1, now);
+
+ return 1;
+}
+
+/**
+ * tap_handler_passt() - Packet handler for AF_UNIX file descriptor
+ * @c: Execution context
+ * @now: Current timestamp
+ *
+ * Return: -ECONNRESET on receive error, 0 otherwise
+ */
+static int tap_handler_passt(struct ctx *c, struct timespec *now)
+{
+ int msg_count = 0, same, i = 0;
+ struct tap_msg msg[TAP_MSGS];
+ struct ethhdr *eh;
+ char *p = pkt_buf;
+ ssize_t n, rem;
+
+ n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT);
+ if (n < 0) {
+ if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
+ return 0;
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
+ close(c->fd_tap);
+
+ return -ECONNRESET;
+ }
+
+ while (n > (ssize_t)sizeof(uint32_t)) {
+ ssize_t len = ntohl(*(uint32_t *)p);
+
+ p += sizeof(uint32_t);
+ n -= sizeof(uint32_t);
+
+ if (len < (ssize_t)sizeof(*eh))
+ return 0;
+
+ /* At most one packet might not fit in a single read */
+ if (len > n) {
+ rem = recv(c->fd_tap, p + n, len - n, MSG_DONTWAIT);
+ if ((n += rem) != len)
+ return 0;
+ }
+
+ pcap(p, len);
+
+ msg[msg_count].start = p;
+ msg[msg_count++].len = len;
+
+ n -= len;
+ p += len;
+ }
+
+ while (i < msg_count) {
+ eh = (struct ethhdr *)msg[i].start;
+
+ memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
+
+ switch (ntohs(eh->h_proto)) {
+ case ETH_P_ARP:
+ tap4_handler(c, msg + i, 1, now);
+ i++;
+ break;
+ case ETH_P_IP:
+ for (same = 1; i + same < msg_count &&
+ same < UIO_MAXIOV; same++) {
+ struct tap_msg *next = &msg[i + same];
+
+ eh = (struct ethhdr *)next->start;
+ if (ntohs(eh->h_proto) != ETH_P_IP)
+ break;
+ }
+
+ i += tap4_handler(c, msg + i, same, now);
+ break;
+ case ETH_P_IPV6:
+ for (same = 1; i + same < msg_count &&
+ same < UIO_MAXIOV; same++) {
+ struct tap_msg *next = &msg[i + same];
+
+ eh = (struct ethhdr *)next->start;
+ if (ntohs(eh->h_proto) != ETH_P_IPV6)
+ break;
+ }
+
+ i += tap6_handler(c, msg + i, same, now);
+ break;
+ default:
+ i++;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * tap_handler_pasta() - Packet handler for tuntap file descriptor
+ * @c: Execution context
+ * @now: Current timestamp
+ *
+ * Return: -ECONNRESET on receive error, 0 otherwise
+ */
+static int tap_handler_pasta(struct ctx *c, struct timespec *now)
+{
+ struct tap_msg msg = { .start = pkt_buf };
+ ssize_t n;
+
+ while ((n = read(c->fd_tap, pkt_buf, TAP_BUF_BYTES)) > 0) {
+ struct ethhdr *eh = (struct ethhdr *)pkt_buf;
+ msg.len = n;
+
+ pcap(msg.start, msg.len);
+
+ memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
+
+ switch (ntohs(eh->h_proto)) {
+ case ETH_P_ARP:
+ tap4_handler(c, &msg, 1, now);
+ break;
+ case ETH_P_IP:
+ tap4_handler(c, &msg, 1, now);
+ break;
+ case ETH_P_IPV6:
+ tap6_handler(c, &msg, 1, now);
+ break;
+ }
+ }
+
+ if (!n || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
+ return 0;
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
+ close(c->fd_tap);
+
+ return -ECONNRESET;
+}
+
+/**
+ * tap_sock_init_unix() - Create and bind AF_UNIX socket, wait for connection
+ * @c: Execution context
+ */
+static void tap_sock_init_unix(struct ctx *c)
+{
+ int fd = socket(AF_UNIX, SOCK_STREAM, 0), ex;
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ int i, ret;
+
+ if (c->fd_tap_listen)
+ close(c->fd_tap_listen);
+
+ if (fd < 0) {
+ perror("UNIX socket");
+ exit(EXIT_FAILURE);
+ }
+ c->fd_tap_listen = fd;
+
+ for (i = 1; i < UNIX_SOCK_MAX; i++) {
+ snprintf(addr.sun_path, UNIX_PATH_MAX, UNIX_SOCK_PATH, i);
+
+ ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
+ ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
+ if (!ret || (errno != ENOENT && errno != ECONNREFUSED)) {
+ close(ex);
+ continue;
+ }
+ close(ex);
+
+ unlink(addr.sun_path);
+ if (!bind(fd, (const struct sockaddr *)&addr, sizeof(addr)))
+ break;
+ }
+
+ if (i == UNIX_SOCK_MAX) {
+ perror("UNIX socket bind");
+ exit(EXIT_FAILURE);
+ }
+
+ info("UNIX domain socket bound at %s\n", addr.sun_path);
+ chmod(addr.sun_path,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+
+ pcap_init(c, i);
+
+ listen(fd, 0);
+
+ info("You can now start qrap:");
+ info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
+ info("or directly qemu, patched with:");
+ info(" qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch");
+ info("as follows:");
+ info(" kvm ... -net socket,connect=" UNIX_SOCK_PATH
+ " -net nic,model=virtio", i);
+
+ c->fd_tap = accept(fd, NULL, NULL);
+}
+
+static int tun_ns_fd = -1;
+
+/**
+ * tap_sock_init_tun_ns() - Create tuntap file descriptor in namespace
+ * @target_pid: Pointer to target PID of namespace to enter
+ */
+static int tap_sock_init_tun_ns(void *target_pid)
+{
+ int fd;
+
+ if (ns_enter(*(int *)target_pid))
+ goto fail;
+
+ if ((fd = open("/dev/net/tun", O_RDWR)) < 0)
+ goto fail;
+
+ fcntl(fd, F_SETFL, O_NONBLOCK);
+
+ tun_ns_fd = fd;
+
+ return 0;
+
+fail:
+ tun_ns_fd = -1;
+ return 0;
+}
+
+/**
+ * tap_sock_init_tun() - Set up tuntap file descriptor
+ * @c: Execution context
+ */
+static void tap_sock_init_tun(struct ctx *c)
+{
+ struct ifreq ifr = { .ifr_name = "pasta0",
+ .ifr_flags = IFF_TAP | IFF_NO_PI,
+ };
+ char ns_fn_stack[NS_FN_STACK_SIZE];
+
+ clone(tap_sock_init_tun_ns, ns_fn_stack + sizeof(ns_fn_stack) / 2,
+ CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
+ (void *)&c->pasta_pid);
+
+ if (tun_ns_fd == -1) {
+ err("Failed to open tun socket in namespace");
+ exit(EXIT_FAILURE);
+ }
+
+ if (ioctl(tun_ns_fd, TUNSETIFF, &ifr)) {
+ perror("TUNSETIFF ioctl");
+ exit(EXIT_FAILURE);
}
+
+ pcap_init(c, c->pasta_pid);
+
+ c->fd_tap = tun_ns_fd;
+}
+
+/**
+ * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor
+ * @c: Execution context
+ */
+void tap_sock_init(struct ctx *c)
+{
+ struct epoll_event ev = { 0 };
+
+ if (c->fd_tap) {
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
+ close(c->fd_tap);
+ }
+
+ if (c->mode == MODE_PASST)
+ tap_sock_init_unix(c);
+ else
+ tap_sock_init_tun(c);
+
+ ev.events = EPOLLIN | EPOLLRDHUP;
+ ev.data.fd = c->fd_tap;
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
+}
+
+/**
+ * tap_handler() - Packet handler for AF_UNIX or tuntap file descriptor
+ * @c: Execution context
+ * @events: epoll events
+ * @now: Current timestamp
+ */
+void tap_handler(struct ctx *c, uint32_t events, struct timespec *now)
+{
+ if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
+ goto fail;
+
+ if ((c->mode == MODE_PASST && tap_handler_passt(c, now)) ||
+ (c->mode == MODE_PASTA && tap_handler_pasta(c, now)))
+ goto fail;
+
+ return;
+fail:
+ tap_sock_init(c);
}
diff --git a/tap.h b/tap.h
index ecea936..385fab0 100644
--- a/tap.h
+++ b/tap.h
@@ -1,3 +1,5 @@
void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
char *in, size_t len);
-int tap_send(int fd, void *data, size_t len, int flags);
+int tap_send(struct ctx *c, void *data, size_t len, int vnet_pre);
+void tap_handler(struct ctx *c, uint32_t events, struct timespec *now);
+void tap_sock_init(struct ctx *c);
diff --git a/tcp.c b/tcp.c
index d650166..dec2df3 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1,20 +1,23 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* tcp.c - TCP L2-L4 translation state machine
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
/**
* DOC: Theory of Operation
*
*
- * Overview
- * --------
+ * PASST mode
+ * ==========
*
* This implementation maps TCP traffic between a single L2 interface (tap) and
* native TCP (L4) sockets, mimicking and reproducing as closely as possible the
@@ -22,7 +25,7 @@
* interface. Four connection flows are supported:
* - from the local host to the guest behind the tap interface:
* - this is the main use case for proxies in service meshes
- * - we bind to all unbound local ports, and relay traffic between L4 sockets
+ * - we bind to configured local ports, and relay traffic between L4 sockets
* with local endpoints and the L2 interface
* - from remote hosts to the guest behind the tap interface:
* - this might be needed for services that need to be addressed directly,
@@ -64,7 +67,7 @@
* ------
*
* To avoid the need for dynamic memory allocation, a maximum, reasonable amount
+ * of connections is defined by MAX_TAP_CONNS below (currently 128k, close to
+ * of connections is defined by MAX_TAP_CONNS below (currently 1M, close to
* the maximum amount of file descriptors typically available to a process on
* Linux).
*
@@ -72,8 +75,8 @@
* segments and retransmissions needs to be, thus data needs to linger on
* sockets as long as it's not acknowledged by the guest, and read using
* MSG_PEEK into a single, preallocated static buffer sized to the maximum
- * supported window, 64MiB. This imposes a practical limitation on window
- * scaling, that is, the maximum factor is 1024. If a bigger window scaling
+ * supported window, 16MiB. This imposes a practical limitation on window
+ * scaling, that is, the maximum factor is 256. If a bigger window scaling
* factor is observed during connection establishment, connection is reset and
* reestablished by omitting the scaling factor in the SYN segment. This
* limitation only applies to the window scaling advertised by the guest, but
@@ -84,9 +87,10 @@
* -----
*
* To avoid the need for ad-hoc configuration of port forwarding or allowed
- * ports, listening sockets are opened and bound to all unbound ports on the
+ * ports, listening sockets can be opened and bound to all unbound ports on the
* host, as far as process capabilities allow. This service needs to be started
- * after any application proxy that needs to bind to local ports.
+ * after any application proxy that needs to bind to local ports. Mapped ports
+ * can also be configured explicitly.
*
* No port translation is needed for connections initiated remotely or by the
* local host: source port from socket is reused while establishing connections
@@ -100,10 +104,14 @@
* Connection tracking and storage
* -------------------------------
*
- * Connection are tracked by the @tc array of struct tcp_conn, containing
+ * Connections are tracked by the @tt array of struct tcp_tap_conn, containing
* addresses, ports, TCP states and parameters. This is statically allocated and
- * indices are the file descriptor numbers associated to inbound or outbound
- * sockets.
+ * indexed by an arbitrary connection number. The array is compacted whenever a
+ * connection is closed, by remapping the highest connection index in use to the
+ * one freed up.
+ *
+ * References used for the epoll interface report the connection index used for
+ * the @tt array.
*
* IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for
* separate data structures depending on the protocol version.
@@ -120,8 +128,8 @@
* --------------
*
* Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for
- * IPv4 and IPv6) are opened and bound to wildcard addresses. Some will fail to
- * bind (for low ports, or ports already bound, e.g. by a proxy). These are
+ * IPv4 and IPv6) can be opened and bound to wildcard addresses. Some will fail
+ * to bind (for low ports, or ports already bound, e.g. by a proxy). These are
* added to the epoll list, with no separate storage.
*
*
@@ -291,9 +299,31 @@
* set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap
* to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
* send ACK to tap
+ *
+ *
+ * PASTA mode
+ * ==========
+ *
+ * For traffic directed to TCP ports configured for mapping to the tuntap device
+ * in the namespace, and for non-local traffic coming from the tuntap device,
+ * the implementation is identical to the PASST mode described in the previous
+ * section.
+ *
+ * For local traffic directed to TCP ports configured for direct mapping between
+ * namespaces, the implementation is substantially simpler: packets are directly
+ * translated between L4 sockets using a pair of splice() syscalls. These
+ * connections are tracked in the @ts array of struct tcp_splice_conn, using
+ * just four states:
+ *
+ * - CLOSED: no connection
+ * - SPLICE_ACCEPTED: accept() on the listening socket succeeded
+ * - SPLICE_CONNECT: connect() issued in the destination namespace
+ * - SPLICE_ESTABLISHED: connect() succeeded, packets are transferred
*/
#define _GNU_SOURCE
+#include <sched.h>
+#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <limits.h>
@@ -313,24 +343,27 @@
#include <linux/tcp.h>
#include <time.h>
+#include "util.h"
#include "passt.h"
#include "tap.h"
-#include "util.h"
#include "siphash.h"
-/* Approximately maximum number of open descriptors per process */
-#define MAX_CONNS (1024 * 1024)
+#define MAX_TAP_CONNS (128 * 1024)
+#define MAX_SPLICE_CONNS (128 * 1024)
+
+#define PIPE_SIZE (1024 * 1024)
#define TCP_HASH_TABLE_LOAD 70 /* % */
-#define TCP_HASH_TABLE_SIZE (MAX_CONNS * 100 / TCP_HASH_TABLE_LOAD)
+#define TCP_HASH_TABLE_SIZE (MAX_TAP_CONNS * 100 / \
+ TCP_HASH_TABLE_LOAD)
-#define MAX_WS 10
+#define MAX_WS 8
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
#define SYN_TIMEOUT 240000 /* ms */
-#define ACK_TIMEOUT 3000
+#define ACK_TIMEOUT 10000
#define ACK_INTERVAL 50
#define ACT_TIMEOUT 7200000
#define FIN_TIMEOUT 240000
@@ -353,19 +386,25 @@ enum tcp_state {
LAST_ACK,
FIN_WAIT_1,
FIN_WAIT_1_SOCK_FIN,
+ SPLICE_ACCEPTED,
+ SPLICE_CONNECT,
+ SPLICE_ESTABLISHED,
};
-#define TCP_STATE_STR_SIZE (FIN_WAIT_1_SOCK_FIN + 1)
+#define TCP_STATE_STR_SIZE (SPLICE_ESTABLISHED + 1)
static char *tcp_state_str[TCP_STATE_STR_SIZE] __attribute((__unused__)) = {
"CLOSED", "TAP_SYN_SENT", "SOCK_SYN_SENT", "TAP_SYN_RCVD",
"ESTABLISHED", "ESTABLISHED_SOCK_FIN", "CLOSE_WAIT", "LAST_ACK",
"FIN_WAIT_1", "FIN_WAIT_1_SOCK_FIN",
+ "SPLICE_ACCEPTED", "SPLICE_CONNECT", "SPLICE_ESTABLISHED",
};
#define FIN (1 << 0)
#define SYN (1 << 1)
#define RST (1 << 2)
#define ACK (1 << 4)
+/* Flags for internal usage */
+#define ZERO_WINDOW (1 << 5)
#define OPT_EOL 0
#define OPT_NOP 1
@@ -377,38 +416,39 @@ static char *tcp_state_str[TCP_STATE_STR_SIZE] __attribute((__unused__)) = {
#define OPT_SACK 5
#define OPT_TS 8
-struct tcp_conn;
+struct tcp_tap_conn;
/**
- * struct tcp_conn - Descriptor for a TCP connection
+ * struct tcp_tap_conn - Descriptor for a TCP connection via tap (not spliced)
* @next: Pointer to next item in hash chain, if any
* @sock: Socket descriptor number
- * @hash_bucket: Bucket index in socket lookup hash table
+ * @hash_bucket: Bucket index in connection lookup hash table
* @a.a6: IPv6 remote address, can be IPv4-mapped
* @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20
* @a.a4.one: Ones prefix for IPv4-mapped
* @a.a4.a: IPv4 address
* @tap_port: Guest-facing tap port
* @sock_port: Remote, socket-facing port
- * @s: TCP connection state
+ * @state: TCP connection state
* @seq_to_tap: Next sequence for packets to tap
* @seq_ack_from_tap: Last ACK number received from tap
* @seq_from_tap: Next sequence for packets from tap (not actually sent)
* @seq_ack_to_tap: Last ACK number sent to tap
* @seq_init_from_tap: Initial sequence number from tap
* @tcpi_acked_last: Most recent value of tcpi_bytes_acked (TCP_INFO query)
- * @dup_acks: Count of currently duplicated ACKs from tap
* @ws_allowed: Window scaling allowed
* @ws: Window scaling factor
* @tap_window: Last window size received from tap, scaled
+ * @window_clamped: Window was clamped on socket at least once
* @no_snd_wnd: Kernel won't report window (without commit 8f7baad7f035)
+ * @tcpi_snd_wnd: Most recent value of tcpi_snd_wnd (TCP_INFO query)
* @ts_sock: Last activity timestamp from socket for timeout purposes
* @ts_tap: Last activity timestamp from tap for timeout purposes
* @ts_ack_tap: Last ACK segment timestamp from tap for timeout purposes
* @mss_guest: Maximum segment size advertised by guest
*/
-struct tcp_conn {
- struct tcp_conn *next;
+struct tcp_tap_conn {
+ struct tcp_tap_conn *next;
int sock;
int hash_bucket;
@@ -422,7 +462,7 @@ struct tcp_conn {
} a;
in_port_t tap_port;
in_port_t sock_port;
- enum tcp_state s;
+ enum tcp_state state;
uint32_t seq_to_tap;
uint32_t seq_ack_from_tap;
@@ -430,12 +470,13 @@ struct tcp_conn {
uint32_t seq_ack_to_tap;
uint32_t seq_init_from_tap;
uint64_t tcpi_acked_last;
- int dup_acks;
int ws_allowed;
int ws;
- int tap_window;
+ uint32_t tap_window;
+ int window_clamped;
int no_snd_wnd;
+ uint32_t tcpi_snd_wnd;
struct timespec ts_sock;
struct timespec ts_tap;
@@ -444,48 +485,58 @@ struct tcp_conn {
int mss_guest;
};
+/**
+ * struct tcp_splice_conn - Descriptor for a spliced TCP connection
+ * @from: File descriptor number of socket for accepted connection
+ * @pipe_from_to: Pipe ends for splice() from @from to @to
+ * @to: File descriptor number of peer connected socket
+ * @pipe_to_from: Pipe ends for splice() from @to to @from
+ * @state: TCP connection state
+*/
+struct tcp_splice_conn {
+ int from;
+ int pipe_from_to[2];
+ int to;
+ int pipe_to_from[2];
+ enum tcp_state state;
+ int v6;
+};
+
/* Socket receive buffer */
static char sock_buf[MAX_WINDOW];
-/* Bitmap, activity monitoring needed for connection, indexed by socket */
-static uint8_t tcp_act[MAX_CONNS / 8] = { 0 };
-
-/* TCP connections, indexed by socket */
-static struct tcp_conn tc[MAX_CONNS];
+/* Bitmap, activity monitoring needed for connection via tap */
+static uint8_t tcp_act[MAX_TAP_CONNS / 8] = { 0 };
-/* Hash table for socket lookup given remote address, local port, remote port */
-static int tc_hash[TCP_HASH_TABLE_SIZE];
+/* TCP connections */
+static struct tcp_tap_conn tt[MAX_TAP_CONNS];
+static struct tcp_splice_conn ts[MAX_SPLICE_CONNS];
-static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len);
-
-/**
- * tcp_act_set() - Set socket in bitmap for timed events
- * @s: Socket file descriptor number
- */
-static void tcp_act_set(int s)
-{
- tcp_act[s / 8] |= 1 << (s % 8);
-}
+/* Table for lookup from remote address, local port, remote port */
+static struct tcp_tap_conn *tt_hash[TCP_HASH_TABLE_SIZE];
/**
- * tcp_act_clear() - Clear socket from bitmap for timed events
- * @s: Socket file descriptor number
+ * tcp_tap_state() - Set given TCP state for tap connection, report to stderr
+ * @conn: Connection pointer
+ * @state: New TCP state to be set
*/
-static void tcp_act_clear(int s)
+static void tcp_tap_state(struct tcp_tap_conn *conn, enum tcp_state state)
{
- tcp_act[s / 8] &= ~(1 << (s % 8));
+ debug("TCP: socket %i: %s -> %s",
+ conn->sock, tcp_state_str[conn->state], tcp_state_str[state]);
+ conn->state = state;
}
/**
- * tcp_set_state() - Set given TCP state for socket, report change to stderr
- * @s: Socket file descriptor number
+ * tcp_splice_state() - Set state for spliced connection, report to stderr
+ * @conn: Connection pointer
* @state: New TCP state to be set
*/
-static void tcp_set_state(int s, enum tcp_state state)
+static void tcp_splice_state(struct tcp_splice_conn *conn, enum tcp_state state)
{
- debug("TCP: socket %i: %s -> %s", s,
- tcp_state_str[tc[s].s], tcp_state_str[state]);
- tc[s].s = state;
+ debug("TCP: index %i: %s -> %s",
+ conn - ts, tcp_state_str[conn->state], tcp_state_str[state]);
+ conn->state = state;
}
/**
@@ -547,7 +598,7 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t __type,
}
/**
- * tcp_sock_hash_match() - Check if a connection entry matches address and ports
+ * tcp_hash_match() - Check if a connection entry matches address and ports
* @conn: Connection entry to match against
* @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to sin_addr or sin6_addr
@@ -556,8 +607,8 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t __type,
*
* Return: 1 on match, 0 otherwise
*/
-static int tcp_sock_hash_match(struct tcp_conn *conn, int af, void *addr,
- in_port_t tap_port, in_port_t sock_port)
+static int tcp_hash_match(struct tcp_tap_conn *conn, int af, void *addr,
+ in_port_t tap_port, in_port_t sock_port)
{
if (af == AF_INET && IN6_IS_ADDR_V4MAPPED(&conn->a.a6) &&
!memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)) &&
@@ -573,7 +624,7 @@ static int tcp_sock_hash_match(struct tcp_conn *conn, int af, void *addr,
}
/**
- * tcp_sock_hash() - Calculate hash value for connection given address and ports
+ * tcp_hash() - Calculate hash value for connection given address and ports
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to sin_addr or sin6_addr
@@ -582,8 +633,8 @@ static int tcp_sock_hash_match(struct tcp_conn *conn, int af, void *addr,
*
* Return: hash value, already modulo size of the hash table
*/
-static unsigned int tcp_sock_hash(struct ctx *c, int af, void *addr,
- in_port_t tap_port, in_port_t sock_port)
+static unsigned int tcp_hash(struct ctx *c, int af, void *addr,
+ in_port_t tap_port, in_port_t sock_port)
{
uint64_t b = 0;
@@ -617,114 +668,172 @@ static unsigned int tcp_sock_hash(struct ctx *c, int af, void *addr,
}
/**
- * tcp_sock_hash_insert() - Insert socket into hash table, chain link if needed
+ * tcp_hash_insert() - Insert connection into hash table, chain link
* @c: Execution context
- * @s: File descriptor number for socket
+ * @conn: Connection pointer
* @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to sin_addr or sin6_addr
- * @tap_port: tap-facing port
- * @sock_port: Socket-facing port
*/
-static void tcp_sock_hash_insert(struct ctx *c, int s, int af, void *addr,
- in_port_t tap_port, in_port_t sock_port)
+static void tcp_hash_insert(struct ctx *c, struct tcp_tap_conn *conn,
+ int af, void *addr)
{
int b;
- b = tcp_sock_hash(c, af, addr, tap_port, sock_port);
- tc[s].next = tc_hash[b] ? &tc[tc_hash[b]] : NULL;
- tc_hash[b] = tc[s].sock = s;
- tc[s].hash_bucket = b;
+ b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port);
+ conn->next = tt_hash[b];
+ tt_hash[b] = conn;
+ conn->hash_bucket = b;
+
+ debug("TCP: hash table insert: index %i, sock %i, bucket: %i, next: %p",
+ conn - tt, conn->sock, b, conn->next);
}
/**
- * tcp_sock_hash_remove() - Drop socket from hash table, chain unlink if needed
- * @b: Bucket index
- * @s: File descriptor number for socket
+ * tcp_hash_remove() - Drop connection from hash table, chain unlink
+ * @conn: Connection pointer
*/
-static void tcp_sock_hash_remove(int b, int s)
+static void tcp_hash_remove(struct tcp_tap_conn *conn)
{
- struct tcp_conn *conn, *prev = NULL;
+ struct tcp_tap_conn *entry, *prev = NULL;
+ int b = conn->hash_bucket;
- for (conn = &tc[tc_hash[b]]; conn; prev = conn, conn = conn->next) {
- if (conn->sock == s) {
- conn->sock = 0;
+ for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) {
+ if (entry == conn) {
if (prev)
prev->next = conn->next;
else
- tc_hash[b] = conn->next ? conn->next->sock : 0;
- return;
+ tt_hash[b] = conn->next;
+ break;
+ }
+ }
+
+ debug("TCP: hash table remove: index %i, sock %i, bucket: %i, new: %p",
+ conn - tt, conn->sock, b, prev ? prev->next : tt_hash[b]);
+}
+
+/**
+ * tcp_hash_update() - Update pointer for given connection
+ * @old: Old connection pointer
+ * @new: New connection pointer
+ */
+static void tcp_hash_update(struct tcp_tap_conn *old, struct tcp_tap_conn *new)
+{
+ struct tcp_tap_conn *entry, *prev = NULL;
+ int b = old->hash_bucket;
+
+ for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) {
+ if (entry == old) {
+ if (prev)
+ prev->next = new;
+ else
+ tt_hash[b] = new;
+ break;
}
}
+
+ debug("TCP: hash table update: old index %i, new index %i, sock %i, "
+ "bucket: %i, old: %p, new: %p",
+ old - tt, new - tt, new->sock, b, old, new);
}
/**
- * tcp_sock_hash_lookup() - Look up socket given remote address and ports
+ * tcp_hash_lookup() - Look up connection given remote address and ports
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to sin_addr or sin6_addr
* @tap_port: tap-facing port
* @sock_port: Socket-facing port
*
- * Return: file descriptor number for socket, if found, -ENOENT otherwise
+ * Return: connection pointer, if found, NULL otherwise
*/
-static int tcp_sock_hash_lookup(struct ctx *c, int af, void *addr,
- in_port_t tap_port, in_port_t sock_port)
+static struct tcp_tap_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr,
+ in_port_t tap_port,
+ in_port_t sock_port)
{
- struct tcp_conn *conn;
- int b;
+ int b = tcp_hash(c, af, addr, tap_port, sock_port);
+ struct tcp_tap_conn *conn;
- b = tcp_sock_hash(c, af, addr, tap_port, sock_port);
- if (!tc_hash[b])
- return -ENOENT;
-
- for (conn = &tc[tc_hash[b]]; conn; conn = conn->next) {
- if (tcp_sock_hash_match(conn, af, addr, tap_port, sock_port))
- return conn->sock;
+ for (conn = tt_hash[b]; conn; conn = conn->next) {
+ if (tcp_hash_match(conn, af, addr, tap_port, sock_port))
+ return conn;
}
- return -ENOENT;
+ return NULL;
}
/**
- * tcp_close_and_epoll_del() - Close, remove socket from hash table and epoll fd
+ * tcp_table_tap_compact() - Compact tap connection table
* @c: Execution context
- * @s: File descriptor number for socket
+ * @hole: Pointer to recently closed connection
*/
-static void tcp_close_and_epoll_del(struct ctx *c, int s)
+static void tcp_table_tap_compact(struct ctx *c, struct tcp_tap_conn *hole)
{
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
- tcp_set_state(s, CLOSED);
- close(s);
- tcp_sock_hash_remove(tc[s].hash_bucket, tc[s].sock);
- tcp_act_clear(s);
+ union epoll_ref ref = { .proto = IPPROTO_TCP, .tcp.index = hole - tt };
+ struct tcp_tap_conn *from, *to;
+ struct epoll_event ev;
+
+ if ((hole - tt) == --c->tcp.tap_conn_count) {
+ bitmap_clear(tcp_act, hole - tt);
+ debug("TCP: hash table compaction: index %i (%p) was max index",
+ hole - tt, hole);
+ return;
+ }
+
+ from = &tt[c->tcp.tap_conn_count];
+ memcpy(hole, from, sizeof(*hole));
+ from->state = CLOSED;
+
+ to = hole;
+ tcp_hash_update(from, to);
+
+ if (to->state == SOCK_SYN_SENT)
+ ev.events = EPOLLRDHUP;
+ else if (to->state == TAP_SYN_SENT)
+ ev.events = EPOLLOUT | EPOLLRDHUP;
+ else
+ ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+
+ ref.tcp.v6 = !IN6_IS_ADDR_V4MAPPED(&to->a.a6);
+ ref.s = from->sock;
+ ev.data.u64 = ref.u64;
+ epoll_ctl(c->epollfd, EPOLL_CTL_MOD, from->sock, &ev);
+
+ debug("TCP: hash table compaction: old index %i, new index %i, "
+ "sock %i, from: %p, to: %p",
+ from - tt, to - tt, from->sock, from, to);
}
/**
- * tcp_rst() - Reset a connection: send RST segment to tap, close socket
+ * tcp_tap_destroy() - Close tap connection, drop from hash table and epoll
* @c: Execution context
- * @s: File descriptor number for socket
+ * @conn: Connection pointer
*/
-static void tcp_rst(struct ctx *c, int s)
+static void tcp_tap_destroy(struct ctx *c, struct tcp_tap_conn *conn)
{
- if (s < 0)
+ if (conn->state == CLOSED)
return;
- tcp_send_to_tap(c, s, RST, NULL, 0);
- tcp_close_and_epoll_del(c, s);
- tcp_set_state(s, CLOSED);
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL);
+ tcp_tap_state(conn, CLOSED);
+ close(conn->sock);
+ tcp_hash_remove(conn);
+ tcp_table_tap_compact(c, conn);
}
+static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn);
+
/**
* tcp_send_to_tap() - Send segment to tap, with options and values from socket
* @c: Execution context
- * @s: File descriptor number for socket
+ * @conn: Connection pointer
* @flags: TCP flags to set
* @in: Payload buffer
* @len: Payload length
*
* Return: negative error code on connection reset, 0 otherwise
*/
-static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
+static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
+ int flags, char *in, int len)
{
char buf[USHRT_MAX] = { 0 }, *data;
struct tcp_info info = { 0 };
@@ -732,10 +841,18 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
struct tcphdr *th;
int ws = 0, err;
- if ((err = getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) &&
- !(flags & RST)) {
- tcp_rst(c, s);
- return err;
+ if (conn->seq_from_tap == conn->seq_ack_to_tap && !flags && len) {
+ err = 0;
+ info.tcpi_bytes_acked = conn->tcpi_acked_last;
+ info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
+ } else {
+ err = getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info, &sl);
+ if (err && !(flags & RST)) {
+ tcp_rst(c, conn);
+ return err;
+ }
+
+ conn->tcpi_snd_wnd = info.tcpi_snd_wnd;
}
th = (struct tcphdr *)buf;
@@ -753,10 +870,10 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
/* Check if kernel includes commit:
* 8f7baad7f035 ("tcp: Add snd_wnd to TCP_INFO")
*/
- tc[s].no_snd_wnd = !info.tcpi_snd_wnd;
+ conn->no_snd_wnd = !info.tcpi_snd_wnd;
- if (tc[s].ws_allowed && (ws = info.tcpi_snd_wscale) &&
- !tc[s].no_snd_wnd) {
+ if (conn->ws_allowed && (ws = info.tcpi_snd_wscale) &&
+ !conn->no_snd_wnd) {
*data++ = OPT_NOP;
*data++ = OPT_WS;
@@ -767,30 +884,27 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
}
/* RFC 793, 3.1: "[...] and the first data octet is ISN+1." */
- th->seq = htonl(tc[s].seq_to_tap++);
+ th->seq = htonl(conn->seq_to_tap++);
} else {
- th->seq = htonl(tc[s].seq_to_tap);
- tc[s].seq_to_tap += len;
+ th->seq = htonl(conn->seq_to_tap);
+ conn->seq_to_tap += len;
}
- if (!err && ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last) ||
+ if (!err && ((info.tcpi_bytes_acked > conn->tcpi_acked_last) ||
(flags & ACK) || len)) {
- uint64_t ack_seq;
-
th->ack = 1;
- ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap;
+ conn->seq_ack_to_tap = info.tcpi_bytes_acked +
+ conn->seq_init_from_tap;
- tc[s].seq_ack_to_tap = ack_seq & (uint32_t)~0U;
-
- if (tc[s].s == LAST_ACK) {
- tc[s].seq_ack_to_tap = tc[s].seq_from_tap + 1;
+ if (conn->state == LAST_ACK) {
+ conn->seq_ack_to_tap = conn->seq_from_tap + 1;
th->seq = htonl(ntohl(th->seq) + 1);
}
- th->ack_seq = htonl(tc[s].seq_ack_to_tap);
+ th->ack_seq = htonl(conn->seq_ack_to_tap);
- tc[s].tcpi_acked_last = info.tcpi_bytes_acked;
+ conn->tcpi_acked_last = info.tcpi_bytes_acked;
} else {
if (!len && !flags)
return 0;
@@ -802,10 +916,12 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
- th->source = tc[s].sock_port;
- th->dest = tc[s].tap_port;
+ th->source = htons(conn->sock_port);
+ th->dest = htons(conn->tap_port);
- if (!err && !tc[s].no_snd_wnd) {
+ if (flags & ZERO_WINDOW) {
+ th->window = 0;
+ } else if (!err && !conn->no_snd_wnd) {
/* First value sent by receiver is not scaled */
th->window = htons(info.tcpi_snd_wnd >>
(th->syn ? 0 : info.tcpi_snd_wscale));
@@ -818,34 +934,58 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
memcpy(data, in, len);
- tap_ip_send(c, &tc[s].a.a6, IPPROTO_TCP, buf, th->doff * 4 + len);
+ tap_ip_send(c, &conn->a.a6, IPPROTO_TCP, buf, th->doff * 4 + len);
return 0;
}
/**
+ * tcp_rst() - Reset a tap connection: send RST segment to tap, close socket
+ * @c: Execution context
+ * @conn: Connection pointer
+ */
+static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn)
+{
+ if (conn->state == CLOSED)
+ return;
+
+ tcp_send_to_tap(c, conn, RST, NULL, 0);
+ tcp_tap_destroy(c, conn);
+}
+
+/**
* tcp_clamp_window() - Set window and scaling from option, clamp on socket
- * @s: File descriptor number for socket
+ * @conn: Connection pointer
* @th: TCP header, from tap
* @len: Buffer length, at L4
* @init: Set if this is the very first segment from tap
*/
-static void tcp_clamp_window(int s, struct tcphdr *th, int len, int init)
+static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th,
+ int len, int init)
{
if (init) {
- tc[s].ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
- tc[s].ws_allowed = tc[s].ws >= 0 && tc[s].ws <= MAX_WS;
- tc[s].ws *= tc[s].ws_allowed;
+ conn->ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
+ conn->ws_allowed = conn->ws >= 0 && conn->ws <= MAX_WS;
+ conn->ws *= conn->ws_allowed;
/* RFC 7323, 2.2: first value is not scaled. Also, don't clamp
* yet, to avoid getting a zero scale just because we set a
* small window now.
*/
- tc[s].tap_window = ntohs(th->window);
+ conn->tap_window = ntohs(th->window);
+ conn->window_clamped = 0;
} else {
- tc[s].tap_window = ntohs(th->window) << tc[s].ws;
- setsockopt(s, SOL_TCP, TCP_WINDOW_CLAMP,
- &tc[s].tap_window, sizeof(tc[s].tap_window));
+ unsigned int window = ntohs(th->window) << conn->ws;
+
+ if (conn->tap_window == window && conn->window_clamped)
+ return;
+
+ conn->tap_window = window;
+ if (window < 256)
+ window = 256;
+ setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP,
+ &window, sizeof(window));
+ conn->window_clamped = 1;
}
}
@@ -925,283 +1065,277 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
.sin6_port = th->dest,
.sin6_addr = *(struct in6_addr *)addr,
};
- struct epoll_event ev = { 0 };
+ struct epoll_event ev = { .events = EPOLLIN | EPOLLET | EPOLLRDHUP };
+ union epoll_ref ref = { .proto = IPPROTO_TCP };
const struct sockaddr *sa;
+ struct tcp_tap_conn *conn;
socklen_t sl;
int s;
- s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
- if (s < 0)
+ if (c->tcp.tap_conn_count >= MAX_TAP_CONNS)
return;
- if (s >= MAX_CONNS) {
- close(s);
+ ref.s = s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+ if (s < 0)
return;
- }
- tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
- if (tc[s].mss_guest < 0)
- tc[s].mss_guest = MSS_DEFAULT;
- sl = sizeof(tc[s].mss_guest);
- setsockopt(s, SOL_TCP, TCP_MAXSEG, &tc[s].mss_guest, sl);
+ conn = &tt[c->tcp.tap_conn_count++];
+ conn->sock = s;
+
+ conn->mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
+ if (conn->mss_guest < 0)
+ conn->mss_guest = MSS_DEFAULT;
+ sl = sizeof(conn->mss_guest);
+ setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl);
- tcp_clamp_window(s, th, len, 1);
+ tcp_clamp_window(conn, th, len, 1);
if (af == AF_INET) {
sa = (struct sockaddr *)&addr4;
sl = sizeof(addr4);
- memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
- memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
- memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a));
+ memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero));
+ memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one));
+ memcpy(&conn->a.a4.a, addr, sizeof(conn->a.a4.a));
} else {
sa = (struct sockaddr *)&addr6;
sl = sizeof(addr6);
- memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6));
+ memcpy(&conn->a.a6, addr, sizeof(conn->a.a6));
}
- tc[s].sock_port = th->dest;
- tc[s].tap_port = th->source;
-
- tc[s].ts_sock = tc[s].ts_tap = tc[s].ts_ack_tap = *now;
+ conn->sock_port = ntohs(th->dest);
+ conn->tap_port = ntohs(th->source);
- tcp_act_set(s);
+ conn->ts_sock = conn->ts_tap = conn->ts_ack_tap = *now;
- ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLHUP;
- ev.data.fd = s;
+ bitmap_set(tcp_act, conn - tt);
- tc[s].seq_init_from_tap = ntohl(th->seq);
- tc[s].seq_from_tap = tc[s].seq_init_from_tap + 1;
- tc[s].seq_ack_to_tap = tc[s].seq_from_tap;
+ conn->seq_init_from_tap = ntohl(th->seq);
+ conn->seq_from_tap = conn->seq_init_from_tap + 1;
+ conn->seq_ack_to_tap = conn->seq_from_tap;
- tc[s].seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now);
- tc[s].seq_ack_from_tap = tc[s].seq_to_tap + 1;
+ conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now);
+ conn->seq_ack_from_tap = conn->seq_to_tap + 1;
- tcp_sock_hash_insert(c, s, af, addr, th->source, th->dest);
+ tcp_hash_insert(c, conn, af, addr);
if (connect(s, sa, sl)) {
+ tcp_tap_state(conn, TAP_SYN_SENT);
+
if (errno != EINPROGRESS) {
- tcp_rst(c, s);
+ tcp_rst(c, conn);
return;
}
- ev.events |= EPOLLOUT;
- tcp_set_state(s, TAP_SYN_SENT);
+ ev.events = EPOLLOUT | EPOLLRDHUP;
} else {
- if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0))
- return;
+ tcp_tap_state(conn, TAP_SYN_RCVD);
- tcp_set_state(s, TAP_SYN_RCVD);
+ if (tcp_send_to_tap(c, conn, SYN | ACK, NULL, 0))
+ return;
}
+ ref.tcp.index = conn - tt;
+ ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
}
/**
- * tcp_conn_from_sock() - Handle new connection request from listening socket
+ * tcp_table_splice_compact() - Compact spliced connection table
* @c: Execution context
- * @fd: File descriptor number for listening socket
- * @now: Current timestamp
+ * @hole: Pointer to recently closed connection
*/
-static void tcp_conn_from_sock(struct ctx *c, int fd, struct timespec *now)
+static void tcp_table_splice_compact(struct ctx *c,
+ struct tcp_splice_conn *hole)
{
- struct sockaddr_storage sa_r, sa_l;
- socklen_t sa_len = sizeof(sa_l);
- struct epoll_event ev = { 0 };
- int s;
-
- if (getsockname(fd, (struct sockaddr *)&sa_l, &sa_len))
- return;
-
- s = accept4(fd, (struct sockaddr *)&sa_r, &sa_len, SOCK_NONBLOCK);
- if (s == -1)
+ union epoll_ref ref_from = { .proto = IPPROTO_TCP,
+ .tcp.index = hole - ts };
+ union epoll_ref ref_to = { .proto = IPPROTO_TCP,
+ .tcp.index = hole - ts };
+ struct tcp_splice_conn *move;
+ struct epoll_event ev_from;
+ struct epoll_event ev_to;
+
+ if ((hole - ts) == --c->tcp.splice_conn_count)
return;
- if (s >= MAX_CONNS) {
- close(s);
- return;
+ move = &ts[c->tcp.splice_conn_count];
+ memcpy(hole, move, sizeof(*hole));
+ move->state = CLOSED;
+ move = hole;
+
+ ref_from.s = move->from;
+ ref_from.tcp.v6 = move->v6;
+ ref_to.s = move->to;
+ ref_to.tcp.v6 = move->v6;
+
+ if (move->state == SPLICE_ACCEPTED) {
+ ev_from.events = ev_to.events = 0;
+ } else if (move->state == SPLICE_CONNECT) {
+ ev_from.events = EPOLLET | EPOLLRDHUP;
+ ev_to.events = EPOLLET | EPOLLOUT | EPOLLRDHUP;
+ } else {
+ ev_from.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLRDHUP;
+ ev_to.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLRDHUP;
}
- CHECK_SET_MIN_MAX(c->tcp.fd_, s);
- CHECK_SET_MIN_MAX(c->tcp.fd_conn_, s);
-
- if (sa_l.ss_family == AF_INET) {
- struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa_r;
-
- memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
- memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
-
- if (ntohl(sa4->sin_addr.s_addr) == INADDR_LOOPBACK ||
- ntohl(sa4->sin_addr.s_addr) == INADDR_ANY)
- sa4->sin_addr.s_addr = c->gw4;
+ ev_from.data.u64 = ref_from.u64;
+ ev_to.data.u64 = ref_to.u64;
- memcpy(&tc[s].a.a4.a, &sa4->sin_addr, sizeof(tc[s].a.a4.a));
-
- tc[s].sock_port = sa4->sin_port;
- tc[s].tap_port = ((struct sockaddr_in *)&sa_l)->sin_port;
-
- tc[s].seq_to_tap = tcp_seq_init(c, AF_INET, &sa4->sin_addr,
- tc[s].sock_port,
- tc[s].tap_port,
- now);
-
- tcp_sock_hash_insert(c, s, AF_INET, &sa4->sin_addr,
- tc[s].tap_port, tc[s].sock_port);
- } else if (sa_l.ss_family == AF_INET6) {
- struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)&sa_r;
-
- if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
- memcpy(&sa6->sin6_addr, &c->gw6, sizeof(c->gw6));
-
- memcpy(&tc[s].a.a6, &sa6->sin6_addr, sizeof(tc[s].a.a6));
-
- tc[s].sock_port = sa6->sin6_port;
- tc[s].tap_port = ((struct sockaddr_in6 *)&sa_l)->sin6_port;
-
- tc[s].seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6->sin6_addr,
- tc[s].sock_port,
- tc[s].tap_port,
- now);
+ epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->from, &ev_from);
+ epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->to, &ev_to);
+}
- tcp_sock_hash_insert(c, s, AF_INET6, &sa6->sin6_addr,
- tc[s].tap_port, tc[s].sock_port);
+/**
+ * tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll
+ * @c: Execution context
+ * @conn: Connection pointer
+ */
+static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
+{
+ switch (conn->state) {
+ case SPLICE_ESTABLISHED:
+ if (conn->pipe_from_to[0] != -1) {
+ close(conn->pipe_from_to[0]);
+ close(conn->pipe_from_to[1]);
+ }
+ if (conn->pipe_to_from[0] != -1) {
+ close(conn->pipe_to_from[0]);
+ close(conn->pipe_to_from[1]);
+ }
+ /* Falls through */
+ case SPLICE_CONNECT:
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->from, NULL);
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->to, NULL);
+ close(conn->to);
+ /* Falls through */
+ case SPLICE_ACCEPTED:
+ close(conn->from);
+ tcp_splice_state(conn, CLOSED);
+ tcp_table_splice_compact(c, conn);
+ return;
+ default:
+ return;
}
-
- tc[s].seq_ack_from_tap = tc[s].seq_to_tap + 1;
-
- tc[s].tap_window = WINDOW_DEFAULT;
- tc[s].ws_allowed = 1;
-
- tc[s].ts_sock = tc[s].ts_tap = tc[s].ts_ack_tap = *now;
-
- tcp_act_set(s);
-
- ev.events = EPOLLRDHUP | EPOLLHUP;
- ev.data.fd = s;
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
-
- tcp_set_state(s, SOCK_SYN_SENT);
- tcp_send_to_tap(c, s, SYN, NULL, 0);
}
/**
* tcp_send_to_sock() - Send buffer to socket, update timestamp and sequence
* @c: Execution context
- * @s: File descriptor number for socket
+ * @conn: Connection pointer
* @data: Data buffer
* @len: Length at L4
* @extra_flags: Additional flags for send(), if any
*
* Return: negative on socket error with connection reset, 0 otherwise
*/
-static int tcp_send_to_sock(struct ctx *c, int s, char *data, int len,
- int extra_flags)
+static int tcp_send_to_sock(struct ctx *c, struct tcp_tap_conn *conn,
+ char *data, int len, int extra_flags)
{
- int err = send(s, data, len, MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags);
+ int err = send(conn->sock, data, len,
+ MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags);
if (err < 0) {
if (errno == EAGAIN || errno == EWOULDBLOCK) {
- /* If we can't queue right now, do nothing, sender has
- * to retransmit.
- */
- return 0;
+ tcp_send_to_tap(c, conn, ZERO_WINDOW, NULL, 0);
+ return err;
}
err = errno;
- tcp_rst(c, s);
+ tcp_rst(c, conn);
return -err;
}
- tc[s].seq_from_tap += len;
-
- return 0;
-}
-
-/**
- * tcp_is_dupack() - Check if given ACK number is duplicated, update counter
- * @s: File descriptor number for socket
- * @ack_seq: ACK sequence, host order
- *
- * Return: -EAGAIN on duplicated ACKs observed, with counter reset, 0 otherwise
- */
-static int tcp_is_dupack(int s, uint32_t ack_seq)
-{
- if (ack_seq == tc[s].seq_ack_from_tap && ++tc[s].dup_acks == 2) {
- tc[s].dup_acks = 0;
- return -EAGAIN;
- }
+ conn->seq_from_tap += err;
return 0;
}
/**
* tcp_sock_consume() - Consume (discard) data from buffer, update ACK sequence
- * @s: File descriptor number for socket
+ * @conn: Connection pointer
* @ack_seq: ACK sequence, host order
*/
-static void tcp_sock_consume(int s, uint32_t ack_seq)
+static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
{
- int to_ack;
+ uint32_t to_ack;
/* Implicitly take care of wrap-arounds */
- to_ack = ack_seq - tc[s].seq_ack_from_tap;
+ to_ack = ack_seq - conn->seq_ack_from_tap;
/* Simply ignore out-of-order ACKs: we already consumed the data we
* needed from the buffer, and we won't rewind back to a lower ACK
* sequence.
*/
- if (to_ack < 0)
+ if (to_ack > MAX_WINDOW)
return;
- recv(s, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC);
+ if (to_ack)
+ recv(conn->sock, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC);
- tc[s].seq_ack_from_tap = ack_seq;
+ conn->seq_ack_from_tap = ack_seq;
}
/**
* tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
* @c: Execution context
- * @s: File descriptor number for socket
+ * @conn: Connection pointer
* @now: Current timestamp
*
* Return: negative on connection reset, 1 on pending data, 0 otherwise
*/
-static int tcp_data_from_sock(struct ctx *c, int s, struct timespec *now)
+static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
+ struct timespec *now)
{
- int len, err, offset, left, send;
+ uint32_t offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+ int len, err, left, send, s = conn->sock;
+
+ if (!conn->tap_window || offset >= conn->tap_window)
+ return 1;
+
+ len = recv(s, sock_buf,
+ /* TODO: Drop 64KiB limit (needed for responsiveness) once
+ * tap-side coalescing and zero-copy are fully implemented.
+ */
+ MIN(64 * 1024, conn->tap_window),
+ /* Don't dequeue until acknowledged by guest */
+ MSG_DONTWAIT | MSG_PEEK);
- /* Don't dequeue until acknowledged by guest */
- len = recv(s, sock_buf, sizeof(sock_buf), MSG_DONTWAIT | MSG_PEEK);
if (len < 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK) {
- tcp_rst(c, s);
+ tcp_rst(c, conn);
return -errno;
}
return 0;
}
if (len == 0) {
- if (tc[s].s >= ESTABLISHED_SOCK_FIN)
+ if (conn->state >= ESTABLISHED_SOCK_FIN)
return 0;
- tcp_set_state(s, ESTABLISHED_SOCK_FIN);
- if ((err = tcp_send_to_tap(c, s, FIN | ACK, NULL, 0)))
+ tcp_tap_state(conn, ESTABLISHED_SOCK_FIN);
+ if ((err = tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0)))
return err;
left = 0;
goto out;
}
- offset = tc[s].seq_to_tap - tc[s].seq_ack_from_tap;
left = len - offset;
- while (left && offset + tc[s].mss_guest <= tc[s].tap_window) {
- if (left < tc[s].mss_guest)
+ while (left && (offset + conn->mss_guest <= conn->tap_window)) {
+ if (left < conn->mss_guest)
send = left;
else
- send = tc[s].mss_guest;
+ send = conn->mss_guest;
+
+ if (offset + send > MAX_WINDOW) {
+ tcp_rst(c, conn);
+ return -EIO;
+ }
- if ((err = tcp_send_to_tap(c, s, 0, sock_buf + offset, send)))
+ err = tcp_send_to_tap(c, conn, 0, sock_buf + offset, send);
+ if (err)
return err;
offset += send;
@@ -1209,7 +1343,7 @@ static int tcp_data_from_sock(struct ctx *c, int s, struct timespec *now)
}
out:
- tc[s].ts_sock = *now;
+ conn->ts_sock = *now;
return !!left;
}
@@ -1218,6 +1352,7 @@ out:
* tcp_tap_handler() - Handle packets from tap and state transitions
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
+ * @addr: Destination address
* @msg: Input messages
* @count: Message count
* @now: Current timestamp
@@ -1227,15 +1362,19 @@ out:
int tcp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now)
{
+ union epoll_ref ref = { .proto = IPPROTO_TCP,
+ .tcp.v6 = ( af == AF_INET6 ) };
+
/* TODO: Implement message batching for TCP */
struct tcphdr *th = (struct tcphdr *)msg[0].l4h;
- struct epoll_event ev = { 0 };
size_t len = msg[0].l4_len;
+ struct tcp_tap_conn *conn;
+ struct epoll_event ev;
size_t off, skip = 0;
- int s, ws;
+ int ws, i;
- (void)count;
+ uint32_t __seq_max;
if (len < sizeof(*th))
return 1;
@@ -1244,146 +1383,178 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (off < sizeof(*th) || off > len)
return 1;
- if ((s = tcp_sock_hash_lookup(c, af, addr, th->source, th->dest)) < 0) {
+ conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
+ if (!conn) {
if (th->syn)
tcp_conn_from_tap(c, af, addr, th, len, now);
return 1;
}
+ /* TODO: Partial ACK coalescing, merge with message coalescing */
+ for (i = 0; conn->state == ESTABLISHED && i < count; i++) {
+ struct tcphdr *__th = (struct tcphdr *)msg[i].l4h;
+ size_t __len = msg[i].l4_len;
+ uint32_t __this;
+
+ if (__len < sizeof(*th))
+ break;
+
+ off = __th->doff * 4;
+ if (off < sizeof(*th) || off > __len)
+ break;
+
+ __this = ntohl(th->ack_seq);
+
+ if (!i || __this - __seq_max < MAX_WINDOW)
+ __seq_max = __this;
+
+ if ((!th->ack || len != off) && i) {
+ tcp_sock_consume(conn, __seq_max);
+ conn->ts_tap = *now;
+ return i;
+ }
+ }
+
if (th->rst) {
- tcp_close_and_epoll_del(c, s);
+ tcp_tap_destroy(c, conn);
return 1;
}
- tcp_clamp_window(s, th, len, th->syn && th->ack);
+ tcp_clamp_window(conn, th, len, th->syn && th->ack);
- tc[s].ts_tap = *now;
+ conn->ts_tap = *now;
- if (ntohl(th->seq) < tc[s].seq_from_tap)
- skip = tc[s].seq_from_tap - ntohl(th->seq);
+ if (ntohl(th->seq) < conn->seq_from_tap &&
+ conn->seq_from_tap - ntohl(th->seq) < MAX_WINDOW) {
+ skip = conn->seq_from_tap - ntohl(th->seq);
+ }
- switch (tc[s].s) {
+ switch (conn->state) {
case SOCK_SYN_SENT:
if (!th->syn || !th->ack) {
- tcp_rst(c, s);
+ tcp_rst(c, conn);
return 1;
}
- tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
- if (tc[s].mss_guest < 0)
- tc[s].mss_guest = MSS_DEFAULT;
+ conn->mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
+ if (conn->mss_guest < 0)
+ conn->mss_guest = MSS_DEFAULT;
ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
if (ws > MAX_WS) {
- if (tcp_send_to_tap(c, s, RST, NULL, 0))
+ if (tcp_send_to_tap(c, conn, RST, NULL, 0))
return 1;
- tc[s].seq_to_tap = 0;
- tc[s].ws_allowed = 0;
- tcp_send_to_tap(c, s, SYN, NULL, 0);
+ conn->seq_to_tap = 0;
+ conn->ws_allowed = 0;
+ tcp_send_to_tap(c, conn, SYN, NULL, 0);
return 1;
}
/* info.tcpi_bytes_acked already includes one byte for SYN, but
* not for incoming connections.
*/
- tc[s].seq_init_from_tap = ntohl(th->seq) + 1;
- tc[s].seq_from_tap = tc[s].seq_init_from_tap;
- tc[s].seq_ack_to_tap = tc[s].seq_from_tap;
+ conn->seq_init_from_tap = ntohl(th->seq) + 1;
+ conn->seq_from_tap = conn->seq_init_from_tap;
+ conn->seq_ack_to_tap = conn->seq_from_tap;
- tcp_set_state(s, ESTABLISHED);
- tcp_send_to_tap(c, s, ACK, NULL, 0);
+ tcp_tap_state(conn, ESTABLISHED);
+ tcp_send_to_tap(c, conn, ACK, NULL, 0);
/* The client might have sent data already, which we didn't
* dequeue waiting for SYN,ACK from tap -- check now.
*/
- tcp_data_from_sock(c, s, now);
-
- ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLHUP;
- ev.data.fd = s;
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev);
+ tcp_data_from_sock(c, conn, now);
+ ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+ ref.s = conn->sock;
+ ref.tcp.index = conn - tt;
+ ev.data.u64 = ref.u64;
+ epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->sock, &ev);
break;
case TAP_SYN_RCVD:
if (th->fin) {
- shutdown(s, SHUT_WR);
- tcp_set_state(s, FIN_WAIT_1);
+ shutdown(conn->sock, SHUT_WR);
+ tcp_tap_state(conn, FIN_WAIT_1);
break;
}
if (!th->ack) {
- tcp_rst(c, s);
+ tcp_rst(c, conn);
return 1;
}
- tcp_set_state(s, ESTABLISHED);
+ tcp_tap_state(conn, ESTABLISHED);
break;
case ESTABLISHED:
case ESTABLISHED_SOCK_FIN:
- tc[s].ts_ack_tap = *now;
+ conn->ts_ack_tap = *now;
- if (ntohl(th->seq) > tc[s].seq_from_tap) {
- tc[s].seq_from_tap = tc[s].seq_ack_to_tap;
- tcp_send_to_tap(c, s, ACK, NULL, 0);
- break;
+ if (ntohl(th->ack_seq) > conn->seq_to_tap &&
+ (conn->seq_to_tap - ntohl(th->ack_seq)) > MAX_WINDOW) {
+ return count;
}
if (th->ack) {
- int retrans = 0;
-
- if (len == off)
- retrans = tcp_is_dupack(s, ntohl(th->ack_seq));
+ tcp_sock_consume(conn, ntohl(th->ack_seq));
- tcp_sock_consume(s, ntohl(th->ack_seq));
-
- if (retrans)
- tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
-
- if (tc[s].s == ESTABLISHED_SOCK_FIN) {
- if (!tcp_data_from_sock(c, s, now))
- tcp_set_state(s, CLOSE_WAIT);
+ if (conn->state == ESTABLISHED_SOCK_FIN) {
+ if (!tcp_data_from_sock(c, conn, now))
+ tcp_tap_state(conn, CLOSE_WAIT);
+ } else {
+ tcp_data_from_sock(c, conn, now);
}
}
+ if (ntohl(th->seq) > conn->seq_from_tap) {
+ tcp_send_to_tap(c, conn, ACK, NULL, 0);
+ tcp_send_to_tap(c, conn, ACK, NULL, 0);
+ return count;
+ }
+
if (skip < len - off &&
- tcp_send_to_sock(c, s,
+ tcp_send_to_sock(c, conn,
msg[0].l4h + off + skip, len - off - skip,
th->psh ? 0 : MSG_MORE))
- break;
+ return 1;
- tcp_data_from_sock(c, s, now);
+ if (count == 1)
+ tcp_send_to_tap(c, conn, ACK, NULL, 0);
if (th->fin) {
- shutdown(s, SHUT_WR);
- if (tc[s].s == ESTABLISHED)
- tcp_set_state(s, FIN_WAIT_1);
+ shutdown(conn->sock, SHUT_WR);
+ if (conn->state == ESTABLISHED)
+ tcp_tap_state(conn, FIN_WAIT_1);
else
- tcp_set_state(s, LAST_ACK);
+ tcp_tap_state(conn, LAST_ACK);
}
break;
case CLOSE_WAIT:
- tcp_sock_consume(s, ntohl(th->ack_seq));
+ tcp_sock_consume(conn, ntohl(th->ack_seq));
- if (skip < len - off &&
- tcp_send_to_sock(c, s,
+ if (skip < (len - off) &&
+ tcp_send_to_sock(c, conn,
msg[0].l4h + off + skip, len - off - skip,
th->psh ? 0 : MSG_MORE))
break;
if (th->fin) {
- shutdown(s, SHUT_WR);
- tcp_set_state(s, LAST_ACK);
+ shutdown(conn->sock, SHUT_WR);
+ tcp_tap_state(conn, LAST_ACK);
}
break;
case FIN_WAIT_1_SOCK_FIN:
if (th->ack)
- tcp_close_and_epoll_del(c, s);
+ tcp_tap_destroy(c, conn);
break;
case FIN_WAIT_1:
case TAP_SYN_SENT:
case LAST_ACK:
+ case SPLICE_ACCEPTED:
+ case SPLICE_CONNECT:
+ case SPLICE_ESTABLISHED:
case CLOSED: /* ;) */
break;
}
@@ -1395,106 +1566,538 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
* tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event
* @c: Execution context
* @s: File descriptor number for socket
+ * @ref: epoll reference
*/
-static void tcp_connect_finish(struct ctx *c, int s)
+static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn,
+ union epoll_ref ref)
{
- struct epoll_event ev = { 0 };
+ struct epoll_event ev;
socklen_t sl;
int so;
sl = sizeof(so);
- if (getsockopt(s, SOL_SOCKET, SO_ERROR, &so, &sl) || so) {
- tcp_rst(c, s);
+ if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR, &so, &sl) || so) {
+ tcp_rst(c, conn);
return;
}
- if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0))
+ if (tcp_send_to_tap(c, conn, SYN | ACK, NULL, 0))
return;
/* Drop EPOLLOUT, only used to wait for connect() to complete */
- ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLHUP;
- ev.data.fd = s;
- epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev);
+ ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+ ev.data.u64 = ref.u64;
+ epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->sock, &ev);
- tcp_set_state(s, TAP_SYN_RCVD);
+ tcp_tap_state(conn, TAP_SYN_RCVD);
+}
+
+/**
+ * tcp_splice_connect_finish() - Completion of connect() or call on success
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @v6: Set on IPv6 connection
+ */
+static void tcp_splice_connect_finish(struct ctx *c,
+ struct tcp_splice_conn *conn, int v6)
+{
+ union epoll_ref ref_from = { .proto = IPPROTO_TCP, .s = conn->from,
+ .tcp = { .splice = 1, .v6 = v6,
+ .index = conn - ts } };
+ union epoll_ref ref_to = { .proto = IPPROTO_TCP, .s = conn->to,
+ .tcp = { .splice = 1, .v6 = v6,
+ .index = conn - ts } };
+ struct epoll_event ev_from, ev_to;
+
+ if (conn->state == SPLICE_CONNECT) {
+ socklen_t sl;
+ int so;
+
+ sl = sizeof(so);
+ if (getsockopt(conn->to, SOL_SOCKET, SO_ERROR, &so, &sl) ||
+ so) {
+ tcp_splice_destroy(c, conn);
+ return;
+ }
+
+ tcp_splice_state(conn, SPLICE_ESTABLISHED);
+
+ ev_from.events = ev_to.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+ ev_from.data.u64 = ref_from.u64;
+ ev_to.data.u64 = ref_to.u64;
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->from, &ev_from);
+ epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->to, &ev_to);
+ }
+
+ conn->pipe_from_to[0] = conn->pipe_to_from[0] = -1;
+ if (pipe2(conn->pipe_to_from, O_NONBLOCK) ||
+ pipe2(conn->pipe_from_to, O_NONBLOCK)) {
+ tcp_splice_destroy(c, conn);
+ return;
+ }
+
+ fcntl(conn->pipe_from_to[0], F_SETPIPE_SZ, PIPE_SIZE);
+ fcntl(conn->pipe_to_from[0], F_SETPIPE_SZ, PIPE_SIZE);
+}
+
+/**
+ * tcp_splice_connect() - Create and connect socket for new spliced connection
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @v6: Set on IPv6 connection
+ * @port: Destination port, host order
+ *
+ * Return: 0 for connect() succeeded or in progress, negative value on error
+ */
+static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn,
+ int v6, in_port_t port)
+{
+ int sock_conn = socket(v6 ? AF_INET6 : AF_INET,
+ SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+ union epoll_ref ref_accept = { .proto = IPPROTO_TCP, .s = conn->from,
+ .tcp = { .splice = 1, .v6 = v6,
+ .index = conn - ts } };
+ union epoll_ref ref_conn = { .proto = IPPROTO_TCP, .s = sock_conn,
+ .tcp = { .splice = 1, .v6 = v6,
+ .index = conn - ts } };
+ struct epoll_event ev_accept = { .events = EPOLLRDHUP | EPOLLET,
+ .data.u64 = ref_accept.u64 };
+ struct epoll_event ev_conn = { .events = EPOLLRDHUP | EPOLLET,
+ .data.u64 = ref_conn.u64 };
+ struct sockaddr_in6 addr6 = {
+ .sin6_family = AF_INET6,
+ .sin6_port = htons(port),
+ .sin6_addr = IN6ADDR_LOOPBACK_INIT,
+ };
+ struct sockaddr_in addr4 = {
+ .sin_family = AF_INET,
+ .sin_port = htons(port),
+ .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
+ };
+ const struct sockaddr *sa;
+ int ret, one = 1;
+ socklen_t sl;
+
+ if (sock_conn < 0)
+ return -errno;
+
+ conn->to = sock_conn;
+
+ setsockopt(conn->from, SOL_TCP, TCP_CORK, &one, sizeof(one));
+ setsockopt(conn->from, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
+ setsockopt(conn->to, SOL_TCP, TCP_CORK, &one, sizeof(one));
+ setsockopt(conn->to, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
+
+ if (v6) {
+ sa = (struct sockaddr *)&addr6;
+ sl = sizeof(addr6);
+ } else {
+ sa = (struct sockaddr *)&addr4;
+ sl = sizeof(addr4);
+ }
+
+ if (connect(conn->to, sa, sl)) {
+ if (errno != EINPROGRESS) {
+ ret = -errno;
+ close(sock_conn);
+ return ret;
+ }
+
+ tcp_splice_state(conn, SPLICE_CONNECT);
+ ev_conn.events |= EPOLLOUT;
+ } else {
+ tcp_splice_state(conn, SPLICE_ESTABLISHED);
+ tcp_splice_connect_finish(c, conn, v6);
+
+ ev_conn.events |= EPOLLIN;
+ ev_accept.events |= EPOLLIN;
+ }
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->from, &ev_accept);
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->to, &ev_conn);
+
+ return 0;
+}
+
+/**
+ * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
+ * @c: Execution context
+ * @conn: Accepted inbound connection
+ * @v6: Set for inbound IPv6 connection
+ * @port: Destination port, host order
+ * @ret: Return value of tcp_splice_connect_ns()
+ */
+struct tcp_splice_connect_ns_arg {
+ struct ctx *c;
+ struct tcp_splice_conn *conn;
+ int v6;
+ in_port_t port;
+ int ret;
+};
+
+/**
+ * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
+ * @arg: See struct tcp_splice_connect_ns_arg
+ *
+ * Return: 0
+ */
+static int tcp_splice_connect_ns(void *arg)
+{
+ struct tcp_splice_connect_ns_arg *a;
+
+ a = (struct tcp_splice_connect_ns_arg *)arg;
+ ns_enter(a->c->pasta_pid);
+ a->ret = tcp_splice_connect(a->c, a->conn, a->v6, a->port);
+ return 0;
+}
+
+/**
+ * tcp_splice_new() - Handle new inbound, spliced connection
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @v6: Set for IPv6 connection
+ * @port: Destination port, host order
+ *
+ * Return: return code from tcp_splice_connect()
+ */
+static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn,
+ int v6, in_port_t port)
+{
+ struct tcp_splice_connect_ns_arg ns_arg = { c, conn, v6, port, 0 };
+ char ns_fn_stack[NS_FN_STACK_SIZE];
+
+ if (bitmap_isset(c->tcp.port_to_ns, port)) {
+ clone(tcp_splice_connect_ns,
+ ns_fn_stack + sizeof(ns_fn_stack) / 2,
+ CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
+ (void *)&ns_arg);
+
+ return ns_arg.ret;
+ }
+
+ return tcp_splice_connect(c, conn, v6, port);
+}
+
+/**
+ * tcp_conn_from_sock() - Handle new connection request from listening socket
+ * @c: Execution context
+ * @ref: epoll reference of listening socket
+ * @now: Current timestamp
+ */
+static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
+ struct timespec *now)
+{
+ union epoll_ref ref_conn = { .proto = IPPROTO_TCP,
+ .tcp.v6 = ref.tcp.v6 };
+ struct sockaddr_storage sa;
+ struct tcp_tap_conn *conn;
+ struct epoll_event ev;
+ socklen_t sa_len;
+ int s;
+
+ if (c->tcp.tap_conn_count >= MAX_TAP_CONNS)
+ return;
+
+ sa_len = sizeof(sa);
+ s = accept4(ref.s, (struct sockaddr *)&sa, &sa_len, SOCK_NONBLOCK);
+ if (s < 0)
+ return;
+
+ conn = &tt[c->tcp.tap_conn_count++];
+ ref_conn.tcp.index = conn - tt;
+ ref_conn.s = conn->sock = s;
+
+ if (ref.tcp.v6) {
+ struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)&sa;
+
+ if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
+ memcpy(&sa6->sin6_addr, &c->gw6, sizeof(c->gw6));
+
+ memcpy(&conn->a.a6, &sa6->sin6_addr, sizeof(conn->a.a6));
+
+ conn->sock_port = ntohs(sa6->sin6_port);
+ conn->tap_port = ref.tcp.index;
+
+ conn->seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6->sin6_addr,
+ conn->sock_port,
+ conn->tap_port,
+ now);
+
+ tcp_hash_insert(c, conn, AF_INET6, &sa6->sin6_addr);
+ } else {
+ struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa;
+
+ memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero));
+ memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one));
+
+ if (ntohl(sa4->sin_addr.s_addr) == INADDR_LOOPBACK ||
+ ntohl(sa4->sin_addr.s_addr) == INADDR_ANY)
+ sa4->sin_addr.s_addr = c->gw4;
+
+ memcpy(&conn->a.a4.a, &sa4->sin_addr, sizeof(conn->a.a4.a));
+
+ conn->sock_port = ntohs(sa4->sin_port);
+ conn->tap_port = ref.tcp.index;
+
+ conn->seq_to_tap = tcp_seq_init(c, AF_INET, &sa4->sin_addr,
+ conn->sock_port,
+ conn->tap_port,
+ now);
+
+ tcp_hash_insert(c, conn, AF_INET, &sa4->sin_addr);
+ }
+
+ conn->seq_ack_from_tap = conn->seq_to_tap + 1;
+
+ conn->tap_window = WINDOW_DEFAULT;
+ conn->ws_allowed = 1;
+
+ conn->ts_sock = conn->ts_tap = conn->ts_ack_tap = *now;
+
+ bitmap_set(tcp_act, conn - tt);
+
+ ev.events = EPOLLRDHUP;
+ ev.data.u64 = ref_conn.u64;
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->sock, &ev);
+
+ tcp_tap_state(conn, SOCK_SYN_SENT);
+ tcp_send_to_tap(c, conn, SYN, NULL, 0);
+}
+
+/**
+ * tcp_sock_handler_splice() - Handler for socket mapped to spliced connection
+ * @c: Execution context
+ * @ref: epoll reference
+ * @events: epoll events bitmap
+ */
+void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
+ uint32_t events)
+{
+ int move_from, move_to, *pipes;
+ struct tcp_splice_conn *conn;
+
+ if (ref.tcp.listen) {
+ int s;
+
+ if (c->tcp.splice_conn_count >= MAX_SPLICE_CONNS)
+ return;
+
+ if ((s = accept4(ref.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
+ return;
+
+ conn = &ts[c->tcp.splice_conn_count++];
+ conn->from = s;
+ tcp_splice_state(conn, SPLICE_ACCEPTED);
+
+ if (tcp_splice_new(c, conn, ref.tcp.v6, ref.tcp.index))
+ tcp_splice_destroy(c, conn);
+
+ return;
+ }
+
+ conn = &ts[ref.tcp.index];
+
+ if (events & EPOLLRDHUP || events & EPOLLHUP || events & EPOLLERR) {
+ tcp_splice_destroy(c, conn);
+ return;
+ }
+
+ if (events & EPOLLOUT) {
+ struct epoll_event ev = {
+ .events = EPOLLIN | EPOLLET | EPOLLRDHUP,
+ .data.u64 = ref.u64,
+ };
+
+ if (conn->state == SPLICE_CONNECT) {
+ tcp_splice_connect_finish(c, conn, ref.tcp.v6);
+ return;
+ }
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_MOD, ref.s, &ev);
+
+ move_to = ref.s;
+ if (ref.s == conn->to) {
+ move_from = conn->from;
+ pipes = conn->pipe_from_to;
+ } else {
+ move_from = conn->to;
+ pipes = conn->pipe_to_from;
+ }
+ } else {
+ move_from = ref.s;
+ if (ref.s == conn->from) {
+ move_to = conn->to;
+ pipes = conn->pipe_from_to;
+ } else {
+ move_to = conn->from;
+ pipes = conn->pipe_to_from;
+ }
+ }
+
+swap:
+ while (1) {
+ int retry_write = 1, no_read = 1;
+ ssize_t ret, nr = 0, nw;
+
+retry:
+ ret = splice(move_from, NULL, pipes[1], NULL, PIPE_SIZE,
+ SPLICE_F_MOVE);
+ if (ret < 0) {
+ if (errno == EAGAIN) {
+ nr = PIPE_SIZE;
+ } else {
+ tcp_splice_destroy(c, conn);
+ return;
+ }
+ } else if (!ret && no_read) {
+ break;
+ } else if (ret) {
+ no_read = 0;
+ nr += ret;
+ }
+
+ nw = splice(pipes[0], NULL, move_to, NULL, nr, SPLICE_F_MOVE);
+ if (nw < 0) {
+ if (errno == EAGAIN) {
+ struct epoll_event ev = {
+ .events = EPOLLIN | EPOLLOUT | EPOLLET |
+ EPOLLRDHUP
+ };
+
+ if (no_read)
+ break;
+
+ if (retry_write--)
+ goto retry;
+
+ ref.s = move_to;
+ ev.data.u64 = ref.u64,
+ epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move_to,
+ &ev);
+ break;
+ }
+ tcp_splice_destroy(c, conn);
+ return;
+ }
+ }
+
+ if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
+ events = EPOLLIN;
+
+ SWAP(move_from, move_to);
+ if (pipes == conn->pipe_from_to)
+ pipes = conn->pipe_to_from;
+ else
+ pipes = conn->pipe_from_to;
+
+ goto swap;
+ }
}
/**
* tcp_sock_handler() - Handle new data from socket
* @c: Execution context
- * @s: File descriptor number for socket
+ * @ref: epoll reference
* @events: epoll events bitmap
- * @pkt_buf: Buffer to receive packets, currently unused
* @now: Current timestamp
*/
-void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
+void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now)
{
- int accept = -1;
- socklen_t sl;
+ struct tcp_tap_conn *conn;
- (void)pkt_buf;
-
- sl = sizeof(accept);
+ if (ref.tcp.splice) {
+ tcp_sock_handler_splice(c, ref, events);
+ return;
+ }
- if (tc[s].s == LAST_ACK) {
- tcp_send_to_tap(c, s, ACK, NULL, 0);
- tcp_close_and_epoll_del(c, s);
+ if (ref.tcp.listen) {
+ tcp_conn_from_sock(c, ref, now);
return;
}
- if (tc[s].s == SOCK_SYN_SENT) {
- /* This can only be a socket error or a shutdown from remote */
- tcp_rst(c, s);
+ conn = &tt[ref.tcp.index];
+
+ if (conn->state == LAST_ACK) {
+ tcp_send_to_tap(c, conn, ACK, NULL, 0);
+ tcp_tap_destroy(c, conn);
return;
}
- if (IN_INTERVAL(c->tcp.fd_listen_min, c->tcp.fd_listen_max, s) &&
- !IN_INTERVAL(c->tcp.fd_conn_min, c->tcp.fd_conn_max, s))
- accept = 1;
- else if (IN_INTERVAL(c->tcp.fd_conn_min, c->tcp.fd_conn_max, s) &&
- !IN_INTERVAL(c->tcp.fd_listen_min, c->tcp.fd_listen_max, s))
- accept = 0;
- else if (getsockopt(s, SOL_SOCKET, SO_ACCEPTCONN, &accept, &sl))
- accept = -1;
-
- if ((events & EPOLLERR) || accept == -1) {
- if (tc[s].s != CLOSED)
- tcp_rst(c, s);
+
+ if (conn->state == SOCK_SYN_SENT) {
+ /* This can only be a socket error or a shutdown from remote */
+ tcp_rst(c, conn);
return;
}
- if (accept) {
- tcp_conn_from_sock(c, s, now);
+ if (events & EPOLLERR) {
+ if (conn->state != CLOSED)
+ tcp_rst(c, conn);
return;
}
if (events & EPOLLOUT) { /* Implies TAP_SYN_SENT */
- tcp_connect_finish(c, s);
+ tcp_connect_finish(c, conn, ref);
return;
}
- if (tc[s].s == ESTABLISHED)
- tcp_data_from_sock(c, s, now);
-
- if (events & EPOLLRDHUP || events & EPOLLHUP) {
- if (tc[s].s == ESTABLISHED) {
- tcp_set_state(s, ESTABLISHED_SOCK_FIN);
- shutdown(s, SHUT_RD);
- tcp_data_from_sock(c, s, now);
- tcp_send_to_tap(c, s, FIN | ACK, NULL, 0);
- } else if (tc[s].s == FIN_WAIT_1) {
- tcp_set_state(s, FIN_WAIT_1_SOCK_FIN);
- shutdown(s, SHUT_RD);
- tcp_data_from_sock(c, s, now);
- tcp_send_to_tap(c, s, FIN | ACK, NULL, 0);
- tcp_sock_consume(s, tc[s].seq_ack_from_tap);
+ if (conn->state == ESTABLISHED)
+ tcp_data_from_sock(c, conn, now);
+
+ if (events & (EPOLLRDHUP | EPOLLHUP)) {
+ if (conn->state == ESTABLISHED) {
+ tcp_tap_state(conn, ESTABLISHED_SOCK_FIN);
+ shutdown(conn->sock, SHUT_RD);
+ tcp_data_from_sock(c, conn, now);
+ tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
+ } else if (conn->state == FIN_WAIT_1) {
+ tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN);
+ shutdown(conn->sock, SHUT_RD);
+ tcp_data_from_sock(c, conn, now);
+ tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
+ tcp_sock_consume(conn, conn->seq_ack_from_tap);
} else {
- tcp_close_and_epoll_del(c, s);
+ tcp_tap_destroy(c, conn);
}
}
}
/**
+ * tcp_sock_init_ns() - Bind sockets in namespace for inbound connections
+ * @arg: Execution context
+ *
+ * Return: 0, always: return values of sock_l4() are not checked
+ */
+static int tcp_sock_init_ns(void *arg)
+{
+ union tcp_epoll_ref tref = { .listen = 1, .splice = 1 };
+ struct ctx *c = (struct ctx *)arg;
+ in_port_t port;
+
+ ns_enter(c->pasta_pid);
+
+ for (port = 0; !PORT_IS_EPHEMERAL(port); port++) {
+ if (!bitmap_isset(c->tcp.port_to_init, port))
+ continue;
+
+ tref.index = port;
+
+ if (c->v4) {
+ tref.v6 = 0;
+ sock_l4(c, AF_INET, IPPROTO_TCP, port, 1, tref.u32);
+ }
+
+ if (c->v6) {
+ tref.v6 = 1;
+ sock_l4(c, AF_INET6, IPPROTO_TCP, port, 1, tref.u32);
+ }
+ }
+
+ return 0;
+}
+
+/**
* tcp_sock_init() - Bind sockets for inbound connections, get key for sequence
* @c: Execution context
*
@@ -1502,28 +2105,40 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
*/
int tcp_sock_init(struct ctx *c)
{
+ union tcp_epoll_ref tref = { .listen = 1 };
+ char ns_fn_stack[NS_FN_STACK_SIZE];
in_port_t port;
- int s = 0;
- c->tcp.fd_min = c->tcp.fd_listen_min = c->tcp.fd_conn_min = INT_MAX;
- c->tcp.fd_max = c->tcp.fd_listen_max = c->tcp.fd_conn_max = 0;
- CHECK_SET_MIN_MAX(c->tcp.fd_listen_, s);
+ getrandom(&c->tcp.hash_secret, sizeof(c->tcp.hash_secret), GRND_RANDOM);
for (port = 0; !PORT_IS_EPHEMERAL(port); port++) {
+ if (bitmap_isset(c->tcp.port_to_ns, port))
+ tref.splice = 1;
+ else if (bitmap_isset(c->tcp.port_to_tap, port))
+ tref.splice = 0;
+ else
+ continue;
+
+ tref.index = port;
+
if (c->v4) {
- if ((s = sock_l4(c, AF_INET, IPPROTO_TCP, port)) < 0)
- return -1;
- CHECK_SET_MIN_MAX(c->tcp.fd_listen_, s);
+ tref.v6 = 0;
+ sock_l4(c, AF_INET, IPPROTO_TCP, port, tref.splice,
+ tref.u32);
}
if (c->v6) {
- if ((s = sock_l4(c, AF_INET6, IPPROTO_TCP, port)) < 0)
- return -1;
- CHECK_SET_MIN_MAX(c->tcp.fd_listen_, s);
+ tref.v6 = 1;
+ sock_l4(c, AF_INET6, IPPROTO_TCP, port, tref.splice,
+ tref.u32);
}
}
- getrandom(&c->tcp.hash_secret, sizeof(c->tcp.hash_secret), GRND_RANDOM);
+ if (c->mode == MODE_PASTA) {
+ clone(tcp_sock_init_ns, ns_fn_stack + sizeof(ns_fn_stack) / 2,
+ CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
+ (void *)c);
+ }
return 0;
}
@@ -1531,69 +2146,79 @@ int tcp_sock_init(struct ctx *c)
/**
* tcp_timer_one() - Handler for timed events on one socket
* @c: Execution context
- * @s: File descriptor number for socket
+ * @conn: Connection pointer
* @ts: Timestamp from caller
*/
-static void tcp_timer_one(struct ctx *c, int s, struct timespec *ts)
+static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn,
+ struct timespec *ts)
{
- int ack_tap_ms = timespec_diff_ms(ts, &tc[s].ts_ack_tap);
- int sock_ms = timespec_diff_ms(ts, &tc[s].ts_tap);
- int tap_ms = timespec_diff_ms(ts, &tc[s].ts_tap);
+ int ack_tap_ms = timespec_diff_ms(ts, &conn->ts_ack_tap);
+ int sock_ms = timespec_diff_ms(ts, &conn->ts_tap);
+ int tap_ms = timespec_diff_ms(ts, &conn->ts_tap);
- switch (tc[s].s) {
+ switch (conn->state) {
case SOCK_SYN_SENT:
case TAP_SYN_RCVD:
if (ack_tap_ms > SYN_TIMEOUT)
- tcp_rst(c, s);
+ tcp_rst(c, conn);
break;
case ESTABLISHED_SOCK_FIN:
if (ack_tap_ms > FIN_TIMEOUT) {
- tcp_rst(c, s);
+ tcp_rst(c, conn);
break;
}
/* Falls through */
case ESTABLISHED:
- if (tap_ms > ACT_TIMEOUT && sock_ms > ACT_TIMEOUT)
- tcp_rst(c, s);
+ if (tap_ms > ACT_TIMEOUT && sock_ms > ACT_TIMEOUT) {
+ tcp_rst(c, conn);
+ break;
+ }
- if (tc[s].seq_to_tap == tc[s].seq_ack_from_tap &&
- tc[s].seq_from_tap == tc[s].seq_ack_to_tap) {
- tc[s].ts_sock = *ts;
+ if (conn->seq_to_tap == conn->seq_ack_from_tap &&
+ conn->seq_from_tap == conn->seq_ack_to_tap) {
+ conn->ts_sock = *ts;
break;
}
if (sock_ms > ACK_INTERVAL) {
- if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap)
- tcp_send_to_tap(c, s, 0, NULL, 0);
+ if (conn->seq_from_tap > conn->seq_ack_to_tap)
+ tcp_send_to_tap(c, conn, ACK, NULL, 0);
}
if (ack_tap_ms > ACK_TIMEOUT) {
- if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap) {
- tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
- tc[s].ts_ack_tap = *ts;
- tcp_data_from_sock(c, s, ts);
+ if (conn->seq_ack_from_tap < conn->seq_to_tap) {
+ if (ack_tap_ms > 10 * ACK_TIMEOUT) {
+ tcp_rst(c, conn);
+ break;
+ }
+
+ conn->seq_to_tap = conn->seq_ack_from_tap;
+ tcp_data_from_sock(c, conn, ts);
}
}
- if (tc[s].seq_from_tap == tc[s].seq_ack_to_tap)
- tc[s].ts_sock = *ts;
+ if (conn->seq_from_tap == conn->seq_ack_to_tap)
+ conn->ts_sock = *ts;
break;
case CLOSE_WAIT:
case FIN_WAIT_1:
if (sock_ms > FIN_TIMEOUT)
- tcp_rst(c, s);
+ tcp_rst(c, conn);
break;
case FIN_WAIT_1_SOCK_FIN:
if (ack_tap_ms > FIN_TIMEOUT)
- tcp_rst(c, s);
+ tcp_rst(c, conn);
break;
case LAST_ACK:
if (sock_ms > LAST_ACK_TIMEOUT)
- tcp_rst(c, s);
+ tcp_rst(c, conn);
break;
case TAP_SYN_SENT:
+ case SPLICE_ACCEPTED:
+ case SPLICE_CONNECT:
+ case SPLICE_ESTABLISHED:
case CLOSED:
break;
}
@@ -1613,8 +2238,10 @@ void tcp_timer(struct ctx *c, struct timespec *ts)
for (i = 0; i < sizeof(tcp_act) / sizeof(long); i++, word++) {
tmp = *word;
while ((n = ffsl(tmp))) {
+ int index = i * sizeof(long) * 8 + n - 1;
+
tmp &= ~(1UL << (n - 1));
- tcp_timer_one(c, i * sizeof(long) * 8 + n - 1, ts);
+ tcp_timer_one(c, &tt[index], ts);
}
}
}
diff --git a/tcp.h b/tcp.h
index 7435c41..6a9aa4a 100644
--- a/tcp.h
+++ b/tcp.h
@@ -3,9 +3,12 @@
#define TCP_TIMER_INTERVAL 20 /* ms */
+#define TCP_MAX_CONNS (128 * 1024)
+#define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2)
+
struct ctx;
-void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
+void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
int tcp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now);
@@ -13,24 +16,40 @@ int tcp_sock_init(struct ctx *c);
void tcp_timer(struct ctx *c, struct timespec *ts);
/**
+ * union tcp_epoll_ref - epoll reference portion for TCP connections
+ * @listen: Set if this file descriptor is a listening socket
+ * @splice: Set if descriptor is associated to a spliced connection
+ * @v6: Set for IPv6 sockets or connections
+ * @index: Index of connection in table, or port for bound sockets
+ * @u32: Opaque u32 value of reference
+ */
+union tcp_epoll_ref {
+ struct {
+ uint32_t listen:1,
+ splice:1,
+ v6:1,
+ index:20;
+ };
+ uint32_t u32;
+};
+
+/**
* struct tcp_ctx - Execution context for TCP routines
* @hash_secret: 128-bit secret for hash functions, ISN and hash table
- * @fd_min: Lowest file descriptor number for TCP ever used
- * @fd_max: Highest file descriptor number for TCP ever used
- * @fd_listen_min: Lowest file descriptor number for listening sockets
- * @fd_listen_max: Highest file descriptor number for listening sockets
- * @fd_conn_min: Lowest file descriptor number for connected sockets
- * @fd_conn_max: Highest file descriptor number for connected sockets
+ * @tap_conn_count: Count of tap connections in connection table
+ * @splice_conn_count: Count of spliced connections in connection table
+ * @port_to_tap: Ports bound host/init-side, packets to guest/tap
+ * @port_to_init: Ports bound namespace-side, spliced to init
+ * @port_to_ns: Ports bound init-side, spliced to namespace
* @timer_run: Timestamp of most recent timer run
*/
struct tcp_ctx {
uint64_t hash_secret[2];
- int fd_min;
- int fd_max;
- int fd_listen_min;
- int fd_listen_max;
- int fd_conn_min;
- int fd_conn_max;
+ int tap_conn_count;
+ int splice_conn_count;
+ uint8_t port_to_tap [USHRT_MAX / 8];
+ uint8_t port_to_init [USHRT_MAX / 8];
+ uint8_t port_to_ns [USHRT_MAX / 8];
struct timespec timer_run;
};
diff --git a/udp.c b/udp.c
index 46a3302..d64d59b 100644
--- a/udp.c
+++ b/udp.c
@@ -1,12 +1,15 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* udp.c - UDP L2-L4 translation routines
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
/**
@@ -17,23 +20,77 @@
* with two purposes:
* - binding ephemeral ports when they're used as source port by the guest, so
* that replies on those ports can be forwarded back to the guest, with a
- * fixed 180s timeout for this binding
+ * fixed timeout for this binding
* - packets received from the local host get their source changed to a local
* address (gateway address) so that they can be forwarded to the guest, and
* packets sent as replies by the guest need their destination address to
* be changed back to the address of the local host. This is dynamic to allow
* connections from the gateway as well, and uses the same fixed 180s timeout
*
- * Sockets for ephemeral and non-ephemeral ports are created and at
- * initialisation time, one set for IPv4 and one for IPv6. Non-ephemeral ports
- * are bound at initialisation time, ephemeral ports are bound dynamically.
+ * Sockets for bound ports are created at initialisation time, one set for IPv4
+ * and one for IPv6.
*
* Packets are forwarded back and forth, by prepending and stripping UDP headers
* in the obvious way, with no port translation.
*
+ * In PASTA mode, the L2-L4 translation is skipped for connections to ports
+ * bound between namespaces using the loopback interface, messages are directly
+ * transferred between L4 sockets instead. These are called spliced connections
+ * for consistency with the TCP implementation, but the splice() syscall isn't
+ * actually used as it wouldn't make sense for datagram-based connections: a
+ * pair of recvmmsg() and sendmmsg() deals with this case.
+ *
+ * The connection tracking for PASTA mode is slightly complicated by the absence
+ * of actual connections, see struct udp_splice_port, and these examples:
+ *
+ * - from init to namespace:
+ *
+ * - forward direction: 127.0.0.1:5000 -> 127.0.0.1:80 in init from bound
+ * socket s, with epoll reference: index = 80, splice = UDP_TO_NS
+ * - if udp_splice_map[V4][5000].ns_conn_sock:
+ * - send packet to udp_splice_map[V4][5000].ns_conn_sock
+ * - otherwise:
+ * - create new socket udp_splice_map[V4][5000].ns_conn_sock
+ * - connect in namespace to 127.0.0.1:80
+ * - get source port of new connected socket (10000) with getsockname()
+ * - add to epoll with reference: index = 10000, splice = UDP_BACK_TO_INIT
+ * - set udp_splice_map[V4][10000].init_bound_sock to s
+ * - set udp_splice_map[V4][10000].init_dst_port to 5000
+ * - update udp_splice_map[V4][5000].ns_conn_ts with current time
+ *
+ * - reverse direction: 127.0.0.1:80 -> 127.0.0.1:10000 in namespace from
+ * connected socket s, having epoll reference: index = 10000,
+ * splice = UDP_BACK_TO_INIT
+ * - if udp_splice_map[V4][10000].init_bound_sock:
+ * - send to udp_splice_map[V4][10000].init_bound_sock, with destination
+ * port udp_splice_map[V4][10000].init_dst_port (5000)
+ * - otherwise, discard
+ *
+ * - from namespace to init:
+ *
+ * - forward direction: 127.0.0.1:2000 -> 127.0.0.1:22 in namespace from bound
+ * socket s, with epoll reference: index = 22, splice = UDP_TO_INIT
+ * - if udp_splice_map[V4][2000].init_conn_sock:
+ * - send packet to udp_splice_map[V4][2000].init_conn_sock
+ * - otherwise:
+ * - create new socket udp_splice_map[V4][2000].init_conn_sock
+ * - connect in init to 127.0.0.1:22
+ * - get source port of new connected socket (4000) with getsockname()
+ * - add to epoll with reference: index = 4000, splice = UDP_BACK_TO_NS
+ * - set udp_splice_map[V4][4000].ns_bound_sock to s
+ * - set udp_splice_map[V4][4000].ns_dst_port to 2000
+ * - update udp_splice_map[V4][2000].init_conn_ts with current time
+ *
+ * - reverse direction: 127.0.0.1:22 -> 127.0.0.1:4000 in init from connected
+ * socket s, having epoll reference: index = 4000, splice = UDP_BACK_TO_NS
+ * - if udp_splice_map[V4][4000].ns_bound_sock:
+ * - send to udp_splice_map[V4][4000].ns_bound_sock, with destination port
+ * udp_splice_map[V4][4000].ns_dst_port (2000)
+ * - otherwise, discard
*/
#define _GNU_SOURCE
+#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <limits.h>
@@ -53,252 +110,373 @@
#include <linux/udp.h>
#include <time.h>
+#include "util.h"
#include "passt.h"
#include "tap.h"
-#include "util.h"
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
+#define UDP_SPLICE_FRAMES 128
-struct udp_port {
- int s;
- time_t ts_ephemeral;
+/**
+ * struct udp_tap_port - Port tracking based on tap-facing source port
+ * @sock: Socket bound to source port used as index
+ * @ts: Activity timestamp from tap, used for socket aging
+ * @ts_local: Timestamp of tap packet to gateway address, aging for local bind
+ */
+struct udp_tap_port {
+ int sock;
+ time_t ts;
time_t ts_local;
};
-static struct udp_port up4[USHRT_MAX];
-static struct udp_port up6[USHRT_MAX];
+/**
+ * struct udp_splice_port - Source port tracking for traffic between namespaces
+ * @ns_conn_sock: Socket connected in namespace for init source port
+ * @init_conn_sock: Socket connected in init for namespace source port
+ * @ns_conn_ts: Timestamp of activity for socket connected in namespace
+ * @init_conn_ts: Timestamp of activity for socket connected in init
+ * @ns_dst_port: Destination port in namespace for init source port
+ * @init_dst_port: Destination port in init for namespace source port
+ * @ns_bound_sock: Bound socket in namespace for this source port in init
+ * @init_bound_sock: Bound socket in init for this source port in namespace
+ */
+struct udp_splice_port {
+ int ns_conn_sock;
+ int init_conn_sock;
+
+ time_t ns_conn_ts;
+ time_t init_conn_ts;
+
+ in_port_t ns_dst_port;
+ in_port_t init_dst_port;
+
+ int ns_bound_sock;
+ int init_bound_sock;
+};
+
+/* Port tracking, arrays indexed by packet source port (host order) */
+static struct udp_tap_port udp_tap_map [IP_VERSIONS][USHRT_MAX];
+static struct udp_splice_port udp_splice_map [IP_VERSIONS][USHRT_MAX];
+
+enum udp_act_type {
+ UDP_ACT_TAP,
+ UDP_ACT_NS_CONN,
+ UDP_ACT_INIT_CONN,
+ UDP_ACT_TYPE_MAX,
+};
+
+/* Activity-based aging for bindings */
+static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][USHRT_MAX / 8];
+
+/* recvmmsg()/sendmmsg() data */
+static struct sockaddr_storage udp_splice_namebuf;
+static uint8_t udp_splice_buf[UDP_SPLICE_FRAMES][USHRT_MAX];
+
+static struct iovec udp_splice_iov_recv [UDP_SPLICE_FRAMES];
+static struct mmsghdr udp_splice_mmh_recv [UDP_SPLICE_FRAMES];
+
+static struct iovec udp_splice_iov_send [UDP_SPLICE_FRAMES];
+static struct mmsghdr udp_splice_mmh_send [UDP_SPLICE_FRAMES];
-/* Bitmaps, activity monitoring needed for port */
-static uint8_t udp4_act[USHRT_MAX / 8];
-static uint8_t udp6_act[USHRT_MAX / 8];
+static struct iovec udp_splice_iov_sendto [UDP_SPLICE_FRAMES];
+static struct mmsghdr udp_splice_mmh_sendto [UDP_SPLICE_FRAMES];
/**
- * udp_act_set() - Set port in bitmap for timed events
- * @af: Protocol family
- * @s: Port number
+ * udp_splice_connect() - Create and connect socket for "spliced" binding
+ * @c: Execution context
+ * @v6: Set for IPv6 connections
+ * @bound_sock: Originating bound socket
+ * @src: Source port of original connection, host order
+ * @dst: Destination port of original connection, host order
+ * @splice: UDP_BACK_TO_INIT from init, UDP_BACK_TO_NS from namespace
+ *
+ * Return: connected socket, negative error code on failure
*/
-static void udp_act_set(int af, int p)
+int udp_splice_connect(struct ctx *c, int v6, int bound_sock,
+ in_port_t src, in_port_t dst, int splice)
{
- if (af == AF_INET)
- udp4_act[p / 8] |= 1 << (p % 8);
+ struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP };
+ union epoll_ref ref = { .proto = IPPROTO_UDP,
+ .udp = { .splice = splice, .v6 = v6 }
+ };
+ struct sockaddr_storage sa;
+ struct udp_splice_port *sp;
+ socklen_t sl = sizeof(sa);
+ int s;
+
+ s = socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK,
+ IPPROTO_UDP);
+ if (s < 0)
+ return s;
+ ref.s = s;
+
+ if (v6) {
+ struct sockaddr_in6 addr6 = {
+ .sin6_family = AF_INET6,
+ .sin6_port = htons(dst),
+ .sin6_addr = IN6ADDR_LOOPBACK_INIT,
+ };
+ if (connect(s, (struct sockaddr *)&addr6, sizeof(addr6)))
+ goto fail;
+ } else {
+ struct sockaddr_in addr4 = {
+ .sin_family = AF_INET,
+ .sin_port = htons(dst),
+ .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
+ };
+ if (connect(s, (struct sockaddr *)&addr4, sizeof(addr4)))
+ goto fail;
+ }
+
+ if (getsockname(s, (struct sockaddr *)&sa, &sl))
+ goto fail;
+
+ if (v6)
+ ref.udp.port = ntohs(((struct sockaddr_in6 *)&sa)->sin6_port);
else
- udp6_act[p / 8] |= 1 << (p % 8);
+ ref.udp.port = ntohs(((struct sockaddr_in *)&sa)->sin_port);
+
+ sp = &udp_splice_map[v6 ? V6 : V4][ref.udp.port];
+ if (splice == UDP_BACK_TO_INIT) {
+ sp->init_bound_sock = bound_sock;
+ sp->init_dst_port = src;
+ udp_splice_map[v6 ? V6 : V4][src].ns_conn_sock = s;
+ bitmap_set(udp_act[v6 ? V6 : V4][UDP_ACT_NS_CONN], src);
+ } else if (splice == UDP_BACK_TO_NS) {
+ sp->ns_bound_sock = bound_sock;
+ sp->ns_dst_port = src;
+ udp_splice_map[v6 ? V6 : V4][src].init_conn_sock = s;
+ bitmap_set(udp_act[v6 ? V6 : V4][UDP_ACT_INIT_CONN], src);
+ }
+
+ ev.data.u64 = ref.u64;
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
+ return s;
+
+fail:
+ close(s);
+ return -1;
}
/**
- * udp_act_clear() - Clear port from bitmap for timed events
- * @af: Protocol family
- * @s: Port number
+ * struct udp_splice_connect_ns_arg - Arguments for udp_splice_connect_ns()
+ * @c: Execution context
+ * @v6: Set for inbound IPv6 connection
+ * @bound_sock: Originating bound socket
+ * @src: Source port of original connection, host order
+ * @dst: Destination port of original connection, host order
+ * @s: Newly created socket or negative error code
*/
-static void udp_act_clear(int af, int p)
+struct udp_splice_connect_ns_arg {
+ struct ctx *c;
+ int v6;
+ int bound_sock;
+ in_port_t src;
+ in_port_t dst;
+ int s;
+};
+
+/**
+ * udp_splice_connect_ns() - Enter namespace and call udp_splice_connect()
+ * @arg: See struct udp_splice_connect_ns_arg
+ *
+ * Return: 0
+ */
+static int udp_splice_connect_ns(void *arg)
{
- if (af == AF_INET)
- udp4_act[p / 8] &= ~(1 << (p % 8));
- else
- udp6_act[p / 8] &= ~(1 << (p % 8));
+ struct udp_splice_connect_ns_arg *a;
+
+ a = (struct udp_splice_connect_ns_arg *)arg;
+
+ ns_enter(a->c->pasta_pid);
+ a->s = udp_splice_connect(a->c, a->v6, a->bound_sock, a->src, a->dst,
+ UDP_BACK_TO_INIT);
+
+ return 0;
}
/**
- * udp_sock_handler_local() - Replace address if local, update timestamp
+ * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection
* @c: Execution context
- * @sa: Socket address as struct sockaddr_in or sockaddr_in6
+ * @ref: epoll reference
+ * @events: epoll events bitmap
* @now: Current timestamp
*/
-static void udp_sock_handler_local(struct ctx *c, int af, void *sa,
- struct timespec *now)
+static void udp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
+ uint32_t events, struct timespec *now)
{
- if (af == AF_INET) {
- struct sockaddr_in *s_in = (struct sockaddr_in *)sa;
+ struct msghdr *mh = &udp_splice_mmh_recv[0].msg_hdr;
+ struct sockaddr_storage *sa_s = mh->msg_name;
+ in_port_t src, dst = ref.udp.port, send_dst;
+ char ns_fn_stack[NS_FN_STACK_SIZE];
+ int s, v6 = ref.udp.v6, n, i;
- s_in->sin_addr.s_addr = c->gw4;
+ if (!(events & EPOLLIN))
+ return;
- up4[ntohs(s_in->sin_port)].ts_local = now->tv_sec;
- udp_act_set(AF_INET, ntohs(s_in->sin_port));
- } else {
- struct sockaddr_in6 *s_in6 = (struct sockaddr_in6 *)sa;
+ n = recvmmsg(ref.s, udp_splice_mmh_recv, UDP_SPLICE_FRAMES, 0, NULL);
+
+ if (n <= 0)
+ return;
- memcpy(&s_in6->sin6_addr, &c->gw6, sizeof(c->gw6));
+ if (v6) {
+ struct sockaddr_in6 *sa = (struct sockaddr_in6 *)sa_s;
- up6[ntohs(s_in6->sin6_port)].ts_local = now->tv_sec;
- udp_act_set(AF_INET6, ntohs(s_in6->sin6_port));
+ src = htons(sa->sin6_port);
+ } else {
+ struct sockaddr_in *sa = (struct sockaddr_in *)sa_s;
+
+ src = ntohs(sa->sin_port);
}
-}
-/**
- * udp_sock_name() - Get address family and port for bound UDP socket
- * @c: Execution context
- * @s: File descriptor number for socket
- * @port: Local port, set on return, network order
- *
- * Return: address family, AF_INET or AF_INET6, negative error code on failure
- */
-static int udp_sock_name(struct ctx *c, int s, in_port_t *port)
-{
- if (!c->udp.fd_in_seq) {
- struct sockaddr_storage sa;
- socklen_t sl;
+ switch (ref.udp.splice) {
+ case UDP_TO_NS:
+ if (!(s = udp_splice_map[v6][src].ns_conn_sock)) {
+ struct udp_splice_connect_ns_arg arg = {
+ c, v6, ref.s, src, dst, -1,
+ };
- sl = sizeof(sa);
- if (getsockname(s, (struct sockaddr *)&sa, &sl))
- return -errno;
+ clone(udp_splice_connect_ns,
+ ns_fn_stack + sizeof(ns_fn_stack) / 2,
+ CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
+ (void *)&arg);
- if (sa.ss_family == AF_INET) {
- *port = ((struct sockaddr_in *)&sa)->sin_port;
- return AF_INET;
+ if ((s = arg.s) < 0)
+ return;
}
+ udp_splice_map[v6][src].ns_conn_ts = now->tv_sec;
+ break;
+ case UDP_BACK_TO_INIT:
+ if (!(s = udp_splice_map[v6][dst].init_bound_sock))
+ return;
- if (sa.ss_family == AF_INET6) {
- *port = ((struct sockaddr_in6 *)&sa)->sin6_port;
- return AF_INET6;
+ send_dst = udp_splice_map[v6][dst].init_dst_port;
+ break;
+ case UDP_TO_INIT:
+ if (!(s = udp_splice_map[v6][src].init_conn_sock)) {
+ s = udp_splice_connect(c, v6, ref.s, src, dst,
+ UDP_BACK_TO_NS);
+ if (s < 0)
+ return;
}
+ udp_splice_map[v6][src].init_conn_ts = now->tv_sec;
+ break;
+ case UDP_BACK_TO_NS:
+ if (!(s = udp_splice_map[v6][dst].ns_bound_sock))
+ return;
+
+ send_dst = udp_splice_map[v6][dst].ns_dst_port;
+ break;
+ default:
+ return;
+ }
+
+ if (ref.udp.splice == UDP_TO_NS || ref.udp.splice == UDP_TO_INIT) {
+ for (i = 0; i < n; i++) {
+ struct msghdr *mh = &udp_splice_mmh_send[i].msg_hdr;
+
+ mh->msg_iov->iov_len = udp_splice_mmh_recv[i].msg_len;
+ }
+
+ sendmmsg(s, udp_splice_mmh_send, n, MSG_NOSIGNAL);
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ struct msghdr *mh = &udp_splice_mmh_sendto[i].msg_hdr;
- return -ENOTSUP;
+ mh->msg_iov->iov_len = udp_splice_mmh_recv[i].msg_len;
}
- if (c->v4 && c->v6) {
- *port = htons((s - c->udp.fd_min) / 2);
- return ((s - c->udp.fd_min) % 2) ? AF_INET6 : AF_INET;
+ if (v6) {
+ *((struct sockaddr_in6 *)&udp_splice_namebuf) =
+ ((struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_LOOPBACK_INIT,
+ .sin6_port = htons(send_dst),
+ });
+ } else {
+ *((struct sockaddr_in *)&udp_splice_namebuf) =
+ ((struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
+ .sin_port = htons(send_dst),
+ });
}
- *port = htons(s - c->udp.fd_min);
- return c->v4 ? AF_INET : AF_INET6;
+ sendmmsg(s, udp_splice_mmh_sendto, n, MSG_NOSIGNAL);
}
/**
* udp_sock_handler() - Handle new data from socket
* @c: Execution context
- * @s: File descriptor number for socket
+ * @ref: epoll reference
* @events: epoll events bitmap
- * @pkt_buf: Buffer to receive packets, currently unused
* @now: Current timestamp
*/
-void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
+void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now)
{
- struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0xff, 0xff,
- 0, 0, 0, 0 } };
struct sockaddr_storage sr;
- socklen_t slen = sizeof(sr);
+ socklen_t sl = sizeof(sr);
char buf[USHRT_MAX];
struct udphdr *uh;
ssize_t n;
- int af;
-
- (void)pkt_buf;
if (events == EPOLLERR)
return;
- n = recvfrom(s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh),
- MSG_DONTWAIT, (struct sockaddr *)&sr, &slen);
- if (n < 0)
+ if (ref.udp.splice) {
+ udp_sock_handler_splice(c, ref, events, now);
return;
+ }
uh = (struct udphdr *)buf;
- af = udp_sock_name(c, s, &uh->dest);
-
- if (af == AF_INET) {
- struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr;
- if (ntohl(sr4->sin_addr.s_addr) == INADDR_LOOPBACK ||
- ntohl(sr4->sin_addr.s_addr) == INADDR_ANY)
- udp_sock_handler_local(c, AF_INET, sr4, now);
+ n = recvfrom(ref.s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh), 0,
+ (struct sockaddr *)&sr, &sl);
+ if (n < 0)
+ return;
- memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr));
- uh->source = sr4->sin_port;
- uh->len = htons(n + sizeof(*uh));
+ uh->dest = htons(ref.udp.port);
+ uh->len = htons(n + sizeof(*uh));
- tap_ip_send(c, &a6, IPPROTO_UDP, buf, n + sizeof(*uh));
- } else if (af == AF_INET6) {
+ if (ref.udp.v6) {
struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr;
- if (IN6_IS_ADDR_LOOPBACK(&sr6->sin6_addr))
- udp_sock_handler_local(c, AF_INET6, sr6, now);
+ if (IN6_IS_ADDR_LOOPBACK(&sr6->sin6_addr)) {
+ in_port_t src = htons(sr6->sin6_port);
+
+ memcpy(&sr6->sin6_addr, &c->gw6, sizeof(c->gw6));
+ udp_tap_map[V6][src].ts_local = now->tv_sec;
+ bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
+ }
uh->source = sr6->sin6_port;
- uh->len = htons(n + sizeof(*uh));
tap_ip_send(c, &sr6->sin6_addr, IPPROTO_UDP,
buf, n + sizeof(*uh));
- }
-}
-
-/**
- * udp_tap_handler_ephemeral() - Bind ephemeral source port, update timestamp
- * @af: Address family, AF_INET or AF_INET6
- * @src: Source port, host order
- * @now: Current timestamp
- */
-static void udp_tap_handler_ephemeral(int af, in_port_t src,
- struct timespec *now)
-{
- struct sockaddr *addr = NULL;
- struct sockaddr_in6 s_in6 = {
- .sin6_family = AF_INET6,
- .sin6_port = htons(src),
- .sin6_addr = IN6ADDR_ANY_INIT,
- };
- struct sockaddr_in s_in = {
- .sin_family = AF_INET,
- .sin_port = htons(src),
- .sin_addr = { .s_addr = INADDR_ANY },
- };
- socklen_t sl;
- int s;
-
- if (af == AF_INET) {
- if (!up4[src].ts_ephemeral) {
- s = up4[src].s;
- addr = (struct sockaddr *)&s_in;
- sl = sizeof(s_in);
- }
} else {
- if (!up6[src].ts_ephemeral) {
- s = up6[src].s;
- addr = (struct sockaddr *)&s_in6;
- sl = sizeof(s_in6);
- }
- }
-
- if (addr) {
- if (bind(s, addr, sl))
- return;
+ struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0xff, 0xff,
+ 0, 0, 0, 0 } };
+ struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr;
- udp_act_set(af, src);
- }
+ if (ntohl(sr4->sin_addr.s_addr) == INADDR_LOOPBACK ||
+ ntohl(sr4->sin_addr.s_addr) == INADDR_ANY) {
+ in_port_t src = htons(sr4->sin_port);
- if (af == AF_INET)
- up4[src].ts_ephemeral = now->tv_sec;
- else
- up6[src].ts_ephemeral = now->tv_sec;
-}
+ sr4->sin_addr.s_addr = c->gw4;
+ udp_tap_map[V4][src].ts_local = now->tv_sec;
+ bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
+ }
-/**
- * udp_tap_handler_local() - Set address to local if needed, update timestamp
- * @af: Address family, AF_INET or AF_INET6
- * @dst: Destination port, host order
- * @sa: Socket address as struct sockaddr_in or sockaddr_in6 to modify
- * @now: Current timestamp
- */
-static void udp_tap_handler_local(int af, in_port_t dst, void *sa,
- struct timespec *now)
-{
- if (af == AF_INET) {
- if (up4[dst].ts_local) {
- struct sockaddr_in *s_in = (struct sockaddr_in *)sa;
+ memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr));
- s_in->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- up4[dst].ts_local = now->tv_sec;
- }
- } else {
- if (up6[dst].ts_local) {
- struct sockaddr_in6 *s_in6 = (struct sockaddr_in6 *)sa;
+ uh->source = sr4->sin_port;
- s_in6->sin6_addr = in6addr_loopback;
- up6[dst].ts_local = now->tv_sec;
- }
+ tap_ip_send(c, &a6, IPPROTO_UDP, buf, n + sizeof(*uh));
}
}
@@ -306,6 +484,7 @@ static void udp_tap_handler_local(int af, in_port_t dst, void *sa,
* udp_tap_handler() - Handle packets from tap
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
+ * @addr: Destination address
* @msg: Input messages
* @count: Message count
* @now: Current timestamp
@@ -345,7 +524,24 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
sa = (struct sockaddr *)&s_in;
sl = sizeof(s_in);
- } else if (af == AF_INET6) {
+
+ if (!(s = udp_tap_map[V4][src].sock)) {
+ union udp_epoll_ref uref = { .bound = 1, .port = src };
+
+ s = sock_l4(c, AF_INET, IPPROTO_UDP, src, 0, uref.u32);
+ if (s <= 0)
+ return count;
+
+ udp_tap_map[V4][src].sock = s;
+ bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
+ }
+
+ udp_tap_map[V4][src].ts = now->tv_sec;
+
+ if (s_in.sin_addr.s_addr == c->gw4 &&
+ udp_tap_map[V4][dst].ts_local)
+ s_in.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ } else {
s_in6 = (struct sockaddr_in6) {
.sin6_family = AF_INET6,
.sin6_port = uh->dest,
@@ -354,8 +550,25 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
sa = (struct sockaddr *)&s_in6;
sl = sizeof(s_in6);
- } else {
- return count;
+
+ if (!(s = udp_tap_map[V6][src].sock)) {
+ union udp_epoll_ref uref = { .bound = 1, .v6 = 1,
+ .port = src
+ };
+
+ s = sock_l4(c, AF_INET6, IPPROTO_UDP, src, 0, uref.u32);
+ if (s <= 0)
+ return count;
+
+ udp_tap_map[V6][src].sock = s;
+ bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
+ }
+
+ udp_tap_map[V6][src].ts = now->tv_sec;
+
+ if (!memcmp(addr, &c->gw6, sizeof(c->gw6)) &&
+ udp_tap_map[V6][dst].ts_local)
+ s_in6.sin6_addr = in6addr_loopback;
}
for (i = 0; i < count; i++) {
@@ -369,28 +582,98 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
mm[i].msg_hdr.msg_iovlen = 1;
}
- if (af == AF_INET) {
- if (!(s = up4[src].s))
- return count;
+ count = sendmmsg(s, mm, count, MSG_NOSIGNAL);
+ if (count < 0)
+ return 1;
- if (s_in.sin_addr.s_addr == c->gw4)
- udp_tap_handler_local(AF_INET, dst, &s_in, now);
- } else {
- if (!(s = up6[src].s))
- return count;
+ return count;
+}
+
+/**
+ * udp_sock_init_ns() - Bind sockets in namespace for inbound connections
+ * @arg: Execution context
+ *
+ * Return: 0
+ */
+int udp_sock_init_ns(void *arg)
+{
+ union udp_epoll_ref uref = { .bound = 1, .splice = UDP_TO_INIT };
+ struct ctx *c = (struct ctx *)arg;
+ in_port_t port;
+
+ ns_enter(c->pasta_pid);
+
+ for (port = 0; port < USHRT_MAX; port++) {
+ if (!bitmap_isset(c->udp.port_to_init, port))
+ continue;
- if (!memcmp(addr, &c->gw6, sizeof(c->gw6)))
- udp_tap_handler_local(AF_INET6, dst, &s_in6, now);
+ uref.port = port;
+
+ if (c->v4) {
+ uref.v6 = 0;
+ sock_l4(c, AF_INET, IPPROTO_UDP, port, 1, uref.u32);
+ }
+
+ if (c->v6) {
+ uref.v6 = 1;
+ sock_l4(c, AF_INET6, IPPROTO_UDP, port, 1, uref.u32);
+ }
}
- if (PORT_IS_EPHEMERAL(src))
- udp_tap_handler_ephemeral(af, src, now);
+ return 0;
+}
- count = sendmmsg(s, mm, count, MSG_DONTWAIT | MSG_NOSIGNAL);
- if (count < 0)
- return 1;
+/**
+ * udp_splice_iov_init() - Set up buffers and descriptors for recvmmsg/sendmmsg
+ */
+static void udp_splice_iov_init(void)
+{
+ struct mmsghdr *h;
+ struct iovec *iov;
+ int i;
- return count;
+ for (i = 0, h = udp_splice_mmh_recv; i < UDP_SPLICE_FRAMES; i++, h++) {
+ struct msghdr *mh = &h->msg_hdr;
+
+ if (!i) {
+ mh->msg_name = &udp_splice_namebuf;
+ mh->msg_namelen = sizeof(udp_splice_namebuf);
+ }
+
+ mh->msg_iov = &udp_splice_iov_recv[i];
+ mh->msg_iovlen = 1;
+ }
+ for (i = 0, iov = udp_splice_iov_recv; i < UDP_SPLICE_FRAMES;
+ i++, iov++) {
+ iov->iov_base = udp_splice_buf[i];
+ iov->iov_len = sizeof(udp_splice_buf[i]);
+ }
+
+ for (i = 0, h = udp_splice_mmh_send; i < UDP_SPLICE_FRAMES; i++, h++) {
+ struct msghdr *mh = &h->msg_hdr;
+
+ mh->msg_iov = &udp_splice_iov_send[i];
+ mh->msg_iovlen = 1;
+ }
+ for (i = 0, iov = udp_splice_iov_send; i < UDP_SPLICE_FRAMES;
+ i++, iov++) {
+ iov->iov_base = udp_splice_buf[i];
+ }
+
+ for (i = 0, h = udp_splice_mmh_sendto; i < UDP_SPLICE_FRAMES;
+ i++, h++) {
+ struct msghdr *mh = &h->msg_hdr;
+
+ mh->msg_name = &udp_splice_namebuf;
+ mh->msg_namelen = sizeof(udp_splice_namebuf);
+
+ mh->msg_iov = &udp_splice_iov_sendto[i];
+ mh->msg_iovlen = 1;
+ }
+ for (i = 0, iov = udp_splice_iov_sendto; i < UDP_SPLICE_FRAMES;
+ i++, iov++) {
+ iov->iov_base = udp_splice_buf[i];
+ }
}
/**
@@ -401,111 +684,128 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
*/
int udp_sock_init(struct ctx *c)
{
- int s, prev = -1;
+ union udp_epoll_ref uref = { .bound = 1 };
+ char ns_fn_stack[NS_FN_STACK_SIZE];
in_port_t port;
-
- c->udp.fd_min = INT_MAX;
- c->udp.fd_max = 0;
- c->udp.fd_in_seq = 1;
+ int s;
for (port = 0; port < USHRT_MAX; port++) {
- if (c->v4) {
- if ((s = sock_l4(c, AF_INET, IPPROTO_UDP, port)) < 0)
- return -1;
+ if (bitmap_isset(c->udp.port_to_ns, port))
+ uref.splice = UDP_TO_NS;
+ else if (bitmap_isset(c->udp.port_to_tap, port))
+ uref.splice = 0;
+ else
+ continue;
- if (c->udp.fd_in_seq && prev != -1 && s != prev + 1)
- c->udp.fd_in_seq = 0;
- else
- prev = s;
+ uref.port = port;
- up4[port].s = s;
+ if (c->v4) {
+ uref.v6 = 0;
+ s = sock_l4(c, AF_INET, IPPROTO_UDP, port,
+ uref.splice == UDP_TO_NS, uref.u32);
+
+ if (!uref.splice && s > 0)
+ udp_tap_map[V4][port].sock = s;
}
if (c->v6) {
- if ((s = sock_l4(c, AF_INET6, IPPROTO_UDP, port)) < 0)
- return -1;
-
- if (c->udp.fd_in_seq && prev != -1 && s != prev + 1)
- c->udp.fd_in_seq = 0;
- else
- prev = s;
+ uref.v6 = 1;
+ s = sock_l4(c, AF_INET6, IPPROTO_UDP, port,
+ uref.splice == UDP_TO_NS, uref.u32);
- up6[port].s = s;
+ if (!uref.splice && s > 0)
+ udp_tap_map[V6][port].sock = s;
}
}
+ if (c->mode == MODE_PASTA) {
+ udp_splice_iov_init();
+ clone(udp_sock_init_ns, ns_fn_stack + sizeof(ns_fn_stack) / 2,
+ CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,
+ (void *)c);
+ }
+
return 0;
}
/**
* udp_timer_one() - Handler for timed events on one port
- * @af: Address family, AF_INET or AF_INET6
- * @p: Port number, host order
+ * @c: Execution context
+ * @v6: Set for IPv6 connections
+ * @type: Activity type, see enum udp_act_type
+ * @port: Port number, host order
* @ts: Timestamp from caller
*/
-static void udp_timer_one(struct ctx *c, int af, in_port_t p,
- struct timespec *ts)
+static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type,
+ in_port_t port, struct timespec *ts)
{
+ struct udp_splice_port *sp;
+ struct udp_tap_port *tp;
int s = -1;
- if (af == AF_INET) {
- if (ts->tv_sec - up4[p].ts_ephemeral > UDP_CONN_TIMEOUT)
- up4[p].ts_ephemeral = 0;
- if (ts->tv_sec - up4[p].ts_local > UDP_CONN_TIMEOUT)
- up4[p].ts_local = 0;
-
- if (!up4[p].ts_ephemeral && !up4[p].ts_local) {
- udp_act_clear(AF_INET, p);
- s = up4[p].s;
- }
- } else {
- if (ts->tv_sec - up6[p].ts_ephemeral > UDP_CONN_TIMEOUT)
- up6[p].ts_ephemeral = 0;
- if (ts->tv_sec - up6[p].ts_local > UDP_CONN_TIMEOUT)
- up6[p].ts_local = 0;
-
- if (!up6[p].ts_ephemeral && !up6[p].ts_local) {
- udp_act_clear(AF_INET6, p);
- s = up6[p].s;
- }
+ switch (type) {
+ case UDP_ACT_TAP:
+ tp = &udp_tap_map[v6 ? V6 : V4][port];
+
+ if (ts->tv_sec - tp->ts > UDP_CONN_TIMEOUT)
+ s = tp->sock;
+
+ if (ts->tv_sec - tp->ts_local > UDP_CONN_TIMEOUT)
+ tp->ts_local = 0;
+
+ break;
+ case UDP_ACT_INIT_CONN:
+ sp = &udp_splice_map[v6 ? V6 : V4][port];
+
+ if (ts->tv_sec - sp->init_conn_ts > UDP_CONN_TIMEOUT)
+ s = sp->init_conn_sock;
+
+ break;
+ case UDP_ACT_NS_CONN:
+ sp = &udp_splice_map[v6 ? V6 : V4][port];
+
+ if (ts->tv_sec - sp->ns_conn_ts > UDP_CONN_TIMEOUT)
+ s = sp->ns_conn_sock;
+
+ break;
+ default:
+ return;
}
if (s != -1) {
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
close(s);
- if (sock_l4(c, af, IPPROTO_UDP, p) != s)
- c->udp.fd_in_seq = 0;
+ bitmap_clear(udp_act[v6 ? V6 : V4][type], port);
}
}
/**
- * udp_timer() - Scan activity bitmap for ports with associated timed events
+ * udp_timer() - Scan activity bitmaps for ports with associated timed events
* @c: Execution context
* @ts: Timestamp from caller
*/
void udp_timer(struct ctx *c, struct timespec *ts)
{
- long *word, tmp;
+ int n, t, v6 = 0;
unsigned int i;
- int n;
-
- word = (long *)udp4_act;
- for (i = 0; i < sizeof(udp4_act) / sizeof(long); i++, word++) {
- tmp = *word;
- while ((n = ffsl(tmp))) {
- tmp &= ~(1UL << (n - 1));
- udp_timer_one(c, AF_INET,
- i * sizeof(long) * 8 + n - 1, ts);
+ long *word, tmp;
+
+v6:
+ for (t = 0; t < UDP_ACT_TYPE_MAX; t++) {
+ word = (long *)udp_act[v6 ? V6 : V4][t];
+ for (i = 0; i < sizeof(udp_act[0][0]) / sizeof(long);
+ i++, word++) {
+ tmp = *word;
+ while ((n = ffsl(tmp))) {
+ tmp &= ~(1UL << (n - 1));
+ udp_timer_one(c, v6, t,
+ i * sizeof(long) * 8 + n - 1, ts);
+ }
}
}
- word = (long *)udp6_act;
- for (i = 0; i < sizeof(udp6_act) / sizeof(long); i++, word++) {
- tmp = *word;
- while ((n = ffsl(tmp))) {
- tmp &= ~(1UL << (n - 1));
- udp_timer_one(c, AF_INET6,
- i * sizeof(long) * 8 + n - 1, ts);
- }
+ if (!v6) {
+ v6 = 1;
+ goto v6;
}
}
diff --git a/udp.h b/udp.h
index a126488..e3afa74 100644
--- a/udp.h
+++ b/udp.h
@@ -3,7 +3,7 @@
#define UDP_TIMER_INTERVAL 1000 /* ms */
-void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
+void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
int udp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now);
@@ -11,16 +11,40 @@ int udp_sock_init(struct ctx *c);
void udp_timer(struct ctx *c, struct timespec *ts);
/**
+ * union udp_epoll_ref - epoll reference portion for UDP sockets
+ * @bound: Set if this file descriptor is a bound socket
+ * @splice: Set if descriptor is associated to "spliced" connection
+ * @v6: Set for IPv6 sockets or connections
+ * @port: Source port for connected sockets, bound port otherwise
+ * @u32: Opaque u32 value of reference
+ */
+union udp_epoll_ref {
+ struct {
+ uint32_t bound:1,
+ splice:3,
+#define UDP_TO_NS 1
+#define UDP_TO_INIT 2
+#define UDP_BACK_TO_NS 3
+#define UDP_BACK_TO_INIT 4
+
+ v6:1,
+ port:16;
+ };
+ uint32_t u32;
+};
+
+
+/**
* struct udp_ctx - Execution context for UDP
- * @fd_min: Lowest file descriptor number for UDP ever used
- * @fd_max: Highest file descriptor number for UDP ever used
- * @fd_in_seq: 1 if all socket numbers are in sequence, 0 otherwise
+ * @port_to_tap: Ports bound host/init-side, packets to guest/tap
+ * @port_to_init: Ports bound namespace-side, spliced to init
+ * @port_to_ns: Ports bound init-side, spliced to namespace
* @timer_run: Timestamp of most recent timer run
*/
struct udp_ctx {
- int fd_min;
- int fd_max;
- int fd_in_seq;
+ uint8_t port_to_tap [USHRT_MAX / 8];
+ uint8_t port_to_init [USHRT_MAX / 8];
+ uint8_t port_to_ns [USHRT_MAX / 8];
struct timespec timer_run;
};
diff --git a/util.c b/util.c
index 59a0cb2..1372eec 100644
--- a/util.c
+++ b/util.c
@@ -1,14 +1,19 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* util.c - Convenience helpers
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
+#define _GNU_SOURCE
+#include <sched.h>
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
@@ -20,13 +25,16 @@
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
#include <syslog.h>
#include <stdarg.h>
#include <string.h>
#include <time.h>
-#include "passt.h"
#include "util.h"
+#include "passt.h"
#ifdef DEBUG
#define logfn(name, level) \
@@ -183,73 +191,72 @@ char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto)
* sock_l4() - Create and bind socket for given L4, add to epoll list
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
- * @proto: Protocol number, host order
+ * @proto: Protocol number
* @port: Port, host order
+ * @lo: Bind to loopback address only, if set
+ * @data: epoll reference portion for protocol handlers
*
* Return: newly created socket, -1 on error
*/
-int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port)
+int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo,
+ uint32_t data)
{
+ union epoll_ref ref = { .proto = proto, .data = data };
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
.sin_port = htons(port),
- .sin_addr = { .s_addr = INADDR_ANY },
};
struct sockaddr_in6 addr6 = {
.sin6_family = AF_INET6,
.sin6_port = htons(port),
- .sin6_addr = IN6ADDR_ANY_INIT,
};
- struct epoll_event ev = { 0 };
const struct sockaddr *sa;
+ struct epoll_event ev;
int fd, sl, one = 1;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6)
return -1; /* Not implemented. */
- fd = socket(af, proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM, proto);
+ if (proto == IPPROTO_TCP)
+ fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto);
+ else
+ fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto);
if (fd < 0) {
perror("L4 socket");
return -1;
}
+ ref.s = fd;
if (af == AF_INET) {
+ if (lo)
+ addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ else
+ addr4.sin_addr.s_addr = htonl(INADDR_ANY);
+
sa = (const struct sockaddr *)&addr4;
sl = sizeof(addr4);
} else {
+ if (lo)
+ addr6.sin6_addr = in6addr_loopback;
+ else
+ addr6.sin6_addr = in6addr_any;
+
sa = (const struct sockaddr *)&addr6;
sl = sizeof(addr6);
setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one));
}
- CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMP, icmp, fd);
- CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMPV6, icmp, fd);
- CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_TCP, tcp, fd);
- CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_UDP, udp, fd);
-
- if (proto == IPPROTO_UDP && PORT_IS_EPHEMERAL(port))
- goto epoll_add;
-
- if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
- goto epoll_add;
+ setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
if (bind(fd, sa, sl) < 0) {
/* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports,
- * this is fine. If this isn't the socket with the lowest number
- * for a given protocol, leave it open, to avoid unnecessary
- * holes in the numbering.
+ * this is fine.
*/
- if ((proto == IPPROTO_TCP && fd == c->tcp.fd_min) ||
- (proto == IPPROTO_UDP && fd == c->udp.fd_min) ||
- ((proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) &&
- fd == c->icmp.fd_min)) {
- close(fd);
- return 0;
- }
- return fd;
+ close(fd);
+ return 0;
}
if (proto == IPPROTO_TCP && listen(fd, 128) < 0) {
@@ -258,9 +265,8 @@ int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port)
return -1;
}
-epoll_add:
ev.events = EPOLLIN;
- ev.data.fd = fd;
+ ev.data.u64 = ref.u64;
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
perror("L4 epoll_ctl");
return -1;
@@ -286,3 +292,97 @@ int timespec_diff_ms(struct timespec *a, struct timespec *b)
return (a->tv_nsec - b->tv_nsec) / 1000000 +
(a->tv_sec - b->tv_sec) * 1000;
}
+
+/**
+ * bitmap_set() - Set single bit in bitmap
+ * @map: Pointer to bitmap
+ * @bit: Bit number to set
+ */
+void bitmap_set(uint8_t *map, int bit)
+{
+ map[bit / 8] |= 1 << (bit % 8);
+}
+
+/**
+ * bitmap_clear() - Clear single bit in bitmap
+ * @map: Pointer to bitmap
+ * @bit: Bit number to clear
+ */
+void bitmap_clear(uint8_t *map, int bit)
+{
+ map[bit / 8] &= ~(1 << (bit % 8));
+}
+
+/**
+ * bitmap_isset() - Check for set bit in bitmap
+ * @map: Pointer to bitmap
+ * @bit: Bit number to check
+ *
+ * Return: non-zero if given bit is set, zero if it's not
+ */
+int bitmap_isset(uint8_t *map, int bit)
+{
+ return map[bit / 8] & (1 << bit % 8);
+}
+
+/**
+ * procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs
+ * @name: Corresponding name of file under /proc/net/
+ * @map: Bitmap where numbers of ports in listening state will be set
+ */
+void procfs_scan_listen(char *name, uint8_t *map)
+{
+ char line[200], path[PATH_MAX];
+ unsigned long port;
+ unsigned int state;
+ FILE *fp;
+
+ snprintf(path, PATH_MAX, "/proc/net/%s", name);
+ if (!(fp = fopen(path, "r")))
+ return;
+
+ fgets(line, sizeof(line), fp);
+ while (fgets(line, sizeof(line), fp)) {
+ if (sscanf(line, "%*u: %*x:%lx %*x:%*x %x", &port, &state) != 2)
+ continue;
+
+ /* See enum in kernel's include/net/tcp_states.h */
+ if ((strstr(name, "tcp") && state != 0x0a) ||
+ (strstr(name, "udp") && state != 0x07))
+ continue;
+
+ bitmap_set(map, port);
+ }
+
+ fclose(fp);
+}
+
+/**
+ * ns_enter() - Enter user and network namespaces of process with given PID
+ * @target_pid: Process PID
+ *
+ * Return: 0 on success, -1 on failure
+ */
+int ns_enter(int target_pid)
+{
+ char ns[PATH_MAX];
+ int fd;
+
+ snprintf(ns, PATH_MAX, "/proc/%i/ns/user", target_pid);
+ if ((fd = open(ns, O_RDONLY)) < 0 || setns(fd, 0))
+ goto fail;
+ close(fd);
+
+ snprintf(ns, PATH_MAX, "/proc/%i/ns/net", target_pid);
+ if ((fd = open(ns, O_RDONLY)) < 0 || setns(fd, 0))
+ goto fail;
+ close(fd);
+
+ return 0;
+
+fail:
+ if (fd != -1)
+ close(fd);
+
+ return -1;
+}
diff --git a/util.h b/util.h
index 3e24c9a..c4d947a 100644
--- a/util.h
+++ b/util.h
@@ -29,24 +29,45 @@ void debug(const char *format, ...);
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#endif
+#define SWAP(a, b) \
+ do { \
+ typeof(a) __x = (a); (a) = (b); (b) = __x; \
+ } while (0) \
+
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
+#define V4 0
+#define V6 1
+#define IP_VERSIONS 2
+
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
#define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b))
#define FD_PROTO(x, proto) \
(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))
-#define PORT_IS_EPHEMERAL(port) ((port) >= (1 << 15) + (1 << 14)) /* RFC 6335 */
+#define PORT_EPHEMERAL_MIN ((1 << 15) + (1 << 14)) /* RFC 6335 */
+#define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN)
+
+#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 4)
#include <linux/ipv6.h>
#include <net/if.h>
#include <linux/ip.h>
+#include <limits.h>
+
+struct ctx;
uint16_t csum_fold(uint32_t sum);
uint16_t csum_ip4(void *buf, size_t len);
void csum_tcp4(struct iphdr *iph);
char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto);
-int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port);
+int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo,
+ uint32_t data);
int timespec_diff_ms(struct timespec *a, struct timespec *b);
+void bitmap_set(uint8_t *map, int bit);
+void bitmap_clear(uint8_t *map, int bit);
+int bitmap_isset(uint8_t *map, int bit);
+void procfs_scan_listen(char *name, uint8_t *map);
+int ns_enter(int target_pid);