aboutgitcodebugslistschat
diff options
context:
space:
mode:
-rw-r--r--Makefile4
-rw-r--r--arp.c11
-rw-r--r--dhcp.c7
-rwxr-xr-xdoc/demo.sh69
-rw-r--r--ndp.c7
-rw-r--r--passt.c695
-rw-r--r--passt.h52
-rw-r--r--qrap.c6
-rw-r--r--tap.c136
-rw-r--r--tap.h3
-rw-r--r--tcp.c1367
-rw-r--r--tcp.h5
-rw-r--r--udp.c174
-rw-r--r--udp.h3
-rw-r--r--util.c141
-rw-r--r--util.h5
16 files changed, 2061 insertions, 624 deletions
diff --git a/Makefile b/Makefile
index 257d89e..ca7a3e5 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@ CFLAGS += -Wall -Wextra -pedantic
all: passt qrap
-passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h ndp.c ndp.h util.c util.h
- $(CC) $(CFLAGS) passt.c arp.c dhcp.c ndp.c util.c -o passt
+passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h ndp.c ndp.h tap.c tap.h tcp.c tcp.h udp.c udp.h util.c util.h
+ $(CC) $(CFLAGS) passt.c arp.c dhcp.c ndp.c tap.c tcp.c udp.c util.c -o passt
qrap: qrap.c passt.h
$(CC) $(CFLAGS) qrap.c -o qrap
diff --git a/arp.c b/arp.c
index 3837a04..21b6417 100644
--- a/arp.c
+++ b/arp.c
@@ -1,9 +1,11 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
/* PASST - Plug A Simple Socket Transport
*
* arp.c - ARP implementation
*
+ * Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- * License: GPLv2
*
*/
@@ -23,6 +25,7 @@
#include "passt.h"
#include "dhcp.h"
#include "util.h"
+#include "tap.h"
/**
* struct arpmsg - 802.2 ARP IPv4 payload
@@ -39,7 +42,7 @@ struct arpmsg {
} __attribute__((__packed__));
/**
- * dhcp() - Check if this is an ARP message, reply as needed
+ * arp() - Check if this is an ARP message, reply as needed
* @c: Execution context
* @len: Total L2 packet length
* @eh: Packet buffer, Ethernet header
@@ -74,9 +77,11 @@ int arp(struct ctx *c, unsigned len, struct ethhdr *eh)
len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
+ /* HACK */
+ memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
memcpy(eh->h_source, c->mac, ETH_ALEN);
- if (send(c->fd_unix, eh, len, 0) < 0)
+ if (tap_send(c->fd_unix, eh, len, 0) < 0)
perror("ARP: send");
return 1;
diff --git a/dhcp.c b/dhcp.c
index abb76f6..3af4ace 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -1,9 +1,11 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
/* PASST - Plug A Simple Socket Transport
*
* dhcp.c - Minimalistic DHCP server for PASST
*
+ * Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- * License: GPLv2
*
*/
@@ -22,6 +24,7 @@
#include "passt.h"
#include "dhcp.h"
#include "util.h"
+#include "tap.h"
/**
* struct opt - DHCP option
@@ -212,7 +215,7 @@ int dhcp(struct ctx *c, unsigned len, struct ethhdr *eh)
memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
memcpy(eh->h_source, c->mac, ETH_ALEN);
- if (send(c->fd_unix, eh, len, 0) < 0)
+ if (tap_send(c->fd_unix, eh, len, 0) < 0)
perror("DHCP: send");
return 1;
diff --git a/doc/demo.sh b/doc/demo.sh
new file mode 100755
index 0000000..3d20491
--- /dev/null
+++ b/doc/demo.sh
@@ -0,0 +1,69 @@
+#!/bin/sh -e
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+#
+# demo.sh - Set up namespaces, addresses and routes to show PASST functionality
+#
+# Copyright (c) 2020-2021 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+# get_token() - Print the word following a given key in a list of words
+# $1:	Key to look for
+# $@:	Key, followed by the words to search
+#
+# The key is itself the first word of $@ and matches once on its own, so the
+# counter reaches 2 only once the key has also been seen among the searched
+# words: the word right after that occurrence is printed. Nothing is printed
+# if the key doesn't occur, or if it's the last word.
+get_token() {
+	IFS=' '
+	__next=0
+	for __token in ${@}; do
+		if [ ${__next} -eq 2 ]; then
+			# Restore default field splitting on this path as well,
+			# so that callers outside a subshell are not affected
+			unset IFS
+			echo "${__token}"
+			return
+		fi
+		[ "${__token}" = "${1}" ] && __next=$((__next + 1))
+	done
+	unset IFS
+}
+
+# ipv6_dev() - Print the word following "dev" in the IPv6 default route listing
+ipv6_dev() { get_token "dev" $(ip -o -6 ro show default); }
+# ipv6_devaddr() - Print the word following "inet6" in the global-scope address
+#		   listing of device $1
+ipv6_devaddr() { get_token "inet6" $(ip -o -6 ad sh dev "${1}" scope global); }
+# ipv6_ll_addr() - Print the word following "inet6" in the link-scope address
+#		   listing of device $1
+ipv6_ll_addr() { get_token "inet6" $(ip -o -6 ad sh dev "${1}" scope link); }
+# ipv6_mask() - Print the part of $1 that follows the first "/" (all of $1 if
+#		there is no "/")
+ipv6_mask() { echo ${1#*/}; }
+# ipv6_mangle() - Print a copy of an IPv6 address with its last group replaced
+# $1:	IPv6 address, anything from the first "/" on is ignored
+#
+# The first seven colon-separated groups are printed unchanged, each followed
+# by ":", then "abcd" and a newline are printed in place of the eighth group.
+#
+# NOTE: an address using "::" compression splits into fewer than eight fields,
+# in which case "abcd" (and the trailing newline) is never printed.
+ipv6_mangle() {
+	IFS=':'
+	__c=0
+	for __16b in ${1%%/*}; do
+		if [ ${__c} -lt 7 ]; then
+			# Fixed format string: never interpret data as format
+			printf '%s:' "${__16b}"
+		else
+			printf 'abcd\n' && break
+		fi
+		__c=$((__c + 1))
+	done
+	unset IFS
+}
+
+# ndp_setup() - Enable NDP proxying and add a proxy neighbour entry
+# $1:	IPv6 address to add as proxy entry
+ndp_setup() {
+ # Turn on the proxy_ndp sysctl for all interfaces
+ sysctl -w net.ipv6.conf.all.proxy_ndp=1
+ # The proxy entry goes on the device reported by ipv6_dev()
+ ip -6 neigh add proxy "${1}" dev "$(ipv6_dev)"
+}
+
+# Remove leftovers from a previous run (failures ignored), then create the
+# "passt" namespace and a veth pair named veth_passt on both ends, one end
+# inside the namespace
+ip netns del passt 2>/dev/null || :
+ip link del veth_passt 2>/dev/null || :
+ip netns add passt
+ip link add veth_passt up netns passt type veth peer name veth_passt
+ip link set dev veth_passt up
+
+# IPv4: 192.0.2.2 inside the namespace, 192.0.2.1 outside, default route in the
+# namespace via the outer end
+ip -n passt addr add 192.0.2.2/24 dev veth_passt
+ip addr add 192.0.2.1/24 dev veth_passt
+ip -n passt route add default via 192.0.2.1
+
+# IPv6: the namespace gets the mangled copy of the global address found on the
+# default-route device, with a proxy neighbour entry for it; the unmodified
+# address is added to the outer veth end
+ipv6_addr="$(ipv6_devaddr "$(ipv6_dev)")"
+ipv6_passt="$(ipv6_mangle "${ipv6_addr}")"
+ndp_setup "${ipv6_passt}"
+ip -n passt addr add "${ipv6_passt}/$(ipv6_mask "${ipv6_addr}")" dev veth_passt
+ip addr add "${ipv6_addr}" dev veth_passt
+# Both values below are read outside the namespace: link-scope address and
+# link/ether address of the outer veth end. They are used for a static
+# neighbour entry and the default route inside the namespace.
+passt_ll="$(ipv6_ll_addr "veth_passt")"
+main_ll="$(get_token "link/ether" $(ip -o li sh veth_passt))"
+ip -n passt neigh add "${passt_ll%%/*}" dev veth_passt lladdr "${main_ll}"
+ip -n passt route add default via "${passt_ll%%/*}" dev veth_passt
+
+# Switch the ethtool "tx" offload feature off on both veth ends, and raise the
+# open file descriptor limit before starting passt
+ethtool -K veth_passt tx off
+ip netns exec passt ethtool -K veth_passt tx off
+ulimit -n 300000
+
+# Run passt inside the namespace
+ip netns exec passt ./passt
diff --git a/ndp.c b/ndp.c
index a15ecc3..9e38cec 100644
--- a/ndp.c
+++ b/ndp.c
@@ -1,9 +1,11 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
/* PASST - Plug A Simple Socket Transport
*
* ndp.c - NDP support for PASST
*
+ * Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- * License: GPLv2
*
*/
@@ -23,6 +25,7 @@
#include "passt.h"
#include "util.h"
+#include "tap.h"
#define RS 133
#define RA 134
@@ -126,7 +129,7 @@ int ndp(struct ctx *c, unsigned len, struct ethhdr *eh)
memcpy(ehr->h_source, c->mac, ETH_ALEN);
ehr->h_proto = htons(ETH_P_IPV6);
- if (send(c->fd_unix, ehr, len, 0) < 0)
+ if (tap_send(c->fd_unix, ehr, len, 0) < 0)
perror("NDP: send");
return 1;
diff --git a/passt.c b/passt.c
index 57759e4..4ef6e72 100644
--- a/passt.c
+++ b/passt.c
@@ -1,27 +1,16 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
/* PASST - Plug A Simple Socket Transport
*
* passt.c - Daemon implementation
*
+ * Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- * License: GPLv2
- *
- * Grab Ethernet frames via AF_UNIX socket, build AF_INET/AF_INET6 sockets for
- * each 5-tuple from ICMP, TCP, UDP packets, perform connection tracking and
- * forward them with destination address NAT. Forward packets received on
- * sockets back to the UNIX domain socket (typically, a tap file descriptor from
- * qemu).
*
- * TODO:
- * - steal packets from AF_INET/AF_INET6 sockets (using eBPF/XDP, or a new
- * socket option): currently, incoming packets are also handled by in-kernel
- * protocol handlers, so every incoming untracked TCP packet gets a RST.
- * Workaround:
- * iptables -A OUTPUT -m state --state INVALID,NEW,ESTABLISHED \
- * -p tcp --tcp-flags RST RST -j DROP
- * ip6tables -A OUTPUT -m state --state INVALID,NEW,ESTABLISHED \
- * -p tcp --tcp-flags RST RST -j DROP
- * - and use XDP sockmap on top of that to improve performance
- * - aging and timeout/RST bookkeeping for connection tracking entries
+ * Grab Ethernet frames via AF_UNIX socket, build SOCK_DGRAM/SOCK_STREAM sockets
+ * for each 5-tuple from TCP, UDP packets, perform connection tracking and
+ * forward them. Forward packets received on sockets back to the UNIX domain
+ * socket (typically, a socket virtio_net file descriptor from qemu).
*/
#include <stdio.h>
@@ -50,14 +39,21 @@
#include <linux/ip.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
+#include <time.h>
#include "passt.h"
#include "arp.h"
#include "dhcp.h"
#include "ndp.h"
#include "util.h"
+#include "tcp.h"
+#include "udp.h"
-#define EPOLL_EVENTS 10
+#define EPOLL_EVENTS 10
+
+#define EPOLL_TIMEOUT 100 /* ms, for protocol periodic handlers */
+#define PERIODIC_HANDLER_FAST 100
+#define PERIODIC_HANDLER_SLOW 1000
/**
* sock_unix() - Create and bind AF_UNIX socket, add to epoll list
@@ -298,376 +294,42 @@ static void get_dns(struct ctx *c)
}
/**
- * sock_l4() - Create and bind socket for given L4, add to epoll list
- * @c: Execution context
- * @v: IP protocol, 4 or 6
- * @proto: Protocol number, network order
- * @port: L4 port, network order
- *
- * Return: newly created socket, -1 on error
- */
-static int sock_l4(struct ctx *c, int v, uint16_t proto, uint16_t port)
-{
- struct sockaddr_in addr4 = {
- .sin_family = AF_INET,
- .sin_port = port,
- .sin_addr = { .s_addr = c->addr4 },
- };
- struct sockaddr_in6 addr6 = {
- .sin6_family = AF_INET6,
- .sin6_port = port,
- .sin6_addr = c->addr6,
- };
- struct epoll_event ev = { 0 };
- const struct sockaddr *sa;
- int fd, sl;
-
- fd = socket(v == 4 ? AF_INET : AF_INET6, SOCK_RAW, proto);
- if (fd < 0) {
- perror("L4 socket");
- return -1;
- }
-
- if (v == 4) {
- sa = (const struct sockaddr *)&addr4;
- sl = sizeof(addr4);
- } else {
- sa = (const struct sockaddr *)&addr6;
- sl = sizeof(addr6);
- }
-
- if (bind(fd, sa, sl) < 0) {
- perror("L4 bind");
- close(fd);
- return -1;
- }
-
- ev.events = EPOLLIN;
- ev.data.fd = fd;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
- perror("L4 epoll_ctl");
- return -1;
- }
-
- return fd;
-}
-
-/**
- * lookup4() - Look up entry from tap-sourced IPv4 packet, create if missing
- * @c: Execution context
- * @eh: Packet buffer, Ethernet header
- *
- * Return: -1 for unsupported or too many sockets, matching socket otherwise
- */
-static int lookup4(struct ctx *c, const struct ethhdr *eh)
-{
- struct iphdr *iph = (struct iphdr *)(eh + 1);
- struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
- char buf_s[BUFSIZ], buf_d[BUFSIZ];
- struct ct4 *ct = c->map4;
- int i, one_icmp_fd = 0;
-
- if (iph->protocol != IPPROTO_ICMP && iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
- return -1;
-
- for (i = 0; i < CT_SIZE; i++) {
- if (ct[i].p == iph->protocol && ct[i].sa == iph->saddr &&
- ((ct[i].p == IPPROTO_ICMP && ct[i].da == iph->daddr)
- || ct[i].sp == th->source) &&
- !memcmp(ct[i].hd, eh->h_dest, ETH_ALEN) &&
- !memcmp(ct[i].hs, eh->h_source, ETH_ALEN)) {
- if (iph->protocol != IPPROTO_ICMP) {
- ct[i].da = iph->daddr;
- ct[i].dp = th->dest;
- }
- return ct[i].fd;
- }
- }
-
- for (i = 0; i < CT_SIZE && ct[i].p; i++) {
- if (iph->protocol == IPPROTO_ICMP && ct[i].p == IPPROTO_ICMP)
- one_icmp_fd = ct[i].fd;
- }
-
- if (i == CT_SIZE) {
- fprintf(stderr, "\nToo many sockets, aborting ");
- } else {
- if (iph->protocol == IPPROTO_ICMP) {
- if (one_icmp_fd)
- ct[i].fd = one_icmp_fd;
- else
- ct[i].fd = sock_l4(c, 4, iph->protocol, 0);
- } else {
- ct[i].fd = sock_l4(c, 4, iph->protocol, th->source);
- }
-
- fprintf(stderr, "\n(socket %i) New ", ct[i].fd);
- ct[i].p = iph->protocol;
- ct[i].sa = iph->saddr;
- ct[i].da = iph->daddr;
- if (iph->protocol != IPPROTO_ICMP) {
- ct[i].sp = th->source;
- ct[i].dp = th->dest;
- }
- memcpy(&ct[i].hd, eh->h_dest, ETH_ALEN);
- memcpy(&ct[i].hs, eh->h_source, ETH_ALEN);
- }
-
- if (iph->protocol == IPPROTO_ICMP) {
- fprintf(stderr, "icmp connection\n\tfrom %s to %s\n\n",
- inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)));
- } else {
- fprintf(stderr, "%s connection\n\tfrom %s:%i to %s:%i\n\n",
- getprotobynumber(iph->protocol)->p_name,
- inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
- ntohs(th->source),
- inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)),
- ntohs(th->dest));
- }
-
- return (i == CT_SIZE) ? -1 : ct[i].fd;
-}
-
-/**
- * lookup6() - Look up entry from tap-sourced IPv6 packet, create if missing
- * @c: Execution context
- * @eh: Packet buffer, Ethernet header
- *
- * Return: -1 for unsupported or too many sockets, matching socket otherwise
- */
-static int lookup6(struct ctx *c, const struct ethhdr *eh)
-{
- struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
- char buf_s[BUFSIZ], buf_d[BUFSIZ];
- struct ct6 *ct = c->map6;
- int i, one_icmp_fd = 0;
- struct tcphdr *th;
- uint8_t proto;
-
- th = (struct tcphdr *)ipv6_l4hdr(ip6h, &proto);
- if (!th)
- return -1;
-
- if (proto != IPPROTO_ICMPV6 && proto != IPPROTO_TCP &&
- proto != IPPROTO_UDP)
- return -1;
-
- for (i = 0; i < CT_SIZE; i++) {
- if (ct[i].p != proto)
- continue;
-
- if (memcmp(ct[i].hd, eh->h_dest, ETH_ALEN) ||
- memcmp(ct[i].hs, eh->h_source, ETH_ALEN) ||
- memcmp(&ct[i].sa, &ip6h->saddr, sizeof(ct[i].sa)))
- continue;
-
- if (ct[i].p != IPPROTO_ICMPV6 &&
- ct[i].sp != th->source)
- continue;
-
- if (ct[i].p == IPPROTO_ICMPV6 &&
- memcmp(&ct[i].da, &ip6h->daddr, sizeof(ct[i].da)))
- continue;
-
- if (ct[i].p != IPPROTO_ICMPV6) {
- memcpy(&ct[i].da, &ip6h->daddr, sizeof(ct[i].da));
- ct[i].dp = th->dest;
- }
-
- return ct[i].fd;
- }
-
- for (i = 0; i < CT_SIZE && ct[i].p; i++) {
- if (proto == IPPROTO_ICMPV6 && ct[i].p == IPPROTO_ICMPV6)
- one_icmp_fd = ct[i].fd;
- }
-
- if (i == CT_SIZE) {
- fprintf(stderr, "\nToo many sockets, aborting ");
- } else {
- if (proto == IPPROTO_ICMPV6) {
- if (one_icmp_fd)
- ct[i].fd = one_icmp_fd;
- else
- ct[i].fd = sock_l4(c, 6, proto, 0);
- } else {
- ct[i].fd = sock_l4(c, 6, proto, th->source);
- }
-
- fprintf(stderr, "\n(socket %i) New ", ct[i].fd);
- ct[i].p = proto;
- memcpy(&ct[i].sa, &ip6h->saddr, sizeof(ct[i].sa));
- memcpy(&ct[i].da, &ip6h->daddr, sizeof(ct[i].da));
- if (ct[i].p != IPPROTO_ICMPV6) {
- ct[i].sp = th->source;
- ct[i].dp = th->dest;
- }
- memcpy(&ct[i].hd, eh->h_dest, ETH_ALEN);
- memcpy(&ct[i].hs, eh->h_source, ETH_ALEN);
- }
-
- if (proto == IPPROTO_ICMPV6) {
- fprintf(stderr, "icmpv6 connection\n\tfrom %s\n"
- "\tto %s\n\n",
- inet_ntop(AF_INET6, &ct[i].sa, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET6, &ct[i].da, buf_d, sizeof(buf_d)));
- } else {
- fprintf(stderr, "%s connection\n\tfrom [%s]:%i\n"
- "\tto [%s]:%i\n\n",
- getprotobynumber(proto)->p_name,
- inet_ntop(AF_INET6, &ct[i].sa, buf_s, sizeof(buf_s)),
- ntohs(th->source),
- inet_ntop(AF_INET6, &ct[i].da, buf_d, sizeof(buf_d)),
- ntohs(th->dest));
- }
-
- return (i == CT_SIZE) ? -1 : ct[i].fd;
-}
-
-/**
- * lookup_r4() - Reverse look up connection tracking entry for IPv4 packet
- * @ct: Connection tracking table
- * @fd: File descriptor that received the packet
- * @iph: Packet buffer, IP header
- *
- * Return: matching entry if any, NULL otherwise
- */
-struct ct4 *lookup_r4(struct ct4 *ct, int fd, struct iphdr *iph)
-{
- struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
- int i;
-
- for (i = 0; i < CT_SIZE; i++) {
- if (ct[i].fd == fd &&
- iph->protocol == ct[i].p &&
- iph->saddr == ct[i].da &&
- (iph->protocol == IPPROTO_ICMP ||
- (th->source == ct[i].dp && th->dest == ct[i].sp)))
- return &ct[i];
- }
-
- return NULL;
-}
-
-/**
- * lookup_r6() - Reverse look up connection tracking entry for IPv6 packet
- * @ct: Connection tracking table
- * @fd: File descriptor that received the packet
- *
- * Return: matching entry if any, NULL otherwise
- */
-struct ct6 *lookup_r6(struct ct6 *ct, int fd, struct tcphdr *th)
-{
- int i;
-
- for (i = 0; i < CT_SIZE; i++) {
- if (ct[i].fd != fd)
- continue;
-
- if (ct[i].p == IPPROTO_ICMPV6 ||
- (ct[i].dp == th->source && ct[i].sp == th->dest))
- return &ct[i];
- }
-
- return NULL;
-}
-
-/**
- * nat4_in() - Perform incoming IPv4 address translation
- * @addr: Original destination address to be used
- * @iph: IP header
- */
-static void nat_in(unsigned long addr, struct iphdr *iph)
-{
- iph->daddr = addr;
-}
-
-/**
- * csum_ipv4() - Calculate TCP checksum for IPv4 and set in place
- * @iph: Packet buffer, IP header
- */
-static void csum_tcp4(struct iphdr *iph)
-{
- struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
- uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th;
- uint32_t sum = 0;
-
- sum += (iph->saddr >> 16) & 0xffff;
- sum += iph->saddr & 0xffff;
- sum += (iph->daddr >> 16) & 0xffff;
- sum += iph->daddr & 0xffff;
-
- sum += htons(IPPROTO_TCP);
- sum += htons(tlen);
-
- th->check = 0;
- while (tlen > 1) {
- sum += *p++;
- tlen -= 2;
- }
-
- if (tlen > 0) {
- sum += *p & htons(0xff00);
- }
-
- th->check = (uint16_t)~csum_fold(sum);
-}
-
-/**
* tap4_handler() - IPv4 packet handler for tap file descriptor
* @c: Execution context
* @len: Total L2 packet length
* @in: Packet buffer, L2 headers
*/
-static void tap4_handler(struct ctx *c, int len, char *in)
+static void tap4_handler(struct ctx *c, char *in, size_t len)
{
struct ethhdr *eh = (struct ethhdr *)in;
struct iphdr *iph = (struct iphdr *)(eh + 1);
- struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
- struct udphdr *uh = (struct udphdr *)th;
- struct sockaddr_in addr = {
- .sin_family = AF_INET,
- .sin_port = th->dest,
- .sin_addr = { .s_addr = iph->daddr },
- };
+ char *l4h = (char *)iph + iph->ihl * 4;
char buf_s[BUFSIZ], buf_d[BUFSIZ];
- int fd;
if (arp(c, len, eh) || dhcp(c, len, eh))
return;
- fd = lookup4(c, eh);
- if (fd == -1)
- return;
-
if (iph->protocol == IPPROTO_ICMP) {
- fprintf(stderr, "icmp from tap: %s -> %s (socket %i)\n",
+ fprintf(stderr, "icmp from tap: %s -> %s\n",
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)),
- fd);
+ inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)));
} else {
- fprintf(stderr, "%s from tap: %s:%i -> %s:%i (socket %i)\n",
+ struct tcphdr *th = (struct tcphdr *)l4h;
+
+ fprintf(stderr, "%s from tap: %s:%i -> %s:%i\n",
getprotobynumber(iph->protocol)->p_name,
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
ntohs(th->source),
inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)),
- ntohs(th->dest),
- fd);
+ ntohs(th->dest));
}
+ len -= (intptr_t)l4h - (intptr_t)eh;
+
if (iph->protocol == IPPROTO_TCP)
- csum_tcp4(iph);
+ tcp_tap_handler(c, AF_INET, &iph->daddr, l4h, len);
else if (iph->protocol == IPPROTO_UDP)
- uh->check = 0;
- else if (iph->protocol != IPPROTO_ICMP)
- return;
-
- if (sendto(fd, (void *)th, len - sizeof(*eh) - iph->ihl * 4, 0,
- (struct sockaddr *)&addr, sizeof(addr)) < 0)
- perror("sendto");
-
+ udp_tap_handler(c, AF_INET, &iph->daddr, l4h, len);
}
/**
@@ -676,228 +338,122 @@ static void tap4_handler(struct ctx *c, int len, char *in)
* @len: Total L2 packet length
* @in: Packet buffer, L2 headers
*/
-static void tap6_handler(struct ctx *c, int len, char *in)
+static void tap6_handler(struct ctx *c, char *in, size_t len)
{
struct ethhdr *eh = (struct ethhdr *)in;
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
- struct tcphdr *th;
- struct udphdr *uh;
- struct icmp6hdr *ih;
- struct sockaddr_in6 addr = {
- .sin6_family = AF_INET6,
- .sin6_addr = ip6h->daddr,
- };
char buf_s[BUFSIZ], buf_d[BUFSIZ];
uint8_t proto;
- int fd;
+ char *l4h;
if (ndp(c, len, eh))
return;
- fd = lookup6(c, eh);
- if (fd == -1)
- return;
+ l4h = ipv6_l4hdr(ip6h, &proto);
+
+ /* TODO: Assign MAC address to guest so that, together with prefix
+ * assigned via NDP, address matches the one on the host. Then drop
+ * address change and checksum recomputation.
+ */
+ c->addr6_guest = ip6h->saddr;
+ ip6h->saddr = c->addr6;
+ if (proto == IPPROTO_TCP) {
+ struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
+
+ th->check = 0;
+ th->check = csum_ip4(ip6h, len + sizeof(*ip6h));
+ } else if (proto == IPPROTO_UDP) {
+ struct udphdr *uh = (struct udphdr *)(ip6h + 1);
- th = (struct tcphdr *)ipv6_l4hdr(ip6h, &proto);
- uh = (struct udphdr *)th;
- ih = (struct icmp6hdr *)th;
+ uh->check = 0;
+ uh->check = csum_ip4(ip6h, len + sizeof(*ip6h));
+ } else if (proto == IPPROTO_ICMPV6) {
+ struct icmp6hdr *ih = (struct icmp6hdr *)(ip6h + 1);
+
+ ih->icmp6_cksum = 0;
+ ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h));
+ }
if (proto == IPPROTO_ICMPV6) {
- fprintf(stderr, "icmpv6 from tap: %s ->\n\t%s (socket %i)\n",
+ fprintf(stderr, "icmpv6 from tap: %s ->\n\t%s\n",
inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)),
- fd);
+ inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d))
+ );
} else {
+ struct tcphdr *th = (struct tcphdr *)l4h;
+
fprintf(stderr, "%s from tap: [%s]:%i\n"
- "\t-> [%s]:%i (socket %i)\n",
+ "\t-> [%s]:%i\n",
getprotobynumber(proto)->p_name,
inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
ntohs(th->source),
inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)),
- ntohs(th->dest),
- fd);
- }
-
- if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
- proto != IPPROTO_ICMPV6)
- return;
-
- ip6h->saddr = c->addr6;
-
- ip6h->hop_limit = proto;
- ip6h->version = 0;
- ip6h->nexthdr = 0;
- memset(ip6h->flow_lbl, 0, 3);
-
- if (proto == IPPROTO_TCP) {
- th->check = 0;
- th->check = csum_ip4(ip6h,
- len - ((intptr_t)th - (intptr_t)eh) +
- sizeof(*ip6h));
- } else if (proto == IPPROTO_UDP) {
- uh->check = 0;
- uh->check = csum_ip4(ip6h,
- len - ((intptr_t)uh - (intptr_t)eh) +
- sizeof(*ip6h));
- } else if (proto == IPPROTO_ICMPV6) {
- ih->icmp6_cksum = 0;
- ih->icmp6_cksum = csum_ip4(ip6h,
- len - ((intptr_t)ih - (intptr_t)eh) +
- sizeof(*ip6h));
+ ntohs(th->dest));
}
- ip6h->version = 6;
- ip6h->nexthdr = proto;
- ip6h->hop_limit = 255;
-
- if (sendto(fd, (void *)th, len - ((intptr_t)th - (intptr_t)eh), 0,
- (struct sockaddr *)&addr, sizeof(addr)) < 0)
- perror("sendto");
+ len -= (intptr_t)l4h - (intptr_t)eh;
+ if (proto == IPPROTO_TCP)
+ tcp_tap_handler(c, AF_INET6, &ip6h->daddr, l4h, len);
+ else if (proto == IPPROTO_UDP)
+ udp_tap_handler(c, AF_INET6, &ip6h->daddr, l4h, len);
}
-static void tap_handler(struct ctx *c, int len, char *in)
+/**
+ * tap_handler() - IPv4/IPv6/ARP packet handler for tap file descriptor
+ * @c: Execution context
+ * @len: Total L2 packet length
+ * @in: Packet buffer, L2 headers
+ */
+static void tap_handler(struct ctx *c, char *in, size_t len)
{
struct ethhdr *eh = (struct ethhdr *)in;
if (eh->h_proto == ntohs(ETH_P_IP) || eh->h_proto == ntohs(ETH_P_ARP))
- tap4_handler(c, len, in);
+ tap4_handler(c, in, len);
else if (eh->h_proto == ntohs(ETH_P_IPV6))
- tap6_handler(c, len, in);
+ tap6_handler(c, in, len);
}
/**
- * ext4_handler() - IPv4 packet handler for external routable interface
+ * sock_handler() - Event handler for L4 sockets
* @c: Execution context
- * @fd: File descriptor that received the packet
- * @len: Total L3 packet length
- * @in: Packet buffer, L3 headers
+ * @fd: File descriptor associated to event
+ * @events: epoll events
*/
-static void ext4_handler(struct ctx *c, int fd, int len, char *in)
+static void sock_handler(struct ctx *c, int fd, uint32_t events)
{
- struct iphdr *iph = (struct iphdr *)in;
- struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
- struct udphdr *uh = (struct udphdr *)th;
- char buf_s[BUFSIZ], buf_d[BUFSIZ], buf[ETH_MAX_MTU];
- struct ethhdr *eh = (struct ethhdr *)buf;
- struct ct4 *entry;
-
- entry = lookup_r4(c->map4, fd, iph);
- if (!entry)
- return;
+ socklen_t sl;
+ int so;
- nat_in(entry->sa, iph);
-
- iph->check = 0;
- iph->check = csum_ip4(iph, iph->ihl * 4);
-
- if (iph->protocol == IPPROTO_TCP)
- csum_tcp4(iph);
- else if (iph->protocol == IPPROTO_UDP)
- uh->check = 0;
-
- memcpy(eh->h_dest, entry->hs, ETH_ALEN);
- memcpy(eh->h_source, entry->hd, ETH_ALEN);
- eh->h_proto = ntohs(ETH_P_IP);
-
- memcpy(eh + 1, in, len);
-
- if (iph->protocol == IPPROTO_ICMP) {
- fprintf(stderr, "icmp (socket %i) to tap: %s -> %s\n",
- entry->fd,
- inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)));
- } else {
- fprintf(stderr, "%s (socket %i) to tap: %s:%i -> %s:%i\n",
- getprotobynumber(iph->protocol)->p_name,
- entry->fd,
- inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
- ntohs(th->source),
- inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)),
- ntohs(th->dest));
- }
+ sl = sizeof(so);
- if (send(c->fd_unix, buf, len + sizeof(*eh), 0) < 0)
- perror("send");
+ if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &so, &sl) ||
+ so == SOCK_STREAM)
+ tcp_sock_handler(c, fd, events);
+ else if (so == SOCK_DGRAM)
+ udp_sock_handler(c, fd, events);
}
/**
- * ext6_handler() - IPv6 packet handler for external routable interface
+ * periodic_handler() - Run periodic tasks for L4 protocol handlers
* @c: Execution context
- * @fd: File descriptor that received the packet
- * @len: Total L4 packet length
- * @in: Packet buffer, L4 headers
+ * @last: Timestamp of last run, updated on return
*/
-static int ext6_handler(struct ctx *c, int fd, int len, char *in)
+static void periodic_handler(struct ctx *c, struct timespec *last)
{
- struct tcphdr *th = (struct tcphdr *)in;
- struct udphdr *uh;
- struct icmp6hdr *ih;
- char buf_s[BUFSIZ], buf_d[BUFSIZ], buf[ETH_MAX_MTU] = { 0 };
- struct ethhdr *eh = (struct ethhdr *)buf;
- struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
- struct ct6 *entry;
-
- entry = lookup_r6(c->map6, fd, th);
- if (!entry)
- return 0;
-
- ip6h->daddr = entry->sa;
- ip6h->saddr = entry->da;
- memcpy(ip6h + 1, in, len);
- ip6h->payload_len = htons(len);
-
- th = (struct tcphdr *)(ip6h + 1);
- uh = (struct udphdr *)th;
- ih = (struct icmp6hdr *)th;
- ip6h->hop_limit = entry->p;
-
- if (entry->p == IPPROTO_TCP) {
- th->check = 0;
- th->check = csum_ip4(ip6h, len + sizeof(*ip6h));
- } else if (entry->p == IPPROTO_UDP) {
- uh->check = 0;
- uh->check = csum_ip4(ip6h, len + sizeof(*ip6h));
- } else if (entry->p == IPPROTO_ICMPV6) {
- ih->icmp6_cksum = 0;
- ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h));
- }
-
- ip6h->version = 6;
- ip6h->nexthdr = entry->p;
- ip6h->hop_limit = 255;
-
- memcpy(eh->h_dest, entry->hs, ETH_ALEN);
- memcpy(eh->h_source, entry->hd, ETH_ALEN);
- eh->h_proto = ntohs(ETH_P_IPV6);
-
- if (entry->p == IPPROTO_ICMPV6) {
- fprintf(stderr, "icmpv6 (socket %i) to tap: %s\n\t-> %s\n",
- entry->fd,
- inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET6, &ip6h->daddr, buf_d,
- sizeof(buf_d)));
- } else {
- fprintf(stderr, "%s (socket %i) to tap: [%s]:%i\n"
- "\t-> [%s]:%i\n",
- getprotobynumber(entry->p)->p_name,
- entry->fd,
- inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
- ntohs(th->source),
- inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)),
- ntohs(th->dest));
- }
+ struct timespec tmp;
+ int elapsed_ms;
- if (send(c->fd_unix, buf, len + sizeof(*ip6h) + sizeof(*eh), 0) < 0)
- perror("send");
+ clock_gettime(CLOCK_MONOTONIC, &tmp);
+ elapsed_ms = timespec_diff_ms(&tmp, last);
- return 1;
-}
+ if (elapsed_ms >= PERIODIC_HANDLER_FAST)
+ tcp_periodic_fast(c);
+ if (elapsed_ms >= PERIODIC_HANDLER_SLOW)
+ tcp_periodic_slow(c);
-static void ext_handler(struct ctx *c, int fd, int len, char *in)
-{
- if (!ext6_handler(c, fd, len, in))
- ext4_handler(c, fd, len, in);
+ *last = tmp;
}
/**
@@ -924,6 +480,7 @@ int main(int argc, char **argv)
char buf4[4][sizeof("255.255.255.255")];
struct epoll_event events[EPOLL_EVENTS];
struct epoll_event ev = { 0 };
+ struct timespec last_time;
char buf[ETH_MAX_MTU];
struct ctx c = { 0 };
int nfds, i, len;
@@ -958,55 +515,71 @@ int main(int argc, char **argv)
}
fprintf(stderr, "\n");
+ if (clock_gettime(CLOCK_MONOTONIC, &last_time)) {
+ perror("clock_gettime");
+ exit(EXIT_FAILURE);
+ }
+
c.epollfd = epoll_create1(0);
if (c.epollfd == -1) {
perror("epoll_create1");
exit(EXIT_FAILURE);
}
+ if (tcp_sock_init(&c) || udp_sock_init(&c))
+ exit(EXIT_FAILURE);
+
fd_unix = sock_unix();
listen:
listen(fd_unix, 1);
fprintf(stderr,
"You can now start qrap:\n\t"
- "./qrap 42 kvm ... -net tap,fd=42 -net nic,model=virtio\n\n");
+ "./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio\n\n");
c.fd_unix = accept(fd_unix, NULL, NULL);
- ev.events = EPOLLIN;
+ ev.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = c.fd_unix;
epoll_ctl(c.epollfd, EPOLL_CTL_ADD, c.fd_unix, &ev);
+ clock_gettime(CLOCK_MONOTONIC, &last_time);
+
loop:
- nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, -1);
+ nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, EPOLL_TIMEOUT);
if (nfds == -1 && errno != EINTR) {
perror("epoll_wait");
exit(EXIT_FAILURE);
}
for (i = 0; i < nfds; i++) {
- len = recv(events[i].data.fd, buf, sizeof(buf), MSG_DONTWAIT);
+ if (events[i].data.fd == c.fd_unix) {
+ len = recv(events[i].data.fd, buf, sizeof(buf),
+ MSG_DONTWAIT);
+
+ if (len <= 0) {
+ epoll_ctl(c.epollfd, EPOLL_CTL_DEL, c.fd_unix,
+ &ev);
+ close(c.fd_unix);
+ goto listen;
+ }
- if (events[i].data.fd == c.fd_unix && len <= 0) {
- epoll_ctl(c.epollfd, EPOLL_CTL_DEL, c.fd_unix, &ev);
- close(c.fd_unix);
- goto listen;
- }
+ if (len == 0 || (len < 0 && errno == EINTR))
+ continue;
- if (len == 0 || (len < 0 && errno == EINTR))
- continue;
+ if (len < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ break;
+ goto out;
+ }
- if (len < 0) {
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- break;
- goto out;
+ tap_handler(&c, buf + 4, ntohl(*(uint32_t *)buf));
+ } else {
+ sock_handler(&c, events[i].data.fd, events[i].events);
}
-
- if (events[i].data.fd == c.fd_unix)
- tap_handler(&c, len, buf);
- else
- ext_handler(&c, events[i].data.fd, len, buf);
}
+ periodic_handler(&c, &last_time);
+ clock_gettime(CLOCK_MONOTONIC, &last_time);
+
goto loop;
out:
diff --git a/passt.h b/passt.h
index 904b42f..e2c9035 100644
--- a/passt.h
+++ b/passt.h
@@ -1,56 +1,12 @@
-#define CT_SIZE 4096
#define UNIX_SOCK_PATH "/tmp/passt.socket"
/**
- * struct ct4 - IPv4 connection tracking entry
- * @p: IANA protocol number
- * @sa: Source address (as seen from tap interface)
- * @da: Destination address
- * @sp: Source port, network order
- * @dp: Destination port, network order
- * @hd: Destination MAC address
- * @hs: Source MAC address
- * @fd: File descriptor for corresponding AF_INET socket
- */
-struct ct4 {
- uint8_t p;
- uint32_t sa;
- uint32_t da;
- uint16_t sp;
- uint16_t dp;
- unsigned char hd[ETH_ALEN];
- unsigned char hs[ETH_ALEN];
- int fd;
-};
-
-/**
- * struct ct6 - IPv6 connection tracking entry
- * @p: IANA protocol number
- * @sa: Source address (as seen from tap interface)
- * @da: Destination address
- * @sp: Source port, network order
- * @dp: Destination port, network order
- * @hd: Destination MAC address
- * @hs: Source MAC address
- * @fd: File descriptor for corresponding AF_INET6 socket
- */
-struct ct6 {
- uint8_t p;
- struct in6_addr sa;
- struct in6_addr da;
- uint16_t sp;
- uint16_t dp;
- unsigned char hd[ETH_ALEN];
- unsigned char hs[ETH_ALEN];
- int fd;
-};
-
-/**
* struct ctx - Execution context
* @epollfd: file descriptor for epoll instance
* @fd_unix: AF_UNIX socket for tap file descriptor
- * @map4: Connection tracking table
* @v4: Enable IPv4 transport
+ * @mac: Host MAC address
+ * @mac_guest: Guest MAC address
* @addr4: IPv4 address for external, routable interface
* @mask4: IPv4 netmask, network order
* @gw4: Default IPv4 gateway, network order
@@ -64,9 +20,8 @@ struct ct6 {
struct ctx {
int epollfd;
int fd_unix;
- struct ct4 map4[CT_SIZE];
- struct ct6 map6[CT_SIZE];
unsigned char mac[ETH_ALEN];
+ unsigned char mac_guest[ETH_ALEN];
int v4;
unsigned long addr4;
@@ -76,6 +31,7 @@ struct ctx {
int v6;
struct in6_addr addr6;
+ struct in6_addr addr6_guest;
struct in6_addr gw6;
struct in6_addr dns6;
diff --git a/qrap.c b/qrap.c
index a3a04d3..3dc525a 100644
--- a/qrap.c
+++ b/qrap.c
@@ -1,9 +1,11 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
/* PASST - Plug A Simple Socket Transport
*
- * qrap.c - qemu wrapper connecting UNIX domain socket to tap file descriptor
+ * qrap.c - qemu wrapper connecting UNIX domain socket to socket file descriptor
*
+ * Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- * License: GPLv2
*
* TODO: Implement this functionality directly in qemu: we have TCP and UDP
* socket back-ends already.
diff --git a/tap.c b/tap.c
new file mode 100644
index 0000000..f8b8b4f
--- /dev/null
+++ b/tap.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *
+ * tap.c - Functions to communicate with guest-facing tap interface
+ *
+ * Copyright (c) 2020-2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ */
+
+#include <stdio.h>
+#include <limits.h>
+#include <string.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <stdint.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+
+#include "passt.h"
+#include "util.h"
+
+/**
+ * tap_send() - Send frame and qemu socket header with indication of length
+ * @fd:		tap file descriptor
+ * @data:	Frame to send, starting from L2 header
+ * @len:	Total L2 packet length
+ * @flags:	Flags for send(), if any
+ *
+ * Return: return code from send() for the frame, -1 if the length header
+ *	   could not be sent in full
+ */
+int tap_send(int fd, void *data, size_t len, int flags)
+{
+	uint32_t vnet_len = htonl(len);
+	int n;
+
+	/* The length header goes out first on the same socket: if it didn't
+	 * make it, don't send a frame that isn't preceded by its length.
+	 */
+	n = send(fd, &vnet_len, sizeof(vnet_len), 0);
+	if (n != (int)sizeof(vnet_len))
+		return -1;
+
+	return send(fd, data, len, flags);
+}
+
+/**
+ * tap_ip_send() - Send IP packet, with L2 headers, calculating L3/L4 checksums
+ * @c:		Execution context
+ * @src:	IPv6 source address, IPv4-mapped for IPv4 sources
+ * @proto:	L4 protocol number
+ * @in:		Payload, starting from L4 header
+ * @len:	L4 payload length
+ *
+ * Packets that would not fit the frame buffer once L2 and L3 headers are
+ * added are silently dropped.
+ */
+void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
+		 char *in, size_t len)
+{
+	char pkt[USHRT_MAX];
+	struct ethhdr *eh;
+
+	eh = (struct ethhdr *)pkt;
+
+	/* TODO: ARP table lookup */
+	memcpy(eh->h_dest, c->mac_guest, ETH_ALEN);
+	memcpy(eh->h_source, c->mac, ETH_ALEN);
+
+	if (IN6_IS_ADDR_V4MAPPED(src)) {
+		struct iphdr *iph = (struct iphdr *)(eh + 1);
+		char *data = (char *)(iph + 1);
+
+		/* Headers plus payload must fit in pkt, this also keeps the
+		 * 16-bit tot_len field below from overflowing.
+		 */
+		if (len > sizeof(pkt) - sizeof(*eh) - sizeof(*iph))
+			return;
+
+		eh->h_proto = htons(ETH_P_IP);
+
+		iph->version = 4;
+		iph->ihl = 5;
+		iph->tos = 0;
+		iph->tot_len = htons(len + sizeof(*iph));
+		iph->id = 0;
+		iph->frag_off = 0;
+		iph->ttl = 255;
+		iph->protocol = proto;
+		iph->daddr = c->addr4;
+		/* Last four bytes of an IPv4-mapped address are the address */
+		memcpy(&iph->saddr, &src->s6_addr[12], 4);
+
+		iph->check = 0;
+		iph->check = csum_ip4(iph, iph->ihl * 4);
+
+		memcpy(data, in, len);
+
+		if (iph->protocol == IPPROTO_TCP) {
+			csum_tcp4(iph);
+		} else if (iph->protocol == IPPROTO_UDP) {
+			struct udphdr *uh = (struct udphdr *)(iph + 1);
+
+			/* Zero means no checksum, allowed for UDP over IPv4 */
+			uh->check = 0;
+		}
+
+		tap_send(c->fd_unix, pkt, len + sizeof(*iph) + sizeof(*eh), 0);
+	} else {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
+		char *data = (char *)(ip6h + 1);
+
+		/* Headers plus payload must fit in pkt, this also keeps the
+		 * 16-bit payload_len field below from overflowing.
+		 */
+		if (len > sizeof(pkt) - sizeof(*eh) - sizeof(*ip6h))
+			return;
+
+		eh->h_proto = htons(ETH_P_IPV6);
+
+		memset(ip6h->flow_lbl, 0, 3);
+		ip6h->payload_len = htons(len);
+		ip6h->priority = 0;
+
+		ip6h->saddr = *src;
+		ip6h->daddr = c->addr6_guest;
+
+		memcpy(data, in, len);
+
+		/* Temporarily store the protocol number in hop_limit and clear
+		 * version and nexthdr, so that the sum over header and payload
+		 * covers addresses, length and protocol only, besides the
+		 * payload itself. Actual values are restored below.
+		 */
+		ip6h->hop_limit = proto;
+		ip6h->version = 0;
+		ip6h->nexthdr = 0;
+		if (proto == IPPROTO_TCP) {
+			struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
+
+			th->check = 0;
+			th->check = csum_ip4(ip6h, len + sizeof(*ip6h));
+		} else if (proto == IPPROTO_UDP) {
+			struct udphdr *uh = (struct udphdr *)(ip6h + 1);
+
+			uh->check = 0;
+			uh->check = csum_ip4(ip6h, len + sizeof(*ip6h));
+		} else if (proto == IPPROTO_ICMPV6) {
+			struct icmp6hdr *ih = (struct icmp6hdr *)(ip6h + 1);
+
+			ih->icmp6_cksum = 0;
+			ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h));
+		}
+		ip6h->version = 6;
+		ip6h->nexthdr = proto;
+		ip6h->hop_limit = 255;
+
+		tap_send(c->fd_unix, pkt, len + sizeof(*ip6h) + sizeof(*eh), 0);
+	}
+}
diff --git a/tap.h b/tap.h
new file mode 100644
index 0000000..ecea936
--- /dev/null
+++ b/tap.h
@@ -0,0 +1,3 @@
+/* Send IP packet to tap, adding L2 and L3 headers, see tap.c */
+void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
+ char *in, size_t len);
+/* Send L2 frame to tap file descriptor, preceded by its length, see tap.c */
+int tap_send(int fd, void *data, size_t len, int flags);
diff --git a/tcp.c b/tcp.c
new file mode 100644
index 0000000..46b739d
--- /dev/null
+++ b/tcp.c
@@ -0,0 +1,1367 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *
+ * tcp.c - TCP L2-L4 translation state machine
+ *
+ * Copyright (c) 2020-2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ */
+
+/**
+ * DOC: Theory of Operation
+ *
+ *
+ * Overview
+ * --------
+ *
+ * This implementation maps TCP traffic between a single L2 interface (tap) and
+ * native TCP (L4) sockets, mimicking and reproducing as closely as possible the
+ * inferred behaviour of applications running on a guest, connected via said L2
+ * interface. Four connection flows are supported:
+ * - from the local host to the guest behind the tap interface:
+ * - this is the main use case for proxies in service meshes
+ * - we bind to all unbound local ports, and relay traffic between L4 sockets
+ * with local endpoints and the L2 interface
+ * - from remote hosts to the guest behind the tap interface:
+ * - this might be needed for services that need to be addressed directly,
+ * and typically configured with special port forwarding rules (which are
+ * not needed here)
+ * - we also relay traffic between L4 sockets with remote endpoints and the L2
+ * interface
+ * - from the guest to the local host:
+ * - this is not observed in practice, but implemented for completeness and
+ * transparency
+ * - from the guest to external hosts:
+ * - this might be needed for applications running on the guest that need to
+ * directly access internet services (e.g. NTP)
+ *
+ * Relevant goals are:
+ * - transparency: sockets need to behave as if guest applications were running
+ * directly on the host. This is achieved by:
+ * - avoiding port and address translations whenever possible
+ * - mirroring TCP dynamics by observation of socket parameters (TCP_INFO
+ * socket option) and TCP headers of packets coming from the tap interface,
+ * reapplying those parameters in both flow directions (including TCP_MSS,
+ * TCP_WINDOW_CLAMP socket options)
+ * - simplicity: only a small subset of TCP logic is implemented here and
+ * delegated as much as possible to the TCP implementations of guest and host
+ * kernel. This is achieved by:
+ * - avoiding a complete TCP stack reimplementation, with a modified TCP state
+ * machine focused on the translation of observed states instead
+ * - mirroring TCP dynamics as described above and hence avoiding the need for
+ * segmentation, explicit queueing, and reassembly of segments
+ * - security:
+ * - no dynamic memory allocation is performed
+ * - TODO: synflood protection
+ * - TODO: sequence collision attacks
+ *
+ * Portability is limited by usage of Linux-specific socket options.
+ *
+ *
+ * Limits
+ * ------
+ *
+ * To avoid the need for dynamic memory allocation, a maximum, reasonable amount
+ * of connections is defined by MAX_CONNS below (currently 256k, close to
+ * the maximum amount of file descriptors typically available to a process on
+ * Linux).
+ *
+ * While fragmentation and reassembly are not implemented, tracking of missing
+ * segments and retransmissions needs to be, thus data needs to linger on
+ * sockets as long as it's not acknowledged by the guest, and read using
+ * MSG_PEEK into a single, preallocated static buffer sized to the maximum
+ * supported window, 64MiB. This imposes a practical limitation on window
+ * scaling, that is, the maximum factor is 1024. If a bigger window scaling
+ * factor is observed during connection establishment, connection is reset and
+ * reestablished by omitting the scaling factor in the SYN segment. This
+ * limitation only applies to the window scaling advertised by the guest, but
+ * if exceeded, no window scaling will be allowed at all toward either endpoint.
+ *
+ *
+ * Ports
+ * -----
+ *
+ * To avoid the need for ad-hoc configuration of port forwarding or allowed
+ * ports, listening sockets are opened and bound to all unbound ports on the
+ * host, as far as process capabilities allow. This service needs to be started
+ * after any application proxy that needs to bind to local ports.
+ *
+ * No port translation is needed for connections initiated remotely or by the
+ * local host: source port from socket is reused while establishing connections
+ * to the guest.
+ *
+ * For connections initiated by the guest, it's not possible to force the same
+ * source port as connections are established by the host kernel: that's the
+ * only port translation needed.
+ *
+ *
+ * Connection tracking and storage
+ * -------------------------------
+ *
+ * Connections are tracked by the @tc array of struct tcp_conn, containing
+ * addresses, ports, TCP states and parameters. This is statically allocated and
+ * indices are the file descriptor numbers associated to inbound or outbound
+ * sockets.
+ *
+ * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for
+ * separate data structures depending on the protocol version.
+ *
+ * - Inbound connection requests (to the guest) are mapped using the triple
+ * < source IP address, source port, destination port >
+ * - Outbound connection requests (from the guest) are mapped using the triple
+ * < destination IP address, destination port, source port >
+ * where the source port is the one used by the guest, not the one used by the
+ * corresponding host socket
+ *
+ *
+ * Initialisation
+ * --------------
+ *
+ * Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for
+ * IPv4 and IPv6) are opened and bound to wildcard addresses. Some will fail to
+ * bind (for low ports, or ports already bound, e.g. by a proxy). These are
+ * added to the epoll list, with no separate storage.
+ *
+ *
+ * States and events
+ * -----------------
+ *
+ * These states apply to connected sockets only, listening sockets are always
+ * open after initialisation, in LISTEN state. A single state is maintained for
+ * both sides of the connection, and most states are omitted as they are already
+ * handled by host kernel and guest.
+ *
+ * - CLOSED no connection
+ * No associated events: this is always a final state, new connections
+ * directly start from TAP_SYN_SENT or SOCK_SYN_SENT described below.
+ *
+ * - TAP_SYN_SENT connect() in progress, triggered from tap
+ * - connect() completes SYN,ACK to tap > TAP_SYN_RCVD
+ * - connect() aborts RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
+ *
+ * - SOCK_SYN_SENT new connected socket, SYN sent to tap
+ * - SYN,ACK from tap ACK to tap > ESTABLISHED
+ * - SYN,ACK timeout RST to tap, close socket > CLOSED
+ * - socket error RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
+ *
+ * - TAP_SYN_RCVD connect() completed, SYN,ACK sent to tap
+ * - ACK from tap > ESTABLISHED
+ * - ACK timeout RST to tap, close socket > CLOSED
+ * - socket error RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
+ *
+ * - ESTABLISHED connection established, ready for data
+ * - zero-sized socket read FIN to tap > ESTABLISHED_SOCK_FIN
+ * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN
+ * - socket error RST to tap, close socket > CLOSED
+ * - FIN from tap FIN,ACK to tap, close socket > FIN_WAIT_1
+ * - RST from tap close socket > CLOSED
+ *
+ * - ESTABLISHED_SOCK_FIN socket wants to close connection, data allowed
+ * - ACK from tap > CLOSE_WAIT
+ * - ACK timeout RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
+ *
+ * - CLOSE_WAIT socket wants to close connection, seen by tap
+ * - socket error RST to tap, close socket > CLOSED
+ * - FIN from tap ACK to tap, close socket > LAST_ACK
+ * - FIN timeout RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
+ *
+ * - LAST_ACK socket started close, tap completed it
+ * - anything from socket close socket > CLOSED
+ * - socket error RST to tap, close socket > CLOSED
+ * - ACK timeout RST to tap, close socket > CLOSED
+ *
+ * - FIN_WAIT_1 tap wants to close connection, _FIN,ACK sent_
+ * - ACK from tap close socket > CLOSED
+ * - socket error RST to tap, close socket > CLOSED
+ * - ACK timeout RST to tap, close socket > CLOSED
+ *
+ *
+ * Connection setup
+ * ----------------
+ *
+ * - inbound connection (from socket to guest): on accept() from listening
+ * socket, the new socket is mapped in connection tracking table, and
+ * three-way handshake initiated towards the guest, advertising MSS and window
+ * size and scaling from socket parameters
+ * - outbound connection (from guest to socket): on SYN segment from guest, a
+ * new socket is created and mapped in connection tracking table, setting
+ * MSS and window clamping from header and option of the observed SYN segment
+ *
+ *
+ * Aging and timeout
+ * -----------------
+ *
+ * Two bitmaps of MAX_CONNS bits indicate which connections need scheduled
+ * actions:
+ * - @tcp_act_fast is used to send ACK segments to the tap once TCP_INFO reports
+ * an increased number of acknowledged bytes sent on a socket, and examined
+ * every 20ms (one tenth of current TCP_DELACK_MAX on Linux): for each marked
+ * connection, a TCP_INFO query is performed and ACK segments are sent right
+ * away as needed
+ * - @tcp_act_slow is used for state and retransmission timeouts, and examined
+ * every 2s: for each marked connection with an expired @timeout timestamp
+ * specific actions are taken depending on the connection state:
+ * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment
+ * from tap expires, connection is reset (RST to tap, socket closed)
+ * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from
+ * tap expires, connection is reset (RST to tap, socket closed)
+ * - ESTABLISHED: after a timeout of 3s (TODO: implement requirements from
+ * RFC 6298) waiting for an ACK segment from tap expires, data from socket
+ * queue is retransmitted starting from the last ACK sequence
+ * - ESTABLISHED: after a two-hour (current TCP_KEEPALIVE_TIME on Linux)
+ * timeout waiting for any activity expires, connection is reset (RST to
+ * tap, socket closed)
+ * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK
+ * segment from tap expires, connection is reset (RST to tap, socket closed)
+ * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from
+ * tap expires, connection is reset (RST to tap, socket closed)
+ * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from
+ * socket expires, connection is reset (RST to tap, socket closed)
+ * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from
+ * tap expires, connection is reset (RST to tap, socket closed)
+ *
+ *
+ * Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states)
+ * ----------------------------------------------------------
+ *
+ * @seq_to_tap: next sequence for packets to tap
+ * @seq_ack_from_tap: last ACK number received from tap
+ * @seq_from_tap: next sequence for packets from tap (not actually sent)
+ * @seq_ack_to_tap: last ACK number sent to tap
+ *
+ * @seq_init_from_tap: initial sequence number from tap
+ *
+ * @tap_window: last window size received from tap, scaled
+ * @tcpi_acked_last: most recent value of tcpi_bytes_acked (TCP_INFO)
+ *
+ * - from socket to tap:
+ * - on new data from socket:
+ * - peek into buffer
+ * - send data to tap:
+ * - starting at offset (@seq_to_tap - @seq_ack_from_tap)
+ * - in MSS-sized segments
+ * - increasing @seq_to_tap at each segment
+ * - up to window (while @seq_to_tap - @seq_ack_from_tap <= @tap_window)
+ * - mark socket in bitmap for periodic ACK check, set @last_ts_tap
+ * - on read error, send RST to tap, close socket
+ * - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN
+ * - on ACK from tap:
+ * - check if it's the second duplicated ACK
+ * - consume buffer by difference between new ack_seq and @seq_ack_from_tap
+ * - update @seq_ack_from_tap from ack_seq in header
+ * - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and
+ * resend with steps listed above
+ * - set TCP_WINDOW_CLAMP from TCP header from tap
+ * - on @seq_ack_from_tap == @seq_to_tap, mark in bitmap, unmark otherwise
+ * - periodically:
+ * - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer
+ * (TODO: implement requirements from RFC 6298, currently 3s fixed) from
+ * @last_ts_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and
+ * resend data with the steps listed above
+ *
+ * - from tap to socket:
+ * - on packet from tap:
+ * - set TCP_WINDOW_CLAMP from TCP header from tap
+ * - check seq from header against @seq_from_tap, if data is missing, send
+ * two ACKs with number @seq_ack_to_tap, discard packet
+ * - otherwise queue data to socket, set @seq_from_tap to seq from header
+ * plus payload length
+ * - query socket for TCP_INFO, on tcpi_bytes_acked > @tcpi_acked_last,
+ * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap
+ * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
+ * send ACK to tap
+ * - set @last_ts_sock
+ * - on @seq_ack_to_tap < @seq_from_tap, mark socket for later ACK in bitmap
+ * - periodically:
+ * - if socket is marked in bitmap, query socket for TCP_INFO, on
+ * tcpi_bytes_acked > @tcpi_acked_last,
+ * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap
+ * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
+ * send ACK to tap
+ * - on @seq_ack_to_tap == @seq_from_tap, unmark socket from bitmap
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <limits.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <time.h>
+
+#include "passt.h"
+#include "tap.h"
+#include "util.h"
+
+/* Approximately maximum number of open descriptors per process */
+#define MAX_CONNS (256 * 1024)
+
+/* Highest window scaling shift accepted from the guest, and resulting maximum
+ * window (64MiB), which is also the size of the buffer used to peek at socket
+ * data. MSS_DEFAULT is used if the guest advertises no MSS, WINDOW_DEFAULT if
+ * no TCP_INFO data is available for a socket.
+ */
+#define MAX_WS 10
+#define MAX_WINDOW (1 << (16 + (MAX_WS)))
+#define MSS_DEFAULT 536
+#define WINDOW_DEFAULT 4380
+
+/* Timeouts, all in milliseconds */
+#define SYN_TIMEOUT 240000 /* ms */
+#define ACK_TIMEOUT 3000
+#define ACT_TIMEOUT 7200000
+#define FIN_TIMEOUT 240000
+#define LAST_ACK_TIMEOUT 240000
+
+/* NOTE(review): presumably the interval, in milliseconds, for checks on the
+ * tcp_act_fast bitmap, confirm against the timer handling code
+ */
+#define SOCK_ACK_INTERVAL 20
+
+/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
+ * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
+ */
+#define SOL_TCP IPPROTO_TCP
+
+/* Single buffer for data peeked from sockets, sized to the maximum window */
+static char tcp_in_buf[MAX_WINDOW];
+
+/* Bitmaps with one bit per connection, indexed by file descriptor number */
+static uint8_t tcp_act_fast[MAX_CONNS / 8] = { 0 };
+static uint8_t tcp_act_slow[MAX_CONNS / 8] = { 0 };
+
+/* Connection states: a single state is kept for both sides of a connection.
+ * CLOSED is zero, so that zeroed entries of the connection table are unused,
+ * and states from ESTABLISHED_SOCK_FIN onwards are compared by value to tell
+ * if closing was already started.
+ */
+enum tcp_state {
+ CLOSED = 0,
+ TAP_SYN_SENT,
+ SOCK_SYN_SENT,
+ TAP_SYN_RCVD,
+ ESTABLISHED,
+ ESTABLISHED_SOCK_FIN,
+ CLOSE_WAIT,
+ LAST_ACK,
+ FIN_WAIT_1,
+};
+
+/* Flags for tcp_send_to_tap(), same bit positions as in the TCP header */
+#define FIN (1 << 0)
+#define SYN (1 << 1)
+#define RST (1 << 2)
+#define ACK (1 << 4)
+
+/* TCP option kinds */
+#define OPT_EOL 0
+#define OPT_NOP 1
+#define OPT_MSS 2
+#define OPT_WS 3
+#define OPT_SACKP 4
+#define OPT_SACK 5
+#define OPT_TS 8
+
+/**
+ * struct tcp_conn - Descriptor for a TCP connection
+ * @a.a6: IPv6 remote address, can be IPv4-mapped
+ * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20
+ * @a.a4.one: Ones prefix for IPv4-mapped
+ * @a.a4.a: IPv4 address
+ * @tap_port: Guest-facing tap port, network order
+ * @sock_port: Remote, socket-facing port, network order
+ * @s: TCP connection state
+ * @seq_to_tap: Next sequence for packets to tap
+ * @seq_ack_from_tap: Last ACK number received from tap
+ * @seq_from_tap: Next sequence for packets from tap (not actually sent)
+ * @seq_ack_to_tap: Last ACK number sent to tap
+ * @seq_init_from_tap: Initial sequence number from tap
+ * @tcpi_acked_last: Most recent value of tcpi_bytes_acked (TCP_INFO query)
+ * @dup_acks: Count of currently duplicated ACKs from tap
+ * @ws_allowed: Window scaling allowed
+ * @ws: Window scaling factor, as shift count
+ * @tap_window: Last window size received from tap, scaled
+ * @last_ts_sock: Last activity timestamp from socket for timeout purposes
+ * @last_ts_tap: Last activity timestamp from tap for timeout purposes
+ * @mss_guest: Maximum segment size advertised by guest
+ */
+struct tcp_conn {
+ /* Both views alias the same 16 bytes: a4 lays out an IPv4-mapped address */
+ union {
+ struct in6_addr a6;
+ struct {
+ uint8_t zero[10];
+ uint8_t one[2];
+ struct in_addr a;
+ } a4;
+ } a;
+ in_port_t tap_port;
+ in_port_t sock_port;
+ enum tcp_state s;
+
+ uint32_t seq_to_tap;
+ uint32_t seq_ack_from_tap;
+ uint32_t seq_from_tap;
+ uint32_t seq_ack_to_tap;
+ uint32_t seq_init_from_tap;
+ uint64_t tcpi_acked_last;
+ int dup_acks;
+
+ int ws_allowed;
+ int ws;
+ int tap_window;
+
+ struct timespec last_ts_sock;
+ struct timespec last_ts_tap;
+
+ int mss_guest;
+};
+
+/* Connection table, indexed by socket file descriptor number */
+static struct tcp_conn tc[MAX_CONNS];
+
+/* Forward declaration: needed by tcp_rst(), defined before the function */
+static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len);
+
+/**
+ * tcp_act_fast_set() - Mark socket in bitmap for "fast" timeout events
+ * @s:		Socket file descriptor number, used as bit index
+ */
+static void tcp_act_fast_set(int s)
+{
+	uint8_t *byte = &tcp_act_fast[s / 8];
+
+	*byte |= 1 << (s % 8);
+}
+
+/**
+ * tcp_act_fast_clear() - Unmark socket in bitmap for "fast" timeout events
+ * @s:		Socket file descriptor number, used as bit index
+ */
+static void tcp_act_fast_clear(int s)
+{
+	uint8_t *byte = &tcp_act_fast[s / 8];
+
+	*byte &= ~(1 << (s % 8));
+}
+
+/**
+ * tcp_act_slow_set() - Mark socket in bitmap for "slow" timeout events
+ * @s:		Socket file descriptor number, used as bit index
+ */
+static void tcp_act_slow_set(int s)
+{
+	uint8_t *byte = &tcp_act_slow[s / 8];
+
+	*byte |= 1 << (s % 8);
+}
+
+/**
+ * tcp_act_slow_clear() - Unmark socket in bitmap for "slow" timeout events
+ * @s:		Socket file descriptor number, used as bit index
+ */
+static void tcp_act_slow_clear(int s)
+{
+	uint8_t *byte = &tcp_act_slow[s / 8];
+
+	*byte &= ~(1 << (s % 8));
+}
+
+/**
+ * tcp_opt_get() - Get option, and value if any, from TCP header
+ * @th:		Pointer to TCP header
+ * @len:	Length of buffer, including TCP header
+ * @type:	Option type to look for
+ * @optlen:	Optional, filled with option length if passed
+ * @value:	Currently unused. NOTE(review): this pointer is passed by
+ *		value, so it can't be set for the caller without changing the
+ *		prototype
+ *
+ * Return: Option value, meaningful for up to 4 bytes, -1 if not found or if
+ *	   options are malformed
+ */
+static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type,
+		       uint8_t *optlen, void *value)
+{
+	unsigned int hdr_len = th->doff * 4;
+	uint8_t *p, opt_type, opt_len;
+	uint16_t v16;
+	uint32_t v32;
+
+	(void)value;
+
+	/* Unsigned arithmetic below would wrap on truncated or bogus headers */
+	if (len < sizeof(*th) || hdr_len < sizeof(*th))
+		return -1;
+
+	/* Options can't extend past the data offset */
+	if (len > hdr_len)
+		len = hdr_len;
+
+	len -= sizeof(*th);
+	p = (uint8_t *)(th + 1);
+
+	while (len >= 2) {
+		switch (*p) {
+		case OPT_EOL:
+			return -1;
+		case OPT_NOP:
+			p++;
+			len--;
+			break;
+		default:
+			opt_type = *(p++);
+			opt_len = *(p++);
+			len -= 2;
+
+			/* Option length includes type and length bytes: less
+			 * than two would move us backwards (forever, for zero),
+			 * more than what's left would read past the buffer.
+			 */
+			if (opt_len < 2 || opt_len - 2U > len)
+				return -1;
+
+			if (type == opt_type) {
+				if (optlen)
+					*optlen = opt_len;
+
+				/* Values are not necessarily aligned: copy */
+				switch (opt_len - 2) {
+				case 0:
+					return 0;
+				case 1:
+					return *p;
+				case 2:
+					memcpy(&v16, p, sizeof(v16));
+					return ntohs(v16);
+				case 3:
+					return p[0] << 16 | p[1] << 8 | p[2];
+				default:
+					memcpy(&v32, p, sizeof(v32));
+					return ntohl(v32);
+				}
+			}
+
+			p += opt_len - 2;
+			len -= opt_len - 2;
+		}
+	}
+
+	return -1;
+}
+
+/**
+ * tcp_close_and_epoll_del() - Drop socket from epoll set and bitmaps, close it
+ * @c:		Execution context
+ * @s:		File descriptor number for socket
+ */
+static void tcp_close_and_epoll_del(struct ctx *c, int s)
+{
+	int epfd = c->epollfd;
+
+	/* No scheduled actions are needed for a socket that's going away */
+	tcp_act_slow_clear(s);
+	tcp_act_fast_clear(s);
+
+	epoll_ctl(epfd, EPOLL_CTL_DEL, s, NULL);
+	close(s);
+}
+
+/**
+ * tcp_rst() - Reset a connection: RST segment to tap, socket closed
+ * @c:		Execution context
+ * @s:		File descriptor number for socket, negative values are ignored
+ */
+static void tcp_rst(struct ctx *c, int s)
+{
+	if (s >= 0) {
+		tcp_send_to_tap(c, s, RST, NULL, 0);
+		tcp_close_and_epoll_del(c, s);
+		tc[s].s = CLOSED;
+	}
+}
+
+/**
+ * tcp_send_to_tap() - Send segment to tap, with options and values from socket
+ * @c:		Execution context
+ * @s:		File descriptor number for socket
+ * @flags:	TCP flags to set
+ * @in:		Payload to send, can be NULL if @len is zero
+ * @len:	Payload length
+ *
+ * Return: -1 on error with connection reset, 0 otherwise
+ */
+static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
+{
+	char buf[USHRT_MAX] = { 0 }, *data;
+	struct tcp_info info = { 0 };
+	socklen_t sl = sizeof(info);
+	int ws = 0, have_info = 1;
+	struct tcphdr *th;
+	uint16_t mss;
+
+	/* Payload needs to fit in buf together with the header and with the
+	 * options set below, up to 8 bytes
+	 */
+	if (len < 0 || (size_t)len > sizeof(buf) - sizeof(*th) - 8) {
+		if (!(flags & RST))
+			tcp_rst(c, s);
+		return -1;
+	}
+
+	if (getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) {
+		/* A RST segment can still be sent without socket information */
+		if (!(flags & RST)) {
+			tcp_rst(c, s);
+			return -1;
+		}
+
+		have_info = 0;
+	}
+
+	th = (struct tcphdr *)buf;
+	data = (char *)(th + 1);
+
+	if (flags & SYN && have_info) {
+		if (tc[s].ws_allowed)
+			ws = info.tcpi_snd_wscale;
+
+		/* Options: MSS, NOP and window scale if allowed (4-8 bytes) */
+		*data++ = OPT_MSS;
+		*data++ = 4;
+		mss = htons(info.tcpi_snd_mss);
+		memcpy(data, &mss, sizeof(mss));
+		data += 2;
+
+		if (ws) {
+			*data++ = OPT_NOP;
+
+			*data++ = OPT_WS;
+			*data++ = 3;
+			*data++ = ws;
+
+			th->doff = (20 + 8) / 4;
+		} else {
+			th->doff = (20 + 4) / 4;
+		}
+
+		/* SYN takes one sequence number */
+		th->seq = htonl(tc[s].seq_to_tap++);
+	} else {
+		th->doff = 20 / 4;
+
+		th->seq = htonl(tc[s].seq_to_tap);
+		tc[s].seq_to_tap += len;
+	}
+
+	if ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last || (flags & ACK) ||
+	     len) &&
+	    have_info) {
+		uint64_t ack_seq;
+
+		th->ack = 1;
+		/* info.tcpi_bytes_acked already includes one byte for SYN, but
+		 * not for incoming connections.
+		 */
+		ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap;
+		if (!info.tcpi_bytes_acked)
+			ack_seq++;
+		ack_seq &= (uint32_t)~0U;
+
+		tc[s].seq_ack_to_tap = ack_seq;
+		th->ack_seq = htonl(tc[s].seq_ack_to_tap);
+
+		tc[s].tcpi_acked_last = info.tcpi_bytes_acked;
+	} else {
+		/* Nothing to acknowledge, no flags, no data: nothing to send */
+		if (!len && !flags)
+			return 0;
+
+		th->ack = th->ack_seq = 0;
+	}
+
+	th->rst = !!(flags & RST);
+	th->syn = !!(flags & SYN);
+	th->fin = !!(flags & FIN);
+
+	th->source = tc[s].sock_port;
+	th->dest = tc[s].tap_port;
+
+	/* Header fields are in network order, the fallback value as well */
+	if (have_info)
+		th->window = htons(info.tcpi_snd_wnd >> info.tcpi_snd_wscale);
+	else
+		th->window = htons(WINDOW_DEFAULT);
+
+	th->urg_ptr = 0;
+	th->check = 0;
+
+	/* @in can be NULL for segments without payload */
+	if (len)
+		memcpy(data, in, len);
+
+	tap_ip_send(c, &tc[s].a.a6, IPPROTO_TCP, buf, th->doff * 4 + len);
+
+	return 0;
+}
+
+/**
+ * tcp_clamp_window() - Track window and scaling from tap, clamp it on socket
+ * @s:		File descriptor number for socket
+ * @th:		TCP header, from tap
+ * @len:	Buffer length, at L4
+ */
+static void tcp_clamp_window(int s, struct tcphdr *th, int len)
+{
+	struct tcp_conn *conn = &tc[s];
+	int scale;
+
+	if (conn->tap_window) {
+		/* Window already seen once: scale it and clamp on socket */
+		conn->tap_window = ntohs(th->window) << conn->ws;
+		setsockopt(s, SOL_TCP, TCP_WINDOW_CLAMP,
+			   &conn->tap_window, sizeof(conn->tap_window));
+		return;
+	}
+
+	scale = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
+	conn->ws_allowed = scale >= 0 && scale <= MAX_WS;
+	conn->ws = conn->ws_allowed ? scale : 0;
+
+	/* First value is not scaled. Also, don't clamp yet, to avoid getting a
+	 * zero scale just because we set a small window now.
+	 */
+	conn->tap_window = ntohs(th->window);
+}
+
+/**
+ * tcp_conn_from_tap() - Handle connection request (SYN segment) from tap
+ * @c:		Execution context
+ * @af:		Address family, AF_INET or AF_INET6
+ * @addr:	Remote address, pointer to sin_addr or sin6_addr
+ * @th:		TCP header from tap
+ * @len:	Packet length at L4
+ */
+static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
+			      struct tcphdr *th, size_t len)
+{
+	struct sockaddr_in addr4 = {
+		.sin_family = AF_INET,
+		.sin_port = th->dest,
+	};
+	struct sockaddr_in6 addr6 = {
+		.sin6_family = AF_INET6,
+		.sin6_port = th->dest,
+	};
+	struct epoll_event ev = { 0 };
+	const struct sockaddr *sa;
+	socklen_t sl;
+	int s;
+
+	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+	if (s < 0)
+		return;
+
+	/* Connection table and bitmaps are indexed by descriptor number */
+	if (s >= MAX_CONNS) {
+		close(s);
+		return;
+	}
+
+	/* Descriptor numbers are reused: don't inherit window, sequences and
+	 * counters from a previous connection that used the same number
+	 */
+	memset(&tc[s], 0, sizeof(tc[s]));
+	clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock);
+	clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
+
+	tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
+	if (tc[s].mss_guest < 0)
+		tc[s].mss_guest = MSS_DEFAULT;
+	sl = sizeof(tc[s].mss_guest);
+	setsockopt(s, SOL_TCP, TCP_MAXSEG, &tc[s].mss_guest, sl);
+
+	tcp_clamp_window(s, th, len);
+
+	/* Only copy as many bytes as @addr holds for the given family */
+	if (af == AF_INET) {
+		memcpy(&addr4.sin_addr, addr, sizeof(addr4.sin_addr));
+		sa = (const struct sockaddr *)&addr4;
+		sl = sizeof(addr4);
+
+		memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
+		memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
+		memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a));
+	} else {
+		memcpy(&addr6.sin6_addr, addr, sizeof(addr6.sin6_addr));
+		sa = (const struct sockaddr *)&addr6;
+		sl = sizeof(addr6);
+
+		memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6));
+	}
+
+	tc[s].sock_port = th->dest;
+	tc[s].tap_port = th->source;
+
+	ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
+	ev.data.fd = s;
+
+	/* SYN takes one sequence number */
+	tc[s].seq_init_from_tap = ntohl(th->seq);
+	tc[s].seq_from_tap = tc[s].seq_init_from_tap + 1;
+	tc[s].seq_ack_to_tap = tc[s].seq_from_tap;
+
+	/* TODO: RFC 6528 with SipHash, worth it? */
+	tc[s].seq_ack_from_tap = tc[s].seq_to_tap = 0;
+
+	if (connect(s, sa, sl)) {
+		if (errno != EINPROGRESS) {
+			tcp_rst(c, s);
+			return;
+		}
+
+		/* Completion of connect() is reported as EPOLLOUT */
+		ev.events |= EPOLLOUT;
+		tc[s].s = TAP_SYN_SENT;
+	} else {
+		if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0))
+			return;
+
+		tc[s].s = TAP_SYN_RCVD;
+	}
+
+	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
+
+	return;
+}
+
+/**
+ * tcp_sock_lookup() - Find socket for given remote address and pair of ports
+ * @af:		Address family, AF_INET or AF_INET6
+ * @addr:	Remote address, pointer to sin_addr or sin6_addr
+ * @tap_port:	tap-facing port
+ * @sock_port:	Socket-facing port
+ *
+ * Return: file descriptor number for socket, if found, -1 otherwise
+ */
+static int tcp_sock_lookup(int af, void *addr,
+			   in_port_t tap_port, in_port_t sock_port)
+{
+	struct tcp_conn *conn;
+	int fd;
+
+	/* TODO: hash table and lookup. This is just a dummy implementation. */
+	for (fd = 0; fd < MAX_CONNS; fd++) {
+		conn = &tc[fd];
+
+		/* Entries in CLOSED state and port mismatches never match */
+		if (!conn->s)
+			continue;
+
+		if (conn->tap_port != tap_port || conn->sock_port != sock_port)
+			continue;
+
+		if (af == AF_INET) {
+			if (IN6_IS_ADDR_V4MAPPED(&conn->a.a6) &&
+			    !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)))
+				return fd;
+		} else if (af == AF_INET6) {
+			if (!memcmp(&conn->a.a6, addr, sizeof(conn->a.a6)))
+				return fd;
+		}
+	}
+
+	return -1;
+}
+
+/**
+ * tcp_conn_from_sock() - Handle new connection request from listening socket
+ * @c:		Execution context
+ * @fd:		File descriptor number for listening socket
+ */
+static void tcp_conn_from_sock(struct ctx *c, int fd)
+{
+	struct sockaddr_storage sa_r, sa_l;
+	struct epoll_event ev = { 0 };
+	struct sockaddr_in6 *sa6;
+	struct sockaddr_in *sa4;
+	socklen_t sa_len;
+	int s;
+
+	sa_len = sizeof(sa_l);
+	if (getsockname(fd, (struct sockaddr *)&sa_l, &sa_len))
+		return;
+
+	/* getsockname() updated the length: start again from the full size */
+	sa_len = sizeof(sa_r);
+	s = accept4(fd, (struct sockaddr *)&sa_r, &sa_len, SOCK_NONBLOCK);
+	if (s == -1)
+		return;
+
+	/* Connection table and bitmaps are indexed by descriptor number, and
+	 * an entry can only be filled for families we know about
+	 */
+	if (s >= MAX_CONNS ||
+	    (sa_l.ss_family != AF_INET && sa_l.ss_family != AF_INET6)) {
+		close(s);
+		return;
+	}
+
+	/* Descriptor numbers are reused: don't inherit window, sequences and
+	 * counters from a previous connection that used the same number
+	 */
+	memset(&tc[s], 0, sizeof(tc[s]));
+
+	if (sa_l.ss_family == AF_INET) {
+		sa4 = (struct sockaddr_in *)&sa_r;
+
+		memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
+		memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
+		memcpy(&tc[s].a.a4.a, &sa4->sin_addr, sizeof(tc[s].a.a4.a));
+
+		tc[s].sock_port = sa4->sin_port;
+
+		/* Local port of listening socket is reused towards the tap */
+		sa4 = (struct sockaddr_in *)&sa_l;
+		tc[s].tap_port = sa4->sin_port;
+	} else {
+		sa6 = (struct sockaddr_in6 *)&sa_r;
+
+		memcpy(&tc[s].a.a6, &sa6->sin6_addr, sizeof(tc[s].a.a6));
+
+		tc[s].sock_port = sa6->sin6_port;
+
+		sa6 = (struct sockaddr_in6 *)&sa_l;
+		tc[s].tap_port = sa6->sin6_port;
+	}
+
+	/* TODO: RFC 6528 with SipHash, worth it? */
+	tc[s].seq_to_tap = 0;
+
+	tc[s].ws_allowed = 1;
+
+	clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock);
+	clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
+
+	ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
+	ev.data.fd = s;
+	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
+
+	tc[s].s = SOCK_SYN_SENT;
+	tcp_send_to_tap(c, s, SYN, NULL, 0);
+}
+
+/**
+ * tcp_send_to_sock() - Send buffer to socket, update timestamp and sequence
+ * @c:		Execution context
+ * @s:		File descriptor number for socket
+ * @seq:	Sequence number of first byte in @data, host order
+ * @data:	Data buffer
+ * @len:	Length at L4
+ * @extra_flags: Additional flags for send(), if any
+ *
+ * Return: -1 on socket error with connection reset, 0 otherwise
+ */
+static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len,
+			    int extra_flags)
+{
+	int sent = send(s, data, len, MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags);
+
+	if (sent < 0) {
+		if (errno == EAGAIN || errno == EWOULDBLOCK) {
+			/* If we can't queue right now, do nothing, sender has
+			 * to retransmit.
+			 */
+			return 0;
+		}
+
+		tcp_rst(c, s);
+		return -1;
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock);
+
+	/* A non-blocking send() might queue less than @len bytes: advance by
+	 * what was actually queued, sender has to retransmit the rest. Use
+	 * unsigned arithmetic, sequence numbers wrap around.
+	 */
+	tc[s].seq_from_tap = (uint32_t)seq + sent;
+
+	return 0;
+}
+
+/**
+ * tcp_check_dupack() - Check if given ACK number is duplicated, update counter
+ * @s:		File descriptor number for socket
+ * @ack_seq:	ACK sequence, host order
+ *
+ * Needs to be called before @seq_ack_from_tap is updated with @ack_seq.
+ *
+ * Return: 1 on two duplicated ACKs observed, with counter reset, 0 otherwise
+ */
+static int tcp_check_dupack(int s, uint32_t ack_seq)
+{
+	/* Only consecutive duplicates count: start over on any other number */
+	if (ack_seq != tc[s].seq_ack_from_tap) {
+		tc[s].dup_acks = 0;
+		return 0;
+	}
+
+	if (++tc[s].dup_acks < 2)
+		return 0;
+
+	tc[s].dup_acks = 0;
+	return 1;
+}
+
+/**
+ * tcp_sock_consume() - Discard acknowledged data from the socket buffer
+ * @s:		File descriptor number for socket
+ * @ack_seq:	ACK sequence, host order
+ *
+ * Drops from the socket receive queue the bytes between the previously
+ * acknowledged sequence and @ack_seq, then records @ack_seq as acknowledged.
+ *
+ * Return: -1 on invalid sequence, 0 otherwise
+ */
+static int tcp_sock_consume(int s, uint32_t ack_seq)
+{
+	/* Storing the difference in a signed int implicitly takes care of
+	 * wrap-arounds: an ACK behind the recorded one shows up as negative.
+	 */
+	int acked = ack_seq - tc[s].seq_ack_from_tap;
+
+	if (acked >= 0) {
+		/* MSG_TRUNC with a NULL buffer: discard, don't copy */
+		recv(s, NULL, acked, MSG_DONTWAIT | MSG_TRUNC);
+		tc[s].seq_ack_from_tap = ack_seq;
+
+		return 0;
+	}
+
+	return -1;
+}
+
+/**
+ * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
+ * @c:		Execution context
+ * @s:		File descriptor number for socket
+ *
+ * Data is only peeked from the socket: it stays queued there until the guest
+ * acknowledges it. Segments of at most mss_guest bytes are sent to the tap,
+ * starting from what was already sent but not acknowledged yet, as long as
+ * they fit in the window advertised by the guest.
+ *
+ * Return: non-zero on socket error or pending data, 0 otherwise
+ */
+static int tcp_data_from_sock(struct ctx *c, int s)
+{
+	int len, offset, left, chunk;
+
+	/* Don't dequeue until acknowledged by guest */
+	len = recv(s, tcp_in_buf, sizeof(tcp_in_buf), MSG_DONTWAIT | MSG_PEEK);
+	if (len < 0) {
+		if (errno != EAGAIN && errno != EWOULDBLOCK)
+			tcp_rst(c, s);
+		return 1;
+	}
+
+	if (len == 0) {
+		/* Socket side closed: send FIN to the guest, once */
+		if (tc[s].s >= ESTABLISHED_SOCK_FIN)
+			return 0;
+
+		tc[s].s = ESTABLISHED_SOCK_FIN;
+		if (tcp_send_to_tap(c, s, FIN | ACK, NULL, 0))
+			return 0;
+
+		left = 0;
+		goto out;
+	}
+
+	/* Bytes already sent to the guest but not acknowledged yet are still
+	 * at the head of the peeked buffer: skip them.
+	 */
+	offset = tc[s].seq_to_tap - tc[s].seq_ack_from_tap;
+	if (offset < 0 || offset > len) {
+		/* Sequences out of sync with the socket queue: sending would
+		 * read outside the peeked data, so send nothing.
+		 */
+		left = 0;
+		goto out;
+	}
+
+	left = len - offset;
+	while (left > 0 && offset + tc[s].mss_guest <= tc[s].tap_window) {
+		if (left < tc[s].mss_guest)
+			chunk = left;
+		else
+			chunk = tc[s].mss_guest;
+
+		if (tcp_send_to_tap(c, s, 0, tcp_in_buf + offset, chunk))
+			return 0;
+
+		offset += chunk;
+		left -= chunk;
+	}
+
+out:
+	clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
+	tcp_act_slow_set(s);
+
+	return !!left;
+}
+
+/**
+ * tcp_tap_handler() - Handle packets from tap and state transitions
+ * @c:		Execution context
+ * @af:		Address family, AF_INET or AF_INET6
+ * @addr:	IP address from the packet, passed on to socket lookup and to
+ *		connection setup (NOTE(review): presumably the destination
+ *		address, confirm against caller)
+ * @in:		Input buffer
+ * @len:	Length, including TCP header
+ */
+void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
+{
+	struct tcphdr *th = (struct tcphdr *)in;
+	size_t off;
+	int s, ws;
+
+	if (len < sizeof(*th))
+		return;
+
+	/* Data offset is given in 32-bit words: it needs to cover at least the
+	 * fixed header, and can't exceed the segment itself.
+	 */
+	off = th->doff * 4;
+	if (off < sizeof(*th) || off > len)
+		return;
+
+	s = tcp_sock_lookup(af, addr, th->source, th->dest);
+
+	if (s < 0) {
+		/* No connection yet: only a SYN can create one */
+		if (th->syn)
+			tcp_conn_from_tap(c, af, addr, th, len);
+		return;
+	}
+
+	if (th->rst) {
+		tcp_close_and_epoll_del(c, s);
+		return;
+	}
+
+	tcp_clamp_window(s, th, len);
+
+	if (th->ack)
+		clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
+
+	switch (tc[s].s) {
+	case SOCK_SYN_SENT:
+		if (!th->syn || !th->ack)
+			return;
+
+		tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
+		if (tc[s].mss_guest < 0)
+			tc[s].mss_guest = MSS_DEFAULT;
+
+		ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
+		if (ws > MAX_WS) {
+			/* Unusable window scaling: reset, then retry the
+			 * handshake with window scaling disallowed.
+			 */
+			if (tcp_send_to_tap(c, s, RST, NULL, 0))
+				return;
+
+			tc[s].seq_to_tap = 0;
+			tc[s].ws_allowed = 0;
+			tcp_send_to_tap(c, s, SYN, NULL, 0);
+			return;
+		}
+
+		tc[s].seq_from_tap = tc[s].seq_init_from_tap = ntohl(th->seq);
+		tc[s].seq_ack_to_tap = tc[s].seq_from_tap;
+
+		tc[s].s = ESTABLISHED;
+		tcp_send_to_tap(c, s, ACK, NULL, 0);
+		break;
+	case TAP_SYN_SENT:
+		break;
+	case TAP_SYN_RCVD:
+		if (th->fin) {
+			shutdown(s, SHUT_WR);
+			tc[s].s = FIN_WAIT_1;
+
+			break;
+		}
+
+		if (!th->ack) {
+			tcp_rst(c, s);
+			return;
+		}
+
+		tc[s].seq_ack_from_tap = ntohl(th->ack_seq);
+
+		tc[s].s = ESTABLISHED;
+		break;
+	case ESTABLISHED:
+		if (th->ack) {
+			int retrans = 0;
+
+			/* Only segments without payload count as duplicate
+			 * ACKs: compare lengths in bytes, and pass the ACK
+			 * number in host order, as tcp_check_dupack() compares
+			 * it against a host-order sequence.
+			 */
+			if (len == off)
+				retrans = tcp_check_dupack(s,
+							   ntohl(th->ack_seq));
+
+			if (tcp_sock_consume(s, ntohl(th->ack_seq))) {
+				tcp_rst(c, s);
+				return;
+			}
+
+			if (retrans) {
+				/* Rewind to last acknowledged byte, resend */
+				tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
+				tcp_data_from_sock(c, s);
+			}
+		}
+
+		if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off,
+				     th->psh ? 0 : MSG_MORE))
+			break;
+
+		if (th->fin) {
+			shutdown(s, SHUT_WR);
+			tc[s].s = FIN_WAIT_1;
+		}
+
+		break;
+	case ESTABLISHED_SOCK_FIN:
+		if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off,
+				     th->psh ? 0 : MSG_MORE) < 0)
+			break;
+
+		if (th->ack) {
+			shutdown(s, SHUT_RD);
+			if (!tcp_data_from_sock(c, s))
+				tc[s].s = CLOSE_WAIT;
+
+			if (tcp_sock_consume(s, ntohl(th->ack_seq))) {
+				tcp_rst(c, s);
+				return;
+			}
+		}
+
+		break;
+
+	case CLOSE_WAIT:
+		if (tcp_sock_consume(s, ntohl(th->ack_seq))) {
+			tcp_rst(c, s);
+			return;
+		}
+
+		if (th->fin) {
+			shutdown(s, SHUT_WR);
+			tc[s].s = LAST_ACK;
+		}
+
+		break;
+	case FIN_WAIT_1:
+	case LAST_ACK:
+	case CLOSED:	/* ;) */
+		break;
+	}
+
+	/* Data sent to the guest and not acknowledged yet: keep the connection
+	 * in the "slow" timer bitmap.
+	 */
+	if (tc[s].seq_to_tap > tc[s].seq_ack_from_tap)
+		tcp_act_slow_set(s);
+	else
+		tcp_act_slow_clear(s);
+
+	/* Data from the guest we didn't acknowledge yet: keep the connection
+	 * in the "fast" timer bitmap.
+	 */
+	if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap)
+		tcp_act_fast_set(s);
+	else
+		tcp_act_fast_clear(s);
+}
+
+/**
+ * tcp_connect_finish() - Complete a non-blocking connect() on EPOLLOUT
+ * @c:		Execution context
+ * @s:		File descriptor number for socket
+ *
+ * If the socket reports a pending error, the connection is reset. Otherwise,
+ * SYN, ACK is sent to the tap, the socket is switched to input events, and the
+ * connection moves to TAP_SYN_RCVD.
+ */
+static void tcp_connect_finish(struct ctx *c, int s)
+{
+	struct epoll_event ev = { 0 };
+	int sock_err;
+	socklen_t optlen = sizeof(sock_err);
+
+	if (getsockopt(s, SOL_SOCKET, SO_ERROR, &sock_err, &optlen) ||
+	    sock_err) {
+		tcp_rst(c, s);
+		return;
+	}
+
+	if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0) >= 0) {
+		/* Connected: from now on we only care about input events */
+		ev.data.fd = s;
+		ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR |
+			    EPOLLHUP;
+		epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev);
+
+		tc[s].s = TAP_SYN_RCVD;
+	}
+}
+
+/**
+ * tcp_sock_handler() - Handle new data from socket
+ * @c:		Execution context
+ * @s:		File descriptor number for socket
+ * @events:	epoll events bitmap
+ *
+ * Dispatches on socket type and connection state: listening sockets accept a
+ * new connection, EPOLLOUT completes a pending connect(), established
+ * connections forward data to the tap, and hang-up events trigger closing.
+ */
+void tcp_sock_handler(struct ctx *c, int s, uint32_t events)
+{
+	socklen_t sl;
+	int so;
+
+	if (tc[s].s == LAST_ACK) {
+		tcp_close_and_epoll_del(c, s);
+		return;
+	}
+
+	sl = sizeof(so);
+	if ((events & EPOLLERR) ||
+	    getsockopt(s, SOL_SOCKET, SO_ACCEPTCONN, &so, &sl)) {
+		if (tc[s].s != CLOSED)
+			tcp_rst(c, s);
+		return;
+	}
+
+	if (so) {
+		/* Listening socket: new inbound connection */
+		tcp_conn_from_sock(c, s);
+		return;
+	}
+
+	if (events & EPOLLOUT) {	/* Implies TAP_SYN_SENT */
+		tcp_connect_finish(c, s);
+		return;
+	}
+
+	if (tc[s].s == ESTABLISHED)
+		tcp_data_from_sock(c, s);
+
+	if (events & EPOLLRDHUP || events & EPOLLHUP) {
+		if (tc[s].s == ESTABLISHED)
+			tc[s].s = ESTABLISHED_SOCK_FIN;
+
+		tcp_send_to_tap(c, s, FIN | ACK, NULL, 0);
+
+		if (tc[s].s == FIN_WAIT_1) {
+			shutdown(s, SHUT_RD);
+
+			/* seq_ack_from_tap is stored in host order already:
+			 * converting it again would corrupt the sequence.
+			 */
+			if (tcp_sock_consume(s, tc[s].seq_ack_from_tap)) {
+				tcp_rst(c, s);
+				return;
+			}
+
+			tcp_close_and_epoll_del(c, s);
+			tc[s].s = CLOSED;
+		}
+	}
+}
+
+/**
+ * tcp_sock_init() - Create and bind listening sockets for inbound connections
+ * @c:		Execution context
+ *
+ * One socket per enabled IP version is requested for every port below
+ * 2^15 + 2^14.
+ *
+ * Return: 0 on success, -1 on failure
+ */
+int tcp_sock_init(struct ctx *c)
+{
+	const int port_end = (1 << 15) + (1 << 14);
+	in_port_t port = 0;
+
+	while (port < port_end) {
+		if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, htons(port)) < 0)
+			return -1;
+
+		if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, htons(port)) < 0)
+			return -1;
+
+		port++;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_periodic_fast_one() - Handler for "fast" timeout events on one socket
+ * @c:		Execution context
+ * @s:		File descriptor number for socket
+ * @ts:		Timestamp from caller
+ *
+ * Once SOCK_ACK_INTERVAL has elapsed since the last socket activity, send a
+ * segment without flags or data to the tap and refresh the socket timestamp.
+ *
+ * Return: 0 if socket needs to be monitored further, non-zero otherwise
+ */
+int tcp_periodic_fast_one(struct ctx *c, int s, struct timespec *ts)
+{
+	int elapsed = timespec_diff_ms(ts, &tc[s].last_ts_sock);
+
+	if (elapsed >= SOCK_ACK_INTERVAL) {
+		tc[s].last_ts_sock = *ts;
+
+		tcp_send_to_tap(c, s, 0, NULL, 0);
+
+		/* Done once everything from the tap was acknowledged */
+		return tc[s].seq_from_tap == tc[s].seq_ack_to_tap;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_periodic_fast() - Handle sockets in "fast" event bitmap, clear as needed
+ * @c:		Execution context
+ *
+ * Walks the bitmap one long word at a time, calls the per-socket handler for
+ * every bit set, and clears the bits of sockets the handler is done with.
+ */
+void tcp_periodic_fast(struct ctx *c)
+{
+	long *words = (long *)tcp_act_fast;
+	struct timespec now;
+	unsigned int w;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+
+	for (w = 0; w < sizeof(tcp_act_fast) / sizeof(long); w++) {
+		/* Work on a snapshot of the word, bits are dropped from it as
+		 * they are visited.
+		 */
+		long pending = words[w];
+		int bit;
+
+		while ((bit = ffsl(pending))) {
+			unsigned long mask = 1UL << (bit - 1);
+			int s = w * sizeof(long) * 8 + bit - 1;
+
+			pending &= ~mask;
+
+			if (tcp_periodic_fast_one(c, s, &now))
+				words[w] &= ~mask;
+		}
+	}
+}
+
+/**
+ * tcp_periodic_slow_one() - Handler for "slow" timeout events on one socket
+ * @c: Execution context
+ * @s: File descriptor number for socket
+ * @ts: Timestamp from caller
+ *
+ * Resets the connection once the timeout matching its state has expired. For
+ * established connections with data not acknowledged by the tap side for more
+ * than ACK_TIMEOUT, sending restarts from the last acknowledged sequence.
+ */
+void tcp_periodic_slow_one(struct ctx *c, int s, struct timespec *ts)
+{
+ switch (tc[s].s) {
+ /* Handshake still in progress: give up after SYN_TIMEOUT */
+ case SOCK_SYN_SENT:
+ case TAP_SYN_SENT:
+ case TAP_SYN_RCVD:
+ if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > SYN_TIMEOUT)
+ tcp_rst(c, s);
+ break;
+ case ESTABLISHED_SOCK_FIN:
+ if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) {
+ tcp_rst(c, s);
+ break;
+ }
+ /* Falls through */
+ case ESTABLISHED:
+ /* Unacknowledged data for too long: rewind and send again */
+ if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap &&
+ timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACK_TIMEOUT) {
+ tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
+ tcp_data_from_sock(c, s);
+ }
+
+ /* No activity on either side for ACT_TIMEOUT: reset */
+ if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACT_TIMEOUT &&
+ timespec_diff_ms(ts, &tc[s].last_ts_sock) > ACT_TIMEOUT)
+ tcp_rst(c, s);
+
+ break;
+ case CLOSE_WAIT:
+ case FIN_WAIT_1:
+ if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT)
+ tcp_rst(c, s);
+ break;
+ case LAST_ACK:
+ if (timespec_diff_ms(ts, &tc[s].last_ts_sock) >
+ LAST_ACK_TIMEOUT)
+ tcp_rst(c, s);
+ break;
+ case CLOSED:
+ break;
+ }
+}
+
+/**
+ * tcp_periodic_slow() - Handle sockets in "slow" event bitmap
+ * @c:		Execution context
+ *
+ * Calls the per-socket "slow" handler for every bit set in the bitmap. Bits
+ * are not cleared here.
+ */
+void tcp_periodic_slow(struct ctx *c)
+{
+	long *words = (long *)tcp_act_slow;
+	struct timespec now;
+	unsigned int w;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+
+	for (w = 0; w < sizeof(tcp_act_slow) / sizeof(long); w++) {
+		/* Snapshot of the word, visited bits are dropped from it */
+		long pending = words[w];
+		int bit;
+
+		for (bit = ffsl(pending); bit; bit = ffsl(pending)) {
+			pending &= ~(1UL << (bit - 1));
+			tcp_periodic_slow_one(c,
+					      w * sizeof(long) * 8 + bit - 1,
+					      &now);
+		}
+	}
+}
diff --git a/tcp.h b/tcp.h
new file mode 100644
index 0000000..1f16790
--- /dev/null
+++ b/tcp.h
@@ -0,0 +1,5 @@
+/* Handle epoll events on a TCP socket */
+void tcp_sock_handler(struct ctx *c, int s, uint32_t events);
+/* Handle a TCP segment coming from the tap, and state transitions */
+void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len);
+/* Create and bind listening sockets: returns 0 on success, -1 on failure */
+int tcp_sock_init(struct ctx *c);
+/* Handle sockets in the "fast" event bitmap */
+void tcp_periodic_fast(struct ctx *c);
+/* Handle sockets in the "slow" event bitmap */
+void tcp_periodic_slow(struct ctx *c);
diff --git a/udp.c b/udp.c
new file mode 100644
index 0000000..74ce843
--- /dev/null
+++ b/udp.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *
+ * udp.c - UDP L2-L4 translation routines
+ *
+ * Copyright (c) 2020-2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ */
+
+/**
+ * DOC: Theory of Operation
+ *
+ *
+ * For UDP, no state machine or any particular tracking is required. Try to
+ * create and bind sets of 2^16 sockets, one for IPv4 and one for IPv6. Binding
+ * will fail on ports that are already bound, or low ports depending on
+ * capabilities.
+ *
+ * Packets are forwarded back and forth, by prepending and stripping UDP headers
+ * in the obvious way, with no port translation.
+ *
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <limits.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <time.h>
+
+#include "passt.h"
+#include "tap.h"
+#include "util.h"
+
+/* Bound sockets by UDP port, indexed by port number in host order, 0 meaning
+ * no socket. One entry for each possible port, 0 to USHRT_MAX included.
+ */
+static int udp4_sock_port[USHRT_MAX + 1];
+static int udp6_sock_port[USHRT_MAX + 1];
+
+/**
+ * udp_sock_handler() - Handle new data from socket
+ * @c:		Execution context
+ * @s:		File descriptor number for socket
+ * @events:	epoll events bitmap, unused
+ *
+ * Receives one datagram, prepends a UDP header built from the sender's port
+ * and the local port of the socket, and sends it to the tap. IPv4 source
+ * addresses are passed on as IPv4-mapped IPv6 addresses.
+ */
+void udp_sock_handler(struct ctx *c, int s, uint32_t events)
+{
+	struct in6_addr a6 = { .s6_addr = {    0,    0,    0,    0,
+					       0,    0,    0,    0,
+					       0,    0, 0xff, 0xff,
+					       0,    0,    0,    0 } };
+	struct sockaddr_storage sr, sl;
+	socklen_t slen = sizeof(sr);
+	char buf[USHRT_MAX];
+	struct udphdr *uh;
+	int n;
+
+	(void)events;
+
+	/* Leave room for the UDP header in front of the payload */
+	n = recvfrom(s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh),
+		     MSG_DONTWAIT, (struct sockaddr *)&sr, &slen);
+	if (n < 0)
+		return;
+
+	uh = (struct udphdr *)buf;
+
+	/* recvfrom() replaced slen with the size of the peer address: give
+	 * getsockname() the full size of its own buffer again.
+	 */
+	slen = sizeof(sl);
+	if (getsockname(s, (struct sockaddr *)&sl, &slen))
+		return;
+
+	/* Don't pass uninitialised stack bytes as checksum.
+	 * NOTE(review): tap_ip_send() isn't visible here, confirm whether it
+	 * fills in the UDP checksum.
+	 */
+	uh->check = 0;
+
+	if (sl.ss_family == AF_INET) {
+		struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr;
+		struct sockaddr_in *sl4 = (struct sockaddr_in *)&sl;
+
+		memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr));
+		uh->source = sr4->sin_port;
+		uh->dest = sl4->sin_port;
+		uh->len = htons(n + sizeof(*uh));
+
+		tap_ip_send(c, &a6, IPPROTO_UDP, buf, n + sizeof(*uh));
+	} else if (sl.ss_family == AF_INET6) {
+		struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr;
+		struct sockaddr_in6 *sl6 = (struct sockaddr_in6 *)&sl;
+
+		uh->source = sr6->sin6_port;
+		uh->dest = sl6->sin6_port;
+		uh->len = htons(n + sizeof(*uh));
+
+		tap_ip_send(c, &sr6->sin6_addr, IPPROTO_UDP,
+			    buf, n + sizeof(*uh));
+	}
+}
+
+/**
+ * udp_tap_handler() - Handle packets from tap
+ * @c:		Execution context, unused
+ * @af:		Address family, AF_INET or AF_INET6
+ * @addr:	Address the payload is sent to, struct in_addr or in6_addr
+ * @in:		Input buffer
+ * @len:	Length, including UDP header
+ *
+ * The payload is sent from the socket bound to the source port of the packet,
+ * if any, to @addr and the destination port of the packet.
+ */
+void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
+{
+	struct udphdr *uh = (struct udphdr *)in;
+	unsigned int src;
+	int s;
+
+	(void)c;
+
+	/* Length comes from the guest: without a full header, the payload
+	 * length below would wrap around.
+	 */
+	if (len < sizeof(*uh))
+		return;
+
+	src = ntohs(uh->source);
+
+	if (af == AF_INET) {
+		struct sockaddr_in sa = {
+			.sin_family = AF_INET,
+			.sin_port = uh->dest,
+		};
+
+		if (src >= sizeof(udp4_sock_port) / sizeof(udp4_sock_port[0]))
+			return;
+
+		if (!(s = udp4_sock_port[src]))
+			return;
+
+		fprintf(stderr, "udp from tap: using socket %i\n", s);
+
+		sa.sin_addr = *(struct in_addr *)addr;
+
+		sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT,
+		       (struct sockaddr *)&sa, sizeof(sa));
+	} else if (af == AF_INET6) {
+		struct sockaddr_in6 sa = {
+			.sin6_family = AF_INET6,
+			.sin6_port = uh->dest,
+			.sin6_addr = *(struct in6_addr *)addr,
+		};
+
+		if (src >= sizeof(udp6_sock_port) / sizeof(udp6_sock_port[0]))
+			return;
+
+		if (!(s = udp6_sock_port[src]))
+			return;
+
+		fprintf(stderr, "udp from tap: using socket %i\n", s);
+
+		sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT,
+		       (struct sockaddr *)&sa, sizeof(sa));
+	}
+}
+
+/**
+ * udp_sock_init() - Create and bind listening sockets for inbound connections
+ * @c:		Execution context
+ *
+ * For each port and each enabled IP version, the socket returned by
+ * sock_l4_add() is stored in the table for that IP version. Tables of disabled
+ * IP versions are left untouched.
+ *
+ * Return: 0 on success, -1 on failure
+ */
+int udp_sock_init(struct ctx *c)
+{
+	const unsigned int n4 = sizeof(udp4_sock_port) /
+				sizeof(udp4_sock_port[0]);
+	const unsigned int n6 = sizeof(udp6_sock_port) /
+				sizeof(udp6_sock_port[0]);
+	unsigned int port;
+	int s;
+
+	/* Wider than in_port_t on purpose, so that the loop can end after the
+	 * highest port. Bounded by table sizes, whatever they are.
+	 */
+	for (port = 0; port <= USHRT_MAX && port < n4 && port < n6; port++) {
+		if (c->v4) {
+			s = sock_l4_add(c, 4, IPPROTO_UDP, htons(port));
+			if (s < 0)
+				return -1;
+
+			udp4_sock_port[port] = s;
+		}
+
+		if (c->v6) {
+			s = sock_l4_add(c, 6, IPPROTO_UDP, htons(port));
+			if (s < 0)
+				return -1;
+
+			udp6_sock_port[port] = s;
+		}
+	}
+
+	return 0;
+}
diff --git a/udp.h b/udp.h
new file mode 100644
index 0000000..b18aa7e
--- /dev/null
+++ b/udp.h
@@ -0,0 +1,3 @@
+/* Handle new data from a UDP socket, forward it to the tap */
+void udp_sock_handler(struct ctx *c, int s, uint32_t events);
+/* Handle a UDP packet coming from the tap, send payload from bound socket */
+void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len);
+/* Create and bind UDP sockets: returns 0 on success, -1 on failure */
+int udp_sock_init(struct ctx *c);
diff --git a/util.c b/util.c
index 7dd0db1..324f800 100644
--- a/util.c
+++ b/util.c
@@ -1,17 +1,28 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
/* PASST - Plug A Simple Socket Transport
*
* util.c - Convenience helpers
*
+ * Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- * License: GPLv2
*
*/
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
+#include <unistd.h>
#include <linux/ipv6.h>
#include <arpa/inet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <sys/epoll.h>
+
+#include "passt.h"
/**
* csum_fold() - Fold long sum for IP and TCP checksum
@@ -50,7 +61,45 @@ uint16_t csum_ip4(void *buf, size_t len)
return ~csum_fold(sum);
}
-unsigned char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto)
+/**
+ * csum_tcp4() - Calculate TCP checksum for IPv4 and set in place
+ * @iph:	Packet buffer, IP header
+ *
+ * The TCP header is expected right after the IP header, at iph->ihl * 4 bytes,
+ * and the TCP length is derived from the total length in the IP header.
+ */
+void csum_tcp4(struct iphdr *iph)
+{
+	struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
+	uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th;
+	uint32_t sum = 0;
+
+	/* Pseudo-header: addresses, protocol and TCP length. Everything is
+	 * summed as it appears in memory, that is, in network order.
+	 */
+	sum += (iph->saddr >> 16) & 0xffff;
+	sum += iph->saddr & 0xffff;
+	sum += (iph->daddr >> 16) & 0xffff;
+	sum += iph->daddr & 0xffff;
+
+	sum += htons(IPPROTO_TCP);
+	sum += htons(tlen);
+
+	/* Checksum field counts as zero while summing */
+	th->check = 0;
+	while (tlen > 1) {
+		sum += *p++;
+		tlen -= 2;
+	}
+
+	if (tlen > 0) {
+		/* Odd trailing byte, padded with zero: read that single byte
+		 * only, a 16-bit load would reach one byte past the segment.
+		 */
+		sum += htons((uint16_t)(*(uint8_t *)p << 8));
+	}
+
+	th->check = (uint16_t)~csum_fold(sum);
+}
+
+/**
+ * ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol
+ * @ip6h: IPv6 header
+ * @proto: Filled with L4 protocol number
+ *
+ * Return: pointer to L4 header, NULL if not found
+ */
+char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto)
{
int offset, len, hdrlen;
struct ipv6_opt_hdr *o;
@@ -79,9 +128,95 @@ unsigned char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto)
offset;
} else {
*proto = nh;
- return (unsigned char *)(ip6h + 1) + offset;
+ return (char *)(ip6h + 1) + offset;
}
}
return NULL;
}
+
+/**
+ * sock_l4_add() - Create and bind socket for given L4, add to epoll list
+ * @c:		Execution context
+ * @v:		IP protocol, 4 or 6
+ * @proto:	Protocol number, IPPROTO_TCP or IPPROTO_UDP
+ * @port:	Port, network order
+ *
+ * The socket is bound to the wildcard address. TCP sockets are also put in
+ * listening state.
+ *
+ * Return: newly created socket, 0 if the port couldn't be bound (socket is
+ *	   closed, not considered an error), -1 on error
+ */
+int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port)
+{
+	struct sockaddr_in addr4 = {
+		.sin_family = AF_INET,
+		.sin_port = port,
+		.sin_addr = { .s_addr = INADDR_ANY },
+	};
+	struct sockaddr_in6 addr6 = {
+		.sin6_family = AF_INET6,
+		.sin6_port = port,
+		.sin6_addr = IN6ADDR_ANY_INIT,
+	};
+	struct epoll_event ev = { 0 };
+	const struct sockaddr *sa;
+	socklen_t sl;
+	int fd;
+
+	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
+		return -1;	/* Not implemented. */
+
+	fd = socket(v == 4 ? AF_INET : AF_INET6,
+		    proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM, proto);
+	if (fd < 0) {
+		perror("L4 socket");
+		return -1;
+	}
+
+	if (v == 4) {
+		sa = (const struct sockaddr *)&addr4;
+		sl = sizeof(addr4);
+	} else {
+		sa = (const struct sockaddr *)&addr6;
+		sl = sizeof(addr6);
+	}
+
+	if (bind(fd, sa, sl) < 0) {
+		/* We'll fail to bind to low ports if we don't have enough
+		 * capabilities, and we'll fail to bind on already bound ports,
+		 * this is fine.
+		 */
+		close(fd);
+		return 0;
+	}
+
+	if (proto == IPPROTO_TCP && listen(fd, 128) < 0) {
+		perror("TCP socket listen");
+		close(fd);
+		return -1;
+	}
+
+	ev.events = EPOLLIN;
+	ev.data.fd = fd;
+	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
+		perror("L4 epoll_ctl");
+		/* Nobody else knows about this socket: don't leak it */
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/**
+ * timespec_diff_ms() - Report difference in milliseconds between two timestamps
+ * @a:		Minuend timestamp
+ * @b:		Subtrahend timestamp
+ *
+ * Return: difference in milliseconds, @a minus @b, truncated towards zero for
+ *	   the sub-millisecond part
+ */
+int timespec_diff_ms(struct timespec *a, struct timespec *b)
+{
+	long long sec = (long long)a->tv_sec - b->tv_sec;
+	long nsec = a->tv_nsec - b->tv_nsec;
+
+	if (nsec < 0) {
+		/* Borrow one second to keep the nanoseconds part positive */
+		nsec += 1000000000L;
+		sec--;
+	}
+
+	/* 10^6 nanoseconds per millisecond, 10^3 milliseconds per second */
+	return (int)(sec * 1000 + nsec / 1000000);
+}
diff --git a/util.h b/util.h
index 8298d22..77e8d8c 100644
--- a/util.h
+++ b/util.h
@@ -1,3 +1,6 @@
uint16_t csum_fold(uint32_t sum);
uint16_t csum_ip4(void *buf, size_t len);
-unsigned char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto);
+/* Calculate TCP checksum for IPv4 and set it in place */
+void csum_tcp4(struct iphdr *iph);
+/* Find L4 header in IPv6 packet, fill in protocol: returns NULL if not found */
+char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto);
+/* Create and bind L4 socket, add it to epoll list: returns -1 on error */
+int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port);
+/* Difference between two timestamps, @a minus @b, in milliseconds */
+int timespec_diff_ms(struct timespec *a, struct timespec *b);