diff options
-rw-r--r-- | Makefile | 4 | ||||
-rw-r--r-- | arp.c | 11 | ||||
-rw-r--r-- | dhcp.c | 7 | ||||
-rwxr-xr-x | doc/demo.sh | 69 | ||||
-rw-r--r-- | ndp.c | 7 | ||||
-rw-r--r-- | passt.c | 695 | ||||
-rw-r--r-- | passt.h | 52 | ||||
-rw-r--r-- | qrap.c | 6 | ||||
-rw-r--r-- | tap.c | 136 | ||||
-rw-r--r-- | tap.h | 3 | ||||
-rw-r--r-- | tcp.c | 1367 | ||||
-rw-r--r-- | tcp.h | 5 | ||||
-rw-r--r-- | udp.c | 174 | ||||
-rw-r--r-- | udp.h | 3 | ||||
-rw-r--r-- | util.c | 141 | ||||
-rw-r--r-- | util.h | 5 |
16 files changed, 2061 insertions, 624 deletions
@@ -2,8 +2,8 @@ CFLAGS += -Wall -Wextra -pedantic all: passt qrap -passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h ndp.c ndp.h util.c util.h - $(CC) $(CFLAGS) passt.c arp.c dhcp.c ndp.c util.c -o passt +passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h ndp.c ndp.h tap.c tap.h tcp.c tcp.h udp.c udp.h util.c util.h + $(CC) $(CFLAGS) passt.c arp.c dhcp.c ndp.c tap.c tcp.c udp.c util.c -o passt qrap: qrap.c passt.h $(CC) $(CFLAGS) qrap.c -o qrap @@ -1,9 +1,11 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + /* PASST - Plug A Simple Socket Transport * * arp.c - ARP implementation * + * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio <sbrivio@redhat.com> - * License: GPLv2 * */ @@ -23,6 +25,7 @@ #include "passt.h" #include "dhcp.h" #include "util.h" +#include "tap.h" /** * struct arpmsg - 802.2 ARP IPv4 payload @@ -39,7 +42,7 @@ struct arpmsg { } __attribute__((__packed__)); /** - * dhcp() - Check if this is an ARP message, reply as needed + * arp() - Check if this is an ARP message, reply as needed * @c: Execution context * @len: Total L2 packet length * @eh: Packet buffer, Ethernet header @@ -74,9 +77,11 @@ int arp(struct ctx *c, unsigned len, struct ethhdr *eh) len = sizeof(*eh) + sizeof(*ah) + sizeof(*am); memcpy(eh->h_dest, eh->h_source, ETH_ALEN); + /* HACK */ + memcpy(c->mac_guest, eh->h_source, ETH_ALEN); memcpy(eh->h_source, c->mac, ETH_ALEN); - if (send(c->fd_unix, eh, len, 0) < 0) + if (tap_send(c->fd_unix, eh, len, 0) < 0) perror("ARP: send"); return 1; @@ -1,9 +1,11 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + /* PASST - Plug A Simple Socket Transport * * dhcp.c - Minimalistic DHCP server for PASST * + * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio <sbrivio@redhat.com> - * License: GPLv2 * */ @@ -22,6 +24,7 @@ #include "passt.h" #include "dhcp.h" #include "util.h" +#include "tap.h" /** * struct opt - DHCP option @@ -212,7 +215,7 @@ int dhcp(struct ctx *c, unsigned len, struct ethhdr *eh) memcpy(eh->h_dest, 
eh->h_source, ETH_ALEN); memcpy(eh->h_source, c->mac, ETH_ALEN); - if (send(c->fd_unix, eh, len, 0) < 0) + if (tap_send(c->fd_unix, eh, len, 0) < 0) perror("DHCP: send"); return 1; diff --git a/doc/demo.sh b/doc/demo.sh new file mode 100755 index 0000000..3d20491 --- /dev/null +++ b/doc/demo.sh @@ -0,0 +1,69 @@ +#!/bin/sh -e +# +# SPDX-License-Identifier: AGPL-3.0-or-later +# +# PASST - Plug A Simple Socket Transport +# +# demo.sh - Set up namespaces, addresses and routes to show PASST functionality +# +# Copyright (c) 2020-2021 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +get_token() { + IFS=' ' + __next=0 + for __token in ${@}; do + [ ${__next} -eq 2 ] && echo "${__token}" && return + [ "${__token}" = "${1}" ] && __next=$((__next + 1)) + done + unset IFS +} + +ipv6_dev() { get_token "dev" $(ip -o -6 ro show default); } +ipv6_devaddr() { get_token "inet6" $(ip -o -6 ad sh dev "${1}" scope global); } +ipv6_ll_addr() { get_token "inet6" $(ip -o -6 ad sh dev "${1}" scope link); } +ipv6_mask() { echo ${1#*/}; } +ipv6_mangle() { + IFS=':' + __c=0 + for __16b in ${1%%/*}; do + if [ ${__c} -lt 7 ]; then + printf "${__16b}:" + else + printf "abcd\n" && break + fi + __c=$((__c + 1)) + done + unset IFS +} + +ndp_setup() { + sysctl -w net.ipv6.conf.all.proxy_ndp=1 + ip -6 neigh add proxy "${1}" dev "$(ipv6_dev)" +} + +ip netns del passt 2>/dev/null || : +ip link del veth_passt 2>/dev/null || : +ip netns add passt +ip link add veth_passt up netns passt type veth peer name veth_passt +ip link set dev veth_passt up + +ip -n passt addr add 192.0.2.2/24 dev veth_passt +ip addr add 192.0.2.1/24 dev veth_passt +ip -n passt route add default via 192.0.2.1 + +ipv6_addr="$(ipv6_devaddr "$(ipv6_dev)")" +ipv6_passt="$(ipv6_mangle "${ipv6_addr}")" +ndp_setup "${ipv6_passt}" +ip -n passt addr add "${ipv6_passt}/$(ipv6_mask "${ipv6_addr}")" dev veth_passt +ip addr add "${ipv6_addr}" dev veth_passt +passt_ll="$(ipv6_ll_addr "veth_passt")" +main_ll="$(get_token "link/ether" 
$(ip -o li sh veth_passt))" +ip -n passt neigh add "${passt_ll%%/*}" dev veth_passt lladdr "${main_ll}" +ip -n passt route add default via "${passt_ll%%/*}" dev veth_passt + +ethtool -K veth_passt tx off +ip netns exec passt ethtool -K veth_passt tx off +ulimit -n 300000 + +ip netns exec passt ./passt @@ -1,9 +1,11 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + /* PASST - Plug A Simple Socket Transport * * ndp.c - NDP support for PASST * + * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio <sbrivio@redhat.com> - * License: GPLv2 * */ @@ -23,6 +25,7 @@ #include "passt.h" #include "util.h" +#include "tap.h" #define RS 133 #define RA 134 @@ -126,7 +129,7 @@ int ndp(struct ctx *c, unsigned len, struct ethhdr *eh) memcpy(ehr->h_source, c->mac, ETH_ALEN); ehr->h_proto = htons(ETH_P_IPV6); - if (send(c->fd_unix, ehr, len, 0) < 0) + if (tap_send(c->fd_unix, ehr, len, 0) < 0) perror("NDP: send"); return 1; @@ -1,27 +1,16 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + /* PASST - Plug A Simple Socket Transport * * passt.c - Daemon implementation * + * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio <sbrivio@redhat.com> - * License: GPLv2 - * - * Grab Ethernet frames via AF_UNIX socket, build AF_INET/AF_INET6 sockets for - * each 5-tuple from ICMP, TCP, UDP packets, perform connection tracking and - * forward them with destination address NAT. Forward packets received on - * sockets back to the UNIX domain socket (typically, a tap file descriptor from - * qemu). * - * TODO: - * - steal packets from AF_INET/AF_INET6 sockets (using eBPF/XDP, or a new - * socket option): currently, incoming packets are also handled by in-kernel - * protocol handlers, so every incoming untracked TCP packet gets a RST. 
- * Workaround: - * iptables -A OUTPUT -m state --state INVALID,NEW,ESTABLISHED \ - * -p tcp --tcp-flags RST RST -j DROP - * ip6tables -A OUTPUT -m state --state INVALID,NEW,ESTABLISHED \ - * -p tcp --tcp-flags RST RST -j DROP - * - and use XDP sockmap on top of that to improve performance - * - aging and timeout/RST bookkeeping for connection tracking entries + * Grab Ethernet frames via AF_UNIX socket, build SOCK_DGRAM/SOCK_STREAM sockets + * for each 5-tuple from TCP, UDP packets, perform connection tracking and + * forward them. Forward packets received on sockets back to the UNIX domain + * socket (typically, a socket virtio_net file descriptor from qemu). */ #include <stdio.h> @@ -50,14 +39,21 @@ #include <linux/ip.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> +#include <time.h> #include "passt.h" #include "arp.h" #include "dhcp.h" #include "ndp.h" #include "util.h" +#include "tcp.h" +#include "udp.h" -#define EPOLL_EVENTS 10 +#define EPOLL_EVENTS 10 + +#define EPOLL_TIMEOUT 100 /* ms, for protocol periodic handlers */ +#define PERIODIC_HANDLER_FAST 100 +#define PERIODIC_HANDLER_SLOW 1000 /** * sock_unix() - Create and bind AF_UNIX socket, add to epoll list @@ -298,376 +294,42 @@ static void get_dns(struct ctx *c) } /** - * sock_l4() - Create and bind socket for given L4, add to epoll list - * @c: Execution context - * @v: IP protocol, 4 or 6 - * @proto: Protocol number, network order - * @port: L4 port, network order - * - * Return: newly created socket, -1 on error - */ -static int sock_l4(struct ctx *c, int v, uint16_t proto, uint16_t port) -{ - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = port, - .sin_addr = { .s_addr = c->addr4 }, - }; - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = port, - .sin6_addr = c->addr6, - }; - struct epoll_event ev = { 0 }; - const struct sockaddr *sa; - int fd, sl; - - fd = socket(v == 4 ? 
AF_INET : AF_INET6, SOCK_RAW, proto); - if (fd < 0) { - perror("L4 socket"); - return -1; - } - - if (v == 4) { - sa = (const struct sockaddr *)&addr4; - sl = sizeof(addr4); - } else { - sa = (const struct sockaddr *)&addr6; - sl = sizeof(addr6); - } - - if (bind(fd, sa, sl) < 0) { - perror("L4 bind"); - close(fd); - return -1; - } - - ev.events = EPOLLIN; - ev.data.fd = fd; - if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) { - perror("L4 epoll_ctl"); - return -1; - } - - return fd; -} - -/** - * lookup4() - Look up entry from tap-sourced IPv4 packet, create if missing - * @c: Execution context - * @eh: Packet buffer, Ethernet header - * - * Return: -1 for unsupported or too many sockets, matching socket otherwise - */ -static int lookup4(struct ctx *c, const struct ethhdr *eh) -{ - struct iphdr *iph = (struct iphdr *)(eh + 1); - struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4); - char buf_s[BUFSIZ], buf_d[BUFSIZ]; - struct ct4 *ct = c->map4; - int i, one_icmp_fd = 0; - - if (iph->protocol != IPPROTO_ICMP && iph->protocol != IPPROTO_TCP && - iph->protocol != IPPROTO_UDP) - return -1; - - for (i = 0; i < CT_SIZE; i++) { - if (ct[i].p == iph->protocol && ct[i].sa == iph->saddr && - ((ct[i].p == IPPROTO_ICMP && ct[i].da == iph->daddr) - || ct[i].sp == th->source) && - !memcmp(ct[i].hd, eh->h_dest, ETH_ALEN) && - !memcmp(ct[i].hs, eh->h_source, ETH_ALEN)) { - if (iph->protocol != IPPROTO_ICMP) { - ct[i].da = iph->daddr; - ct[i].dp = th->dest; - } - return ct[i].fd; - } - } - - for (i = 0; i < CT_SIZE && ct[i].p; i++) { - if (iph->protocol == IPPROTO_ICMP && ct[i].p == IPPROTO_ICMP) - one_icmp_fd = ct[i].fd; - } - - if (i == CT_SIZE) { - fprintf(stderr, "\nToo many sockets, aborting "); - } else { - if (iph->protocol == IPPROTO_ICMP) { - if (one_icmp_fd) - ct[i].fd = one_icmp_fd; - else - ct[i].fd = sock_l4(c, 4, iph->protocol, 0); - } else { - ct[i].fd = sock_l4(c, 4, iph->protocol, th->source); - } - - fprintf(stderr, "\n(socket %i) New ", 
ct[i].fd); - ct[i].p = iph->protocol; - ct[i].sa = iph->saddr; - ct[i].da = iph->daddr; - if (iph->protocol != IPPROTO_ICMP) { - ct[i].sp = th->source; - ct[i].dp = th->dest; - } - memcpy(&ct[i].hd, eh->h_dest, ETH_ALEN); - memcpy(&ct[i].hs, eh->h_source, ETH_ALEN); - } - - if (iph->protocol == IPPROTO_ICMP) { - fprintf(stderr, "icmp connection\n\tfrom %s to %s\n\n", - inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), - inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d))); - } else { - fprintf(stderr, "%s connection\n\tfrom %s:%i to %s:%i\n\n", - getprotobynumber(iph->protocol)->p_name, - inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), - ntohs(th->source), - inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)), - ntohs(th->dest)); - } - - return (i == CT_SIZE) ? -1 : ct[i].fd; -} - -/** - * lookup6() - Look up entry from tap-sourced IPv6 packet, create if missing - * @c: Execution context - * @eh: Packet buffer, Ethernet header - * - * Return: -1 for unsupported or too many sockets, matching socket otherwise - */ -static int lookup6(struct ctx *c, const struct ethhdr *eh) -{ - struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); - char buf_s[BUFSIZ], buf_d[BUFSIZ]; - struct ct6 *ct = c->map6; - int i, one_icmp_fd = 0; - struct tcphdr *th; - uint8_t proto; - - th = (struct tcphdr *)ipv6_l4hdr(ip6h, &proto); - if (!th) - return -1; - - if (proto != IPPROTO_ICMPV6 && proto != IPPROTO_TCP && - proto != IPPROTO_UDP) - return -1; - - for (i = 0; i < CT_SIZE; i++) { - if (ct[i].p != proto) - continue; - - if (memcmp(ct[i].hd, eh->h_dest, ETH_ALEN) || - memcmp(ct[i].hs, eh->h_source, ETH_ALEN) || - memcmp(&ct[i].sa, &ip6h->saddr, sizeof(ct[i].sa))) - continue; - - if (ct[i].p != IPPROTO_ICMPV6 && - ct[i].sp != th->source) - continue; - - if (ct[i].p == IPPROTO_ICMPV6 && - memcmp(&ct[i].da, &ip6h->daddr, sizeof(ct[i].da))) - continue; - - if (ct[i].p != IPPROTO_ICMPV6) { - memcpy(&ct[i].da, &ip6h->daddr, sizeof(ct[i].da)); - ct[i].dp = th->dest; - } - - return 
ct[i].fd; - } - - for (i = 0; i < CT_SIZE && ct[i].p; i++) { - if (proto == IPPROTO_ICMPV6 && ct[i].p == IPPROTO_ICMPV6) - one_icmp_fd = ct[i].fd; - } - - if (i == CT_SIZE) { - fprintf(stderr, "\nToo many sockets, aborting "); - } else { - if (proto == IPPROTO_ICMPV6) { - if (one_icmp_fd) - ct[i].fd = one_icmp_fd; - else - ct[i].fd = sock_l4(c, 6, proto, 0); - } else { - ct[i].fd = sock_l4(c, 6, proto, th->source); - } - - fprintf(stderr, "\n(socket %i) New ", ct[i].fd); - ct[i].p = proto; - memcpy(&ct[i].sa, &ip6h->saddr, sizeof(ct[i].sa)); - memcpy(&ct[i].da, &ip6h->daddr, sizeof(ct[i].da)); - if (ct[i].p != IPPROTO_ICMPV6) { - ct[i].sp = th->source; - ct[i].dp = th->dest; - } - memcpy(&ct[i].hd, eh->h_dest, ETH_ALEN); - memcpy(&ct[i].hs, eh->h_source, ETH_ALEN); - } - - if (proto == IPPROTO_ICMPV6) { - fprintf(stderr, "icmpv6 connection\n\tfrom %s\n" - "\tto %s\n\n", - inet_ntop(AF_INET6, &ct[i].sa, buf_s, sizeof(buf_s)), - inet_ntop(AF_INET6, &ct[i].da, buf_d, sizeof(buf_d))); - } else { - fprintf(stderr, "%s connection\n\tfrom [%s]:%i\n" - "\tto [%s]:%i\n\n", - getprotobynumber(proto)->p_name, - inet_ntop(AF_INET6, &ct[i].sa, buf_s, sizeof(buf_s)), - ntohs(th->source), - inet_ntop(AF_INET6, &ct[i].da, buf_d, sizeof(buf_d)), - ntohs(th->dest)); - } - - return (i == CT_SIZE) ? 
-1 : ct[i].fd; -} - -/** - * lookup_r4() - Reverse look up connection tracking entry for IPv4 packet - * @ct: Connection tracking table - * @fd: File descriptor that received the packet - * @iph: Packet buffer, IP header - * - * Return: matching entry if any, NULL otherwise - */ -struct ct4 *lookup_r4(struct ct4 *ct, int fd, struct iphdr *iph) -{ - struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4); - int i; - - for (i = 0; i < CT_SIZE; i++) { - if (ct[i].fd == fd && - iph->protocol == ct[i].p && - iph->saddr == ct[i].da && - (iph->protocol == IPPROTO_ICMP || - (th->source == ct[i].dp && th->dest == ct[i].sp))) - return &ct[i]; - } - - return NULL; -} - -/** - * lookup_r6() - Reverse look up connection tracking entry for IPv6 packet - * @ct: Connection tracking table - * @fd: File descriptor that received the packet - * - * Return: matching entry if any, NULL otherwise - */ -struct ct6 *lookup_r6(struct ct6 *ct, int fd, struct tcphdr *th) -{ - int i; - - for (i = 0; i < CT_SIZE; i++) { - if (ct[i].fd != fd) - continue; - - if (ct[i].p == IPPROTO_ICMPV6 || - (ct[i].dp == th->source && ct[i].sp == th->dest)) - return &ct[i]; - } - - return NULL; -} - -/** - * nat4_in() - Perform incoming IPv4 address translation - * @addr: Original destination address to be used - * @iph: IP header - */ -static void nat_in(unsigned long addr, struct iphdr *iph) -{ - iph->daddr = addr; -} - -/** - * csum_ipv4() - Calculate TCP checksum for IPv4 and set in place - * @iph: Packet buffer, IP header - */ -static void csum_tcp4(struct iphdr *iph) -{ - struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4); - uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th; - uint32_t sum = 0; - - sum += (iph->saddr >> 16) & 0xffff; - sum += iph->saddr & 0xffff; - sum += (iph->daddr >> 16) & 0xffff; - sum += iph->daddr & 0xffff; - - sum += htons(IPPROTO_TCP); - sum += htons(tlen); - - th->check = 0; - while (tlen > 1) { - sum += *p++; - tlen -= 2; - } - - if 
(tlen > 0) { - sum += *p & htons(0xff00); - } - - th->check = (uint16_t)~csum_fold(sum); -} - -/** * tap4_handler() - IPv4 packet handler for tap file descriptor * @c: Execution context * @len: Total L2 packet length * @in: Packet buffer, L2 headers */ -static void tap4_handler(struct ctx *c, int len, char *in) +static void tap4_handler(struct ctx *c, char *in, size_t len) { struct ethhdr *eh = (struct ethhdr *)in; struct iphdr *iph = (struct iphdr *)(eh + 1); - struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4); - struct udphdr *uh = (struct udphdr *)th; - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_port = th->dest, - .sin_addr = { .s_addr = iph->daddr }, - }; + char *l4h = (char *)iph + iph->ihl * 4; char buf_s[BUFSIZ], buf_d[BUFSIZ]; - int fd; if (arp(c, len, eh) || dhcp(c, len, eh)) return; - fd = lookup4(c, eh); - if (fd == -1) - return; - if (iph->protocol == IPPROTO_ICMP) { - fprintf(stderr, "icmp from tap: %s -> %s (socket %i)\n", + fprintf(stderr, "icmp from tap: %s -> %s\n", inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), - inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)), - fd); + inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d))); } else { - fprintf(stderr, "%s from tap: %s:%i -> %s:%i (socket %i)\n", + struct tcphdr *th = (struct tcphdr *)l4h; + + fprintf(stderr, "%s from tap: %s:%i -> %s:%i\n", getprotobynumber(iph->protocol)->p_name, inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), ntohs(th->source), inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)), - ntohs(th->dest), - fd); + ntohs(th->dest)); } + len -= (intptr_t)l4h - (intptr_t)eh; + if (iph->protocol == IPPROTO_TCP) - csum_tcp4(iph); + tcp_tap_handler(c, AF_INET, &iph->daddr, l4h, len); else if (iph->protocol == IPPROTO_UDP) - uh->check = 0; - else if (iph->protocol != IPPROTO_ICMP) - return; - - if (sendto(fd, (void *)th, len - sizeof(*eh) - iph->ihl * 4, 0, - (struct sockaddr *)&addr, sizeof(addr)) < 0) - perror("sendto"); - + 
udp_tap_handler(c, AF_INET, &iph->daddr, l4h, len); } /** @@ -676,228 +338,122 @@ static void tap4_handler(struct ctx *c, int len, char *in) * @len: Total L2 packet length * @in: Packet buffer, L2 headers */ -static void tap6_handler(struct ctx *c, int len, char *in) +static void tap6_handler(struct ctx *c, char *in, size_t len) { struct ethhdr *eh = (struct ethhdr *)in; struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); - struct tcphdr *th; - struct udphdr *uh; - struct icmp6hdr *ih; - struct sockaddr_in6 addr = { - .sin6_family = AF_INET6, - .sin6_addr = ip6h->daddr, - }; char buf_s[BUFSIZ], buf_d[BUFSIZ]; uint8_t proto; - int fd; + char *l4h; if (ndp(c, len, eh)) return; - fd = lookup6(c, eh); - if (fd == -1) - return; + l4h = ipv6_l4hdr(ip6h, &proto); + + /* TODO: Assign MAC address to guest so that, together with prefix + * assigned via NDP, address matches the one on the host. Then drop + * address change and checksum recomputation. + */ + c->addr6_guest = ip6h->saddr; + ip6h->saddr = c->addr6; + if (proto == IPPROTO_TCP) { + struct tcphdr *th = (struct tcphdr *)(ip6h + 1); + + th->check = 0; + th->check = csum_ip4(ip6h, len + sizeof(*ip6h)); + } else if (proto == IPPROTO_UDP) { + struct udphdr *uh = (struct udphdr *)(ip6h + 1); - th = (struct tcphdr *)ipv6_l4hdr(ip6h, &proto); - uh = (struct udphdr *)th; - ih = (struct icmp6hdr *)th; + uh->check = 0; + uh->check = csum_ip4(ip6h, len + sizeof(*ip6h)); + } else if (proto == IPPROTO_ICMPV6) { + struct icmp6hdr *ih = (struct icmp6hdr *)(ip6h + 1); + + ih->icmp6_cksum = 0; + ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h)); + } if (proto == IPPROTO_ICMPV6) { - fprintf(stderr, "icmpv6 from tap: %s ->\n\t%s (socket %i)\n", + fprintf(stderr, "icmpv6 from tap: %s ->\n\t%s\n", inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)), - inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)), - fd); + inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)) + ); } else { + struct tcphdr *th = (struct tcphdr *)l4h; + 
fprintf(stderr, "%s from tap: [%s]:%i\n" - "\t-> [%s]:%i (socket %i)\n", + "\t-> [%s]:%i\n", getprotobynumber(proto)->p_name, inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)), ntohs(th->source), inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)), - ntohs(th->dest), - fd); - } - - if (proto != IPPROTO_TCP && proto != IPPROTO_UDP && - proto != IPPROTO_ICMPV6) - return; - - ip6h->saddr = c->addr6; - - ip6h->hop_limit = proto; - ip6h->version = 0; - ip6h->nexthdr = 0; - memset(ip6h->flow_lbl, 0, 3); - - if (proto == IPPROTO_TCP) { - th->check = 0; - th->check = csum_ip4(ip6h, - len - ((intptr_t)th - (intptr_t)eh) + - sizeof(*ip6h)); - } else if (proto == IPPROTO_UDP) { - uh->check = 0; - uh->check = csum_ip4(ip6h, - len - ((intptr_t)uh - (intptr_t)eh) + - sizeof(*ip6h)); - } else if (proto == IPPROTO_ICMPV6) { - ih->icmp6_cksum = 0; - ih->icmp6_cksum = csum_ip4(ip6h, - len - ((intptr_t)ih - (intptr_t)eh) + - sizeof(*ip6h)); + ntohs(th->dest)); } - ip6h->version = 6; - ip6h->nexthdr = proto; - ip6h->hop_limit = 255; - - if (sendto(fd, (void *)th, len - ((intptr_t)th - (intptr_t)eh), 0, - (struct sockaddr *)&addr, sizeof(addr)) < 0) - perror("sendto"); + len -= (intptr_t)l4h - (intptr_t)eh; + if (proto == IPPROTO_TCP) + tcp_tap_handler(c, AF_INET6, &ip6h->daddr, l4h, len); + else if (proto == IPPROTO_UDP) + udp_tap_handler(c, AF_INET6, &ip6h->daddr, l4h, len); } -static void tap_handler(struct ctx *c, int len, char *in) +/** + * tap_handler() - IPv4/IPv6/ARP packet handler for tap file descriptor + * @c: Execution context + * @len: Total L2 packet length + * @in: Packet buffer, L2 headers + */ +static void tap_handler(struct ctx *c, char *in, size_t len) { struct ethhdr *eh = (struct ethhdr *)in; if (eh->h_proto == ntohs(ETH_P_IP) || eh->h_proto == ntohs(ETH_P_ARP)) - tap4_handler(c, len, in); + tap4_handler(c, in, len); else if (eh->h_proto == ntohs(ETH_P_IPV6)) - tap6_handler(c, len, in); + tap6_handler(c, in, len); } /** - * ext4_handler() - IPv4 packet 
handler for external routable interface + * sock_handler() - Event handler for L4 sockets * @c: Execution context - * @fd: File descriptor that received the packet - * @len: Total L3 packet length - * @in: Packet buffer, L3 headers + * @fd: File descriptor associated to event + * @events epoll events */ -static void ext4_handler(struct ctx *c, int fd, int len, char *in) +static void sock_handler(struct ctx *c, int fd, uint32_t events) { - struct iphdr *iph = (struct iphdr *)in; - struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4); - struct udphdr *uh = (struct udphdr *)th; - char buf_s[BUFSIZ], buf_d[BUFSIZ], buf[ETH_MAX_MTU]; - struct ethhdr *eh = (struct ethhdr *)buf; - struct ct4 *entry; - - entry = lookup_r4(c->map4, fd, iph); - if (!entry) - return; + socklen_t sl; + int so; - nat_in(entry->sa, iph); - - iph->check = 0; - iph->check = csum_ip4(iph, iph->ihl * 4); - - if (iph->protocol == IPPROTO_TCP) - csum_tcp4(iph); - else if (iph->protocol == IPPROTO_UDP) - uh->check = 0; - - memcpy(eh->h_dest, entry->hs, ETH_ALEN); - memcpy(eh->h_source, entry->hd, ETH_ALEN); - eh->h_proto = ntohs(ETH_P_IP); - - memcpy(eh + 1, in, len); - - if (iph->protocol == IPPROTO_ICMP) { - fprintf(stderr, "icmp (socket %i) to tap: %s -> %s\n", - entry->fd, - inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), - inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d))); - } else { - fprintf(stderr, "%s (socket %i) to tap: %s:%i -> %s:%i\n", - getprotobynumber(iph->protocol)->p_name, - entry->fd, - inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), - ntohs(th->source), - inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)), - ntohs(th->dest)); - } + sl = sizeof(so); - if (send(c->fd_unix, buf, len + sizeof(*eh), 0) < 0) - perror("send"); + if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &so, &sl) || + so == SOCK_STREAM) + tcp_sock_handler(c, fd, events); + else if (so == SOCK_DGRAM) + udp_sock_handler(c, fd, events); } /** - * ext6_handler() - IPv6 packet handler for 
external routable interface + * periodic_handler() - Run periodic tasks for L4 protocol handlers * @c: Execution context - * @fd: File descriptor that received the packet - * @len: Total L4 packet length - * @in: Packet buffer, L4 headers + * @last: Timestamp of last run, updated on return */ -static int ext6_handler(struct ctx *c, int fd, int len, char *in) +static void periodic_handler(struct ctx *c, struct timespec *last) { - struct tcphdr *th = (struct tcphdr *)in; - struct udphdr *uh; - struct icmp6hdr *ih; - char buf_s[BUFSIZ], buf_d[BUFSIZ], buf[ETH_MAX_MTU] = { 0 }; - struct ethhdr *eh = (struct ethhdr *)buf; - struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); - struct ct6 *entry; - - entry = lookup_r6(c->map6, fd, th); - if (!entry) - return 0; - - ip6h->daddr = entry->sa; - ip6h->saddr = entry->da; - memcpy(ip6h + 1, in, len); - ip6h->payload_len = htons(len); - - th = (struct tcphdr *)(ip6h + 1); - uh = (struct udphdr *)th; - ih = (struct icmp6hdr *)th; - ip6h->hop_limit = entry->p; - - if (entry->p == IPPROTO_TCP) { - th->check = 0; - th->check = csum_ip4(ip6h, len + sizeof(*ip6h)); - } else if (entry->p == IPPROTO_UDP) { - uh->check = 0; - uh->check = csum_ip4(ip6h, len + sizeof(*ip6h)); - } else if (entry->p == IPPROTO_ICMPV6) { - ih->icmp6_cksum = 0; - ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h)); - } - - ip6h->version = 6; - ip6h->nexthdr = entry->p; - ip6h->hop_limit = 255; - - memcpy(eh->h_dest, entry->hs, ETH_ALEN); - memcpy(eh->h_source, entry->hd, ETH_ALEN); - eh->h_proto = ntohs(ETH_P_IPV6); - - if (entry->p == IPPROTO_ICMPV6) { - fprintf(stderr, "icmpv6 (socket %i) to tap: %s\n\t-> %s\n", - entry->fd, - inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)), - inet_ntop(AF_INET6, &ip6h->daddr, buf_d, - sizeof(buf_d))); - } else { - fprintf(stderr, "%s (socket %i) to tap: [%s]:%i\n" - "\t-> [%s]:%i\n", - getprotobynumber(entry->p)->p_name, - entry->fd, - inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)), - ntohs(th->source), - 
inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)), - ntohs(th->dest)); - } + struct timespec tmp; + int elapsed_ms; - if (send(c->fd_unix, buf, len + sizeof(*ip6h) + sizeof(*eh), 0) < 0) - perror("send"); + clock_gettime(CLOCK_MONOTONIC, &tmp); + elapsed_ms = timespec_diff_ms(&tmp, last); - return 1; -} + if (elapsed_ms >= PERIODIC_HANDLER_FAST) + tcp_periodic_fast(c); + if (elapsed_ms >= PERIODIC_HANDLER_SLOW) + tcp_periodic_slow(c); -static void ext_handler(struct ctx *c, int fd, int len, char *in) -{ - if (!ext6_handler(c, fd, len, in)) - ext4_handler(c, fd, len, in); + *last = tmp; } /** @@ -924,6 +480,7 @@ int main(int argc, char **argv) char buf4[4][sizeof("255.255.255.255")]; struct epoll_event events[EPOLL_EVENTS]; struct epoll_event ev = { 0 }; + struct timespec last_time; char buf[ETH_MAX_MTU]; struct ctx c = { 0 }; int nfds, i, len; @@ -958,55 +515,71 @@ int main(int argc, char **argv) } fprintf(stderr, "\n"); + if (clock_gettime(CLOCK_MONOTONIC, &last_time)) { + perror("clock_gettime"); + exit(EXIT_FAILURE); + } + c.epollfd = epoll_create1(0); if (c.epollfd == -1) { perror("epoll_create1"); exit(EXIT_FAILURE); } + if (tcp_sock_init(&c) || udp_sock_init(&c)) + exit(EXIT_FAILURE); + fd_unix = sock_unix(); listen: listen(fd_unix, 1); fprintf(stderr, "You can now start qrap:\n\t" - "./qrap 42 kvm ... -net tap,fd=42 -net nic,model=virtio\n\n"); + "./qrap 5 kvm ... 
-net socket,fd=5 -net nic,model=virtio\n\n"); c.fd_unix = accept(fd_unix, NULL, NULL); - ev.events = EPOLLIN; + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP; ev.data.fd = c.fd_unix; epoll_ctl(c.epollfd, EPOLL_CTL_ADD, c.fd_unix, &ev); + clock_gettime(CLOCK_MONOTONIC, &last_time); + loop: - nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, -1); + nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, EPOLL_TIMEOUT); if (nfds == -1 && errno != EINTR) { perror("epoll_wait"); exit(EXIT_FAILURE); } for (i = 0; i < nfds; i++) { - len = recv(events[i].data.fd, buf, sizeof(buf), MSG_DONTWAIT); + if (events[i].data.fd == c.fd_unix) { + len = recv(events[i].data.fd, buf, sizeof(buf), + MSG_DONTWAIT); + + if (len <= 0) { + epoll_ctl(c.epollfd, EPOLL_CTL_DEL, c.fd_unix, + &ev); + close(c.fd_unix); + goto listen; + } - if (events[i].data.fd == c.fd_unix && len <= 0) { - epoll_ctl(c.epollfd, EPOLL_CTL_DEL, c.fd_unix, &ev); - close(c.fd_unix); - goto listen; - } + if (len == 0 || (len < 0 && errno == EINTR)) + continue; - if (len == 0 || (len < 0 && errno == EINTR)) - continue; + if (len < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; + goto out; + } - if (len < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - break; - goto out; + tap_handler(&c, buf + 4, ntohl(*(uint32_t *)buf)); + } else { + sock_handler(&c, events[i].data.fd, events[i].events); } - - if (events[i].data.fd == c.fd_unix) - tap_handler(&c, len, buf); - else - ext_handler(&c, events[i].data.fd, len, buf); } + periodic_handler(&c, &last_time); + clock_gettime(CLOCK_MONOTONIC, &last_time); + goto loop; out: @@ -1,56 +1,12 @@ -#define CT_SIZE 4096 #define UNIX_SOCK_PATH "/tmp/passt.socket" /** - * struct ct4 - IPv4 connection tracking entry - * @p: IANA protocol number - * @sa: Source address (as seen from tap interface) - * @da: Destination address - * @sp: Source port, network order - * @dp: Destination port, network order - * @hd: Destination MAC address - * @hs: Source MAC address - * 
@fd: File descriptor for corresponding AF_INET socket - */ -struct ct4 { - uint8_t p; - uint32_t sa; - uint32_t da; - uint16_t sp; - uint16_t dp; - unsigned char hd[ETH_ALEN]; - unsigned char hs[ETH_ALEN]; - int fd; -}; - -/** - * struct ct6 - IPv6 connection tracking entry - * @p: IANA protocol number - * @sa: Source address (as seen from tap interface) - * @da: Destination address - * @sp: Source port, network order - * @dp: Destination port, network order - * @hd: Destination MAC address - * @hs: Source MAC address - * @fd: File descriptor for corresponding AF_INET6 socket - */ -struct ct6 { - uint8_t p; - struct in6_addr sa; - struct in6_addr da; - uint16_t sp; - uint16_t dp; - unsigned char hd[ETH_ALEN]; - unsigned char hs[ETH_ALEN]; - int fd; -}; - -/** * struct ctx - Execution context * @epollfd: file descriptor for epoll instance * @fd_unix: AF_UNIX socket for tap file descriptor - * @map4: Connection tracking table * @v4: Enable IPv4 transport + * @mac: Host MAC address + * @mac_guest: Guest MAC address * @addr4: IPv4 address for external, routable interface * @mask4: IPv4 netmask, network order * @gw4: Default IPv4 gateway, network order @@ -64,9 +20,8 @@ struct ct6 { struct ctx { int epollfd; int fd_unix; - struct ct4 map4[CT_SIZE]; - struct ct6 map6[CT_SIZE]; unsigned char mac[ETH_ALEN]; + unsigned char mac_guest[ETH_ALEN]; int v4; unsigned long addr4; @@ -76,6 +31,7 @@ struct ctx { int v6; struct in6_addr addr6; + struct in6_addr addr6_guest; struct in6_addr gw6; struct in6_addr dns6; @@ -1,9 +1,11 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + /* PASST - Plug A Simple Socket Transport * - * qrap.c - qemu wrapper connecting UNIX domain socket to tap file descriptor + * qrap.c - qemu wrapper connecting UNIX domain socket to socket file descriptor * + * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio <sbrivio@redhat.com> - * License: GPLv2 * * TODO: Implement this functionality directly in qemu: we have TCP and UDP * socket back-ends 
already. @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + +/* PASST - Plug A Simple Socket Transport + * + * tap.c - Functions to communicate with guest-facing tap interface + * + * Copyright (c) 2020-2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + * + */ + +#include <stdio.h> +#include <limits.h> +#include <string.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdint.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> + +#include "passt.h" +#include "util.h" + +/** + * tap_send() - Send frame and qemu socket header with indication of length + * @fd: tap file descriptor + * @len: Total L2 packet length + * @flags: Flags for send(), if any + * + * Return: return code from send() + */ +int tap_send(int fd, void *data, size_t len, int flags) +{ + uint32_t vnet_len = htonl(len); + send(fd, &vnet_len, 4, 0); + + return send(fd, data, len, flags); +} + +/** + * tap_ip_send() - Send IP packet, with L2 headers, calculating L3/L4 checksums + * @c: Execution context + * @src: IPv6 source address, IPv4-mapped for IPv4 sources + * @proto: L4 protocol number + * @in: Payload + * @len: L4 payload length + */ +void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, + char *in, size_t len) +{ + char pkt[USHRT_MAX]; + struct ethhdr *eh; + + eh = (struct ethhdr *)pkt; + + /* TODO: ARP table lookup */ + memcpy(eh->h_dest, c->mac_guest, ETH_ALEN); + memcpy(eh->h_source, c->mac, ETH_ALEN); + + if (IN6_IS_ADDR_V4MAPPED(src)) { + struct iphdr *iph = (struct iphdr *)(eh + 1); + char *data = (char *)(iph + 1); + + eh->h_proto = ntohs(ETH_P_IP); + + iph->version = 4; + iph->ihl = 5; + iph->tos = 0; + iph->tot_len = htons(len + 20); + iph->id = 0; + iph->frag_off = 0; + iph->ttl = 255; + iph->protocol = proto; + iph->daddr = c->addr4; + memcpy(&iph->saddr, &src->s6_addr[12], 4); + + iph->check = 0; + 
iph->check = csum_ip4(iph, iph->ihl * 4); + + memcpy(data, in, len); + + if (iph->protocol == IPPROTO_TCP) { + csum_tcp4(iph); + } else if (iph->protocol == IPPROTO_UDP) { + struct udphdr *uh = (struct udphdr *)(iph + 1); + + uh->check = 0; + } + + tap_send(c->fd_unix, pkt, len + sizeof(*iph) + sizeof(*eh), 0); + } else { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); + char *data = (char *)(ip6h + 1); + + eh->h_proto = ntohs(ETH_P_IPV6); + + memset(ip6h->flow_lbl, 0, 3); + ip6h->payload_len = htons(len); + ip6h->priority = 0; + + ip6h->saddr = *src; + ip6h->daddr = c->addr6_guest; + + memcpy(data, in, len); + + ip6h->hop_limit = proto; + ip6h->version = 0; + ip6h->nexthdr = 0; + if (proto == IPPROTO_TCP) { + struct tcphdr *th = (struct tcphdr *)(ip6h + 1); + + th->check = 0; + th->check = csum_ip4(ip6h, len + sizeof(*ip6h)); + } else if (proto == IPPROTO_UDP) { + struct udphdr *uh = (struct udphdr *)(ip6h + 1); + + uh->check = 0; + uh->check = csum_ip4(ip6h, len + sizeof(*ip6h)); + } else if (proto == IPPROTO_ICMPV6) { + struct icmp6hdr *ih = (struct icmp6hdr *)(ip6h + 1); + + ih->icmp6_cksum = 0; + ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h)); + } + ip6h->version = 6; + ip6h->nexthdr = proto; + ip6h->hop_limit = 255; + + tap_send(c->fd_unix, pkt, len + sizeof(*ip6h) + sizeof(*eh), 0); + } +} @@ -0,0 +1,3 @@ +void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, + char *in, size_t len); +int tap_send(int fd, void *data, size_t len, int flags); @@ -0,0 +1,1367 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + +/* PASST - Plug A Simple Socket Transport + * + * tcp.c - TCP L2-L4 translation state machine + * + * Copyright (c) 2020-2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + * + */ + +/** + * DOC: Theory of Operation + * + * + * Overview + * -------- + * + * This implementation maps TCP traffic between a single L2 interface (tap) and + * native TCP (L4) sockets, mimicking and reproducing as closely as possible 
the + * inferred behaviour of applications running on a guest, connected via said L2 + * interface. Four connection flows are supported: + * - from the local host to the guest behind the tap interface: + * - this is the main use case for proxies in service meshes + * - we bind to all unbound local ports, and relay traffic between L4 sockets + * with local endpoints and the L2 interface + * - from remote hosts to the guest behind the tap interface: + * - this might be needed for services that need to be addressed directly, + * and typically configured with special port forwarding rules (which are + * not needed here) + * - we also relay traffic between L4 sockets with remote endpoints and the L2 + * interface + * - from the guest to the local host: + * - this is not observed in practice, but implemented for completeness and + * transparency + * - from the guest to external hosts: + * - this might be needed for applications running on the guest that need to + * directly access internet services (e.g. NTP) + * + * Relevant goals are: + * - transparency: sockets need to behave as if guest applications were running + * directly on the host. This is achieved by: + * - avoiding port and address translations whenever possible + * - mirroring TCP dynamics by observation of socket parameters (TCP_INFO + * socket option) and TCP headers of packets coming from the tap interface, + * reapplying those parameters in both flow directions (including TCP_MSS, + * TCP_WINDOW_CLAMP socket options) + * - simplicity: only a small subset of TCP logic is implemented here and + * delegated as much as possible to the TCP implementations of guest and host + * kernel. 
This is achieved by: + * - avoiding a complete TCP stack reimplementation, with a modified TCP state + * machine focused on the translation of observed states instead + * - mirroring TCP dynamics as described above and hence avoiding the need for + * segmentation, explicit queueing, and reassembly of segments + * - security: + * - no dynamic memory allocation is performed + * - TODO: synflood protection + * - TODO: sequence collision attacks + * + * Portability is limited by usage of Linux-specific socket options. + * + * + * Limits + * ------ + * + * To avoid the need for dynamic memory allocation, a maximum, reasonable number + * of connections is defined by MAX_CONNS below (currently 256k, close to + * the maximum number of file descriptors typically available to a process on + * Linux). + * + * While fragmentation and reassembly are not implemented, tracking of missing + * segments and retransmissions needs to be, thus data needs to linger on + * sockets as long as it's not acknowledged by the guest, and read using + * MSG_PEEK into a single, preallocated static buffer sized to the maximum + * supported window, 64MiB. This imposes a practical limitation on window + * scaling, that is, the maximum factor is 1024. If a bigger window scaling + * factor is observed during connection establishment, connection is reset and + * reestablished by omitting the scaling factor in the SYN segment. This + * limitation only applies to the window scaling advertised by the guest, but + * if exceeded, no window scaling will be allowed at all toward either endpoint. + * + * + * Ports + * ----- + * + * To avoid the need for ad-hoc configuration of port forwarding or allowed + * ports, listening sockets are opened and bound to all unbound ports on the + * host, as far as process capabilities allow. This service needs to be started + * after any application proxy that needs to bind to local ports. 
+ * + * No port translation is needed for connections initiated remotely or by the + * local host: source port from socket is reused while establishing connections + * to the guest. + * + * For connections initiated by the guest, it's not possible to force the same + * source port as connections are established by the host kernel: that's the + * only port translation needed. + * + * + * Connection tracking and storage + * ------------------------------- + * + * Connections are tracked by the @tc array of struct tcp_conn, containing + * addresses, ports, TCP states and parameters. This is statically allocated and + * indices are the file descriptor numbers associated with inbound or outbound + * sockets. + * + * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for + * separate data structures depending on the protocol version. + * + * - Inbound connection requests (to the guest) are mapped using the triple + * < source IP address, source port, destination port > + * - Outbound connection requests (from the guest) are mapped using the triple + * < destination IP address, destination port, source port > + * where the source port is the one used by the guest, not the one used by the + * corresponding host socket + * + * + * Initialisation + * -------------- + * + * Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for + * IPv4 and IPv6) are opened and bound to wildcard addresses. Some will fail to + * bind (for low ports, or ports already bound, e.g. by a proxy). These are + * added to the epoll list, with no separate storage. + * + * + * States and events + * ----------------- + * + * These states apply to connected sockets only, listening sockets are always + * open after initialisation, in LISTEN state. A single state is maintained for + * both sides of the connection, and most states are omitted as they are already + * handled by host kernel and guest. 
+ * + * - CLOSED no connection + * No associated events: this is always a final state, new connections + * directly start from TAP_SYN_SENT or SOCK_SYN_SENT described below. + * + * - TAP_SYN_SENT connect() in progress, triggered from tap + * - connect() completes SYN,ACK to tap > TAP_SYN_RCVD + * - connect() aborts RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - SOCK_SYN_SENT new connected socket, SYN sent to tap + * - SYN,ACK from tap ACK to tap > ESTABLISHED + * - SYN,ACK timeout RST to tap, close socket > CLOSED + * - socket error RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - TAP_SYN_RCVD connect() completed, SYN,ACK sent to tap + * - ACK from tap > ESTABLISHED + * - ACK timeout RST to tap, close socket > CLOSED + * - socket error RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - ESTABLISHED connection established, ready for data + * - zero-sized socket read FIN to tap > ESTABLISHED_SOCK_FIN + * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN + * - socket error RST to tap, close socket > CLOSED + * - FIN from tap FIN,ACK to tap, close socket > FIN_WAIT_1 + * - RST from tap close socket > CLOSED + * + * - ESTABLISHED_SOCK_FIN socket wants to close connection, data allowed + * - ACK from tap > CLOSE_WAIT + * - ACK timeout RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - CLOSE_WAIT socket wants to close connection, seen by tap + * - socket error RST to tap, close socket > CLOSED + * - FIN from tap ACK to tap, close socket > LAST_ACK + * - FIN timeout RST to tap, close socket > CLOSED + * - RST from tap close socket > CLOSED + * + * - LAST_ACK socket started close, tap completed it + * - anything from socket close socket > CLOSED + * - socket error RST to tap, close socket > CLOSED + * - ACK timeout RST to tap, close socket > CLOSED + * + * - FIN_WAIT_1 tap wants to close connection, _FIN,ACK sent_ + * - ACK from tap 
close socket > CLOSED + * - socket error RST to tap, close socket > CLOSED + * - ACK timeout RST to tap, close socket > CLOSED + * + * + * Connection setup + * ---------------- + * + * - inbound connection (from socket to guest): on accept() from listening + * socket, the new socket is mapped in connection tracking table, and + * three-way handshake initiated towards the guest, advertising MSS and window + * size and scaling from socket parameters + * - outbound connection (from guest to socket): on SYN segment from guest, a + * new socket is created and mapped in connection tracking table, setting + * MSS and window clamping from header and option of the observed SYN segment + * + * + * Aging and timeout + * ----------------- + * + * Two bitmaps of MAX_CONNS bits indicate which connections need scheduled + * actions: + * - @tcp_act_fast is used to send ACK segments to the tap once TCP_INFO reports + * an increased number of acknowledged bytes sent on a socket, and examined + * every 20ms (one tenth of current TCP_DELACK_MAX on Linux): for each marked + * connection, a TCP_INFO query is performed and ACK segments are sent right + * away as needed + * - @tcp_act_slow is used for state and retransmission timeouts, and examined + * every 2s: for each marked connection with an expired @timeout timestamp + * specific actions are taken depending on the connection state: + * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment + * from tap expires, connection is reset (RST to tap, socket closed) + * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from + * tap expires, connection is reset (RST to tap, socket closed) + * - ESTABLISHED: after a timeout of 3s (TODO: implement requirements from + * RFC 6298) waiting for an ACK segment from tap expires, data from socket + * queue is retransmitted starting from the last ACK sequence + * - ESTABLISHED: after a two-hour (current TCP_KEEPALIVE_TIME on Linux) + * timeout waiting for 
any activity expires, connection is reset (RST to + * tap, socket closed) + * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK + * segment from tap expires, connection is reset (RST to tap, socket closed) + * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from + * tap expires, connection is reset (RST to tap, socket closed) + * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from + * socket expires, connection is reset (RST to tap, socket closed) + * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from + * tap expires, connection is reset (RST to tap, socket closed) + * + * + * Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states) + * ---------------------------------------------------------- + * + * @seq_to_tap: next sequence for packets to tap + * @seq_ack_from_tap: last ACK number received from tap + * @seq_from_tap: next sequence for packets from tap (not actually sent) + * @seq_ack_to_tap: last ACK number sent to tap + * + * @seq_init_from_tap: initial sequence number from tap + * + * @tap_window: last window size received from tap, scaled + * @tcpi_acked_last: most recent value of tcpi_bytes_acked (TCP_INFO) + * + * - from socket to tap: + * - on new data from socket: + * - peek into buffer + * - send data to tap: + * - starting at offset (@seq_to_tap - @seq_ack_from_tap) + * - in MSS-sized segments + * - increasing @seq_to_tap at each segment + * - up to window (until @seq_to_tap - @seq_ack_from_tap <= @tap_window) + * - mark socket in bitmap for periodic ACK check, set @last_ts_to_tap + * - on read error, send RST to tap, close socket + * - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN + * - on ACK from tap: + * - check if it's the second duplicated ACK + * - consume buffer by difference between new ack_seq and @seq_ack_from_tap + * - update @seq_ack_from_tap from ack_seq in header + * - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and + * 
resend with steps listed above + * - set TCP_WINDOW_CLAMP from TCP header from tap + * - on @seq_ack_from_tap == @seq_to_tap, mark in bitmap, unmark otherwise + * - periodically: + * - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer + * (TODO: implement requirements from RFC 6298, currently 3s fixed) from + * @last_ts_to_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and + * resend data with the steps listed above + * + * - from tap to socket: + * - on packet from tap: + * - set TCP_WINDOW_CLAMP from TCP header from tap + * - check seq from header against @seq_from_tap, if data is missing, send + * two ACKs with number @seq_ack_to_tap, discard packet + * - otherwise queue data to socket, set @seq_from_tap to seq from header + * plus payload length + * - query socket for TCP_INFO, on tcpi_bytes_acked > @tcpi_acked_last, + * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap + * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and + * send ACK to tap + * - set @last_ts_sock + * - on @seq_ack_to_tap < @seq_from_tap, mark socket for later ACK in bitmap + * - periodically: + * - if socket is marked in bitmap, query socket for TCP_INFO, on + * tcpi_bytes_acked > @tcpi_acked_last, + * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap + * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and + * send ACK to tap + * - on @seq_ack_to_tap == @seq_from_tap, unmark socket from bitmap + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <limits.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <unistd.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <time.h> + +#include "passt.h" +#include "tap.h" +#include "util.h" + +/* Approximate maximum number of open descriptors per process */ +#define MAX_CONNS 
(256 * 1024) + +#define MAX_WS 10 +#define MAX_WINDOW (1 << (16 + (MAX_WS))) +#define MSS_DEFAULT 536 +#define WINDOW_DEFAULT 4380 + +#define SYN_TIMEOUT 240000 /* ms */ +#define ACK_TIMEOUT 3000 +#define ACT_TIMEOUT 7200000 +#define FIN_TIMEOUT 240000 +#define LAST_ACK_TIMEOUT 240000 + +#define SOCK_ACK_INTERVAL 20 + +/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of + * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP + */ +#define SOL_TCP IPPROTO_TCP + +static char tcp_in_buf[MAX_WINDOW]; + +static uint8_t tcp_act_fast[MAX_CONNS / 8] = { 0 }; +static uint8_t tcp_act_slow[MAX_CONNS / 8] = { 0 }; + +enum tcp_state { + CLOSED = 0, + TAP_SYN_SENT, + SOCK_SYN_SENT, + TAP_SYN_RCVD, + ESTABLISHED, + ESTABLISHED_SOCK_FIN, + CLOSE_WAIT, + LAST_ACK, + FIN_WAIT_1, +}; + +#define FIN (1 << 0) +#define SYN (1 << 1) +#define RST (1 << 2) +#define ACK (1 << 4) + +#define OPT_EOL 0 +#define OPT_NOP 1 +#define OPT_MSS 2 +#define OPT_WS 3 +#define OPT_SACKP 4 +#define OPT_SACK 5 +#define OPT_TS 8 + +/** + * struct tcp_conn - Descriptor for a TCP connection + * @a.a6: IPv6 remote address, can be IPv4-mapped + * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 + * @a.a4.one: Ones prefix for IPv4-mapped + * @a.a4.a: IPv4 address + * @tap_port: Guest-facing tap port + * @sock_port: Remote, socket-facing port + * @s: TCP connection state + * @seq_to_tap: Next sequence for packets to tap + * @seq_ack_from_tap: Last ACK number received from tap + * @seq_from_tap: Next sequence for packets from tap (not actually sent) + * @seq_ack_to_tap: Last ACK number sent to tap + * @seq_init_from_tap: Initial sequence number from tap + * @tcpi_acked_last: Most recent value of tcpi_bytes_acked (TCP_INFO query) + * @dup_acks: Count of currently duplicated ACKs from tap + * @ws_allowed: Window scaling allowed + * @ws: Window scaling factor + * @tap_window: Last window size received from tap, scaled + * @last_ts_sock: Last activity timestamp from 
socket for timeout purposes + * @last_ts_tap: Last activity timestamp from tap for timeout purposes + * @mss_guest: Maximum segment size advertised by guest + */ +struct tcp_conn { + union { + struct in6_addr a6; + struct { + uint8_t zero[10]; + uint8_t one[2]; + struct in_addr a; + } a4; + } a; + in_port_t tap_port; + in_port_t sock_port; + enum tcp_state s; + + uint32_t seq_to_tap; + uint32_t seq_ack_from_tap; + uint32_t seq_from_tap; + uint32_t seq_ack_to_tap; + uint32_t seq_init_from_tap; + uint64_t tcpi_acked_last; + int dup_acks; + + int ws_allowed; + int ws; + int tap_window; + + struct timespec last_ts_sock; + struct timespec last_ts_tap; + + int mss_guest; +}; + +static struct tcp_conn tc[MAX_CONNS]; + +static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len); + +/** + * tcp_act_fast_set() - Set socket in bitmap for "fast" timeout events + * @s: Socket file descriptor number + */ +static void tcp_act_fast_set(int s) +{ + tcp_act_fast[s / 8] |= 1 << (s % 8); +} + +/** + * tcp_act_fast_clear() - Clear socket from bitmap for "fast" timeout events + * @s: Socket file descriptor number + */ +static void tcp_act_fast_clear(int s) +{ + tcp_act_fast[s / 8] &= ~(1 << (s % 8)); +} + +/** + * tcp_act_slow_set() - Set socket in bitmap for "slow" timeout events + * @s: Socket file descriptor number + */ +static void tcp_act_slow_set(int s) +{ + tcp_act_slow[s / 8] |= 1 << (s % 8); +} + +/** + * tcp_act_slow_clear() - Clear socket from bitmap for "slow" timeout events + * @s: Socket file descriptor number + */ +static void tcp_act_slow_clear(int s) +{ + tcp_act_slow[s / 8] &= ~(1 << (s % 8)); +} + +/** + * tcp_opt_get() - Get option, and value if any, from TCP header + * @th: Pointer to TCP header + * @len: Length of buffer, including TCP header + * @type: Option type to look for + * @optlen: Optional, filled with option length if passed + * @value: Optional, set to start of option value if passed + * + * Return: Option value, meaningful for up to 
4 bytes, -1 if not found + */ +static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type, + uint8_t *optlen, void *value) +{ + uint8_t *p, __type, __optlen; + + len -= sizeof(*th); + p = (uint8_t *)(th + 1); + + if (len > th->doff * 4 - sizeof(*th)) + len = th->doff * 4 - sizeof(*th); + + while (len >= 2) { + switch (*p) { + case OPT_EOL: + return -1; + case OPT_NOP: + p++; + len--; + break; + default: + __type = *(p++); + __optlen = *(p++); + len -= 2; + + if (type == __type) { + if (optlen) + *optlen = __optlen; + if (value) + value = p; + + if (__optlen - 2 == 0) + return 0; + + if (__optlen - 2 == 1) + return *p; + + if (__optlen - 2 == 2) + return ntohs(*(uint16_t *)p); + + return ntohl(*(uint32_t *)p); + } + + p += __optlen - 2; + len -= __optlen - 2; + } + } + + return -1; +} + +/** + * tcp_close_and_epoll_del() - Close socket and remove from epoll descriptor + * @c: Execution context + * @s: File descriptor number for socket + */ +static void tcp_close_and_epoll_del(struct ctx *c, int s) +{ + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); + close(s); + tcp_act_fast_clear(s); + tcp_act_slow_clear(s); +} + +/** + * tcp_rst() - Reset a connection: send RST segment to tap, close socket + * @c: Execution context + * @s: File descriptor number for socket + */ +static void tcp_rst(struct ctx *c, int s) +{ + if (s < 0) + return; + + tcp_send_to_tap(c, s, RST, NULL, 0); + tcp_close_and_epoll_del(c, s); + tc[s].s = CLOSED; +} + +/** + * tcp_send_to_tap() - Send segment to tap, with options and values from socket + * @c: Execution context + * @s: File descriptor number for socket + * @flags: TCP flags to set + * @in: Input buffer, L4 header + * @len: Buffer length, at L4 + * + * Return: -1 on error with connection reset, 0 otherwise + */ +static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) +{ + char buf[USHRT_MAX] = { 0 }, *data; + struct tcp_info info = { 0 }; + socklen_t sl = sizeof(info); + int ws = 0, have_info = 1; + 
struct tcphdr *th; + + if (getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) { + if (!(flags & RST)) { + tcp_rst(c, s); + return -1; + } + + have_info = 0; + } + + th = (struct tcphdr *)buf; + data = (char *)(th + 1); + + if (flags & SYN && have_info) { + if (tc[s].ws_allowed) + ws = info.tcpi_snd_wscale; + + /* Options: MSS, NOP and window scale if allowed (4-8 bytes) */ + *data++ = 2; + *data++ = 4; + *(uint16_t *)data = htons(info.tcpi_snd_mss); + data += 2; + + if (ws) { + *data++ = 1; + + *data++ = 3; + *data++ = 3; + *data++ = ws; + + th->doff = (20 + 8) / 4; + } else { + th->doff = (20 + 4) / 4; + } + + th->seq = htonl(tc[s].seq_to_tap++); + } else { + th->doff = 20 / 4; + + th->seq = htonl(tc[s].seq_to_tap); + tc[s].seq_to_tap += len; + } + + if ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last || (flags & ACK) || + len) && + have_info) { + uint64_t ack_seq; + + th->ack = 1; + /* info.tcpi_bytes_acked already includes one byte for SYN, but + * not for incoming connections. + */ + ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap; + if (!info.tcpi_bytes_acked) + ack_seq++; + ack_seq &= (uint32_t)~0U; + + tc[s].seq_ack_to_tap = ack_seq; + th->ack_seq = htonl(tc[s].seq_ack_to_tap); + + tc[s].tcpi_acked_last = info.tcpi_bytes_acked; + } else { + if (!len && !flags) + return 0; + + th->ack = th->ack_seq = 0; + } + + th->rst = !!(flags & RST); + th->syn = !!(flags & SYN); + th->fin = !!(flags & FIN); + + th->source = tc[s].sock_port; + th->dest = tc[s].tap_port; + + if (have_info) + th->window = htons(info.tcpi_snd_wnd >> info.tcpi_snd_wscale); + else + th->window = WINDOW_DEFAULT; + + th->urg_ptr = 0; + th->check = 0; + + memcpy(data, in, len); + + tap_ip_send(c, &tc[s].a.a6, IPPROTO_TCP, buf, th->doff * 4 + len); + + return 0; +} + +/** + * tcp_clamp_window() - Set window and scaling from option, clamp on socket + * @s: File descriptor number for socket + * @th: TCP header, from tap + * @len: Buffer length, at L4 + */ +static void tcp_clamp_window(int s, 
struct tcphdr *th, int len) +{ + int ws; + + if (!tc[s].tap_window) { + ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); + if (ws >= 0 && ws <= MAX_WS) { + tc[s].ws_allowed = 1; + tc[s].ws = ws; + } else { + tc[s].ws_allowed = 0; + tc[s].ws = 0; + } + + /* First value is not scaled. Also, don't clamp yet, to avoid + * getting a zero scale just because we set a small window now. + */ + tc[s].tap_window = ntohs(th->window); + } else { + tc[s].tap_window = ntohs(th->window) << tc[s].ws; + setsockopt(s, SOL_TCP, TCP_WINDOW_CLAMP, + &tc[s].tap_window, sizeof(tc[s].tap_window)); + } +} + +/** + * tcp_conn_from_tap() - Handle connection request (SYN segment) from tap + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @addr: Remote address, pointer to sin_addr or sin6_addr + * @th: TCP header from tap + * @len: Packet length at L4 + */ +static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, + struct tcphdr *th, size_t len) +{ + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = th->dest, + .sin_addr = *(struct in_addr *)addr, + }; + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = th->dest, + .sin6_addr = *(struct in6_addr *)addr, + }; + struct epoll_event ev = { 0 }; + const struct sockaddr *sa; + socklen_t sl; + int s; + + s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (s < 0) + return; + + tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); + if (tc[s].mss_guest < 0) + tc[s].mss_guest = MSS_DEFAULT; + sl = sizeof(tc[s].mss_guest); + setsockopt(s, SOL_TCP, TCP_MAXSEG, &tc[s].mss_guest, sl); + + tcp_clamp_window(s, th, len); + + if (af == AF_INET) { + sa = (const struct sockaddr *)&addr4; + sl = sizeof(addr4); + + memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); + memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); + memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a)); + } else { + sa = (const struct sockaddr *)&addr6; + sl = sizeof(addr6); + + memcpy(&tc[s].a.a6, addr, 
sizeof(tc[s].a.a6)); + } + + tc[s].sock_port = th->dest; + tc[s].tap_port = th->source; + + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; + ev.data.fd = s; + + tc[s].seq_init_from_tap = ntohl(th->seq); + tc[s].seq_from_tap = tc[s].seq_init_from_tap + 1; + tc[s].seq_ack_to_tap = tc[s].seq_from_tap; + + /* TODO: RFC 6528 with SipHash, worth it? */ + tc[s].seq_ack_from_tap = tc[s].seq_to_tap = 0; + + if (connect(s, sa, sl)) { + if (errno != EINPROGRESS) { + tcp_rst(c, s); + return; + } + + ev.events |= EPOLLOUT; + tc[s].s = TAP_SYN_SENT; + } else { + if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0)) + return; + + tc[s].s = TAP_SYN_RCVD; + } + + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); + + return; +} + +/** + * tcp_sock_lookup() - Look up socket given remote address and pair of ports + * @af: Address family, AF_INET or AF_INET6 + * @tap_port: tap-facing port + * @sock_port: Socket-facing port + * + * Return: file descriptor number for socket, if found, -1 otherwise + */ +static int tcp_sock_lookup(int af, void *addr, + in_port_t tap_port, in_port_t sock_port) +{ + int i; + + /* TODO: hash table and lookup. This is just a dummy implementation. 
*/ + for (i = 0; i < MAX_CONNS; i++) { + if (af == AF_INET && IN6_IS_ADDR_V4MAPPED(&tc[i].a.a6) && + !memcmp(&tc[i].a.a4.a, addr, sizeof(tc[i].a.a4.a)) && + tc[i].tap_port == tap_port && + tc[i].sock_port == sock_port && + tc[i].s) + return i; + + if (af == AF_INET6 && + !memcmp(&tc[i].a.a6, addr, sizeof(tc[i].a.a6)) && + tc[i].tap_port == tap_port && + tc[i].sock_port == sock_port && + tc[i].s) + return i; + } + + return -1; +} + +/** + * tcp_conn_from_sock() - Handle new connection request from listening socket + * @c: Execution context + * @fd: File descriptor number for listening socket + */ +static void tcp_conn_from_sock(struct ctx *c, int fd) +{ + struct sockaddr_storage sa_r, sa_l; + socklen_t sa_len = sizeof(sa_r); + struct epoll_event ev = { 0 }; + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa4; + int s; + + if (getsockname(fd, (struct sockaddr *)&sa_l, &sa_len)) + return; + + s = accept4(fd, (struct sockaddr *)&sa_r, &sa_len, SOCK_NONBLOCK); + if (s == -1) + return; + + if (sa_l.ss_family == AF_INET) { + sa4 = (struct sockaddr_in *)&sa_r; + + memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); + memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); + memcpy(&tc[s].a.a4.a, &sa4->sin_addr, sizeof(tc[s].a.a4.a)); + + tc[s].sock_port = sa4->sin_port; + + sa4 = (struct sockaddr_in *)&sa_l; + tc[s].tap_port = sa4->sin_port; + + } else if (sa_l.ss_family == AF_INET6) { + sa6 = (struct sockaddr_in6 *)&sa_r; + + memcpy(&tc[s].a.a6, &sa6->sin6_addr, sizeof(tc[s].a.a6)); + + tc[s].sock_port = sa6->sin6_port; + + sa6 = (struct sockaddr_in6 *)&sa_l; + tc[s].tap_port = sa6->sin6_port; + } + + /* TODO: RFC 6528 with SipHash, worth it? 
*/ + tc[s].seq_to_tap = 0; + + tc[s].ws_allowed = 1; + + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock); + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); + + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; + ev.data.fd = s; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); + + tc[s].s = SOCK_SYN_SENT; + tcp_send_to_tap(c, s, SYN, NULL, 0); +} + +/** + * tcp_send_to_sock() - Send buffer to socket, update timestamp and sequence + * @c: Execution context + * @s: File descriptor number for socket + * @seq: Previous TCP sequence, host order + * @data: Data buffer + * @len: Length at L4 + * @extra_flags: Additional flags for send(), if any + * + * Return: -1 on socket error with connection reset, 0 otherwise + */ +static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len, + int extra_flags) +{ + int err = send(s, data, len, MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags); + + if (err < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + /* If we can't queue right now, do nothing, sender has + * to retransmit. 
+ */ + return 0; + } + + tcp_rst(c, s); + return -1; + } + + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock); + tc[s].seq_from_tap = seq + len; + + return 0; +} + +/** + * tcp_check_dupack() - Check if given ACK number is duplicated, update counter + * @s: File descriptor number for socket + * @ack_seq: ACK sequence, host order + * + * Return: 1 on two duplicated ACKs observed, with counter reset, 0 otherwise + */ +static int tcp_check_dupack(int s, uint32_t ack_seq) +{ + if (ack_seq == tc[s].seq_ack_from_tap && ++tc[s].dup_acks == 2) { + tc[s].dup_acks = 0; + return 1; + } + + return 0; +} + +/** + * tcp_sock_consume() - Consume (discard) data from socket buffer + * @s: File descriptor number for socket + * @ack_seq: ACK sequence, host order + * + * Return: -1 on invalid sequence, 0 otherwise + */ +static int tcp_sock_consume(int s, uint32_t ack_seq) +{ + int to_ack; + + /* Implicitly take care of wrap-arounds */ + to_ack = ack_seq - tc[s].seq_ack_from_tap; + + if (to_ack < 0) + return -1; + + recv(s, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC); + tc[s].seq_ack_from_tap = ack_seq; + + return 0; +} + +/** + * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window + * @c: Execution context + * @s: File descriptor number for socket + * + * Return: non-zero on socket error or pending data, 0 otherwise + */ +static int tcp_data_from_sock(struct ctx *c, int s) +{ + int len, offset, left, send; + + /* Don't dequeue until acknowledged by guest */ + len = recv(s, tcp_in_buf, sizeof(tcp_in_buf), MSG_DONTWAIT | MSG_PEEK); + if (len < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) + tcp_rst(c, s); + return 1; + } + + if (len == 0) { + if (tc[s].s >= ESTABLISHED_SOCK_FIN) + return 0; + + tc[s].s = ESTABLISHED_SOCK_FIN; + if (tcp_send_to_tap(c, s, FIN | ACK, NULL, 0)) + return 0; + + left = 0; + goto out; + } + + offset = tc[s].seq_to_tap - tc[s].seq_ack_from_tap; + left = len - offset; + while (left && offset + tc[s].mss_guest <= tc[s].tap_window) { + 
if (left < tc[s].mss_guest) + send = left; + else + send = tc[s].mss_guest; + + if (tcp_send_to_tap(c, s, 0, tcp_in_buf + offset, send)) + return 0; + + offset += send; + left -= send; + } + +out: + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); + tcp_act_slow_set(s); + + return !!left; +} + +/** + * tcp_tap_handler() - Handle packets from tap and state transitions + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @in: Input buffer + * @len: Length, including TCP header + */ +void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) +{ + struct tcphdr *th = (struct tcphdr *)in; + size_t off; + int s, ws; + + if (len < sizeof(*th)) + return; + + off = th->doff * 4; + if (off < sizeof(*th) || off > len) + return; + + s = tcp_sock_lookup(af, addr, th->source, th->dest); + + if (s < 0) { + if (th->syn) + tcp_conn_from_tap(c, af, addr, th, len); + return; + } + + if (th->rst) { + tcp_close_and_epoll_del(c, s); + return; + } + + tcp_clamp_window(s, th, len); + + if (th->ack) + clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap); + + switch (tc[s].s) { + case SOCK_SYN_SENT: + if (!th->syn || !th->ack) + return; + + tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); + if (tc[s].mss_guest < 0) + tc[s].mss_guest = MSS_DEFAULT; + + ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); + if (ws > MAX_WS) { + if (tcp_send_to_tap(c, s, RST, NULL, 0)) + return; + + tc[s].seq_to_tap = 0; + tc[s].ws_allowed = 0; + tcp_send_to_tap(c, s, SYN, NULL, 0); + return; + } + + tc[s].seq_from_tap = tc[s].seq_init_from_tap = ntohl(th->seq); + tc[s].seq_ack_to_tap = tc[s].seq_from_tap; + + tc[s].s = ESTABLISHED; + tcp_send_to_tap(c, s, ACK, NULL, 0); + break; + case TAP_SYN_SENT: + break; + case TAP_SYN_RCVD: + if (th->fin) { + shutdown(s, SHUT_WR); + tc[s].s = FIN_WAIT_1; + + break; + } + + if (!th->ack) { + tcp_rst(c, s); + return; + } + + tc[s].seq_ack_from_tap = ntohl(th->ack_seq); + + tc[s].s = ESTABLISHED; + break; + case ESTABLISHED: + 
if (th->ack) { + int retrans = 0; + + if (len == th->doff) + retrans = tcp_check_dupack(s, th->ack_seq); + + if (tcp_sock_consume(s, ntohl(th->ack_seq))) { + tcp_rst(c, s); + return; + } + + if (retrans) { + tc[s].seq_to_tap = tc[s].seq_ack_from_tap; + tcp_data_from_sock(c, s); + } + } + + if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off, + th->psh ? 0 : MSG_MORE)) + break; + + if (th->fin) { + shutdown(s, SHUT_WR); + tc[s].s = FIN_WAIT_1; + } + + break; + case ESTABLISHED_SOCK_FIN: + if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off, + th->psh ? 0 : MSG_MORE) < 0) + break; + + if (th->ack) { + shutdown(s, SHUT_RD); + if (!tcp_data_from_sock(c, s)) + tc[s].s = CLOSE_WAIT; + + if (tcp_sock_consume(s, ntohl(th->ack_seq))) { + tcp_rst(c, s); + return; + } + } + + break; + + case CLOSE_WAIT: + if (tcp_sock_consume(s, ntohl(th->ack_seq))) { + tcp_rst(c, s); + return; + } + + if (th->fin) { + shutdown(s, SHUT_WR); + tc[s].s = LAST_ACK; + } + + break; + case FIN_WAIT_1: + case LAST_ACK: + case CLOSED: /* ;) */ + break; + } + + if (tc[s].seq_to_tap > tc[s].seq_ack_from_tap) + tcp_act_slow_set(s); + else + tcp_act_slow_clear(s); + + if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap) + tcp_act_fast_set(s); + else + tcp_act_fast_clear(s); +} + +/** + * tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event + * @c: Execution context + * @s: File descriptor number for socket + */ +static void tcp_connect_finish(struct ctx *c, int s) +{ + struct epoll_event ev = { 0 }; + socklen_t sl; + int so; + + sl = sizeof(so); + if (getsockopt(s, SOL_SOCKET, SO_ERROR, &so, &sl) || so) { + tcp_rst(c, s); + return; + } + + if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0) < 0) + return; + + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; + ev.data.fd = s; + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev); + + tc[s].s = TAP_SYN_RCVD; +} + +/** + * tcp_sock_handler() - Handle new data from socket + * @c: Execution context + * @s: File 
descriptor number for socket + * @events: epoll events bitmap + */ +void tcp_sock_handler(struct ctx *c, int s, uint32_t events) +{ + socklen_t sl; + int so; + + if (tc[s].s == LAST_ACK) { + tcp_close_and_epoll_del(c, s); + return; + } + + sl = sizeof(so); + if ((events & EPOLLERR) || + getsockopt(s, SOL_SOCKET, SO_ACCEPTCONN, &so, &sl)) { + if (tc[s].s != CLOSED) + tcp_rst(c, s); + return; + } + + if (so) { + tcp_conn_from_sock(c, s); + return; + } + + if (events & EPOLLOUT) { /* Implies TAP_SYN_SENT */ + tcp_connect_finish(c, s); + return; + } + + if (tc[s].s == ESTABLISHED) + tcp_data_from_sock(c, s); + + if (events & EPOLLRDHUP || events & EPOLLHUP) { + if (tc[s].s == ESTABLISHED) + tc[s].s = ESTABLISHED_SOCK_FIN; + + tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); + + if (tc[s].s == FIN_WAIT_1) { + shutdown(s, SHUT_RD); + + if (tcp_sock_consume(s, ntohl(tc[s].seq_ack_from_tap))) { + tcp_rst(c, s); + return; + } + + tcp_close_and_epoll_del(c, s); + tc[s].s = CLOSED; + } + } +} + +/** + * tcp_sock_init() - Create and bind listening sockets for inbound connections + * @c: Execution context + * + * Return: 0 on success, -1 on failure + */ +int tcp_sock_init(struct ctx *c) +{ + in_port_t port; + + for (port = 0; port < (1 << 15) + (1 << 14); port++) { + if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, htons(port)) < 0) + return -1; + if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, htons(port)) < 0) + return -1; + } + + return 0; +} + +/** + * tcp_periodic_fast_one() - Handler for "fast" timeout events on one socket + * @c: Execution context + * @s: File descriptor number for socket + * @ts: Timestamp from caller + * + * Return: 0 if socket needs to be monitored further, non-zero otherwise + */ +int tcp_periodic_fast_one(struct ctx *c, int s, struct timespec *ts) +{ + if (timespec_diff_ms(ts, &tc[s].last_ts_sock) < SOCK_ACK_INTERVAL) + return 0; + + tc[s].last_ts_sock = *ts; + + tcp_send_to_tap(c, s, 0, NULL, 0); + + return tc[s].seq_from_tap == tc[s].seq_ack_to_tap; +} + +/** + 
* tcp_periodic_fast() - Handle sockets in "fast" event bitmap, clear as needed + * @c: Execution context + */ +void tcp_periodic_fast(struct ctx *c) +{ + long *word = (long *)tcp_act_fast, tmp; + struct timespec now; + unsigned int i; + int n, s; + + clock_gettime(CLOCK_MONOTONIC, &now); + + for (i = 0; i < sizeof(tcp_act_fast) / sizeof(long); i++, word++) { + tmp = *word; + while ((n = ffsl(tmp))) { + tmp &= ~(1UL << (n - 1)); + + s = i * sizeof(long) * 8 + n - 1; + + if (tcp_periodic_fast_one(c, s, &now)) + *word &= ~(1UL << (n - 1)); + } + } +} + +/** + * tcp_periodic_fast_one() - Handler for "slow" timeout events on one socket + * @c: Execution context + * @s: File descriptor number for socket + * @ts: Timestamp from caller + */ +void tcp_periodic_slow_one(struct ctx *c, int s, struct timespec *ts) +{ + switch (tc[s].s) { + case SOCK_SYN_SENT: + case TAP_SYN_SENT: + case TAP_SYN_RCVD: + if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > SYN_TIMEOUT) + tcp_rst(c, s); + break; + case ESTABLISHED_SOCK_FIN: + if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) { + tcp_rst(c, s); + break; + } + /* Falls through */ + case ESTABLISHED: + if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap && + timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACK_TIMEOUT) { + tc[s].seq_to_tap = tc[s].seq_ack_from_tap; + tcp_data_from_sock(c, s); + } + + if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACT_TIMEOUT && + timespec_diff_ms(ts, &tc[s].last_ts_sock) > ACT_TIMEOUT) + tcp_rst(c, s); + + break; + case CLOSE_WAIT: + case FIN_WAIT_1: + if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) + tcp_rst(c, s); + break; + case LAST_ACK: + if (timespec_diff_ms(ts, &tc[s].last_ts_sock) > + LAST_ACK_TIMEOUT) + tcp_rst(c, s); + break; + case CLOSED: + break; + } +} + +/** + * tcp_periodic_slow() - Handle sockets in "slow" event bitmap + * @c: Execution context + */ +void tcp_periodic_slow(struct ctx *c) +{ + long *word = (long *)tcp_act_slow, tmp; + struct timespec now; + unsigned int i; + int 
n; + + clock_gettime(CLOCK_MONOTONIC, &now); + + for (i = 0; i < sizeof(tcp_act_slow) / sizeof(long); i++, word++) { + tmp = *word; + while ((n = ffsl(tmp))) { + tmp &= ~(1UL << (n - 1)); + tcp_periodic_slow_one(c, i * sizeof(long) * 8 + n - 1, + &now); + } + } +} @@ -0,0 +1,5 @@ +void tcp_sock_handler(struct ctx *c, int s, uint32_t events); +void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len); +int tcp_sock_init(struct ctx *c); +void tcp_periodic_fast(struct ctx *c); +void tcp_periodic_slow(struct ctx *c); @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + +/* PASST - Plug A Simple Socket Transport + * + * udp.c - UDP L2-L4 translation routines + * + * Copyright (c) 2020-2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + * + */ + +/** + * DOC: Theory of Operation + * + * + * For UDP, no state machine or any particular tracking is required. Try to + * create and bind sets of 2^16 sockets, one for IPv4 and one for IPv6. Binding + * will fail on ports that are already bound, or low ports depending on + * capabilities. + * + * Packets are forwarded back and forth, by prepending and stripping UDP headers + * in the obvious way, with no port translation. 
+ * + */ + +#include <stdio.h> +#include <errno.h> +#include <limits.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <unistd.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> +#include <time.h> + +#include "passt.h" +#include "tap.h" +#include "util.h" + +static int udp4_sock_port[USHRT_MAX]; +static int udp6_sock_port[USHRT_MAX]; + +/** + * udp_sock_handler() - Handle new data from socket + * @c: Execution context + * @s: File descriptor number for socket + * @events: epoll events bitmap + */ +void udp_sock_handler(struct ctx *c, int s, uint32_t events) +{ + struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0xff, 0xff, + 0, 0, 0, 0 } }; + struct sockaddr_storage sr, sl; + socklen_t slen = sizeof(sr); + char buf[USHRT_MAX]; + struct udphdr *uh; + int n; + + (void)events; + + n = recvfrom(s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh), + MSG_DONTWAIT, (struct sockaddr *)&sr, &slen); + if (n < 0) + return; + + uh = (struct udphdr *)buf; + + if (getsockname(s, (struct sockaddr *)&sl, &slen)) + return; + + if (sl.ss_family == AF_INET) { + struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr; + struct sockaddr_in *sl4 = (struct sockaddr_in *)&sl; + + memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr)); + uh->source = sr4->sin_port; + uh->dest = sl4->sin_port; + uh->len = htons(n + sizeof(*uh)); + + tap_ip_send(c, &a6, IPPROTO_UDP, buf, n + sizeof(*uh)); + } else if (sl.ss_family == AF_INET6) { + struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr; + struct sockaddr_in6 *sl6 = (struct sockaddr_in6 *)&sl; + + uh->source = sr6->sin6_port; + uh->dest = sl6->sin6_port; + uh->len = htons(n + sizeof(*uh)); + + tap_ip_send(c, &sr6->sin6_addr, IPPROTO_UDP, + buf, n + sizeof(*uh)); + } +} + +/** + * tcp_tap_handler() - Handle packets from tap + * @c: 
Execution context + * @af: Address family, AF_INET or AF_INET6 + * @in: Input buffer + * @len: Length, including UDP header + */ +void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len) +{ + struct udphdr *uh = (struct udphdr *)in; + int s; + + (void)c; + + if (af == AF_INET) { + struct sockaddr_in sa = { + .sin_family = AF_INET, + .sin_port = uh->dest, + }; + + if (!(s = udp4_sock_port[ntohs(uh->source)])) + return; + + fprintf(stderr, "udp from tap: using socket %i\n", s); + + sa.sin_addr = *(struct in_addr *)addr; + + sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT, + (struct sockaddr *)&sa, sizeof(sa)); + } else if (af == AF_INET6) { + struct sockaddr_in6 sa = { + .sin6_family = AF_INET6, + .sin6_port = uh->dest, + .sin6_addr = *(struct in6_addr *)addr, + }; + + if (!(s = udp6_sock_port[ntohs(uh->source)])) + return; + + fprintf(stderr, "udp from tap: using socket %i\n", s); + + sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT, + (struct sockaddr *)&sa, sizeof(sa)); + } +} + +/** + * udp_sock_init() - Create and bind listening sockets for inbound connections + * @c: Execution context + * + * Return: 0 on success, -1 on failure + */ +int udp_sock_init(struct ctx *c) +{ + in_port_t port; + int s; + + for (port = 0; port < USHRT_MAX; port++) { + if (c->v4 && + (s = sock_l4_add(c, 4, IPPROTO_UDP, htons(port))) < 0) + return -1; + udp4_sock_port[port] = s; + + if (c->v6 && + (s = sock_l4_add(c, 6, IPPROTO_UDP, htons(port))) < 0) + return -1; + udp6_sock_port[port] = s; + } + + return 0; +} @@ -0,0 +1,3 @@ +void udp_sock_handler(struct ctx *c, int s, uint32_t events); +void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len); +int udp_sock_init(struct ctx *c); @@ -1,17 +1,28 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + /* PASST - Plug A Simple Socket Transport * * util.c - Convenience helpers * + * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio <sbrivio@redhat.com> - * License: 
GPLv2 * */ #include <stdio.h> #include <stdint.h> #include <stddef.h> +#include <unistd.h> #include <linux/ipv6.h> #include <arpa/inet.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <sys/epoll.h> + +#include "passt.h" /** * csum_fold() - Fold long sum for IP and TCP checksum @@ -50,7 +61,45 @@ uint16_t csum_ip4(void *buf, size_t len) return ~csum_fold(sum); } -unsigned char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto) +/** + * csum_ipv4() - Calculate TCP checksum for IPv4 and set in place + * @iph: Packet buffer, IP header + */ +void csum_tcp4(struct iphdr *iph) +{ + struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4); + uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th; + uint32_t sum = 0; + + sum += (iph->saddr >> 16) & 0xffff; + sum += iph->saddr & 0xffff; + sum += (iph->daddr >> 16) & 0xffff; + sum += iph->daddr & 0xffff; + + sum += htons(IPPROTO_TCP); + sum += htons(tlen); + + th->check = 0; + while (tlen > 1) { + sum += *p++; + tlen -= 2; + } + + if (tlen > 0) { + sum += *p & htons(0xff00); + } + + th->check = (uint16_t)~csum_fold(sum); +} + +/** + * ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol + * @ip6h: IPv6 header + * @proto: Filled with L4 protocol number + * + * Return: pointer to L4 header, NULL if not found + */ +char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto) { int offset, len, hdrlen; struct ipv6_opt_hdr *o; @@ -79,9 +128,95 @@ unsigned char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto) offset; } else { *proto = nh; - return (unsigned char *)(ip6h + 1) + offset; + return (char *)(ip6h + 1) + offset; } } return NULL; } + +/** + * sock_l4_add() - Create and bind socket for given L4, add to epoll list + * @c: Execution context + * @v: IP protocol, 4 or 6 + * @proto: Protocol number, network order + * @port: Port, network order + * + * Return: newly created socket, -1 on error + */ 
+int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port) +{ + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = port, + .sin_addr = { .s_addr = INADDR_ANY }, + }; + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = port, + .sin6_addr = IN6ADDR_ANY_INIT, + }; + struct epoll_event ev = { 0 }; + const struct sockaddr *sa; + int fd, sl; + + if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) + return -1; /* Not implemented. */ + + fd = socket(v == 4 ? AF_INET : AF_INET6, + proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM, proto); + if (fd < 0) { + perror("L4 socket"); + return -1; + } + + if (v == 4) { + sa = (const struct sockaddr *)&addr4; + sl = sizeof(addr4); + } else { + sa = (const struct sockaddr *)&addr6; + sl = sizeof(addr6); + } + + if (bind(fd, sa, sl) < 0) { + /* We'll fail to bind to low ports if we don't have enough + * capabilities, and we'll fail to bind on already bound ports, + * this is fine. + */ + close(fd); + return 0; + } + + if (proto == IPPROTO_TCP && listen(fd, 128) < 0) { + perror("TCP socket listen"); + close(fd); + return -1; + } + + ev.events = EPOLLIN; + ev.data.fd = fd; + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) { + perror("L4 epoll_ctl"); + return -1; + } + + return fd; +} + +/** + * timespec_diff_ms() - Report difference in milliseconds between two timestamps + * @a: Minuend timestamp + * @b: Subtrahend timestamp + * + * Return: difference in milliseconds + */ +int timespec_diff_ms(struct timespec *a, struct timespec *b) +{ + if (a->tv_nsec < b->tv_nsec) { + return (b->tv_nsec - a->tv_nsec) / 1000 + + (a->tv_sec - b->tv_sec - 1) * 1000; + } + + return (a->tv_nsec - b->tv_nsec) / 1000 + + (a->tv_sec - b->tv_sec) * 1000; +} @@ -1,3 +1,6 @@ uint16_t csum_fold(uint32_t sum); uint16_t csum_ip4(void *buf, size_t len); -unsigned char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto); +void csum_tcp4(struct iphdr *iph); +char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto); 
+int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port); +int timespec_diff_ms(struct timespec *a, struct timespec *b); |