diff options
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | conf.c | 16 | ||||
-rw-r--r-- | contrib/apparmor/abstractions/passt | 1 | ||||
-rw-r--r-- | contrib/apparmor/abstractions/pasta | 5 | ||||
-rw-r--r-- | fwd.c | 2 | ||||
-rw-r--r-- | ip.h | 9 | ||||
-rw-r--r-- | log.c | 4 | ||||
-rw-r--r-- | netlink.c | 43 | ||||
-rw-r--r-- | passt.c | 5 | ||||
-rw-r--r-- | pasta.c | 7 | ||||
-rwxr-xr-x | seccomp.sh | 4 | ||||
-rw-r--r-- | tcp.c | 535 | ||||
-rw-r--r-- | tcp_splice.c | 2 | ||||
-rw-r--r-- | test/.gitignore | 1 | ||||
-rw-r--r-- | test/Makefile | 20 | ||||
-rw-r--r-- | test/pasta_options/log_to_file | 11 | ||||
-rw-r--r-- | test/pasta_podman/bats | 15 | ||||
-rw-r--r-- | udp.c | 5 | ||||
-rw-r--r-- | util.h | 26 |
20 files changed, 397 insertions, 320 deletions
@@ -308,4 +308,4 @@ cppcheck: $(SRCS) $(HEADERS) --inline-suppr \ --suppress=unusedStructMember \ $(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \ - . + $(SRCS) $(HEADERS) @@ -342,16 +342,18 @@ speeding up local connections, and usually requiring NAT. _pasta_: ### Availability * official packages for: + * ✅ [Alpine Linux](https://pkgs.alpinelinux.org/packages?name=passt) * ✅ [Arch Linux](https://archlinux.org/packages/extra/x86_64/passt/) ([aarch64](https://archlinuxarm.org/packages/aarch64/passt), [i486](https://www.archlinux32.org/packages/?q=passt)) * ✅ [CentOS Stream](https://gitlab.com/redhat/centos-stream/rpms/passt) * ✅ [Debian](https://tracker.debian.org/pkg/passt) * ✅ [Fedora](https://src.fedoraproject.org/rpms/passt) * ✅ [Gentoo](https://packages.gentoo.org/packages/net-misc/passt) + * ✅ [GNU Guix](https://packages.guix.gnu.org/packages/passt/) + * ✅ [OpenSUSE](https://build.opensuse.org/package/requests/Virtualization:containers/passt) * ✅ [Ubuntu](https://launchpad.net/ubuntu/+source/passt) * ✅ [Void Linux](https://voidlinux.org/packages/?q=passt) * unofficial packages for: * ✅ [EPEL, Mageia](https://copr.fedorainfracloud.org/coprs/sbrivio/passt/) - * 🛠 [openSUSE](https://build.opensuse.org/package/show/Virtualization:containers/passt) * ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64 static builds for other RPM-based distributions * ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64 @@ -625,14 +625,16 @@ static unsigned int conf_ip4(unsigned int ifi, int rc = nl_link_get_mac(nl_sock, ifi, mac); if (rc < 0) { char ifname[IFNAMSIZ]; - err("Couldn't discover MAC for %s: %s", + err("Couldn't discover MAC address for %s: %s", if_indextoname(ifi, ifname), strerror(-rc)); return 0; } + + if (MAC_IS_ZERO(mac)) + memcpy(mac, MAC_LAA, ETH_ALEN); } - if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr) || - MAC_IS_ZERO(mac)) + if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr)) return 0; return ifi; @@ -684,15 +686,17 @@ static unsigned int conf_ip6(unsigned int ifi, rc = nl_link_get_mac(nl_sock, ifi, mac); if (rc < 0) { char ifname[IFNAMSIZ]; - err("Couldn't discover MAC for %s: %s", + err("Couldn't discover MAC address for %s: %s", if_indextoname(ifi, ifname), strerror(-rc)); return 0; } + + if (MAC_IS_ZERO(mac)) + memcpy(mac, MAC_LAA, ETH_ALEN); } if (IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) || - IN6_IS_ADDR_UNSPECIFIED(&ip6->addr_ll) || - MAC_IS_ZERO(mac)) + IN6_IS_ADDR_UNSPECIFIED(&ip6->addr_ll)) return 0; return ifi; diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt index 6bb25e0..61ec32c 100644 --- a/contrib/apparmor/abstractions/passt +++ b/contrib/apparmor/abstractions/passt @@ -27,6 +27,7 @@ / r, # isolate_prefork(), isolation.c mount options=(rw, runbindable) /, + mount "" -> "/", mount "" -> "/tmp/", pivot_root "/tmp/" -> "/tmp/", umount "/", diff --git a/contrib/apparmor/abstractions/pasta b/contrib/apparmor/abstractions/pasta index a890391..581ad1b 100644 --- a/contrib/apparmor/abstractions/pasta +++ b/contrib/apparmor/abstractions/pasta @@ -27,8 +27,9 @@ @{PROC}/@{pid}/net/udp r, @{PROC}/@{pid}/net/udp6 r, - @{run}/user/@{uid}/netns/* r, # pasta_open_ns(), pasta.c + @{run}/user/@{uid}/** rw, # pasta_open_ns(), main() + @{PROC}/[0-9]*/ns/ r, # pasta_netns_quit_init(), @{PROC}/[0-9]*/ns/net r, # pasta_wait_for_ns(), @{PROC}/[0-9]*/ns/user r, # conf_pasta_ns() @@ -42,3 +43,5 @@ /{usr/,}bin/** Ux, /usr/bin/pasta.avx2 ix, # arch_avx2_exec(), arch.c + + ptrace r, # pasta_open_ns() @@ -38,7 +38,7 @@ * @exclude: Bitmap of ports to exclude from setting (and clear) * * #syscalls:pasta lseek - * #syscalls:pasta ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek + * #syscalls:pasta ppc64le:_llseek ppc64:_llseek arm:_llseek */ static void procfs_scan_listen(int fd, unsigned int lstate, uint8_t *map, const uint8_t *exclude) @@ -24,6 +24,11 @@ #define IN4ADDR_ANY_INIT \ { .s_addr = htonl_constant(INADDR_ANY) } +#define IN4_IS_ADDR_LINKLOCAL(a) \ + ((ntohl(((struct in_addr *)(a))->s_addr) >> 16) == 0xa9fe) +#define IN4_IS_PREFIX_LINKLOCAL(a, len) \ + ((len) >= 16 && IN4_IS_ADDR_LINKLOCAL(a)) + #define L2_BUF_IP4_INIT(proto) \ { \ .version = 4, \ @@ -40,6 +45,10 @@ #define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \ (uint32_t)htons(0xff00 | (proto))) + +#define IN6_IS_PREFIX_LINKLOCAL(a, len) \ + ((len) >= 10 && IN6_IS_ADDR_LINKLOCAL(a)) + #define L2_BUF_IP6_INIT(proto) \ { \ .priority = 0, \ @@ -220,7 +220,7 @@ void logfile_init(const char *name, const char *path, size_t size) * @fd: Log file descriptor * @now: Current timestamp * - * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek + * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek */ static void logfile_rotate_fallocate(int fd, const struct timespec *now) { @@ -257,7 +257,7 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now) * @fd: Log file descriptor * @now: Current timestamp * - * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek + * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek * #syscalls ftruncate */ static void logfile_rotate_move(int fd, const struct timespec *now) @@ -33,6 +33,7 @@ #include "util.h" #include "passt.h" #include "log.h" +#include "ip.h" #include "netlink.h" /* Netlink expects a buffer of at least 8kiB or the system page size, @@ -270,6 +271,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) seq = nl_send(s, &req, RTM_GETROUTE, NLM_F_DUMP, sizeof(req)); nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWROUTE) { struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh); + const void *dst = NULL; unsigned thisifi = 0; if (rtm->rtm_family != af) @@ -284,12 +286,23 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) rtnh = (struct rtnexthop *)RTA_DATA(rta); thisifi = rtnh->rtnh_ifindex; + } else if (rta->rta_type == RTA_DST) { + dst = RTA_DATA(rta); } } if (!thisifi) continue; /* No interface for this route */ + /* Skip routes to link-local addresses */ + if (af == AF_INET && dst && + IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len)) + continue; + + if (af == AF_INET6 && dst && + IN6_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len)) + continue; + if (rtm->rtm_dst_len == 0) { /* Default route */ ndef++; @@ -309,7 +322,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) if (defifi) { if (ndef > 1) info("Multiple default %s routes, picked first", - af == AF_INET ? "IPv4" : "IPv6"); + af_name(af)); return defifi; } @@ -318,11 +331,11 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) return anyifi; info("Multiple interfaces with %s routes, use -i to select one", - af == AF_INET ? "IPv4" : "IPv6"); + af_name(af)); } if (!nany) - info("No interfaces with %s routes", af == AF_INET ? "IPv4" : "IPv6"); + info("No interfaces with usable %s routes", af_name(af)); return 0; } @@ -546,12 +559,19 @@ int nl_route_dup(int s_src, unsigned int ifi_src, for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) { + /* RTA_OIF and RTA_MULTIPATH attributes carry the + * identifier of a host interface. Change them to match + * the corresponding identifier in the target namespace. + */ if (rta->rta_type == RTA_OIF) { - /* The host obviously list's the host interface - * id here, we need to change it to the - * namespace's interface id - */ *(unsigned int *)RTA_DATA(rta) = ifi_dst; + } else if (rta->rta_type == RTA_MULTIPATH) { + struct rtnexthop *rtnh; + + for (rtnh = (struct rtnexthop *)RTA_DATA(rta); + RTNH_OK(rtnh, RTA_PAYLOAD(rta)); + rtnh = RTNH_NEXT(rtnh)) + rtnh->rtnh_ifindex = ifi_dst; } else if (rta->rta_type == RTA_PREFSRC) { /* Host routes might include a preferred source * address, which must be one of the host's @@ -648,7 +668,8 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af, for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) { - if (rta->rta_type != IFA_ADDRESS) + if ((af == AF_INET && rta->rta_type != IFA_LOCAL) || + (af == AF_INET6 && rta->rta_type != IFA_ADDRESS)) continue; if (af == AF_INET && ifa->ifa_prefixlen > prefix_max) { @@ -783,6 +804,8 @@ int nl_addr_dup(int s_src, unsigned int ifi_src, continue; ifa->ifa_index = ifi_dst; + /* Same as nl_addr_set(), but here it's more than a default */ + ifa->ifa_flags |= IFA_F_NODAD; for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) { @@ -790,6 +813,10 @@ int nl_addr_dup(int s_src, unsigned int ifi_src, if (rta->rta_type == IFA_LABEL || rta->rta_type == IFA_CACHEINFO) rta->rta_type = IFA_UNSPEC; + + /* If 32-bit flags are used, add IFA_F_NODAD there */ + if (rta->rta_type == IFA_FLAGS) + *(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD; } rc = nl_do(s_dst, nh, RTM_NEWADDR, @@ -192,10 +192,9 @@ void exit_handler(int signal) * #syscalls read write writev * #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close * #syscalls recvfrom sendto shutdown - * #syscalls armv6l:recv armv7l:recv ppc64le:recv - * #syscalls armv6l:send armv7l:send ppc64le:send + * #syscalls arm:recv ppc64le:recv arm:send ppc64le:send * #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait - * #syscalls clock_gettime armv6l:clock_gettime64 armv7l:clock_gettime64 + * #syscalls clock_gettime arm:clock_gettime64 */ int main(int argc, char **argv) { @@ -12,8 +12,8 @@ * Author: Stefano Brivio <sbrivio@redhat.com> * * #syscalls:pasta clone waitid exit exit_group rt_sigprocmask - * #syscalls:pasta rt_sigreturn|sigreturn armv6l:sigreturn armv7l:sigreturn - * #syscalls:pasta ppc64:sigreturn s390x:sigreturn + * #syscalls:pasta rt_sigreturn|sigreturn + * #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn */ #include <sched.h> @@ -211,12 +211,13 @@ static int pasta_spawn_cmd(void *arg) void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, int argc, char *argv[]) { + char ns_fn_stack[NS_FN_STACK_SIZE] + __attribute__ ((aligned(__alignof__(max_align_t)))); struct pasta_spawn_cmd_arg arg = { .exe = argv[0], .argv = argv, }; char uidmap[BUFSIZ], gidmap[BUFSIZ]; - char ns_fn_stack[NS_FN_STACK_SIZE]; char *sh_argv[] = { NULL, NULL }; char sh_arg0[PATH_MAX + 1]; sigset_t set; @@ -29,11 +29,11 @@ HEADER="/* This file was automatically generated by $(basename ${0}) */ # Prefix for each profile: check that 'arch' in seccomp_data is matching PRE=' struct sock_filter filter_@PROFILE@[] = { - /* cppcheck-suppress badBitmaskCheck */ + /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))), BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@), - /* cppcheck-suppress badBitmaskCheck */ + /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))), @@ -318,39 +318,14 @@ /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 - -struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[26]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct iphdr iph; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[14]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct ipv6hdr ip6h; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) +#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ + sizeof(struct tcphdr) - \ + sizeof(struct iphdr), \ + sizeof(uint32_t)) +#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ + sizeof(struct tcphdr) - \ + sizeof(struct ipv6hdr), \ + sizeof(uint32_t)) #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND @@ -445,133 +420,107 @@ struct tcp_buf_seq_update { }; /* Static buffers */ - /** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) +} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ #else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif -tcp4_l2_buf[TCP_FRAMES_MEM]; - -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_buf_used; /** - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @data: Storage for TCP payload + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options */ -struct tcp6_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th; /* 72 60 */ - uint8_t data[MSS6]; /* 92 80 */ - /* 65536 65532 */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) +} __attribute__ ((packed, aligned(32))); #else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif -tcp6_l2_buf[TCP_FRAMES_MEM]; -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; +/* Ethernet header for IPv4 frames */ +static struct ethhdr tcp4_eth_src; + +static uint32_t tcp4_payload_vnet_len[TCP_FRAMES_MEM]; +/* IPv4 headers */ +static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; +/* TCP segments with payload for IPv4 frames */ +static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; + +static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); + +static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp4_payload_used; + +static uint32_t tcp4_flags_vnet_len[TCP_FRAMES_MEM]; +/* IPv4 headers for TCP segment without payload */ +static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; +/* TCP segments without payload for IPv4 frames */ +static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp4_flags_used; + +/* Ethernet header for IPv6 frames */ +static struct ethhdr tcp6_eth_src; + +static uint32_t tcp6_payload_vnet_len[TCP_FRAMES_MEM]; +/* IPv6 headers */ +static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv6 frames */ +static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; -static unsigned int tcp6_l2_buf_used; +static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); + +static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp6_payload_used; + +static uint32_t tcp6_flags_vnet_len[TCP_FRAMES_MEM]; +/* IPv6 headers for TCP segment without payload */ +static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; +/* TCP segment without payload for IPv6 frames */ +static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp6_flags_used; /* recvmsg()/sendmsg() data for tap */ static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; +/* + * enum tcp_iov_parts - I/O vector parts for one TCP frame + * @TCP_IOV_VLEN virtio net header + * @TCP_IOV_ETH Ethernet header + * @TCP_IOV_IP IP (v4/v6) header + * @TCP_IOV_PAYLOAD IP payload (TCP header + data) + * @TCP_NUM_IOVS the number of entries in the iovec array + */ +enum tcp_iov_parts { + TCP_IOV_VLEN = 0, + TCP_IOV_ETH = 1, + TCP_IOV_IP = 2, + TCP_IOV_PAYLOAD = 3, + TCP_NUM_IOVS +}; + +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; -/** - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp4_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_flags_buf_used; - -/** - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp6_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_flags_buf_used; - #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ @@ -967,25 +916,14 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) } /** - * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses + * tcp_update_l2_buf() - Update Ethernet header buffers with addresses * @eth_d: Ethernet destination address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged */ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { - int i; - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; - struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; - struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; - struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; - - eth_update_mac(&b4->taph.eh, eth_d, eth_s); - eth_update_mac(&b6->taph.eh, eth_d, eth_s); - eth_update_mac(&b4f->taph.eh, eth_d, eth_s); - eth_update_mac(&b6f->taph.eh, eth_d, eth_s); - } + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); } /** @@ -998,26 +936,45 @@ static void tcp_sock4_iov_init(const struct ctx *c) struct iovec *iov; int i; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { - tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = iph, - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + + for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { + tcp4_payload_ip[i] = iph; + tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_payload[i].th.ack = 1; } - for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { - tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) - }; + for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { + tcp4_flags_ip[i] = iph; + tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_flags[i].th.ack = 1; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp4_l2_iov[i]; + + iov[TCP_IOV_VLEN].iov_base = &tcp4_payload_vnet_len[i]; + iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 : + sizeof(tcp4_payload_vnet_len[i]); + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp4_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; } - for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp4_l2_flags_iov[i]; - for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph); + iov[TCP_IOV_VLEN].iov_base = &tcp4_flags_vnet_len[i]; + iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 : + sizeof(tcp4_flags_vnet_len[i]); + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp4_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; + } } /** @@ -1026,29 +983,49 @@ static void tcp_sock4_iov_init(const struct ctx *c) */ static void tcp_sock6_iov_init(const struct ctx *c) { + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); struct iovec *iov; int i; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { - tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + + for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { + tcp6_payload_ip[i] = ip6; + tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_payload[i].th.ack = 1; } - for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { - tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) - }; + for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { + tcp6_flags_ip[i] = ip6; + tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_flags[i].th .ack = 1; } - for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp6_l2_iov[i]; + + iov[TCP_IOV_VLEN].iov_base = &tcp6_payload_vnet_len[i]; + iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 : + sizeof(tcp6_payload_vnet_len[i]); + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp6_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; + } - for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp6_l2_flags_iov[i]; + + iov[TCP_IOV_VLEN].iov_base = &tcp6_flags_vnet_len[i]; + iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 : + sizeof(tcp6_flags_vnet_len[i]); + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp6_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; + } } /** @@ -1284,36 +1261,40 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); } while (0) /** - * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags) + * tcp_flags_flush() - Send out buffers for segments with no data (flags) * @c: Execution context */ -static void tcp_l2_flags_buf_flush(const struct ctx *c) +static void tcp_flags_flush(const struct ctx *c) { - tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used); - tcp6_l2_flags_buf_used = 0; + tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp6_flags_used); + tcp6_flags_used = 0; - tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used); - tcp4_l2_flags_buf_used = 0; + tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp4_flags_used); + tcp4_flags_used = 0; } /** - * tcp_l2_data_buf_flush() - Send out buffers for segments with data + * tcp_payload_flush() - Send out buffers for segments with data * @c: Execution context */ -static void tcp_l2_data_buf_flush(const struct ctx *c) +static void tcp_payload_flush(const struct ctx *c) { unsigned i; size_t m; - m = tap_send_frames(c, tcp6_l2_iov, 1, tcp6_l2_buf_used); + m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, + tcp6_payload_used); for (i = 0; i < m; i++) - *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; - tcp6_l2_buf_used = 0; + *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; + tcp6_payload_used = 0; - m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used); + m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, + tcp4_payload_used); for (i = 0; i < m; i++) - *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; - tcp4_l2_buf_used = 0; + *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; + tcp4_payload_used = 0; } /** @@ -1323,8 +1304,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ void tcp_defer_handler(struct ctx *c) { - tcp_l2_flags_buf_flush(c); - tcp_l2_data_buf_flush(c); + tcp_flags_flush(c); + tcp_payload_flush(c); } /** @@ -1433,35 +1414,31 @@ static size_t tcp_fill_headers6(const struct ctx *c, * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @p: Pointer to any type of TCP pre-cooked buffer + * @iov: Pointer to an array of iovec of TCP pre-cooked buffers * @plen: Payload length (including TCP header options) * @check: Checksum, if already known * @seq: Sequence number for this segment * - * Return: frame length including L2 headers, host order + * Return: IP payload length, host order */ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, const struct tcp_tap_conn *conn, - void *p, size_t plen, + struct iovec *iov, size_t plen, const uint16_t *check, uint32_t seq) { const struct in_addr *a4 = inany_v4(&conn->faddr); size_t ip_len, tlen; if (a4) { - struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; - - ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen, + ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, check, seq); - - tlen = tap_frame_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct iphdr); } else { - struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; - - ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen, + ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, seq); - - tlen = tap_frame_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct ipv6hdr); } return tlen; @@ -1593,18 +1570,16 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, */ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { - uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; - uint32_t prev_wnd_to_tap = conn->wnd_to_tap; - struct tcp4_l2_flags_buf_t *b4 = NULL; - struct tcp6_l2_flags_buf_t *b6 = NULL; + struct tcp_flags_t *payload; struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; + uint32_t vnet_len; size_t optlen = 0; - struct iovec *iov; struct tcphdr *th; + struct iovec *iov; + size_t ip_len; char *data; - void *p; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) @@ -1627,19 +1602,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) return 0; if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; - p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; - th = &b4->th; - - /* gcc 11.2 would complain on data = (char *)(th + 1); */ - data = b4->opts; + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr); } else { - iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; - p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; - th = &b6->th; - data = b6->opts; + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr); } + payload = iov[TCP_IOV_PAYLOAD].iov_base; + th = &payload->th; + data = payload->opts; + if (flags & SYN) { int mss; @@ -1675,9 +1648,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) *data++ = OPT_WS_LEN; *data++ = conn->ws_to_tap; } else if (!(flags & RST)) { - if (conn->seq_ack_to_tap != prev_ack_to_tap || - !prev_wnd_to_tap) - flags |= ACK; + flags |= ACK; } th->doff = (sizeof(*th) + optlen) / 4; @@ -1687,8 +1658,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, - NULL, conn->seq_to_tap); + ip_len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL, + conn->seq_to_tap); + iov[TCP_IOV_PAYLOAD].iov_len = ip_len; + + *(uint32_t *)iov[TCP_IOV_VLEN].iov_base = htonl(vnet_len + ip_len); if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) @@ -1704,24 +1678,27 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; - if (CONN_V4(conn)) { - if (flags & DUP_ACK) { - memcpy(b4 + 1, b4, sizeof(*b4)); - (iov + 1)->iov_len = iov->iov_len; - tcp4_l2_flags_buf_used++; - } + if (flags & DUP_ACK) { + struct iovec *dup_iov; + int i; - if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); - } else { - if (flags & DUP_ACK) { - memcpy(b6 + 1, b6, sizeof(*b6)); - (iov + 1)->iov_len = iov->iov_len; - tcp6_l2_flags_buf_used++; - } + if (CONN_V4(conn)) + dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + else + dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); + for (i = 0; i < TCP_NUM_IOVS; i++) + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; + } + + if (CONN_V4(conn)) { + if (tcp4_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); + } else { + if (tcp6_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); } return 0; @@ -2168,30 +2145,42 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, { uint32_t *seq_update = &conn->seq_to_tap; struct iovec *iov; + size_t ip_len; + uint32_t vnet_len; if (CONN_V4(conn)) { - struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; - const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; + struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; + const uint16_t *check = NULL; - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; + if (no_csum) { + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; + } - iov = tcp4_l2_iov + tcp4_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - check, seq); - if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) - tcp_l2_data_buf_flush(c); + tcp4_seq_update[tcp4_payload_used].seq = seq_update; + tcp4_seq_update[tcp4_payload_used].len = plen; + + iov = tcp4_l2_iov[tcp4_payload_used++]; + ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check, + seq); + iov[TCP_IOV_PAYLOAD].iov_len = ip_len; + vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr) + + ip_len; + *(uint32_t *)iov[TCP_IOV_VLEN].iov_base = htonl(vnet_len); + if (tcp4_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); } else if (CONN_V6(conn)) { - struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; - - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; + tcp6_seq_update[tcp6_payload_used].seq = seq_update; + tcp6_seq_update[tcp6_payload_used].len = plen; - iov = tcp6_l2_iov + tcp6_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - NULL, seq); - if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) - tcp_l2_data_buf_flush(c); + iov = tcp6_l2_iov[tcp6_payload_used++]; + ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq); + iov[TCP_IOV_PAYLOAD].iov_len = ip_len; + vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + + ip_len; + *(uint32_t *)iov[TCP_IOV_VLEN].iov_base = htonl(vnet_len); + if (tcp6_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); } } @@ -2246,19 +2235,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) iov_sock[0].iov_base = tcp_buf_discard; iov_sock[0].iov_len = already_sent; - if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || - (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { - tcp_l2_data_buf_flush(c); + if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || + (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { + tcp_payload_flush(c); /* Silence Coverity CWE-125 false positive */ - tcp4_l2_buf_used = tcp6_l2_buf_used = 0; + tcp4_payload_used = tcp6_payload_used = 0; } for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) - iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; + iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; else - iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; + iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; iov->iov_len = mss; } if (iov_rem) @@ -2303,7 +2292,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) plen = mss; seq = conn->seq_to_tap; for (i = 0; i < send_bufs; i++) { - int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; + int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; if (i == send_bufs - 1) plen = last_len; diff --git a/tcp_splice.c b/tcp_splice.c index d066112..42b7be0 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -28,7 +28,7 @@ * - FIN_SENT_0: FIN (write shutdown) sent to accepted socket * - FIN_SENT_1: FIN (write shutdown) sent to target socket * - * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64 + * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 */ #include <sched.h> diff --git a/test/.gitignore b/test/.gitignore index 4837402..6dd4790 100644 --- a/test/.gitignore +++ b/test/.gitignore @@ -1,5 +1,6 @@ test_logs/ mbuto/ +podman/ *.img QEMU_EFI.fd *.qcow2 diff --git a/test/Makefile b/test/Makefile index 7b00bef..35a3b55 100644 --- a/test/Makefile +++ b/test/Makefile @@ -52,10 +52,10 @@ UBUNTU_NEW_IMGS = xenial-server-cloudimg-powerpc-disk1.img \ jammy-server-cloudimg-s390x.img UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS) -DOWNLOAD_ASSETS = mbuto \ +DOWNLOAD_ASSETS = mbuto podman \ $(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS) TESTDATA_ASSETS = small.bin big.bin medium.bin -LOCAL_ASSETS = mbuto.img mbuto.mem.img QEMU_EFI.fd \ +LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \ $(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \ $(UBUNTU_NEW_IMGS:%=prepared-%) \ nstool guest-key guest-key.pub \ @@ -67,13 +67,27 @@ CFLAGS = -Wall -Werror -Wextra -pedantic -std=c99 assets: $(ASSETS) +.PHONY: pull-% +pull-%: % + git -C $* pull + mbuto: git clone git://mbuto.sh/mbuto +mbuto/mbuto: pull-mbuto + +podman: + git clone https://github.com/containers/podman.git + +# To succesfully build podman, you will need gpgme and systemd +# development packages +podman/bin/podman: pull-podman + $(MAKE) -C podman + guest-key guest-key.pub: ssh-keygen -f guest-key -N '' -mbuto.img: passt.mbuto mbuto guest-key.pub $(TESTDATA_ASSETS) +mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS) ./mbuto/mbuto -p ./$< -c lz4 -f $@ mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2 diff --git a/test/pasta_options/log_to_file b/test/pasta_options/log_to_file index fcdd553..fe50e50 100644 --- a/test/pasta_options/log_to_file +++ b/test/pasta_options/log_to_file @@ -33,16 +33,13 @@ test Log creation set PORTS -t 10001,10002 -u 10001,10002 set LOG_FILE __STATEDIR__/pasta.log -passt ./pasta -l __LOG_FILE__ -passtb exit -sleep 1 +passt ./pasta -l __LOG_FILE__ -- /bin/true check [ -s __LOG_FILE__ ] test Log truncated on creation -passt ./pasta -l __LOG_FILE__ -passtb exit -sleep 1 -check [ $(cat __LOG_FILE__ | wc -l) -eq 1 ] +passt ./pasta -l __LOG_FILE__ -- /bin/true & wait +pout PID2 echo $! +check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$' test Maximum log size passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -P 10001 -C 10002 -6; done' diff --git a/test/pasta_podman/bats b/test/pasta_podman/bats index 21446f0..6b1c575 100644 --- a/test/pasta_podman/bats +++ b/test/pasta_podman/bats @@ -11,11 +11,16 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -htools git make go bats catatonit ip jq socat +htools git make go bats ip jq socat ./test/podman/bin/podman + +set PODMAN test/podman/bin/podman +hout WD pwd + +test Podman pasta path + +hout PASTA_BIN CONTAINERS_HELPER_BINARY_DIR="__WD__" __PODMAN__ info --format "{{.Host.Pasta.Executable}}" +check [ "__PASTA_BIN__" = "__WD__/pasta" ] test Podman system test with bats -host git -C __STATEDIR__ clone https://github.com/containers/podman.git -host make -C __STATEDIR__/podman -hout WD pwd -host PODMAN="__STATEDIR__/podman/bin/podman" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats __STATEDIR__/podman/test/system/505-networking-pasta.bats +host PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats test/podman/test/system/505-networking-pasta.bats @@ -829,6 +829,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, * and destination, so we can just take those from the first message. */ src = ntohs(uh->source); + src += c->udp.fwd_in.rdelta[src]; dst = ntohs(uh->dest); if (af == AF_INET) { @@ -1005,7 +1006,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, ifname, port, uref.u32); - udp_tap_map[V4][uref.port].sock = s < 0 ? -1 : s; + udp_tap_map[V4][port].sock = s < 0 ? -1 : s; udp_splice_init[V4][port].sock = s < 0 ? -1 : s; } else { r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, @@ -1022,7 +1023,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, ifname, port, uref.u32); - udp_tap_map[V6][uref.port].sock = s < 0 ? -1 : s; + udp_tap_map[V6][port].sock = s < 0 ? -1 : s; udp_splice_init[V6][port].sock = s < 0 ? -1 : s; } else { r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, @@ -9,6 +9,7 @@ #include <stdlib.h> #include <stdarg.h> #include <stdbool.h> +#include <stddef.h> #include <string.h> #include <signal.h> @@ -31,6 +32,9 @@ #ifndef ETH_MIN_MTU #define ETH_MIN_MTU 68 #endif +#ifndef IP_MAX_MTU +#define IP_MAX_MTU USHRT_MAX +#endif #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) @@ -90,6 +94,7 @@ #define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN) #define MAC_ZERO ((uint8_t [ETH_ALEN]){ 0 }) +#define MAC_LAA ((uint8_t [ETH_ALEN]){ BIT(1), 0, 0, 0, 0, 0 }) #define MAC_IS_ZERO(addr) (!memcmp((addr), MAC_ZERO, ETH_ALEN)) #ifndef __bswap_constant_16 @@ -116,7 +121,8 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); #define NS_CALL(fn, arg) \ do { \ - char ns_fn_stack[NS_FN_STACK_SIZE]; \ + char ns_fn_stack[NS_FN_STACK_SIZE] \ + __attribute__ ((aligned(__alignof__(max_align_t)))); \ \ do_clone((fn), ns_fn_stack, sizeof(ns_fn_stack), \ CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,\ @@ -157,6 +163,24 @@ int write_file(const char *path, const char *buf); int write_remainder(int fd, const struct iovec *iov, int iovcnt, size_t skip); /** + * af_name() - Return name of an address family + * @af: Address/protocol family (AF_INET or AF_INET6) + * + * Returns: Name of the protocol family as a string + */ +static inline const char *af_name(sa_family_t af) +{ + switch (af) { + case AF_INET: + return "IPv4"; + case AF_INET6: + return "IPv6"; + default: + return "<unknown address family>"; + } +} + +/** * mod_sub() - Modular arithmetic subtraction * @a: Minued, unsigned value < @m * @b: Subtrahend, unsigned value < @m |