about git code bugs lists chat
diff options
context:
space:
mode:
-rw-r--r-- Makefile 2
-rw-r--r-- README.md 4
-rw-r--r-- conf.c 16
-rw-r--r-- contrib/apparmor/abstractions/passt 1
-rw-r--r-- contrib/apparmor/abstractions/pasta 5
-rw-r--r-- fwd.c 2
-rw-r--r-- ip.h 9
-rw-r--r-- log.c 4
-rw-r--r-- netlink.c 43
-rw-r--r-- passt.c 5
-rw-r--r-- pasta.c 7
-rwxr-xr-x seccomp.sh 4
-rw-r--r-- tcp.c 535
-rw-r--r-- tcp_splice.c 2
-rw-r--r-- test/.gitignore 1
-rw-r--r-- test/Makefile 20
-rw-r--r-- test/pasta_options/log_to_file 11
-rw-r--r-- test/pasta_podman/bats 15
-rw-r--r-- udp.c 5
-rw-r--r-- util.h 26
20 files changed, 397 insertions, 320 deletions
diff --git a/Makefile b/Makefile
index 8428052..c1e1f06 100644
--- a/Makefile
+++ b/Makefile
@@ -308,4 +308,4 @@ cppcheck: $(SRCS) $(HEADERS)
--inline-suppr \
--suppress=unusedStructMember \
$(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \
- .
+ $(SRCS) $(HEADERS)
diff --git a/README.md b/README.md
index 9154425..aa2afac 100644
--- a/README.md
+++ b/README.md
@@ -342,16 +342,18 @@ speeding up local connections, and usually requiring NAT. _pasta_:
### Availability
* official packages for:
+ * ✅ [Alpine Linux](https://pkgs.alpinelinux.org/packages?name=passt)
* ✅ [Arch Linux](https://archlinux.org/packages/extra/x86_64/passt/) ([aarch64](https://archlinuxarm.org/packages/aarch64/passt), [i486](https://www.archlinux32.org/packages/?q=passt))
* ✅ [CentOS Stream](https://gitlab.com/redhat/centos-stream/rpms/passt)
* ✅ [Debian](https://tracker.debian.org/pkg/passt)
* ✅ [Fedora](https://src.fedoraproject.org/rpms/passt)
* ✅ [Gentoo](https://packages.gentoo.org/packages/net-misc/passt)
+ * ✅ [GNU Guix](https://packages.guix.gnu.org/packages/passt/)
+ * ✅ [openSUSE](https://build.opensuse.org/package/requests/Virtualization:containers/passt)
* ✅ [Ubuntu](https://launchpad.net/ubuntu/+source/passt)
* ✅ [Void Linux](https://voidlinux.org/packages/?q=passt)
* unofficial packages for:
* ✅ [EPEL, Mageia](https://copr.fedorainfracloud.org/coprs/sbrivio/passt/)
- * 🛠 [openSUSE](https://build.opensuse.org/package/show/Virtualization:containers/passt)
* ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64
static builds for other RPM-based distributions
* ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64
diff --git a/conf.c b/conf.c
index 9e0318a..3f30725 100644
--- a/conf.c
+++ b/conf.c
@@ -625,14 +625,16 @@ static unsigned int conf_ip4(unsigned int ifi,
int rc = nl_link_get_mac(nl_sock, ifi, mac);
if (rc < 0) {
char ifname[IFNAMSIZ];
- err("Couldn't discover MAC for %s: %s",
+ err("Couldn't discover MAC address for %s: %s",
if_indextoname(ifi, ifname), strerror(-rc));
return 0;
}
+
+ if (MAC_IS_ZERO(mac))
+ memcpy(mac, MAC_LAA, ETH_ALEN);
}
- if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr) ||
- MAC_IS_ZERO(mac))
+ if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr))
return 0;
return ifi;
@@ -684,15 +686,17 @@ static unsigned int conf_ip6(unsigned int ifi,
rc = nl_link_get_mac(nl_sock, ifi, mac);
if (rc < 0) {
char ifname[IFNAMSIZ];
- err("Couldn't discover MAC for %s: %s",
+ err("Couldn't discover MAC address for %s: %s",
if_indextoname(ifi, ifname), strerror(-rc));
return 0;
}
+
+ if (MAC_IS_ZERO(mac))
+ memcpy(mac, MAC_LAA, ETH_ALEN);
}
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ||
- IN6_IS_ADDR_UNSPECIFIED(&ip6->addr_ll) ||
- MAC_IS_ZERO(mac))
+ IN6_IS_ADDR_UNSPECIFIED(&ip6->addr_ll))
return 0;
return ifi;
diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt
index 6bb25e0..61ec32c 100644
--- a/contrib/apparmor/abstractions/passt
+++ b/contrib/apparmor/abstractions/passt
@@ -27,6 +27,7 @@
/ r, # isolate_prefork(), isolation.c
mount options=(rw, runbindable) /,
+ mount "" -> "/",
mount "" -> "/tmp/",
pivot_root "/tmp/" -> "/tmp/",
umount "/",
diff --git a/contrib/apparmor/abstractions/pasta b/contrib/apparmor/abstractions/pasta
index a890391..581ad1b 100644
--- a/contrib/apparmor/abstractions/pasta
+++ b/contrib/apparmor/abstractions/pasta
@@ -27,8 +27,9 @@
@{PROC}/@{pid}/net/udp r,
@{PROC}/@{pid}/net/udp6 r,
- @{run}/user/@{uid}/netns/* r, # pasta_open_ns(), pasta.c
+ @{run}/user/@{uid}/** rw, # pasta_open_ns(), main()
+ @{PROC}/[0-9]*/ns/ r, # pasta_netns_quit_init(),
@{PROC}/[0-9]*/ns/net r, # pasta_wait_for_ns(),
@{PROC}/[0-9]*/ns/user r, # conf_pasta_ns()
@@ -42,3 +43,5 @@
/{usr/,}bin/** Ux,
/usr/bin/pasta.avx2 ix, # arch_avx2_exec(), arch.c
+
+ ptrace r, # pasta_open_ns()
diff --git a/fwd.c b/fwd.c
index a235d13..b3d5a37 100644
--- a/fwd.c
+++ b/fwd.c
@@ -38,7 +38,7 @@
* @exclude: Bitmap of ports to exclude from setting (and clear)
*
* #syscalls:pasta lseek
- * #syscalls:pasta ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
+ * #syscalls:pasta ppc64le:_llseek ppc64:_llseek arm:_llseek
*/
static void procfs_scan_listen(int fd, unsigned int lstate,
uint8_t *map, const uint8_t *exclude)
diff --git a/ip.h b/ip.h
index b9aedf6..b8d4a5b 100644
--- a/ip.h
+++ b/ip.h
@@ -24,6 +24,11 @@
#define IN4ADDR_ANY_INIT \
{ .s_addr = htonl_constant(INADDR_ANY) }
+#define IN4_IS_ADDR_LINKLOCAL(a) \
+ ((ntohl(((struct in_addr *)(a))->s_addr) >> 16) == 0xa9fe)
+#define IN4_IS_PREFIX_LINKLOCAL(a, len) \
+ ((len) >= 16 && IN4_IS_ADDR_LINKLOCAL(a))
+
#define L2_BUF_IP4_INIT(proto) \
{ \
.version = 4, \
@@ -40,6 +45,10 @@
#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \
(uint32_t)htons(0xff00 | (proto)))
+
+#define IN6_IS_PREFIX_LINKLOCAL(a, len) \
+ ((len) >= 10 && IN6_IS_ADDR_LINKLOCAL(a))
+
#define L2_BUF_IP6_INIT(proto) \
{ \
.priority = 0, \
diff --git a/log.c b/log.c
index bdd31b4..e3630c3 100644
--- a/log.c
+++ b/log.c
@@ -220,7 +220,7 @@ void logfile_init(const char *name, const char *path, size_t size)
* @fd: Log file descriptor
* @now: Current timestamp
*
- * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
+ * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek
*/
static void logfile_rotate_fallocate(int fd, const struct timespec *now)
{
@@ -257,7 +257,7 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now)
* @fd: Log file descriptor
* @now: Current timestamp
*
- * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
+ * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek
* #syscalls ftruncate
*/
static void logfile_rotate_move(int fd, const struct timespec *now)
diff --git a/netlink.c b/netlink.c
index 9b3dba2..b3897e6 100644
--- a/netlink.c
+++ b/netlink.c
@@ -33,6 +33,7 @@
#include "util.h"
#include "passt.h"
#include "log.h"
+#include "ip.h"
#include "netlink.h"
/* Netlink expects a buffer of at least 8kiB or the system page size,
@@ -270,6 +271,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
seq = nl_send(s, &req, RTM_GETROUTE, NLM_F_DUMP, sizeof(req));
nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWROUTE) {
struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh);
+ const void *dst = NULL;
unsigned thisifi = 0;
if (rtm->rtm_family != af)
@@ -284,12 +286,23 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
rtnh = (struct rtnexthop *)RTA_DATA(rta);
thisifi = rtnh->rtnh_ifindex;
+ } else if (rta->rta_type == RTA_DST) {
+ dst = RTA_DATA(rta);
}
}
if (!thisifi)
continue; /* No interface for this route */
+ /* Skip routes to link-local addresses */
+ if (af == AF_INET && dst &&
+ IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))
+ continue;
+
+ if (af == AF_INET6 && dst &&
+ IN6_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len))
+ continue;
+
if (rtm->rtm_dst_len == 0) {
/* Default route */
ndef++;
@@ -309,7 +322,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
if (defifi) {
if (ndef > 1)
info("Multiple default %s routes, picked first",
- af == AF_INET ? "IPv4" : "IPv6");
+ af_name(af));
return defifi;
}
@@ -318,11 +331,11 @@ unsigned int nl_get_ext_if(int s, sa_family_t af)
return anyifi;
info("Multiple interfaces with %s routes, use -i to select one",
- af == AF_INET ? "IPv4" : "IPv6");
+ af_name(af));
}
if (!nany)
- info("No interfaces with %s routes", af == AF_INET ? "IPv4" : "IPv6");
+ info("No interfaces with usable %s routes", af_name(af));
return 0;
}
@@ -546,12 +559,19 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
rta = RTA_NEXT(rta, na)) {
+ /* RTA_OIF and RTA_MULTIPATH attributes carry the
+ * identifier of a host interface. Change them to match
+ * the corresponding identifier in the target namespace.
+ */
if (rta->rta_type == RTA_OIF) {
- /* The host obviously list's the host interface
- * id here, we need to change it to the
- * namespace's interface id
- */
*(unsigned int *)RTA_DATA(rta) = ifi_dst;
+ } else if (rta->rta_type == RTA_MULTIPATH) {
+ struct rtnexthop *rtnh;
+
+ for (rtnh = (struct rtnexthop *)RTA_DATA(rta);
+ RTNH_OK(rtnh, RTA_PAYLOAD(rta));
+ rtnh = RTNH_NEXT(rtnh))
+ rtnh->rtnh_ifindex = ifi_dst;
} else if (rta->rta_type == RTA_PREFSRC) {
/* Host routes might include a preferred source
* address, which must be one of the host's
@@ -648,7 +668,8 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
rta = RTA_NEXT(rta, na)) {
- if (rta->rta_type != IFA_ADDRESS)
+ if ((af == AF_INET && rta->rta_type != IFA_LOCAL) ||
+ (af == AF_INET6 && rta->rta_type != IFA_ADDRESS))
continue;
if (af == AF_INET && ifa->ifa_prefixlen > prefix_max) {
@@ -783,6 +804,8 @@ int nl_addr_dup(int s_src, unsigned int ifi_src,
continue;
ifa->ifa_index = ifi_dst;
+ /* Same as nl_addr_set(), but here it's more than a default */
+ ifa->ifa_flags |= IFA_F_NODAD;
for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
rta = RTA_NEXT(rta, na)) {
@@ -790,6 +813,10 @@ int nl_addr_dup(int s_src, unsigned int ifi_src,
if (rta->rta_type == IFA_LABEL ||
rta->rta_type == IFA_CACHEINFO)
rta->rta_type = IFA_UNSPEC;
+
+ /* If 32-bit flags are used, add IFA_F_NODAD there */
+ if (rta->rta_type == IFA_FLAGS)
+ *(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD;
}
rc = nl_do(s_dst, nh, RTM_NEWADDR,
diff --git a/passt.c b/passt.c
index 59ab501..e93b6be 100644
--- a/passt.c
+++ b/passt.c
@@ -192,10 +192,9 @@ void exit_handler(int signal)
* #syscalls read write writev
* #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close
* #syscalls recvfrom sendto shutdown
- * #syscalls armv6l:recv armv7l:recv ppc64le:recv
- * #syscalls armv6l:send armv7l:send ppc64le:send
+ * #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
* #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
- * #syscalls clock_gettime armv6l:clock_gettime64 armv7l:clock_gettime64
+ * #syscalls clock_gettime arm:clock_gettime64
*/
int main(int argc, char **argv)
{
diff --git a/pasta.c b/pasta.c
index 61feaa9..31e1e00 100644
--- a/pasta.c
+++ b/pasta.c
@@ -12,8 +12,8 @@
* Author: Stefano Brivio <sbrivio@redhat.com>
*
* #syscalls:pasta clone waitid exit exit_group rt_sigprocmask
- * #syscalls:pasta rt_sigreturn|sigreturn armv6l:sigreturn armv7l:sigreturn
- * #syscalls:pasta ppc64:sigreturn s390x:sigreturn
+ * #syscalls:pasta rt_sigreturn|sigreturn
+ * #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn
*/
#include <sched.h>
@@ -211,12 +211,13 @@ static int pasta_spawn_cmd(void *arg)
void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
int argc, char *argv[])
{
+ char ns_fn_stack[NS_FN_STACK_SIZE]
+ __attribute__ ((aligned(__alignof__(max_align_t))));
struct pasta_spawn_cmd_arg arg = {
.exe = argv[0],
.argv = argv,
};
char uidmap[BUFSIZ], gidmap[BUFSIZ];
- char ns_fn_stack[NS_FN_STACK_SIZE];
char *sh_argv[] = { NULL, NULL };
char sh_arg0[PATH_MAX + 1];
sigset_t set;
diff --git a/seccomp.sh b/seccomp.sh
index e1224e0..052e1c8 100755
--- a/seccomp.sh
+++ b/seccomp.sh
@@ -29,11 +29,11 @@ HEADER="/* This file was automatically generated by $(basename ${0}) */
# Prefix for each profile: check that 'arch' in seccomp_data is matching
PRE='
struct sock_filter filter_@PROFILE@[] = {
- /* cppcheck-suppress badBitmaskCheck */
+ /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
(offsetof(struct seccomp_data, arch))),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@),
- /* cppcheck-suppress badBitmaskCheck */
+ /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
(offsetof(struct seccomp_data, nr))),
diff --git a/tcp.c b/tcp.c
index b65ddeb..24f99cd 100644
--- a/tcp.c
+++ b/tcp.c
@@ -318,39 +318,14 @@
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
-
-struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
-#ifdef __AVX2__
- uint8_t pad[26];
-#else
- uint8_t pad[2];
-#endif
- struct tap_hdr taph;
- struct iphdr iph;
- struct tcphdr th;
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
-#ifdef __AVX2__
- uint8_t pad[14];
-#else
- uint8_t pad[2];
-#endif
- struct tap_hdr taph;
- struct ipv6hdr ip6h;
- struct tcphdr th;
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
-#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
+#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
+ sizeof(struct tcphdr) - \
+ sizeof(struct iphdr), \
+ sizeof(uint32_t))
+#define MSS6 ROUND_DOWN(IP_MAX_MTU - \
+ sizeof(struct tcphdr) - \
+ sizeof(struct ipv6hdr), \
+ sizeof(uint32_t))
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
#ifdef HAS_SND_WND
@@ -445,133 +420,107 @@ struct tcp_buf_seq_update {
};
/* Static buffers */
-
/**
- * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
- * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
- * @taph: Tap-level headers (partially pre-filled)
- * @iph: Pre-filled IP header (except for tot_len and saddr)
- * @uh: Headroom for TCP header
- * @data: Storage for TCP payload
+ * struct tcp_payload_t - TCP header and data to send segments with payload
+ * @th: TCP header
+ * @data: TCP data
*/
-static struct tcp4_l2_buf_t {
-#ifdef __AVX2__
- uint8_t pad[26]; /* 0, align th to 32 bytes */
-#else
- uint8_t pad[2]; /* align iph to 4 bytes 0 */
-#endif
- struct tap_hdr taph; /* 26 2 */
- struct iphdr iph; /* 44 20 */
- struct tcphdr th; /* 64 40 */
- uint8_t data[MSS4]; /* 84 60 */
- /* 65536 65532 */
+struct tcp_payload_t {
+ struct tcphdr th;
+ uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
+} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
-tcp4_l2_buf[TCP_FRAMES_MEM];
-
-static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_l2_buf_used;
/**
- * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
- * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
- * @taph: Tap-level headers (partially pre-filled)
- * @ip6h: Pre-filled IP header (except for payload_len and addresses)
- * @th: Headroom for TCP header
- * @data: Storage for TCP payload
+ * struct tcp_flags_t - TCP header and options to send zero-length
+ * segments (flags)
+ * @th: TCP header
+ * @opts: TCP options
*/
-struct tcp6_l2_buf_t {
-#ifdef __AVX2__
- uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
-#else
- uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
-#endif
- struct tap_hdr taph; /* 14 2 */
- struct ipv6hdr ip6h; /* 32 20 */
- struct tcphdr th; /* 72 60 */
- uint8_t data[MSS6]; /* 92 80 */
- /* 65536 65532 */
+struct tcp_flags_t {
+ struct tcphdr th;
+ char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
+} __attribute__ ((packed, aligned(32)));
#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
-tcp6_l2_buf[TCP_FRAMES_MEM];
-static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
+/* Ethernet header for IPv4 frames */
+static struct ethhdr tcp4_eth_src;
+
+static uint32_t tcp4_payload_vnet_len[TCP_FRAMES_MEM];
+/* IPv4 headers */
+static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
+/* TCP segments with payload for IPv4 frames */
+static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
+
+static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
+
+static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
+static unsigned int tcp4_payload_used;
+
+static uint32_t tcp4_flags_vnet_len[TCP_FRAMES_MEM];
+/* IPv4 headers for TCP segments without payload */
+static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
+/* TCP segments without payload for IPv4 frames */
+static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
+
+static unsigned int tcp4_flags_used;
+
+/* Ethernet header for IPv6 frames */
+static struct ethhdr tcp6_eth_src;
+
+static uint32_t tcp6_payload_vnet_len[TCP_FRAMES_MEM];
+/* IPv6 headers */
+static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
+/* TCP headers and data for IPv6 frames */
+static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
-static unsigned int tcp6_l2_buf_used;
+static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
+
+static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
+static unsigned int tcp6_payload_used;
+
+static uint32_t tcp6_flags_vnet_len[TCP_FRAMES_MEM];
+/* IPv6 headers for TCP segment without payload */
+static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
+/* TCP segment without payload for IPv6 frames */
+static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
+
+static unsigned int tcp6_flags_used;
/* recvmsg()/sendmsg() data for tap */
static char tcp_buf_discard [MAX_WINDOW];
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
-static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM];
-static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM];
-static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM];
-static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM];
+/**
+ * enum tcp_iov_parts - I/O vector parts for one TCP frame
+ * @TCP_IOV_VLEN: 32-bit frame length (vnet_len), empty in pasta mode
+ * @TCP_IOV_ETH: Ethernet header
+ * @TCP_IOV_IP: IP (v4/v6) header
+ * @TCP_IOV_PAYLOAD: IP payload (TCP header + data)
+ * @TCP_NUM_IOVS: the number of entries in the iovec array
+ */
+enum tcp_iov_parts {
+ TCP_IOV_VLEN = 0,
+ TCP_IOV_ETH = 1,
+ TCP_IOV_IP = 2,
+ TCP_IOV_PAYLOAD = 3,
+ TCP_NUM_IOVS
+};
+
+static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
-/**
- * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
- * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
- * @taph: Tap-level headers (partially pre-filled)
- * @iph: Pre-filled IP header (except for tot_len and saddr)
- * @th: Headroom for TCP header
- * @opts: Headroom for TCP options
- */
-static struct tcp4_l2_flags_buf_t {
-#ifdef __AVX2__
- uint8_t pad[26]; /* 0, align th to 32 bytes */
-#else
- uint8_t pad[2]; /* align iph to 4 bytes 0 */
-#endif
- struct tap_hdr taph; /* 26 2 */
- struct iphdr iph; /* 44 20 */
- struct tcphdr th; /* 64 40 */
- char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp4_l2_flags_buf[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_l2_flags_buf_used;
-
-/**
- * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
- * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
- * @taph: Tap-level headers (partially pre-filled)
- * @ip6h: Pre-filled IP header (except for payload_len and addresses)
- * @th: Headroom for TCP header
- * @opts: Headroom for TCP options
- */
-static struct tcp6_l2_flags_buf_t {
-#ifdef __AVX2__
- uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
-#else
- uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
-#endif
- struct tap_hdr taph; /* 14 2 */
- struct ipv6hdr ip6h; /* 32 20 */
- struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */
- char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp6_l2_flags_buf[TCP_FRAMES_MEM];
-
-static unsigned int tcp6_l2_flags_buf_used;
-
#define CONN(idx) (&(FLOW(idx)->tcp))
/* Table for lookup from remote address, local port, remote port */
@@ -967,25 +916,14 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
}
/**
- * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
+ * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
* @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged
*/
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
{
- int i;
-
- for (i = 0; i < TCP_FRAMES_MEM; i++) {
- struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
- struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
- struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
- struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
-
- eth_update_mac(&b4->taph.eh, eth_d, eth_s);
- eth_update_mac(&b6->taph.eh, eth_d, eth_s);
- eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
- eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
- }
+ eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
+ eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
}
/**
@@ -998,26 +936,45 @@ static void tcp_sock4_iov_init(const struct ctx *c)
struct iovec *iov;
int i;
- for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
- tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IP),
- .iph = iph,
- .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
- };
+ tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
+
+ for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
+ tcp4_payload_ip[i] = iph;
+ tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
+ tcp4_payload[i].th.ack = 1;
}
- for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
- tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IP),
- .iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
- };
+ for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
+ tcp4_flags_ip[i] = iph;
+ tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
+ tcp4_flags[i].th.ack = 1;
+ }
+
+ for (i = 0; i < TCP_FRAMES_MEM; i++) {
+ iov = tcp4_l2_iov[i];
+
+ iov[TCP_IOV_VLEN].iov_base = &tcp4_payload_vnet_len[i];
+ iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 :
+ sizeof(tcp4_payload_vnet_len[i]);
+ iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
+ iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
+ iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i];
+ iov[TCP_IOV_IP].iov_len = sizeof(tcp4_payload_ip[i]);
+ iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
}
- for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
- iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph);
+ for (i = 0; i < TCP_FRAMES_MEM; i++) {
+ iov = tcp4_l2_flags_iov[i];
- for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
- iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph);
+ iov[TCP_IOV_VLEN].iov_base = &tcp4_flags_vnet_len[i];
+ iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 :
+ sizeof(tcp4_flags_vnet_len[i]);
+ iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
+ iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
+ iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i];
+ iov[TCP_IOV_IP].iov_len = sizeof(tcp4_flags_ip[i]);
+ iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
+ }
}
/**
@@ -1026,29 +983,49 @@ static void tcp_sock4_iov_init(const struct ctx *c)
*/
static void tcp_sock6_iov_init(const struct ctx *c)
{
+ struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
struct iovec *iov;
int i;
- for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
- tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IPV6),
- .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
- .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
- };
+ tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
+
+ for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
+ tcp6_payload_ip[i] = ip6;
+ tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
+ tcp6_payload[i].th.ack = 1;
}
- for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
- tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IPV6),
- .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
- };
+ for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
+ tcp6_flags_ip[i] = ip6;
+ tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
+ tcp6_flags[i].th .ack = 1;
}
- for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
- iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph);
+ for (i = 0; i < TCP_FRAMES_MEM; i++) {
+ iov = tcp6_l2_iov[i];
+
+ iov[TCP_IOV_VLEN].iov_base = &tcp6_payload_vnet_len[i];
+ iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 :
+ sizeof(tcp6_payload_vnet_len[i]);
+ iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
+ iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
+ iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i];
+ iov[TCP_IOV_IP].iov_len = sizeof(tcp6_payload_ip[i]);
+ iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
+ }
- for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
- iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph);
+ for (i = 0; i < TCP_FRAMES_MEM; i++) {
+ iov = tcp6_l2_flags_iov[i];
+
+ iov[TCP_IOV_VLEN].iov_base = &tcp6_flags_vnet_len[i];
+ iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 :
+ sizeof(tcp6_flags_vnet_len[i]);
+ iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
+ iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
+ iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i];
+ iov[TCP_IOV_IP].iov_len = sizeof(tcp6_flags_ip[i]);
+ iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
+ }
}
/**
@@ -1284,36 +1261,40 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
} while (0)
/**
- * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags)
+ * tcp_flags_flush() - Send out buffers for segments with no data (flags)
* @c: Execution context
*/
-static void tcp_l2_flags_buf_flush(const struct ctx *c)
+static void tcp_flags_flush(const struct ctx *c)
{
- tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used);
- tcp6_l2_flags_buf_used = 0;
+ tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
+ tcp6_flags_used);
+ tcp6_flags_used = 0;
- tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used);
- tcp4_l2_flags_buf_used = 0;
+ tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
+ tcp4_flags_used);
+ tcp4_flags_used = 0;
}
/**
- * tcp_l2_data_buf_flush() - Send out buffers for segments with data
+ * tcp_payload_flush() - Send out buffers for segments with data
* @c: Execution context
*/
-static void tcp_l2_data_buf_flush(const struct ctx *c)
+static void tcp_payload_flush(const struct ctx *c)
{
unsigned i;
size_t m;
- m = tap_send_frames(c, tcp6_l2_iov, 1, tcp6_l2_buf_used);
+ m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
+ tcp6_payload_used);
for (i = 0; i < m; i++)
- *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
- tcp6_l2_buf_used = 0;
+ *tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+ tcp6_payload_used = 0;
- m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used);
+ m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
+ tcp4_payload_used);
for (i = 0; i < m; i++)
- *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
- tcp4_l2_buf_used = 0;
+ *tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+ tcp4_payload_used = 0;
}
/**
@@ -1323,8 +1304,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
void tcp_defer_handler(struct ctx *c)
{
- tcp_l2_flags_buf_flush(c);
- tcp_l2_data_buf_flush(c);
+ tcp_flags_flush(c);
+ tcp_payload_flush(c);
}
/**
@@ -1433,35 +1414,31 @@ static size_t tcp_fill_headers6(const struct ctx *c,
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
* @c: Execution context
* @conn: Connection pointer
- * @p: Pointer to any type of TCP pre-cooked buffer
+ * @iov: Pointer to an array of iovec of TCP pre-cooked buffers
* @plen: Payload length (including TCP header options)
* @check: Checksum, if already known
* @seq: Sequence number for this segment
*
- * Return: frame length including L2 headers, host order
+ * Return: IP payload length, host order
*/
static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
const struct tcp_tap_conn *conn,
- void *p, size_t plen,
+ struct iovec *iov, size_t plen,
const uint16_t *check, uint32_t seq)
{
const struct in_addr *a4 = inany_v4(&conn->faddr);
size_t ip_len, tlen;
if (a4) {
- struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
-
- ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
+ ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, plen,
check, seq);
-
- tlen = tap_frame_len(c, &b->taph, ip_len);
+ tlen = ip_len - sizeof(struct iphdr);
} else {
- struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
-
- ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
+ ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, plen,
seq);
-
- tlen = tap_frame_len(c, &b->taph, ip_len);
+ tlen = ip_len - sizeof(struct ipv6hdr);
}
return tlen;
@@ -1593,18 +1570,16 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
*/
static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
- uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
- uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
- struct tcp4_l2_flags_buf_t *b4 = NULL;
- struct tcp6_l2_flags_buf_t *b6 = NULL;
+ struct tcp_flags_t *payload;
struct tcp_info tinfo = { 0 };
socklen_t sl = sizeof(tinfo);
int s = conn->sock;
+ uint32_t vnet_len;
size_t optlen = 0;
- struct iovec *iov;
struct tcphdr *th;
+ struct iovec *iov;
+ size_t ip_len;
char *data;
- void *p;
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
!flags && conn->wnd_to_tap)
@@ -1627,19 +1602,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
return 0;
if (CONN_V4(conn)) {
- iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used;
- p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
- th = &b4->th;
-
- /* gcc 11.2 would complain on data = (char *)(th + 1); */
- data = b4->opts;
+ iov = tcp4_l2_flags_iov[tcp4_flags_used++];
+ vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr);
} else {
- iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used;
- p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
- th = &b6->th;
- data = b6->opts;
+ iov = tcp6_l2_flags_iov[tcp6_flags_used++];
+ vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
}
+ payload = iov[TCP_IOV_PAYLOAD].iov_base;
+ th = &payload->th;
+ data = payload->opts;
+
if (flags & SYN) {
int mss;
@@ -1675,9 +1648,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
*data++ = OPT_WS_LEN;
*data++ = conn->ws_to_tap;
} else if (!(flags & RST)) {
- if (conn->seq_ack_to_tap != prev_ack_to_tap ||
- !prev_wnd_to_tap)
- flags |= ACK;
+ flags |= ACK;
}
th->doff = (sizeof(*th) + optlen) / 4;
@@ -1687,8 +1658,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
- iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
- NULL, conn->seq_to_tap);
+ ip_len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
+ conn->seq_to_tap);
+ iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
+
+ *(uint32_t *)iov[TCP_IOV_VLEN].iov_base = htonl(vnet_len + ip_len);
if (th->ack) {
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
@@ -1704,24 +1678,27 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (th->fin || th->syn)
conn->seq_to_tap++;
- if (CONN_V4(conn)) {
- if (flags & DUP_ACK) {
- memcpy(b4 + 1, b4, sizeof(*b4));
- (iov + 1)->iov_len = iov->iov_len;
- tcp4_l2_flags_buf_used++;
- }
+ if (flags & DUP_ACK) {
+ struct iovec *dup_iov;
+ int i;
- if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
- tcp_l2_flags_buf_flush(c);
- } else {
- if (flags & DUP_ACK) {
- memcpy(b6 + 1, b6, sizeof(*b6));
- (iov + 1)->iov_len = iov->iov_len;
- tcp6_l2_flags_buf_used++;
- }
+ if (CONN_V4(conn))
+ dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
+ else
+ dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
- if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
- tcp_l2_flags_buf_flush(c);
+ for (i = 0; i < TCP_NUM_IOVS; i++)
+ memcpy(dup_iov[i].iov_base, iov[i].iov_base,
+ iov[i].iov_len);
+ dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
+ }
+
+ if (CONN_V4(conn)) {
+ if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
+ tcp_flags_flush(c);
+ } else {
+ if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
+ tcp_flags_flush(c);
}
return 0;
@@ -2168,30 +2145,42 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
{
uint32_t *seq_update = &conn->seq_to_tap;
struct iovec *iov;
+ size_t ip_len;
+ uint32_t vnet_len;
if (CONN_V4(conn)) {
- struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
- const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
+ struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
+ const uint16_t *check = NULL;
- tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
- tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
+ if (no_csum) {
+ struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
+ check = &iph->check;
+ }
- iov = tcp4_l2_iov + tcp4_l2_buf_used++;
- iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
- check, seq);
- if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
- tcp_l2_data_buf_flush(c);
+ tcp4_seq_update[tcp4_payload_used].seq = seq_update;
+ tcp4_seq_update[tcp4_payload_used].len = plen;
+
+ iov = tcp4_l2_iov[tcp4_payload_used++];
+ ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check,
+ seq);
+ iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
+ vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr) +
+ ip_len;
+ *(uint32_t *)iov[TCP_IOV_VLEN].iov_base = htonl(vnet_len);
+ if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
+ tcp_payload_flush(c);
} else if (CONN_V6(conn)) {
- struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
-
- tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
- tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
+ tcp6_seq_update[tcp6_payload_used].seq = seq_update;
+ tcp6_seq_update[tcp6_payload_used].len = plen;
- iov = tcp6_l2_iov + tcp6_l2_buf_used++;
- iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
- NULL, seq);
- if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
- tcp_l2_data_buf_flush(c);
+ iov = tcp6_l2_iov[tcp6_payload_used++];
+ ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
+ iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
+ vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr) +
+ ip_len;
+ *(uint32_t *)iov[TCP_IOV_VLEN].iov_base = htonl(vnet_len);
+ if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
+ tcp_payload_flush(c);
}
}
@@ -2246,19 +2235,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
iov_sock[0].iov_base = tcp_buf_discard;
iov_sock[0].iov_len = already_sent;
- if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
- (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
- tcp_l2_data_buf_flush(c);
+ if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
+ (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
+ tcp_payload_flush(c);
/* Silence Coverity CWE-125 false positive */
- tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
+ tcp4_payload_used = tcp6_payload_used = 0;
}
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
if (v4)
- iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
+ iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
else
- iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
+ iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
@@ -2303,7 +2292,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
plen = mss;
seq = conn->seq_to_tap;
for (i = 0; i < send_bufs; i++) {
- int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
+ int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
if (i == send_bufs - 1)
plen = last_len;
diff --git a/tcp_splice.c b/tcp_splice.c
index d066112..42b7be0 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -28,7 +28,7 @@
* - FIN_SENT_0: FIN (write shutdown) sent to accepted socket
* - FIN_SENT_1: FIN (write shutdown) sent to target socket
*
- * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
+ * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64
*/
#include <sched.h>
diff --git a/test/.gitignore b/test/.gitignore
index 4837402..6dd4790 100644
--- a/test/.gitignore
+++ b/test/.gitignore
@@ -1,5 +1,6 @@
test_logs/
mbuto/
+podman/
*.img
QEMU_EFI.fd
*.qcow2
diff --git a/test/Makefile b/test/Makefile
index 7b00bef..35a3b55 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -52,10 +52,10 @@ UBUNTU_NEW_IMGS = xenial-server-cloudimg-powerpc-disk1.img \
jammy-server-cloudimg-s390x.img
UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)
-DOWNLOAD_ASSETS = mbuto \
+DOWNLOAD_ASSETS = mbuto podman \
$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
TESTDATA_ASSETS = small.bin big.bin medium.bin
-LOCAL_ASSETS = mbuto.img mbuto.mem.img QEMU_EFI.fd \
+LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \
$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
$(UBUNTU_NEW_IMGS:%=prepared-%) \
nstool guest-key guest-key.pub \
@@ -67,13 +67,27 @@ CFLAGS = -Wall -Werror -Wextra -pedantic -std=c99
assets: $(ASSETS)
+.PHONY: pull-%
+pull-%: %
+ git -C $* pull
+
mbuto:
git clone git://mbuto.sh/mbuto
+mbuto/mbuto: pull-mbuto
+
+podman:
+ git clone https://github.com/containers/podman.git
+
+# To successfully build podman, you will need gpgme and systemd
+# development packages
+podman/bin/podman: pull-podman
+ $(MAKE) -C podman
+
guest-key guest-key.pub:
ssh-keygen -f guest-key -N ''
-mbuto.img: passt.mbuto mbuto guest-key.pub $(TESTDATA_ASSETS)
+mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS)
./mbuto/mbuto -p ./$< -c lz4 -f $@
mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2
diff --git a/test/pasta_options/log_to_file b/test/pasta_options/log_to_file
index fcdd553..fe50e50 100644
--- a/test/pasta_options/log_to_file
+++ b/test/pasta_options/log_to_file
@@ -33,16 +33,13 @@ test Log creation
set PORTS -t 10001,10002 -u 10001,10002
set LOG_FILE __STATEDIR__/pasta.log
-passt ./pasta -l __LOG_FILE__
-passtb exit
-sleep 1
+passt ./pasta -l __LOG_FILE__ -- /bin/true
check [ -s __LOG_FILE__ ]
test Log truncated on creation
-passt ./pasta -l __LOG_FILE__
-passtb exit
-sleep 1
-check [ $(cat __LOG_FILE__ | wc -l) -eq 1 ]
+passt ./pasta -l __LOG_FILE__ -- /bin/true & wait
+pout PID2 echo $!
+check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$'
test Maximum log size
passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -P 10001 -C 10002 -6; done'
diff --git a/test/pasta_podman/bats b/test/pasta_podman/bats
index 21446f0..6b1c575 100644
--- a/test/pasta_podman/bats
+++ b/test/pasta_podman/bats
@@ -11,11 +11,16 @@
# Copyright (c) 2022 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
-htools git make go bats catatonit ip jq socat
+htools git make go bats ip jq socat ./test/podman/bin/podman
+
+set PODMAN test/podman/bin/podman
+hout WD pwd
+
+test Podman pasta path
+
+hout PASTA_BIN CONTAINERS_HELPER_BINARY_DIR="__WD__" __PODMAN__ info --format "{{.Host.Pasta.Executable}}"
+check [ "__PASTA_BIN__" = "__WD__/pasta" ]
test Podman system test with bats
-host git -C __STATEDIR__ clone https://github.com/containers/podman.git
-host make -C __STATEDIR__/podman
-hout WD pwd
-host PODMAN="__STATEDIR__/podman/bin/podman" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats __STATEDIR__/podman/test/system/505-networking-pasta.bats
+host PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats test/podman/test/system/505-networking-pasta.bats
diff --git a/udp.c b/udp.c
index 694424a..594ea19 100644
--- a/udp.c
+++ b/udp.c
@@ -829,6 +829,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif,
* and destination, so we can just take those from the first message.
*/
src = ntohs(uh->source);
+ src += c->udp.fwd_in.rdelta[src];
dst = ntohs(uh->dest);
if (af == AF_INET) {
@@ -1005,7 +1006,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, addr,
ifname, port, uref.u32);
- udp_tap_map[V4][uref.port].sock = s < 0 ? -1 : s;
+ udp_tap_map[V4][port].sock = s < 0 ? -1 : s;
udp_splice_init[V4][port].sock = s < 0 ? -1 : s;
} else {
r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP,
@@ -1022,7 +1023,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr,
ifname, port, uref.u32);
- udp_tap_map[V6][uref.port].sock = s < 0 ? -1 : s;
+ udp_tap_map[V6][port].sock = s < 0 ? -1 : s;
udp_splice_init[V6][port].sock = s < 0 ? -1 : s;
} else {
r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP,
diff --git a/util.h b/util.h
index 48f3560..264423b 100644
--- a/util.h
+++ b/util.h
@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>
+#include <stddef.h>
#include <string.h>
#include <signal.h>
@@ -31,6 +32,9 @@
#ifndef ETH_MIN_MTU
#define ETH_MIN_MTU 68
#endif
+#ifndef IP_MAX_MTU
+#define IP_MAX_MTU USHRT_MAX
+#endif
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
@@ -90,6 +94,7 @@
#define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN)
#define MAC_ZERO ((uint8_t [ETH_ALEN]){ 0 })
+#define MAC_LAA ((uint8_t [ETH_ALEN]){ BIT(1), 0, 0, 0, 0, 0 })
#define MAC_IS_ZERO(addr) (!memcmp((addr), MAC_ZERO, ETH_ALEN))
#ifndef __bswap_constant_16
@@ -116,7 +121,8 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
void *arg);
#define NS_CALL(fn, arg) \
do { \
- char ns_fn_stack[NS_FN_STACK_SIZE]; \
+ char ns_fn_stack[NS_FN_STACK_SIZE] \
+ __attribute__ ((aligned(__alignof__(max_align_t)))); \
\
do_clone((fn), ns_fn_stack, sizeof(ns_fn_stack), \
CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,\
@@ -157,6 +163,24 @@ int write_file(const char *path, const char *buf);
int write_remainder(int fd, const struct iovec *iov, int iovcnt, size_t skip);
/**
+ * af_name() - Return name of an address family
+ * @af: Address/protocol family (AF_INET or AF_INET6)
+ *
+ * Returns: Name of the protocol family as a string
+ */
+static inline const char *af_name(sa_family_t af)
+{
+ switch (af) {
+ case AF_INET:
+ return "IPv4";
+ case AF_INET6:
+ return "IPv6";
+ default:
+ return "<unknown address family>";
+ }
+}
+
+/**
* mod_sub() - Modular arithmetic subtraction
* @a: Minued, unsigned value < @m
* @b: Subtrahend, unsigned value < @m