diff options
-rw-r--r-- | Makefile | 11 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | arp.c | 4 | ||||
-rw-r--r-- | conf.c | 343 | ||||
-rw-r--r-- | contrib/apparmor/abstractions/passt | 2 | ||||
-rw-r--r-- | contrib/selinux/passt.te | 3 | ||||
-rw-r--r-- | contrib/selinux/pasta.te | 2 | ||||
-rw-r--r-- | dhcp.c | 21 | ||||
-rw-r--r-- | dhcpv6.c | 21 | ||||
-rw-r--r-- | flow.c | 141 | ||||
-rw-r--r-- | flow.h | 25 | ||||
-rw-r--r-- | fwd.c | 244 | ||||
-rw-r--r-- | fwd.h | 3 | ||||
-rw-r--r-- | icmp.c | 4 | ||||
-rw-r--r-- | log.c | 16 | ||||
-rw-r--r-- | ndp.c | 9 | ||||
-rw-r--r-- | netlink.c | 146 | ||||
-rw-r--r-- | netlink.h | 6 | ||||
-rw-r--r-- | passt.1 | 43 | ||||
-rw-r--r-- | passt.c | 8 | ||||
-rw-r--r-- | passt.h | 53 | ||||
-rw-r--r-- | pasta.c | 48 | ||||
-rwxr-xr-x | seccomp.sh | 5 | ||||
-rw-r--r-- | tap.c | 110 | ||||
-rw-r--r-- | tcp.c | 37 | ||||
-rw-r--r-- | tcp_buf.c | 11 | ||||
-rw-r--r-- | tcp_internal.h | 2 | ||||
-rw-r--r-- | tcp_splice.c | 2 | ||||
-rw-r--r-- | test/README.md | 9 | ||||
-rw-r--r-- | test/lib/layout | 14 | ||||
-rwxr-xr-x | test/lib/setup | 21 | ||||
-rwxr-xr-x | test/lib/term | 31 | ||||
-rwxr-xr-x | test/lib/test | 2 | ||||
-rwxr-xr-x | test/passt.mbuto | 10 | ||||
-rw-r--r-- | test/passt_in_ns/dhcp | 73 | ||||
-rw-r--r-- | test/passt_in_ns/tcp | 38 | ||||
-rw-r--r-- | test/passt_in_ns/udp | 22 | ||||
-rw-r--r-- | test/pasta_options/log_to_file | 4 | ||||
-rw-r--r-- | test/perf/passt_tcp | 47 | ||||
-rw-r--r-- | test/perf/passt_udp | 33 | ||||
-rw-r--r-- | test/perf/pasta_tcp | 55 | ||||
-rw-r--r-- | test/perf/pasta_udp | 27 | ||||
-rwxr-xr-x | test/run | 4 | ||||
-rw-r--r-- | test/valgrind.supp | 9 | ||||
-rw-r--r-- | udp.c | 213 | ||||
-rw-r--r-- | udp.h | 2 | ||||
-rw-r--r-- | udp_flow.c | 21 | ||||
-rw-r--r-- | udp_flow.h | 4 | ||||
-rw-r--r-- | util.c | 26 | ||||
-rw-r--r-- | util.h | 29 |
50 files changed, 1379 insertions, 639 deletions
@@ -33,9 +33,16 @@ AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/') AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/') AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/') +# On some systems enabling optimization also enables source fortification, +# automagically. Do not override it. +FORTIFY_FLAG := +ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /dev/null; echo $$?),1) +FORTIFY_FLAG := -D_FORTIFY_SOURCE=2 +endif + FLAGS := -Wall -Wextra -Wno-format-zero-length FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -FLAGS += -D_FORTIFY_SOURCE=2 -O2 -pie -fPIE +FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) FLAGS += -DNETNS_RUN_DIR=\"/run/netns\" FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH) @@ -129,7 +136,7 @@ qrap: $(QRAP_SRCS) passt.h valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ rt_sigreturn getpid gettid kill clock_gettime mmap \ - munmap open unlink gettimeofday futex + mmap2 munmap open unlink gettimeofday futex valgrind: FLAGS += -g -DVALGRIND valgrind: all @@ -338,7 +338,9 @@ speeding up local connections, and usually requiring NAT. 
_pasta_: [_slirp4netns_ replacement](/passt/tree/slirp4netns.sh) * ✅ out-of-tree patch for [Kata Containers](/passt/tree/contrib/kata-containers) available -* ⌚ drop-in replacement for VPNKit (rootless Docker) +* ✅ rootless Docker + [network back-end](https://docs.docker.com/engine/security/rootless/#networking-errors) + via moby/rootlesskit ### Availability * official packages for: @@ -72,7 +72,7 @@ int arp(const struct ctx *c, const struct pool *p) ah->ar_op = htons(ARPOP_REPLY); memcpy(am->tha, am->sha, sizeof(am->tha)); - memcpy(am->sha, c->mac, sizeof(am->sha)); + memcpy(am->sha, c->our_tap_mac, sizeof(am->sha)); memcpy(swap, am->tip, sizeof(am->tip)); memcpy(am->tip, am->sip, sizeof(am->tip)); @@ -80,7 +80,7 @@ int arp(const struct ctx *c, const struct pool *p) l2len = sizeof(*eh) + sizeof(*ah) + sizeof(*am); memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest)); - memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); tap_send_single(c, eh, l2len); @@ -156,9 +156,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, die("'all' port forwarding is only allowed for passt"); fwd->mode = FWD_ALL; - memset(fwd->map, 0xff, PORT_EPHEMERAL_MIN / 8); - for (i = 0; i < PORT_EPHEMERAL_MIN; i++) { + /* Skip port 0. It has special meaning for many socket APIs, so + * trying to bind it is not really safe. + */ + for (i = 1; i < NUM_PORTS; i++) { + if (fwd_port_is_ephemeral(i)) + continue; + + bitmap_set(fwd->map, i); if (optname == 't') { ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL, i); @@ -259,8 +265,12 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, } while ((p = next_chunk(p, ','))); if (exclude_only) { - for (i = 0; i < PORT_EPHEMERAL_MIN; i++) { - if (bitmap_isset(exclude, i)) + /* Skip port 0. It has special meaning for many socket APIs, so + * trying to bind it is not really safe. 
+ */ + for (i = 1; i < NUM_PORTS; i++) { + if (fwd_port_is_ephemeral(i) || + bitmap_isset(exclude, i)) continue; bitmap_set(fwd->map, i); @@ -353,55 +363,93 @@ bind_all_fail: /** * add_dns4() - Possibly add the IPv4 address of a DNS resolver to configuration * @c: Execution context - * @addr: Address found in /etc/resolv.conf - * @conf: Pointer to reference of current entry in array of IPv4 resolvers + * @addr: Guest nameserver IPv4 address + * @idx: Index of free entry in array of IPv4 resolvers + * + * Return: Number of entries added (0 or 1) */ -static void add_dns4(struct ctx *c, const struct in_addr *addr, - struct in_addr **conf) +static unsigned add_dns4(struct ctx *c, const struct in_addr *addr, + unsigned idx) { - /* Guest or container can only access local addresses via redirect */ - if (IN4_IS_ADDR_LOOPBACK(addr)) { - if (!c->no_map_gw) { - **conf = c->ip4.gw; - (*conf)++; - - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) - c->ip4.dns_match = c->ip4.gw; - } - } else { - **conf = *addr; - (*conf)++; - } + if (idx >= ARRAY_SIZE(c->ip4.dns)) + return 0; - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)) - c->ip4.dns_host = *addr; + c->ip4.dns[idx] = *addr; + return 1; } /** * add_dns6() - Possibly add the IPv6 address of a DNS resolver to configuration * @c: Execution context - * @addr: Address found in /etc/resolv.conf - * @conf: Pointer to reference of current entry in array of IPv6 resolvers + * @addr: Guest nameserver IPv6 address + * @idx: Index of free entry in array of IPv6 resolvers + * + * Return: Number of entries added (0 or 1) */ -static void add_dns6(struct ctx *c, - struct in6_addr *addr, struct in6_addr **conf) +static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr, + unsigned idx) { - /* Guest or container can only access local addresses via redirect */ - if (IN6_IS_ADDR_LOOPBACK(addr)) { - if (!c->no_map_gw) { - memcpy(*conf, &c->ip6.gw, sizeof(**conf)); - (*conf)++; + if (idx >= ARRAY_SIZE(c->ip6.dns)) + return 0; - if 
(IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) - memcpy(&c->ip6.dns_match, addr, sizeof(*addr)); + c->ip6.dns[idx] = *addr; + return 1; +} + +/** + * add_dns_resolv() - Possibly add ns from host resolv.conf to configuration + * @c: Execution context + * @nameserver: Nameserver address string from /etc/resolv.conf + * @idx4: Pointer to index of current entry in array of IPv4 resolvers + * @idx6: Pointer to index of current entry in array of IPv6 resolvers + * + * @idx4 or @idx6 may be NULL, in which case resolvers of the corresponding type + * are ignored. + */ +static void add_dns_resolv(struct ctx *c, const char *nameserver, + unsigned *idx4, unsigned *idx6) +{ + struct in6_addr ns6; + struct in_addr ns4; + + if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)) + c->ip4.dns_host = ns4; + + /* Guest or container can only access local addresses via + * redirect + */ + if (IN4_IS_ADDR_LOOPBACK(&ns4)) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) + return; + + ns4 = c->ip4.map_host_loopback; + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) + c->ip4.dns_match = c->ip4.map_host_loopback; } - } else { - memcpy(*conf, addr, sizeof(**conf)); - (*conf)++; + + *idx4 += add_dns4(c, &ns4, *idx4); } - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) - c->ip6.dns_host = *addr; + if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) + c->ip6.dns_host = ns6; + + /* Guest or container can only access local addresses via + * redirect + */ + if (IN6_IS_ADDR_LOOPBACK(&ns6)) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) + return; + + ns6 = c->ip6.map_host_loopback; + + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) + c->ip6.dns_match = c->ip6.map_host_loopback; + } + + *idx6 += add_dns6(c, &ns6, *idx6); + } } /** @@ -410,18 +458,16 @@ static void add_dns6(struct ctx *c, */ static void get_dns(struct ctx *c) { - struct in6_addr *dns6 = &c->ip6.dns[0], dns6_tmp; - 
struct in_addr *dns4 = &c->ip4.dns[0], dns4_tmp; int dns4_set, dns6_set, dnss_set, dns_set, fd; + unsigned dns4_idx = 0, dns6_idx = 0; struct fqdn *s = c->dns_search; struct lineread resolvconf; - unsigned int added = 0; ssize_t line_len; char *line, *end; const char *p; - dns4_set = !c->ifi4 || !IN4_IS_ADDR_UNSPECIFIED(dns4); - dns6_set = !c->ifi6 || !IN6_IS_ADDR_UNSPECIFIED(dns6); + dns4_set = !c->ifi4 || !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[0]); + dns6_set = !c->ifi6 || !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[0]); dnss_set = !!*s->n || c->no_dns_search; dns_set = (dns4_set && dns6_set) || c->no_dns; @@ -442,19 +488,9 @@ static void get_dns(struct ctx *c) if (end) *end = 0; - if (!dns4_set && - dns4 - &c->ip4.dns[0] < ARRAY_SIZE(c->ip4.dns) - 1 - && inet_pton(AF_INET, p + 1, &dns4_tmp)) { - add_dns4(c, &dns4_tmp, &dns4); - added++; - } - - if (!dns6_set && - dns6 - &c->ip6.dns[0] < ARRAY_SIZE(c->ip6.dns) - 1 - && inet_pton(AF_INET6, p + 1, &dns6_tmp)) { - add_dns6(c, &dns6_tmp, &dns6); - added++; - } + add_dns_resolv(c, p + 1, + dns4_set ? NULL : &dns4_idx, + dns6_set ? NULL : &dns6_idx); } else if (!dnss_set && strstr(line, "search ") == line && s == c->dns_search) { end = strpbrk(line, "\n"); @@ -481,7 +517,7 @@ static void get_dns(struct ctx *c) out: if (!dns_set) { - if (!added) + if (!(dns4_idx + dns6_idx)) warn("Couldn't get any nameserver address"); if (c->no_dhcp_dns) @@ -586,12 +622,10 @@ static int conf_ip4_prefix(const char *arg) * conf_ip4() - Verify or detect IPv4 support, get relevant addresses * @ifi: Host interface to attempt (0 to determine one) * @ip4: IPv4 context (will be written) - * @mac: MAC address to use (written if unset) * * Return: Interface index for IPv4, or 0 on failure. 
*/ -static unsigned int conf_ip4(unsigned int ifi, - struct ip4_ctx *ip4, unsigned char *mac) +static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4) { if (!ifi) ifi = nl_get_ext_if(nl_sock, AF_INET); @@ -601,8 +635,9 @@ static unsigned int conf_ip4(unsigned int ifi, return 0; } - if (IN4_IS_ADDR_UNSPECIFIED(&ip4->gw)) { - int rc = nl_route_get_def(nl_sock, ifi, AF_INET, &ip4->gw); + if (IN4_IS_ADDR_UNSPECIFIED(&ip4->guest_gw)) { + int rc = nl_route_get_def(nl_sock, ifi, AF_INET, + &ip4->guest_gw); if (rc < 0) { err("Couldn't discover IPv4 gateway address: %s", strerror(-rc)); @@ -632,21 +667,9 @@ static unsigned int conf_ip4(unsigned int ifi, ip4->prefix_len = 32; } - memcpy(&ip4->addr_seen, &ip4->addr, sizeof(ip4->addr_seen)); + ip4->addr_seen = ip4->addr; - if (MAC_IS_ZERO(mac)) { - int rc = nl_link_get_mac(nl_sock, ifi, mac); - if (rc < 0) { - char ifname[IFNAMSIZ]; - - err("Couldn't discover MAC address for %s: %s", - if_indextoname(ifi, ifname), strerror(-rc)); - return 0; - } - - if (MAC_IS_ZERO(mac)) - memcpy(mac, MAC_LAA, ETH_ALEN); - } + ip4->our_tap_addr = ip4->guest_gw; if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr)) return 0; @@ -658,12 +681,10 @@ static unsigned int conf_ip4(unsigned int ifi, * conf_ip6() - Verify or detect IPv6 support, get relevant addresses * @ifi: Host interface to attempt (0 to determine one) * @ip6: IPv6 context (will be written) - * @mac: MAC address to use (written if unset) * * Return: Interface index for IPv6, or 0 on failure. 
*/ -static unsigned int conf_ip6(unsigned int ifi, - struct ip6_ctx *ip6, unsigned char *mac) +static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6) { int prefix_len = 0; int rc; @@ -676,8 +697,8 @@ static unsigned int conf_ip6(unsigned int ifi, return 0; } - if (IN6_IS_ADDR_UNSPECIFIED(&ip6->gw)) { - rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->gw); + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->guest_gw)) { + rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->guest_gw); if (rc < 0) { err("Couldn't discover IPv6 gateway address: %s", strerror(-rc)); @@ -687,30 +708,19 @@ static unsigned int conf_ip6(unsigned int ifi, rc = nl_addr_get(nl_sock, ifi, AF_INET6, IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ? &ip6->addr : NULL, - &prefix_len, &ip6->addr_ll); + &prefix_len, &ip6->our_tap_ll); if (rc < 0) { err("Couldn't discover IPv6 address: %s", strerror(-rc)); return 0; } - memcpy(&ip6->addr_seen, &ip6->addr, sizeof(ip6->addr)); - memcpy(&ip6->addr_ll_seen, &ip6->addr_ll, sizeof(ip6->addr_ll)); - - if (MAC_IS_ZERO(mac)) { - rc = nl_link_get_mac(nl_sock, ifi, mac); - if (rc < 0) { - char ifname[IFNAMSIZ]; - err("Couldn't discover MAC address for %s: %s", - if_indextoname(ifi, ifname), strerror(-rc)); - return 0; - } + ip6->addr_seen = ip6->addr; - if (MAC_IS_ZERO(mac)) - memcpy(mac, MAC_LAA, ETH_ALEN); - } + if (IN6_IS_ADDR_LINKLOCAL(&ip6->guest_gw)) + ip6->our_tap_ll = ip6->guest_gw; if (IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) || - IN6_IS_ADDR_UNSPECIFIED(&ip6->addr_ll)) + IN6_IS_ADDR_UNSPECIFIED(&ip6->our_tap_ll)) return 0; return ifi; @@ -817,6 +827,12 @@ static void usage(const char *name, FILE *f, int status) fprintf(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n"); fprintf(f, + " --map-host-loopback ADDR Translate ADDR to refer to host\n" + " can be specified zero to two times (for IPv4 and IPv6)\n" + " default: gateway address\n" + " --map-guest-addr ADDR Translate ADDR to guest's address\n" + " can be specified zero to two times (for IPv4 and IPv6)\n" + " 
default: none\n" " --dns-forward ADDR Forward DNS queries sent to ADDR\n" " can be specified zero to two times (for IPv4 and IPv6)\n" " default: don't forward DNS queries\n" @@ -921,7 +937,8 @@ pasta_opts: */ static void conf_print(const struct ctx *c) { - char buf4[INET_ADDRSTRLEN], buf6[INET6_ADDRSTRLEN], ifn[IFNAMSIZ]; + char buf4[INET_ADDRSTRLEN], buf6[INET6_ADDRSTRLEN]; + char bufmac[ETH_ADDRSTRLEN], ifn[IFNAMSIZ]; int i; info("Template interface: %s%s%s%s%s", @@ -955,11 +972,14 @@ static void conf_print(const struct ctx *c) info("Namespace interface: %s", c->pasta_ifn); info("MAC:"); - info(" host: %02x:%02x:%02x:%02x:%02x:%02x", - c->mac[0], c->mac[1], c->mac[2], - c->mac[3], c->mac[4], c->mac[5]); + info(" host: %s", eth_ntop(c->our_tap_mac, bufmac, sizeof(bufmac))); if (c->ifi4) { + if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) + info(" NAT to host 127.0.0.1: %s", + inet_ntop(AF_INET, &c->ip4.map_host_loopback, + buf4, sizeof(buf4))); + if (!c->no_dhcp) { uint32_t mask; @@ -971,7 +991,8 @@ static void conf_print(const struct ctx *c) info(" mask: %s", inet_ntop(AF_INET, &mask, buf4, sizeof(buf4))); info(" router: %s", - inet_ntop(AF_INET, &c->ip4.gw, buf4, sizeof(buf4))); + inet_ntop(AF_INET, &c->ip4.guest_gw, + buf4, sizeof(buf4))); } for (i = 0; !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[i]); i++) { @@ -989,6 +1010,11 @@ static void conf_print(const struct ctx *c) } if (c->ifi6) { + if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) + info(" NAT to host ::1: %s", + inet_ntop(AF_INET6, &c->ip6.map_host_loopback, + buf6, sizeof(buf6))); + if (!c->no_ndp && !c->no_dhcpv6) info("NDP/DHCPv6:"); else if (!c->no_ndp) @@ -1001,9 +1027,10 @@ static void conf_print(const struct ctx *c) info(" assign: %s", inet_ntop(AF_INET6, &c->ip6.addr, buf6, sizeof(buf6))); info(" router: %s", - inet_ntop(AF_INET6, &c->ip6.gw, buf6, sizeof(buf6))); + inet_ntop(AF_INET6, &c->ip6.guest_gw, buf6, sizeof(buf6))); info(" our link-local: %s", - inet_ntop(AF_INET6, 
&c->ip6.addr_ll, buf6, sizeof(buf6))); + inet_ntop(AF_INET6, &c->ip6.our_tap_ll, + buf6, sizeof(buf6))); dns6: for (i = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[i]); i++) { @@ -1122,6 +1149,38 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid) } /** + * conf_nat() - Parse --map-host-loopback or --map-guest-addr option + * @arg: String argument to option + * @addr4: IPv4 to update with parsed address + * @addr6: IPv6 to update with parsed address + * @no_map_gw: --no-map-gw flag, or NULL, updated for "none" argument + */ +static void conf_nat(const char *arg, struct in_addr *addr4, + struct in6_addr *addr6, int *no_map_gw) +{ + if (strcmp(arg, "none") == 0) { + *addr4 = in4addr_any; + *addr6 = in6addr_any; + if (no_map_gw) + *no_map_gw = 1; + } + + if (inet_pton(AF_INET6, arg, addr6) && + !IN6_IS_ADDR_UNSPECIFIED(addr6) && + !IN6_IS_ADDR_LOOPBACK(addr6) && + !IN6_IS_ADDR_MULTICAST(addr6)) + return; + + if (inet_pton(AF_INET, arg, addr4) && + !IN4_IS_ADDR_UNSPECIFIED(addr4) && + !IN4_IS_ADDR_LOOPBACK(addr4) && + !IN4_IS_ADDR_MULTICAST(addr4)) + return; + + die("Invalid address to remap to host: %s", optarg); +} + +/** * conf_open_files() - Open files as requested by configuration * @c: Execution context */ @@ -1174,7 +1233,7 @@ fail: */ void conf(struct ctx *c, int argc, char **argv) { - int netns_only = 0; + int netns_only = 0, no_map_gw = 0; const struct option options[] = { {"debug", no_argument, NULL, 'd' }, {"quiet", no_argument, NULL, 'q' }, @@ -1203,7 +1262,7 @@ void conf(struct ctx *c, int argc, char **argv) {"no-dhcpv6", no_argument, &c->no_dhcpv6, 1 }, {"no-ndp", no_argument, &c->no_ndp, 1 }, {"no-ra", no_argument, &c->no_ra, 1 }, - {"no-map-gw", no_argument, &c->no_map_gw, 1 }, + {"no-map-gw", no_argument, &no_map_gw, 1 }, {"ipv4-only", no_argument, NULL, '4' }, {"ipv6-only", no_argument, NULL, '6' }, {"one-off", no_argument, NULL, '1' }, @@ -1230,6 +1289,8 @@ void conf(struct ctx *c, int argc, char **argv) {"no-copy-routes", no_argument, NULL, 
18 }, {"no-copy-addrs", no_argument, NULL, 19 }, {"netns-only", no_argument, NULL, 20 }, + {"map-host-loopback", required_argument, NULL, 21 }, + {"map-guest-addr", required_argument, NULL, 22 }, { 0 }, }; const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; @@ -1237,8 +1298,7 @@ void conf(struct ctx *c, int argc, char **argv) bool copy_addrs_opt = false, copy_routes_opt = false; enum fwd_ports_mode fwd_default = FWD_NONE; bool v4_only = false, v6_only = false; - struct in6_addr *dns6 = c->ip6.dns; - struct in_addr *dns4 = c->ip4.dns; + unsigned dns4_idx = 0, dns6_idx = 0; struct fqdn *dnss = c->dns_search; unsigned int ifi4 = 0, ifi6 = 0; const char *logfile = NULL; @@ -1260,8 +1320,9 @@ void conf(struct ctx *c, int argc, char **argv) c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET; c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET; + memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN); - optind = 1; + optind = 0; do { name = getopt_long(argc, argv, optstring, options, NULL); @@ -1290,7 +1351,7 @@ void conf(struct ctx *c, int argc, char **argv) if (c->mode != MODE_PASTA) die("--ns-mac-addr is for pasta mode only"); - parse_mac(c->mac_guest, optarg); + parse_mac(c->guest_mac, optarg); break; case 5: if (c->mode != MODE_PASTA) @@ -1399,6 +1460,14 @@ void conf(struct ctx *c, int argc, char **argv) netns_only = 1; *userns = 0; break; + case 21: + conf_nat(optarg, &c->ip4.map_host_loopback, + &c->ip6.map_host_loopback, &no_map_gw); + break; + case 22: + conf_nat(optarg, &c->ip4.map_guest_addr, + &c->ip6.map_guest_addr, NULL); + break; case 'd': c->debug = 1; c->quiet = 0; @@ -1501,21 +1570,21 @@ void conf(struct ctx *c, int argc, char **argv) break; case 'M': - parse_mac(c->mac, optarg); + parse_mac(c->our_tap_mac, optarg); break; case 'g': - if (inet_pton(AF_INET6, optarg, &c->ip6.gw) && - !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.gw) && - !IN6_IS_ADDR_LOOPBACK(&c->ip6.gw)) { + if (inet_pton(AF_INET6, optarg, &c->ip6.guest_gw) && + 
!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.guest_gw) && + !IN6_IS_ADDR_LOOPBACK(&c->ip6.guest_gw)) { if (c->mode == MODE_PASTA) c->ip6.no_copy_routes = true; break; } - if (inet_pton(AF_INET, optarg, &c->ip4.gw) && - !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.gw) && - !IN4_IS_ADDR_BROADCAST(&c->ip4.gw) && - !IN4_IS_ADDR_LOOPBACK(&c->ip4.gw)) { + if (inet_pton(AF_INET, optarg, &c->ip4.guest_gw) && + !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw) && + !IN4_IS_ADDR_BROADCAST(&c->ip4.guest_gw) && + !IN4_IS_ADDR_LOOPBACK(&c->ip4.guest_gw)) { if (c->mode == MODE_PASTA) c->ip4.no_copy_routes = true; break; @@ -1630,25 +1699,31 @@ void conf(struct ctx *c, int argc, char **argv) nl_sock_init(c, false); if (!v6_only) - c->ifi4 = conf_ip4(ifi4, &c->ip4, c->mac); + c->ifi4 = conf_ip4(ifi4, &c->ip4); if (!v4_only) - c->ifi6 = conf_ip6(ifi6, &c->ip6, c->mac); + c->ifi6 = conf_ip6(ifi6, &c->ip6); if ((!c->ifi4 && !c->ifi6) || (*c->ip4.ifname_out && !c->ifi4) || (*c->ip6.ifname_out && !c->ifi6)) die("External interface not usable"); - if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.gw)) - c->no_map_gw = c->no_dhcp = 1; + if (c->ifi4 && !no_map_gw && + IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) + c->ip4.map_host_loopback = c->ip4.guest_gw; - if (c->ifi6 && IN6_IS_ADDR_UNSPECIFIED(&c->ip6.gw)) - c->no_map_gw = 1; + if (c->ifi6 && !no_map_gw && + IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) + c->ip6.map_host_loopback = c->ip6.guest_gw; + + if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw)) + c->no_dhcp = 1; /* Inbound port options & DNS can be parsed now (after IPv4/IPv6 * settings) */ + fwd_probe_ephemeral(); udp_portmap_clear(); - optind = 1; + optind = 0; do { name = getopt_long(argc, argv, optstring, options, NULL); @@ -1663,13 +1738,13 @@ void conf(struct ctx *c, int argc, char **argv) if (!strcmp(optarg, "none")) { c->no_dns = 1; - dns4 = &c->ip4.dns[0]; + dns4_idx = 0; memset(c->ip4.dns, 0, sizeof(c->ip4.dns)); c->ip4.dns[0] = (struct in_addr){ 0 }; c->ip4.dns_match = (struct 
in_addr){ 0 }; c->ip4.dns_host = (struct in_addr){ 0 }; - dns6 = &c->ip6.dns[0]; + dns6_idx = 0; memset(c->ip6.dns, 0, sizeof(c->ip6.dns)); c->ip6.dns_match = (struct in6_addr){ 0 }; c->ip6.dns_host = (struct in6_addr){ 0 }; @@ -1679,15 +1754,13 @@ void conf(struct ctx *c, int argc, char **argv) c->no_dns = 0; - if (dns4 - &c->ip4.dns[0] < ARRAY_SIZE(c->ip4.dns) && - inet_pton(AF_INET, optarg, &dns4_tmp)) { - add_dns4(c, &dns4_tmp, &dns4); + if (inet_pton(AF_INET, optarg, &dns4_tmp)) { + dns4_idx += add_dns4(c, &dns4_tmp, dns4_idx); continue; } - if (dns6 - &c->ip6.dns[0] < ARRAY_SIZE(c->ip6.dns) && - inet_pton(AF_INET6, optarg, &dns6_tmp)) { - add_dns6(c, &dns6_tmp, &dns6); + if (inet_pton(AF_INET6, optarg, &dns6_tmp)) { + dns6_idx += add_dns6(c, &dns6_tmp, dns6_idx); continue; } @@ -1720,7 +1793,7 @@ void conf(struct ctx *c, int argc, char **argv) nl_sock_init(c, true); /* ...and outbound port options now that namespaces are set up. */ - optind = 1; + optind = 0; do { name = getopt_long(argc, argv, optstring, options, NULL); diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt index d245115..43fd63f 100644 --- a/contrib/apparmor/abstractions/passt +++ b/contrib/apparmor/abstractions/passt @@ -34,6 +34,8 @@ owner @{PROC}/@{pid}/uid_map r, # conf_ugid() + @{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral() + network netlink raw, # nl_sock_init_do(), netlink.c network inet stream, # tcp.c diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index bbb0917..80bf780 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -50,6 +50,7 @@ require { type passwd_file_t; class netlink_route_socket { bind create nlmsg_read }; + type sysctl_net_t; class capability { sys_tty_config setuid setgid }; class cap_userns { setpcap sys_admin sys_ptrace }; @@ -104,6 +105,8 @@ allow passt_t net_conf_t:lnk_file read; allow passt_t tmp_t:sock_file { create unlink write }; allow passt_t self:netlink_route_socket 
{ bind create nlmsg_read read write setopt }; kernel_search_network_sysctl(passt_t) +allow passt_t sysctl_net_t:dir search; +allow passt_t sysctl_net_t:file { open read }; corenet_tcp_bind_all_nodes(passt_t) corenet_udp_bind_all_nodes(passt_t) diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te index 4e36c3f..310383c 100644 --- a/contrib/selinux/pasta.te +++ b/contrib/selinux/pasta.te @@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch }; allow pasta_t self:tun_socket create; allow pasta_t tun_tap_device_t:chr_file { ioctl open read write }; allow pasta_t sysctl_net_t:dir search; -allow pasta_t sysctl_net_t:file { open write }; +allow pasta_t sysctl_net_t:file { open read write }; allow pasta_t kernel_t:system module_request; allow pasta_t nsfs_t:file read; @@ -276,6 +276,7 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len) int dhcp(const struct ctx *c, const struct pool *p) { size_t mlen, dlen, offset = 0, opt_len, opt_off = 0; + char macstr[ETH_ADDRSTRLEN]; const struct ethhdr *eh; const struct iphdr *iph; const struct udphdr *uh; @@ -340,26 +341,26 @@ int dhcp(const struct ctx *c, const struct pool *p) return -1; } - info(" from %02x:%02x:%02x:%02x:%02x:%02x", - m->chaddr[0], m->chaddr[1], m->chaddr[2], - m->chaddr[3], m->chaddr[4], m->chaddr[5]); + info(" from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr))); m->yiaddr = c->ip4.addr; mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len)); - memcpy(opts[1].s, &mask, sizeof(mask)); - memcpy(opts[3].s, &c->ip4.gw, sizeof(c->ip4.gw)); - memcpy(opts[54].s, &c->ip4.gw, sizeof(c->ip4.gw)); + memcpy(opts[1].s, &mask, sizeof(mask)); + memcpy(opts[3].s, &c->ip4.guest_gw, sizeof(c->ip4.guest_gw)); + memcpy(opts[54].s, &c->ip4.our_tap_addr, sizeof(c->ip4.our_tap_addr)); /* If the gateway is not on the assigned subnet, send an option 121 * (Classless Static Routing) adding a dummy route to it. 
*/ if ((c->ip4.addr.s_addr & mask.s_addr) - != (c->ip4.gw.s_addr & mask.s_addr)) { + != (c->ip4.guest_gw.s_addr & mask.s_addr)) { /* a.b.c.d/32:0.0.0.0, 0:a.b.c.d */ opts[121].slen = 14; opts[121].s[0] = 32; - memcpy(opts[121].s + 1, &c->ip4.gw, sizeof(c->ip4.gw)); - memcpy(opts[121].s + 10, &c->ip4.gw, sizeof(c->ip4.gw)); + memcpy(opts[121].s + 1, + &c->ip4.guest_gw, sizeof(c->ip4.guest_gw)); + memcpy(opts[121].s + 10, + &c->ip4.guest_gw, sizeof(c->ip4.guest_gw)); } if (c->mtu != -1) { @@ -378,7 +379,7 @@ int dhcp(const struct ctx *c, const struct pool *p) opt_set_dns_search(c, sizeof(m->o)); dlen = offsetof(struct msg, o) + fill(m); - tap_udp4_send(c, c->ip4.gw, 67, c->ip4.addr, 68, m, dlen); + tap_udp4_send(c, c->ip4.our_tap_addr, 67, c->ip4.addr, 68, m, dlen); return 1; } @@ -298,7 +298,8 @@ static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p, { char buf[INET6_ADDRSTRLEN]; struct in6_addr req_addr; - struct opt_hdr *ia, *h; + const struct opt_hdr *h; + struct opt_hdr *ia; size_t offset; int ia_type; @@ -312,12 +313,13 @@ ia_ta: offset += sizeof(struct opt_ia_na); while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) { - struct opt_ia_addr *opt_addr = (struct opt_ia_addr *)h; + const struct opt_ia_addr *opt_addr; if (ntohs(h->l) != OPT_VSIZE(ia_addr)) return NULL; - memcpy(&req_addr, &opt_addr->addr, sizeof(req_addr)); + opt_addr = (const struct opt_ia_addr *)h; + req_addr = opt_addr->addr; if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) { info("DHCPv6: requested address %s not on link", inet_ntop(AF_INET6, &req_addr, @@ -363,7 +365,7 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset) srv->hdr.l = 0; } - memcpy(&srv->addr[i], &c->ip6.dns[i], sizeof(srv->addr[i])); + srv->addr[i] = c->ip6.dns[i]; srv->hdr.l += sizeof(srv->addr[i]); offset += sizeof(srv->addr[i]); } @@ -451,10 +453,7 @@ int dhcpv6(struct ctx *c, const struct pool *p, c->ip6.addr_ll_seen = *saddr; - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - src = &c->ip6.gw; - else - src = 
&c->ip6.addr_ll; + src = &c->ip6.our_tap_ll; mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL); if (!mh) @@ -574,8 +573,10 @@ void dhcpv6_init(const struct ctx *c) resp.server_id.duid_time = duid_time; resp_not_on_link.server_id.duid_time = duid_time; - memcpy(resp.server_id.duid_lladdr, c->mac, sizeof(c->mac)); - memcpy(resp_not_on_link.server_id.duid_lladdr, c->mac, sizeof(c->mac)); + memcpy(resp.server_id.duid_lladdr, + c->our_tap_mac, sizeof(c->our_tap_mac)); + memcpy(resp_not_on_link.server_id.duid_lladdr, + c->our_tap_mac, sizeof(c->our_tap_mac)); resp.ia_addr.addr = c->ip6.addr; } @@ -127,18 +127,18 @@ static struct timespec flow_timer_run; * @af: Address family (AF_INET or AF_INET6) * @eaddr: Endpoint address (pointer to in_addr or in6_addr) * @eport: Endpoint port - * @faddr: Forwarding address (pointer to in_addr or in6_addr) - * @fport: Forwarding port + * @oaddr: Our address (pointer to in_addr or in6_addr) + * @oport: Our port */ static void flowside_from_af(struct flowside *side, sa_family_t af, const void *eaddr, in_port_t eport, - const void *faddr, in_port_t fport) + const void *oaddr, in_port_t oport) { - if (faddr) - inany_from_af(&side->faddr, af, faddr); + if (oaddr) + inany_from_af(&side->oaddr, af, oaddr); else - side->faddr = inany_any6; - side->fport = fport; + side->oaddr = inany_any6; + side->oport = oport; if (eaddr) inany_from_af(&side->eaddr, af, eaddr); @@ -193,8 +193,8 @@ static int flowside_sock_splice(void *arg) * @tgt: Target flowside * @data: epoll reference portion for protocol handlers * - * Return: socket fd of protocol @proto bound to the forwarding address and port - * from @tgt (if specified). + * Return: socket fd of protocol @proto bound to our address and port from @tgt + * (if specified). 
*/ int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, const struct flowside *tgt, uint32_t data) @@ -205,11 +205,11 @@ int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, ASSERT(pif_is_socket(pif)); - pif_sockaddr(c, &sa, &sl, pif, &tgt->faddr, tgt->fport); + pif_sockaddr(c, &sa, &sl, pif, &tgt->oaddr, tgt->oport); switch (pif) { case PIF_HOST: - if (inany_is_loopback(&tgt->faddr)) + if (inany_is_loopback(&tgt->oaddr)) ifname = NULL; else if (sa.sa_family == AF_INET) ifname = c->ip4.ifname_out; @@ -283,46 +283,60 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) "Flow %u (%s): %s", flow_idx(f), type_or_state, msg); } -/** - * flow_set_state() - Change flow's state - * @f: Flow changing state - * @state: New state +/** flow_log_details_() - Log the details of a flow + * @f: flow to log + * @pri: Log priority + * @state: State to log details according to + * + * Logs the details of the flow: endpoints, interfaces, type etc. 
*/ -static void flow_set_state(struct flow_common *f, enum flow_state state) +void flow_log_details_(const struct flow_common *f, int pri, + enum flow_state state) { char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN]; char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN]; const struct flowside *ini = &f->side[INISIDE]; const struct flowside *tgt = &f->side[TGTSIDE]; - uint8_t oldstate = f->state; - - ASSERT(state < FLOW_NUM_STATES); - ASSERT(oldstate < FLOW_NUM_STATES); - f->state = state; - flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], - FLOW_STATE(f)); - - if (MAX(state, oldstate) >= FLOW_STATE_TGT) - flow_log_(f, LOG_DEBUG, + if (state >= FLOW_STATE_TGT) + flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu", pif_name(f->pif[INISIDE]), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), ini->eport, - inany_ntop(&ini->faddr, fstr0, sizeof(fstr0)), - ini->fport, + inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)), + ini->oport, pif_name(f->pif[TGTSIDE]), - inany_ntop(&tgt->faddr, fstr1, sizeof(fstr1)), - tgt->fport, + inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)), + tgt->oport, inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)), tgt->eport); - else if (MAX(state, oldstate) >= FLOW_STATE_INI) - flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => ?", + else if (state >= FLOW_STATE_INI) + flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?", pif_name(f->pif[INISIDE]), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), ini->eport, - inany_ntop(&ini->faddr, fstr0, sizeof(fstr0)), - ini->fport); + inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)), + ini->oport); +} + +/** + * flow_set_state() - Change flow's state + * @f: Flow changing state + * @state: New state + */ +static void flow_set_state(struct flow_common *f, enum flow_state state) +{ + uint8_t oldstate = f->state; + + ASSERT(state < FLOW_NUM_STATES); + ASSERT(oldstate < FLOW_NUM_STATES); + + f->state = state; + flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], + FLOW_STATE(f)); + + 
flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate)); } /** @@ -347,7 +361,7 @@ static void flow_initiate_(union flow *flow, uint8_t pif) * flow_initiate_af() - Move flow to INI, setting INISIDE details * @flow: Flow to change state * @pif: pif of the initiating side - * @af: Address family of @eaddr and @faddr + * @af: Address family of @saddr and @daddr * @saddr: Source address (pointer to in_addr or in6_addr) * @sport: Endpoint port * @daddr: Destination address (pointer to in_addr or in6_addr) @@ -384,10 +398,10 @@ const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa); if (inany_v4(&ini->eaddr)) - ini->faddr = inany_any4; + ini->oaddr = inany_any4; else - ini->faddr = inany_any6; - ini->fport = dport; + ini->oaddr = inany_any6; + ini->oport = dport; flow_initiate_(flow, pif); return ini; } @@ -432,8 +446,8 @@ const struct flowside *flow_target(const struct ctx *c, union flow *flow, pif_name(f->pif[INISIDE]), inany_ntop(&ini->eaddr, estr, sizeof(estr)), ini->eport, - inany_ntop(&ini->faddr, fstr, sizeof(fstr)), - ini->fport); + inany_ntop(&ini->oaddr, fstr, sizeof(fstr)), + ini->oport); } if (tgtpif == PIF_NONE) @@ -561,18 +575,12 @@ static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif, { struct siphash_state state = SIPHASH_INIT(c->hash_secret); - /* For the hash table to work, we need complete endpoint information, - * and at least a forwarding port. 
- */ - ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) && - side->eport != 0 && side->fport != 0); - - inany_siphash_feed(&state, &side->faddr); + inany_siphash_feed(&state, &side->oaddr); inany_siphash_feed(&state, &side->eaddr); return siphash_final(&state, 38, (uint64_t)proto << 40 | (uint64_t)pif << 32 | - (uint64_t)side->fport << 16 | + (uint64_t)side->oport << 16 | (uint64_t)side->eport); } @@ -586,8 +594,16 @@ static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif, static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx) { const struct flow_common *f = &flow_at_sidx(sidx)->f; - return flow_hash(c, FLOW_PROTO(f), - f->pif[sidx.sidei], &f->side[sidx.sidei]); + const struct flowside *side = &f->side[sidx.sidei]; + uint8_t pif = f->pif[sidx.sidei]; + + /* For the hash table to work, entries must have complete endpoint + * information, and at least a forwarding port. + */ + ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) && + side->eport != 0 && side->oport != 0); + + return flow_hash(c, FLOW_PROTO(f), pif, side); } /** @@ -695,7 +711,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto, !(FLOW_PROTO(&flow->f) == proto && flow->f.pif[sidx.sidei] == pif && flowside_eq(&flow->f.side[sidx.sidei], side))) - b = (b + 1) % FLOW_HASH_SIZE; + b = mod_sub(b, 1, FLOW_HASH_SIZE); return flow_hashtab[b]; } @@ -707,20 +723,20 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto, * @pif: Interface of the flow * @af: Address family, AF_INET or AF_INET6 * @eaddr: Guest side endpoint address (guest local address) - * @faddr: Guest side forwarding address (guest remote address) + * @oaddr: Our guest side address (guest remote address) * @eport: Guest side endpoint port (guest local port) - * @fport: Guest side forwarding port (guest remote port) + * @oport: Our guest side port (guest remote port) * * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found */ 
flow_sidx_t flow_lookup_af(const struct ctx *c, uint8_t proto, uint8_t pif, sa_family_t af, - const void *eaddr, const void *faddr, - in_port_t eport, in_port_t fport) + const void *eaddr, const void *oaddr, + in_port_t eport, in_port_t oport) { struct flowside side; - flowside_from_af(&side, af, eaddr, eport, faddr, fport); + flowside_from_af(&side, af, eaddr, eport, oaddr, oport); return flowside_lookup(c, proto, pif, &side); } @@ -730,22 +746,22 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, * @proto: Protocol of the flow (IP L4 protocol number) * @pif: Interface of the flow * @esa: Socket address of the endpoint - * @fport: Forwarding port number + * @oport: Our port number * * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found */ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, - const void *esa, in_port_t fport) + const void *esa, in_port_t oport) { struct flowside side = { - .fport = fport, + .oport = oport, }; inany_from_sockaddr(&side.eaddr, &side.eport, esa); if (inany_v4(&side.eaddr)) - side.faddr = inany_any4; + side.oaddr = inany_any4; else - side.faddr = inany_any6; + side.oaddr = inany_any6; return flowside_lookup(c, proto, pif, &side); } @@ -830,7 +846,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) closed = icmp_ping_timer(c, &flow->ping, now); break; case FLOW_UDP: - if (timer) + closed = udp_flow_defer(&flow->udp); + if (!closed && timer) closed = udp_flow_timer(c, &flow->udp, now); break; default: @@ -140,14 +140,14 @@ extern const uint8_t flow_proto[]; /** * struct flowside - Address information for one side of a flow * @eaddr: Endpoint address (remote address from passt's PoV) - * @faddr: Forwarding address (local address from passt's PoV) + * @oaddr: Our address (local address from passt's PoV) * @eport: Endpoint port - * @fport: Forwarding port + * @oport: Our port */ struct flowside { - union inany_addr faddr; + union inany_addr oaddr; union inany_addr eaddr; - 
in_port_t fport; + in_port_t oport; in_port_t eport; }; @@ -162,8 +162,8 @@ static inline bool flowside_eq(const struct flowside *left, { return inany_equals(&left->eaddr, &right->eaddr) && left->eport == right->eport && - inany_equals(&left->faddr, &right->faddr) && - left->fport == right->fport; + inany_equals(&left->oaddr, &right->oaddr) && + left->oport == right->oport; } int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, @@ -240,10 +240,10 @@ uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx); void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx); flow_sidx_t flow_lookup_af(const struct ctx *c, uint8_t proto, uint8_t pif, sa_family_t af, - const void *eaddr, const void *faddr, - in_port_t eport, in_port_t fport); + const void *eaddr, const void *oaddr, + in_port_t eport, in_port_t oport); flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, - const void *esa, in_port_t fport); + const void *esa, in_port_t oport); union flow; @@ -264,4 +264,11 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) flow_dbg((f), __VA_ARGS__); \ } while (0) +void flow_log_details_(const struct flow_common *f, int pri, + enum flow_state state); +#define flow_log_details(f_, pri) \ + flow_log_details_(&((f_)->f), (pri), (f_)->f.state) +#define flow_dbg_details(f_) flow_log_details((f_), LOG_DEBUG) +#define flow_err_details(f_) flow_log_details((f_), LOG_ERR) + #endif /* FLOW_H */ @@ -27,6 +27,80 @@ #include "lineread.h" #include "flow_table.h" +/* Ephemeral port range: values from RFC 6335 */ +static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14); +static in_port_t fwd_ephemeral_max = NUM_PORTS - 1; + +#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range" + +/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral + * + * Work out what ports the host thinks are ephemeral and record it for later + * use by fwd_port_is_ephemeral(). 
If we're unable to probe, assume the range + * recommended by RFC 6335. + */ +void fwd_probe_ephemeral(void) +{ + char *line, *tab, *end; + struct lineread lr; + long min, max; + ssize_t len; + int fd; + + fd = open(PORT_RANGE_SYSCTL, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + warn_perror("Unable to open %s", PORT_RANGE_SYSCTL); + return; + } + + lineread_init(&lr, fd); + len = lineread_get(&lr, &line); + close(fd); + + if (len < 0) + goto parse_err; + + tab = strchr(line, '\t'); + if (!tab) + goto parse_err; + *tab = '\0'; + + errno = 0; + min = strtol(line, &end, 10); + if (*end || errno) + goto parse_err; + + errno = 0; + max = strtol(tab + 1, &end, 10); + if (*end || errno) + goto parse_err; + + if (min < 0 || min >= NUM_PORTS || + max < 0 || max >= NUM_PORTS) + goto parse_err; + + fwd_ephemeral_min = min; + fwd_ephemeral_max = max; + + return; + +parse_err: + warn("Unable to parse %s", PORT_RANGE_SYSCTL); +} + +/** + * fwd_port_is_ephemeral() - Is port number ephemeral? + * @port: Port number + * + * Return: true if @port is ephemeral, that is may be allocated by the kernel as + * a local port for outgoing connections or datagrams, but should not be + * used for binding services to. 
+ */ +bool fwd_port_is_ephemeral(in_port_t port) +{ + return (port >= fwd_ephemeral_min) && (port <= fwd_ephemeral_max); +} + /* See enum in kernel's include/net/tcp_states.h */ #define UDP_LISTEN 0x07 #define TCP_LISTEN 0x0a @@ -167,7 +241,86 @@ void fwd_scan_ports_init(struct ctx *c) static bool is_dns_flow(uint8_t proto, const struct flowside *ini) { return ((proto == IPPROTO_UDP) || (proto == IPPROTO_TCP)) && - ((ini->fport == 53) || (ini->fport == 853)); + ((ini->oport == 53) || (ini->oport == 853)); +} + +/** + * fwd_guest_accessible4() - Is IPv4 address guest-accessible + * @c: Execution context + * @addr: Host visible IPv4 address + * + * Return: true if @addr on the host is accessible to the guest without + * translation, false otherwise + */ +static bool fwd_guest_accessible4(const struct ctx *c, + const struct in_addr *addr) +{ + if (IN4_IS_ADDR_LOOPBACK(addr)) + return false; + + /* In socket interfaces 0.0.0.0 generally means "any" or unspecified, + * however on the wire it can mean "this host on this network". Since + * that has a different meaning for host and guest, we can't let it + * through untranslated. + */ + if (IN4_IS_ADDR_UNSPECIFIED(addr)) + return false; + + /* For IPv4, addr_seen is initialised to addr, so is always a valid + * address + */ + if (IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr) || + IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr_seen)) + return false; + + return true; +} + +/** + * fwd_guest_accessible6() - Is IPv6 address guest-accessible + * @c: Execution context + * @addr: Host visible IPv6 address + * + * Return: true if @addr on the host is accessible to the guest without + * translation, false otherwise + */ +static bool fwd_guest_accessible6(const struct ctx *c, + const struct in6_addr *addr) +{ + if (IN6_IS_ADDR_LOOPBACK(addr)) + return false; + + if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr)) + return false; + + /* For IPv6, addr_seen starts unspecified, because we don't know what LL + * address the guest will take until we see it. 
Only check against it + * if it has been set to a real address. + */ + if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen) && + IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr_seen)) + return false; + + return true; +} + +/** + * fwd_guest_accessible() - Is IPv[46] address guest-accessible + * @c: Execution context + * @addr: Host visible IPv[46] address + * + * Return: true if @addr on the host is accessible to the guest without + * translation, false otherwise + */ +static bool fwd_guest_accessible(const struct ctx *c, + const union inany_addr *addr) +{ + const struct in_addr *a4 = inany_v4(addr); + + if (a4) + return fwd_guest_accessible4(c, a4); + + return fwd_guest_accessible6(c, &addr->a6); } /** @@ -184,33 +337,37 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, const struct flowside *ini, struct flowside *tgt) { if (is_dns_flow(proto, ini) && - inany_equals4(&ini->faddr, &c->ip4.dns_match)) + inany_equals4(&ini->oaddr, &c->ip4.dns_match)) tgt->eaddr = inany_from_v4(c->ip4.dns_host); else if (is_dns_flow(proto, ini) && - inany_equals6(&ini->faddr, &c->ip6.dns_match)) + inany_equals6(&ini->oaddr, &c->ip6.dns_match)) tgt->eaddr.a6 = c->ip6.dns_host; - else if (!c->no_map_gw && inany_equals4(&ini->faddr, &c->ip4.gw)) + else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback)) tgt->eaddr = inany_loopback4; - else if (!c->no_map_gw && inany_equals6(&ini->faddr, &c->ip6.gw)) + else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback)) tgt->eaddr = inany_loopback6; + else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr)) + tgt->eaddr = inany_from_v4(c->ip4.addr); + else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr)) + tgt->eaddr.a6 = c->ip6.addr; else - tgt->eaddr = ini->faddr; + tgt->eaddr = ini->oaddr; - tgt->eport = ini->fport; + tgt->eport = ini->oport; /* The relevant addr_out controls the host side source address. This * may be unspecified, which allows the kernel to pick an address. 
*/ if (inany_v4(&tgt->eaddr)) - tgt->faddr = inany_from_v4(c->ip4.addr_out); + tgt->oaddr = inany_from_v4(c->ip4.addr_out); else - tgt->faddr.a6 = c->ip6.addr_out; + tgt->oaddr.a6 = c->ip6.addr_out; /* Let the kernel pick a host side source port */ - tgt->fport = 0; + tgt->oport = 0; if (proto == IPPROTO_UDP) { /* But for UDP we preserve the source port */ - tgt->fport = ini->eport; + tgt->oport = ini->eport; } return PIF_HOST; @@ -230,13 +387,13 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, const struct flowside *ini, struct flowside *tgt) { if (!inany_is_loopback(&ini->eaddr) || - (!inany_is_loopback(&ini->faddr) && !inany_is_unspecified(&ini->faddr))) { + (!inany_is_loopback(&ini->oaddr) && !inany_is_unspecified(&ini->oaddr))) { char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN]; debug("Non loopback address on %s: [%s]:%hu -> [%s]:%hu", pif_name(PIF_SPLICE), inany_ntop(&ini->eaddr, estr, sizeof(estr)), ini->eport, - inany_ntop(&ini->faddr, fstr, sizeof(fstr)), ini->fport); + inany_ntop(&ini->oaddr, fstr, sizeof(fstr)), ini->oport); return PIF_NONE; } @@ -248,20 +405,20 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, /* Preserve the specific loopback adddress used, but let the kernel pick * a source port on the target side */ - tgt->faddr = ini->eaddr; - tgt->fport = 0; + tgt->oaddr = ini->eaddr; + tgt->oport = 0; - tgt->eport = ini->fport; + tgt->eport = ini->oport; if (proto == IPPROTO_TCP) tgt->eport += c->tcp.fwd_out.delta[tgt->eport]; else if (proto == IPPROTO_UDP) tgt->eport += c->udp.fwd_out.delta[tgt->eport]; /* Let the kernel pick a host side source port */ - tgt->fport = 0; + tgt->oport = 0; if (proto == IPPROTO_UDP) /* But for UDP preserve the source port */ - tgt->fport = ini->eport; + tgt->oport = ini->eport; return PIF_HOST; } @@ -280,7 +437,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, const struct flowside *ini, struct flowside *tgt) { /* Common for spliced and non-spliced cases */ - 
tgt->eport = ini->fport; + tgt->eport = ini->oport; if (proto == IPPROTO_TCP) tgt->eport += c->tcp.fwd_in.delta[tgt->eport]; else if (proto == IPPROTO_UDP) @@ -293,11 +450,11 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, /* Preserve the specific loopback adddress used, but let the * kernel pick a source port on the target side */ - tgt->faddr = ini->eaddr; - tgt->fport = 0; + tgt->oaddr = ini->eaddr; + tgt->oport = 0; if (proto == IPPROTO_UDP) /* But for UDP preserve the source port */ - tgt->fport = ini->eport; + tgt->oport = ini->eport; if (inany_v4(&ini->eaddr)) tgt->eaddr = inany_loopback4; @@ -307,26 +464,37 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, return PIF_SPLICE; } - tgt->faddr = ini->eaddr; - tgt->fport = ini->eport; - - if (inany_is_loopback4(&tgt->faddr) || - inany_is_unspecified4(&tgt->faddr) || - inany_equals4(&tgt->faddr, &c->ip4.addr_seen)) { - tgt->faddr = inany_from_v4(c->ip4.gw); - } else if (inany_is_loopback6(&tgt->faddr) || - inany_equals6(&tgt->faddr, &c->ip6.addr_seen) || - inany_equals6(&tgt->faddr, &c->ip6.addr)) { - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - tgt->faddr.a6 = c->ip6.gw; - else - tgt->faddr.a6 = c->ip6.addr_ll; + if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) && + inany_equals4(&ini->eaddr, &in4addr_loopback)) { + /* Specifically 127.0.0.1, not 127.0.0.0/8 */ + tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) && + inany_equals6(&ini->eaddr, &in6addr_loopback)) { + tgt->oaddr.a6 = c->ip6.map_host_loopback; + } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) && + inany_equals4(&ini->eaddr, &c->ip4.addr)) { + tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) && + inany_equals6(&ini->eaddr, &c->ip6.addr)) { + tgt->oaddr.a6 = c->ip6.map_guest_addr; + } else if (!fwd_guest_accessible(c, &ini->eaddr)) { + if (inany_v4(&ini->eaddr)) { + if 
(IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr)) + /* No source address we can use */ + return PIF_NONE; + tgt->oaddr = inany_from_v4(c->ip4.our_tap_addr); + } else { + tgt->oaddr.a6 = c->ip6.our_tap_ll; + } + } else { + tgt->oaddr = ini->eaddr; } + tgt->oport = ini->eport; - if (inany_v4(&tgt->faddr)) { + if (inany_v4(&tgt->oaddr)) { tgt->eaddr = inany_from_v4(c->ip4.addr_seen); } else { - if (inany_is_linklocal6(&tgt->faddr)) + if (inany_is_linklocal6(&tgt->oaddr)) tgt->eaddr.a6 = c->ip6.addr_ll_seen; else tgt->eaddr.a6 = c->ip6.addr_seen; @@ -12,6 +12,9 @@ struct flowside; /* Number of ports for both TCP and UDP */ #define NUM_PORTS (1U << 16) +void fwd_probe_ephemeral(void); +bool fwd_port_is_ephemeral(in_port_t port); + enum fwd_ports_mode { FWD_UNSET = 0, FWD_SPEC = 1, @@ -125,13 +125,13 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) ini->eport, seq); if (pingf->f.type == FLOW_PING4) { - const struct in_addr *saddr = inany_v4(&ini->faddr); + const struct in_addr *saddr = inany_v4(&ini->oaddr); const struct in_addr *daddr = inany_v4(&ini->eaddr); ASSERT(saddr && daddr); /* Must have IPv4 addresses */ tap_icmp4_send(c, *saddr, *daddr, buf, n); } else if (pingf->f.type == FLOW_PING6) { - const struct in6_addr *saddr = &ini->faddr.a6; + const struct in6_addr *saddr = &ini->oaddr.a6; const struct in6_addr *daddr = &ini->eaddr.a6; tap_icmp6_send(c, saddr, daddr, buf, n); @@ -98,7 +98,7 @@ const char *logfile_prefix[] = { * @fd: Log file descriptor * @now: Current timestamp * - * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek + * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek i686:_llseek */ static void logfile_rotate_fallocate(int fd, const struct timespec *now) { @@ -224,19 +224,23 @@ static int logfile_rotate(int fd, const struct timespec *now) /** * logfile_write() - Write entry to log file, trigger rotation if full * @newline: Append newline at the end of the message, if missing + * @cont: Continuation of a previous 
message, on the same line * @pri: Facility and level map, same as priority for vsyslog() * @now: Timestamp * @format: Same as vsyslog() format * @ap: Same as vsyslog() ap */ -static void logfile_write(bool newline, int pri, const struct timespec *now, +static void logfile_write(bool newline, bool cont, int pri, + const struct timespec *now, const char *format, va_list ap) { char buf[BUFSIZ]; - int n; + int n = 0; - n = logtime_fmt(buf, BUFSIZ, now); - n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]); + if (!cont) { + n += logtime_fmt(buf, BUFSIZ, now); + n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]); + } n += vsnprintf(buf + n, BUFSIZ - n, format, ap); @@ -278,7 +282,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap) va_copy(ap2, ap); /* Don't clobber ap, we need it again */ if (log_file != -1) - logfile_write(newline, pri, now, format, ap2); + logfile_write(newline, cont, pri, now, format, ap2); else if (!(log_mask & LOG_MASK(LOG_DEBUG))) passt_vsyslog(newline, pri, format, ap2); @@ -247,7 +247,7 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr, memcpy(&na.target_addr, &ns->target_addr, sizeof(na.target_addr)); - memcpy(na.target_l2_addr.mac, c->mac, ETH_ALEN); + memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN); } else if (ih->icmp6_type == RS) { size_t dns_s_len = 0; @@ -331,7 +331,7 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr, } dns_done: - memcpy(&ra.source_ll.mac, c->mac, ETH_ALEN); + memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN); } else { return 1; } @@ -341,10 +341,7 @@ dns_done: else c->ip6.addr_seen = *saddr; - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - rsaddr = &c->ip6.gw; - else - rsaddr = &c->ip6.addr_ll; + rsaddr = &c->ip6.our_tap_ll; if (ih->icmp6_type == NS) { dlen = sizeof(struct ndp_na); @@ -674,6 +674,63 @@ int nl_route_dup(int s_src, unsigned int ifi_src, } /** + * nl_addr_set_ll_nodad() - Set 
IFA_F_NODAD on IPv6 link-local addresses + * @s: Netlink socket + * @ifi: Interface index in target namespace + * + * Return: 0 on success, negative error code on failure + */ +int nl_addr_set_ll_nodad(int s, unsigned int ifi) +{ + struct req_t { + struct nlmsghdr nlh; + struct ifaddrmsg ifa; + } req = { + .ifa.ifa_family = AF_INET6, + .ifa.ifa_index = ifi, + }; + uint32_t seq, last_seq = 0; + ssize_t status, ret = 0; + struct nlmsghdr *nh; + char buf[NLBUFSIZ]; + + seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req)); + nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) { + struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh); + struct rtattr *rta; + size_t na; + + if (ifa->ifa_index != ifi || ifa->ifa_scope != RT_SCOPE_LINK) + continue; + + ifa->ifa_flags |= IFA_F_NODAD; + + for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na); + rta = RTA_NEXT(rta, na)) { + /* If 32-bit flags are used, add IFA_F_NODAD there */ + if (rta->rta_type == IFA_FLAGS) + *(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD; + } + + last_seq = nl_send(s, nh, RTM_NEWADDR, NLM_F_REPLACE, + nh->nlmsg_len); + } + + if (status < 0) + ret = status; + + for (seq = seq + 1; seq <= last_seq; seq++) { + nl_foreach(nh, status, s, buf, seq) + warn("netlink: Unexpected response message"); + + if (!ret && status < 0) + ret = status; + } + + return ret; +} + +/** * nl_addr_get() - Get most specific global address, given interface and family * @s: Netlink socket * @ifi: Interface index in outer network namespace @@ -682,7 +739,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src, * @prefix_len: Mask or prefix length, to fill (for IPv4) * @addr_l: Link-scoped address to fill (for IPv6) * - * Return: 9 on success, negative error code on failure + * Return: 0 on success, negative error code on failure */ int nl_addr_get(int s, unsigned int ifi, sa_family_t af, void *addr, int *prefix_len, void *addr_l) @@ -740,7 +797,54 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af, } /** - * 
nl_add_set() - Set IP addresses for given interface and address family + * nl_addr_get_ll() - Get first IPv6 link-local address for a given interface + * @s: Netlink socket + * @ifi: Interface index in outer network namespace + * @addr: Link-local address to fill + * + * Return: 0 on success, negative error code on failure + */ +int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr) +{ + struct req_t { + struct nlmsghdr nlh; + struct ifaddrmsg ifa; + } req = { + .ifa.ifa_family = AF_INET6, + .ifa.ifa_index = ifi, + }; + struct nlmsghdr *nh; + bool found = false; + char buf[NLBUFSIZ]; + ssize_t status; + uint32_t seq; + + seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req)); + nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) { + struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh); + struct rtattr *rta; + size_t na; + + if (ifa->ifa_index != ifi || ifa->ifa_scope != RT_SCOPE_LINK || + found) + continue; + + for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na); + rta = RTA_NEXT(rta, na)) { + if (rta->rta_type != IFA_ADDRESS) + continue; + + if (!found) { + memcpy(addr, RTA_DATA(rta), RTA_PAYLOAD(rta)); + found = true; + } + } + } + return status; +} + +/** + * nl_addr_set() - Set IP addresses for given interface and address family * @s: Netlink socket * @ifi: Interface index * @af: Address family @@ -942,14 +1046,14 @@ int nl_link_set_mac(int s, unsigned int ifi, const void *mac) } /** - * nl_link_up() - Bring link up + * nl_link_set_mtu() - Set link MTU * @s: Netlink socket * @ifi: Interface index - * @mtu: If non-zero, set interface MTU + * @mtu: Interface MTU * * Return: 0 on success, negative error code on failure */ -int nl_link_up(int s, unsigned int ifi, int mtu) +int nl_link_set_mtu(int s, unsigned int ifi, int mtu) { struct req_t { struct nlmsghdr nlh; @@ -959,17 +1063,35 @@ int nl_link_up(int s, unsigned int ifi, int mtu) } req = { .ifm.ifi_family = AF_UNSPEC, .ifm.ifi_index = ifi, - .ifm.ifi_flags = IFF_UP, - 
.ifm.ifi_change = IFF_UP, .rta.rta_type = IFLA_MTU, .rta.rta_len = RTA_LENGTH(sizeof(unsigned int)), .mtu = mtu, }; - ssize_t len = sizeof(req); - if (!mtu) - /* Shorten request to drop MTU attribute */ - len = offsetof(struct req_t, rta); + return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req)); +} + +/** + * nl_link_set_flags() - Set link flags + * @s: Netlink socket + * @ifi: Interface index + * @set: Device flags to set + * @change: Mask of device flag changes + * + * Return: 0 on success, negative error code on failure + */ +int nl_link_set_flags(int s, unsigned int ifi, + unsigned int set, unsigned int change) +{ + struct req_t { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + } req = { + .ifm.ifi_family = AF_UNSPEC, + .ifm.ifi_index = ifi, + .ifm.ifi_flags = set, + .ifm.ifi_change = change, + }; - return nl_do(s, &req, RTM_NEWLINK, 0, len); + return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req)); } @@ -19,10 +19,14 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af, void *addr, int *prefix_len, void *addr_l); int nl_addr_set(int s, unsigned int ifi, sa_family_t af, const void *addr, int prefix_len); +int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr); +int nl_addr_set_ll_nodad(int s, unsigned int ifi); int nl_addr_dup(int s_src, unsigned int ifi_src, int s_dst, unsigned int ifi_dst, sa_family_t af); int nl_link_get_mac(int s, unsigned int ifi, void *mac); int nl_link_set_mac(int s, unsigned int ifi, const void *mac); -int nl_link_up(int s, unsigned int ifi, int mtu); +int nl_link_set_mtu(int s, unsigned int ifi, int mtu); +int nl_link_set_flags(int s, unsigned int ifi, + unsigned int set, unsigned int change); #endif /* NETLINK_H */ @@ -236,11 +236,15 @@ interface will be chosen instead. .TP .BR \-D ", " \-\-dns " " \fIaddr -Use \fIaddr\fR (IPv4 or IPv6) for DHCP, DHCPv6, NDP or DNS forwarding, as -configured (see options \fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR, -\fB--dns-forward\fR) instead of reading addresses from \fI/etc/resolv.conf\fR. 
-This option can be specified multiple times. Specifying \fB-D none\fR disables -usage of DNS addresses altogether. +Instruct the guest (via DHCP, DHCPv6 or NDP) to use \fIaddr\fR (IPv4 +or IPv6) as a nameserver, as configured (see options +\fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR) instead of reading addresses +from \fI/etc/resolv.conf\fR. This option can be specified multiple +times. Specifying \fB-D none\fR disables usage of DNS addresses +altogether. Unlike addresses from \fI/etc/resolv.conf\fR, \fIaddr\fR +is given to the guest without remapping. For example \fB--dns +127.0.0.1\fR will instruct the guest to use itself as nameserver, not +the host. .TP .BR \-\-dns-forward " " \fIaddr @@ -324,6 +328,20 @@ Disable Router Advertisements. Router Solicitations coming from guest or target namespace will be ignored. .TP +.BR \-\-map-host-loopback " " \fIaddr +Translate \fIaddr\fR to refer to the host. Packets from the guest to +\fIaddr\fR will be redirected to the host. On the host such packets +will appear to have both source and destination of 127.0.0.1 or ::1. + +If \fIaddr\fR is 'none', no address is mapped (this implies +\fB--no-map-gw\fR). Only one IPv4 and one IPv6 address can be +translated, and if the option is specified multiple times, the last one +takes effect. + +Default is to translate the guest's default gateway address, unless +\fB--no-map-gw\fR is given, in which case no address is mapped. + +.TP .BR \-\-no-map-gw Don't remap TCP connections and untracked UDP traffic, with the gateway address as destination, to the host. Implied if there is no gateway on the selected @@ -331,6 +349,21 @@ default route, or if there is no default route, for any of the enabled address families. .TP +.BR \-\-map-guest-addr " " \fIaddr +Translate \fIaddr\fR in the guest to be equal to the guest's assigned +address on the host. 
That is, packets from the guest to \fIaddr\fR +will be redirected to the address assigned to the guest with \fB-a\fR, +or by default the host's global address. This allows the guest to +access services available on the host's global address, even though its +own address shadows that of the host. + +If \fIaddr\fR is 'none', no address is mapped. Only one IPv4 and one +IPv6 address can be translated, and if the option is specified +multiple times, the last one for each address type takes effect. + +Default is no mapping. + +.TP .BR \-4 ", " \-\-ipv4-only Enable IPv4-only operation. IPv6 traffic will be ignored. By default, IPv6 operation is enabled as long as at least an IPv6 route and an @@ -191,11 +191,11 @@ void exit_handler(int signal) * Return: non-zero on failure * * #syscalls read write writev - * #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close - * #syscalls recvfrom sendto shutdown + * #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close + * #syscalls bind connect recvfrom sendto shutdown * #syscalls arm:recv ppc64le:recv arm:send ppc64le:send * #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait - * #syscalls clock_gettime arm:clock_gettime64 + * #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64 */ int main(int argc, char **argv) { @@ -272,7 +272,7 @@ int main(int argc, char **argv) if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c))) exit(EXIT_FAILURE); - proto_update_l2_buf(c.mac_guest, c.mac); + proto_update_l2_buf(c.guest_mac, c.our_tap_mac); if (c.ifi4 && !c.no_dhcp) dhcp_init(); @@ -26,6 +26,13 @@ union epoll_ref; #include "tcp.h" #include "udp.h" +/* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0 + * (unicast) and bit 1 of byte 0 must be 1 (locally administered). Otherwise + * it's arbitrary. 
+ */ +#define MAC_OUR_LAA \ + ((uint8_t [ETH_ALEN]){0x9a, 0x55, 0x9a, 0x55, 0x9a, 0x55}) + /** * union epoll_ref - Breakdown of reference for epoll fd bookkeeping * @type: Type of fd (tells us what to do with events) @@ -94,9 +101,14 @@ enum passt_modes { * @addr: IPv4 address assigned to guest * @addr_seen: Latest IPv4 address seen as source from tap * @prefixlen: IPv4 prefix length (netmask) - * @gw: Default IPv4 gateway + * @guest_gw: IPv4 gateway as seen by the guest + * @map_host_loopback: Outbound connections to this address are NATted to the + * host's 127.0.0.1 + * @map_guest_addr: Outbound connections to this address are NATted to the + * guest's assigned address * @dns: DNS addresses for DHCP, zero-terminated * @dns_match: Forward DNS query if sent to this address + * @our_tap_addr: IPv4 address for passt's use on tap * @dns_host: Use this DNS on the host for forwarding * @addr_out: Optional source address for outbound traffic * @ifname_out: Optional interface name to bind outbound sockets to @@ -104,15 +116,21 @@ enum passt_modes { * @no_copy_addrs: Don't copy all addresses when configuring namespace */ struct ip4_ctx { + /* PIF_TAP addresses */ struct in_addr addr; struct in_addr addr_seen; int prefix_len; - struct in_addr gw; + struct in_addr guest_gw; + struct in_addr map_host_loopback; + struct in_addr map_guest_addr; struct in_addr dns[MAXNS + 1]; struct in_addr dns_match; - struct in_addr dns_host; + struct in_addr our_tap_addr; + /* PIF_HOST addresses */ + struct in_addr dns_host; struct in_addr addr_out; + char ifname_out[IFNAMSIZ]; bool no_copy_routes; @@ -122,12 +140,16 @@ struct ip4_ctx { /** * struct ip6_ctx - IPv6 execution context * @addr: IPv6 address assigned to guest - * @addr_ll: Link-local IPv6 address on external, routable interface * @addr_seen: Latest IPv6 global/site address seen as source from tap * @addr_ll_seen: Latest IPv6 link-local address seen as source from tap - * @gw: Default IPv6 gateway + * @guest_gw: IPv6 gateway as 
seen by the guest + * @map_host_loopback: Outbound connections to this address are NATted to the + * host's [::1] + * @map_guest_addr: Outbound connections to this address are NATted to the + * guest's assigned address * @dns: DNS addresses for DHCPv6 and NDP, zero-terminated * @dns_match: Forward DNS query if sent to this address + * @our_tap_ll: Link-local IPv6 address for passt's use on tap * @dns_host: Use this DNS on the host for forwarding * @addr_out: Optional source address for outbound traffic * @ifname_out: Optional interface name to bind outbound sockets to @@ -135,16 +157,21 @@ struct ip4_ctx { * @no_copy_addrs: Don't copy all addresses when configuring namespace */ struct ip6_ctx { + /* PIF_TAP addresses */ struct in6_addr addr; - struct in6_addr addr_ll; struct in6_addr addr_seen; struct in6_addr addr_ll_seen; - struct in6_addr gw; + struct in6_addr guest_gw; + struct in6_addr map_host_loopback; + struct in6_addr map_guest_addr; struct in6_addr dns[MAXNS + 1]; struct in6_addr dns_match; - struct in6_addr dns_host; + struct in6_addr our_tap_ll; + /* PIF_HOST addresses */ + struct in6_addr dns_host; struct in6_addr addr_out; + char ifname_out[IFNAMSIZ]; bool no_copy_routes; @@ -172,8 +199,8 @@ struct ip6_ctx { * @epollfd: File descriptor for epoll instance * @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any * @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket - * @mac: Host MAC address - * @mac_guest: MAC address of guest or namespace, seen or configured + * @our_tap_mac: Pasta/passt's MAC on the tap link + * @guest_mac: MAC address of guest or namespace, seen or configured * @hash_secret: 128-bit secret for siphash functions * @ifi4: Index of template interface for IPv4, 0 if IPv4 disabled * @ip: IPv4 configuration @@ -198,7 +225,6 @@ struct ip6_ctx { * @no_dhcpv6: Disable DHCPv6 server * @no_ndp: Disable NDP handler altogether * @no_ra: Disable router advertisements - * @no_map_gw: Don't map connections, untracked UDP to 
gateway to host * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max */ @@ -226,8 +252,8 @@ struct ctx { int epollfd; int fd_tap_listen; int fd_tap; - unsigned char mac[ETH_ALEN]; - unsigned char mac_guest[ETH_ALEN]; + unsigned char our_tap_mac[ETH_ALEN]; + unsigned char guest_mac[ETH_ALEN]; uint64_t hash_secret[2]; unsigned int ifi4; @@ -258,7 +284,6 @@ struct ctx { int no_dhcpv6; int no_ndp; int no_ra; - int no_map_gw; int low_wmem; int low_rmem; @@ -13,7 +13,7 @@ * * #syscalls:pasta clone waitid exit exit_group rt_sigprocmask * #syscalls:pasta rt_sigreturn|sigreturn - * #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn + * #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn i686:sigreturn */ #include <sched.h> @@ -288,22 +288,30 @@ void pasta_ns_conf(struct ctx *c) { int rc = 0; - rc = nl_link_up(nl_sock_ns, 1 /* lo */, 0); + rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP); if (rc < 0) die("Couldn't bring up loopback interface in namespace: %s", strerror(-rc)); /* Get or set MAC in target namespace */ - if (MAC_IS_ZERO(c->mac_guest)) - nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest); + if (MAC_IS_ZERO(c->guest_mac)) + nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac); else - rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest); + rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac); if (rc < 0) die("Couldn't set MAC address in namespace: %s", strerror(-rc)); if (c->pasta_conf_ns) { - nl_link_up(nl_sock_ns, c->pasta_ifi, c->mtu); + unsigned int flags = IFF_UP; + + if (c->mtu != -1) + nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu); + + if (c->ifi6) /* Avoid duplicate address detection on link up */ + flags |= IFF_NOARP; + + nl_link_set_flags(nl_sock_ns, c->pasta_ifi, flags, flags); if (c->ifi4) { if (c->ip4.no_copy_addrs) { @@ -324,7 +332,8 @@ void pasta_ns_conf(struct ctx *c) if (c->ip4.no_copy_routes) { rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi, - AF_INET, 
&c->ip4.gw); + AF_INET, + &c->ip4.guest_gw); } else { rc = nl_route_dup(nl_sock, c->ifi4, nl_sock_ns, c->pasta_ifi, AF_INET); @@ -337,6 +346,23 @@ void pasta_ns_conf(struct ctx *c) } if (c->ifi6) { + rc = nl_addr_get_ll(nl_sock_ns, c->pasta_ifi, + &c->ip6.addr_ll_seen); + if (rc < 0) { + warn("Can't get LL address from namespace: %s", + strerror(-rc)); + } + + rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi); + if (rc < 0) { + warn("Can't set nodad for LL in namespace: %s", + strerror(-rc)); + } + + /* We dodged DAD: re-enable neighbour solicitations */ + nl_link_set_flags(nl_sock_ns, c->pasta_ifi, + 0, IFF_NOARP); + if (c->ip6.no_copy_addrs) { rc = nl_addr_set(nl_sock_ns, c->pasta_ifi, AF_INET6, &c->ip6.addr, 64); @@ -353,7 +379,8 @@ void pasta_ns_conf(struct ctx *c) if (c->ip6.no_copy_routes) { rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi, - AF_INET6, &c->ip6.gw); + AF_INET6, + &c->ip6.guest_gw); } else { rc = nl_route_dup(nl_sock, c->ifi6, nl_sock_ns, c->pasta_ifi, @@ -367,7 +394,7 @@ void pasta_ns_conf(struct ctx *c) } } - proto_update_l2_buf(c->mac_guest, NULL); + proto_update_l2_buf(c->guest_mac, NULL); } /** @@ -400,12 +427,12 @@ static int pasta_netns_quit_timer(void) */ void pasta_netns_quit_init(const struct ctx *c) { - union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY }; struct epoll_event ev = { .events = EPOLLIN }; int flags = O_NONBLOCK | O_CLOEXEC; struct statfs s = { 0 }; bool try_inotify = true; int fd = -1, dir_fd; + union epoll_ref ref; if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base) return; @@ -436,6 +463,7 @@ void pasta_netns_quit_init(const struct ctx *c) ref.type = EPOLL_TYPE_NSQUIT_TIMER; } else { close(dir_fd); + ref.type = EPOLL_TYPE_NSQUIT_INOTIFY; } if (fd > FD_REF_MAX) @@ -242,7 +242,10 @@ for __p in ${__profiles}; do __calls="$(sed -n 's/[\t ]*\*[\t ]*#syscalls\(:'"${__p}"'\|\)[\t ]\{1,\}\(.*\)/\2/p' ${IN})" __calls="${__calls} ${EXTRA_SYSCALLS:-}" __calls="$(filter ${__calls})" - echo "seccomp profile 
${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t + + cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null + case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac + echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args} # Pad here to keep gen_profile() "simple" __count=0 @@ -118,8 +118,8 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) struct ethhdr *eh = (struct ethhdr *)buf; /* TODO: ARP table lookup */ - memcpy(eh->h_dest, c->mac_guest, ETH_ALEN); - memcpy(eh->h_source, c->mac, ETH_ALEN); + memcpy(eh->h_dest, c->guest_mac, ETH_ALEN); + memcpy(eh->h_source, c->our_tap_mac, ETH_ALEN); eh->h_proto = ntohs(proto); return eh + 1; } @@ -946,9 +946,9 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) eh = (struct ethhdr *)p; - if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL); + if (memcmp(c->guest_mac, eh->h_source, ETH_ALEN)) { + memcpy(c->guest_mac, eh->h_source, ETH_ALEN); + proto_update_l2_buf(c->guest_mac, NULL); } switch (ntohs(eh->h_proto)) { @@ -982,24 +982,17 @@ static void tap_sock_reset(struct ctx *c) } /** - * tap_handler_passt() - Packet handler for AF_UNIX file descriptor + * tap_passt_input() - Handler for new data on the socket to qemu * @c: Execution context - * @events: epoll events * @now: Current timestamp */ -void tap_handler_passt(struct ctx *c, uint32_t events, - const struct timespec *now) +static void tap_passt_input(struct ctx *c, const struct timespec *now) { static const char *partial_frame; static ssize_t partial_len = 0; ssize_t n; char *p; - if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { - tap_sock_reset(c); - return; - } - tap_flush_pools(); if (partial_len) { @@ -1010,10 +1003,13 @@ void tap_handler_passt(struct ctx *c, uint32_t events, memmove(pkt_buf, partial_frame, partial_len); } - n = recv(c->fd_tap, pkt_buf + partial_len, 
TAP_BUF_BYTES - partial_len, - MSG_DONTWAIT); + do { + n = recv(c->fd_tap, pkt_buf + partial_len, + TAP_BUF_BYTES - partial_len, MSG_DONTWAIT); + } while ((n < 0) && errno == EINTR); + if (n < 0) { - if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { err_perror("Receive error on guest connection, reset"); tap_sock_reset(c); } @@ -1052,54 +1048,76 @@ void tap_handler_passt(struct ctx *c, uint32_t events, } /** - * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor + * tap_handler_passt() - Event handler for AF_UNIX file descriptor * @c: Execution context * @events: epoll events * @now: Current timestamp */ -void tap_handler_pasta(struct ctx *c, uint32_t events, +void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now) { - ssize_t n, len; - int ret; + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { + tap_sock_reset(c); + return; + } - if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) - die("Disconnect event on /dev/net/tun device, exiting"); + if (events & EPOLLIN) + tap_passt_input(c, now); +} -redo: - n = 0; +/** + * tap_pasta_input() - Handler for new data on the socket to hypervisor + * @c: Execution context + * @now: Current timestamp + */ +static void tap_pasta_input(struct ctx *c, const struct timespec *now) +{ + ssize_t n, len; tap_flush_pools(); -restart: - while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) { - if (len < (ssize_t)sizeof(struct ethhdr) || - len > (ssize_t)ETH_MAX_MTU) { - n += len; - continue; - } + for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) { + len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); + if (len == 0) { + die("EOF on tap device, exiting"); + } else if (len < 0) { + if (errno == EINTR) { + len = 0; + continue; + } - tap_add_packet(c, len, pkt_buf + n); + if (errno == EAGAIN && errno == EWOULDBLOCK) + break; /* all done for now */ - if ((n += len) == TAP_BUF_BYTES) - break; - } + die("Error 
on tap device, exiting"); + } - if (len < 0 && errno == EINTR) - goto restart; + /* Ignore frames of bad length */ + if (len < (ssize_t)sizeof(struct ethhdr) || + len > (ssize_t)ETH_MAX_MTU) + continue; - ret = errno; + tap_add_packet(c, len, pkt_buf + n); + } tap_handler(c, now); +} - if (len > 0 || ret == EAGAIN) - return; - - if (n == TAP_BUF_BYTES) - goto redo; +/** + * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor + * @c: Execution context + * @events: epoll events + * @now: Current timestamp + */ +void tap_handler_pasta(struct ctx *c, uint32_t events, + const struct timespec *now) +{ + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) + die("Disconnect event on /dev/net/tun device, exiting"); - die("Error on tap device, exiting"); + if (events & EPOLLIN) + tap_pasta_input(c, now); } /** @@ -1337,6 +1355,6 @@ void tap_sock_init(struct ctx *c) * sends us packets. Use the broadcast address so that our * first packets will reach it. */ - memset(&c->mac_guest, 0xff, sizeof(c->mac_guest)); + memset(&c->guest_mac, 0xff, sizeof(c->guest_mac)); } } @@ -361,8 +361,8 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = { static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; -/* Table of guest side forwarding addresses with very low RTT (assumed - * to be local to the host), LRU +/* Table of our guest side addresses with very low RTT (assumed to be local to + * the host), LRU */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; @@ -440,7 +440,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) if (events == TAP_SYN_RCVD) return EPOLLOUT | EPOLLET | EPOLLRDHUP; - return EPOLLRDHUP; + return EPOLLET | EPOLLRDHUP; } /** @@ -663,7 +663,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (inany_equals(&tapside->faddr, low_rtt_dst + i)) + if (inany_equals(&tapside->oaddr, low_rtt_dst + i)) 
return 1; return 0; @@ -686,7 +686,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (inany_equals(&tapside->faddr, low_rtt_dst + i)) + if (inany_equals(&tapside->oaddr, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -698,7 +698,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, if (hole == -1) return; - low_rtt_dst[hole++] = tapside->faddr; + low_rtt_dst[hole++] = tapside->oaddr; if (hole == LOW_RTT_TABLE_SIZE) hole = 0; inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); @@ -881,7 +881,7 @@ static void tcp_fill_header(struct tcphdr *th, { const struct flowside *tapside = TAPFLOW(conn); - th->source = htons(tapside->fport); + th->source = htons(tapside->oport); th->dest = htons(tapside->eport); th->seq = htonl(seq); th->ack_seq = htonl(conn->seq_ack_to_tap); @@ -913,7 +913,7 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, uint32_t seq) { const struct flowside *tapside = TAPFLOW(conn); - const struct in_addr *src4 = inany_v4(&tapside->faddr); + const struct in_addr *src4 = inany_v4(&tapside->oaddr); const struct in_addr *dst4 = inany_v4(&tapside->eaddr); size_t l4len = dlen + sizeof(*th); size_t l3len = l4len + sizeof(*iph); @@ -957,7 +957,7 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, size_t l4len = dlen + sizeof(*th); ip6h->payload_len = htons(l4len); - ip6h->saddr = tapside->faddr.a6; + ip6h->saddr = tapside->oaddr.a6; ip6h->daddr = tapside->eaddr.a6; ip6h->hop_limit = 255; @@ -992,7 +992,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, const uint16_t *check, uint32_t seq) { const struct flowside *tapside = TAPFLOW(conn); - const struct in_addr *a4 = inany_v4(&tapside->faddr); + const struct in_addr *a4 = inany_v4(&tapside->oaddr); if (a4) { return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base, @@ -1417,15 +1417,15 @@ static void 
tcp_bind_outbound(const struct ctx *c, socklen_t sl; - pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->faddr, tgt->fport); - if (!inany_is_unspecified(&tgt->faddr) || tgt->fport) { + pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->oaddr, tgt->oport); + if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) { if (bind(s, &bind_sa.sa, sl)) { char sstr[INANY_ADDRSTRLEN]; flow_dbg(conn, "Can't bind TCP outbound socket to %s:%hu: %s", - inany_ntop(&tgt->faddr, sstr, sizeof(sstr)), - tgt->fport, strerror(errno)); + inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), + tgt->oport, strerror(errno)); } } @@ -1497,12 +1497,12 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || - !inany_is_unicast(&ini->faddr) || ini->fport == 0) { + !inany_is_unicast(&ini->oaddr) || ini->oport == 0) { char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN]; debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu", inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport, - inany_ntop(&ini->faddr, dstr, sizeof(dstr)), ini->fport); + inany_ntop(&ini->oaddr, dstr, sizeof(dstr)), ini->oport); goto cancel; } @@ -2100,7 +2100,8 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, goto cancel; /* FIXME: When listening port has a specific bound address, record that - * as the forwarding address */ + * as our address + */ ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, ref.tcp_listen.port); @@ -2143,7 +2144,7 @@ cancel: * @c: Execution context * @ref: epoll reference of timer (not connection) * - * #syscalls timerfd_gettime + * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 */ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) { @@ -168,7 +168,6 @@ void tcp_sock4_iov_init(const struct ctx *c) iov = tcp4_l2_flags_iov[i]; iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); - iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; iov[TCP_IOV_ETH] = 
IOV_OF_LVALUE(tcp4_eth_src); iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; @@ -333,9 +332,13 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) else dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - for (i = 0; i < TCP_NUM_IOVS; i++) - memcpy(dup_iov[i].iov_base, iov[i].iov_base, - iov[i].iov_len); + for (i = 0; i < TCP_NUM_IOVS; i++) { + /* All frames share the same ethernet header buffer */ + if (i != TCP_IOV_ETH) { + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + } + } dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; } diff --git a/tcp_internal.h b/tcp_internal.h index 8b60aab..aa8bb64 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -44,7 +44,7 @@ #define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)])) #define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_))) -#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->faddr)) +#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr)) #define CONN_V6(conn) (!CONN_V4(conn)) /* diff --git a/tcp_splice.c b/tcp_splice.c index 483e45d..9f5cc27 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -28,7 +28,7 @@ * - FIN_SENT_0: FIN (write shutdown) sent to accepted socket * - FIN_SENT_1: FIN (write shutdown) sent to target socket * - * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 + * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64 */ #include <sched.h> diff --git a/test/README.md b/test/README.md index 0936b04..91ca603 100644 --- a/test/README.md +++ b/test/README.md @@ -28,10 +28,11 @@ on a system, i.e. common utilities such as a shell are not included here. 
Example for Debian, and possibly most Debian-based distributions: - build-essential git jq strace iperf3 qemu-system-x86 tmux sipcalc bats bc - catatonit clang-tidy cppcheck go isc-dhcp-common psmisc linux-cpupower socat - netcat-openbsd fakeroot lz4 lm-sensors qemu-system-arm qemu-system-ppc - qemu-system-misc qemu-system-x86 valgrind + bats bc build-essential catatonit clang-tidy conmon cppcheck crun fakeroot + git go iperf3 isc-dhcp-common jq libgpgme-dev libseccomp-dev linux-cpupower + lm-sensors lz4 netavark netcat-openbsd psmisc qemu-efi-aarch64 + qemu-system-arm qemu-system-misc qemu-system-ppc qemu-system-x86 + qemu-system-x86 sipcalc socat strace tmux uidmap valgrind NOTE: the tests need a qemu version >= 7.2, or one that contains commit 13c6be96618c ("net: stream: add unix socket"): this change introduces support diff --git a/test/lib/layout b/test/lib/layout index f9a1cf1..4d03572 100644 --- a/test/lib/layout +++ b/test/lib/layout @@ -15,7 +15,7 @@ # layout_pasta() - Panes for host, pasta, and separate one for namespace layout_pasta() { - sleep 3 + sleep 1 tmux kill-pane -a -t 0 cmd_write 0 clear @@ -46,7 +46,7 @@ layout_pasta() { # layout_passt() - Panes for host, passt, and guest layout_passt() { - sleep 3 + sleep 1 tmux kill-pane -a -t 0 cmd_write 0 clear @@ -77,7 +77,7 @@ layout_passt() { # layout_passt_in_pasta() - Host, passt within pasta, namespace and guest layout_passt_in_pasta() { - sleep 3 + sleep 1 tmux kill-pane -a -t 0 cmd_write 0 clear @@ -113,7 +113,7 @@ layout_passt_in_pasta() { # layout_two_guests() - Two guest panes, two passt panes, plus host and log layout_two_guests() { - sleep 3 + sleep 1 tmux kill-pane -a -t 0 cmd_write 0 clear @@ -152,7 +152,7 @@ layout_two_guests() { # layout_demo_pasta() - Four panes for pasta demo layout_demo_pasta() { - sleep 3 + sleep 1 cmd_write 0 cd ${BASEPATH} cmd_write 0 clear @@ -188,7 +188,7 @@ layout_demo_pasta() { # layout_demo_passt() - Four panes for passt demo layout_demo_passt() { - sleep 3 + 
sleep 1 cmd_write 0 cd ${BASEPATH} cmd_write 0 clear @@ -224,7 +224,7 @@ layout_demo_passt() { # layout_demo_podman() - Four panes for pasta demo with Podman layout_demo_podman() { - sleep 3 + sleep 1 cmd_write 0 cd ${BASEPATH} cmd_write 0 clear diff --git a/test/lib/setup b/test/lib/setup index 9b39b9f..d764138 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -17,6 +17,8 @@ INITRAMFS="${BASEPATH}/mbuto.img" VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )" __mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)" VMEM="$((${__mem_kib} / 1024 / 4))" +QEMU_ARCH="$(uname -m)" +[ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386 # setup_build() - Set up pane layout for build tests setup_build() { @@ -53,7 +55,7 @@ setup_passt() { wait_for [ -f "${STATESETUP}/passt.pid" ] GUEST_CID=94557 - context_run_bg qemu 'qemu-system-$(uname -m)' \ + context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \ ' -machine accel=kvm' \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ @@ -124,7 +126,12 @@ setup_passt_in_ns() { [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" - context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold" + __map_host4=192.0.2.1 + __map_host6=2001:db8:9a55::1 + __map_ns4=192.0.2.2 + __map_ns6=2001:db8:9a55::2 + + context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --map-host-loopback ${__map_host4} --map-host-loopback ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold" wait_for [ -f "${STATESETUP}/pasta.pid" ] context_setup_nstool qemu ${STATESETUP}/ns.hold @@ -139,16 +146,16 @@ setup_passt_in_ns() { if [ ${VALGRIND} -eq 1 ]; then context_run passt "make clean" context_run passt "make valgrind" - 
context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid" + context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" else context_run passt "make clean" context_run passt "make" - context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid" + context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" fi wait_for [ -f "${STATESETUP}/passt.pid" ] GUEST_CID=94557 - context_run_bg qemu 'qemu-system-$(uname -m)' \ + context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \ ' -machine accel=kvm' \ ' -M accel=kvm:tcg' \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ @@ -220,7 +227,7 @@ setup_two_guests() { wait_for [ -f "${STATESETUP}/passt_2.pid" ] GUEST_1_CID=94557 - context_run_bg qemu_1 'qemu-system-$(uname -m)' \ + context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \ ' -M accel=kvm:tcg' \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ @@ -233,7 +240,7 @@ setup_two_guests() { " -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" GUEST_2_CID=94558 - context_run_bg qemu_2 'qemu-system-$(uname -m)' \ + context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \ ' -M accel=kvm:tcg' \ ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ diff --git a/test/lib/term 
b/test/lib/term index 262937e..3834092 100755 --- a/test/lib/term +++ b/test/lib/term @@ -97,7 +97,6 @@ display_delay() { switch_pane() { tmux select-pane -t ${1} PR_DELAY=${PR_DELAY_INIT} - display_delay "0.2" } # cmd_write() - Write a command to a pane, letter by letter, and execute it @@ -199,7 +198,7 @@ pane_run() { # $1: Pane name pane_wait() { __lc="$(echo "${1}" | tr [A-Z] [a-z])" - sleep 0.1 || sleep 1 + sleep 0.01 || sleep 1 __done=0 while @@ -207,7 +206,7 @@ pane_wait() { case ${__l} in *"$ " | *"# ") return ;; esac - do sleep 0.1 || sleep 1; done + do sleep 0.01 || sleep 1; done } # pane_parse() - Print last line, @EMPTY@ if command had no output @@ -231,7 +230,7 @@ pane_status() { __status="$(pane_parse "${1}")" while ! [ "${__status}" -eq "${__status}" ] 2>/dev/null; do - sleep 1 + sleep 0.01 || sleep 1 pane_run "${1}" 'echo $?' pane_wait "${1}" __status="$(pane_parse "${1}")" @@ -383,6 +382,16 @@ info_check_failed() { printf " < failed.\n" >> "${LOGFILE}" } +# status_bar_blink() - Make status bar blink +status_bar_blink() { + for i in `seq 1 3`; do + tmux set status-right-style 'bg=colour1 fg=colour196 bold' + sleep 0.1 || sleep 1 + tmux set status-right-style 'bg=colour1 fg=colour233 bold' + sleep 0.1 || sleep 1 + done +} + # info_passed() - Display, log, and make status bar blink when a test passes info_passed() { switch_pane ${PANE_INFO} @@ -391,12 +400,7 @@ info_passed() { log "...passed." log - for i in `seq 1 3`; do - tmux set status-right-style 'bg=colour1 fg=colour2 bold' - sleep "0.1" - tmux set status-right-style 'bg=colour1 fg=colour233 bold' - sleep "0.1" - done + [ ${FAST} -eq 1 ] || status_bar_blink } # info_failed() - Display, log, and make status bar blink when a test passes @@ -407,12 +411,7 @@ info_failed() { log "...failed." 
log - for i in `seq 1 3`; do - tmux set status-right-style 'bg=colour1 fg=colour196 bold' - sleep "0.1" - tmux set status-right-style 'bg=colour1 fg=colour233 bold' - sleep "0.1" - done + [ ${FAST} -eq 1 ] || status_bar_blink pause_continue \ "Press any key to pause test session" \ diff --git a/test/lib/test b/test/lib/test index c525f8e..e6726be 100755 --- a/test/lib/test +++ b/test/lib/test @@ -33,7 +33,7 @@ test_iperf3k() { pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid' - sleep 3 # Wait for kernel to free up ports + sleep 1 # Wait for kernel to free up ports } # test_iperf3() - Ugly helper for iperf3 directive diff --git a/test/passt.mbuto b/test/passt.mbuto index 436eecc..138d365 100755 --- a/test/passt.mbuto +++ b/test/passt.mbuto @@ -15,6 +15,14 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}" +# OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and +# sshd-session the per-session program. We need the latter as well, and the path +# depends on the distribution. It doesn't exist on older versions. 
+for bin in /usr/lib/openssh/sshd-session /usr/lib/ssh/sshd-session \ + /usr/libexec/openssh/sshd-session; do + command -v "${bin}" >/dev/null && PROGS="${PROGS} ${bin}" +done + KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}" LINKS="${LINKS:- @@ -78,7 +86,7 @@ EOF EOF chmod 600 /root/.ssh/authorized_keys chmod 700 /root - socat VSOCK-LISTEN:22,fork EXEC:"sshd -i -e" 2> /var/log/vsock-ssh.log & + socat VSOCK-LISTEN:22,fork EXEC:"/sbin/sshd -i -e" 2> /var/log/vsock-ssh.log & sh +m ' diff --git a/test/passt_in_ns/dhcp b/test/passt_in_ns/dhcp new file mode 100644 index 0000000..0ceed7c --- /dev/null +++ b/test/passt_in_ns/dhcp @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/passt_in_ns/dhcp - Check DHCP and DHCPv6 functionality in passt mode +# +# Copyright (c) 2021 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +gtools ip jq dhclient sed tr +htools ip jq sed tr head + +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME__" ] + +test DHCP: address +guest /sbin/dhclient -4 __IFNAME__ +gout ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR__" = "__HOST_ADDR__" ] + +test DHCP: route +gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +hout HOST_GW ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]' +check [ "__GW__" = "__HOST_GW__" ] + 
+test DHCP: MTU +gout MTU ip -j link show | jq -rM '.[] | select(.ifname == "__IFNAME__").mtu' +check [ __MTU__ = 65520 ] + +test DHCP: DNS +gout DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/' +hout HOST_DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | head -n3 | tr '\n' ',' | sed 's/,$//;s/$/\n/' +check [ "__DNS__" = "__HOST_DNS__" ] || ( [ "__DNS__" = "__MAP_NS4__" ] && expr "__HOST_DNS__" : "127[.]" ) + +# FQDNs should be terminated by dots, but the guest DHCP client might omit them: +# strip them first +test DHCP: search list +gout SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' +hout HOST_SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' +check [ "__SEARCH__" = "__HOST_SEARCH__" ] + +test DHCPv6: address +guest /sbin/dhclient -6 __IFNAME__ +gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR6__" = "__HOST_ADDR6__" ] + +test DHCPv6: route +gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' +hout HOST_GW6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]' +check [ "__GW6__" = "__HOST_GW6__" ] + +# Strip interface specifier: interface names might differ between host and guest +test DHCPv6: DNS +gout DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/' +hout HOST_DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/' +check [ "__DNS6__" = "__HOST_DNS6__" ] || [ "__DNS6__" = "__MAP_NS6__" -a 
"__HOST_DNS6__" = "::1" ] + +test DHCPv6: search list +gout SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' +hout HOST_SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' +check [ "__SEARCH6__" = "__HOST_SEARCH6__" ] diff --git a/test/passt_in_ns/tcp b/test/passt_in_ns/tcp index cdb7060..aaf340e 100644 --- a/test/passt_in_ns/tcp +++ b/test/passt_in_ns/tcp @@ -15,6 +15,11 @@ gtools socat ip jq htools socat ip jq nstools socat ip jq +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + set TEMP_BIG __STATEDIR__/test_big.bin set TEMP_SMALL __STATEDIR__/test_small.bin set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin @@ -36,16 +41,15 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin test TCP/IPv4: guest to host: big transfer hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' sleep 1 -guest socat -u OPEN:/root/big.bin TCP4:__GW__:10003 +guest socat -u OPEN:/root/big.bin TCP4:__MAP_HOST4__:10003 hostw check cmp __TEMP_BIG__ __BASEPATH__/big.bin test TCP/IPv4: guest to ns: big transfer nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc sleep 1 -guest socat -u OPEN:/root/big.bin TCP4:__GW__:10002 +guest socat -u OPEN:/root/big.bin TCP4:__MAP_NS4__:10002 nsw check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin @@ -59,7 +63,7 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin test TCP/IPv4: ns to host (via tap): big transfer hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc sleep 1 -ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003 +ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__MAP_HOST4__:10003 hostw check cmp __TEMP_BIG__ __BASEPATH__/big.bin @@ -95,16 +99,15 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin test TCP/IPv4: guest to host: small 
transfer hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' sleep 1 -guest socat -u OPEN:/root/small.bin TCP4:__GW__:10003 +guest socat -u OPEN:/root/small.bin TCP4:__MAP_HOST4__:10003 hostw check cmp __TEMP_SMALL__ __BASEPATH__/small.bin test TCP/IPv4: guest to ns: small transfer nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc sleep 1 -guest socat -u OPEN:/root/small.bin TCP4:__GW__:10002 +guest socat -u OPEN:/root/small.bin TCP4:__MAP_NS4__:10002 nsw check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin @@ -118,7 +121,7 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin test TCP/IPv4: ns to host (via tap): small transfer hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc sleep 1 -ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003 +ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__MAP_HOST4__:10003 hostw check cmp __TEMP_SMALL__ __BASEPATH__/small.bin @@ -152,17 +155,15 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin test TCP/IPv6: guest to host: big transfer hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10003 +guest socat -u OPEN:/root/big.bin TCP6:[__MAP_HOST6__]:10003 hostw check cmp __TEMP_BIG__ __BASEPATH__/big.bin test TCP/IPv6: guest to ns: big transfer nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc sleep 1 -guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10002 +guest socat -u OPEN:/root/big.bin TCP6:[__MAP_NS6__]:10002 nsw check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin @@ -175,9 +176,8 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin test TCP/IPv6: ns to host (via tap): big transfer hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc -nsout 
IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003 +ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__MAP_HOST6__]:10003 hostw check cmp __TEMP_BIG__ __BASEPATH__/big.bin @@ -190,6 +190,7 @@ guest cmp test_big.bin /root/big.bin test TCP/IPv6: ns to guest (using namespace address): big transfer guestb socat -u TCP6-LISTEN:10001 OPEN:test_big.bin,create,trunc +nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local' sleep 1 ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__ADDR6__]:10001 @@ -212,17 +213,15 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin test TCP/IPv6: guest to host: small transfer hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10003 +guest socat -u OPEN:/root/small.bin TCP6:[__MAP_HOST6__]:10003 hostw check cmp __TEMP_SMALL__ __BASEPATH__/small.bin test TCP/IPv6: guest to ns: small transfer nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__ sleep 1 -guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10002 +guest socat -u OPEN:/root/small.bin TCP6:[__MAP_NS6__]:10002 nsw check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin @@ -235,9 +234,8 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin test TCP/IPv6: ns to host (via tap): small transfer hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc -nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__GW6__%__IFNAME__]:10003 +ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__MAP_HOST6__]:10003 hostw check cmp __TEMP_SMALL__ 
__BASEPATH__/small.bin diff --git a/test/passt_in_ns/udp b/test/passt_in_ns/udp index 8a02513..3426ab9 100644 --- a/test/passt_in_ns/udp +++ b/test/passt_in_ns/udp @@ -15,6 +15,11 @@ gtools socat ip jq nstools socat ip jq htools socat ip jq +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + set TEMP __STATEDIR__/test.bin set TEMP_NS __STATEDIR__/test_ns.bin @@ -34,16 +39,15 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin test UDP/IPv4: guest to host hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' sleep 1 -guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10003,shut-null +guest socat -u OPEN:/root/medium.bin UDP4:__MAP_HOST4__:10003,shut-null hostw check cmp __TEMP__ __BASEPATH__/medium.bin test UDP/IPv4: guest to ns nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc sleep 1 -guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10002,shut-null +guest socat -u OPEN:/root/medium.bin UDP4:__MAP_NS4__:10002,shut-null nsw check cmp __TEMP_NS__ __BASEPATH__/medium.bin @@ -57,7 +61,7 @@ check cmp __TEMP__ __BASEPATH__/medium.bin test UDP/IPv4: ns to host (via tap) hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc sleep 1 -ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null +ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__MAP_HOST4__:10003,shut-null hostw check cmp __TEMP__ __BASEPATH__/medium.bin @@ -93,17 +97,15 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin test UDP/IPv6: guest to host hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null +guest socat -u OPEN:/root/medium.bin 
UDP6:[__MAP_HOST6__]:10003,shut-null hostw check cmp __TEMP__ __BASEPATH__/medium.bin test UDP/IPv6: guest to ns nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc sleep 1 -guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10002,shut-null +guest socat -u OPEN:/root/medium.bin UDP6:[__MAP_NS6__]:10002,shut-null nsw check cmp __TEMP_NS__ __BASEPATH__/medium.bin @@ -116,9 +118,8 @@ check cmp __TEMP__ __BASEPATH__/medium.bin test UDP/IPv6: ns to host (via tap) hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc -nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null +ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__MAP_HOST6__]:10003,shut-null hostw check cmp __TEMP__ __BASEPATH__/medium.bin @@ -131,6 +132,7 @@ guest cmp test.bin /root/medium.bin test UDP/IPv6: ns to guest (using namespace address) guestb socat -u UDP6-LISTEN:10001,null-eof OPEN:test.bin,create,trunc +nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local' sleep 1 ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__ADDR6__]:10001,shut-null diff --git a/test/pasta_options/log_to_file b/test/pasta_options/log_to_file index fe50e50..3ead06c 100644 --- a/test/pasta_options/log_to_file +++ b/test/pasta_options/log_to_file @@ -19,7 +19,7 @@ sleep 1 endef def flood_log_client -host tcp_crr --nolog -P 10001 -C 10002 -6 -c -H ::1 +host tcp_crr --nolog -l1 -P 10001 -C 10002 -6 -c -H ::1 endef def check_log_size_mountns @@ -42,7 +42,7 @@ pout PID2 echo $! 
check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$' test Maximum log size -passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -P 10001 -C 10002 -6; done' +passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done' sleep 1 flood_log_client diff --git a/test/perf/passt_tcp b/test/perf/passt_tcp index 14343cb..089d953 100644 --- a/test/perf/passt_tcp +++ b/test/perf/passt_tcp @@ -15,6 +15,9 @@ gtools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr # From neper nstools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr htools bc head sed seq +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + test passt: throughput and latency guest /sbin/sysctl -w net.core.rmem_max=536870912 @@ -29,8 +32,6 @@ ns /sbin/sysctl -w net.ipv4.tcp_rmem="4096 524288 134217728" ns /sbin/sysctl -w net.ipv4.tcp_wmem="4096 524288 134217728" ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0 -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 @@ -38,7 +39,7 @@ hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sy hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ set THREADS 4 -set TIME 10 +set TIME 1 set OMIT 0.1 set OPTS -Z -P __THREADS__ -l 1M -O__OMIT__ @@ -54,16 +55,16 @@ iperf3s ns 10002 bw - bw - guest ip link set dev __IFNAME__ mtu 1280 -iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -w 4M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 4M bw __BW__ 1.2 1.5 guest ip link set dev __IFNAME__ mtu 1500 -iperf3 BW guest 
__GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -w 4M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 4M bw __BW__ 1.6 1.8 guest ip link set dev __IFNAME__ mtu 9000 -iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -w 8M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 8M bw __BW__ 4.0 5.0 guest ip link set dev __IFNAME__ mtu 65520 -iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -w 16M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M bw __BW__ 7.0 8.0 iperf3k ns @@ -75,7 +76,7 @@ lat - lat - lat - nsb tcp_rr --nolog -6 -gout LAT tcp_rr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT tcp_rr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tl TCP CRR latency over IPv6: guest to host @@ -85,33 +86,37 @@ lat - lat - lat - nsb tcp_crr --nolog -6 -gout LAT tcp_crr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT tcp_crr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 500 400 tr TCP throughput over IPv4: guest to host iperf3s ns 10002 guest ip link set dev __IFNAME__ mtu 256 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 1M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 1M bw __BW__ 0.2 0.3 guest ip link set dev __IFNAME__ mtu 576 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 1M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 1M bw __BW__ 0.5 0.8 guest ip link set dev __IFNAME__ mtu 1280 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 4M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M bw __BW__ 1.2 1.5 guest ip link set dev __IFNAME__ mtu 1500 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 4M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M bw __BW__ 1.6 1.8 guest ip link set dev __IFNAME__ mtu 9000 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 8M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M bw __BW__ 4.0 5.0 guest 
ip link set dev __IFNAME__ mtu 65520 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 16M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M bw __BW__ 7.0 8.0 iperf3k ns +# Reducing MTU below 1280 deconfigures IPv6, get our address back +guest dhclient -6 -x +guest dhclient -6 __IFNAME__ + tl TCP RR latency over IPv4: guest to host lat - lat - @@ -119,7 +124,7 @@ lat - lat - lat - nsb tcp_rr --nolog -4 -gout LAT tcp_rr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT tcp_rr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tl TCP CRR latency over IPv4: guest to host @@ -129,7 +134,7 @@ lat - lat - lat - nsb tcp_crr --nolog -4 -gout LAT tcp_crr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT tcp_crr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 500 400 tr TCP throughput over IPv6: host to guest @@ -153,7 +158,7 @@ lat - lat - guestb tcp_rr --nolog -P 10001 -C 10011 -6 sleep 1 -nsout LAT tcp_rr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tl TCP CRR latency over IPv6: host to guest @@ -164,7 +169,7 @@ lat - lat - guestb tcp_crr --nolog -P 10001 -C 10011 -6 sleep 1 -nsout LAT tcp_crr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 500 350 @@ -189,7 +194,7 @@ lat - lat - guestb tcp_rr --nolog -P 10001 -C 10011 -4 sleep 1 -nsout LAT tcp_rr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tl TCP CRR latency over IPv6: host to guest @@ -200,7 +205,7 @@ lat - lat - guestb tcp_crr --nolog -P 10001 -C 10011 
-4 sleep 1 -nsout LAT tcp_crr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 500 300 te diff --git a/test/perf/passt_udp b/test/perf/passt_udp index 8919280..4c66c41 100644 --- a/test/perf/passt_udp +++ b/test/perf/passt_udp @@ -15,6 +15,9 @@ gtools /sbin/sysctl ip jq nproc sleep iperf3 udp_rr # From neper nstools ip jq sleep iperf3 udp_rr htools bc head sed +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + test passt: throughput and latency guest /sbin/sysctl -w net.core.rmem_max=16777216 @@ -22,16 +25,12 @@ guest /sbin/sysctl -w net.core.wmem_max=16777216 guest /sbin/sysctl -w net.core.rmem_default=16777216 guest /sbin/sysctl -w net.core.wmem_default=16777216 -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' - hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ set THREADS 2 -set TIME 10 +set TIME 1 set OPTS -u -P __THREADS__ --pacing-timer 1000 info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz @@ -46,13 +45,13 @@ iperf3s ns 10002 bw - bw - -iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -b 3G -l 1232 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 3G -l 1232 bw __BW__ 0.8 1.2 -iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -b 4G -l 1452 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 4G -l 1452 bw __BW__ 1.0 1.5 -iperf3 BW guest __GW6__%__IFNAME__ 10002 
__TIME__ __OPTS__ -b 8G -l 8952 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 8G -l 8952 bw __BW__ 4.0 5.0 -iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -b 15G -l 64372 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 15G -l 64372 bw __BW__ 4.0 5.0 iperf3k ns @@ -64,7 +63,7 @@ lat - lat - lat - nsb udp_rr --nolog -6 -gout LAT udp_rr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT udp_rr --nolog -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 @@ -72,17 +71,17 @@ tr UDP throughput over IPv4: guest to host iperf3s ns 10002 # (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 1G -l 228 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 1G -l 228 bw __BW__ 0.0 0.0 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 2G -l 548 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 2G -l 548 bw __BW__ 0.4 0.6 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 3G -l 1252 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 3G -l 1252 bw __BW__ 0.8 1.2 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 4G -l 1472 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 4G -l 1472 bw __BW__ 1.0 1.5 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 8G -l 8972 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 8G -l 8972 bw __BW__ 4.0 5.0 -iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 15G -l 65492 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 15G -l 65492 bw __BW__ 4.0 5.0 iperf3k ns @@ -94,7 +93,7 @@ lat - lat - lat - nsb udp_rr --nolog -4 -gout LAT udp_rr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT udp_rr --nolog -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 diff --git a/test/perf/pasta_tcp b/test/perf/pasta_tcp index 8d2f911..d1ccf7d 100644 --- a/test/perf/pasta_tcp +++ b/test/perf/pasta_tcp @@ -14,6 +14,9 @@ htools 
head ip seq bc sleep iperf3 tcp_rr tcp_crr jq sed nstools /sbin/sysctl nproc ip seq sleep iperf3 tcp_rr tcp_crr jq sed +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 + test pasta: throughput and latency (local connections) ns /sbin/sysctl -w net.ipv4.tcp_rmem="131072 524288 134217728" @@ -22,7 +25,7 @@ ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0 set THREADS 4 -set TIME 10 +set TIME 1 set OMIT 0.1 set OPTS -Z -w 4M -l 1M -P __THREADS__ -O__OMIT__ @@ -46,13 +49,13 @@ iperf3k host tl TCP RR latency over IPv6: ns to host hostb tcp_rr --nolog -P 10003 -C 10013 -6 -nsout LAT tcp_rr --nolog -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 150 100 tl TCP CRR latency over IPv6: ns to host hostb tcp_crr --nolog -P 10003 -C 10013 -6 -nsout LAT tcp_crr --nolog -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 500 350 @@ -67,13 +70,13 @@ iperf3k host tl TCP RR latency over IPv4: ns to host hostb tcp_rr --nolog -P 10003 -C 10013 -4 -nsout LAT tcp_rr --nolog -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 150 100 tl TCP CRR latency over IPv4: ns to host hostb tcp_crr --nolog -P 10003 -C 10013 -4 -nsout LAT tcp_crr --nolog -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 500 350 @@ -87,13 +90,13 @@ iperf3k ns tl TCP RR latency over IPv6: host to ns nsb tcp_rr --nolog -P 10002 -C 10012 -6 -hout LAT tcp_rr --nolog -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_rr --nolog -l1 -P 
10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 150 100 tl TCP CRR latency over IPv6: host to ns nsb tcp_crr --nolog -P 10002 -C 10012 -6 -hout LAT tcp_crr --nolog -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 1000 700 @@ -108,13 +111,13 @@ iperf3k ns tl TCP RR latency over IPv4: host to ns nsb tcp_rr --nolog -P 10002 -C 10012 -4 -hout LAT tcp_rr --nolog -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 150 100 tl TCP CRR latency over IPv4: host to ns nsb tcp_crr --nolog -P 10002 -C 10012 -4 -hout LAT tcp_crr --nolog -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 1000 700 @@ -122,8 +125,6 @@ te test pasta: throughput and latency (connections via tap) -nsout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' -nsout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' set THREADS 2 set OPTS -Z -P __THREADS__ -i1 -O__OMIT__ @@ -137,16 +138,16 @@ tr TCP throughput over IPv6: ns to host iperf3s host 10003 ns ip link set dev __IFNAME__ mtu 1500 -iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -w 512k +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 512k bw __BW__ 0.2 0.4 ns ip link set dev __IFNAME__ mtu 4000 -iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -w 1M +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 1M bw __BW__ 0.3 0.5 ns ip link set dev __IFNAME__ mtu 16384 -iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -w 8M +iperf3 BW ns __MAP_HOST6__ 10003 
__TIME__ __OPTS__ -w 8M bw __BW__ 1.5 2.0 ns ip link set dev __IFNAME__ mtu 65520 -iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -w 8M +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 8M bw __BW__ 2.0 2.5 iperf3k host @@ -156,7 +157,7 @@ lat - lat - lat - hostb tcp_rr --nolog -P 10003 -C 10013 -6 -nsout LAT tcp_rr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 150 100 @@ -165,7 +166,7 @@ lat - lat - lat - hostb tcp_crr --nolog -P 10003 -C 10013 -6 -nsout LAT tcp_crr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 1500 500 @@ -174,16 +175,16 @@ tr TCP throughput over IPv4: ns to host iperf3s host 10003 ns ip link set dev __IFNAME__ mtu 1500 -iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -w 512k +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 512k bw __BW__ 0.2 0.4 ns ip link set dev __IFNAME__ mtu 4000 -iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -w 1M +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 1M bw __BW__ 0.3 0.5 ns ip link set dev __IFNAME__ mtu 16384 -iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -w 8M +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 8M bw __BW__ 1.5 2.0 ns ip link set dev __IFNAME__ mtu 65520 -iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -w 8M +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 8M bw __BW__ 2.0 2.5 iperf3k host @@ -193,7 +194,7 @@ lat - lat - lat - hostb tcp_rr --nolog -P 10003 -C 10013 -4 -nsout LAT tcp_rr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 150 100 @@ -202,7 +203,7 @@ lat - lat - lat - 
hostb tcp_crr --nolog -P 10003 -C 10013 -4 -nsout LAT tcp_crr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 1500 500 @@ -224,7 +225,7 @@ lat - lat - lat - nsb tcp_rr --nolog -P 10002 -C 10012 -6 -hout LAT tcp_rr --nolog -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 150 100 @@ -234,7 +235,7 @@ lat - lat - sleep 1 nsb tcp_crr --nolog -P 10002 -C 10012 -6 -hout LAT tcp_crr --nolog -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 5000 10000 @@ -256,7 +257,7 @@ lat - lat - lat - nsb tcp_rr --nolog -P 10002 -C 10012 -4 -hout LAT tcp_rr --nolog -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 150 100 @@ -266,7 +267,7 @@ lat - lat - sleep 1 nsb tcp_crr --nolog -P 10002 -C 10012 -4 -hout LAT tcp_crr --nolog -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 5000 10000 diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp index 6acbfd3..544bf17 100644 --- a/test/perf/pasta_udp +++ b/test/perf/pasta_udp @@ -14,6 +14,9 @@ htools bc head ip sleep iperf3 udp_rr jq sed nstools ip sleep iperf3 udp_rr jq sed +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 + test pasta: throughput and latency (local traffic) hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 @@ -21,7 +24,7 @@ 
hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sy hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ set THREADS 1 -set TIME 10 +set TIME 1 set OPTS -u -P __THREADS__ info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz @@ -133,8 +136,6 @@ te test pasta: throughput and latency (traffic via tap) -nsout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' -nsout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz @@ -146,13 +147,13 @@ tr UDP throughput over IPv6: ns to host iperf3s host 10003 # (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header -iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -b 8G -l 1472 +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 8G -l 1472 bw __BW__ 0.3 0.5 -iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -b 12G -l 3972 +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 12G -l 3972 bw __BW__ 0.5 0.8 -iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -b 20G -l 16356 +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 20G -l 16356 bw __BW__ 3.0 4.0 -iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -b 30G -l 65472 +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 30G -l 65472 bw __BW__ 6.0 7.0 iperf3k host @@ -162,7 +163,7 @@ lat - lat - lat - hostb udp_rr --nolog -P 10003 -C 10013 -6 -nsout LAT udp_rr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT udp_rr --nolog -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 200 150 @@ -171,13 +172,13 @@ tr UDP throughput over IPv4: ns to host iperf3s host 10003 # (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header -iperf3 BW ns __GW__ 10003 
__TIME__ __OPTS__ -b 8G -l 1472 +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 8G -l 1472 bw __BW__ 0.3 0.5 -iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -b 12G -l 3972 +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 12G -l 3972 bw __BW__ 0.5 0.8 -iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -b 20G -l 16356 +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 20G -l 16356 bw __BW__ 3.0 4.0 -iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -b 30G -l 65492 +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 30G -l 65492 bw __BW__ 6.0 7.0 iperf3k host @@ -187,7 +188,7 @@ lat - lat - lat - hostb udp_rr --nolog -P 10003 -C 10013 -4 -nsout LAT udp_rr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT udp_rr --nolog -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 200 150 @@ -101,7 +101,7 @@ run() { VALGRIND=1 setup passt_in_ns test passt/ndp - test passt/dhcp + test passt_in_ns/dhcp test passt_in_ns/icmp test passt_in_ns/tcp test passt_in_ns/udp @@ -115,7 +115,7 @@ run() { VALGRIND=0 setup passt_in_ns test passt/ndp - test passt/dhcp + test passt_in_ns/dhcp test perf/passt_tcp test perf/passt_udp test perf/pasta_tcp diff --git a/test/valgrind.supp b/test/valgrind.supp index a158394..735b5f6 100644 --- a/test/valgrind.supp +++ b/test/valgrind.supp @@ -6,3 +6,12 @@ ... fun:tcp_sock_consume } + +# same as above, for architectures with the recv() system call (at least i686): +{ + passt_recv_MSG_TRUNC_into_NULL_buffer + Memcheck:Param + socketcall.recv(buf) + ... 
+ fun:tcp_sock_consume +} @@ -178,8 +178,7 @@ enum udp_iov_idx { /* IOVs and msghdr arrays for receiving datagrams from sockets */ static struct iovec udp_iov_recv [UDP_MAX_FRAMES]; -static struct mmsghdr udp4_mh_recv [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_mh_recv [UDP_MAX_FRAMES]; +static struct mmsghdr udp_mh_recv [UDP_MAX_FRAMES]; /* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */ static union sockaddr_inany udp_splice_to; @@ -222,6 +221,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) static void udp_iov_init_one(const struct ctx *c, size_t i) { struct udp_payload_t *payload = &udp_payload[i]; + struct msghdr *mh = &udp_mh_recv[i].msg_hdr; struct udp_meta_t *meta = &udp_meta[i]; struct iovec *siov = &udp_iov_recv[i]; struct iovec *tiov = udp_l2_iov[i]; @@ -236,27 +236,10 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); tiov[UDP_IOV_PAYLOAD].iov_base = payload; - /* It's useful to have separate msghdr arrays for receiving. Otherwise, - * an IPv4 recv() will alter msg_namelen, so we'd have to reset it every - * time or risk truncating the address on future IPv6 recv()s. 
- */ - if (c->ifi4) { - struct msghdr *mh = &udp4_mh_recv[i].msg_hdr; - - mh->msg_name = &meta->s_in; - mh->msg_namelen = sizeof(struct sockaddr_in); - mh->msg_iov = siov; - mh->msg_iovlen = 1; - } - - if (c->ifi6) { - struct msghdr *mh = &udp6_mh_recv[i].msg_hdr; - - mh->msg_name = &meta->s_in; - mh->msg_namelen = sizeof(struct sockaddr_in6); - mh->msg_iov = siov; - mh->msg_iovlen = 1; - } + mh->msg_name = &meta->s_in; + mh->msg_namelen = sizeof(meta->s_in); + mh->msg_iov = siov; + mh->msg_iovlen = 1; } /** @@ -321,7 +304,7 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n, static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen) { - const struct in_addr *src = inany_v4(&toside->faddr); + const struct in_addr *src = inany_v4(&toside->oaddr); const struct in_addr *dst = inany_v4(&toside->eaddr); size_t l4len = dlen + sizeof(bp->uh); size_t l3len = l4len + sizeof(*ip4h); @@ -333,7 +316,7 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, ip4h->saddr = src->s_addr; ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, *src, *dst); - bp->uh.source = htons(toside->fport); + bp->uh.source = htons(toside->oport); bp->uh.dest = htons(toside->eport); bp->uh.len = htons(l4len); csum_udp4(&bp->uh, *src, *dst, bp->data, dlen); @@ -357,15 +340,15 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, ip6h->payload_len = htons(l4len); ip6h->daddr = toside->eaddr.a6; - ip6h->saddr = toside->faddr.a6; + ip6h->saddr = toside->oaddr.a6; ip6h->version = 6; ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = 255; - bp->uh.source = htons(toside->fport); + bp->uh.source = htons(toside->oport); bp->uh.dest = htons(toside->eport); bp->uh.len = ip6h->payload_len; - csum_udp6(&bp->uh, &toside->faddr.a6, &toside->eaddr.a6, bp->data, dlen); + csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, bp->data, dlen); return l4len; } @@ -384,7 +367,7 @@ static 
void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx, struct udp_meta_t *bm = &udp_meta[idx]; size_t l4len; - if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->faddr)) { + if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) { l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); @@ -404,11 +387,12 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx, * udp_sock_recverr() - Receive and clear an error from a socket * @s: Socket to receive from * - * Return: true if errors received and processed, false if no more errors + * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 + * if there was an error reading the queue * * #syscalls recvmsg */ -static bool udp_sock_recverr(int s) +static int udp_sock_recverr(int s) { const struct sock_extended_err *ee; const struct cmsghdr *hdr; @@ -425,14 +409,16 @@ static bool udp_sock_recverr(int s) rc = recvmsg(s, &mh, MSG_ERRQUEUE); if (rc < 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK) - err_perror("Failed to read error queue"); - return false; + if (errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + + err_perror("UDP: Failed to read error queue"); + return -1; } if (!(mh.msg_flags & MSG_ERRQUEUE)) { err("Missing MSG_ERRQUEUE flag reading error queue"); - return false; + return -1; } hdr = CMSG_FIRSTHDR(&mh); @@ -441,7 +427,7 @@ static bool udp_sock_recverr(int s) (hdr->cmsg_level == IPPROTO_IPV6 && hdr->cmsg_type == IPV6_RECVERR))) { err("Unexpected cmsg reading error queue"); - return false; + return -1; } ee = (const struct sock_extended_err *)CMSG_DATA(hdr); @@ -450,7 +436,54 @@ static bool udp_sock_recverr(int s) debug("%s error on UDP socket %i: %s", str_ee_origin(ee), s, strerror(ee->ee_errno)); - return true; + return 1; +} + +/** + * udp_sock_errs() - Process errors on a socket + * @c: Execution context + * @s: Socket to receive from + * @events: epoll events 
bitmap + * + * Return: Number of errors handled, or < 0 if we have an unrecoverable error + */ +static int udp_sock_errs(const struct ctx *c, int s, uint32_t events) +{ + unsigned n_err = 0; + socklen_t errlen; + int rc, err; + + ASSERT(!c->no_udp); + + if (!(events & EPOLLERR)) + return 0; /* Nothing to do */ + + /* Empty the error queue */ + while ((rc = udp_sock_recverr(s)) > 0) + n_err += rc; + + if (rc < 0) + return -1; /* error reading error, unrecoverable */ + + errlen = sizeof(err); + if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 || + errlen != sizeof(err)) { + err_perror("Error reading SO_ERROR"); + return -1; /* error reading error, unrecoverable */ + } + + if (err) { + debug("Unqueued error on UDP socket %i: %s", s, strerror(err)); + n_err++; + } + + if (!n_err) { + /* EPOLLERR, but no errors to clear !? */ + err("EPOLLERR event without reported errors on socket %i", s); + return -1; /* no way to clear, unrecoverable */ + } + + return n_err; } /** @@ -460,7 +493,9 @@ static bool udp_sock_recverr(int s) * @events: epoll events bitmap * @mmh mmsghdr array to receive into * - * #syscalls recvmmsg + * Return: Number of datagrams received + * + * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64 */ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, struct mmsghdr *mmh) @@ -476,12 +511,6 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, ASSERT(!c->no_udp); - /* Clear any errors first */ - if (events & EPOLLERR) { - while (udp_sock_recverr(s)) - ; - } - if (!(events & EPOLLIN)) return 0; @@ -506,10 +535,17 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - struct mmsghdr *mmh_recv = ref.udp.v6 ? 
udp6_mh_recv : udp4_mh_recv; + const socklen_t sasize = sizeof(udp_meta[0].s_in); int n, i; - if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0) + if (udp_sock_errs(c, ref.fd, events) < 0) { + err("UDP: Unrecoverable error on listening socket:" + " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); + /* FIXME: what now? close/re-open socket? */ + return; + } + + if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0) return; /* We divide datagrams into batches based on how we need to send them, @@ -518,6 +554,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, * populate it one entry *ahead* of the loop counter. */ udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now); + udp_mh_recv[0].msg_hdr.msg_namelen = sasize; for (i = 0; i < n; ) { flow_sidx_t batchsidx = udp_meta[i].tosidx; uint8_t batchpif = pif_at_sidx(batchsidx); @@ -525,9 +562,9 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, do { if (pif_is_socket(batchpif)) { - udp_splice_prepare(mmh_recv, i); + udp_splice_prepare(udp_mh_recv, i); } else if (batchpif == PIF_TAP) { - udp_tap_prepare(mmh_recv, i, + udp_tap_prepare(udp_mh_recv, i, flowside_at_sidx(batchsidx)); } @@ -537,6 +574,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, udp_meta[i].tosidx = udp_flow_from_sock(c, ref, &udp_meta[i].s_in, now); + udp_mh_recv[i].msg_hdr.msg_namelen = sasize; } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx)); if (pif_is_socket(batchpif)) { @@ -572,19 +610,23 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - const struct flowside *fromside = flowside_at_sidx(ref.flowside); flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); struct udp_flow *uflow = udp_at_sidx(ref.flowside); int from_s = 
uflow->s[ref.flowside.sidei]; - bool v6 = !inany_v4(&fromside->eaddr); - struct mmsghdr *mmh_recv = v6 ? udp6_mh_recv : udp4_mh_recv; uint8_t topif = pif_at_sidx(tosidx); int n, i; ASSERT(!c->no_udp && uflow); - if ((n = udp_sock_recv(c, from_s, events, mmh_recv)) <= 0) + if (udp_sock_errs(c, from_s, events) < 0) { + flow_err(uflow, "Unrecoverable error on reply socket"); + flow_err_details(uflow); + udp_flow_close(c, uflow); + return; + } + + if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0) return; flow_trace(uflow, "Received %d datagrams on reply socket", n); @@ -592,9 +634,11 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, for (i = 0; i < n; i++) { if (pif_is_socket(topif)) - udp_splice_prepare(mmh_recv, i); + udp_splice_prepare(udp_mh_recv, i); else if (topif == PIF_TAP) - udp_tap_prepare(mmh_recv, i, toside); + udp_tap_prepare(udp_mh_recv, i, toside); + /* Restore sockaddr length clobbered by recvmsg() */ + udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in); } if (pif_is_socket(topif)) { @@ -729,45 +773,58 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { - union udp_listen_epoll_ref uref = { .port = port }; - int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; + union udp_listen_epoll_ref uref = { + .pif = ns ? PIF_SPLICE : PIF_HOST, + .port = port, + }; + int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; ASSERT(!c->no_udp); - if (ns) - uref.pif = PIF_SPLICE; - else - uref.pif = PIF_HOST; - - if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { - uref.v6 = 0; + if (af == AF_UNSPEC && c->ifi4 && c->ifi6) { + int s; + /* Attempt to get a dual stack socket */ if (!ns) { - r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, - addr, ifname, port, uref.u32); - + s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN, + addr, ifname, port, uref.u32); udp_splice_init[V4][port] = s < 0 ? 
-1 : s; + udp_splice_init[V6][port] = s < 0 ? -1 : s; } else { - r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, - &in4addr_loopback, - ifname, port, uref.u32); + s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN, + &in4addr_loopback, ifname, port, uref.u32); udp_splice_ns[V4][port] = s < 0 ? -1 : s; + udp_splice_ns[V6][port] = s < 0 ? -1 : s; } + if (IN_INTERVAL(0, FD_REF_MAX, s)) + return 0; } - if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { - uref.v6 = 1; + if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { + if (!ns) { + r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, + addr, ifname, port, uref.u32); + udp_splice_init[V4][port] = r4 < 0 ? -1 : r4; + } else { + r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, + &in4addr_loopback, + ifname, port, uref.u32); + udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4; + } + } + + if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { if (!ns) { - r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, - addr, ifname, port, uref.u32); + r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, + addr, ifname, port, uref.u32); - udp_splice_init[V6][port] = s < 0 ? -1 : s; + udp_splice_init[V6][port] = r6 < 0 ? -1 : r6; } else { - r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, - &in6addr_loopback, - ifname, port, uref.u32); - udp_splice_ns[V6][port] = s < 0 ? -1 : s; + r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, + &in6addr_loopback, + ifname, port, uref.u32); + udp_splice_ns[V6][port] = r6 < 0 ? 
-1 : r6; } } @@ -26,14 +26,12 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); * union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets * @port: Source port for connected sockets, bound port otherwise * @pif: pif for this socket - * @v6: Set for IPv6 sockets or connections * @u32: Opaque u32 value of reference */ union udp_listen_epoll_ref { struct { in_port_t port; uint8_t pif; - bool v6:1; }; uint32_t u32; }; @@ -8,6 +8,7 @@ #include <errno.h> #include <fcntl.h> #include <sys/uio.h> +#include <unistd.h> #include "util.h" #include "passt.h" @@ -38,8 +39,11 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx) * @c: Execution context * @uflow: UDP flow */ -static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) +void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) { + if (uflow->closed) + return; /* Nothing to do */ + if (uflow->s[INISIDE] >= 0) { /* The listening socket needs to stay in epoll */ close(uflow->s[INISIDE]); @@ -55,6 +59,8 @@ static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); if (!pif_is_socket(uflow->f.pif[TGTSIDE])) flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE)); + + uflow->closed = true; } /** @@ -174,7 +180,7 @@ cancel: * @s_in: Source socket address, filled in by recvmmsg() * @now: Timestamp * - * #syscalls fcntl + * #syscalls fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64 * * Return: sidx for the destination side of the flow for this packet, or * FLOW_SIDX_NONE if we couldn't find or create a flow. 
@@ -256,6 +262,17 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, } /** + * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows) + * @uflow: Flow to handle + * + * Return: true if the connection is ready to free, false otherwise + */ +bool udp_flow_defer(const struct udp_flow *uflow) +{ + return uflow->closed; +} + +/** * udp_flow_timer() - Handler for timed events related to a given flow * @c: Execution context * @uflow: UDP flow @@ -10,6 +10,7 @@ /** * struct udp - Descriptor for a flow of UDP packets * @f: Generic flow information + * @closed: Flow is already closed * @ts: Activity timestamp * @s: Socket fd (or -1) for each side of the flow */ @@ -17,6 +18,7 @@ struct udp_flow { /* Must be first element */ struct flow_common f; + bool closed :1; time_t ts; int s[SIDES]; }; @@ -30,6 +32,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, const void *saddr, const void *daddr, in_port_t srcport, in_port_t dstport, const struct timespec *now); +void udp_flow_close(const struct ctx *c, struct udp_flow *uflow); +bool udp_flow_defer(const struct udp_flow *uflow); bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, const struct timespec *now); @@ -199,8 +199,7 @@ int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type, if (bind_addr) { addr6.sin6_addr = *(struct in6_addr *)bind_addr; - if (!memcmp(bind_addr, &c->ip6.addr_ll, - sizeof(c->ip6.addr_ll))) + if (IN6_IS_ADDR_LINKLOCAL(bind_addr)) addr6.sin6_scope_id = c->ifi6; } return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname, @@ -250,7 +249,7 @@ void sock_probe_mem(struct ctx *c) int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b) { if (a->tv_nsec < b->tv_nsec) { - return (b->tv_nsec - a->tv_nsec) / 1000 + + return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 + (a->tv_sec - b->tv_sec - 1) * 1000000; } @@ -676,6 +675,25 @@ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size) return dst; } +/** eth_ntop() - Convert an 
Ethernet MAC address to text format + * @mac: MAC address + * @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes + * @size: Size of buffer at @dst + * + * Return: On success, a non-null pointer to @dst, NULL on failure + */ +const char *eth_ntop(const unsigned char *mac, char *dst, size_t size) +{ + int len; + + len = snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + if (len < 0 || (size_t)len >= size) + return NULL; + + return dst; +} + /** str_ee_origin() - Convert socket extended error origin to a string * @ee: Socket extended error structure * @@ -710,7 +728,7 @@ void close_open_files(int argc, char **argv) int name, rc; do { - name = getopt_long(argc, argv, "+:F", optfd, NULL); + name = getopt_long(argc, argv, "-:F:", optfd, NULL); if (name == 'F') { errno = 0; @@ -14,6 +14,9 @@ #include <string.h> #include <signal.h> #include <arpa/inet.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <linux/close_range.h> #include "log.h" @@ -92,11 +95,7 @@ #define FD_PROTO(x, proto) \ (IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x))) -#define PORT_EPHEMERAL_MIN ((1 << 15) + (1 << 14)) /* RFC 6335 */ -#define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN) - #define MAC_ZERO ((uint8_t [ETH_ALEN]){ 0 }) -#define MAC_LAA ((uint8_t [ETH_ALEN]){ BIT(1), 0, 0, 0, 0, 0 }) #define MAC_IS_ZERO(addr) (!memcmp((addr), MAC_ZERO, ETH_ALEN)) #ifndef __bswap_constant_16 @@ -160,6 +159,25 @@ struct ctx; /* cppcheck-suppress funcArgNamesDifferent */ __attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); } + +#ifdef CLOSE_RANGE_UNSHARE /* Linux kernel >= 5.9 */ +/* glibc < 2.34 and musl as of 1.2.5 need these */ +#ifndef SYS_close_range +#define SYS_close_range 436 +#endif +__attribute__ ((weak)) +/* cppcheck-suppress funcArgNamesDifferent */ +int close_range(unsigned int first, unsigned int last, int flags) { + return syscall(SYS_close_range, first, last, flags); +} +#else +/* No reasonable fallback 
option */ +/* cppcheck-suppress funcArgNamesDifferent */ +int close_range(unsigned int first, unsigned int last, int flags) { + return 0; +} +#endif + int sock_l4_sa(const struct ctx *c, enum epoll_type type, const void *sa, socklen_t sl, const char *ifname, bool v6only, uint32_t data); @@ -215,9 +233,12 @@ static inline const char *af_name(sa_family_t af) #define SOCKADDR_STRLEN MAX(SOCKADDR_INET_STRLEN, SOCKADDR_INET6_STRLEN) +#define ETH_ADDRSTRLEN (sizeof("00:11:22:33:44:55")) + struct sock_extended_err; const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size); +const char *eth_ntop(const unsigned char *mac, char *dst, size_t size); const char *str_ee_origin(const struct sock_extended_err *ee); /** |