about git code bugs lists chat
diff options
context:
space:
mode:
-rw-r--r-- Makefile 11
-rw-r--r-- README.md 4
-rw-r--r-- arp.c 4
-rw-r--r-- conf.c 343
-rw-r--r-- contrib/apparmor/abstractions/passt 2
-rw-r--r-- contrib/selinux/passt.te 3
-rw-r--r-- contrib/selinux/pasta.te 2
-rw-r--r-- dhcp.c 21
-rw-r--r-- dhcpv6.c 21
-rw-r--r-- flow.c 141
-rw-r--r-- flow.h 25
-rw-r--r-- fwd.c 244
-rw-r--r-- fwd.h 3
-rw-r--r-- icmp.c 4
-rw-r--r-- log.c 16
-rw-r--r-- ndp.c 9
-rw-r--r-- netlink.c 146
-rw-r--r-- netlink.h 6
-rw-r--r-- passt.1 43
-rw-r--r-- passt.c 8
-rw-r--r-- passt.h 53
-rw-r--r-- pasta.c 48
-rwxr-xr-x seccomp.sh 5
-rw-r--r-- tap.c 110
-rw-r--r-- tcp.c 37
-rw-r--r-- tcp_buf.c 11
-rw-r--r-- tcp_internal.h 2
-rw-r--r-- tcp_splice.c 2
-rw-r--r-- test/README.md 9
-rw-r--r-- test/lib/layout 14
-rwxr-xr-x test/lib/setup 21
-rwxr-xr-x test/lib/term 31
-rwxr-xr-x test/lib/test 2
-rwxr-xr-x test/passt.mbuto 10
-rw-r--r-- test/passt_in_ns/dhcp 73
-rw-r--r-- test/passt_in_ns/tcp 38
-rw-r--r-- test/passt_in_ns/udp 22
-rw-r--r-- test/pasta_options/log_to_file 4
-rw-r--r-- test/perf/passt_tcp 47
-rw-r--r-- test/perf/passt_udp 33
-rw-r--r-- test/perf/pasta_tcp 55
-rw-r--r-- test/perf/pasta_udp 27
-rwxr-xr-x test/run 4
-rw-r--r-- test/valgrind.supp 9
-rw-r--r-- udp.c 213
-rw-r--r-- udp.h 2
-rw-r--r-- udp_flow.c 21
-rw-r--r-- udp_flow.h 4
-rw-r--r-- util.c 26
-rw-r--r-- util.h 29
50 files changed, 1379 insertions, 639 deletions
diff --git a/Makefile b/Makefile
index b6329e3..74a9513 100644
--- a/Makefile
+++ b/Makefile
@@ -33,9 +33,16 @@ AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/')
AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/')
+# On some systems enabling optimization also enables source fortification,
+# automagically. Do not override it.
+FORTIFY_FLAG :=
+ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /dev/null; echo $$?),1)
+FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
+endif
+
FLAGS := -Wall -Wextra -Wno-format-zero-length
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
-FLAGS += -D_FORTIFY_SOURCE=2 -O2 -pie -fPIE
+FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
FLAGS += -DNETNS_RUN_DIR=\"/run/netns\"
FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH)
@@ -129,7 +136,7 @@ qrap: $(QRAP_SRCS) passt.h
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
rt_sigreturn getpid gettid kill clock_gettime mmap \
- munmap open unlink gettimeofday futex
+ mmap2 munmap open unlink gettimeofday futex
valgrind: FLAGS += -g -DVALGRIND
valgrind: all
diff --git a/README.md b/README.md
index bb114ea..752e59f 100644
--- a/README.md
+++ b/README.md
@@ -338,7 +338,9 @@ speeding up local connections, and usually requiring NAT. _pasta_:
[_slirp4netns_ replacement](/passt/tree/slirp4netns.sh)
* ✅ out-of-tree patch for
[Kata Containers](/passt/tree/contrib/kata-containers) available
-* ⌚ drop-in replacement for VPNKit (rootless Docker)
+* ✅ rootless Docker
+ [network back-end](https://docs.docker.com/engine/security/rootless/#networking-errors)
+ via moby/rootlesskit
### Availability
* official packages for:
diff --git a/arp.c b/arp.c
index 93b22c5..53334da 100644
--- a/arp.c
+++ b/arp.c
@@ -72,7 +72,7 @@ int arp(const struct ctx *c, const struct pool *p)
ah->ar_op = htons(ARPOP_REPLY);
memcpy(am->tha, am->sha, sizeof(am->tha));
- memcpy(am->sha, c->mac, sizeof(am->sha));
+ memcpy(am->sha, c->our_tap_mac, sizeof(am->sha));
memcpy(swap, am->tip, sizeof(am->tip));
memcpy(am->tip, am->sip, sizeof(am->tip));
@@ -80,7 +80,7 @@ int arp(const struct ctx *c, const struct pool *p)
l2len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest));
- memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
+ memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
tap_send_single(c, eh, l2len);
diff --git a/conf.c b/conf.c
index ed097bd..b275886 100644
--- a/conf.c
+++ b/conf.c
@@ -156,9 +156,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
die("'all' port forwarding is only allowed for passt");
fwd->mode = FWD_ALL;
- memset(fwd->map, 0xff, PORT_EPHEMERAL_MIN / 8);
- for (i = 0; i < PORT_EPHEMERAL_MIN; i++) {
+ /* Skip port 0. It has special meaning for many socket APIs, so
+ * trying to bind it is not really safe.
+ */
+ for (i = 1; i < NUM_PORTS; i++) {
+ if (fwd_port_is_ephemeral(i))
+ continue;
+
+ bitmap_set(fwd->map, i);
if (optname == 't') {
ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL,
i);
@@ -259,8 +265,12 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
} while ((p = next_chunk(p, ',')));
if (exclude_only) {
- for (i = 0; i < PORT_EPHEMERAL_MIN; i++) {
- if (bitmap_isset(exclude, i))
+ /* Skip port 0. It has special meaning for many socket APIs, so
+ * trying to bind it is not really safe.
+ */
+ for (i = 1; i < NUM_PORTS; i++) {
+ if (fwd_port_is_ephemeral(i) ||
+ bitmap_isset(exclude, i))
continue;
bitmap_set(fwd->map, i);
@@ -353,55 +363,93 @@ bind_all_fail:
/**
* add_dns4() - Possibly add the IPv4 address of a DNS resolver to configuration
* @c: Execution context
- * @addr: Address found in /etc/resolv.conf
- * @conf: Pointer to reference of current entry in array of IPv4 resolvers
+ * @addr: Guest nameserver IPv4 address
+ * @idx: Index of free entry in array of IPv4 resolvers
+ *
+ * Return: Number of entries added (0 or 1)
*/
-static void add_dns4(struct ctx *c, const struct in_addr *addr,
- struct in_addr **conf)
+static unsigned add_dns4(struct ctx *c, const struct in_addr *addr,
+ unsigned idx)
{
- /* Guest or container can only access local addresses via redirect */
- if (IN4_IS_ADDR_LOOPBACK(addr)) {
- if (!c->no_map_gw) {
- **conf = c->ip4.gw;
- (*conf)++;
-
- if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
- c->ip4.dns_match = c->ip4.gw;
- }
- } else {
- **conf = *addr;
- (*conf)++;
- }
+ if (idx >= ARRAY_SIZE(c->ip4.dns))
+ return 0;
- if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
- c->ip4.dns_host = *addr;
+ c->ip4.dns[idx] = *addr;
+ return 1;
}
/**
* add_dns6() - Possibly add the IPv6 address of a DNS resolver to configuration
* @c: Execution context
- * @addr: Address found in /etc/resolv.conf
- * @conf: Pointer to reference of current entry in array of IPv6 resolvers
+ * @addr: Guest nameserver IPv6 address
+ * @idx: Index of free entry in array of IPv6 resolvers
+ *
+ * Return: Number of entries added (0 or 1)
*/
-static void add_dns6(struct ctx *c,
- struct in6_addr *addr, struct in6_addr **conf)
+static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr,
+ unsigned idx)
{
- /* Guest or container can only access local addresses via redirect */
- if (IN6_IS_ADDR_LOOPBACK(addr)) {
- if (!c->no_map_gw) {
- memcpy(*conf, &c->ip6.gw, sizeof(**conf));
- (*conf)++;
+ if (idx >= ARRAY_SIZE(c->ip6.dns))
+ return 0;
- if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
- memcpy(&c->ip6.dns_match, addr, sizeof(*addr));
+ c->ip6.dns[idx] = *addr;
+ return 1;
+}
+
+/**
+ * add_dns_resolv() - Possibly add ns from host resolv.conf to configuration
+ * @c: Execution context
+ * @nameserver: Nameserver address string from /etc/resolv.conf
+ * @idx4: Pointer to index of current entry in array of IPv4 resolvers
+ * @idx6: Pointer to index of current entry in array of IPv6 resolvers
+ *
+ * @idx4 or @idx6 may be NULL, in which case resolvers of the corresponding type
+ * are ignored.
+ */
+static void add_dns_resolv(struct ctx *c, const char *nameserver,
+ unsigned *idx4, unsigned *idx6)
+{
+ struct in6_addr ns6;
+ struct in_addr ns4;
+
+ if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) {
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
+ c->ip4.dns_host = ns4;
+
+ /* Guest or container can only access local addresses via
+ * redirect
+ */
+ if (IN4_IS_ADDR_LOOPBACK(&ns4)) {
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
+ return;
+
+ ns4 = c->ip4.map_host_loopback;
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
+ c->ip4.dns_match = c->ip4.map_host_loopback;
}
- } else {
- memcpy(*conf, addr, sizeof(**conf));
- (*conf)++;
+
+ *idx4 += add_dns4(c, &ns4, *idx4);
}
- if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
- c->ip6.dns_host = *addr;
+ if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
+ c->ip6.dns_host = ns6;
+
+ /* Guest or container can only access local addresses via
+ * redirect
+ */
+ if (IN6_IS_ADDR_LOOPBACK(&ns6)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
+ return;
+
+ ns6 = c->ip6.map_host_loopback;
+
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
+ c->ip6.dns_match = c->ip6.map_host_loopback;
+ }
+
+ *idx6 += add_dns6(c, &ns6, *idx6);
+ }
}
/**
@@ -410,18 +458,16 @@ static void add_dns6(struct ctx *c,
*/
static void get_dns(struct ctx *c)
{
- struct in6_addr *dns6 = &c->ip6.dns[0], dns6_tmp;
- struct in_addr *dns4 = &c->ip4.dns[0], dns4_tmp;
int dns4_set, dns6_set, dnss_set, dns_set, fd;
+ unsigned dns4_idx = 0, dns6_idx = 0;
struct fqdn *s = c->dns_search;
struct lineread resolvconf;
- unsigned int added = 0;
ssize_t line_len;
char *line, *end;
const char *p;
- dns4_set = !c->ifi4 || !IN4_IS_ADDR_UNSPECIFIED(dns4);
- dns6_set = !c->ifi6 || !IN6_IS_ADDR_UNSPECIFIED(dns6);
+ dns4_set = !c->ifi4 || !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[0]);
+ dns6_set = !c->ifi6 || !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[0]);
dnss_set = !!*s->n || c->no_dns_search;
dns_set = (dns4_set && dns6_set) || c->no_dns;
@@ -442,19 +488,9 @@ static void get_dns(struct ctx *c)
if (end)
*end = 0;
- if (!dns4_set &&
- dns4 - &c->ip4.dns[0] < ARRAY_SIZE(c->ip4.dns) - 1
- && inet_pton(AF_INET, p + 1, &dns4_tmp)) {
- add_dns4(c, &dns4_tmp, &dns4);
- added++;
- }
-
- if (!dns6_set &&
- dns6 - &c->ip6.dns[0] < ARRAY_SIZE(c->ip6.dns) - 1
- && inet_pton(AF_INET6, p + 1, &dns6_tmp)) {
- add_dns6(c, &dns6_tmp, &dns6);
- added++;
- }
+ add_dns_resolv(c, p + 1,
+ dns4_set ? NULL : &dns4_idx,
+ dns6_set ? NULL : &dns6_idx);
} else if (!dnss_set && strstr(line, "search ") == line &&
s == c->dns_search) {
end = strpbrk(line, "\n");
@@ -481,7 +517,7 @@ static void get_dns(struct ctx *c)
out:
if (!dns_set) {
- if (!added)
+ if (!(dns4_idx + dns6_idx))
warn("Couldn't get any nameserver address");
if (c->no_dhcp_dns)
@@ -586,12 +622,10 @@ static int conf_ip4_prefix(const char *arg)
* conf_ip4() - Verify or detect IPv4 support, get relevant addresses
* @ifi: Host interface to attempt (0 to determine one)
* @ip4: IPv4 context (will be written)
- * @mac: MAC address to use (written if unset)
*
* Return: Interface index for IPv4, or 0 on failure.
*/
-static unsigned int conf_ip4(unsigned int ifi,
- struct ip4_ctx *ip4, unsigned char *mac)
+static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
{
if (!ifi)
ifi = nl_get_ext_if(nl_sock, AF_INET);
@@ -601,8 +635,9 @@ static unsigned int conf_ip4(unsigned int ifi,
return 0;
}
- if (IN4_IS_ADDR_UNSPECIFIED(&ip4->gw)) {
- int rc = nl_route_get_def(nl_sock, ifi, AF_INET, &ip4->gw);
+ if (IN4_IS_ADDR_UNSPECIFIED(&ip4->guest_gw)) {
+ int rc = nl_route_get_def(nl_sock, ifi, AF_INET,
+ &ip4->guest_gw);
if (rc < 0) {
err("Couldn't discover IPv4 gateway address: %s",
strerror(-rc));
@@ -632,21 +667,9 @@ static unsigned int conf_ip4(unsigned int ifi,
ip4->prefix_len = 32;
}
- memcpy(&ip4->addr_seen, &ip4->addr, sizeof(ip4->addr_seen));
+ ip4->addr_seen = ip4->addr;
- if (MAC_IS_ZERO(mac)) {
- int rc = nl_link_get_mac(nl_sock, ifi, mac);
- if (rc < 0) {
- char ifname[IFNAMSIZ];
-
- err("Couldn't discover MAC address for %s: %s",
- if_indextoname(ifi, ifname), strerror(-rc));
- return 0;
- }
-
- if (MAC_IS_ZERO(mac))
- memcpy(mac, MAC_LAA, ETH_ALEN);
- }
+ ip4->our_tap_addr = ip4->guest_gw;
if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr))
return 0;
@@ -658,12 +681,10 @@ static unsigned int conf_ip4(unsigned int ifi,
* conf_ip6() - Verify or detect IPv6 support, get relevant addresses
* @ifi: Host interface to attempt (0 to determine one)
* @ip6: IPv6 context (will be written)
- * @mac: MAC address to use (written if unset)
*
* Return: Interface index for IPv6, or 0 on failure.
*/
-static unsigned int conf_ip6(unsigned int ifi,
- struct ip6_ctx *ip6, unsigned char *mac)
+static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
{
int prefix_len = 0;
int rc;
@@ -676,8 +697,8 @@ static unsigned int conf_ip6(unsigned int ifi,
return 0;
}
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6->gw)) {
- rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->gw);
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->guest_gw)) {
+ rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->guest_gw);
if (rc < 0) {
err("Couldn't discover IPv6 gateway address: %s",
strerror(-rc));
@@ -687,30 +708,19 @@ static unsigned int conf_ip6(unsigned int ifi,
rc = nl_addr_get(nl_sock, ifi, AF_INET6,
IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ? &ip6->addr : NULL,
- &prefix_len, &ip6->addr_ll);
+ &prefix_len, &ip6->our_tap_ll);
if (rc < 0) {
err("Couldn't discover IPv6 address: %s", strerror(-rc));
return 0;
}
- memcpy(&ip6->addr_seen, &ip6->addr, sizeof(ip6->addr));
- memcpy(&ip6->addr_ll_seen, &ip6->addr_ll, sizeof(ip6->addr_ll));
-
- if (MAC_IS_ZERO(mac)) {
- rc = nl_link_get_mac(nl_sock, ifi, mac);
- if (rc < 0) {
- char ifname[IFNAMSIZ];
- err("Couldn't discover MAC address for %s: %s",
- if_indextoname(ifi, ifname), strerror(-rc));
- return 0;
- }
+ ip6->addr_seen = ip6->addr;
- if (MAC_IS_ZERO(mac))
- memcpy(mac, MAC_LAA, ETH_ALEN);
- }
+ if (IN6_IS_ADDR_LINKLOCAL(&ip6->guest_gw))
+ ip6->our_tap_ll = ip6->guest_gw;
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ||
- IN6_IS_ADDR_UNSPECIFIED(&ip6->addr_ll))
+ IN6_IS_ADDR_UNSPECIFIED(&ip6->our_tap_ll))
return 0;
return ifi;
@@ -817,6 +827,12 @@ static void usage(const char *name, FILE *f, int status)
fprintf(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n");
fprintf(f,
+ " --map-host-loopback ADDR Translate ADDR to refer to host\n"
+ " can be specified zero to two times (for IPv4 and IPv6)\n"
+ " default: gateway address\n"
+ " --map-guest-addr ADDR Translate ADDR to guest's address\n"
+ " can be specified zero to two times (for IPv4 and IPv6)\n"
+ " default: none\n"
" --dns-forward ADDR Forward DNS queries sent to ADDR\n"
" can be specified zero to two times (for IPv4 and IPv6)\n"
" default: don't forward DNS queries\n"
@@ -921,7 +937,8 @@ pasta_opts:
*/
static void conf_print(const struct ctx *c)
{
- char buf4[INET_ADDRSTRLEN], buf6[INET6_ADDRSTRLEN], ifn[IFNAMSIZ];
+ char buf4[INET_ADDRSTRLEN], buf6[INET6_ADDRSTRLEN];
+ char bufmac[ETH_ADDRSTRLEN], ifn[IFNAMSIZ];
int i;
info("Template interface: %s%s%s%s%s",
@@ -955,11 +972,14 @@ static void conf_print(const struct ctx *c)
info("Namespace interface: %s", c->pasta_ifn);
info("MAC:");
- info(" host: %02x:%02x:%02x:%02x:%02x:%02x",
- c->mac[0], c->mac[1], c->mac[2],
- c->mac[3], c->mac[4], c->mac[5]);
+ info(" host: %s", eth_ntop(c->our_tap_mac, bufmac, sizeof(bufmac)));
if (c->ifi4) {
+ if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
+ info(" NAT to host 127.0.0.1: %s",
+ inet_ntop(AF_INET, &c->ip4.map_host_loopback,
+ buf4, sizeof(buf4)));
+
if (!c->no_dhcp) {
uint32_t mask;
@@ -971,7 +991,8 @@ static void conf_print(const struct ctx *c)
info(" mask: %s",
inet_ntop(AF_INET, &mask, buf4, sizeof(buf4)));
info(" router: %s",
- inet_ntop(AF_INET, &c->ip4.gw, buf4, sizeof(buf4)));
+ inet_ntop(AF_INET, &c->ip4.guest_gw,
+ buf4, sizeof(buf4)));
}
for (i = 0; !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[i]); i++) {
@@ -989,6 +1010,11 @@ static void conf_print(const struct ctx *c)
}
if (c->ifi6) {
+ if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
+ info(" NAT to host ::1: %s",
+ inet_ntop(AF_INET6, &c->ip6.map_host_loopback,
+ buf6, sizeof(buf6)));
+
if (!c->no_ndp && !c->no_dhcpv6)
info("NDP/DHCPv6:");
else if (!c->no_ndp)
@@ -1001,9 +1027,10 @@ static void conf_print(const struct ctx *c)
info(" assign: %s",
inet_ntop(AF_INET6, &c->ip6.addr, buf6, sizeof(buf6)));
info(" router: %s",
- inet_ntop(AF_INET6, &c->ip6.gw, buf6, sizeof(buf6)));
+ inet_ntop(AF_INET6, &c->ip6.guest_gw, buf6, sizeof(buf6)));
info(" our link-local: %s",
- inet_ntop(AF_INET6, &c->ip6.addr_ll, buf6, sizeof(buf6)));
+ inet_ntop(AF_INET6, &c->ip6.our_tap_ll,
+ buf6, sizeof(buf6)));
dns6:
for (i = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[i]); i++) {
@@ -1122,6 +1149,46 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
}
/**
+ * conf_nat() - Parse --map-host-loopback or --map-guest-addr option
+ * @arg:		String argument to option: an address, or "none"
+ * @addr4:		IPv4 to update with parsed address
+ * @addr6:		IPv6 to update with parsed address
+ * @no_map_gw:	--no-map-gw flag, or NULL, updated for "none" argument
+ *
+ * "none" resets both addresses to unspecified. Otherwise @arg must parse as an
+ * IPv6 or IPv4 address that is not unspecified, loopback or multicast, and
+ * only the address of the matching family is written; anything else is fatal.
+ */
+static void conf_nat(const char *arg, struct in_addr *addr4,
+		     struct in6_addr *addr6, int *no_map_gw)
+{
+	if (strcmp(arg, "none") == 0) {
+		*addr4 = in4addr_any;
+		*addr6 = in6addr_any;
+		if (no_map_gw)
+			*no_map_gw = 1;
+
+		/* "none" is not an address: don't fall through to die() */
+		return;
+	}
+
+	if (inet_pton(AF_INET6, arg, addr6)	&&
+	    !IN6_IS_ADDR_UNSPECIFIED(addr6)	&&
+	    !IN6_IS_ADDR_LOOPBACK(addr6)	&&
+	    !IN6_IS_ADDR_MULTICAST(addr6))
+		return;
+
+	if (inet_pton(AF_INET, arg, addr4)	&&
+	    !IN4_IS_ADDR_UNSPECIFIED(addr4)	&&
+	    !IN4_IS_ADDR_LOOPBACK(addr4)	&&
+	    !IN4_IS_ADDR_MULTICAST(addr4))
+		return;
+
+	/* Report @arg itself rather than relying on getopt's global optarg */
+	die("Invalid address to remap to host: %s", arg);
+}
+
+/**
* conf_open_files() - Open files as requested by configuration
* @c: Execution context
*/
@@ -1174,7 +1233,7 @@ fail:
*/
void conf(struct ctx *c, int argc, char **argv)
{
- int netns_only = 0;
+ int netns_only = 0, no_map_gw = 0;
const struct option options[] = {
{"debug", no_argument, NULL, 'd' },
{"quiet", no_argument, NULL, 'q' },
@@ -1203,7 +1262,7 @@ void conf(struct ctx *c, int argc, char **argv)
{"no-dhcpv6", no_argument, &c->no_dhcpv6, 1 },
{"no-ndp", no_argument, &c->no_ndp, 1 },
{"no-ra", no_argument, &c->no_ra, 1 },
- {"no-map-gw", no_argument, &c->no_map_gw, 1 },
+ {"no-map-gw", no_argument, &no_map_gw, 1 },
{"ipv4-only", no_argument, NULL, '4' },
{"ipv6-only", no_argument, NULL, '6' },
{"one-off", no_argument, NULL, '1' },
@@ -1230,6 +1289,8 @@ void conf(struct ctx *c, int argc, char **argv)
{"no-copy-routes", no_argument, NULL, 18 },
{"no-copy-addrs", no_argument, NULL, 19 },
{"netns-only", no_argument, NULL, 20 },
+ {"map-host-loopback", required_argument, NULL, 21 },
+ {"map-guest-addr", required_argument, NULL, 22 },
{ 0 },
};
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@@ -1237,8 +1298,7 @@ void conf(struct ctx *c, int argc, char **argv)
bool copy_addrs_opt = false, copy_routes_opt = false;
enum fwd_ports_mode fwd_default = FWD_NONE;
bool v4_only = false, v6_only = false;
- struct in6_addr *dns6 = c->ip6.dns;
- struct in_addr *dns4 = c->ip4.dns;
+ unsigned dns4_idx = 0, dns6_idx = 0;
struct fqdn *dnss = c->dns_search;
unsigned int ifi4 = 0, ifi6 = 0;
const char *logfile = NULL;
@@ -1260,8 +1320,9 @@ void conf(struct ctx *c, int argc, char **argv)
c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
+ memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
- optind = 1;
+ optind = 0;
do {
name = getopt_long(argc, argv, optstring, options, NULL);
@@ -1290,7 +1351,7 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->mode != MODE_PASTA)
die("--ns-mac-addr is for pasta mode only");
- parse_mac(c->mac_guest, optarg);
+ parse_mac(c->guest_mac, optarg);
break;
case 5:
if (c->mode != MODE_PASTA)
@@ -1399,6 +1460,14 @@ void conf(struct ctx *c, int argc, char **argv)
netns_only = 1;
*userns = 0;
break;
+ case 21:
+ conf_nat(optarg, &c->ip4.map_host_loopback,
+ &c->ip6.map_host_loopback, &no_map_gw);
+ break;
+ case 22:
+ conf_nat(optarg, &c->ip4.map_guest_addr,
+ &c->ip6.map_guest_addr, NULL);
+ break;
case 'd':
c->debug = 1;
c->quiet = 0;
@@ -1501,21 +1570,21 @@ void conf(struct ctx *c, int argc, char **argv)
break;
case 'M':
- parse_mac(c->mac, optarg);
+ parse_mac(c->our_tap_mac, optarg);
break;
case 'g':
- if (inet_pton(AF_INET6, optarg, &c->ip6.gw) &&
- !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.gw) &&
- !IN6_IS_ADDR_LOOPBACK(&c->ip6.gw)) {
+ if (inet_pton(AF_INET6, optarg, &c->ip6.guest_gw) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.guest_gw) &&
+ !IN6_IS_ADDR_LOOPBACK(&c->ip6.guest_gw)) {
if (c->mode == MODE_PASTA)
c->ip6.no_copy_routes = true;
break;
}
- if (inet_pton(AF_INET, optarg, &c->ip4.gw) &&
- !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.gw) &&
- !IN4_IS_ADDR_BROADCAST(&c->ip4.gw) &&
- !IN4_IS_ADDR_LOOPBACK(&c->ip4.gw)) {
+ if (inet_pton(AF_INET, optarg, &c->ip4.guest_gw) &&
+ !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw) &&
+ !IN4_IS_ADDR_BROADCAST(&c->ip4.guest_gw) &&
+ !IN4_IS_ADDR_LOOPBACK(&c->ip4.guest_gw)) {
if (c->mode == MODE_PASTA)
c->ip4.no_copy_routes = true;
break;
@@ -1630,25 +1699,31 @@ void conf(struct ctx *c, int argc, char **argv)
nl_sock_init(c, false);
if (!v6_only)
- c->ifi4 = conf_ip4(ifi4, &c->ip4, c->mac);
+ c->ifi4 = conf_ip4(ifi4, &c->ip4);
if (!v4_only)
- c->ifi6 = conf_ip6(ifi6, &c->ip6, c->mac);
+ c->ifi6 = conf_ip6(ifi6, &c->ip6);
if ((!c->ifi4 && !c->ifi6) ||
(*c->ip4.ifname_out && !c->ifi4) ||
(*c->ip6.ifname_out && !c->ifi6))
die("External interface not usable");
- if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.gw))
- c->no_map_gw = c->no_dhcp = 1;
+ if (c->ifi4 && !no_map_gw &&
+ IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
+ c->ip4.map_host_loopback = c->ip4.guest_gw;
- if (c->ifi6 && IN6_IS_ADDR_UNSPECIFIED(&c->ip6.gw))
- c->no_map_gw = 1;
+ if (c->ifi6 && !no_map_gw &&
+ IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
+ c->ip6.map_host_loopback = c->ip6.guest_gw;
+
+ if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
+ c->no_dhcp = 1;
/* Inbound port options & DNS can be parsed now (after IPv4/IPv6
* settings)
*/
+ fwd_probe_ephemeral();
udp_portmap_clear();
- optind = 1;
+ optind = 0;
do {
name = getopt_long(argc, argv, optstring, options, NULL);
@@ -1663,13 +1738,13 @@ void conf(struct ctx *c, int argc, char **argv)
if (!strcmp(optarg, "none")) {
c->no_dns = 1;
- dns4 = &c->ip4.dns[0];
+ dns4_idx = 0;
memset(c->ip4.dns, 0, sizeof(c->ip4.dns));
c->ip4.dns[0] = (struct in_addr){ 0 };
c->ip4.dns_match = (struct in_addr){ 0 };
c->ip4.dns_host = (struct in_addr){ 0 };
- dns6 = &c->ip6.dns[0];
+ dns6_idx = 0;
memset(c->ip6.dns, 0, sizeof(c->ip6.dns));
c->ip6.dns_match = (struct in6_addr){ 0 };
c->ip6.dns_host = (struct in6_addr){ 0 };
@@ -1679,15 +1754,13 @@ void conf(struct ctx *c, int argc, char **argv)
c->no_dns = 0;
- if (dns4 - &c->ip4.dns[0] < ARRAY_SIZE(c->ip4.dns) &&
- inet_pton(AF_INET, optarg, &dns4_tmp)) {
- add_dns4(c, &dns4_tmp, &dns4);
+ if (inet_pton(AF_INET, optarg, &dns4_tmp)) {
+ dns4_idx += add_dns4(c, &dns4_tmp, dns4_idx);
continue;
}
- if (dns6 - &c->ip6.dns[0] < ARRAY_SIZE(c->ip6.dns) &&
- inet_pton(AF_INET6, optarg, &dns6_tmp)) {
- add_dns6(c, &dns6_tmp, &dns6);
+ if (inet_pton(AF_INET6, optarg, &dns6_tmp)) {
+ dns6_idx += add_dns6(c, &dns6_tmp, dns6_idx);
continue;
}
@@ -1720,7 +1793,7 @@ void conf(struct ctx *c, int argc, char **argv)
nl_sock_init(c, true);
/* ...and outbound port options now that namespaces are set up. */
- optind = 1;
+ optind = 0;
do {
name = getopt_long(argc, argv, optstring, options, NULL);
diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt
index d245115..43fd63f 100644
--- a/contrib/apparmor/abstractions/passt
+++ b/contrib/apparmor/abstractions/passt
@@ -34,6 +34,8 @@
owner @{PROC}/@{pid}/uid_map r, # conf_ugid()
+ @{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral()
+
network netlink raw, # nl_sock_init_do(), netlink.c
network inet stream, # tcp.c
diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index bbb0917..80bf780 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -50,6 +50,7 @@ require {
type passwd_file_t;
class netlink_route_socket { bind create nlmsg_read };
+ type sysctl_net_t;
class capability { sys_tty_config setuid setgid };
class cap_userns { setpcap sys_admin sys_ptrace };
@@ -104,6 +105,8 @@ allow passt_t net_conf_t:lnk_file read;
allow passt_t tmp_t:sock_file { create unlink write };
allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
kernel_search_network_sysctl(passt_t)
+allow passt_t sysctl_net_t:dir search;
+allow passt_t sysctl_net_t:file { open read };
corenet_tcp_bind_all_nodes(passt_t)
corenet_udp_bind_all_nodes(passt_t)
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 4e36c3f..310383c 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -196,7 +196,7 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch };
allow pasta_t self:tun_socket create;
allow pasta_t tun_tap_device_t:chr_file { ioctl open read write };
allow pasta_t sysctl_net_t:dir search;
-allow pasta_t sysctl_net_t:file { open write };
+allow pasta_t sysctl_net_t:file { open read write };
allow pasta_t kernel_t:system module_request;
allow pasta_t nsfs_t:file read;
diff --git a/dhcp.c b/dhcp.c
index aa9f59d..a06f143 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -276,6 +276,7 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
int dhcp(const struct ctx *c, const struct pool *p)
{
size_t mlen, dlen, offset = 0, opt_len, opt_off = 0;
+ char macstr[ETH_ADDRSTRLEN];
const struct ethhdr *eh;
const struct iphdr *iph;
const struct udphdr *uh;
@@ -340,26 +341,26 @@ int dhcp(const struct ctx *c, const struct pool *p)
return -1;
}
- info(" from %02x:%02x:%02x:%02x:%02x:%02x",
- m->chaddr[0], m->chaddr[1], m->chaddr[2],
- m->chaddr[3], m->chaddr[4], m->chaddr[5]);
+ info(" from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr)));
m->yiaddr = c->ip4.addr;
mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len));
- memcpy(opts[1].s, &mask, sizeof(mask));
- memcpy(opts[3].s, &c->ip4.gw, sizeof(c->ip4.gw));
- memcpy(opts[54].s, &c->ip4.gw, sizeof(c->ip4.gw));
+ memcpy(opts[1].s, &mask, sizeof(mask));
+ memcpy(opts[3].s, &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
+ memcpy(opts[54].s, &c->ip4.our_tap_addr, sizeof(c->ip4.our_tap_addr));
/* If the gateway is not on the assigned subnet, send an option 121
* (Classless Static Routing) adding a dummy route to it.
*/
if ((c->ip4.addr.s_addr & mask.s_addr)
- != (c->ip4.gw.s_addr & mask.s_addr)) {
+ != (c->ip4.guest_gw.s_addr & mask.s_addr)) {
/* a.b.c.d/32:0.0.0.0, 0:a.b.c.d */
opts[121].slen = 14;
opts[121].s[0] = 32;
- memcpy(opts[121].s + 1, &c->ip4.gw, sizeof(c->ip4.gw));
- memcpy(opts[121].s + 10, &c->ip4.gw, sizeof(c->ip4.gw));
+ memcpy(opts[121].s + 1,
+ &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
+ memcpy(opts[121].s + 10,
+ &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
}
if (c->mtu != -1) {
@@ -378,7 +379,7 @@ int dhcp(const struct ctx *c, const struct pool *p)
opt_set_dns_search(c, sizeof(m->o));
dlen = offsetof(struct msg, o) + fill(m);
- tap_udp4_send(c, c->ip4.gw, 67, c->ip4.addr, 68, m, dlen);
+ tap_udp4_send(c, c->ip4.our_tap_addr, 67, c->ip4.addr, 68, m, dlen);
return 1;
}
diff --git a/dhcpv6.c b/dhcpv6.c
index 7dcca2a..14a5c7e 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -298,7 +298,8 @@ static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
{
char buf[INET6_ADDRSTRLEN];
struct in6_addr req_addr;
- struct opt_hdr *ia, *h;
+ const struct opt_hdr *h;
+ struct opt_hdr *ia;
size_t offset;
int ia_type;
@@ -312,12 +313,13 @@ ia_ta:
offset += sizeof(struct opt_ia_na);
while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
- struct opt_ia_addr *opt_addr = (struct opt_ia_addr *)h;
+ const struct opt_ia_addr *opt_addr;
if (ntohs(h->l) != OPT_VSIZE(ia_addr))
return NULL;
- memcpy(&req_addr, &opt_addr->addr, sizeof(req_addr));
+ opt_addr = (const struct opt_ia_addr *)h;
+ req_addr = opt_addr->addr;
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) {
info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, &req_addr,
@@ -363,7 +365,7 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset)
srv->hdr.l = 0;
}
- memcpy(&srv->addr[i], &c->ip6.dns[i], sizeof(srv->addr[i]));
+ srv->addr[i] = c->ip6.dns[i];
srv->hdr.l += sizeof(srv->addr[i]);
offset += sizeof(srv->addr[i]);
}
@@ -451,10 +453,7 @@ int dhcpv6(struct ctx *c, const struct pool *p,
c->ip6.addr_ll_seen = *saddr;
- if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
- src = &c->ip6.gw;
- else
- src = &c->ip6.addr_ll;
+ src = &c->ip6.our_tap_ll;
mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL);
if (!mh)
@@ -574,8 +573,10 @@ void dhcpv6_init(const struct ctx *c)
resp.server_id.duid_time = duid_time;
resp_not_on_link.server_id.duid_time = duid_time;
- memcpy(resp.server_id.duid_lladdr, c->mac, sizeof(c->mac));
- memcpy(resp_not_on_link.server_id.duid_lladdr, c->mac, sizeof(c->mac));
+ memcpy(resp.server_id.duid_lladdr,
+ c->our_tap_mac, sizeof(c->our_tap_mac));
+ memcpy(resp_not_on_link.server_id.duid_lladdr,
+ c->our_tap_mac, sizeof(c->our_tap_mac));
resp.ia_addr.addr = c->ip6.addr;
}
diff --git a/flow.c b/flow.c
index 687e9fd..1ea112b 100644
--- a/flow.c
+++ b/flow.c
@@ -127,18 +127,18 @@ static struct timespec flow_timer_run;
* @af: Address family (AF_INET or AF_INET6)
* @eaddr: Endpoint address (pointer to in_addr or in6_addr)
* @eport: Endpoint port
- * @faddr: Forwarding address (pointer to in_addr or in6_addr)
- * @fport: Forwarding port
+ * @oaddr: Our address (pointer to in_addr or in6_addr)
+ * @oport: Our port
*/
static void flowside_from_af(struct flowside *side, sa_family_t af,
const void *eaddr, in_port_t eport,
- const void *faddr, in_port_t fport)
+ const void *oaddr, in_port_t oport)
{
- if (faddr)
- inany_from_af(&side->faddr, af, faddr);
+ if (oaddr)
+ inany_from_af(&side->oaddr, af, oaddr);
else
- side->faddr = inany_any6;
- side->fport = fport;
+ side->oaddr = inany_any6;
+ side->oport = oport;
if (eaddr)
inany_from_af(&side->eaddr, af, eaddr);
@@ -193,8 +193,8 @@ static int flowside_sock_splice(void *arg)
* @tgt: Target flowside
* @data: epoll reference portion for protocol handlers
*
- * Return: socket fd of protocol @proto bound to the forwarding address and port
- * from @tgt (if specified).
+ * Return: socket fd of protocol @proto bound to our address and port from @tgt
+ * (if specified).
*/
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
const struct flowside *tgt, uint32_t data)
@@ -205,11 +205,11 @@ int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
ASSERT(pif_is_socket(pif));
- pif_sockaddr(c, &sa, &sl, pif, &tgt->faddr, tgt->fport);
+ pif_sockaddr(c, &sa, &sl, pif, &tgt->oaddr, tgt->oport);
switch (pif) {
case PIF_HOST:
- if (inany_is_loopback(&tgt->faddr))
+ if (inany_is_loopback(&tgt->oaddr))
ifname = NULL;
else if (sa.sa_family == AF_INET)
ifname = c->ip4.ifname_out;
@@ -283,46 +283,60 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
}
-/**
- * flow_set_state() - Change flow's state
- * @f: Flow changing state
- * @state: New state
+/** flow_log_details_() - Log the details of a flow
+ * @f: flow to log
+ * @pri: Log priority
+ * @state: State to log details according to
+ *
+ * Logs the details of the flow: endpoints, interfaces, type etc.
*/
-static void flow_set_state(struct flow_common *f, enum flow_state state)
+void flow_log_details_(const struct flow_common *f, int pri,
+ enum flow_state state)
{
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
const struct flowside *ini = &f->side[INISIDE];
const struct flowside *tgt = &f->side[TGTSIDE];
- uint8_t oldstate = f->state;
-
- ASSERT(state < FLOW_NUM_STATES);
- ASSERT(oldstate < FLOW_NUM_STATES);
- f->state = state;
- flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
- FLOW_STATE(f));
-
- if (MAX(state, oldstate) >= FLOW_STATE_TGT)
- flow_log_(f, LOG_DEBUG,
+ if (state >= FLOW_STATE_TGT)
+ flow_log_(f, pri,
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
ini->eport,
- inany_ntop(&ini->faddr, fstr0, sizeof(fstr0)),
- ini->fport,
+ inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
+ ini->oport,
pif_name(f->pif[TGTSIDE]),
- inany_ntop(&tgt->faddr, fstr1, sizeof(fstr1)),
- tgt->fport,
+ inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)),
+ tgt->oport,
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
tgt->eport);
- else if (MAX(state, oldstate) >= FLOW_STATE_INI)
- flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => ?",
+ else if (state >= FLOW_STATE_INI)
+ flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
ini->eport,
- inany_ntop(&ini->faddr, fstr0, sizeof(fstr0)),
- ini->fport);
+ inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
+ ini->oport);
+}
+
+/**
+ * flow_set_state() - Change flow's state
+ * @f: Flow changing state
+ * @state: New state
+ */
+static void flow_set_state(struct flow_common *f, enum flow_state state)
+{
+ uint8_t oldstate = f->state;
+
+ ASSERT(state < FLOW_NUM_STATES);
+ ASSERT(oldstate < FLOW_NUM_STATES);
+
+ f->state = state;
+ flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
+ FLOW_STATE(f));
+
+ flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
}
/**
@@ -347,7 +361,7 @@ static void flow_initiate_(union flow *flow, uint8_t pif)
* flow_initiate_af() - Move flow to INI, setting INISIDE details
* @flow: Flow to change state
* @pif: pif of the initiating side
- * @af: Address family of @eaddr and @faddr
+ * @af: Address family of @saddr and @daddr
* @saddr: Source address (pointer to in_addr or in6_addr)
* @sport: Endpoint port
* @daddr: Destination address (pointer to in_addr or in6_addr)
@@ -384,10 +398,10 @@ const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
if (inany_v4(&ini->eaddr))
- ini->faddr = inany_any4;
+ ini->oaddr = inany_any4;
else
- ini->faddr = inany_any6;
- ini->fport = dport;
+ ini->oaddr = inany_any6;
+ ini->oport = dport;
flow_initiate_(flow, pif);
return ini;
}
@@ -432,8 +446,8 @@ const struct flowside *flow_target(const struct ctx *c, union flow *flow,
pif_name(f->pif[INISIDE]),
inany_ntop(&ini->eaddr, estr, sizeof(estr)),
ini->eport,
- inany_ntop(&ini->faddr, fstr, sizeof(fstr)),
- ini->fport);
+ inany_ntop(&ini->oaddr, fstr, sizeof(fstr)),
+ ini->oport);
}
if (tgtpif == PIF_NONE)
@@ -561,18 +575,12 @@ static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
{
struct siphash_state state = SIPHASH_INIT(c->hash_secret);
- /* For the hash table to work, we need complete endpoint information,
- * and at least a forwarding port.
- */
- ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
- side->eport != 0 && side->fport != 0);
-
- inany_siphash_feed(&state, &side->faddr);
+ inany_siphash_feed(&state, &side->oaddr);
inany_siphash_feed(&state, &side->eaddr);
return siphash_final(&state, 38, (uint64_t)proto << 40 |
(uint64_t)pif << 32 |
- (uint64_t)side->fport << 16 |
+ (uint64_t)side->oport << 16 |
(uint64_t)side->eport);
}
@@ -586,8 +594,16 @@ static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
{
const struct flow_common *f = &flow_at_sidx(sidx)->f;
- return flow_hash(c, FLOW_PROTO(f),
- f->pif[sidx.sidei], &f->side[sidx.sidei]);
+ const struct flowside *side = &f->side[sidx.sidei];
+ uint8_t pif = f->pif[sidx.sidei];
+
+ /* For the hash table to work, entries must have complete endpoint
+ * information, and at least our own port (oport).
+ */
+ ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
+ side->eport != 0 && side->oport != 0);
+
+ return flow_hash(c, FLOW_PROTO(f), pif, side);
}
/**
@@ -695,7 +711,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
!(FLOW_PROTO(&flow->f) == proto &&
flow->f.pif[sidx.sidei] == pif &&
flowside_eq(&flow->f.side[sidx.sidei], side)))
- b = (b + 1) % FLOW_HASH_SIZE;
+ b = mod_sub(b, 1, FLOW_HASH_SIZE);
return flow_hashtab[b];
}
@@ -707,20 +723,20 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
* @pif: Interface of the flow
* @af: Address family, AF_INET or AF_INET6
* @eaddr: Guest side endpoint address (guest local address)
- * @faddr: Guest side forwarding address (guest remote address)
+ * @oaddr: Our guest side address (guest remote address)
* @eport: Guest side endpoint port (guest local port)
- * @fport: Guest side forwarding port (guest remote port)
+ * @oport: Our guest side port (guest remote port)
*
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
*/
flow_sidx_t flow_lookup_af(const struct ctx *c,
uint8_t proto, uint8_t pif, sa_family_t af,
- const void *eaddr, const void *faddr,
- in_port_t eport, in_port_t fport)
+ const void *eaddr, const void *oaddr,
+ in_port_t eport, in_port_t oport)
{
struct flowside side;
- flowside_from_af(&side, af, eaddr, eport, faddr, fport);
+ flowside_from_af(&side, af, eaddr, eport, oaddr, oport);
return flowside_lookup(c, proto, pif, &side);
}
@@ -730,22 +746,22 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
* @proto: Protocol of the flow (IP L4 protocol number)
* @pif: Interface of the flow
* @esa: Socket address of the endpoint
- * @fport: Forwarding port number
+ * @oport: Our port number
*
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
*/
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
- const void *esa, in_port_t fport)
+ const void *esa, in_port_t oport)
{
struct flowside side = {
- .fport = fport,
+ .oport = oport,
};
inany_from_sockaddr(&side.eaddr, &side.eport, esa);
if (inany_v4(&side.eaddr))
- side.faddr = inany_any4;
+ side.oaddr = inany_any4;
else
- side.faddr = inany_any6;
+ side.oaddr = inany_any6;
return flowside_lookup(c, proto, pif, &side);
}
@@ -830,7 +846,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
closed = icmp_ping_timer(c, &flow->ping, now);
break;
case FLOW_UDP:
- if (timer)
+ closed = udp_flow_defer(&flow->udp);
+ if (!closed && timer)
closed = udp_flow_timer(c, &flow->udp, now);
break;
default:
diff --git a/flow.h b/flow.h
index 078fd60..24ba3ef 100644
--- a/flow.h
+++ b/flow.h
@@ -140,14 +140,14 @@ extern const uint8_t flow_proto[];
/**
* struct flowside - Address information for one side of a flow
* @eaddr: Endpoint address (remote address from passt's PoV)
- * @faddr: Forwarding address (local address from passt's PoV)
+ * @oaddr: Our address (local address from passt's PoV)
* @eport: Endpoint port
- * @fport: Forwarding port
+ * @oport: Our port
*/
struct flowside {
- union inany_addr faddr;
+ union inany_addr oaddr;
union inany_addr eaddr;
- in_port_t fport;
+ in_port_t oport;
in_port_t eport;
};
@@ -162,8 +162,8 @@ static inline bool flowside_eq(const struct flowside *left,
{
return inany_equals(&left->eaddr, &right->eaddr) &&
left->eport == right->eport &&
- inany_equals(&left->faddr, &right->faddr) &&
- left->fport == right->fport;
+ inany_equals(&left->oaddr, &right->oaddr) &&
+ left->oport == right->oport;
}
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
@@ -240,10 +240,10 @@ uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx);
void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx);
flow_sidx_t flow_lookup_af(const struct ctx *c,
uint8_t proto, uint8_t pif, sa_family_t af,
- const void *eaddr, const void *faddr,
- in_port_t eport, in_port_t fport);
+ const void *eaddr, const void *oaddr,
+ in_port_t eport, in_port_t oport);
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
- const void *esa, in_port_t fport);
+ const void *esa, in_port_t oport);
union flow;
@@ -264,4 +264,11 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
flow_dbg((f), __VA_ARGS__); \
} while (0)
+void flow_log_details_(const struct flow_common *f, int pri,
+ enum flow_state state);
+#define flow_log_details(f_, pri) \
+ flow_log_details_(&((f_)->f), (pri), (f_)->f.state)
+#define flow_dbg_details(f_) flow_log_details((f_), LOG_DEBUG)
+#define flow_err_details(f_) flow_log_details((f_), LOG_ERR)
+
#endif /* FLOW_H */
diff --git a/fwd.c b/fwd.c
index dea36f6..a505098 100644
--- a/fwd.c
+++ b/fwd.c
@@ -27,6 +27,80 @@
#include "lineread.h"
#include "flow_table.h"
+/* Ephemeral port range: values from RFC 6335 */
+static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
+static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
+
+#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
+
+/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
+ *
+ * Work out what ports the host thinks are ephemeral and record it for later
+ * use by fwd_port_is_ephemeral(). If we're unable to probe, assume the range
+ * recommended by RFC 6335.
+ */
+void fwd_probe_ephemeral(void)
+{
+ char *line, *tab, *end;
+ struct lineread lr;
+ long min, max;
+ ssize_t len;
+ int fd;
+
+ fd = open(PORT_RANGE_SYSCTL, O_RDONLY | O_CLOEXEC);
+ if (fd < 0) {
+ warn_perror("Unable to open %s", PORT_RANGE_SYSCTL);
+ return;
+ }
+
+ lineread_init(&lr, fd);
+ len = lineread_get(&lr, &line);
+ close(fd);
+
+ if (len < 0)
+ goto parse_err;
+
+ tab = strchr(line, '\t');
+ if (!tab)
+ goto parse_err;
+ *tab = '\0';
+
+ errno = 0;
+ min = strtol(line, &end, 10);
+ if (*end || errno)
+ goto parse_err;
+
+ errno = 0;
+ max = strtol(tab + 1, &end, 10);
+ if (*end || errno)
+ goto parse_err;
+
+ if (min < 0 || min >= NUM_PORTS ||
+ max < 0 || max >= NUM_PORTS)
+ goto parse_err;
+
+ fwd_ephemeral_min = min;
+ fwd_ephemeral_max = max;
+
+ return;
+
+parse_err:
+ warn("Unable to parse %s", PORT_RANGE_SYSCTL);
+}
+
+/**
+ * fwd_port_is_ephemeral() - Is port number ephemeral?
+ * @port: Port number
+ *
+ * Return: true if @port is ephemeral, that is may be allocated by the kernel as
+ * a local port for outgoing connections or datagrams, but should not be
+ * used for binding services to.
+ */
+bool fwd_port_is_ephemeral(in_port_t port)
+{
+ return (port >= fwd_ephemeral_min) && (port <= fwd_ephemeral_max);
+}
+
/* See enum in kernel's include/net/tcp_states.h */
#define UDP_LISTEN 0x07
#define TCP_LISTEN 0x0a
@@ -167,7 +241,86 @@ void fwd_scan_ports_init(struct ctx *c)
static bool is_dns_flow(uint8_t proto, const struct flowside *ini)
{
return ((proto == IPPROTO_UDP) || (proto == IPPROTO_TCP)) &&
- ((ini->fport == 53) || (ini->fport == 853));
+ ((ini->oport == 53) || (ini->oport == 853));
+}
+
+/**
+ * fwd_guest_accessible4() - Is IPv4 address guest-accessible
+ * @c: Execution context
+ * @addr: Host visible IPv4 address
+ *
+ * Return: true if @addr on the host is accessible to the guest without
+ * translation, false otherwise
+ */
+static bool fwd_guest_accessible4(const struct ctx *c,
+ const struct in_addr *addr)
+{
+ if (IN4_IS_ADDR_LOOPBACK(addr))
+ return false;
+
+ /* In socket interfaces 0.0.0.0 generally means "any" or unspecified,
+ * however on the wire it can mean "this host on this network". Since
+ * that has a different meaning for host and guest, we can't let it
+ * through untranslated.
+ */
+ if (IN4_IS_ADDR_UNSPECIFIED(addr))
+ return false;
+
+ /* For IPv4, addr_seen is initialised to addr, so is always a valid
+ * address
+ */
+ if (IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr) ||
+ IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr_seen))
+ return false;
+
+ return true;
+}
+
+/**
+ * fwd_guest_accessible6() - Is IPv6 address guest-accessible
+ * @c: Execution context
+ * @addr: Host visible IPv6 address
+ *
+ * Return: true if @addr on the host is accessible to the guest without
+ * translation, false otherwise
+ */
+static bool fwd_guest_accessible6(const struct ctx *c,
+ const struct in6_addr *addr)
+{
+ if (IN6_IS_ADDR_LOOPBACK(addr))
+ return false;
+
+ if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr))
+ return false;
+
+ /* For IPv6, addr_seen starts unspecified, because we don't know what LL
+ * address the guest will take until we see it. Only check against it
+ * if it has been set to a real address.
+ */
+ if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen) &&
+ IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr_seen))
+ return false;
+
+ return true;
+}
+
+/**
+ * fwd_guest_accessible() - Is IPv[46] address guest-accessible
+ * @c: Execution context
+ * @addr: Host visible IPv[46] address
+ *
+ * Return: true if @addr on the host is accessible to the guest without
+ * translation, false otherwise
+ */
+static bool fwd_guest_accessible(const struct ctx *c,
+ const union inany_addr *addr)
+{
+ const struct in_addr *a4 = inany_v4(addr);
+
+ if (a4)
+ return fwd_guest_accessible4(c, a4);
+
+ return fwd_guest_accessible6(c, &addr->a6);
}
/**
@@ -184,33 +337,37 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt)
{
if (is_dns_flow(proto, ini) &&
- inany_equals4(&ini->faddr, &c->ip4.dns_match))
+ inany_equals4(&ini->oaddr, &c->ip4.dns_match))
tgt->eaddr = inany_from_v4(c->ip4.dns_host);
else if (is_dns_flow(proto, ini) &&
- inany_equals6(&ini->faddr, &c->ip6.dns_match))
+ inany_equals6(&ini->oaddr, &c->ip6.dns_match))
tgt->eaddr.a6 = c->ip6.dns_host;
- else if (!c->no_map_gw && inany_equals4(&ini->faddr, &c->ip4.gw))
+ else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
tgt->eaddr = inany_loopback4;
- else if (!c->no_map_gw && inany_equals6(&ini->faddr, &c->ip6.gw))
+ else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
tgt->eaddr = inany_loopback6;
+ else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
+ tgt->eaddr = inany_from_v4(c->ip4.addr);
+ else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
+ tgt->eaddr.a6 = c->ip6.addr;
else
- tgt->eaddr = ini->faddr;
+ tgt->eaddr = ini->oaddr;
- tgt->eport = ini->fport;
+ tgt->eport = ini->oport;
/* The relevant addr_out controls the host side source address. This
* may be unspecified, which allows the kernel to pick an address.
*/
if (inany_v4(&tgt->eaddr))
- tgt->faddr = inany_from_v4(c->ip4.addr_out);
+ tgt->oaddr = inany_from_v4(c->ip4.addr_out);
else
- tgt->faddr.a6 = c->ip6.addr_out;
+ tgt->oaddr.a6 = c->ip6.addr_out;
/* Let the kernel pick a host side source port */
- tgt->fport = 0;
+ tgt->oport = 0;
if (proto == IPPROTO_UDP) {
/* But for UDP we preserve the source port */
- tgt->fport = ini->eport;
+ tgt->oport = ini->eport;
}
return PIF_HOST;
@@ -230,13 +387,13 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt)
{
if (!inany_is_loopback(&ini->eaddr) ||
- (!inany_is_loopback(&ini->faddr) && !inany_is_unspecified(&ini->faddr))) {
+ (!inany_is_loopback(&ini->oaddr) && !inany_is_unspecified(&ini->oaddr))) {
char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
debug("Non loopback address on %s: [%s]:%hu -> [%s]:%hu",
pif_name(PIF_SPLICE),
inany_ntop(&ini->eaddr, estr, sizeof(estr)), ini->eport,
- inany_ntop(&ini->faddr, fstr, sizeof(fstr)), ini->fport);
+ inany_ntop(&ini->oaddr, fstr, sizeof(fstr)), ini->oport);
return PIF_NONE;
}
@@ -248,20 +405,20 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
/* Preserve the specific loopback adddress used, but let the kernel pick
* a source port on the target side
*/
- tgt->faddr = ini->eaddr;
- tgt->fport = 0;
+ tgt->oaddr = ini->eaddr;
+ tgt->oport = 0;
- tgt->eport = ini->fport;
+ tgt->eport = ini->oport;
if (proto == IPPROTO_TCP)
tgt->eport += c->tcp.fwd_out.delta[tgt->eport];
else if (proto == IPPROTO_UDP)
tgt->eport += c->udp.fwd_out.delta[tgt->eport];
/* Let the kernel pick a host side source port */
- tgt->fport = 0;
+ tgt->oport = 0;
if (proto == IPPROTO_UDP)
/* But for UDP preserve the source port */
- tgt->fport = ini->eport;
+ tgt->oport = ini->eport;
return PIF_HOST;
}
@@ -280,7 +437,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt)
{
/* Common for spliced and non-spliced cases */
- tgt->eport = ini->fport;
+ tgt->eport = ini->oport;
if (proto == IPPROTO_TCP)
tgt->eport += c->tcp.fwd_in.delta[tgt->eport];
else if (proto == IPPROTO_UDP)
@@ -293,11 +450,11 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
/* Preserve the specific loopback adddress used, but let the
* kernel pick a source port on the target side
*/
- tgt->faddr = ini->eaddr;
- tgt->fport = 0;
+ tgt->oaddr = ini->eaddr;
+ tgt->oport = 0;
if (proto == IPPROTO_UDP)
/* But for UDP preserve the source port */
- tgt->fport = ini->eport;
+ tgt->oport = ini->eport;
if (inany_v4(&ini->eaddr))
tgt->eaddr = inany_loopback4;
@@ -307,26 +464,37 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
return PIF_SPLICE;
}
- tgt->faddr = ini->eaddr;
- tgt->fport = ini->eport;
-
- if (inany_is_loopback4(&tgt->faddr) ||
- inany_is_unspecified4(&tgt->faddr) ||
- inany_equals4(&tgt->faddr, &c->ip4.addr_seen)) {
- tgt->faddr = inany_from_v4(c->ip4.gw);
- } else if (inany_is_loopback6(&tgt->faddr) ||
- inany_equals6(&tgt->faddr, &c->ip6.addr_seen) ||
- inany_equals6(&tgt->faddr, &c->ip6.addr)) {
- if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
- tgt->faddr.a6 = c->ip6.gw;
- else
- tgt->faddr.a6 = c->ip6.addr_ll;
+ if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
+ inany_equals4(&ini->eaddr, &in4addr_loopback)) {
+ /* Specifically 127.0.0.1, not 127.0.0.0/8 */
+ tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
+ inany_equals6(&ini->eaddr, &in6addr_loopback)) {
+ tgt->oaddr.a6 = c->ip6.map_host_loopback;
+ } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
+ inany_equals4(&ini->eaddr, &c->ip4.addr)) {
+ tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
+ inany_equals6(&ini->eaddr, &c->ip6.addr)) {
+ tgt->oaddr.a6 = c->ip6.map_guest_addr;
+ } else if (!fwd_guest_accessible(c, &ini->eaddr)) {
+ if (inany_v4(&ini->eaddr)) {
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
+ /* No source address we can use */
+ return PIF_NONE;
+ tgt->oaddr = inany_from_v4(c->ip4.our_tap_addr);
+ } else {
+ tgt->oaddr.a6 = c->ip6.our_tap_ll;
+ }
+ } else {
+ tgt->oaddr = ini->eaddr;
}
+ tgt->oport = ini->eport;
- if (inany_v4(&tgt->faddr)) {
+ if (inany_v4(&tgt->oaddr)) {
tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
} else {
- if (inany_is_linklocal6(&tgt->faddr))
+ if (inany_is_linklocal6(&tgt->oaddr))
tgt->eaddr.a6 = c->ip6.addr_ll_seen;
else
tgt->eaddr.a6 = c->ip6.addr_seen;
diff --git a/fwd.h b/fwd.h
index b4aa8d5..3562f3c 100644
--- a/fwd.h
+++ b/fwd.h
@@ -12,6 +12,9 @@ struct flowside;
/* Number of ports for both TCP and UDP */
#define NUM_PORTS (1U << 16)
+void fwd_probe_ephemeral(void);
+bool fwd_port_is_ephemeral(in_port_t port);
+
enum fwd_ports_mode {
FWD_UNSET = 0,
FWD_SPEC = 1,
diff --git a/icmp.c b/icmp.c
index cb81c76..f514dbc 100644
--- a/icmp.c
+++ b/icmp.c
@@ -125,13 +125,13 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
ini->eport, seq);
if (pingf->f.type == FLOW_PING4) {
- const struct in_addr *saddr = inany_v4(&ini->faddr);
+ const struct in_addr *saddr = inany_v4(&ini->oaddr);
const struct in_addr *daddr = inany_v4(&ini->eaddr);
ASSERT(saddr && daddr); /* Must have IPv4 addresses */
tap_icmp4_send(c, *saddr, *daddr, buf, n);
} else if (pingf->f.type == FLOW_PING6) {
- const struct in6_addr *saddr = &ini->faddr.a6;
+ const struct in6_addr *saddr = &ini->oaddr.a6;
const struct in6_addr *daddr = &ini->eaddr.a6;
tap_icmp6_send(c, saddr, daddr, buf, n);
diff --git a/log.c b/log.c
index e7202d0..a61468e 100644
--- a/log.c
+++ b/log.c
@@ -98,7 +98,7 @@ const char *logfile_prefix[] = {
* @fd: Log file descriptor
* @now: Current timestamp
*
- * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek
+ * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek i686:_llseek
*/
static void logfile_rotate_fallocate(int fd, const struct timespec *now)
{
@@ -224,19 +224,23 @@ static int logfile_rotate(int fd, const struct timespec *now)
/**
* logfile_write() - Write entry to log file, trigger rotation if full
* @newline: Append newline at the end of the message, if missing
+ * @cont: Continuation of a previous message, on the same line
* @pri: Facility and level map, same as priority for vsyslog()
* @now: Timestamp
* @format: Same as vsyslog() format
* @ap: Same as vsyslog() ap
*/
-static void logfile_write(bool newline, int pri, const struct timespec *now,
+static void logfile_write(bool newline, bool cont, int pri,
+ const struct timespec *now,
const char *format, va_list ap)
{
char buf[BUFSIZ];
- int n;
+ int n = 0;
- n = logtime_fmt(buf, BUFSIZ, now);
- n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]);
+ if (!cont) {
+ n += logtime_fmt(buf, BUFSIZ, now);
+ n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]);
+ }
n += vsnprintf(buf + n, BUFSIZ - n, format, ap);
@@ -278,7 +282,7 @@ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
va_copy(ap2, ap); /* Don't clobber ap, we need it again */
if (log_file != -1)
- logfile_write(newline, pri, now, format, ap2);
+ logfile_write(newline, cont, pri, now, format, ap2);
else if (!(log_mask & LOG_MASK(LOG_DEBUG)))
passt_vsyslog(newline, pri, format, ap2);
diff --git a/ndp.c b/ndp.c
index 6dcb487..a1ee834 100644
--- a/ndp.c
+++ b/ndp.c
@@ -247,7 +247,7 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
memcpy(&na.target_addr, &ns->target_addr,
sizeof(na.target_addr));
- memcpy(na.target_l2_addr.mac, c->mac, ETH_ALEN);
+ memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
} else if (ih->icmp6_type == RS) {
size_t dns_s_len = 0;
@@ -331,7 +331,7 @@ int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr,
}
dns_done:
- memcpy(&ra.source_ll.mac, c->mac, ETH_ALEN);
+ memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
} else {
return 1;
}
@@ -341,10 +341,7 @@ dns_done:
else
c->ip6.addr_seen = *saddr;
- if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
- rsaddr = &c->ip6.gw;
- else
- rsaddr = &c->ip6.addr_ll;
+ rsaddr = &c->ip6.our_tap_ll;
if (ih->icmp6_type == NS) {
dlen = sizeof(struct ndp_na);
diff --git a/netlink.c b/netlink.c
index 093de26..0bdbabf 100644
--- a/netlink.c
+++ b/netlink.c
@@ -674,6 +674,63 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
}
/**
+ * nl_addr_set_ll_nodad() - Set IFA_F_NODAD on IPv6 link-local addresses
+ * @s: Netlink socket
+ * @ifi: Interface index in target namespace
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int nl_addr_set_ll_nodad(int s, unsigned int ifi)
+{
+ struct req_t {
+ struct nlmsghdr nlh;
+ struct ifaddrmsg ifa;
+ } req = {
+ .ifa.ifa_family = AF_INET6,
+ .ifa.ifa_index = ifi,
+ };
+ uint32_t seq, last_seq = 0;
+ ssize_t status, ret = 0;
+ struct nlmsghdr *nh;
+ char buf[NLBUFSIZ];
+
+ seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req));
+ nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) {
+ struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
+ struct rtattr *rta;
+ size_t na;
+
+ if (ifa->ifa_index != ifi || ifa->ifa_scope != RT_SCOPE_LINK)
+ continue;
+
+ ifa->ifa_flags |= IFA_F_NODAD;
+
+ for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
+ rta = RTA_NEXT(rta, na)) {
+ /* If 32-bit flags are used, add IFA_F_NODAD there */
+ if (rta->rta_type == IFA_FLAGS)
+ *(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD;
+ }
+
+ last_seq = nl_send(s, nh, RTM_NEWADDR, NLM_F_REPLACE,
+ nh->nlmsg_len);
+ }
+
+ if (status < 0)
+ ret = status;
+
+ for (seq = seq + 1; seq <= last_seq; seq++) {
+ nl_foreach(nh, status, s, buf, seq)
+ warn("netlink: Unexpected response message");
+
+ if (!ret && status < 0)
+ ret = status;
+ }
+
+ return ret;
+}
+
+/**
* nl_addr_get() - Get most specific global address, given interface and family
* @s: Netlink socket
* @ifi: Interface index in outer network namespace
@@ -682,7 +739,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
* @prefix_len: Mask or prefix length, to fill (for IPv4)
* @addr_l: Link-scoped address to fill (for IPv6)
*
- * Return: 9 on success, negative error code on failure
+ * Return: 0 on success, negative error code on failure
*/
int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
void *addr, int *prefix_len, void *addr_l)
@@ -740,7 +797,54 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
}
/**
- * nl_add_set() - Set IP addresses for given interface and address family
+ * nl_addr_get_ll() - Get first IPv6 link-local address for a given interface
+ * @s: Netlink socket
+ * @ifi: Interface index in outer network namespace
+ * @addr: Link-local address to fill
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr)
+{
+ struct req_t {
+ struct nlmsghdr nlh;
+ struct ifaddrmsg ifa;
+ } req = {
+ .ifa.ifa_family = AF_INET6,
+ .ifa.ifa_index = ifi,
+ };
+ struct nlmsghdr *nh;
+ bool found = false;
+ char buf[NLBUFSIZ];
+ ssize_t status;
+ uint32_t seq;
+
+ seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req));
+ nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) {
+ struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh);
+ struct rtattr *rta;
+ size_t na;
+
+ if (ifa->ifa_index != ifi || ifa->ifa_scope != RT_SCOPE_LINK ||
+ found)
+ continue;
+
+ for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na);
+ rta = RTA_NEXT(rta, na)) {
+ if (rta->rta_type != IFA_ADDRESS)
+ continue;
+
+ if (!found) {
+ memcpy(addr, RTA_DATA(rta), RTA_PAYLOAD(rta));
+ found = true;
+ }
+ }
+ }
+ return status;
+}
+
+/**
+ * nl_addr_set() - Set IP addresses for given interface and address family
* @s: Netlink socket
* @ifi: Interface index
* @af: Address family
@@ -942,14 +1046,14 @@ int nl_link_set_mac(int s, unsigned int ifi, const void *mac)
}
/**
- * nl_link_up() - Bring link up
+ * nl_link_set_mtu() - Set link MTU
* @s: Netlink socket
* @ifi: Interface index
- * @mtu: If non-zero, set interface MTU
+ * @mtu: Interface MTU
*
* Return: 0 on success, negative error code on failure
*/
-int nl_link_up(int s, unsigned int ifi, int mtu)
+int nl_link_set_mtu(int s, unsigned int ifi, int mtu)
{
struct req_t {
struct nlmsghdr nlh;
@@ -959,17 +1063,35 @@ int nl_link_up(int s, unsigned int ifi, int mtu)
} req = {
.ifm.ifi_family = AF_UNSPEC,
.ifm.ifi_index = ifi,
- .ifm.ifi_flags = IFF_UP,
- .ifm.ifi_change = IFF_UP,
.rta.rta_type = IFLA_MTU,
.rta.rta_len = RTA_LENGTH(sizeof(unsigned int)),
.mtu = mtu,
};
- ssize_t len = sizeof(req);
- if (!mtu)
- /* Shorten request to drop MTU attribute */
- len = offsetof(struct req_t, rta);
+ return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
+}
+
+/**
+ * nl_link_set_flags() - Set link flags
+ * @s: Netlink socket
+ * @ifi: Interface index
+ * @set: Device flags to set
+ * @change: Mask of device flag changes
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int nl_link_set_flags(int s, unsigned int ifi,
+ unsigned int set, unsigned int change)
+{
+ struct req_t {
+ struct nlmsghdr nlh;
+ struct ifinfomsg ifm;
+ } req = {
+ .ifm.ifi_family = AF_UNSPEC,
+ .ifm.ifi_index = ifi,
+ .ifm.ifi_flags = set,
+ .ifm.ifi_change = change,
+ };
- return nl_do(s, &req, RTM_NEWLINK, 0, len);
+ return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
}
diff --git a/netlink.h b/netlink.h
index 3a1f0de..b51e99c 100644
--- a/netlink.h
+++ b/netlink.h
@@ -19,10 +19,14 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
void *addr, int *prefix_len, void *addr_l);
int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
const void *addr, int prefix_len);
+int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
+int nl_addr_set_ll_nodad(int s, unsigned int ifi);
int nl_addr_dup(int s_src, unsigned int ifi_src,
int s_dst, unsigned int ifi_dst, sa_family_t af);
int nl_link_get_mac(int s, unsigned int ifi, void *mac);
int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
-int nl_link_up(int s, unsigned int ifi, int mtu);
+int nl_link_set_mtu(int s, unsigned int ifi, int mtu);
+int nl_link_set_flags(int s, unsigned int ifi,
+ unsigned int set, unsigned int change);
#endif /* NETLINK_H */
diff --git a/passt.1 b/passt.1
index 3062b71..79d134d 100644
--- a/passt.1
+++ b/passt.1
@@ -236,11 +236,15 @@ interface will be chosen instead.
.TP
.BR \-D ", " \-\-dns " " \fIaddr
-Use \fIaddr\fR (IPv4 or IPv6) for DHCP, DHCPv6, NDP or DNS forwarding, as
-configured (see options \fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR,
-\fB--dns-forward\fR) instead of reading addresses from \fI/etc/resolv.conf\fR.
-This option can be specified multiple times. Specifying \fB-D none\fR disables
-usage of DNS addresses altogether.
+Instruct the guest (via DHCP, DHCPv6 or NDP) to use \fIaddr\fR (IPv4
+or IPv6) as a nameserver, as configured (see options
+\fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR) instead of reading addresses
+from \fI/etc/resolv.conf\fR. This option can be specified multiple
+times. Specifying \fB-D none\fR disables usage of DNS addresses
+altogether. Unlike addresses from \fI/etc/resolv.conf\fR, \fIaddr\fR
+is given to the guest without remapping. For example \fB--dns
+127.0.0.1\fR will instruct the guest to use itself as nameserver, not
+the host.
.TP
.BR \-\-dns-forward " " \fIaddr
@@ -324,6 +328,20 @@ Disable Router Advertisements. Router Solicitations coming from guest or target
namespace will be ignored.
.TP
+.BR \-\-map-host-loopback " " \fIaddr
+Translate \fIaddr\fR to refer to the host. Packets from the guest to
+\fIaddr\fR will be redirected to the host. On the host such packets
+will appear to have both source and destination of 127.0.0.1 or ::1.
+
+If \fIaddr\fR is 'none', no address is mapped (this implies
+\fB--no-map-gw\fR). Only one IPv4 and one IPv6 address can be
+translated; if the option is specified multiple times, the last one
+for each address type takes effect.
+
+Default is to translate the guest's default gateway address, unless
+\fB--no-map-gw\fR is given, in which case no address is mapped.
+
+.TP
.BR \-\-no-map-gw
Don't remap TCP connections and untracked UDP traffic, with the gateway address
as destination, to the host. Implied if there is no gateway on the selected
@@ -331,6 +349,21 @@ default route, or if there is no default route, for any of the enabled address
families.
.TP
+.BR \-\-map-guest-addr " " \fIaddr
+Translate \fIaddr\fR in the guest to be equal to the guest's assigned
+address on the host. That is, packets from the guest to \fIaddr\fR
+will be redirected to the address assigned to the guest with \fB-a\fR,
+or by default the host's global address. This allows the guest to
+access services available on the host's global address, even though its
+own address shadows that of the host.
+
+If \fIaddr\fR is 'none', no address is mapped. Only one IPv4 and one
+IPv6 address can be translated, and if the option is specified
+multiple times, the last one for each address type takes effect.
+
+Default is no mapping.
+
+.TP
.BR \-4 ", " \-\-ipv4-only
Enable IPv4-only operation. IPv6 traffic will be ignored.
By default, IPv6 operation is enabled as long as at least an IPv6 route and an
diff --git a/passt.c b/passt.c
index 4b3c306..ad6f0bc 100644
--- a/passt.c
+++ b/passt.c
@@ -191,11 +191,11 @@ void exit_handler(int signal)
* Return: non-zero on failure
*
* #syscalls read write writev
- * #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close
- * #syscalls recvfrom sendto shutdown
+ * #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close
+ * #syscalls bind connect recvfrom sendto shutdown
* #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
* #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
- * #syscalls clock_gettime arm:clock_gettime64
+ * #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64
*/
int main(int argc, char **argv)
{
@@ -272,7 +272,7 @@ int main(int argc, char **argv)
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
exit(EXIT_FAILURE);
- proto_update_l2_buf(c.mac_guest, c.mac);
+ proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
if (c.ifi4 && !c.no_dhcp)
dhcp_init();
diff --git a/passt.h b/passt.h
index ef68403..031c9b6 100644
--- a/passt.h
+++ b/passt.h
@@ -26,6 +26,13 @@ union epoll_ref;
#include "tcp.h"
#include "udp.h"
+/* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0
+ * (unicast) and bit 1 of byte 0 must be 1 (locally administered). Otherwise
+ * it's arbitrary.
+ */
+#define MAC_OUR_LAA \
+ ((uint8_t [ETH_ALEN]){0x9a, 0x55, 0x9a, 0x55, 0x9a, 0x55})
+
/**
* union epoll_ref - Breakdown of reference for epoll fd bookkeeping
* @type: Type of fd (tells us what to do with events)
@@ -94,9 +101,14 @@ enum passt_modes {
* @addr: IPv4 address assigned to guest
* @addr_seen: Latest IPv4 address seen as source from tap
* @prefixlen: IPv4 prefix length (netmask)
- * @gw: Default IPv4 gateway
+ * @guest_gw: IPv4 gateway as seen by the guest
+ * @map_host_loopback: Outbound connections to this address are NATted to the
+ * host's 127.0.0.1
+ * @map_guest_addr: Outbound connections to this address are NATted to the
+ * guest's assigned address
* @dns: DNS addresses for DHCP, zero-terminated
* @dns_match: Forward DNS query if sent to this address
+ * @our_tap_addr: IPv4 address for passt's use on tap
* @dns_host: Use this DNS on the host for forwarding
* @addr_out: Optional source address for outbound traffic
* @ifname_out: Optional interface name to bind outbound sockets to
@@ -104,15 +116,21 @@ enum passt_modes {
* @no_copy_addrs: Don't copy all addresses when configuring namespace
*/
struct ip4_ctx {
+ /* PIF_TAP addresses */
struct in_addr addr;
struct in_addr addr_seen;
int prefix_len;
- struct in_addr gw;
+ struct in_addr guest_gw;
+ struct in_addr map_host_loopback;
+ struct in_addr map_guest_addr;
struct in_addr dns[MAXNS + 1];
struct in_addr dns_match;
- struct in_addr dns_host;
+ struct in_addr our_tap_addr;
+ /* PIF_HOST addresses */
+ struct in_addr dns_host;
struct in_addr addr_out;
+
char ifname_out[IFNAMSIZ];
bool no_copy_routes;
@@ -122,12 +140,16 @@ struct ip4_ctx {
/**
* struct ip6_ctx - IPv6 execution context
* @addr: IPv6 address assigned to guest
- * @addr_ll: Link-local IPv6 address on external, routable interface
* @addr_seen: Latest IPv6 global/site address seen as source from tap
* @addr_ll_seen: Latest IPv6 link-local address seen as source from tap
- * @gw: Default IPv6 gateway
+ * @guest_gw: IPv6 gateway as seen by the guest
+ * @map_host_loopback: Outbound connections to this address are NATted to the
+ * host's [::1]
+ * @map_guest_addr: Outbound connections to this address are NATted to the
+ * guest's assigned address
* @dns: DNS addresses for DHCPv6 and NDP, zero-terminated
* @dns_match: Forward DNS query if sent to this address
+ * @our_tap_ll: Link-local IPv6 address for passt's use on tap
* @dns_host: Use this DNS on the host for forwarding
* @addr_out: Optional source address for outbound traffic
* @ifname_out: Optional interface name to bind outbound sockets to
@@ -135,16 +157,21 @@ struct ip4_ctx {
* @no_copy_addrs: Don't copy all addresses when configuring namespace
*/
struct ip6_ctx {
+ /* PIF_TAP addresses */
struct in6_addr addr;
- struct in6_addr addr_ll;
struct in6_addr addr_seen;
struct in6_addr addr_ll_seen;
- struct in6_addr gw;
+ struct in6_addr guest_gw;
+ struct in6_addr map_host_loopback;
+ struct in6_addr map_guest_addr;
struct in6_addr dns[MAXNS + 1];
struct in6_addr dns_match;
- struct in6_addr dns_host;
+ struct in6_addr our_tap_ll;
+ /* PIF_HOST addresses */
+ struct in6_addr dns_host;
struct in6_addr addr_out;
+
char ifname_out[IFNAMSIZ];
bool no_copy_routes;
@@ -172,8 +199,8 @@ struct ip6_ctx {
* @epollfd: File descriptor for epoll instance
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
- * @mac: Host MAC address
- * @mac_guest: MAC address of guest or namespace, seen or configured
+ * @our_tap_mac: Pasta/passt's MAC on the tap link
+ * @guest_mac: MAC address of guest or namespace, seen or configured
* @hash_secret: 128-bit secret for siphash functions
* @ifi4: Index of template interface for IPv4, 0 if IPv4 disabled
* @ip: IPv4 configuration
@@ -198,7 +225,6 @@ struct ip6_ctx {
* @no_dhcpv6: Disable DHCPv6 server
* @no_ndp: Disable NDP handler altogether
* @no_ra: Disable router advertisements
- * @no_map_gw: Don't map connections, untracked UDP to gateway to host
* @low_wmem: Low probed net.core.wmem_max
* @low_rmem: Low probed net.core.rmem_max
*/
@@ -226,8 +252,8 @@ struct ctx {
int epollfd;
int fd_tap_listen;
int fd_tap;
- unsigned char mac[ETH_ALEN];
- unsigned char mac_guest[ETH_ALEN];
+ unsigned char our_tap_mac[ETH_ALEN];
+ unsigned char guest_mac[ETH_ALEN];
uint64_t hash_secret[2];
unsigned int ifi4;
@@ -258,7 +284,6 @@ struct ctx {
int no_dhcpv6;
int no_ndp;
int no_ra;
- int no_map_gw;
int low_wmem;
int low_rmem;
diff --git a/pasta.c b/pasta.c
index 615ff7b..307fb4a 100644
--- a/pasta.c
+++ b/pasta.c
@@ -13,7 +13,7 @@
*
* #syscalls:pasta clone waitid exit exit_group rt_sigprocmask
* #syscalls:pasta rt_sigreturn|sigreturn
- * #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn
+ * #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn i686:sigreturn
*/
#include <sched.h>
@@ -288,22 +288,30 @@ void pasta_ns_conf(struct ctx *c)
{
int rc = 0;
- rc = nl_link_up(nl_sock_ns, 1 /* lo */, 0);
+ rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP);
if (rc < 0)
die("Couldn't bring up loopback interface in namespace: %s",
strerror(-rc));
/* Get or set MAC in target namespace */
- if (MAC_IS_ZERO(c->mac_guest))
- nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest);
+ if (MAC_IS_ZERO(c->guest_mac))
+ nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
else
- rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest);
+ rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac);
if (rc < 0)
die("Couldn't set MAC address in namespace: %s",
strerror(-rc));
if (c->pasta_conf_ns) {
- nl_link_up(nl_sock_ns, c->pasta_ifi, c->mtu);
+ unsigned int flags = IFF_UP;
+
+ if (c->mtu != -1)
+ nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);
+
+ if (c->ifi6) /* Avoid duplicate address detection on link up */
+ flags |= IFF_NOARP;
+
+ nl_link_set_flags(nl_sock_ns, c->pasta_ifi, flags, flags);
if (c->ifi4) {
if (c->ip4.no_copy_addrs) {
@@ -324,7 +332,8 @@ void pasta_ns_conf(struct ctx *c)
if (c->ip4.no_copy_routes) {
rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi,
- AF_INET, &c->ip4.gw);
+ AF_INET,
+ &c->ip4.guest_gw);
} else {
rc = nl_route_dup(nl_sock, c->ifi4, nl_sock_ns,
c->pasta_ifi, AF_INET);
@@ -337,6 +346,23 @@ void pasta_ns_conf(struct ctx *c)
}
if (c->ifi6) {
+ rc = nl_addr_get_ll(nl_sock_ns, c->pasta_ifi,
+ &c->ip6.addr_ll_seen);
+ if (rc < 0) {
+ warn("Can't get LL address from namespace: %s",
+ strerror(-rc));
+ }
+
+ rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi);
+ if (rc < 0) {
+ warn("Can't set nodad for LL in namespace: %s",
+ strerror(-rc));
+ }
+
+ /* We dodged DAD: re-enable neighbour solicitations */
+ nl_link_set_flags(nl_sock_ns, c->pasta_ifi,
+ 0, IFF_NOARP);
+
if (c->ip6.no_copy_addrs) {
rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
AF_INET6, &c->ip6.addr, 64);
@@ -353,7 +379,8 @@ void pasta_ns_conf(struct ctx *c)
if (c->ip6.no_copy_routes) {
rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi,
- AF_INET6, &c->ip6.gw);
+ AF_INET6,
+ &c->ip6.guest_gw);
} else {
rc = nl_route_dup(nl_sock, c->ifi6,
nl_sock_ns, c->pasta_ifi,
@@ -367,7 +394,7 @@ void pasta_ns_conf(struct ctx *c)
}
}
- proto_update_l2_buf(c->mac_guest, NULL);
+ proto_update_l2_buf(c->guest_mac, NULL);
}
/**
@@ -400,12 +427,12 @@ static int pasta_netns_quit_timer(void)
*/
void pasta_netns_quit_init(const struct ctx *c)
{
- union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY };
struct epoll_event ev = { .events = EPOLLIN };
int flags = O_NONBLOCK | O_CLOEXEC;
struct statfs s = { 0 };
bool try_inotify = true;
int fd = -1, dir_fd;
+ union epoll_ref ref;
if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
return;
@@ -436,6 +463,7 @@ void pasta_netns_quit_init(const struct ctx *c)
ref.type = EPOLL_TYPE_NSQUIT_TIMER;
} else {
close(dir_fd);
+ ref.type = EPOLL_TYPE_NSQUIT_INOTIFY;
}
if (fd > FD_REF_MAX)
diff --git a/seccomp.sh b/seccomp.sh
index 052e1c8..38aa826 100755
--- a/seccomp.sh
+++ b/seccomp.sh
@@ -242,7 +242,10 @@ for __p in ${__profiles}; do
__calls="$(sed -n 's/[\t ]*\*[\t ]*#syscalls\(:'"${__p}"'\|\)[\t ]\{1,\}\(.*\)/\2/p' ${IN})"
__calls="${__calls} ${EXTRA_SYSCALLS:-}"
__calls="$(filter ${__calls})"
- echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t
+
+ cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null
+ case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac
+ echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args}
# Pad here to keep gen_profile() "simple"
__count=0
diff --git a/tap.c b/tap.c
index 87be3a6..41af6a6 100644
--- a/tap.c
+++ b/tap.c
@@ -118,8 +118,8 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
struct ethhdr *eh = (struct ethhdr *)buf;
/* TODO: ARP table lookup */
- memcpy(eh->h_dest, c->mac_guest, ETH_ALEN);
- memcpy(eh->h_source, c->mac, ETH_ALEN);
+ memcpy(eh->h_dest, c->guest_mac, ETH_ALEN);
+ memcpy(eh->h_source, c->our_tap_mac, ETH_ALEN);
eh->h_proto = ntohs(proto);
return eh + 1;
}
@@ -946,9 +946,9 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
eh = (struct ethhdr *)p;
- if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
- memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
- proto_update_l2_buf(c->mac_guest, NULL);
+ if (memcmp(c->guest_mac, eh->h_source, ETH_ALEN)) {
+ memcpy(c->guest_mac, eh->h_source, ETH_ALEN);
+ proto_update_l2_buf(c->guest_mac, NULL);
}
switch (ntohs(eh->h_proto)) {
@@ -982,24 +982,17 @@ static void tap_sock_reset(struct ctx *c)
}
/**
- * tap_handler_passt() - Packet handler for AF_UNIX file descriptor
+ * tap_passt_input() - Handler for new data on the socket to qemu
* @c: Execution context
- * @events: epoll events
* @now: Current timestamp
*/
-void tap_handler_passt(struct ctx *c, uint32_t events,
- const struct timespec *now)
+static void tap_passt_input(struct ctx *c, const struct timespec *now)
{
static const char *partial_frame;
static ssize_t partial_len = 0;
ssize_t n;
char *p;
- if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
- tap_sock_reset(c);
- return;
- }
-
tap_flush_pools();
if (partial_len) {
@@ -1010,10 +1003,13 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
memmove(pkt_buf, partial_frame, partial_len);
}
- n = recv(c->fd_tap, pkt_buf + partial_len, TAP_BUF_BYTES - partial_len,
- MSG_DONTWAIT);
+ do {
+ n = recv(c->fd_tap, pkt_buf + partial_len,
+ TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
+ } while ((n < 0) && errno == EINTR);
+
if (n < 0) {
- if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
+ if (errno != EAGAIN && errno != EWOULDBLOCK) {
err_perror("Receive error on guest connection, reset");
tap_sock_reset(c);
}
@@ -1052,54 +1048,76 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
}
/**
- * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor
+ * tap_handler_passt() - Event handler for AF_UNIX file descriptor
* @c: Execution context
* @events: epoll events
* @now: Current timestamp
*/
-void tap_handler_pasta(struct ctx *c, uint32_t events,
+void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now)
{
- ssize_t n, len;
- int ret;
+ if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) {
+ tap_sock_reset(c);
+ return;
+ }
- if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
- die("Disconnect event on /dev/net/tun device, exiting");
+ if (events & EPOLLIN)
+ tap_passt_input(c, now);
+}
-redo:
- n = 0;
+/**
+ * tap_pasta_input() - Handler for new data on the tap device
+ * @c: Execution context
+ * @now: Current timestamp
+ */
+static void tap_pasta_input(struct ctx *c, const struct timespec *now)
+{
+ ssize_t n, len;
tap_flush_pools();
-restart:
- while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
- if (len < (ssize_t)sizeof(struct ethhdr) ||
- len > (ssize_t)ETH_MAX_MTU) {
- n += len;
- continue;
- }
+ for (n = 0; n <= (ssize_t)TAP_BUF_BYTES - ETH_MAX_MTU; n += len) {
+ len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
+ if (len == 0) {
+ die("EOF on tap device, exiting");
+ } else if (len < 0) {
+ if (errno == EINTR) {
+ len = 0;
+ continue;
+ }
- tap_add_packet(c, len, pkt_buf + n);
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ break; /* all done for now */
- if ((n += len) == TAP_BUF_BYTES)
- break;
- }
+ die("Error on tap device, exiting");
+ }
- if (len < 0 && errno == EINTR)
- goto restart;
+ /* Ignore frames of bad length */
+ if (len < (ssize_t)sizeof(struct ethhdr) ||
+ len > (ssize_t)ETH_MAX_MTU)
+ continue;
- ret = errno;
+ tap_add_packet(c, len, pkt_buf + n);
+ }
tap_handler(c, now);
+}
- if (len > 0 || ret == EAGAIN)
- return;
-
- if (n == TAP_BUF_BYTES)
- goto redo;
+/**
+ * tap_handler_pasta() - Event handler for /dev/net/tun file descriptor
+ * @c: Execution context
+ * @events: epoll events
+ * @now: Current timestamp
+ */
+void tap_handler_pasta(struct ctx *c, uint32_t events,
+ const struct timespec *now)
+{
+ if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
+ die("Disconnect event on /dev/net/tun device, exiting");
- die("Error on tap device, exiting");
+ if (events & EPOLLIN)
+ tap_pasta_input(c, now);
}
/**
@@ -1337,6 +1355,6 @@ void tap_sock_init(struct ctx *c)
* sends us packets. Use the broadcast address so that our
* first packets will reach it.
*/
- memset(&c->mac_guest, 0xff, sizeof(c->mac_guest));
+ memset(&c->guest_mac, 0xff, sizeof(c->guest_mac));
}
}
diff --git a/tcp.c b/tcp.c
index c0820ce..f9fe1b9 100644
--- a/tcp.c
+++ b/tcp.c
@@ -361,8 +361,8 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
-/* Table of guest side forwarding addresses with very low RTT (assumed
- * to be local to the host), LRU
+/* Table of our guest side addresses with very low RTT (assumed to be local to
+ * the host), LRU
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
@@ -440,7 +440,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
if (events == TAP_SYN_RCVD)
return EPOLLOUT | EPOLLET | EPOLLRDHUP;
- return EPOLLRDHUP;
+ return EPOLLET | EPOLLRDHUP;
}
/**
@@ -663,7 +663,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
int i;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
- if (inany_equals(&tapside->faddr, low_rtt_dst + i))
+ if (inany_equals(&tapside->oaddr, low_rtt_dst + i))
return 1;
return 0;
@@ -686,7 +686,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
return;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
- if (inany_equals(&tapside->faddr, low_rtt_dst + i))
+ if (inany_equals(&tapside->oaddr, low_rtt_dst + i))
return;
if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
hole = i;
@@ -698,7 +698,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
if (hole == -1)
return;
- low_rtt_dst[hole++] = tapside->faddr;
+ low_rtt_dst[hole++] = tapside->oaddr;
if (hole == LOW_RTT_TABLE_SIZE)
hole = 0;
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
@@ -881,7 +881,7 @@ static void tcp_fill_header(struct tcphdr *th,
{
const struct flowside *tapside = TAPFLOW(conn);
- th->source = htons(tapside->fport);
+ th->source = htons(tapside->oport);
th->dest = htons(tapside->eport);
th->seq = htonl(seq);
th->ack_seq = htonl(conn->seq_ack_to_tap);
@@ -913,7 +913,7 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
uint32_t seq)
{
const struct flowside *tapside = TAPFLOW(conn);
- const struct in_addr *src4 = inany_v4(&tapside->faddr);
+ const struct in_addr *src4 = inany_v4(&tapside->oaddr);
const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
size_t l4len = dlen + sizeof(*th);
size_t l3len = l4len + sizeof(*iph);
@@ -957,7 +957,7 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
size_t l4len = dlen + sizeof(*th);
ip6h->payload_len = htons(l4len);
- ip6h->saddr = tapside->faddr.a6;
+ ip6h->saddr = tapside->oaddr.a6;
ip6h->daddr = tapside->eaddr.a6;
ip6h->hop_limit = 255;
@@ -992,7 +992,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
const uint16_t *check, uint32_t seq)
{
const struct flowside *tapside = TAPFLOW(conn);
- const struct in_addr *a4 = inany_v4(&tapside->faddr);
+ const struct in_addr *a4 = inany_v4(&tapside->oaddr);
if (a4) {
return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
@@ -1417,15 +1417,15 @@ static void tcp_bind_outbound(const struct ctx *c,
socklen_t sl;
- pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->faddr, tgt->fport);
- if (!inany_is_unspecified(&tgt->faddr) || tgt->fport) {
+ pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->oaddr, tgt->oport);
+ if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) {
if (bind(s, &bind_sa.sa, sl)) {
char sstr[INANY_ADDRSTRLEN];
flow_dbg(conn,
"Can't bind TCP outbound socket to %s:%hu: %s",
- inany_ntop(&tgt->faddr, sstr, sizeof(sstr)),
- tgt->fport, strerror(errno));
+ inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
+ tgt->oport, strerror(errno));
}
}
@@ -1497,12 +1497,12 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 ||
- !inany_is_unicast(&ini->faddr) || ini->fport == 0) {
+ !inany_is_unicast(&ini->oaddr) || ini->oport == 0) {
char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN];
debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu",
inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport,
- inany_ntop(&ini->faddr, dstr, sizeof(dstr)), ini->fport);
+ inany_ntop(&ini->oaddr, dstr, sizeof(dstr)), ini->oport);
goto cancel;
}
@@ -2100,7 +2100,8 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
goto cancel;
/* FIXME: When listening port has a specific bound address, record that
- * as the forwarding address */
+ * as our address
+ */
ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
ref.tcp_listen.port);
@@ -2143,7 +2144,7 @@ cancel:
* @c: Execution context
* @ref: epoll reference of timer (not connection)
*
- * #syscalls timerfd_gettime
+ * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
*/
void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
{
diff --git a/tcp_buf.c b/tcp_buf.c
index c31e9f3..1a39846 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -168,7 +168,6 @@ void tcp_sock4_iov_init(const struct ctx *c)
iov = tcp4_l2_flags_iov[i];
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
- iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
@@ -333,9 +332,13 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
else
dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
- for (i = 0; i < TCP_NUM_IOVS; i++)
- memcpy(dup_iov[i].iov_base, iov[i].iov_base,
- iov[i].iov_len);
+ for (i = 0; i < TCP_NUM_IOVS; i++) {
+ /* All frames share the same ethernet header buffer */
+ if (i != TCP_IOV_ETH) {
+ memcpy(dup_iov[i].iov_base, iov[i].iov_base,
+ iov[i].iov_len);
+ }
+ }
dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
}
diff --git a/tcp_internal.h b/tcp_internal.h
index 8b60aab..aa8bb64 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -44,7 +44,7 @@
#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
-#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->faddr))
+#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr))
#define CONN_V6(conn) (!CONN_V4(conn))
/*
diff --git a/tcp_splice.c b/tcp_splice.c
index 483e45d..9f5cc27 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -28,7 +28,7 @@
* - FIN_SENT_0: FIN (write shutdown) sent to accepted socket
* - FIN_SENT_1: FIN (write shutdown) sent to target socket
*
- * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64
+ * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
*/
#include <sched.h>
diff --git a/test/README.md b/test/README.md
index 0936b04..91ca603 100644
--- a/test/README.md
+++ b/test/README.md
@@ -28,10 +28,11 @@ on a system, i.e. common utilities such as a shell are not included here.
Example for Debian, and possibly most Debian-based distributions:
- build-essential git jq strace iperf3 qemu-system-x86 tmux sipcalc bats bc
- catatonit clang-tidy cppcheck go isc-dhcp-common psmisc linux-cpupower socat
- netcat-openbsd fakeroot lz4 lm-sensors qemu-system-arm qemu-system-ppc
- qemu-system-misc qemu-system-x86 valgrind
+ bats bc build-essential catatonit clang-tidy conmon cppcheck crun fakeroot
+ git go iperf3 isc-dhcp-common jq libgpgme-dev libseccomp-dev linux-cpupower
+ lm-sensors lz4 netavark netcat-openbsd psmisc qemu-efi-aarch64
+ qemu-system-arm qemu-system-misc qemu-system-ppc qemu-system-x86
+ sipcalc socat strace tmux uidmap valgrind
NOTE: the tests need a qemu version >= 7.2, or one that contains commit
13c6be96618c ("net: stream: add unix socket"): this change introduces support
diff --git a/test/lib/layout b/test/lib/layout
index f9a1cf1..4d03572 100644
--- a/test/lib/layout
+++ b/test/lib/layout
@@ -15,7 +15,7 @@
# layout_pasta() - Panes for host, pasta, and separate one for namespace
layout_pasta() {
- sleep 3
+ sleep 1
tmux kill-pane -a -t 0
cmd_write 0 clear
@@ -46,7 +46,7 @@ layout_pasta() {
# layout_passt() - Panes for host, passt, and guest
layout_passt() {
- sleep 3
+ sleep 1
tmux kill-pane -a -t 0
cmd_write 0 clear
@@ -77,7 +77,7 @@ layout_passt() {
# layout_passt_in_pasta() - Host, passt within pasta, namespace and guest
layout_passt_in_pasta() {
- sleep 3
+ sleep 1
tmux kill-pane -a -t 0
cmd_write 0 clear
@@ -113,7 +113,7 @@ layout_passt_in_pasta() {
# layout_two_guests() - Two guest panes, two passt panes, plus host and log
layout_two_guests() {
- sleep 3
+ sleep 1
tmux kill-pane -a -t 0
cmd_write 0 clear
@@ -152,7 +152,7 @@ layout_two_guests() {
# layout_demo_pasta() - Four panes for pasta demo
layout_demo_pasta() {
- sleep 3
+ sleep 1
cmd_write 0 cd ${BASEPATH}
cmd_write 0 clear
@@ -188,7 +188,7 @@ layout_demo_pasta() {
# layout_demo_passt() - Four panes for passt demo
layout_demo_passt() {
- sleep 3
+ sleep 1
cmd_write 0 cd ${BASEPATH}
cmd_write 0 clear
@@ -224,7 +224,7 @@ layout_demo_passt() {
# layout_demo_podman() - Four panes for pasta demo with Podman
layout_demo_podman() {
- sleep 3
+ sleep 1
cmd_write 0 cd ${BASEPATH}
cmd_write 0 clear
diff --git a/test/lib/setup b/test/lib/setup
index 9b39b9f..d764138 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -17,6 +17,8 @@ INITRAMFS="${BASEPATH}/mbuto.img"
VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )"
__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)"
VMEM="$((${__mem_kib} / 1024 / 4))"
+QEMU_ARCH="$(uname -m)"
+[ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386
# setup_build() - Set up pane layout for build tests
setup_build() {
@@ -53,7 +55,7 @@ setup_passt() {
wait_for [ -f "${STATESETUP}/passt.pid" ]
GUEST_CID=94557
- context_run_bg qemu 'qemu-system-$(uname -m)' \
+ context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
' -machine accel=kvm' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
@@ -124,7 +126,12 @@ setup_passt_in_ns() {
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
- context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold"
+ __map_host4=192.0.2.1
+ __map_host6=2001:db8:9a55::1
+ __map_ns4=192.0.2.2
+ __map_ns6=2001:db8:9a55::2
+
+ context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --map-host-loopback ${__map_host4} --map-host-loopback ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold"
wait_for [ -f "${STATESETUP}/pasta.pid" ]
context_setup_nstool qemu ${STATESETUP}/ns.hold
@@ -139,16 +146,16 @@ setup_passt_in_ns() {
if [ ${VALGRIND} -eq 1 ]; then
context_run passt "make clean"
context_run passt "make valgrind"
- context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid"
+ context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
else
context_run passt "make clean"
context_run passt "make"
- context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid"
+ context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}"
fi
wait_for [ -f "${STATESETUP}/passt.pid" ]
GUEST_CID=94557
- context_run_bg qemu 'qemu-system-$(uname -m)' \
+ context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \
' -machine accel=kvm' \
' -M accel=kvm:tcg' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
@@ -220,7 +227,7 @@ setup_two_guests() {
wait_for [ -f "${STATESETUP}/passt_2.pid" ]
GUEST_1_CID=94557
- context_run_bg qemu_1 'qemu-system-$(uname -m)' \
+ context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
@@ -233,7 +240,7 @@ setup_two_guests() {
" -device vhost-vsock-pci,guest-cid=$GUEST_1_CID"
GUEST_2_CID=94558
- context_run_bg qemu_2 'qemu-system-$(uname -m)' \
+ context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
' -M accel=kvm:tcg' \
' -m '${VMEM}' -cpu host -smp '${VCPUS} \
' -kernel ' "/boot/vmlinuz-$(uname -r)" \
diff --git a/test/lib/term b/test/lib/term
index 262937e..3834092 100755
--- a/test/lib/term
+++ b/test/lib/term
@@ -97,7 +97,6 @@ display_delay() {
switch_pane() {
tmux select-pane -t ${1}
PR_DELAY=${PR_DELAY_INIT}
- display_delay "0.2"
}
# cmd_write() - Write a command to a pane, letter by letter, and execute it
@@ -199,7 +198,7 @@ pane_run() {
# $1: Pane name
pane_wait() {
__lc="$(echo "${1}" | tr [A-Z] [a-z])"
- sleep 0.1 || sleep 1
+ sleep 0.01 || sleep 1
__done=0
while
@@ -207,7 +206,7 @@ pane_wait() {
case ${__l} in
*"$ " | *"# ") return ;;
esac
- do sleep 0.1 || sleep 1; done
+ do sleep 0.01 || sleep 1; done
}
# pane_parse() - Print last line, @EMPTY@ if command had no output
@@ -231,7 +230,7 @@ pane_status() {
__status="$(pane_parse "${1}")"
while ! [ "${__status}" -eq "${__status}" ] 2>/dev/null; do
- sleep 1
+ sleep 0.01 || sleep 1
pane_run "${1}" 'echo $?'
pane_wait "${1}"
__status="$(pane_parse "${1}")"
@@ -383,6 +382,16 @@ info_check_failed() {
printf " < failed.\n" >> "${LOGFILE}"
}
+# status_bar_blink() - Make status bar blink
+status_bar_blink() {
+ for i in `seq 1 3`; do
+ tmux set status-right-style 'bg=colour1 fg=colour196 bold'
+ sleep 0.1 || sleep 1
+ tmux set status-right-style 'bg=colour1 fg=colour233 bold'
+ sleep 0.1 || sleep 1
+ done
+}
+
# info_passed() - Display, log, and make status bar blink when a test passes
info_passed() {
switch_pane ${PANE_INFO}
@@ -391,12 +400,7 @@ info_passed() {
log "...passed."
log
- for i in `seq 1 3`; do
- tmux set status-right-style 'bg=colour1 fg=colour2 bold'
- sleep "0.1"
- tmux set status-right-style 'bg=colour1 fg=colour233 bold'
- sleep "0.1"
- done
+ [ ${FAST} -eq 1 ] || status_bar_blink
}
# info_failed() - Display, log, and make status bar blink when a test passes
@@ -407,12 +411,7 @@ info_failed() {
log "...failed."
log
- for i in `seq 1 3`; do
- tmux set status-right-style 'bg=colour1 fg=colour196 bold'
- sleep "0.1"
- tmux set status-right-style 'bg=colour1 fg=colour233 bold'
- sleep "0.1"
- done
+ [ ${FAST} -eq 1 ] || status_bar_blink
pause_continue \
"Press any key to pause test session" \
diff --git a/test/lib/test b/test/lib/test
index c525f8e..e6726be 100755
--- a/test/lib/test
+++ b/test/lib/test
@@ -33,7 +33,7 @@ test_iperf3k() {
pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
- sleep 3 # Wait for kernel to free up ports
+ sleep 1 # Wait for kernel to free up ports
}
# test_iperf3() - Ugly helper for iperf3 directive
diff --git a/test/passt.mbuto b/test/passt.mbuto
index 436eecc..138d365 100755
--- a/test/passt.mbuto
+++ b/test/passt.mbuto
@@ -15,6 +15,14 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}"
+# OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
+# sshd-session the per-session program. We need the latter as well, and the path
+# depends on the distribution. It doesn't exist on older versions.
+for bin in /usr/lib/openssh/sshd-session /usr/lib/ssh/sshd-session \
+ /usr/libexec/openssh/sshd-session; do
+ command -v "${bin}" >/dev/null && PROGS="${PROGS} ${bin}"
+done
+
KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}"
LINKS="${LINKS:-
@@ -78,7 +86,7 @@ EOF
EOF
chmod 600 /root/.ssh/authorized_keys
chmod 700 /root
- socat VSOCK-LISTEN:22,fork EXEC:"sshd -i -e" 2> /var/log/vsock-ssh.log &
+ socat VSOCK-LISTEN:22,fork EXEC:"/sbin/sshd -i -e" 2> /var/log/vsock-ssh.log &
sh +m
'
diff --git a/test/passt_in_ns/dhcp b/test/passt_in_ns/dhcp
new file mode 100644
index 0000000..0ceed7c
--- /dev/null
+++ b/test/passt_in_ns/dhcp
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/passt_in_ns/dhcp - Check DHCP and DHCPv6 functionality in passt mode
+#
+# Copyright (c) 2021 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+gtools ip jq dhclient sed tr
+htools ip jq sed tr head
+
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
+test Interface name
+gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check [ -n "__IFNAME__" ]
+
+test DHCP: address
+guest /sbin/dhclient -4 __IFNAME__
+gout ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
+hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check [ "__ADDR__" = "__HOST_ADDR__" ]
+
+test DHCP: route
+gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+hout HOST_GW ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]'
+check [ "__GW__" = "__HOST_GW__" ]
+
+test DHCP: MTU
+gout MTU ip -j link show | jq -rM '.[] | select(.ifname == "__IFNAME__").mtu'
+check [ __MTU__ = 65520 ]
+
+test DHCP: DNS
+gout DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
+hout HOST_DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | head -n3 | tr '\n' ',' | sed 's/,$//;s/$/\n/'
+check [ "__DNS__" = "__HOST_DNS__" ] || ( [ "__DNS__" = "__MAP_NS4__" ] && expr "__HOST_DNS__" : "127[.]" )
+
+# FQDNs should be terminated by dots, but the guest DHCP client might omit them:
+# strip them first
+test DHCP: search list
+gout SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
+hout HOST_SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
+check [ "__SEARCH__" = "__HOST_SEARCH__" ]
+
+test DHCPv6: address
+guest /sbin/dhclient -6 __IFNAME__
+gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check [ "__ADDR6__" = "__HOST_ADDR6__" ]
+
+test DHCPv6: route
+gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
+hout HOST_GW6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]'
+check [ "__GW6__" = "__HOST_GW6__" ]
+
+# Strip interface specifier: interface names might differ between host and guest
+test DHCPv6: DNS
+gout DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
+hout HOST_DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/'
+check [ "__DNS6__" = "__HOST_DNS6__" ] || [ "__DNS6__" = "__MAP_NS6__" -a "__HOST_DNS6__" = "::1" ]
+
+test DHCPv6: search list
+gout SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
+hout HOST_SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/'
+check [ "__SEARCH6__" = "__HOST_SEARCH6__" ]
diff --git a/test/passt_in_ns/tcp b/test/passt_in_ns/tcp
index cdb7060..aaf340e 100644
--- a/test/passt_in_ns/tcp
+++ b/test/passt_in_ns/tcp
@@ -15,6 +15,11 @@ gtools socat ip jq
htools socat ip jq
nstools socat ip jq
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
set TEMP_BIG __STATEDIR__/test_big.bin
set TEMP_SMALL __STATEDIR__/test_small.bin
set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin
@@ -36,16 +41,15 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
test TCP/IPv4: guest to host: big transfer
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
-gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
sleep 1
-guest socat -u OPEN:/root/big.bin TCP4:__GW__:10003
+guest socat -u OPEN:/root/big.bin TCP4:__MAP_HOST4__:10003
hostw
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
test TCP/IPv4: guest to ns: big transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
sleep 1
-guest socat -u OPEN:/root/big.bin TCP4:__GW__:10002
+guest socat -u OPEN:/root/big.bin TCP4:__MAP_NS4__:10002
nsw
check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
@@ -59,7 +63,7 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin
test TCP/IPv4: ns to host (via tap): big transfer
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
sleep 1
-ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003
+ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__MAP_HOST4__:10003
hostw
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
@@ -95,16 +99,15 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
test TCP/IPv4: guest to host: small transfer
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
-gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
sleep 1
-guest socat -u OPEN:/root/small.bin TCP4:__GW__:10003
+guest socat -u OPEN:/root/small.bin TCP4:__MAP_HOST4__:10003
hostw
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
test TCP/IPv4: guest to ns: small transfer
nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc
sleep 1
-guest socat -u OPEN:/root/small.bin TCP4:__GW__:10002
+guest socat -u OPEN:/root/small.bin TCP4:__MAP_NS4__:10002
nsw
check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
@@ -118,7 +121,7 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
test TCP/IPv4: ns to host (via tap): small transfer
hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
sleep 1
-ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003
+ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__MAP_HOST4__:10003
hostw
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
@@ -152,17 +155,15 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
test TCP/IPv6: guest to host: big transfer
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
-gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
-gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
sleep 1
-guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10003
+guest socat -u OPEN:/root/big.bin TCP6:[__MAP_HOST6__]:10003
hostw
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
test TCP/IPv6: guest to ns: big transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc
sleep 1
-guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10002
+guest socat -u OPEN:/root/big.bin TCP6:[__MAP_NS6__]:10002
nsw
check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin
@@ -175,9 +176,8 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin
test TCP/IPv6: ns to host (via tap): big transfer
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc
-nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
sleep 1
-ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003
+ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__MAP_HOST6__]:10003
hostw
check cmp __TEMP_BIG__ __BASEPATH__/big.bin
@@ -190,6 +190,7 @@ guest cmp test_big.bin /root/big.bin
test TCP/IPv6: ns to guest (using namespace address): big transfer
guestb socat -u TCP6-LISTEN:10001 OPEN:test_big.bin,create,trunc
+nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
sleep 1
ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__ADDR6__]:10001
@@ -212,17 +213,15 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
test TCP/IPv6: guest to host: small transfer
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
-gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
-gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
sleep 1
-guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10003
+guest socat -u OPEN:/root/small.bin TCP6:[__MAP_HOST6__]:10003
hostw
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
test TCP/IPv6: guest to ns: small transfer
nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__
sleep 1
-guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10002
+guest socat -u OPEN:/root/small.bin TCP6:[__MAP_NS6__]:10002
nsw
check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin
@@ -235,9 +234,8 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
test TCP/IPv6: ns to host (via tap): small transfer
hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc
-nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
sleep 1
-ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__GW6__%__IFNAME__]:10003
+ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__MAP_HOST6__]:10003
hostw
check cmp __TEMP_SMALL__ __BASEPATH__/small.bin
diff --git a/test/passt_in_ns/udp b/test/passt_in_ns/udp
index 8a02513..3426ab9 100644
--- a/test/passt_in_ns/udp
+++ b/test/passt_in_ns/udp
@@ -15,6 +15,11 @@ gtools socat ip jq
nstools socat ip jq
htools socat ip jq
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
set TEMP __STATEDIR__/test.bin
set TEMP_NS __STATEDIR__/test_ns.bin
@@ -34,16 +39,15 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin
test UDP/IPv4: guest to host
hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
-gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
sleep 1
-guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10003,shut-null
+guest socat -u OPEN:/root/medium.bin UDP4:__MAP_HOST4__:10003,shut-null
hostw
check cmp __TEMP__ __BASEPATH__/medium.bin
test UDP/IPv4: guest to ns
nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
sleep 1
-guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10002,shut-null
+guest socat -u OPEN:/root/medium.bin UDP4:__MAP_NS4__:10002,shut-null
nsw
check cmp __TEMP_NS__ __BASEPATH__/medium.bin
@@ -57,7 +61,7 @@ check cmp __TEMP__ __BASEPATH__/medium.bin
test UDP/IPv4: ns to host (via tap)
hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
sleep 1
-ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null
+ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__MAP_HOST4__:10003,shut-null
hostw
check cmp __TEMP__ __BASEPATH__/medium.bin
@@ -93,17 +97,15 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin
test UDP/IPv6: guest to host
hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
-gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
-gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
sleep 1
-guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null
+guest socat -u OPEN:/root/medium.bin UDP6:[__MAP_HOST6__]:10003,shut-null
hostw
check cmp __TEMP__ __BASEPATH__/medium.bin
test UDP/IPv6: guest to ns
nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc
sleep 1
-guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10002,shut-null
+guest socat -u OPEN:/root/medium.bin UDP6:[__MAP_NS6__]:10002,shut-null
nsw
check cmp __TEMP_NS__ __BASEPATH__/medium.bin
@@ -116,9 +118,8 @@ check cmp __TEMP__ __BASEPATH__/medium.bin
test UDP/IPv6: ns to host (via tap)
hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc
-nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
sleep 1
-ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null
+ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__MAP_HOST6__]:10003,shut-null
hostw
check cmp __TEMP__ __BASEPATH__/medium.bin
@@ -131,6 +132,7 @@ guest cmp test.bin /root/medium.bin
test UDP/IPv6: ns to guest (using namespace address)
guestb socat -u UDP6-LISTEN:10001,null-eof OPEN:test.bin,create,trunc
+nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local'
sleep 1
ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__ADDR6__]:10001,shut-null
diff --git a/test/pasta_options/log_to_file b/test/pasta_options/log_to_file
index fe50e50..3ead06c 100644
--- a/test/pasta_options/log_to_file
+++ b/test/pasta_options/log_to_file
@@ -19,7 +19,7 @@ sleep 1
endef
def flood_log_client
-host tcp_crr --nolog -P 10001 -C 10002 -6 -c -H ::1
+host tcp_crr --nolog -l1 -P 10001 -C 10002 -6 -c -H ::1
endef
def check_log_size_mountns
@@ -42,7 +42,7 @@ pout PID2 echo $!
check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$'
test Maximum log size
-passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -P 10001 -C 10002 -6; done'
+passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done'
sleep 1
flood_log_client
diff --git a/test/perf/passt_tcp b/test/perf/passt_tcp
index 14343cb..089d953 100644
--- a/test/perf/passt_tcp
+++ b/test/perf/passt_tcp
@@ -15,6 +15,9 @@ gtools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr # From neper
nstools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr
htools bc head sed seq
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
test passt: throughput and latency
guest /sbin/sysctl -w net.core.rmem_max=536870912
@@ -29,8 +32,6 @@ ns /sbin/sysctl -w net.ipv4.tcp_rmem="4096 524288 134217728"
ns /sbin/sysctl -w net.ipv4.tcp_wmem="4096 524288 134217728"
ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0
-gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
@@ -38,7 +39,7 @@ hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sy
hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
set THREADS 4
-set TIME 10
+set TIME 1
set OMIT 0.1
set OPTS -Z -P __THREADS__ -l 1M -O__OMIT__
@@ -54,16 +55,16 @@ iperf3s ns 10002
bw -
bw -
guest ip link set dev __IFNAME__ mtu 1280
-iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -w 4M
+iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 4M
bw __BW__ 1.2 1.5
guest ip link set dev __IFNAME__ mtu 1500
-iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -w 4M
+iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 4M
bw __BW__ 1.6 1.8
guest ip link set dev __IFNAME__ mtu 9000
-iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -w 8M
+iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 8M
bw __BW__ 4.0 5.0
guest ip link set dev __IFNAME__ mtu 65520
-iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -w 16M
+iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M
bw __BW__ 7.0 8.0
iperf3k ns
@@ -75,7 +76,7 @@ lat -
lat -
lat -
nsb tcp_rr --nolog -6
-gout LAT tcp_rr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
+gout LAT tcp_rr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 200 150
tl TCP CRR latency over IPv6: guest to host
@@ -85,33 +86,37 @@ lat -
lat -
lat -
nsb tcp_crr --nolog -6
-gout LAT tcp_crr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
+gout LAT tcp_crr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 500 400
tr TCP throughput over IPv4: guest to host
iperf3s ns 10002
guest ip link set dev __IFNAME__ mtu 256
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 1M
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 1M
bw __BW__ 0.2 0.3
guest ip link set dev __IFNAME__ mtu 576
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 1M
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 1M
bw __BW__ 0.5 0.8
guest ip link set dev __IFNAME__ mtu 1280
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 4M
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M
bw __BW__ 1.2 1.5
guest ip link set dev __IFNAME__ mtu 1500
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 4M
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M
bw __BW__ 1.6 1.8
guest ip link set dev __IFNAME__ mtu 9000
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 8M
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M
bw __BW__ 4.0 5.0
guest ip link set dev __IFNAME__ mtu 65520
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -w 16M
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M
bw __BW__ 7.0 8.0
iperf3k ns
+# Reducing the MTU below 1280 deconfigures IPv6: restart the DHCPv6 client to get our address back
+guest dhclient -6 -x
+guest dhclient -6 __IFNAME__
+
tl TCP RR latency over IPv4: guest to host
lat -
lat -
@@ -119,7 +124,7 @@ lat -
lat -
lat -
nsb tcp_rr --nolog -4
-gout LAT tcp_rr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
+gout LAT tcp_rr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 200 150
tl TCP CRR latency over IPv4: guest to host
@@ -129,7 +134,7 @@ lat -
lat -
lat -
nsb tcp_crr --nolog -4
-gout LAT tcp_crr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
+gout LAT tcp_crr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 500 400
tr TCP throughput over IPv6: host to guest
@@ -153,7 +158,7 @@ lat -
lat -
guestb tcp_rr --nolog -P 10001 -C 10011 -6
sleep 1
-nsout LAT tcp_rr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 200 150
tl TCP CRR latency over IPv6: host to guest
@@ -164,7 +169,7 @@ lat -
lat -
guestb tcp_crr --nolog -P 10001 -C 10011 -6
sleep 1
-nsout LAT tcp_crr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 500 350
@@ -189,7 +194,7 @@ lat -
lat -
guestb tcp_rr --nolog -P 10001 -C 10011 -4
sleep 1
-nsout LAT tcp_rr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 200 150
tl TCP CRR latency over IPv6: host to guest
@@ -200,7 +205,7 @@ lat -
lat -
guestb tcp_crr --nolog -P 10001 -C 10011 -4
sleep 1
-nsout LAT tcp_crr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 500 300
te
diff --git a/test/perf/passt_udp b/test/perf/passt_udp
index 8919280..4c66c41 100644
--- a/test/perf/passt_udp
+++ b/test/perf/passt_udp
@@ -15,6 +15,9 @@ gtools /sbin/sysctl ip jq nproc sleep iperf3 udp_rr # From neper
nstools ip jq sleep iperf3 udp_rr
htools bc head sed
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
test passt: throughput and latency
guest /sbin/sysctl -w net.core.rmem_max=16777216
@@ -22,16 +25,12 @@ guest /sbin/sysctl -w net.core.wmem_max=16777216
guest /sbin/sysctl -w net.core.rmem_default=16777216
guest /sbin/sysctl -w net.core.wmem_default=16777216
-gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
-gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-
hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l
hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
set THREADS 2
-set TIME 10
+set TIME 1
set OPTS -u -P __THREADS__ --pacing-timer 1000
info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz
@@ -46,13 +45,13 @@ iperf3s ns 10002
bw -
bw -
-iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -b 3G -l 1232
+iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 3G -l 1232
bw __BW__ 0.8 1.2
-iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -b 4G -l 1452
+iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 4G -l 1452
bw __BW__ 1.0 1.5
-iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -b 8G -l 8952
+iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 8G -l 8952
bw __BW__ 4.0 5.0
-iperf3 BW guest __GW6__%__IFNAME__ 10002 __TIME__ __OPTS__ -b 15G -l 64372
+iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 15G -l 64372
bw __BW__ 4.0 5.0
iperf3k ns
@@ -64,7 +63,7 @@ lat -
lat -
lat -
nsb udp_rr --nolog -6
-gout LAT udp_rr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
+gout LAT udp_rr --nolog -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 200 150
@@ -72,17 +71,17 @@ tr UDP throughput over IPv4: guest to host
iperf3s ns 10002
# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 1G -l 228
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 1G -l 228
bw __BW__ 0.0 0.0
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 2G -l 548
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 2G -l 548
bw __BW__ 0.4 0.6
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 3G -l 1252
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 3G -l 1252
bw __BW__ 0.8 1.2
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 4G -l 1472
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 4G -l 1472
bw __BW__ 1.0 1.5
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 8G -l 8972
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 8G -l 8972
bw __BW__ 4.0 5.0
-iperf3 BW guest __GW__ 10002 __TIME__ __OPTS__ -b 15G -l 65492
+iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 15G -l 65492
bw __BW__ 4.0 5.0
iperf3k ns
@@ -94,7 +93,7 @@ lat -
lat -
lat -
nsb udp_rr --nolog -4
-gout LAT udp_rr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
+gout LAT udp_rr --nolog -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
lat __LAT__ 200 150
diff --git a/test/perf/pasta_tcp b/test/perf/pasta_tcp
index 8d2f911..d1ccf7d 100644
--- a/test/perf/pasta_tcp
+++ b/test/perf/pasta_tcp
@@ -14,6 +14,9 @@
htools head ip seq bc sleep iperf3 tcp_rr tcp_crr jq sed
nstools /sbin/sysctl nproc ip seq sleep iperf3 tcp_rr tcp_crr jq sed
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+
test pasta: throughput and latency (local connections)
ns /sbin/sysctl -w net.ipv4.tcp_rmem="131072 524288 134217728"
@@ -22,7 +25,7 @@ ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0
set THREADS 4
-set TIME 10
+set TIME 1
set OMIT 0.1
set OPTS -Z -w 4M -l 1M -P __THREADS__ -O__OMIT__
@@ -46,13 +49,13 @@ iperf3k host
tl TCP RR latency over IPv6: ns to host
hostb tcp_rr --nolog -P 10003 -C 10013 -6
-nsout LAT tcp_rr --nolog -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 150 100
tl TCP CRR latency over IPv6: ns to host
hostb tcp_crr --nolog -P 10003 -C 10013 -6
-nsout LAT tcp_crr --nolog -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 500 350
@@ -67,13 +70,13 @@ iperf3k host
tl TCP RR latency over IPv4: ns to host
hostb tcp_rr --nolog -P 10003 -C 10013 -4
-nsout LAT tcp_rr --nolog -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 150 100
tl TCP CRR latency over IPv4: ns to host
hostb tcp_crr --nolog -P 10003 -C 10013 -4
-nsout LAT tcp_crr --nolog -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 500 350
@@ -87,13 +90,13 @@ iperf3k ns
tl TCP RR latency over IPv6: host to ns
nsb tcp_rr --nolog -P 10002 -C 10012 -6
-hout LAT tcp_rr --nolog -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
+hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
nsw
lat __LAT__ 150 100
tl TCP CRR latency over IPv6: host to ns
nsb tcp_crr --nolog -P 10002 -C 10012 -6
-hout LAT tcp_crr --nolog -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
+hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p'
nsw
lat __LAT__ 1000 700
@@ -108,13 +111,13 @@ iperf3k ns
tl TCP RR latency over IPv4: host to ns
nsb tcp_rr --nolog -P 10002 -C 10012 -4
-hout LAT tcp_rr --nolog -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
+hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
nsw
lat __LAT__ 150 100
tl TCP CRR latency over IPv4: host to ns
nsb tcp_crr --nolog -P 10002 -C 10012 -4
-hout LAT tcp_crr --nolog -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
+hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
nsw
lat __LAT__ 1000 700
@@ -122,8 +125,6 @@ te
test pasta: throughput and latency (connections via tap)
-nsout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-nsout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
set THREADS 2
set OPTS -Z -P __THREADS__ -i1 -O__OMIT__
@@ -137,16 +138,16 @@ tr TCP throughput over IPv6: ns to host
iperf3s host 10003
ns ip link set dev __IFNAME__ mtu 1500
-iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -w 512k
+iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 512k
bw __BW__ 0.2 0.4
ns ip link set dev __IFNAME__ mtu 4000
-iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -w 1M
+iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 1M
bw __BW__ 0.3 0.5
ns ip link set dev __IFNAME__ mtu 16384
-iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -w 8M
+iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 8M
bw __BW__ 1.5 2.0
ns ip link set dev __IFNAME__ mtu 65520
-iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -w 8M
+iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 8M
bw __BW__ 2.0 2.5
iperf3k host
@@ -156,7 +157,7 @@ lat -
lat -
lat -
hostb tcp_rr --nolog -P 10003 -C 10013 -6
-nsout LAT tcp_rr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 150 100
@@ -165,7 +166,7 @@ lat -
lat -
lat -
hostb tcp_crr --nolog -P 10003 -C 10013 -6
-nsout LAT tcp_crr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 1500 500
@@ -174,16 +175,16 @@ tr TCP throughput over IPv4: ns to host
iperf3s host 10003
ns ip link set dev __IFNAME__ mtu 1500
-iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -w 512k
+iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 512k
bw __BW__ 0.2 0.4
ns ip link set dev __IFNAME__ mtu 4000
-iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -w 1M
+iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 1M
bw __BW__ 0.3 0.5
ns ip link set dev __IFNAME__ mtu 16384
-iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -w 8M
+iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 8M
bw __BW__ 1.5 2.0
ns ip link set dev __IFNAME__ mtu 65520
-iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -w 8M
+iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 8M
bw __BW__ 2.0 2.5
iperf3k host
@@ -193,7 +194,7 @@ lat -
lat -
lat -
hostb tcp_rr --nolog -P 10003 -C 10013 -4
-nsout LAT tcp_rr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 150 100
@@ -202,7 +203,7 @@ lat -
lat -
lat -
hostb tcp_crr --nolog -P 10003 -C 10013 -4
-nsout LAT tcp_crr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 1500 500
@@ -224,7 +225,7 @@ lat -
lat -
lat -
nsb tcp_rr --nolog -P 10002 -C 10012 -6
-hout LAT tcp_rr --nolog -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p'
+hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p'
nsw
lat __LAT__ 150 100
@@ -234,7 +235,7 @@ lat -
lat -
sleep 1
nsb tcp_crr --nolog -P 10002 -C 10012 -6
-hout LAT tcp_crr --nolog -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p'
+hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p'
nsw
lat __LAT__ 5000 10000
@@ -256,7 +257,7 @@ lat -
lat -
lat -
nsb tcp_rr --nolog -P 10002 -C 10012 -4
-hout LAT tcp_rr --nolog -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p'
+hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p'
nsw
lat __LAT__ 150 100
@@ -266,7 +267,7 @@ lat -
lat -
sleep 1
nsb tcp_crr --nolog -P 10002 -C 10012 -4
-hout LAT tcp_crr --nolog -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p'
+hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p'
nsw
lat __LAT__ 5000 10000
diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp
index 6acbfd3..544bf17 100644
--- a/test/perf/pasta_udp
+++ b/test/perf/pasta_udp
@@ -14,6 +14,9 @@
htools bc head ip sleep iperf3 udp_rr jq sed
nstools ip sleep iperf3 udp_rr jq sed
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+
test pasta: throughput and latency (local traffic)
hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1
@@ -21,7 +24,7 @@ hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sy
hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__
set THREADS 1
-set TIME 10
+set TIME 1
set OPTS -u -P __THREADS__
info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz
@@ -133,8 +136,6 @@ te
test pasta: throughput and latency (traffic via tap)
-nsout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
-nsout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz
@@ -146,13 +147,13 @@ tr UDP throughput over IPv6: ns to host
iperf3s host 10003
# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header
-iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -b 8G -l 1472
+iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 8G -l 1472
bw __BW__ 0.3 0.5
-iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -b 12G -l 3972
+iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 12G -l 3972
bw __BW__ 0.5 0.8
-iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -b 20G -l 16356
+iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 20G -l 16356
bw __BW__ 3.0 4.0
-iperf3 BW ns __GW6__%__IFNAME__ 10003 __TIME__ __OPTS__ -b 30G -l 65472
+iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 30G -l 65472
bw __BW__ 6.0 7.0
iperf3k host
@@ -162,7 +163,7 @@ lat -
lat -
lat -
hostb udp_rr --nolog -P 10003 -C 10013 -6
-nsout LAT udp_rr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT udp_rr --nolog -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 200 150
@@ -171,13 +172,13 @@ tr UDP throughput over IPv4: ns to host
iperf3s host 10003
# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header
-iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -b 8G -l 1472
+iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 8G -l 1472
bw __BW__ 0.3 0.5
-iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -b 12G -l 3972
+iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 12G -l 3972
bw __BW__ 0.5 0.8
-iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -b 20G -l 16356
+iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 20G -l 16356
bw __BW__ 3.0 4.0
-iperf3 BW ns __GW__ 10003 __TIME__ __OPTS__ -b 30G -l 65492
+iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 30G -l 65492
bw __BW__ 6.0 7.0
iperf3k host
@@ -187,7 +188,7 @@ lat -
lat -
lat -
hostb udp_rr --nolog -P 10003 -C 10013 -4
-nsout LAT udp_rr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p'
+nsout LAT udp_rr --nolog -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p'
hostw
lat __LAT__ 200 150
diff --git a/test/run b/test/run
index 3b37663..cd6d707 100755
--- a/test/run
+++ b/test/run
@@ -101,7 +101,7 @@ run() {
VALGRIND=1
setup passt_in_ns
test passt/ndp
- test passt/dhcp
+ test passt_in_ns/dhcp
test passt_in_ns/icmp
test passt_in_ns/tcp
test passt_in_ns/udp
@@ -115,7 +115,7 @@ run() {
VALGRIND=0
setup passt_in_ns
test passt/ndp
- test passt/dhcp
+ test passt_in_ns/dhcp
test perf/passt_tcp
test perf/passt_udp
test perf/pasta_tcp
diff --git a/test/valgrind.supp b/test/valgrind.supp
index a158394..735b5f6 100644
--- a/test/valgrind.supp
+++ b/test/valgrind.supp
@@ -6,3 +6,12 @@
...
fun:tcp_sock_consume
}
+
+# same as above, for architectures with the recv() system call (at least i686):
+{
+ passt_recv_MSG_TRUNC_into_NULL_buffer
+ Memcheck:Param
+ socketcall.recv(buf)
+ ...
+ fun:tcp_sock_consume
+}
diff --git a/udp.c b/udp.c
index 7731257..2ba00c9 100644
--- a/udp.c
+++ b/udp.c
@@ -178,8 +178,7 @@ enum udp_iov_idx {
/* IOVs and msghdr arrays for receiving datagrams from sockets */
static struct iovec udp_iov_recv [UDP_MAX_FRAMES];
-static struct mmsghdr udp4_mh_recv [UDP_MAX_FRAMES];
-static struct mmsghdr udp6_mh_recv [UDP_MAX_FRAMES];
+static struct mmsghdr udp_mh_recv [UDP_MAX_FRAMES];
/* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */
static union sockaddr_inany udp_splice_to;
@@ -222,6 +221,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
static void udp_iov_init_one(const struct ctx *c, size_t i)
{
struct udp_payload_t *payload = &udp_payload[i];
+ struct msghdr *mh = &udp_mh_recv[i].msg_hdr;
struct udp_meta_t *meta = &udp_meta[i];
struct iovec *siov = &udp_iov_recv[i];
struct iovec *tiov = udp_l2_iov[i];
@@ -236,27 +236,10 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
tiov[UDP_IOV_PAYLOAD].iov_base = payload;
- /* It's useful to have separate msghdr arrays for receiving. Otherwise,
- * an IPv4 recv() will alter msg_namelen, so we'd have to reset it every
- * time or risk truncating the address on future IPv6 recv()s.
- */
- if (c->ifi4) {
- struct msghdr *mh = &udp4_mh_recv[i].msg_hdr;
-
- mh->msg_name = &meta->s_in;
- mh->msg_namelen = sizeof(struct sockaddr_in);
- mh->msg_iov = siov;
- mh->msg_iovlen = 1;
- }
-
- if (c->ifi6) {
- struct msghdr *mh = &udp6_mh_recv[i].msg_hdr;
-
- mh->msg_name = &meta->s_in;
- mh->msg_namelen = sizeof(struct sockaddr_in6);
- mh->msg_iov = siov;
- mh->msg_iovlen = 1;
- }
+ mh->msg_name = &meta->s_in;
+ mh->msg_namelen = sizeof(meta->s_in);
+ mh->msg_iov = siov;
+ mh->msg_iovlen = 1;
}
/**
@@ -321,7 +304,7 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen)
{
- const struct in_addr *src = inany_v4(&toside->faddr);
+ const struct in_addr *src = inany_v4(&toside->oaddr);
const struct in_addr *dst = inany_v4(&toside->eaddr);
size_t l4len = dlen + sizeof(bp->uh);
size_t l3len = l4len + sizeof(*ip4h);
@@ -333,7 +316,7 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
ip4h->saddr = src->s_addr;
ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, *src, *dst);
- bp->uh.source = htons(toside->fport);
+ bp->uh.source = htons(toside->oport);
bp->uh.dest = htons(toside->eport);
bp->uh.len = htons(l4len);
csum_udp4(&bp->uh, *src, *dst, bp->data, dlen);
@@ -357,15 +340,15 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
ip6h->payload_len = htons(l4len);
ip6h->daddr = toside->eaddr.a6;
- ip6h->saddr = toside->faddr.a6;
+ ip6h->saddr = toside->oaddr.a6;
ip6h->version = 6;
ip6h->nexthdr = IPPROTO_UDP;
ip6h->hop_limit = 255;
- bp->uh.source = htons(toside->fport);
+ bp->uh.source = htons(toside->oport);
bp->uh.dest = htons(toside->eport);
bp->uh.len = ip6h->payload_len;
- csum_udp6(&bp->uh, &toside->faddr.a6, &toside->eaddr.a6, bp->data, dlen);
+ csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, bp->data, dlen);
return l4len;
}
@@ -384,7 +367,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
struct udp_meta_t *bm = &udp_meta[idx];
size_t l4len;
- if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->faddr)) {
+ if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len);
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
sizeof(udp6_eth_hdr));
@@ -404,11 +387,12 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx,
* udp_sock_recverr() - Receive and clear an error from a socket
* @s: Socket to receive from
*
- * Return: true if errors received and processed, false if no more errors
+ * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
+ * if there was an error reading the queue
*
* #syscalls recvmsg
*/
-static bool udp_sock_recverr(int s)
+static int udp_sock_recverr(int s)
{
const struct sock_extended_err *ee;
const struct cmsghdr *hdr;
@@ -425,14 +409,16 @@ static bool udp_sock_recverr(int s)
rc = recvmsg(s, &mh, MSG_ERRQUEUE);
if (rc < 0) {
- if (errno != EAGAIN && errno != EWOULDBLOCK)
- err_perror("Failed to read error queue");
- return false;
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ return 0;
+
+ err_perror("UDP: Failed to read error queue");
+ return -1;
}
if (!(mh.msg_flags & MSG_ERRQUEUE)) {
err("Missing MSG_ERRQUEUE flag reading error queue");
- return false;
+ return -1;
}
hdr = CMSG_FIRSTHDR(&mh);
@@ -441,7 +427,7 @@ static bool udp_sock_recverr(int s)
(hdr->cmsg_level == IPPROTO_IPV6 &&
hdr->cmsg_type == IPV6_RECVERR))) {
err("Unexpected cmsg reading error queue");
- return false;
+ return -1;
}
ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
@@ -450,7 +436,54 @@ static bool udp_sock_recverr(int s)
debug("%s error on UDP socket %i: %s",
str_ee_origin(ee), s, strerror(ee->ee_errno));
- return true;
+ return 1;
+}
+
+/**
+ * udp_sock_errs() - Process errors on a socket
+ * @c: Execution context
+ * @s: Socket to receive from
+ * @events: epoll events bitmap
+ *
+ * Return: Number of errors handled, or < 0 if we have an unrecoverable error
+ */
+static int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
+{
+ unsigned n_err = 0;
+ socklen_t errlen;
+ int rc, err;
+
+ ASSERT(!c->no_udp);
+
+ if (!(events & EPOLLERR))
+ return 0; /* Nothing to do */
+
+ /* Empty the error queue */
+ while ((rc = udp_sock_recverr(s)) > 0)
+ n_err += rc;
+
+ if (rc < 0)
+ return -1; /* error reading error, unrecoverable */
+
+ errlen = sizeof(err);
+ if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 ||
+ errlen != sizeof(err)) {
+ err_perror("Error reading SO_ERROR");
+ return -1; /* error reading error, unrecoverable */
+ }
+
+ if (err) {
+ debug("Unqueued error on UDP socket %i: %s", s, strerror(err));
+ n_err++;
+ }
+
+ if (!n_err) {
+ /* EPOLLERR, but no errors to clear !? */
+ err("EPOLLERR event without reported errors on socket %i", s);
+ return -1; /* no way to clear, unrecoverable */
+ }
+
+ return n_err;
}
/**
@@ -460,7 +493,9 @@ static bool udp_sock_recverr(int s)
* @events: epoll events bitmap
* @mmh mmsghdr array to receive into
*
- * #syscalls recvmmsg
+ * Return: Number of datagrams received
+ *
+ * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
*/
static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
struct mmsghdr *mmh)
@@ -476,12 +511,6 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
ASSERT(!c->no_udp);
- /* Clear any errors first */
- if (events & EPOLLERR) {
- while (udp_sock_recverr(s))
- ;
- }
-
if (!(events & EPOLLIN))
return 0;
@@ -506,10 +535,17 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now)
{
- struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv;
+ const socklen_t sasize = sizeof(udp_meta[0].s_in);
int n, i;
- if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0)
+ if (udp_sock_errs(c, ref.fd, events) < 0) {
+ err("UDP: Unrecoverable error on listening socket:"
+ " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
+ /* FIXME: what now? close/re-open socket? */
+ return;
+ }
+
+ if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
return;
/* We divide datagrams into batches based on how we need to send them,
@@ -518,6 +554,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
* populate it one entry *ahead* of the loop counter.
*/
udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
+ udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
for (i = 0; i < n; ) {
flow_sidx_t batchsidx = udp_meta[i].tosidx;
uint8_t batchpif = pif_at_sidx(batchsidx);
@@ -525,9 +562,9 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
do {
if (pif_is_socket(batchpif)) {
- udp_splice_prepare(mmh_recv, i);
+ udp_splice_prepare(udp_mh_recv, i);
} else if (batchpif == PIF_TAP) {
- udp_tap_prepare(mmh_recv, i,
+ udp_tap_prepare(udp_mh_recv, i,
flowside_at_sidx(batchsidx));
}
@@ -537,6 +574,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
&udp_meta[i].s_in,
now);
+ udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
} while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));
if (pif_is_socket(batchpif)) {
@@ -572,19 +610,23 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now)
{
- const struct flowside *fromside = flowside_at_sidx(ref.flowside);
flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
const struct flowside *toside = flowside_at_sidx(tosidx);
struct udp_flow *uflow = udp_at_sidx(ref.flowside);
int from_s = uflow->s[ref.flowside.sidei];
- bool v6 = !inany_v4(&fromside->eaddr);
- struct mmsghdr *mmh_recv = v6 ? udp6_mh_recv : udp4_mh_recv;
uint8_t topif = pif_at_sidx(tosidx);
int n, i;
ASSERT(!c->no_udp && uflow);
- if ((n = udp_sock_recv(c, from_s, events, mmh_recv)) <= 0)
+ if (udp_sock_errs(c, from_s, events) < 0) {
+ flow_err(uflow, "Unrecoverable error on reply socket");
+ flow_err_details(uflow);
+ udp_flow_close(c, uflow);
+ return;
+ }
+
+ if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
return;
flow_trace(uflow, "Received %d datagrams on reply socket", n);
@@ -592,9 +634,11 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
for (i = 0; i < n; i++) {
if (pif_is_socket(topif))
- udp_splice_prepare(mmh_recv, i);
+ udp_splice_prepare(udp_mh_recv, i);
else if (topif == PIF_TAP)
- udp_tap_prepare(mmh_recv, i, toside);
+ udp_tap_prepare(udp_mh_recv, i, toside);
+ /* Restore sockaddr length clobbered by recvmsg() */
+ udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
}
if (pif_is_socket(topif)) {
@@ -729,45 +773,58 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const void *addr, const char *ifname, in_port_t port)
{
- union udp_listen_epoll_ref uref = { .port = port };
- int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
+ union udp_listen_epoll_ref uref = {
+ .pif = ns ? PIF_SPLICE : PIF_HOST,
+ .port = port,
+ };
+ int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
ASSERT(!c->no_udp);
- if (ns)
- uref.pif = PIF_SPLICE;
- else
- uref.pif = PIF_HOST;
-
- if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
- uref.v6 = 0;
+ if (af == AF_UNSPEC && c->ifi4 && c->ifi6) {
+ int s;
+ /* Attempt to get a dual stack socket */
if (!ns) {
- r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
- addr, ifname, port, uref.u32);
-
+ s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
+ addr, ifname, port, uref.u32);
udp_splice_init[V4][port] = s < 0 ? -1 : s;
+ udp_splice_init[V6][port] = s < 0 ? -1 : s;
} else {
- r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
- &in4addr_loopback,
- ifname, port, uref.u32);
+ s = sock_l4(c, AF_UNSPEC, EPOLL_TYPE_UDP_LISTEN,
+ &in4addr_loopback, ifname, port, uref.u32);
udp_splice_ns[V4][port] = s < 0 ? -1 : s;
+ udp_splice_ns[V6][port] = s < 0 ? -1 : s;
}
+ if (IN_INTERVAL(0, FD_REF_MAX, s))
+ return 0;
}
- if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
- uref.v6 = 1;
+ if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
+ if (!ns) {
+ r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
+ addr, ifname, port, uref.u32);
+ udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
+ } else {
+ r4 = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN,
+ &in4addr_loopback,
+ ifname, port, uref.u32);
+ udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4;
+ }
+ }
+
+ if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
if (!ns) {
- r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
- addr, ifname, port, uref.u32);
+ r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
+ addr, ifname, port, uref.u32);
- udp_splice_init[V6][port] = s < 0 ? -1 : s;
+ udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
} else {
- r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
- &in6addr_loopback,
- ifname, port, uref.u32);
- udp_splice_ns[V6][port] = s < 0 ? -1 : s;
+ r6 = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN,
+ &in6addr_loopback,
+ ifname, port, uref.u32);
+ udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6;
}
}
diff --git a/udp.h b/udp.h
index fb42e1c..a8e76bf 100644
--- a/udp.h
+++ b/udp.h
@@ -26,14 +26,12 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
* union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets
* @port: Source port for connected sockets, bound port otherwise
* @pif: pif for this socket
- * @v6: Set for IPv6 sockets or connections
* @u32: Opaque u32 value of reference
*/
union udp_listen_epoll_ref {
struct {
in_port_t port;
uint8_t pif;
- bool v6:1;
};
uint32_t u32;
};
diff --git a/udp_flow.c b/udp_flow.c
index 8b25ad1..b81be2c 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -8,6 +8,7 @@
#include <errno.h>
#include <fcntl.h>
#include <sys/uio.h>
+#include <unistd.h>
#include "util.h"
#include "passt.h"
@@ -38,8 +39,11 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
* @c: Execution context
* @uflow: UDP flow
*/
-static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
+void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
{
+ if (uflow->closed)
+ return; /* Nothing to do */
+
if (uflow->s[INISIDE] >= 0) {
/* The listening socket needs to stay in epoll */
close(uflow->s[INISIDE]);
@@ -55,6 +59,8 @@ static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
+
+ uflow->closed = true;
}
/**
@@ -174,7 +180,7 @@ cancel:
* @s_in: Source socket address, filled in by recvmmsg()
* @now: Timestamp
*
- * #syscalls fcntl
+ * #syscalls fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
*
* Return: sidx for the destination side of the flow for this packet, or
* FLOW_SIDX_NONE if we couldn't find or create a flow.
@@ -256,6 +262,17 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
}
/**
+ * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
+ * @uflow: Flow to handle
+ *
+ * Return: true if the connection is ready to free, false otherwise
+ */
+bool udp_flow_defer(const struct udp_flow *uflow)
+{
+ return uflow->closed;
+}
+
+/**
* udp_flow_timer() - Handler for timed events related to a given flow
* @c: Execution context
* @uflow: UDP flow
diff --git a/udp_flow.h b/udp_flow.h
index 12ddf03..9a1b059 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -10,6 +10,7 @@
/**
* struct udp - Descriptor for a flow of UDP packets
* @f: Generic flow information
+ * @closed: Flow is already closed
* @ts: Activity timestamp
* @s: Socket fd (or -1) for each side of the flow
*/
@@ -17,6 +18,7 @@ struct udp_flow {
/* Must be first element */
struct flow_common f;
+ bool closed :1;
time_t ts;
int s[SIDES];
};
@@ -30,6 +32,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
const void *saddr, const void *daddr,
in_port_t srcport, in_port_t dstport,
const struct timespec *now);
+void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
+bool udp_flow_defer(const struct udp_flow *uflow);
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now);
diff --git a/util.c b/util.c
index 0b41404..eede4e5 100644
--- a/util.c
+++ b/util.c
@@ -199,8 +199,7 @@ int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
if (bind_addr) {
addr6.sin6_addr = *(struct in6_addr *)bind_addr;
- if (!memcmp(bind_addr, &c->ip6.addr_ll,
- sizeof(c->ip6.addr_ll)))
+ if (IN6_IS_ADDR_LINKLOCAL(bind_addr))
addr6.sin6_scope_id = c->ifi6;
}
return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname,
@@ -250,7 +249,7 @@ void sock_probe_mem(struct ctx *c)
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b)
{
if (a->tv_nsec < b->tv_nsec) {
- return (b->tv_nsec - a->tv_nsec) / 1000 +
+ return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 +
(a->tv_sec - b->tv_sec - 1) * 1000000;
}
@@ -676,6 +675,25 @@ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size)
return dst;
}
+/** eth_ntop() - Convert an Ethernet MAC address to text format
+ * @mac: MAC address
+ * @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes
+ * @size: Size of buffer at @dst
+ *
+ * Return: On success, a non-null pointer to @dst, NULL on failure
+ */
+const char *eth_ntop(const unsigned char *mac, char *dst, size_t size)
+{
+ int len;
+
+ len = snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x",
+ mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+ if (len < 0 || (size_t)len >= size)
+ return NULL;
+
+ return dst;
+}
+
/** str_ee_origin() - Convert socket extended error origin to a string
* @ee: Socket extended error structure
*
@@ -710,7 +728,7 @@ void close_open_files(int argc, char **argv)
int name, rc;
do {
- name = getopt_long(argc, argv, "+:F", optfd, NULL);
+ name = getopt_long(argc, argv, "-:F:", optfd, NULL);
if (name == 'F') {
errno = 0;
diff --git a/util.h b/util.h
index cb4d181..c7a59d5 100644
--- a/util.h
+++ b/util.h
@@ -14,6 +14,9 @@
#include <string.h>
#include <signal.h>
#include <arpa/inet.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/close_range.h>
#include "log.h"
@@ -92,11 +95,7 @@
#define FD_PROTO(x, proto) \
(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))
-#define PORT_EPHEMERAL_MIN ((1 << 15) + (1 << 14)) /* RFC 6335 */
-#define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN)
-
#define MAC_ZERO ((uint8_t [ETH_ALEN]){ 0 })
-#define MAC_LAA ((uint8_t [ETH_ALEN]){ BIT(1), 0, 0, 0, 0, 0 })
#define MAC_IS_ZERO(addr) (!memcmp((addr), MAC_ZERO, ETH_ALEN))
#ifndef __bswap_constant_16
@@ -160,6 +159,25 @@ struct ctx;
/* cppcheck-suppress funcArgNamesDifferent */
__attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); }
+
+#ifdef CLOSE_RANGE_UNSHARE /* Linux kernel >= 5.9 */
+/* glibc < 2.34 and musl as of 1.2.5 need these */
+#ifndef SYS_close_range
+#define SYS_close_range 436
+#endif
+__attribute__ ((weak))
+/* cppcheck-suppress funcArgNamesDifferent */
+int close_range(unsigned int first, unsigned int last, int flags) {
+ return syscall(SYS_close_range, first, last, flags);
+}
+#else
+/* No reasonable fallback option */
+/* cppcheck-suppress funcArgNamesDifferent */
+int close_range(unsigned int first, unsigned int last, int flags) {
+ return 0;
+}
+#endif
+
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
const char *ifname, bool v6only, uint32_t data);
@@ -215,9 +233,12 @@ static inline const char *af_name(sa_family_t af)
#define SOCKADDR_STRLEN MAX(SOCKADDR_INET_STRLEN, SOCKADDR_INET6_STRLEN)
+#define ETH_ADDRSTRLEN (sizeof("00:11:22:33:44:55"))
+
struct sock_extended_err;
const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size);
+const char *eth_ntop(const unsigned char *mac, char *dst, size_t size);
const char *str_ee_origin(const struct sock_extended_err *ee);
/**