diff options
| -rw-r--r-- | .clang-tidy | 5 | ||||
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | arp.h | 2 | ||||
| -rw-r--r-- | checksum.c | 110 | ||||
| -rw-r--r-- | checksum.h | 3 | ||||
| -rw-r--r-- | conf.c | 269 | ||||
| -rw-r--r-- | contrib/apparmor/abstractions/passt | 11 | ||||
| -rw-r--r-- | contrib/apparmor/abstractions/pasta | 2 | ||||
| -rw-r--r-- | contrib/apparmor/usr.bin.passt | 2 | ||||
| -rw-r--r-- | contrib/apparmor/usr.bin.passt-repair | 2 | ||||
| -rw-r--r-- | contrib/apparmor/usr.bin.pasta | 2 | ||||
| -rw-r--r-- | contrib/fedora/passt.spec | 2 | ||||
| -rw-r--r-- | contrib/selinux/pasta.te | 4 | ||||
| -rw-r--r-- | dhcp.c | 4 | ||||
| -rw-r--r-- | dhcpv6.c | 7 | ||||
| -rw-r--r-- | doc/platform-requirements/.gitignore | 1 | ||||
| -rw-r--r-- | doc/platform-requirements/Makefile | 8 | ||||
| -rw-r--r-- | doc/platform-requirements/listen-vs-repair.c | 6 | ||||
| -rw-r--r-- | doc/platform-requirements/tcp-close-rst.c | 204 | ||||
| -rw-r--r-- | epoll_ctl.h | 11 | ||||
| -rw-r--r-- | flow.c | 123 | ||||
| -rw-r--r-- | flow.h | 18 | ||||
| -rw-r--r-- | flow_table.h | 3 | ||||
| -rw-r--r-- | fwd.c | 442 | ||||
| -rw-r--r-- | fwd.h | 92 | ||||
| -rwxr-xr-x | hooks/pre-push | 2 | ||||
| -rw-r--r-- | icmp.c | 11 | ||||
| -rw-r--r-- | icmp.h | 4 | ||||
| -rw-r--r-- | icmp_flow.h | 2 | ||||
| -rw-r--r-- | igmp.c | 1 | ||||
| -rw-r--r-- | inany.c | 78 | ||||
| -rw-r--r-- | inany.h | 22 | ||||
| -rw-r--r-- | iov.c | 68 | ||||
| -rw-r--r-- | iov.h | 7 | ||||
| -rw-r--r-- | ip.c | 47 | ||||
| -rw-r--r-- | ip.h | 5 | ||||
| -rw-r--r-- | isolation.h | 3 | ||||
| -rw-r--r-- | lineread.h | 2 | ||||
| -rw-r--r-- | linux_dep.h | 3 | ||||
| -rw-r--r-- | log.h | 11 | ||||
| -rw-r--r-- | migrate.c | 29 | ||||
| -rw-r--r-- | migrate.h | 3 | ||||
| -rw-r--r-- | ndp.c | 4 | ||||
| -rw-r--r-- | netlink.h | 4 | ||||
| -rw-r--r-- | packet.h | 2 | ||||
| -rw-r--r-- | passt.1 | 39 | ||||
| -rw-r--r-- | passt.c | 9 | ||||
| -rw-r--r-- | passt.h | 10 | ||||
| -rw-r--r-- | pasta.c | 38 | ||||
| -rw-r--r-- | pasta.h | 2 | ||||
| -rw-r--r-- | pcap.h | 2 | ||||
| -rw-r--r-- | pif.c | 24 | ||||
| -rw-r--r-- | pif.h | 12 | ||||
| -rw-r--r-- | repair.h | 2 | ||||
| -rwxr-xr-x | seccomp.sh | 6 | ||||
| -rw-r--r-- | siphash.h | 3 | ||||
| -rw-r--r-- | tap.c | 28 | ||||
| -rw-r--r-- | tap.h | 8 | ||||
| -rw-r--r-- | tcp.c | 696 | ||||
| -rw-r--r-- | tcp.h | 44 | ||||
| -rw-r--r-- | tcp_buf.c | 34 | ||||
| -rw-r--r-- | tcp_conn.h | 21 | ||||
| -rw-r--r-- | tcp_internal.h | 22 | ||||
| -rw-r--r-- | tcp_splice.c | 164 | ||||
| -rw-r--r-- | tcp_splice.h | 2 | ||||
| -rw-r--r-- | tcp_vu.c | 57 | ||||
| -rw-r--r-- | test/Makefile | 52 | ||||
| -rwxr-xr-x | test/lib/term | 6 | ||||
| -rwxr-xr-x | test/passt.mbuto | 6 | ||||
| -rw-r--r-- | udp.c | 232 | ||||
| -rw-r--r-- | udp.h | 31 | ||||
| -rw-r--r-- | udp_flow.c | 61 | ||||
| -rw-r--r-- | udp_flow.h | 12 | ||||
| -rw-r--r-- | udp_internal.h | 9 | ||||
| -rw-r--r-- | udp_vu.c | 52 | ||||
| -rw-r--r-- | udp_vu.h | 2 | ||||
| -rw-r--r-- | util.c | 87 | ||||
| -rw-r--r-- | util.h | 6 | ||||
| -rw-r--r-- | vhost_user.c | 12 | ||||
| -rw-r--r-- | virtio.c | 8 | ||||
| -rw-r--r-- | virtio.h | 2 | ||||
| -rw-r--r-- | vu_common.c | 48 | ||||
| -rw-r--r-- | vu_common.h | 15 |
83 files changed, 2428 insertions, 1079 deletions
diff --git a/.clang-tidy b/.clang-tidy index 9d346ec..773121f 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -81,6 +81,11 @@ Checks: # precedence over addition in modern mathematical notation. Adding # parentheses to reinforce that certainly won't improve readability. - "-readability-math-missing-parentheses" + + # #if defined(FOO) is fine, and can be more consistent with other + # #if directives. Don't insist on #ifdef instead. + - "-readability-use-concise-preprocessor-directives" + WarningsAsErrors: "*" HeaderFileExtensions: - h @@ -291,7 +291,7 @@ speeding up local connections, and usually requiring NAT. _pasta_: * ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted) * ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached * ✅ no external dependencies (other than a standard C library) -* ✅ restrictive seccomp profiles (33 syscalls allowed for _passt_, 43 for +* ✅ restrictive seccomp profiles (34 syscalls allowed for _passt_, 43 for _pasta_ on x86_64) * ✅ examples of [AppArmor](/passt/tree/contrib/apparmor) and [SELinux](/passt/tree/contrib/selinux) profiles available @@ -6,6 +6,8 @@ #ifndef ARP_H #define ARP_H +#include <linux/if_ether.h> + /** * struct arpmsg - 802.2 ARP IPv4 payload * @sha: Sender hardware address @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, icmp6hr->icmp6_cksum = csum(payload, dlen, psum); } -#ifdef __AVX2__ +#if defined(__AVX2__) #include <immintrin.h> /** @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) return init; } -#else /* __AVX2__ */ +#elif defined(__POWER9_VECTOR__) || defined(__POWER8_VECTOR__) +#include <altivec.h> + +/** + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 32-bit checksum, not complemented, not folded + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ 
+__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ +static uint32_t csum_vsx(const void *buf, size_t len, uint32_t init) +{ + const uint8_t *p = buf; + vector unsigned int sum_even = vec_splat_u32(0); + vector unsigned int sum_odd = vec_splat_u32(0); + const vector unsigned short ones = vec_splat_u16(1); + uint64_t sum64 = init; + +#ifdef __POWER9_VECTOR__ + while (len >= 64) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned char v1b = vec_vsx_ld(16, p); + vector unsigned char v2b = vec_vsx_ld(32, p); + vector unsigned char v3b = vec_vsx_ld(48, p); + vector unsigned short v0 = (vector unsigned short)v0b; + vector unsigned short v1 = (vector unsigned short)v1b; + vector unsigned short v2 = (vector unsigned short)v2b; + vector unsigned short v3 = (vector unsigned short)v3b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + sum_even = vec_add(sum_even, vec_mule(v1, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); + sum_even = vec_add(sum_even, vec_mule(v2, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones)); + sum_even = vec_add(sum_even, vec_mule(v3, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones)); + + p += 64; + len -= 64; + } +#endif + + while (len >= 32) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned char v1b = vec_vsx_ld(16, p); + vector unsigned short v0 = (vector unsigned short)v0b; + vector unsigned short v1 = (vector unsigned short)v1b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + sum_even = vec_add(sum_even, vec_mule(v1, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); + + p += 32; + len -= 32; + } + + while (len >= 16) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned short v0 = (vector unsigned short)v0b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + + p += 16; + len -= 16; + } 
+ + { + vector unsigned int sum32 = vec_add(sum_even, sum_odd); + uint32_t partial[4] __attribute__((aligned(16))); + + vec_st(sum32, 0, partial); + sum64 += (uint64_t)partial[0] + partial[1] + + partial[2] + partial[3]; + } + + sum64 += sum_16b(p, len); + + sum64 = (sum64 >> 32) + (sum64 & 0xffffffff); + sum64 += sum64 >> 32; + + return (uint32_t)sum64; +} + +/** + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. + * + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 32-bit unfolded checksum + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ +uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) +{ + return csum_vsx(buf, len, init); +} +#else /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ /** * csum_unfolded() - Calculate the unfolded checksum of a data buffer. * @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) { return sum_16b(buf, len) + init; } -#endif /* !__AVX2__ */ +#endif /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ /** * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector @@ -6,6 +6,9 @@ #ifndef CHECKSUM_H #define CHECKSUM_H +#include <stddef.h> +#include <stdint.h> + struct udphdr; struct icmphdr; struct icmp6hdr; @@ -135,9 +135,9 @@ static int parse_port_range(const char *s, char **endptr, * @ifname: Listening interface * @first: First port to forward * @last: Last port to forward - * @exclude: Bitmap of ports to exclude + * @exclude: Bitmap of ports to exclude (may be NULL) * @to: Port to translate @first to when forwarding - * @weak: Ignore errors, as long as at least one port is mapped + * @flags: Flags for forwarding entries */ static void conf_ports_range_except(const struct ctx *c, char optname, const char *optarg, struct fwd_ports *fwd, @@ -145,58 +145,58 @@ static 
void conf_ports_range_except(const struct ctx *c, char optname, const char *ifname, uint16_t first, uint16_t last, const uint8_t *exclude, uint16_t to, - bool weak) + uint8_t flags) { - bool bound_one = false; - unsigned i; - int ret; + unsigned delta = to - first; + unsigned base, i; if (first == 0) { die("Can't forward port 0 for option '-%c %s'", optname, optarg); } - if (ifname && c->no_bindtodevice) { - die( -"Device binding for '-%c %s' unsupported (requires kernel 5.7+)", - optname, optarg); + if (addr) { + if (!c->ifi4 && inany_v4(addr)) { + die("IPv4 is disabled, can't use -%c %s", + optname, optarg); + } else if (!c->ifi6 && !inany_v4(addr)) { + die("IPv6 is disabled, can't use -%c %s", + optname, optarg); + } } - for (i = first; i <= last; i++) { - if (bitmap_isset(exclude, i)) + for (base = first; base <= last; base++) { + if (exclude && bitmap_isset(exclude, base)) continue; - if (bitmap_isset(fwd->map, i)) { - warn( -"Altering mapping of already mapped port number: %s", optarg); + for (i = base; i <= last; i++) { + if (exclude && bitmap_isset(exclude, i)) + break; } - bitmap_set(fwd->map, i); - fwd->delta[i] = to - first; - - if (optname == 't') - ret = tcp_sock_init(c, PIF_HOST, addr, ifname, i); - else if (optname == 'u') - ret = udp_sock_init(c, PIF_HOST, addr, ifname, i); - else - /* No way to check in advance for -T and -U */ - ret = 0; - - if (ret == -ENFILE || ret == -EMFILE) { - die("Can't open enough sockets for port specifier: %s", - optarg); - } + if ((optname == 'T' || optname == 'U') && c->no_bindtodevice) { + /* FIXME: Once the fwd bitmaps are removed, move this + * workaround to the caller + */ + ASSERT(!addr && ifname && !strcmp(ifname, "lo")); + warn( +"SO_BINDTODEVICE unavailable, forwarding only 127.0.0.1 and ::1 for '-%c %s'", + optname, optarg); - if (!ret) { - bound_one = true; - } else if (!weak) { - die("Failed to bind port %u (%s) for option '-%c %s'", - i, strerror_(-ret), optname, optarg); + if (c->ifi4) { + 
fwd_rule_add(fwd, flags, &inany_loopback4, NULL, + base, i - 1, base + delta); + } + if (c->ifi6) { + fwd_rule_add(fwd, flags, &inany_loopback6, NULL, + base, i - 1, base + delta); + } + } else { + fwd_rule_add(fwd, flags, addr, ifname, + base, i - 1, base + delta); } + base = i - 1; } - - if (!bound_one) - die("Failed to bind any port for '-%c %s'", optname, optarg); } /** @@ -262,7 +262,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, conf_ports_range_except(c, optname, optarg, fwd, NULL, NULL, 1, NUM_PORTS - 1, exclude, - 1, true); + 1, FWD_WEAK); return; } @@ -338,6 +338,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, } } while ((p = next_chunk(p, ','))); + if (ifname && c->no_bindtodevice) { + die( +"Device binding for '-%c %s' unsupported (requires kernel 5.7+)", + optname, optarg); + } + /* Outbound forwards come from guest loopback */ + if ((optname == 'T' || optname == 'U') && !ifname) + ifname = "lo"; + if (exclude_only) { /* Exclude ephemeral ports */ for (i = 0; i < NUM_PORTS; i++) @@ -347,7 +356,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, conf_ports_range_except(c, optname, optarg, fwd, addr, ifname, 1, NUM_PORTS - 1, exclude, - 1, true); + 1, FWD_WEAK); return; } @@ -380,7 +389,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, addr, ifname, orig_range.first, orig_range.last, exclude, - mapped_range.first, false); + mapped_range.first, 0); } while ((p = next_chunk(p, ','))); return; @@ -682,7 +691,7 @@ static int conf_ip4_prefix(const char *arg) return -1; } else { errno = 0; - len = strtoul(optarg, NULL, 0); + len = strtoul(arg, NULL, 0); if (len > 32 || errno) return -1; } @@ -826,7 +835,7 @@ static void conf_ip6_local(struct ip6_ctx *ip6) * usage() - Print usage, exit with given status code * @name: Executable name * @f: Stream to print usage info to - * @status: Status code for _exit() + * @status: Status 
code for exit(2) */ static void usage(const char *name, FILE *f, int status) { @@ -896,7 +905,7 @@ static void usage(const char *name, FILE *f, int status) " a zero value disables assignment\n" " default: 65520: maximum 802.3 MTU minus 802.3 header\n" " length, rounded to 32 bits (IPv4 words)\n" - " -a, --address ADDR Assign IPv4 or IPv6 address ADDR\n" + " -a, --address ADDR Assign IPv4 or IPv6 address ADDR[/PREFIXLEN]\n" " can be specified zero to two times (for IPv4 and IPv6)\n" " default: use addresses from interface with default route\n" " -n, --netmask MASK Assign IPv4 MASK, dot-decimal or bits\n" @@ -997,8 +1006,7 @@ static void usage(const char *name, FILE *f, int status) " SPEC is as described for TCP above\n" " default: none\n"); - (void)fflush(f); - _exit(status); + passt_exit(status); pasta_opts: @@ -1050,10 +1058,10 @@ pasta_opts: " --no-copy-addrs DEPRECATED:\n" " Don't copy all addresses to namespace\n" " --ns-mac-addr ADDR Set MAC address on tap interface\n" - " --no-splice Disable inbound socket splicing\n"); + " --no-splice Disable inbound socket splicing\n" + " --splice-only Only enable loopback forwarding\n"); - (void)fflush(f); - _exit(status); + passt_exit(status); } /** @@ -1109,7 +1117,7 @@ static void conf_print(const struct ctx *c) info("Template interface: %s%s%s%s%s", c->ifi4 > 0 ? if_indextoname(c->ifi4, ifn) : "", c->ifi4 > 0 ? " (IPv4)" : "", - (c->ifi4 && c->ifi6) ? ", " : "", + (c->ifi4 > 0 && c->ifi6 > 0) ? ", " : "", c->ifi6 > 0 ? if_indextoname(c->ifi6, ifn) : "", c->ifi6 > 0 ? 
" (IPv6)" : ""); } @@ -1134,7 +1142,7 @@ static void conf_print(const struct ctx *c) inet_ntop(AF_INET6, &c->ip6.addr_out, buf6, sizeof(buf6))); } - if (c->mode == MODE_PASTA) + if (c->mode == MODE_PASTA && !c->splice_only) info("Namespace interface: %s", c->pasta_ifn); info("MAC:"); @@ -1161,7 +1169,9 @@ static void conf_print(const struct ctx *c) buf4, sizeof(buf4))); } - for (i = 0; !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[i]); i++) { + for (i = 0; i < ARRAY_SIZE(c->ip4.dns); i++) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[i])) + break; if (!i) info("DNS:"); inet_ntop(AF_INET, &c->ip4.dns[i], buf4, sizeof(buf4)); @@ -1199,7 +1209,9 @@ static void conf_print(const struct ctx *c) buf6, sizeof(buf6))); dns6: - for (i = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[i]); i++) { + for (i = 0; i < ARRAY_SIZE(c->ip6.dns); i++) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[i])) + break; if (!i) info("DNS:"); inet_ntop(AF_INET6, &c->ip6.dns[i], buf6, sizeof(buf6)); @@ -1212,6 +1224,17 @@ dns6: info(" %s", c->dns_search[i].n); } } + + info("Inbound TCP forwarding:"); + fwd_rules_print(&c->tcp.fwd_in); + info("Inbound UDP forwarding:"); + fwd_rules_print(&c->udp.fwd_in); + if (c->mode == MODE_PASTA) { + info("Outbound TCP forwarding:"); + fwd_rules_print(&c->tcp.fwd_out); + info("Outbound UDP forwarding:"); + fwd_rules_print(&c->udp.fwd_out); + } } /** @@ -1222,7 +1245,7 @@ dns6: * * Return: 0 on success, negative error code on failure */ -static int conf_runas(char *opt, unsigned int *uid, unsigned int *gid) +static int conf_runas(const char *opt, unsigned int *uid, unsigned int *gid) { const char *uopt, *gopt = NULL; char *sep = strchr(opt, ':'); @@ -1453,6 +1476,7 @@ void conf(struct ctx *c, int argc, char **argv) {"no-ndp", no_argument, &c->no_ndp, 1 }, {"no-ra", no_argument, &c->no_ra, 1 }, {"no-splice", no_argument, &c->no_splice, 1 }, + {"splice-only", no_argument, &c->splice_only, 1 }, {"freebind", no_argument, &c->freebind, 1 }, {"no-map-gw", no_argument, &no_map_gw, 1 }, 
{"ipv4-only", no_argument, NULL, '4' }, @@ -1506,6 +1530,8 @@ void conf(struct ctx *c, int argc, char **argv) unsigned dns4_idx = 0, dns6_idx = 0; unsigned long max_mtu = IP_MAX_MTU; struct fqdn *dnss = c->dns_search; + bool addr_has_prefix_len = false; + uint8_t prefix_len_from_opt = 0; unsigned int ifi4 = 0, ifi6 = 0; const char *logfile = NULL; size_t logsize = 0; @@ -1621,8 +1647,7 @@ void conf(struct ctx *c, int argc, char **argv) FPRINTF(stdout, c->mode == MODE_PASTA ? "pasta " : "passt "); FPRINTF(stdout, VERSION_BLOB); - (void)fflush(stdout); - _exit(EXIT_SUCCESS); + passt_exit(EXIT_SUCCESS); case 15: ret = snprintf(c->ip4.ifname_out, sizeof(c->ip4.ifname_out), "%s", optarg); @@ -1811,36 +1836,56 @@ void conf(struct ctx *c, int argc, char **argv) c->mtu = mtu; break; } - case 'a': - if (inet_pton(AF_INET6, optarg, &c->ip6.addr) && - !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr) && - !IN6_IS_ADDR_LOOPBACK(&c->ip6.addr) && - !IN6_IS_ADDR_V4MAPPED(&c->ip6.addr) && - !IN6_IS_ADDR_V4COMPAT(&c->ip6.addr) && - !IN6_IS_ADDR_MULTICAST(&c->ip6.addr)) { - if (c->mode == MODE_PASTA) - c->ip6.no_copy_addrs = true; - break; - } + case 'a': { + union inany_addr addr; + uint8_t prefix_len; + + addr_has_prefix_len = inany_prefix_pton(optarg, &addr, + &prefix_len); + + if (addr_has_prefix_len && prefix_len_from_opt) + die("Redundant prefix length specification"); + + if (!addr_has_prefix_len && !inany_pton(optarg, &addr)) + die("Invalid address: %s", optarg); + + if (prefix_len_from_opt && inany_v4(&addr)) + prefix_len = prefix_len_from_opt; + else if (!addr_has_prefix_len) + prefix_len = inany_default_prefix_len(&addr); + + if (inany_is_unspecified(&addr) || + inany_is_multicast(&addr) || + inany_is_loopback(&addr) || + IN6_IS_ADDR_V4COMPAT(&addr.a6)) + die("Invalid address: %s", optarg); - if (inet_pton(AF_INET, optarg, &c->ip4.addr) && - !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.addr) && - !IN4_IS_ADDR_BROADCAST(&c->ip4.addr) && - !IN4_IS_ADDR_LOOPBACK(&c->ip4.addr) && - 
!IN4_IS_ADDR_MULTICAST(&c->ip4.addr)) { + if (inany_v4(&addr)) { + c->ip4.addr = *inany_v4(&addr); + c->ip4.prefix_len = prefix_len - 96; if (c->mode == MODE_PASTA) c->ip4.no_copy_addrs = true; - break; + } else { + c->ip6.addr = addr.a6; + if (c->mode == MODE_PASTA) + c->ip6.no_copy_addrs = true; } - - die("Invalid address: %s", optarg); break; - case 'n': - c->ip4.prefix_len = conf_ip4_prefix(optarg); - if (c->ip4.prefix_len < 0) - die("Invalid netmask: %s", optarg); + } + case 'n': { + int plen; + + if (addr_has_prefix_len) + die("Redundant prefix length specification"); + + plen = conf_ip4_prefix(optarg); + if (plen < 0) + die("Invalid prefix length: %s", optarg); + prefix_len_from_opt = plen + 96; + c->ip4.prefix_len = plen; break; + } case 'M': parse_mac(c->our_tap_mac, optarg); break; @@ -1950,8 +1995,11 @@ void conf(struct ctx *c, int argc, char **argv) } } while (name != -1); - if (c->mode != MODE_PASTA) + if (c->mode != MODE_PASTA) { c->no_splice = 1; + if (c->splice_only) + die("--splice-only is for pasta mode only"); + } if (c->mode == MODE_PASTA && !c->pasta_conf_ns) { if (copy_routes_opt) @@ -1960,6 +2008,16 @@ void conf(struct ctx *c, int argc, char **argv) die("--no-copy-addrs needs --config-net"); } + if (c->mode == MODE_PASTA && c->splice_only) { + if (c->no_splice) + die("--splice-only is incompatible with --no-splice"); + + c->host_lo_to_ns_lo = 1; + c->no_icmp = 1; + c->no_dns = 1; + c->no_dns_search = 1; + } + if (!ifi4 && *c->ip4.ifname_out) ifi4 = if_nametoindex(c->ip4.ifname_out); @@ -1983,9 +2041,9 @@ void conf(struct ctx *c, int argc, char **argv) log_conf_parsed = true; /* Stop printing everything */ nl_sock_init(c, false); - if (!v6_only) + if (!v6_only && !c->splice_only) c->ifi4 = conf_ip4(ifi4, &c->ip4); - if (!v4_only) + if (!v4_only && !c->splice_only) c->ifi6 = conf_ip6(ifi6, &c->ip6); if (c->ifi4 && c->mtu < IPV4_MIN_MTU) { @@ -2001,20 +2059,25 @@ void conf(struct ctx *c, int argc, char **argv) (*c->ip6.ifname_out && !c->ifi6)) 
die("External interface not usable"); + if (!c->ifi4 && !c->ifi6 && !*c->pasta_ifn) { + strncpy(c->pasta_ifn, pasta_default_ifn, + sizeof(c->pasta_ifn) - 1); + } - if (!c->ifi4 && !c->ifi6) { - info("No external interface as template, switch to local mode"); - - conf_ip4_local(&c->ip4); + if (!c->ifi4 && !v6_only) { + if (!c->splice_only) { + info("IPv4: no external interface as template, use local mode"); + conf_ip4_local(&c->ip4); + } c->ifi4 = -1; + } - conf_ip6_local(&c->ip6); - c->ifi6 = -1; - - if (!*c->pasta_ifn) { - strncpy(c->pasta_ifn, pasta_default_ifn, - sizeof(c->pasta_ifn) - 1); + if (!c->ifi6 && !v4_only) { + if (!c->splice_only) { + info("IPv6: no external interface as template, use local mode"); + conf_ip6_local(&c->ip6); } + c->ifi6 = -1; } if (c->ifi4 && !no_map_gw && @@ -2032,7 +2095,6 @@ void conf(struct ctx *c, int argc, char **argv) * settings */ fwd_probe_ephemeral(); - udp_portmap_clear(); optind = 0; do { name = getopt_long(argc, argv, optstring, options, NULL); @@ -2144,7 +2206,26 @@ void conf(struct ctx *c, int argc, char **argv) if (!c->udp.fwd_out.mode) c->udp.fwd_out.mode = fwd_default; - fwd_scan_ports_init(c); + if (c->tcp.fwd_in.mode == FWD_AUTO) { + conf_ports_range_except(c, 't', "auto", &c->tcp.fwd_in, + NULL, NULL, 1, NUM_PORTS - 1, + NULL, 1, FWD_SCAN); + } + if (c->tcp.fwd_out.mode == FWD_AUTO) { + conf_ports_range_except(c, 'T', "auto", &c->tcp.fwd_out, + NULL, "lo", 1, NUM_PORTS - 1, + NULL, 1, FWD_SCAN); + } + if (c->udp.fwd_in.mode == FWD_AUTO) { + conf_ports_range_except(c, 'u', "auto", &c->udp.fwd_in, + NULL, NULL, 1, NUM_PORTS - 1, + NULL, 1, FWD_SCAN); + } + if (c->udp.fwd_out.mode == FWD_AUTO) { + conf_ports_range_except(c, 'U', "auto", &c->udp.fwd_out, + NULL, "lo", 1, NUM_PORTS - 1, + NULL, 1, FWD_SCAN); + } if (!c->quiet) conf_print(c); diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt index 43fd63f..85bd1ee 100644 --- a/contrib/apparmor/abstractions/passt +++ 
b/contrib/apparmor/abstractions/passt @@ -11,7 +11,7 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> - abi <abi/3.0>, + abi <abi/4.0>, include <abstractions/base> @@ -24,6 +24,7 @@ capability setpcap, capability net_admin, capability sys_ptrace, + userns, / r, # isolate_prefork(), isolation.c mount options=(rw, runbindable) -> /, @@ -36,6 +37,14 @@ @{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral() + @{PROC}/sys/net/ipv4/tcp_syn_retries r, # tcp_get_rto_params(), tcp.c + @{PROC}/sys/net/ipv4/tcp_syn_linear_timeouts r, + @{PROC}/sys/net/ipv4/tcp_rto_max_ms r, + + # udp_get_timeout_params(), udp.c + @{PROC}/sys/net/netfilter/nf_conntrack_udp_timeout r, + @{PROC}/sys/net/netfilter/nf_conntrack_udp_timeout_stream r, + network netlink raw, # nl_sock_init_do(), netlink.c network inet stream, # tcp.c diff --git a/contrib/apparmor/abstractions/pasta b/contrib/apparmor/abstractions/pasta index 9f73bee..251d4a2 100644 --- a/contrib/apparmor/abstractions/pasta +++ b/contrib/apparmor/abstractions/pasta @@ -11,7 +11,7 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> - abi <abi/3.0>, + abi <abi/4.0>, include <abstractions/passt> diff --git a/contrib/apparmor/usr.bin.passt b/contrib/apparmor/usr.bin.passt index 62a4514..c123a86 100644 --- a/contrib/apparmor/usr.bin.passt +++ b/contrib/apparmor/usr.bin.passt @@ -11,7 +11,7 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -abi <abi/3.0>, +abi <abi/4.0>, include <tunables/global> diff --git a/contrib/apparmor/usr.bin.passt-repair b/contrib/apparmor/usr.bin.passt-repair index 901189d..23ff1ce 100644 --- a/contrib/apparmor/usr.bin.passt-repair +++ b/contrib/apparmor/usr.bin.passt-repair @@ -11,7 +11,7 @@ # Copyright (c) 2025 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -abi <abi/3.0>, +abi <abi/4.0>, #include <tunables/global> diff --git a/contrib/apparmor/usr.bin.pasta b/contrib/apparmor/usr.bin.pasta 
index 2483968..56b5024 100644 --- a/contrib/apparmor/usr.bin.pasta +++ b/contrib/apparmor/usr.bin.pasta @@ -11,7 +11,7 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -abi <abi/3.0>, +abi <abi/4.0>, include <tunables/global> diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec index bcbe1f7..38b06b0 100644 --- a/contrib/fedora/passt.spec +++ b/contrib/fedora/passt.spec @@ -37,7 +37,7 @@ requiring any capabilities or privileges. %package selinux BuildArch: noarch Summary: SELinux support for passt and pasta -%if 0%{?fedora} >= 43 +%if 0%{?fedora} > 43 BuildRequires: selinux-policy-devel %selinux_requires_min %else diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te index 95fe42a..fb51416 100644 --- a/contrib/selinux/pasta.te +++ b/contrib/selinux/pasta.te @@ -149,7 +149,7 @@ allow pasta_t root_t:dir mounton; manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t) files_pid_filetrans(pasta_t, pasta_pid_t, file) -allow pasta_t user_tmp_t:dir { add_name remove_name search write }; +allow pasta_t user_tmp_t:dir { add_name open read remove_name search watch write }; allow pasta_t user_tmp_t:fifo_file append; allow pasta_t user_tmp_t:file { create open write }; allow pasta_t user_tmp_t:sock_file { create unlink }; @@ -249,7 +249,9 @@ type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns"; type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "netns"; type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns"; type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "rootless-netns"; +allow pasta_t container_var_run_t:dir { add_name open rmdir write }; allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write }; +allow pasta_t container_var_run_t:file { create open write }; allow pasta_t ifconfig_var_run_t:file { create open write }; allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir; @@ 
-430,7 +430,9 @@ int dhcp(const struct ctx *c, struct iov_tail *data) } for (i = 0, opts[6].slen = 0; - !c->no_dhcp_dns && !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[i]); i++) { + !c->no_dhcp_dns && i < ARRAY_SIZE(c->ip4.dns); i++) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[i])) + break; ((struct in_addr *)opts[6].s)[i] = c->ip4.dns[i]; opts[6].slen += sizeof(uint32_t); } @@ -320,7 +320,7 @@ static bool dhcpv6_opt(struct iov_tail *data, uint16_t type) static bool dhcpv6_ia_notonlink(struct iov_tail *data, struct in6_addr *la) { - int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type; + int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }; struct opt_ia_addr opt_addr_storage; const struct opt_ia_addr *opt_addr; struct iov_tail current, ia_base; @@ -330,6 +330,7 @@ static bool dhcpv6_ia_notonlink(struct iov_tail *data, struct in6_addr req_addr; struct opt_hdr h_storage; const struct opt_hdr *h; + const int *ia_type; foreach(ia_type, ia_types) { current = *data; @@ -425,7 +426,9 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset) if (c->no_dhcp_dns) goto search; - for (i = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[i]); i++) { + for (i = 0; i < ARRAY_SIZE(c->ip6.dns); i++) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[i])) + break; if (!i) { srv = (struct opt_dns_servers *)(buf + offset); offset += sizeof(struct opt_hdr); diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore index f6272cf..b2a0069 100644 --- a/doc/platform-requirements/.gitignore +++ b/doc/platform-requirements/.gitignore @@ -1,4 +1,5 @@ /listen-vs-repair /reuseaddr-priority /recv-zero +/tcp-close-rst /udp-close-dup diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile index 83930ef..204341b 100644 --- a/doc/platform-requirements/Makefile +++ b/doc/platform-requirements/Makefile @@ -3,8 +3,10 @@ # Copyright Red Hat # Author: David Gibson <david@gibson.dropbear.id.au> -TARGETS = reuseaddr-priority recv-zero udp-close-dup 
listen-vs-repair -SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c +TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair \ + tcp-close-rst +SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c \ + tcp-close-rst.c CFLAGS = -Wall all: cppcheck clang-tidy $(TARGETS:%=check-%) @@ -25,6 +27,7 @@ clang-tidy: clang-tidy --checks=*,\ -altera-id-dependent-backward-branch,\ -altera-unroll-loops,\ + -android-cloexec-accept,\ -bugprone-easily-swappable-parameters,\ -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\ -concurrency-mt-unsafe,\ @@ -37,6 +40,7 @@ clang-tidy: -misc-include-cleaner,\ -modernize-macro-to-enum,\ -readability-braces-around-statements,\ + -readability-function-cognitive-complexity,\ -readability-identifier-length,\ -readability-isolate-declaration \ $(SRCS) diff --git a/doc/platform-requirements/listen-vs-repair.c b/doc/platform-requirements/listen-vs-repair.c index d31fe3f..e21d168 100644 --- a/doc/platform-requirements/listen-vs-repair.c +++ b/doc/platform-requirements/listen-vs-repair.c @@ -58,9 +58,9 @@ static void net_sandbox(void) .nlh.nlmsg_len = sizeof(req), .nlh.nlmsg_seq = 1, .ifm.ifi_family = AF_UNSPEC, - .ifm.ifi_index = 1, - .ifm.ifi_flags = IFF_UP, - .ifm.ifi_change = IFF_UP, + .ifm.ifi_index = 1, + .ifm.ifi_flags = IFF_UP, + .ifm.ifi_change = IFF_UP, }; int nl; diff --git a/doc/platform-requirements/tcp-close-rst.c b/doc/platform-requirements/tcp-close-rst.c new file mode 100644 index 0000000..0e508f6 --- /dev/null +++ b/doc/platform-requirements/tcp-close-rst.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* tcp-close-rst.c + * + * Check what operations on a TCP socket will trigger an RST. 
+ * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +#include <arpa/inet.h> +#include <errno.h> +#include <fcntl.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> + +#include "common.h" + +#define DSTPORT 13258U + +#define SRCADDR(n) \ + (0x7f000000U | (n) << 16U | (n) << 8U | 0x1U) + +#define BASENUM 100 + +/* 127.0.0.1:DSTPORT */ +static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT); + +#define LINGER 0x01U +#define SHUT_CLIENT 0x02U +#define SHUT_SERVER 0x04U + +#define NUM_OPTIONS (SHUT_SERVER << 1U) + +static void client_close(int sl, unsigned flags) +{ + struct sockaddr_in src = SOCKADDR_INIT(SRCADDR(flags), 0); + struct linger linger0 = { + .l_onoff = 1, + .l_linger = 0, + }; + int sockerr, sc, sa; + socklen_t errlen = sizeof(sockerr); + + printf("Client close %u:%s%s%s\n", flags, + flags & LINGER ? " LINGER" : "", + flags & SHUT_CLIENT ? " SHUT_CLIENT" : "", + flags & SHUT_SERVER ? 
" SHUT_SERVER" : ""); + + sc = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sc < 0) + die("socket() for connect(): %s\n", strerror(errno)); + + if (bind(sc, (struct sockaddr *)&src, sizeof(src)) < 0) + die("bind() for connect: %s\n", strerror(errno)); + + if (connect(sc, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0) + die("connect(): %s\n", strerror(errno)); + + /* cppcheck-suppress [android-cloexec-accept,unmatchedSuppression] */ + sa = accept(sl, NULL, NULL); + if (sa < 0) + die("accept(): %s\n", strerror(errno)); + + if (flags & SHUT_SERVER) + if (shutdown(sa, SHUT_WR) < 0) + die("shutdown() server: %s\n", strerror(errno)); + + if (flags & SHUT_CLIENT) + if (shutdown(sc, SHUT_WR) < 0) + die("shutdown() client: %s\n", strerror(errno)); + + if (flags & LINGER) + if (setsockopt(sc, SOL_SOCKET, SO_LINGER, + &linger0, sizeof(linger0)) < 0) + die("SO_LINGER: %s\n", strerror(errno)); + + close(sc); + + if (getsockopt(sa, SOL_SOCKET, SO_ERROR, &sockerr, &errlen) < 0) + die("SO_ERROR: %s\n", strerror(errno)); + + if (errlen != sizeof(sockerr)) + die("SO_ERROR: bad option length\n"); + + printf("Server error: %s\n", strerror(sockerr)); + + if (flags & LINGER) { + if (!(flags & SHUT_SERVER) || !(flags & SHUT_CLIENT)) { + if (sockerr == 0) + die("No error after abrupt close(), no RST?\n"); + } else { + if (sockerr != 0) + die("Error after full shutdown, bogus RST?\n"); + } + } + + close(sa); +} + +static void server_close(int sl, unsigned flags) +{ + struct sockaddr_in src = SOCKADDR_INIT(SRCADDR(flags), 0); + struct linger linger0 = { + .l_onoff = 1, + .l_linger = 0, + }; + int sockerr, sc, sa; + socklen_t errlen = sizeof(sockerr); + + printf("Server close %u:%s%s%s\n", flags, + flags & LINGER ? " LINGER" : "", + flags & SHUT_CLIENT ? " SHUT_CLIENT" : "", + flags & SHUT_SERVER ? 
" SHUT_SERVER" : ""); + + sc = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sc < 0) + die("socket() for connect(): %s\n", strerror(errno)); + + if (bind(sc, (struct sockaddr *)&src, sizeof(src)) < 0) + die("bind() for connect: %s\n", strerror(errno)); + + if (connect(sc, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0) + die("connect(): %s\n", strerror(errno)); + + /* cppcheck-suppress [android-cloexec-accept,unmatchedSuppression] */ + sa = accept(sl, NULL, NULL); + if (sa < 0) + die("accept(): %s\n", strerror(errno)); + + if (flags & SHUT_SERVER) + if (shutdown(sa, SHUT_WR) < 0) + die("shutdown() server: %s\n", strerror(errno)); + + if (flags & SHUT_CLIENT) + if (shutdown(sc, SHUT_WR) < 0) + die("shutdown() client: %s\n", strerror(errno)); + + if (flags & LINGER) + if (setsockopt(sa, SOL_SOCKET, SO_LINGER, + &linger0, sizeof(linger0)) < 0) + die("SO_LINGER: %s\n", strerror(errno)); + + close(sa); + + if (getsockopt(sc, SOL_SOCKET, SO_ERROR, &sockerr, &errlen) < 0) + die("SO_ERROR: %s\n", strerror(errno)); + + if (errlen != sizeof(sockerr)) + die("SO_ERROR: bad option length\n"); + + printf("Client error: %s\n", strerror(sockerr)); + + if (flags & LINGER) { + if (!(flags & SHUT_SERVER) || !(flags & SHUT_CLIENT)) { + if (sockerr == 0) + die("No error after abrupt close(), no RST?\n"); + } else { + if (sockerr != 0) + die("Error after full shutdown, bogus RST?\n"); + } + } + + close(sc); +} + +int main(int argc, char *argv[]) +{ + unsigned flags; + int y = 1; + int sl; + + (void)argc; + (void)argv; + + sl = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sl < 0) + die("socket() for listen: %s\n", strerror(errno)); + + if (setsockopt(sl, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0) + die("SO_REUSEADDR for listen: %s\n", strerror(errno)); + + if (bind(sl, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0) + die("bind() for listen: %s\n", strerror(errno)); + + if (listen(sl, 1) < 0) + die("listen(): %s\n", strerror(errno)); + + printf("Listening on port %u\n", 
DSTPORT); + + for (flags = 0; flags < NUM_OPTIONS; flags++) { + client_close(sl, flags); + server_close(sl, flags); + } + + close(sl); + exit(0); +} diff --git a/epoll_ctl.h b/epoll_ctl.h index 2d7e712..879763c 100644 --- a/epoll_ctl.h +++ b/epoll_ctl.h @@ -17,31 +17,30 @@ /** * union epoll_ref - Breakdown of reference for epoll fd bookkeeping + * @u64: Opaque reference for epoll_ctl() and epoll_wait() * @type: Type of fd (tells us what to do with events) * @fd: File descriptor number (implies < 2^24 total descriptors) * @flow: Index of the flow this fd is linked to - * @tcp_listen: TCP-specific reference part for listening sockets - * @udp: UDP-specific reference part + * @flowside: Index and side of a flow this fd is linked to + * @listen: Information for listening sockets * @data: Data handled by protocol handlers * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone * @queue: vhost-user queue index for this fd - * @u64: Opaque reference for epoll_ctl() and epoll_wait() */ union epoll_ref { + uint64_t u64; struct { enum epoll_type type:8; int32_t fd:FD_REF_BITS; union { uint32_t flow; flow_sidx_t flowside; - union tcp_listen_epoll_ref tcp_listen; - union udp_listen_epoll_ref udp; + struct fwd_listen_ref listen; uint32_t data; int nsdir_fd; int queue; }; }; - uint64_t u64; }; static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), "epoll_ref must have same size as epoll_data"); @@ -20,6 +20,7 @@ #include "flow.h" #include "flow_table.h" #include "repair.h" +#include "epoll_ctl.h" const char *flow_state_str[] = { [FLOW_STATE_FREE] = "FREE", @@ -53,6 +54,16 @@ const uint8_t flow_proto[] = { static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); +static const enum epoll_type flow_epoll[] = { + [FLOW_TCP] = EPOLL_TYPE_TCP, + [FLOW_TCP_SPLICE] = EPOLL_TYPE_TCP_SPLICE, + [FLOW_PING4] = EPOLL_TYPE_PING, + [FLOW_PING6] = EPOLL_TYPE_PING, + [FLOW_UDP] = EPOLL_TYPE_UDP, +}; 
+static_assert(ARRAY_SIZE(flow_epoll) == FLOW_NUM_TYPES, + "flow_epoll[] doesn't match enum flow_type"); + #define foreach_established_tcp_flow(flow) \ flow_foreach_of_type((flow), FLOW_TCP) \ if (!tcp_flow_is_established(&(flow)->tcp)) \ @@ -116,7 +127,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, unsigned flow_first_free; union flow flowtab[FLOW_MAX]; static const union flow *flow_new_entry; /* = NULL */ -static int epoll_id_to_fd[EPOLLFD_ID_MAX]; +static int epoll_id_to_fd[EPOLLFD_ID_SIZE]; /* Hash table to index it */ #define FLOW_HASH_LOAD 70 /* % */ @@ -341,17 +352,6 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) } /** - * flow_in_epoll() - Check if flow is registered with an epoll instance - * @f: Flow to check - * - * Return: true if flow is registered with epoll, false otherwise - */ -bool flow_in_epoll(const struct flow_common *f) -{ - return f->epollid != EPOLLFD_ID_INVALID; -} - -/** * flow_epollfd() - Get the epoll file descriptor for a flow * @f: Flow to query * @@ -359,8 +359,6 @@ bool flow_in_epoll(const struct flow_common *f) */ int flow_epollfd(const struct flow_common *f) { - ASSERT(f->epollid < EPOLLFD_ID_MAX); - return epoll_id_to_fd[f->epollid]; } @@ -371,18 +369,35 @@ int flow_epollfd(const struct flow_common *f) */ void flow_epollid_set(struct flow_common *f, int epollid) { - ASSERT(epollid < EPOLLFD_ID_MAX); + ASSERT(epollid < EPOLLFD_ID_SIZE); f->epollid = epollid; } /** - * flow_epollid_clear() - Clear the flow epoll id - * @f: Flow to update + * flow_epoll_set() - Add or modify epoll registration for a flow socket + * @f: Flow to register socket for + * @command: epoll_ctl() command: EPOLL_CTL_ADD or EPOLL_CTL_MOD + * @events: epoll events to watch for + * @fd: File descriptor to register + * @sidei: Side index of the flow + * + * Return: 0 on success, -1 on error (from epoll_ctl()) */ -void flow_epollid_clear(struct flow_common *f) +int flow_epoll_set(const struct flow_common *f, int command, 
uint32_t events, + int fd, unsigned int sidei) { - f->epollid = EPOLLFD_ID_INVALID; + struct epoll_event ev; + union epoll_ref ref; + + ref.fd = fd; + ref.type = flow_epoll[f->type]; + ref.flowside = flow_sidx(f, sidei); + + ev.events = events; + ev.data.u64 = ref.u64; + + return epoll_ctl(flow_epollfd(f), command, fd, &ev); } /** @@ -392,7 +407,7 @@ void flow_epollid_clear(struct flow_common *f) */ void flow_epollid_register(int epollid, int epollfd) { - ASSERT(epollid < EPOLLFD_ID_MAX); + ASSERT(epollid < EPOLLFD_ID_SIZE); epoll_id_to_fd[epollid] = epollfd; } @@ -477,17 +492,20 @@ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, * flow_target() - Determine where flow should forward to, and move to TGT * @c: Execution context * @flow: Flow to forward + * @rule_hint: Index of relevant forwarding rule, or -1 if unknown * @proto: Protocol * * Return: pointer to the target flowside information */ struct flowside *flow_target(const struct ctx *c, union flow *flow, - uint8_t proto) + int rule_hint, uint8_t proto) { - char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN]; + char estr[INANY_ADDRSTRLEN], ostr[INANY_ADDRSTRLEN]; struct flow_common *f = &flow->f; const struct flowside *ini = &f->side[INISIDE]; struct flowside *tgt = &f->side[TGTSIDE]; + const struct fwd_rule *rule = NULL; + const struct fwd_ports *fwd; uint8_t tgtpif = PIF_NONE; ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_INI); @@ -502,29 +520,57 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow, break; case PIF_SPLICE: - tgtpif = fwd_nat_from_splice(c, proto, ini, tgt); + if (proto == IPPROTO_TCP) + fwd = &c->tcp.fwd_out; + else if (proto == IPPROTO_UDP) + fwd = &c->udp.fwd_out; + else + goto nofwd; + + if (!(rule = fwd_rule_search(fwd, ini, rule_hint))) + goto norule; + + tgtpif = fwd_nat_from_splice(rule, proto, ini, tgt); break; case PIF_HOST: - tgtpif = fwd_nat_from_host(c, proto, ini, tgt); + if (proto == IPPROTO_TCP) + fwd = &c->tcp.fwd_in; + else if 
(proto == IPPROTO_UDP) + fwd = &c->udp.fwd_in; + else + goto nofwd; + + if (!(rule = fwd_rule_search(fwd, ini, rule_hint))) + goto norule; + + tgtpif = fwd_nat_from_host(c, rule, proto, ini, tgt); fwd_neigh_mac_get(c, &tgt->oaddr, f->tap_omac); break; - default: - flow_err(flow, "No rules to forward %s [%s]:%hu -> [%s]:%hu", - pif_name(f->pif[INISIDE]), - inany_ntop(&ini->eaddr, estr, sizeof(estr)), - ini->eport, - inany_ntop(&ini->oaddr, fstr, sizeof(fstr)), - ini->oport); + goto nofwd; } if (tgtpif == PIF_NONE) - return NULL; + goto nofwd; f->pif[TGTSIDE] = tgtpif; flow_set_state(f, FLOW_STATE_TGT); return tgt; + +norule: + /* This shouldn't happen, because if there's no rule for it we should + * have no listening socket that would let us get here + */ + flow_dbg(flow, "Missing forward rule"); + flow_log_details_(f, LOG_DEBUG, f->state); + +nofwd: + flow_err(flow, "No rules to forward %s %s [%s]:%hu -> [%s]:%hu", + pif_name(f->pif[INISIDE]), ipproto_name(proto), + inany_ntop(&ini->eaddr, estr, sizeof(estr)), ini->eport, + inany_ntop(&ini->oaddr, ostr, sizeof(ostr)), ini->oport); + return NULL; } /** @@ -600,7 +646,6 @@ union flow *flow_alloc(void) flow_new_entry = flow; memset(flow, 0, sizeof(*flow)); - flow_epollid_clear(&flow->f); flow_set_state(&flow->f, FLOW_STATE_NEW); return flow; @@ -978,6 +1023,9 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret) debug("...roll back migration"); + if (fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP) < 0) + die("Failed to re-establish listening sockets"); + foreach_established_tcp_flow(flow) { if (FLOW_IDX(flow) >= bound) break; @@ -1102,6 +1150,15 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, return flow_migrate_source_rollback(c, FLOW_MAX, rc); } + /* HACK: A local to local migrate will fail if the origin passt has the + * listening sockets still open when the destination passt tries to bind + * them. 
This does mean there's a window where we lost our listen()s, + * even if the migration is rolled back later. The only way to really + * fix that is to not allow local to local migration, which arguably we + * should (use namespaces for testing instead). */ + debug("Stop listen()s"); + fwd_listen_close(&c->tcp.fwd_in); + debug("Sending %u flows", ntohl(count)); if (!count) @@ -7,6 +7,12 @@ #ifndef FLOW_H #define FLOW_H +#include <stdint.h> +#include <netinet/in.h> + +#include "inany.h" +#include "util.h" + #define FLOW_TIMER_INTERVAL 1000 /* ms */ /** @@ -99,7 +105,7 @@ static_assert(FLOW_NUM_STATES <= (1 << FLOW_STATE_BITS), extern const char *flow_state_str[]; #define FLOW_STATE(f) \ - ((f)->state < FLOW_NUM_STATES ? flow_state_str[(f)->state] : "?") + ((f)->state < FLOW_NUM_STATES ? flow_state_str[(f)->state] : "?") /** * enum flow_type - Different types of packet flows we track @@ -126,7 +132,7 @@ static_assert(FLOW_NUM_TYPES <= (1 << FLOW_TYPE_BITS), extern const char *flow_type_str[]; #define FLOW_TYPE(f) \ - ((f)->type < FLOW_NUM_TYPES ? flow_type_str[(f)->type] : "?") + ((f)->type < FLOW_NUM_TYPES ? 
flow_type_str[(f)->type] : "?") extern const uint8_t flow_proto[]; #define FLOW_PROTO(f) \ @@ -178,7 +184,7 @@ int flowside_connect(const struct ctx *c, int s, * @pif[]: Interface for each side of the flow * @side[]: Information for each side of the flow * @tap_omac: MAC address of remote endpoint as seen from the guest - * @epollid: epollfd identifier, or EPOLLFD_ID_INVALID + * @epollid: epollfd identifier */ struct flow_common { #ifdef __GNUC__ @@ -203,8 +209,6 @@ struct flow_common { #define EPOLLFD_ID_DEFAULT 0 #define EPOLLFD_ID_SIZE (1 << EPOLLFD_ID_BITS) -#define EPOLLFD_ID_MAX (EPOLLFD_ID_SIZE - 1) -#define EPOLLFD_ID_INVALID EPOLLFD_ID_MAX #define FLOW_INDEX_BITS 17 /* 128k - 1 */ #define FLOW_MAX MAX_FROM_BITS(FLOW_INDEX_BITS) @@ -261,10 +265,10 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, union flow; void flow_init(void); -bool flow_in_epoll(const struct flow_common *f); int flow_epollfd(const struct flow_common *f); void flow_epollid_set(struct flow_common *f, int epollid); -void flow_epollid_clear(struct flow_common *f); +int flow_epoll_set(const struct flow_common *f, int command, uint32_t events, + int fd, unsigned int sidei); void flow_epollid_register(int epollid, int epollfd); void flow_defer_handler(const struct ctx *c, const struct timespec *now); int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, diff --git a/flow_table.h b/flow_table.h index 5ee13ac..8fb7b5c 100644 --- a/flow_table.h +++ b/flow_table.h @@ -7,6 +7,7 @@ #ifndef FLOW_TABLE_H #define FLOW_TABLE_H +#include "pif.h" #include "tcp_conn.h" #include "icmp_flow.h" #include "udp_flow.h" @@ -207,7 +208,7 @@ const struct flowside *flow_target_af(union flow *flow, uint8_t pif, const void *saddr, in_port_t sport, const void *daddr, in_port_t dport); struct flowside *flow_target(const struct ctx *c, union flow *flow, - uint8_t proto); + int rule_hint, uint8_t proto); union flow *flow_set_type(union flow *flow, enum flow_type 
type); #define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_) @@ -13,6 +13,7 @@ * Author: David Gibson <david@gibson.dropbear.id.au> */ +#include <assert.h> #include <stdint.h> #include <errno.h> #include <fcntl.h> @@ -21,7 +22,10 @@ #include <stdio.h> #include "util.h" +#include "epoll_ctl.h" #include "ip.h" +#include "siphash.h" +#include "inany.h" #include "fwd.h" #include "passt.h" #include "lineread.h" @@ -301,6 +305,20 @@ parse_err: } /** + * fwd_rule_addr() - Return match address for a rule + * @rule: Forwarding rule + * + * Return: matching address for rule, NULL if it matches all addresses + */ +static const union inany_addr *fwd_rule_addr(const struct fwd_rule *rule) +{ + if (rule->flags & FWD_DUAL_STACK_ANY) + return NULL; + + return &rule->addr; +} + +/** * fwd_port_is_ephemeral() - Is port number ephemeral? * @port: Port number * @@ -313,6 +331,350 @@ bool fwd_port_is_ephemeral(in_port_t port) return (port >= fwd_ephemeral_min) && (port <= fwd_ephemeral_max); } +/** + * fwd_rule_add() - Add a rule to a forwarding table + * @fwd: Table to add to + * @flags: Flags for this entry + * @addr: Our address to forward (NULL for both 0.0.0.0 and ::) + * @ifname: Only forward from this interface name, if non-empty + * @first: First port number to forward + * @last: Last port number to forward + * @to: First port of target port range to map to + */ +void fwd_rule_add(struct fwd_ports *fwd, uint8_t flags, + const union inany_addr *addr, const char *ifname, + in_port_t first, in_port_t last, in_port_t to) +{ + /* Flags which can be set from the caller */ + const uint8_t allowed_flags = FWD_WEAK | FWD_SCAN; + unsigned num = (unsigned)last - first + 1; + struct fwd_rule *new; + unsigned i, port; + + ASSERT(!(flags & ~allowed_flags)); + + if (fwd->count >= ARRAY_SIZE(fwd->rules)) + die("Too many port forwarding ranges"); + if ((fwd->sock_count + num) > ARRAY_SIZE(fwd->socks)) + die("Too many listening sockets"); + + /* Check for any conflicting 
entries */ + for (i = 0; i < fwd->count; i++) { + char newstr[INANY_ADDRSTRLEN], rulestr[INANY_ADDRSTRLEN]; + struct fwd_rule *rule = &fwd->rules[i]; + + if (!inany_matches(addr, fwd_rule_addr(rule))) + /* Non-conflicting addresses */ + continue; + + if (last < rule->first || rule->last < first) + /* Port ranges don't overlap */ + continue; + + die("Forwarding configuration conflict: %s/%u-%u versus %s/%u-%u", + inany_ntop(addr, newstr, sizeof(newstr)), first, last, + inany_ntop(fwd_rule_addr(rule), rulestr, sizeof(rulestr)), + rule->first, rule->last); + } + + new = &fwd->rules[fwd->count++]; + new->flags = flags; + + if (addr) { + new->addr = *addr; + } else { + new->addr = inany_any6; + new->flags |= FWD_DUAL_STACK_ANY; + } + + memset(new->ifname, 0, sizeof(new->ifname)); + if (ifname) { + int ret; + + ret = snprintf(new->ifname, sizeof(new->ifname), "%s", ifname); + if (ret <= 0 || (size_t)ret >= sizeof(new->ifname)) + die("Invalid interface name: %s", ifname); + } + + ASSERT(first <= last); + new->first = first; + new->last = last; + + new->to = to; + + new->socks = &fwd->socks[fwd->sock_count]; + fwd->sock_count += num; + + for (port = new->first; port <= new->last; port++) { + new->socks[port - new->first] = -1; + + /* Fill in the legacy forwarding data structures to match the table */ + if (!(new->flags & FWD_SCAN)) + bitmap_set(fwd->map, port); + } +} + +/** + * fwd_rule_match() - Does a prospective flow match a given forwarding rule? 
+ * @rule: Forwarding rule + * @ini: Initiating side flow information + * + * Returns: true if the rule applies to the flow, false otherwise + */ +static bool fwd_rule_match(const struct fwd_rule *rule, + const struct flowside *ini) +{ + return inany_matches(&ini->oaddr, fwd_rule_addr(rule)) && + ini->oport >= rule->first && ini->oport <= rule->last; +} + +/** + * fwd_rule_search() - Find a rule which matches a prospective flow + * @fwd: Forwarding table + * @ini: Initiating side flow information + * @hint: Index of the rule in table, if known, otherwise FWD_NO_HINT + * + * Returns: first matching rule, or NULL if there is none + */ +const struct fwd_rule *fwd_rule_search(const struct fwd_ports *fwd, + const struct flowside *ini, + int hint) +{ + unsigned i; + + if (hint >= 0) { + char ostr[INANY_ADDRSTRLEN], rstr[INANY_ADDRSTRLEN]; + const struct fwd_rule *rule = &fwd->rules[hint]; + + ASSERT((unsigned)hint < fwd->count); + if (fwd_rule_match(rule, ini)) + return rule; + + debug("Incorrect rule hint: %s:%hu does not match %s:%hu-%hu", + inany_ntop(&ini->oaddr, ostr, sizeof(ostr)), ini->oport, + inany_ntop(fwd_rule_addr(rule), rstr, sizeof(rstr)), + rule->first, rule->last); + return NULL; + } + + for (i = 0; i < fwd->count; i++) { + if (fwd_rule_match(&fwd->rules[i], ini)) + return &fwd->rules[i]; + } + + return NULL; +} + +/** + * fwd_rules_print() - Print forwarding rules for debugging + * @fwd: Table to print + */ +void fwd_rules_print(const struct fwd_ports *fwd) +{ + unsigned i; + + for (i = 0; i < fwd->count; i++) { + const struct fwd_rule *rule = &fwd->rules[i]; + const char *percent = *rule->ifname ? 
"%" : ""; + const char *weak = "", *scan = ""; + char addr[INANY_ADDRSTRLEN]; + + inany_ntop(fwd_rule_addr(rule), addr, sizeof(addr)); + if (rule->flags & FWD_WEAK) + weak = " (best effort)"; + if (rule->flags & FWD_SCAN) + scan = " (auto-scan)"; + + if (rule->first == rule->last) { + info(" [%s]%s%s:%hu => %hu %s%s", + addr, percent, rule->ifname, + rule->first, rule->to, weak, scan); + } else { + info(" [%s]%s%s:%hu-%hu => %hu-%hu %s%s", + addr, percent, rule->ifname, + rule->first, rule->last, + rule->to, rule->last - rule->first + rule->to, + weak, scan); + } + } +} + +/** fwd_sync_one() - Create or remove listening sockets for a forward entry + * @c: Execution context + * @fwd: Forwarding table + * @rule: Forwarding rule + * @pif: Interface to create listening sockets for + * @proto: Protocol to listen for + * @scanmap: Bitmap of ports to listen for on FWD_SCAN entries + * + * Return: 0 on success, -1 on failure + */ +static int fwd_sync_one(const struct ctx *c, + const struct fwd_ports *fwd, const struct fwd_rule *rule, + uint8_t pif, uint8_t proto, const uint8_t *scanmap) +{ + const union inany_addr *addr = fwd_rule_addr(rule); + const char *ifname = rule->ifname; + bool bound_one = false; + unsigned port, idx; + + ASSERT(pif_is_socket(pif)); + + if (!*ifname) + ifname = NULL; + + idx = rule - fwd->rules; + ASSERT(idx < MAX_FWD_RULES); + + for (port = rule->first; port <= rule->last; port++) { + int fd = rule->socks[port - rule->first]; + + if ((rule->flags & FWD_SCAN) && !bitmap_isset(scanmap, port)) { + /* We don't want to listen on this port */ + if (fd >= 0) { + /* We already are, so stop */ + epoll_del(c->epollfd, fd); + close(fd); + rule->socks[port - rule->first] = -1; + } + continue; + } + + if (fd >= 0) /* Already listening, nothing to do */ { + bound_one = true; + continue; + } + + if (proto == IPPROTO_TCP) + fd = tcp_listen(c, pif, idx, addr, ifname, port); + else if (proto == IPPROTO_UDP) + fd = udp_listen(c, pif, idx, addr, ifname, port); + else 
+ ASSERT(0); + + if (fd < 0) { + char astr[INANY_ADDRSTRLEN]; + + warn("Listen failed for %s %s port %s%s%s/%u: %s", + pif_name(pif), ipproto_name(proto), + inany_ntop(addr, astr, sizeof(astr)), + ifname ? "%" : "", ifname ? ifname : "", + port, strerror_(-fd)); + + if (!(rule->flags & FWD_WEAK)) + return -1; + + continue; + } + + rule->socks[port - rule->first] = fd; + bound_one = true; + } + + if (!bound_one && !(rule->flags & FWD_SCAN)) { + char astr[INANY_ADDRSTRLEN]; + + warn("All listens failed for %s %s %s%s%s/%u-%u", + pif_name(pif), ipproto_name(proto), + inany_ntop(addr, astr, sizeof(astr)), + ifname ? "%" : "", ifname ? ifname : "", + rule->first, rule->last); + return -1; + } + + return 0; +} + +/** struct fwd_listen_args - arguments for fwd_listen_init_() + * @c: Execution context + * @fwd: Forwarding information + * @scanmap: Bitmap of ports to auto-forward + * @pif: Interface to create listening sockets for + * @proto: Protocol + * @ret: Return code + */ +struct fwd_listen_args { + const struct ctx *c; + const struct fwd_ports *fwd; + const uint8_t *scanmap; + uint8_t pif; + uint8_t proto; + int ret; +}; + +/** fwd_listen_sync_() - Update listening sockets to match forwards + * @arg: struct fwd_listen_args with arguments + * + * Returns: zero + */ +static int fwd_listen_sync_(void *arg) +{ + struct fwd_listen_args *a = arg; + unsigned i; + + if (a->pif == PIF_SPLICE) + ns_enter(a->c); + + for (i = 0; i < a->fwd->count; i++) { + a->ret = fwd_sync_one(a->c, a->fwd, &a->fwd->rules[i], + a->pif, a->proto, a->fwd->map); + if (a->ret < 0) + break; + } + + return 0; +} + +/** fwd_listen_sync() - Call fwd_listen_sync_() in correct namespace + * @c: Execution context + * @fwd: Forwarding information + * @pif: Interface to create listening sockets for + * @proto: Protocol + * + * Return: 0 on success, -1 on failure + */ +int fwd_listen_sync(const struct ctx *c, const struct fwd_ports *fwd, + uint8_t pif, uint8_t proto) +{ + struct fwd_listen_args a = { + .c = 
c, .fwd = fwd, .pif = pif, .proto = proto, + }; + + if (pif == PIF_SPLICE) + NS_CALL(fwd_listen_sync_, &a); + else + fwd_listen_sync_(&a); + + if (a.ret < 0) { + err("Couldn't listen on requested %s ports", + ipproto_name(proto)); + return -1; + } + + return 0; +} + +/** fwd_listen_close() - Close all listening sockets + * @fwd: Forwarding information + */ +void fwd_listen_close(const struct fwd_ports *fwd) +{ + unsigned i; + + for (i = 0; i < fwd->count; i++) { + const struct fwd_rule *rule = &fwd->rules[i]; + unsigned port; + + for (port = rule->first; port <= rule->last; port++) { + int *fdp = &rule->socks[port - rule->first]; + if (*fdp >= 0) { + close(*fdp); + *fdp = -1; + } + } + } +} + /* See enum in kernel's include/net/tcp_states.h */ #define UDP_LISTEN 0x07 #define TCP_LISTEN 0x0a @@ -400,6 +762,28 @@ static void fwd_scan_ports_udp(struct fwd_ports *fwd, } /** + * current_listen_map() - Get bitmap of which ports we're already listening on + * @map: Bitmap to populate + * @fwd: Forwarding table to consider + */ +static void current_listen_map(uint8_t *map, const struct fwd_ports *fwd) +{ + unsigned i; + + memset(map, 0, PORT_BITMAP_SIZE); + + for (i = 0; i < fwd->count; i++) { + const struct fwd_rule *rule = &fwd->rules[i]; + unsigned port; + + for (port = rule->first; port <= rule->last; port++) { + if (rule->socks[port - rule->first] >= 0) + bitmap_set(map, port); + } + } +} + +/** * fwd_scan_ports() - Scan automatic port forwarding information * @c: Execution context */ @@ -408,10 +792,10 @@ static void fwd_scan_ports(struct ctx *c) uint8_t excl_tcp_out[PORT_BITMAP_SIZE], excl_udp_out[PORT_BITMAP_SIZE]; uint8_t excl_tcp_in[PORT_BITMAP_SIZE], excl_udp_in[PORT_BITMAP_SIZE]; - memcpy(excl_tcp_out, c->tcp.fwd_in.map, sizeof(excl_tcp_out)); - memcpy(excl_tcp_in, c->tcp.fwd_out.map, sizeof(excl_tcp_in)); - memcpy(excl_udp_out, c->udp.fwd_in.map, sizeof(excl_udp_out)); - memcpy(excl_udp_in, c->udp.fwd_out.map, sizeof(excl_udp_in)); + 
current_listen_map(excl_tcp_out, &c->tcp.fwd_in); + current_listen_map(excl_tcp_in, &c->tcp.fwd_out); + current_listen_map(excl_udp_out, &c->udp.fwd_in); + current_listen_map(excl_udp_in, &c->udp.fwd_out); fwd_scan_ports_tcp(&c->tcp.fwd_out, excl_tcp_out); fwd_scan_ports_tcp(&c->tcp.fwd_in, excl_tcp_in); @@ -471,10 +855,14 @@ void fwd_scan_ports_timer(struct ctx *c, const struct timespec *now) fwd_scan_ports(c); - if (!c->no_tcp) - tcp_port_rebind_all(c); - if (!c->no_udp) - udp_port_rebind_all(c); + if (!c->no_tcp) { + fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP); + fwd_listen_sync(c, &c->tcp.fwd_out, PIF_SPLICE, IPPROTO_TCP); + } + if (!c->no_udp) { + fwd_listen_sync(c, &c->udp.fwd_in, PIF_HOST, IPPROTO_UDP); + fwd_listen_sync(c, &c->udp.fwd_out, PIF_SPLICE, IPPROTO_UDP); + } } /** @@ -638,7 +1026,7 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, /** * fwd_nat_from_splice() - Determine to forward a flow from the splice interface - * @c: Execution context + * @rule: Forwarding rule to apply * @proto: Protocol (IP L4 protocol number) * @ini: Flow address information of the initiating side * @tgt: Flow address information on the target side (updated) @@ -646,11 +1034,11 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, * Return: pif of the target interface to forward the flow to, PIF_NONE if the * flow cannot or should not be forwarded at all. 
*/ -uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, +uint8_t fwd_nat_from_splice(const struct fwd_rule *rule, uint8_t proto, const struct flowside *ini, struct flowside *tgt) { if (!inany_is_loopback(&ini->eaddr) || - (!inany_is_loopback(&ini->oaddr) && !inany_is_unspecified(&ini->oaddr))) { + !inany_is_loopback(&ini->oaddr)) { char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN]; debug("Non loopback address on %s: [%s]:%hu -> [%s]:%hu", @@ -660,22 +1048,9 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, return PIF_NONE; } - if (inany_v4(&ini->eaddr)) - tgt->eaddr = inany_loopback4; - else - tgt->eaddr = inany_loopback6; - - /* Preserve the specific loopback address used, but let the kernel pick - * a source port on the target side - */ + /* Preserve the src & dest (loopback) addresses */ tgt->oaddr = ini->eaddr; - tgt->oport = 0; - - tgt->eport = ini->oport; - if (proto == IPPROTO_TCP) - tgt->eport += c->tcp.fwd_out.delta[tgt->eport]; - else if (proto == IPPROTO_UDP) - tgt->eport += c->udp.fwd_out.delta[tgt->eport]; + tgt->eaddr = ini->oaddr; /* Let the kernel pick a host side source port */ tgt->oport = 0; @@ -683,6 +1058,8 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, /* But for UDP preserve the source port */ tgt->oport = ini->eport; + tgt->eport = rule->to + (ini->oport - rule->first); + return PIF_HOST; } @@ -725,6 +1102,7 @@ bool nat_inbound(const struct ctx *c, const union inany_addr *addr, /** * fwd_nat_from_host() - Determine to forward a flow from the host interface * @c: Execution context + * @rule: Forwarding rule to apply * @proto: Protocol (IP L4 protocol number) * @ini: Flow address information of the initiating side * @tgt: Flow address information on the target side (updated) @@ -732,15 +1110,12 @@ bool nat_inbound(const struct ctx *c, const union inany_addr *addr, * Return: pif of the target interface to forward the flow to, PIF_NONE if the * flow cannot or should not be forwarded at all. 
*/ -uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, +uint8_t fwd_nat_from_host(const struct ctx *c, + const struct fwd_rule *rule, uint8_t proto, const struct flowside *ini, struct flowside *tgt) { /* Common for spliced and non-spliced cases */ - tgt->eport = ini->oport; - if (proto == IPPROTO_TCP) - tgt->eport += c->tcp.fwd_in.delta[tgt->eport]; - else if (proto == IPPROTO_UDP) - tgt->eport += c->udp.fwd_in.delta[tgt->eport]; + tgt->eport = rule->to + (ini->oport - rule->first); if (!c->no_splice && inany_is_loopback(&ini->eaddr) && (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) { @@ -778,6 +1153,9 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, return PIF_SPLICE; } + if (c->splice_only) + return PIF_NONE; + if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) { if (inany_v4(&ini->eaddr)) { if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr)) @@ -7,7 +7,15 @@ #ifndef FWD_H #define FWD_H -union inany_addr; +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include <netinet/in.h> + +#include "inany.h" + struct flowside; /* Number of ports for both TCP and UDP */ @@ -16,6 +24,50 @@ struct flowside; void fwd_probe_ephemeral(void); bool fwd_port_is_ephemeral(in_port_t port); +/** + * struct fwd_rule - Forwarding rule governing a range of ports + * @addr: Address to forward from + * @ifname: Interface to forward from + * @first: First port number to forward + * @last: Last port number to forward + * @to: Target port for @first, port n goes to @to + (n - @first) + * @socks: Array of listening sockets for this entry + * @flags: Flag mask + * FWD_DUAL_STACK_ANY - match any IPv4 or IPv6 address (@addr should be ::) + * FWD_WEAK - Don't give an error if binds fail for some forwards + * FWD_SCAN - Only forward if the matching port in the target is listening + * + * FIXME: @addr and @ifname currently ignored for outbound tables + */ +struct fwd_rule { + union inany_addr addr; + char ifname[IFNAMSIZ]; + in_port_t 
first; + in_port_t last; + in_port_t to; + int *socks; +#define FWD_DUAL_STACK_ANY BIT(0) +#define FWD_WEAK BIT(1) +#define FWD_SCAN BIT(2) + uint8_t flags; +}; + +#define FWD_RULE_BITS 8 +#define MAX_FWD_RULES MAX_FROM_BITS(FWD_RULE_BITS) +#define FWD_NO_HINT (-1) + +/** + * struct fwd_listen_ref - information about a single listening socket + * @port: Bound port number of the socket + * @pif: pif in which the socket is listening + * @rule: Index of forwarding rule + */ +struct fwd_listen_ref { + in_port_t port; + uint8_t pif; + unsigned rule :FWD_RULE_BITS; +}; + enum fwd_ports_mode { FWD_UNSET = 0, FWD_SPEC = 1, @@ -26,34 +78,60 @@ enum fwd_ports_mode { #define PORT_BITMAP_SIZE DIV_ROUND_UP(NUM_PORTS, 8) +/* Maximum number of listening sockets (per pif & protocol) + * + * Rationale: This lets us listen on every port for two addresses (which we need + * for -T auto without SO_BINDTODEVICE), plus a comfortable number of extras. + */ +#define MAX_LISTEN_SOCKS (NUM_PORTS * 3) + /** * fwd_ports() - Describes port forwarding for one protocol and direction - * @mode: Overall forwarding mode (all, none, auto, specific ports) + * @mode: Overall mode (all, none, auto, specific ports) * @scan4: /proc/net fd to scan for IPv4 ports when in AUTO mode * @scan6: /proc/net fd to scan for IPv6 ports when in AUTO mode + * @count: Number of forwarding rules + * @rules: Array of forwarding rules * @map: Bitmap describing which ports are forwarded - * @delta: Offset between the original destination and mapped port number + * @sock_count: Number of entries used in @socks + * @socks: Listening sockets for forwarding */ struct fwd_ports { enum fwd_ports_mode mode; int scan4; int scan6; + unsigned count; + struct fwd_rule rules[MAX_FWD_RULES]; uint8_t map[PORT_BITMAP_SIZE]; - in_port_t delta[NUM_PORTS]; + unsigned sock_count; + int socks[MAX_LISTEN_SOCKS]; }; #define FWD_PORT_SCAN_INTERVAL 1000 /* ms */ +void fwd_rule_add(struct fwd_ports *fwd, uint8_t flags, + const union inany_addr 
*addr, const char *ifname, + in_port_t first, in_port_t last, in_port_t to); +const struct fwd_rule *fwd_rule_search(const struct fwd_ports *fwd, + const struct flowside *ini, + int hint); +void fwd_rules_print(const struct fwd_ports *fwd); + void fwd_scan_ports_init(struct ctx *c); -void fwd_scan_ports_timer(struct ctx *c, const struct timespec *now); +void fwd_scan_ports_timer(struct ctx * c, const struct timespec *now); + +int fwd_listen_sync(const struct ctx *c, const struct fwd_ports *fwd, + uint8_t pif, uint8_t proto); +void fwd_listen_close(const struct fwd_ports *fwd); bool nat_inbound(const struct ctx *c, const union inany_addr *addr, union inany_addr *translated); uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, const struct flowside *ini, struct flowside *tgt); -uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, +uint8_t fwd_nat_from_splice(const struct fwd_rule *rule, uint8_t proto, const struct flowside *ini, struct flowside *tgt); -uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, +uint8_t fwd_nat_from_host(const struct ctx *c, + const struct fwd_rule *rule, uint8_t proto, const struct flowside *ini, struct flowside *tgt); void fwd_neigh_table_update(const struct ctx *c, const union inany_addr *addr, const uint8_t *mac, bool permanent); diff --git a/hooks/pre-push b/hooks/pre-push index 8dbfa5f..839b310 100755 --- a/hooks/pre-push +++ b/hooks/pre-push @@ -62,7 +62,7 @@ ssh "${USER_HOST}" "rm -f ${BIN}/*.deb" ssh "${USER_HOST}" "rm -f ${BIN}/*.rpm" scp *.deb *.rpm "${USER_HOST}:${BIN}/" -man2html -M "/" passt.1 > passt.1.html +mandoc -Thtml passt.1 > passt.1.html scp passt.1.html "${USER_HOST}:${WEB}/" for pic in passt_overview pasta_overview; do @@ -177,13 +177,12 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, union flow *flow = flow_alloc(); struct icmp_ping_flow *pingf; const struct flowside *tgt; - union epoll_ref ref; if (!flow) return NULL; flow_initiate_af(flow, PIF_TAP, af, saddr, id, 
daddr, id); - if (!(tgt = flow_target(c, flow, proto))) + if (!(tgt = flow_target(c, flow, FWD_NO_HINT, proto))) goto cancel; if (flow->f.pif[TGTSIDE] != PIF_HOST) { @@ -211,12 +210,8 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, goto cancel; flow_epollid_set(&pingf->f, EPOLLFD_ID_DEFAULT); - - ref.type = EPOLL_TYPE_PING; - ref.flowside = FLOW_SIDX(flow, TGTSIDE); - ref.fd = pingf->sock; - - if (epoll_add(flow_epollfd(&pingf->f), EPOLLIN, ref) < 0) { + if (flow_epoll_set(&pingf->f, EPOLL_CTL_ADD, EPOLLIN, pingf->sock, + TGTSIDE) < 0) { close(pingf->sock); goto cancel; } @@ -6,6 +6,10 @@ #ifndef ICMP_H #define ICMP_H +#include <stdint.h> + +#include <netinet/in.h> + struct ctx; struct icmp_ping_flow; diff --git a/icmp_flow.h b/icmp_flow.h index fb93801..3af98be 100644 --- a/icmp_flow.h +++ b/icmp_flow.h @@ -7,6 +7,8 @@ #ifndef ICMP_FLOW_H #define ICMP_FLOW_H +#include "flow.h" + /** * struct icmp_ping_flow - Descriptor for a flow of ping requests/replies * @f: Generic flow information @@ -13,5 +13,4 @@ */ /* TO BE IMPLEMENTED */ -/* cppcheck-suppress unusedFunction */ __attribute__((__unused__)) static void unused(void) { } @@ -11,6 +11,7 @@ #include <assert.h> #include <netinet/in.h> #include <arpa/inet.h> +#include <errno.h> #include "util.h" #include "ip.h" @@ -21,8 +22,27 @@ const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT); const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT); +/** inany_matches - Do two addresses match? + * @a, @b: IPv[46] addresses (NULL for 0.0.0.0 & ::) + * + * Return: true if they match, false otherwise + * + * Addresses match themselves, but also unspecified addresses of the same + * family. 
+ */ +bool inany_matches(const union inany_addr *a, const union inany_addr *b) +{ + if (!a || !b) + return true; + + if (inany_is_unspecified(a) || inany_is_unspecified(b)) + return !!inany_v4(a) == !!inany_v4(b); + + return inany_equals(a, b); +} + /** inany_ntop - Convert an IPv[46] address to text format - * @src: IPv[46] address + * @src: IPv[46] address (NULL for unspecified) * @dst: output buffer, minimum INANY_ADDRSTRLEN bytes * @size: size of buffer at @dst * @@ -30,9 +50,12 @@ const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT); */ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size) { - const struct in_addr *v4 = inany_v4(src); + const struct in_addr *v4; - if (v4) + if (!src) + return strncpy(dst, "*", size); + + if ((v4 = inany_v4(src))) return inet_ntop(AF_INET, v4, dst, size); return inet_ntop(AF_INET6, &src->a6, dst, size); @@ -57,3 +80,52 @@ int inany_pton(const char *src, union inany_addr *dst) return 0; } + +/** + * inany_prefix_pton() - Parse an IPv[46] address with prefix length + * @src: IPv[46] address and prefix length string in CIDR format + * @dst: Output buffer, filled with parsed address + * @prefix_len: Prefix length, to be filled in IPv6 format + * + * Return: 1 on success, 0 if no parseable address or prefix is found + */ +int inany_prefix_pton(const char *src, union inany_addr *dst, + uint8_t *prefix_len) +{ + char astr[INANY_ADDRSTRLEN] = { 0 }; + size_t alen = strcspn(src, "/"); + const char *pstr = &src[alen + 1]; + unsigned long plen; + char *end; + + if (alen >= INANY_ADDRSTRLEN) + return 0; + + if (src[alen] != '/') + return 0; + + strncpy(astr, src, alen); + + /* Read prefix length */ + errno = 0; + plen = strtoul(pstr, &end, 10); + if (errno || *end || plen > 128) + return 0; + + /* Read address */ + if (inet_pton(AF_INET6, astr, dst)) { + if (inany_v4(dst) && plen < 96) + return 0; + *prefix_len = plen; + return 1; + } + + if (inany_pton(astr, dst)) { + if (plen > 32) + return 0; + 
*prefix_len = plen + 96; + return 1; + } + + return 0; +} @@ -9,6 +9,12 @@ #ifndef INANY_H #define INANY_H +#include <assert.h> +#include <string.h> + +#include "ip.h" +#include "siphash.h" + struct siphash_state; /** union inany_addr - Represents either an IPv4 or IPv6 address @@ -96,6 +102,19 @@ static inline struct in_addr *inany_v4(const union inany_addr *addr) return (struct in_addr *)&addr->v4mapped.a4; } +/** inany_default_prefix_len() - Get default prefix length for address + * @addr: IPv4 or IPv6 address + * + * Return: Class-based prefix length for IPv4 (in IPv6 format: 104-128), + * or 64 for IPv6 + */ +static inline int inany_default_prefix_len(const union inany_addr *addr) +{ + const struct in_addr *v4 = inany_v4(addr); + + return v4 ? ip4_class_prefix_len(v4) + 96 : 64; +} + /** inany_equals - Compare two IPv[46] addresses * @a, @b: IPv[46] addresses * @@ -293,7 +312,10 @@ static inline void inany_siphash_feed(struct siphash_state *state, #define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN) +bool inany_matches(const union inany_addr *a, const union inany_addr *b); const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size); int inany_pton(const char *src, union inany_addr *dst); +int inany_prefix_pton(const char *src, union inany_addr *dst, + uint8_t *prefix_len); #endif /* INANY_H */ @@ -20,24 +20,21 @@ * Contributions after 2012-01-13 are licensed under the terms of the * GNU GPL, version 2 or (at your option) any later version. 
*/ + #include <sys/socket.h> #include "util.h" #include "iov.h" - /** - * iov_skip_bytes() - Skip leading bytes of an IO vector - * @iov: IO vector + * iov_skip_bytes() - Find index and offset in iovec array given byte offset + * @iov: iovec array * @n: Number of entries in @iov - * @skip: Number of leading bytes of @iov to skip - * @offset: Offset of first unskipped byte in its @iov entry + * @skip: Byte offset: leading bytes of @iov to skip + * @offset: Offset within matching @iov entry, set on return, can be NULL * - * Return: index I of individual struct iovec which contains the byte at @skip - * bytes into the vector (as though all its buffers were contiguous). - * If @offset is non-NULL, update it to the offset of that byte within - * @iov[I] (guaranteed to be less than @iov[I].iov_len) If the whole - * vector has <= @skip bytes, return @n. + * Return: index of iovec array containing the @skip byte counted as if buffers + * were contiguous. If iovec has less than @skip bytes, return @n. */ size_t iov_skip_bytes(const struct iovec *iov, size_t n, size_t skip, size_t *offset) @@ -57,17 +54,14 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, } /** - * iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec) - * efficiently. - * - * @iov: Pointer to the array of struct iovec describing the - * scatter/gather I/O vector. - * @iov_cnt: Number of elements in the iov array. - * @offset: Byte offset in the iov array where copying should start. - * @buf: Pointer to the source buffer containing the data to copy. - * @bytes: Total number of bytes to copy from buf to iov. + * iov_from_buf() - Copy from flat buffer to iovec array + * @iov: Destination iovec array + * @iov_cnt: Number of elements in the iovec array + * @offset: Destination offset in @iov counted as if buffers were contiguous + * @buf: Source buffer + * @bytes: Bytes to copy * - * Return: the number of bytes successfully copied. 
+ * Return: number of bytes copied */ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, const void *buf, size_t bytes) @@ -78,12 +72,12 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, if (__builtin_constant_p(bytes) && iov_cnt && offset <= iov[0].iov_len && bytes <= iov[0].iov_len - offset) { memcpy((char *)iov[0].iov_base + offset, buf, bytes); + return bytes; } i = iov_skip_bytes(iov, iov_cnt, offset, &offset); - /* copying data */ for (copied = 0; copied < bytes && i < iov_cnt; i++) { size_t len = MIN(iov[i].iov_len - offset, bytes - copied); @@ -97,17 +91,14 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, } /** - * iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to - * a buffer efficiently. - * - * @iov: Pointer to the array of struct iovec describing the scatter/gather - * I/O vector. - * @iov_cnt: Number of elements in the iov array. - * @offset: Offset within the first element of iov from where copying should start. - * @buf: Pointer to the destination buffer where data will be copied. - * @bytes: Total number of bytes to copy from iov to buf. + * iov_to_buf() - Copy from iovec to flat buffer + * @iov: Source iovec array + * @iov_cnt: Number of elements in iovec array + * @offset: Source offset in @iov counted as if buffers were contiguous + * @buf: Destination buffer + * @bytes: Bytes to copy * - * Return: the number of bytes successfully copied. 
+ * Return: number of bytes copied */ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, void *buf, size_t bytes) @@ -118,15 +109,17 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, if (__builtin_constant_p(bytes) && iov_cnt && offset <= iov[0].iov_len && bytes <= iov[0].iov_len - offset) { memcpy(buf, (char *)iov[0].iov_base + offset, bytes); + return bytes; } i = iov_skip_bytes(iov, iov_cnt, offset, &offset); - /* copying data */ for (copied = 0; copied < bytes && i < iov_cnt; i++) { size_t len = MIN(iov[i].iov_len - offset, bytes - copied); + ASSERT(iov[i].iov_base); + memcpy((char *)buf + copied, (char *)iov[i].iov_base + offset, len); copied += len; @@ -137,14 +130,11 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, } /** - * iov_size() - Calculate the total size of a scatter/gather I/O vector - * (struct iovec). + * iov_size() - Calculate total data size of iovec + * @iov: Source iovec array + * @iov_cnt: Number of elements in iovec array * - * @iov: Pointer to the array of struct iovec describing the - * scatter/gather I/O vector. - * @iov_cnt: Number of elements in the iov array. - * - * Return: the total size in bytes. 
+ * Return: total size in bytes */ size_t iov_size(const struct iovec *iov, size_t iov_cnt) { @@ -15,8 +15,9 @@ #ifndef IOVEC_H #define IOVEC_H -#include <unistd.h> +#include <stdbool.h> #include <string.h> +#include <unistd.h> #define IOV_OF_LVALUE(lval) \ (struct iovec){ .iov_base = &(lval), .iov_len = sizeof(lval) } @@ -24,9 +25,9 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, size_t skip, size_t *offset); size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, - size_t offset, const void *buf, size_t bytes); + size_t offset, const void *buf, size_t bytes); size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, - size_t offset, void *buf, size_t bytes); + size_t offset, void *buf, size_t bytes); size_t iov_size(const struct iovec *iov, size_t iov_cnt); /* @@ -13,6 +13,8 @@ */ #include <stddef.h> +#include <netinet/in.h> + #include "util.h" #include "ip.h" @@ -67,3 +69,48 @@ found: *proto = nh; return true; } + +/** + * ipproto_name() - Get IP protocol name from number + * @proto: IP protocol number + * + * Return: pointer to name of protocol @proto + * + * Usually this would be done with getprotobynumber(3) but that reads + * /etc/protocols and might allocate, which isn't possible for us once + * self-isolated. 
+ */ +const char *ipproto_name(uint8_t proto) +{ + switch (proto) { + case IPPROTO_ICMP: + return "ICMP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_ICMPV6: + return "ICMPv6"; + default: + return "<unknown protocol>"; + } +} + +/** + * ip4_class_prefix_len() - Get class based prefix length for IPv4 address + * @addr: IPv4 address + * + * Return: prefix length based on address class, or 32 for other + */ +int ip4_class_prefix_len(const struct in_addr *addr) +{ + in_addr_t a = ntohl(addr->s_addr); + + if (IN_CLASSA(a)) + return 32 - IN_CLASSA_NSHIFT; + if (IN_CLASSB(a)) + return 32 - IN_CLASSB_NSHIFT; + if (IN_CLASSC(a)) + return 32 - IN_CLASSC_NSHIFT; + return 32; +} @@ -9,6 +9,8 @@ #include <netinet/ip.h> #include <netinet/ip6.h> +#include "util.h" + #define IN4_IS_ADDR_UNSPECIFIED(a) \ (((struct in_addr *)(a))->s_addr == htonl_constant(INADDR_ANY)) #define IN4_IS_ADDR_BROADCAST(a) \ @@ -116,6 +118,7 @@ static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h) } bool ipv6_l4hdr(struct iov_tail *data, uint8_t *proto, size_t *dlen); +const char *ipproto_name(uint8_t proto); /* IPv6 link-local all-nodes multicast address, ff02::1 */ static const struct in6_addr in6addr_ll_all_nodes = { @@ -135,4 +138,6 @@ static const struct in_addr in4addr_broadcast = { 0xffffffff }; #define IPV6_MIN_MTU 1280 #endif +int ip4_class_prefix_len(const struct in_addr *addr); + #endif /* IP_H */ diff --git a/isolation.h b/isolation.h index 80bb68d..0576168 100644 --- a/isolation.h +++ b/isolation.h @@ -7,6 +7,9 @@ #ifndef ISOLATION_H #define ISOLATION_H +#include <stdbool.h> +#include <unistd.h> + void isolate_initial(int argc, char **argv); void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns, enum passt_modes mode); @@ -6,6 +6,8 @@ #ifndef LINEREAD_H #define LINEREAD_H +#include <sys/types.h> + #define LINEREAD_BUFFER_SIZE 8192 /** diff --git a/linux_dep.h b/linux_dep.h index 89e590c..3f8184b 100644 --- 
a/linux_dep.h +++ b/linux_dep.h @@ -7,6 +7,9 @@ #ifndef LINUX_DEP_H #define LINUX_DEP_H +#include <stdint.h> +#include <unistd.h> + /* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt() * * Largely derived from include/linux/tcp.h in the Linux kernel @@ -6,9 +6,16 @@ #ifndef LOG_H #define LOG_H +#include <stdarg.h> #include <stdbool.h> +#include <stddef.h> #include <syslog.h> +/* This would make more sense in util.h, but because we use it in die(), that + * would cause awkward circular reference problems. + */ +void passt_exit(int status) __attribute__((noreturn)); + #define LOGFILE_SIZE_DEFAULT (1024 * 1024UL) #define LOGFILE_CUT_RATIO 30 /* When full, cut ~30% size */ #define LOGFILE_SIZE_MIN (5UL * MAX(BUFSIZ, PAGE_SIZE)) @@ -32,13 +39,13 @@ void logmsg_perror(int pri, const char *format, ...) #define die(...) \ do { \ err(__VA_ARGS__); \ - _exit(EXIT_FAILURE); \ + passt_exit(EXIT_FAILURE); \ } while (0) #define die_perror(...) \ do { \ err_perror(__VA_ARGS__); \ - _exit(EXIT_FAILURE); \ + passt_exit(EXIT_FAILURE); \ } while (0) extern int log_file; @@ -29,13 +29,13 @@ #define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 /** - * struct migrate_seen_addrs_v1 - Migratable guest addresses for v1 state stream + * struct migrate_seen_addrs_v2 - Migratable guest addresses for v2 state stream * @addr6: Observed guest IPv6 address * @addr6_ll: Observed guest IPv6 link-local address * @addr4: Observed guest IPv4 address * @mac: Observed guest MAC address */ -struct migrate_seen_addrs_v1 { +struct migrate_seen_addrs_v2 { struct in6_addr addr6; struct in6_addr addr6_ll; struct in_addr addr4; @@ -43,7 +43,7 @@ struct migrate_seen_addrs_v1 { } __attribute__((packed)); /** - * seen_addrs_source_v1() - Copy and send guest observed addresses from source + * seen_addrs_source_v2() - Copy and send guest observed addresses from source * @c: Execution context * @stage: Migration stage, unused * @fd: File descriptor for state transfer @@ -51,10 +51,10 @@ struct 
migrate_seen_addrs_v1 { * Return: 0 on success, positive error code on failure */ /* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ -static int seen_addrs_source_v1(struct ctx *c, +static int seen_addrs_source_v2(struct ctx *c, const struct migrate_stage *stage, int fd) { - struct migrate_seen_addrs_v1 addrs = { + struct migrate_seen_addrs_v2 addrs = { .addr6 = c->ip6.addr_seen, .addr6_ll = c->ip6.addr_ll_seen, .addr4 = c->ip4.addr_seen, @@ -71,17 +71,17 @@ static int seen_addrs_source_v1(struct ctx *c, } /** - * seen_addrs_target_v1() - Receive and use guest observed addresses on target + * seen_addrs_target_v2() - Receive and use guest observed addresses on target * @c: Execution context * @stage: Migration stage, unused * @fd: File descriptor for state transfer * * Return: 0 on success, positive error code on failure */ -static int seen_addrs_target_v1(struct ctx *c, +static int seen_addrs_target_v2(struct ctx *c, const struct migrate_stage *stage, int fd) { - struct migrate_seen_addrs_v1 addrs; + struct migrate_seen_addrs_v2 addrs; (void)stage; @@ -100,8 +100,8 @@ static int seen_addrs_target_v1(struct ctx *c, static const struct migrate_stage stages_v2[] = { { .name = "observed addresses", - .source = seen_addrs_source_v1, - .target = seen_addrs_target_v1, + .source = seen_addrs_source_v2, + .target = seen_addrs_target_v2, }, { .name = "prepare flows", @@ -123,7 +123,6 @@ static const struct migrate_version versions[] = { * MSS and omitted timestamps, which meant it usually wouldn't work. * Therefore we don't attempt to support compatibility with it. 
*/ - { 0 }, }; /* Current encoding version */ @@ -177,9 +176,9 @@ static int migrate_source(struct ctx *c, int fd) */ static const struct migrate_version *migrate_target_read_header(int fd) { - const struct migrate_version *v; struct migrate_header h; uint32_t id, compat_id; + unsigned i; if (read_all_buf(fd, &h, sizeof(h))) return NULL; @@ -196,9 +195,9 @@ static const struct migrate_version *migrate_target_read_header(int fd) return NULL; } - for (v = versions; v->id; v++) - if (v->id <= id && v->id >= compat_id) - return v; + for (i = 0; i < ARRAY_SIZE(versions); i++) + if (versions[i].id <= id && versions[i].id >= compat_id) + return &versions[i]; errno = ENOTSUP; err("Unsupported device state version: %u", id); @@ -6,6 +6,9 @@ #ifndef MIGRATE_H #define MIGRATE_H +#include <stdbool.h> +#include <stdint.h> + /** * struct migrate_header - Migration header from source * @magic: 0xB1BB1D1B0BB1D1B0, network order @@ -285,7 +285,9 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) size_t dns_s_len = 0; int i, n; - for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++); + for (n = 0; n < ARRAY_SIZE(c->ip6.dns); n++) + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n])) + break; if (n) { struct opt_rdnss *rdnss = (struct opt_rdnss *)ptr; *rdnss = (struct opt_rdnss) { @@ -6,6 +6,10 @@ #ifndef NETLINK_H #define NETLINK_H +#include <stdbool.h> + +#include <netinet/in.h> + extern int nl_sock; extern int nl_sock_ns; @@ -7,6 +7,8 @@ #define PACKET_H #include <stdbool.h> +#include <sys/uio.h> + #include "iov.h" #include "virtio.h" @@ -156,26 +156,35 @@ By default, the advertised MTU is 65520 bytes, that is, the maximum 802.3 MTU minus the length of a 802.3 header, rounded to 32 bits (IPv4 words). 
.TP -.BR \-a ", " \-\-address " " \fIaddr +.BR \-a ", " \-\-address " " \fIaddr\fR[/\fIprefix_len\fR] Assign IPv4 \fIaddr\fR via DHCP (\fByiaddr\fR), or \fIaddr\fR via DHCPv6 (option 5) and an \fIaddr\fR-based prefix via NDP Router Advertisement (option type 3) for an IPv6 \fIaddr\fR. +An optional /\fIprefix_len\fR (0-32 for IPv4, 0-128 for IPv6) can be +appended in CIDR notation (e.g. 192.0.2.1/24). This is an alternative to +using the \fB-n\fR, \fB--netmask\fR option. Mixing CIDR notation with +\fB-n\fR results in an error. +If a prefix length is assigned to an IPv6 address using this method, it will +in the current code version be overridden by the default value of 64. This option can be specified zero (for defaults) to two times (once for IPv4, once for IPv6). By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces with the first default route, if any, for the corresponding IP version. If no default routes are available and there is any interface with any route for a given IP version, the first of these interfaces will be chosen instead. If no -such interface exists, the link-local address 169.254.2.1 is assigned for IPv4, -and no additional address will be assigned for IPv6. +such interface exists for a given IP version, the link-local address 169.254.2.1 +is assigned for IPv4, and no additional address will be assigned for IPv6. .TP .BR \-n ", " \-\-netmask " " \fImask Assign IPv4 netmask \fImask\fR, expressed as dot-decimal or number of bits, via -DHCP (option 1). -By default, the netmask associated to the host address matching the assigned one -is used. If there's no matching address on the host, the netmask is determined -according to the CIDR block of the assigned address (RFC 4632). +DHCP (option 1). Alternatively, the prefix length can be specified using CIDR +notation with the \fB-a\fR, \fB--address\fR option (e.g. \fB-a\fR 192.0.2.1/24). +Mixing \fB-n\fR with CIDR notation results in an error. 
+If no address is indicated, the netmask associated with the adopted host address, +if any, is used. If an address is indicated, but without a prefix length, the +netmask is determined based on the corresponding network class. In all other +cases, the netmask is determined by using the indicated prefix length. .TP .BR \-M ", " \-\-mac-addr " " \fIaddr @@ -194,9 +203,9 @@ first default route, if any, for the corresponding IP version. If the default route is a multipath one, the gateway is the first nexthop router returned by the kernel which has the highest weight in the set of paths. If no default routes are available and there is just one interface with any route, that -interface will be chosen instead. If no such interface exists, the link-local -address 169.254.2.2 is used for IPv4, and the link-local address fe80::1 is used -for IPv6. +interface will be chosen instead. If no such interface exists for a given IP +version, the link-local address 169.254.2.2 is used for IPv4, and the link-local +address fe80::1 is used for IPv6. Note: these addresses are also used as source address for packets directed to the guest or to the target namespace having a loopback or local source address, @@ -755,6 +764,11 @@ Default is to let the tap driver build a pseudorandom hardware address. Disable the bypass path for inbound, local traffic. See the section \fBHandling of local traffic in pasta\fR in the \fBNOTES\fR for more details. +.TP +.BR \-\-splice-only +Do not create a tap device in the namespace. In this mode, \fIpasta\fR only +forwards loopback traffic between namespaces. + .SH EXAMPLES .SS \fBpasta @@ -1117,8 +1131,9 @@ throughput of TCP connections. .SS Local mode for disconnected setups If \fBpasst\fR and \fBpasta\fR fail to find a host interface with a configured -address, other than loopback addresses, they will, obviously, not attempt to -source addresses or routes from the host. 
+address for a given IP version, other than loopback addresses, they will, +obviously, not attempt to source addresses or routes from the host, for that +IP version. In this case, unless configured otherwise, they will assign the IPv4 link-local address 169.254.2.1 to the guest or target namespace, and no IPv6 address. The @@ -177,8 +177,7 @@ static void exit_handler(int signal) { (void)signal; - fsync_pcap_and_log(); - _exit(EXIT_SUCCESS); + passt_exit(EXIT_SUCCESS); } /** @@ -329,7 +328,8 @@ static void passt_worker(void *opaque, int nfds, struct epoll_event *events) * #syscalls bind connect recvfrom sendto shutdown * #syscalls arm:recv ppc64le:recv arm:send ppc64le:send * #syscalls accept4 accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait - * #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64 + * #syscalls clock_gettime|clock_gettime64 + * #syscalls arm:clock_gettime64 i686:clock_gettime64 */ int main(int argc, char **argv) { @@ -396,9 +396,10 @@ int main(int argc, char **argv) die_perror("Failed to get CLOCK_MONOTONIC time"); flow_init(); + fwd_scan_ports_init(&c); if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c))) - _exit(EXIT_FAILURE); + passt_exit(EXIT_FAILURE); proto_update_l2_buf(c.guest_mac); @@ -74,7 +74,7 @@ enum passt_modes { * host's 127.0.0.1 * @map_guest_addr: Outbound connections to this address are NATted to the * guest's assigned address - * @dns: DNS addresses for DHCP, zero-terminated + * @dns: DNS addresses for DHCP * @dns_match: Forward DNS query if sent to this address * @our_tap_addr: IPv4 address for passt's use on tap * @dns_host: Use this DNS on the host for forwarding @@ -91,7 +91,7 @@ struct ip4_ctx { struct in_addr guest_gw; struct in_addr map_host_loopback; struct in_addr map_guest_addr; - struct in_addr dns[MAXNS + 1]; + struct in_addr dns[MAXNS]; struct in_addr dns_match; struct in_addr our_tap_addr; @@ -115,7 +115,7 @@ struct ip4_ctx { * host's [::1] * @map_guest_addr: Outbound connections to 
this address are NATted to the * guest's assigned address - * @dns: DNS addresses for DHCPv6 and NDP, zero-terminated + * @dns: DNS addresses for DHCPv6 and NDP * @dns_match: Forward DNS query if sent to this address * @our_tap_ll: Link-local IPv6 address for passt's use on tap * @dns_host: Use this DNS on the host for forwarding @@ -132,7 +132,7 @@ struct ip6_ctx { struct in6_addr guest_gw; struct in6_addr map_host_loopback; struct in6_addr map_guest_addr; - struct in6_addr dns[MAXNS + 1]; + struct in6_addr dns[MAXNS]; struct in6_addr dns_match; struct in6_addr our_tap_ll; @@ -200,6 +200,7 @@ struct ip6_ctx { * @no_ndp: Disable NDP handler altogether * @no_ra: Disable router advertisements * @no_splice: Disable socket splicing for inbound traffic + * @splice_only: Only enable loopback forwarding * @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses * @freebind: Allow binding of non-local addresses for forwarding * @low_wmem: Low probed net.core.wmem_max @@ -277,6 +278,7 @@ struct ctx { int no_ndp; int no_ra; int no_splice; + int splice_only; int host_lo_to_ns_lo; int freebind; @@ -40,6 +40,7 @@ #include <arpa/inet.h> #include <netinet/in.h> #include <net/ethernet.h> +#include <sys/prctl.h> #include <sys/syscall.h> #include <linux/magic.h> @@ -70,15 +71,13 @@ void pasta_child_handler(int signal) if (pasta_child_pid && !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) { if (infop.si_pid == pasta_child_pid) { - fsync_pcap_and_log(); - if (infop.si_code == CLD_EXITED) - _exit(infop.si_status); + passt_exit(infop.si_status); /* If killed by a signal, si_status is the number. * Follow common shell convention of returning it + 128. 
*/ - _exit(infop.si_status + 128); + passt_exit(infop.si_status + 128); /* Nothing to do, detached PID namespace going away */ } @@ -191,6 +190,10 @@ static int pasta_spawn_cmd(void *arg) size_t conf_hostname_len; sigset_t set; + /* If the parent dies with an error, so should we */ + if (prctl(PR_SET_PDEATHSIG, SIGKILL)) + die_perror("Couldn't set PR_SET_PDEATHSIG"); + /* We run in a detached PID and mount namespace: mount /proc over */ if (mount("", "/proc", "proc", 0, NULL)) warn_perror("Couldn't mount /proc"); @@ -217,6 +220,12 @@ static int pasta_spawn_cmd(void *arg) sigaddset(&set, SIGUSR1); sigwaitinfo(&set, NULL); + /* Once exec()ed this process is more valuable, and easier to see and + * clean up. Let us outlive our parent now. + */ + if (prctl(PR_SET_PDEATHSIG, 0)) + die_perror("Couldn't clear PR_SET_PDEATHSIG"); + execvp(a->exe, a->argv); die_perror("Failed to start command or shell"); @@ -307,6 +316,9 @@ void pasta_ns_conf(struct ctx *c) die("Couldn't bring up loopback interface in namespace: %s", strerror_(-rc)); + if (c->splice_only) + return; + /* Get or set MAC in target namespace */ if (MAC_IS_ZERO(c->guest_mac)) nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac); @@ -339,6 +351,12 @@ void pasta_ns_conf(struct ctx *c) AF_INET); } + if (c->ifi4 == -1 && rc == -ENOTSUP) { + warn("IPv4 not supported, disabling"); + c->ifi4 = 0; + goto ipv4_done; + } + if (rc < 0) { die("Couldn't set IPv4 address(es) in namespace: %s", strerror_(-rc)); @@ -358,6 +376,7 @@ void pasta_ns_conf(struct ctx *c) strerror_(-rc)); } } +ipv4_done: if (c->ifi6) { rc = nl_addr_get_ll(nl_sock_ns, c->pasta_ifi, @@ -404,12 +423,19 @@ void pasta_ns_conf(struct ctx *c) AF_INET6); } + if (c->ifi6 == -1 && rc == -ENOTSUP) { + warn("IPv6 not supported, disabling"); + c->ifi6 = 0; + goto ipv6_done; + } + if (rc < 0) { die("Couldn't set IPv6 route(s) in guest: %s", strerror_(-rc)); } } } +ipv6_done: proto_update_l2_buf(c->guest_mac); } @@ -511,7 +537,7 @@ void 
pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd) if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) { info("Namespace %s is gone, exiting", c->netns_base); - _exit(EXIT_SUCCESS); + passt_exit(EXIT_SUCCESS); } } } @@ -539,7 +565,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref) return; info("Namespace %s is gone, exiting", c->netns_base); - _exit(EXIT_SUCCESS); + passt_exit(EXIT_SUCCESS); } close(fd); @@ -6,6 +6,8 @@ #ifndef PASTA_H #define PASTA_H +#include <unistd.h> + extern int pasta_child_pid; void pasta_open_ns(struct ctx *c, const char *netns); @@ -6,6 +6,8 @@ #ifndef PCAP_H #define PCAP_H +#include <stddef.h> + extern int pcap_fd; void pcap(const char *pkt, size_t l2len); @@ -50,31 +50,35 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, sa->sa_family = AF_INET6; sa->sa6.sin6_addr = addr->a6; sa->sa6.sin6_port = htons(port); - if (pif == PIF_HOST && IN6_IS_ADDR_LINKLOCAL(&addr->a6)) - sa->sa6.sin6_scope_id = c->ifi6; - else + if (IN6_IS_ADDR_LINKLOCAL(&addr->a6)) { + if (pif == PIF_HOST) + sa->sa6.sin6_scope_id = c->ifi6; + else if (pif == PIF_SPLICE) + sa->sa6.sin6_scope_id = c->pasta_ifi; + } else { sa->sa6.sin6_scope_id = 0; + } sa->sa6.sin6_flowinfo = 0; } } -/** pif_sock_l4() - Open a socket bound to an address on a specified interface +/** pif_listen() - Open a listening socket on a specified pif * @c: Execution context * @type: Socket epoll type * @pif: Interface for this socket * @addr: Address to bind to, or NULL for dual-stack any * @ifname: Interface for binding, NULL for any * @port: Port number to bind to (host byte order) - * @data: epoll reference portion for protocol handlers + * @rule: Forwarding rule index this socket belongs to * * NOTE: For namespace pifs, this must be called having already entered the * relevant namespace. 
* * Return: newly created socket, negative error code on failure */ -int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, - const union inany_addr *addr, const char *ifname, - in_port_t port, uint32_t data) +int pif_listen(const struct ctx *c, enum epoll_type type, uint8_t pif, + const union inany_addr *addr, const char *ifname, + in_port_t port, unsigned rule) { union epoll_ref ref; int ret; @@ -94,7 +98,9 @@ int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, return ref.fd; ref.type = type; - ref.data = data; + ref.listen.port = port; + ref.listen.pif = pif; + ref.listen.rule = rule; ret = epoll_add(c->epollfd, EPOLLIN, ref); if (ret < 0) { @@ -7,6 +7,12 @@ #ifndef PIF_H #define PIF_H +#include <stdbool.h> + +#include <netinet/in.h> + +#include "epoll_type.h" + union inany_addr; union sockaddr_inany; @@ -59,8 +65,8 @@ static inline bool pif_is_socket(uint8_t pif) void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, uint8_t pif, const union inany_addr *addr, in_port_t port); -int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, - const union inany_addr *addr, const char *ifname, - in_port_t port, uint32_t data); +int pif_listen(const struct ctx *c, enum epoll_type type, uint8_t pif, + const union inany_addr *addr, const char *ifname, + in_port_t port, unsigned rule); #endif /* PIF_H */ @@ -6,6 +6,8 @@ #ifndef REPAIR_H #define REPAIR_H +#include <stdint.h> + void repair_sock_init(const struct ctx *c); int repair_listen_handler(struct ctx *c, uint32_t events); void repair_handler(struct ctx *c, uint32_t events); @@ -21,6 +21,7 @@ IN="$@" [ -z "${ARCH}" ] && ARCH="$(uname -m)" [ -z "${CC}" ] && CC="cc" +case "${ARCH}" in i[345]86) ARCH=i686 ;; esac AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr '[a-z]' '[A-Z]' \ | sed 's/^ARM.*/ARM/' \ @@ -33,6 +34,11 @@ AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr '[a-z]' '[A-Z]' \ HEADER="/* This file was automatically generated by $(basename ${0}) */ +#include 
<stddef.h> +#include <linux/audit.h> +#include <linux/filter.h> +#include <linux/seccomp.h> + #ifndef AUDIT_ARCH_PPC64LE #define AUDIT_ARCH_PPC64LE (AUDIT_ARCH_PPC64 | __AUDIT_ARCH_LE) #endif" @@ -44,6 +44,9 @@ #ifndef SIPHASH_H #define SIPHASH_H +#include <stddef.h> +#include <stdint.h> + /** * struct siphash_state - Internal state of siphash calculation */ @@ -130,9 +130,18 @@ unsigned long tap_l2_max_len(const struct ctx *c) */ void tap_send_single(const struct ctx *c, const void *data, size_t l2len) { - uint32_t vnet_len = htonl(l2len); + uint8_t padded[ETH_ZLEN] = { 0 }; struct iovec iov[2]; size_t iovcnt = 0; + uint32_t vnet_len; + + if (l2len < ETH_ZLEN) { + memcpy(padded, data, l2len); + data = padded; + l2len = ETH_ZLEN; + } + + vnet_len = htonl(l2len); switch (c->mode) { case MODE_PASST: @@ -1140,10 +1149,8 @@ void tap_sock_reset(struct ctx *c) { info("Client connection closed%s", c->one_off ? ", exiting" : ""); - if (c->one_off) { - fsync_pcap_and_log(); - _exit(EXIT_SUCCESS); - } + if (c->one_off) + passt_exit(EXIT_SUCCESS); /* Close the connected socket, wait for a new connection */ epoll_del(c->epollfd, c->fd_tap); @@ -1484,13 +1491,16 @@ static int tap_ns_tun(void *arg) */ static void tap_sock_tun_init(struct ctx *c) { - NS_CALL(tap_ns_tun, c); - if (c->fd_tap == -1) - die("Failed to set up tap device in namespace"); + if (!c->splice_only) { + NS_CALL(tap_ns_tun, c); + if (c->fd_tap == -1) + die("Failed to set up tap device in namespace"); + } pasta_ns_conf(c); - tap_start_connection(c); + if (!c->splice_only) + tap_start_connection(c); } /** @@ -6,6 +6,11 @@ #ifndef TAP_H #define TAP_H +#include <stddef.h> +#include <stdint.h> + +#include "passt.h" + /** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header) * * The kernel tuntap device imposes a maximum frame size of 65535 including @@ -65,8 +70,7 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c, */ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) 
{ - if (thdr) - thdr->vnet_len = htonl(l2len); + thdr->vnet_len = htonl(l2len); } unsigned long tap_l2_max_len(const struct ctx *c); @@ -190,22 +190,27 @@ * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and * RTO is less than this, re-initialise RTO to this for data retransmissions * - * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE - * with TAP_FIN_SENT event), and no ACK is received within this time, reset - * the connection + * - RTT / 2 elapsed after data segment received from tap without having + * sent an ACK segment, or zero-sized window advertised to tap/guest (flag + * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent. * - * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN - * segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and - * TAP_FIN_ACKED), but no socket activity is detected from the socket within - * this time, reset the connection + * RTT, here, is an approximation of the RTT value reported by the kernel via + * TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to + * RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly. * - * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on - * either side, the connection is reset + * We also use a global interval timer for an activity timeout which doesn't + * require precision: * - * - ACK_INTERVAL elapsed after data segment received from tap without having - * sent an ACK segment, or zero-sized window advertised to tap/guest (flag - * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent + * - INACTIVITY_INTERVAL: if a connection has had no activity for an entire + * interval, close and reset it. This means that idle connections (without + * keepalives) will be removed between INACTIVITY_INTERVAL s and + * 2*INACTIVITY_INTERVAL s after the last activity. 
* + * - KEEPALIVE_INTERVAL: if a connection has had no tap-side activity for an + * entire interval, send a tap-side keepalive. If the endpoint is no longer + * aware of the connection (due to a reboot, or a kernel timeout in FIN_WAIT_2 + * state) that should trigger an RST, so we won't keep track of connections + * that the guest endpoint no longer cares about. * * Summary of data flows (with ESTABLISHED event) * ---------------------------------------------- @@ -297,8 +302,6 @@ #include "ip.h" #include "passt.h" #include "tap.h" -#include "siphash.h" -#include "pcap.h" #include "tcp_splice.h" #include "log.h" #include "inany.h" @@ -341,15 +344,25 @@ enum { #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ -#define ACK_INTERVAL 10 /* ms */ #define RTO_INIT 1 /* s, RFC 6298 */ #define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */ -#define FIN_TIMEOUT 60 -#define ACT_TIMEOUT 7200 + +#define INACTIVITY_INTERVAL 7200 /* s */ +#define KEEPALIVE_INTERVAL 30 /* s */ #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ +/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */ +#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */ +/* ...examples: 5 MB sent * 500 ns RTT, 250 kB * 10 ms, 8 kB * 300 ms */ +#define SNDBUF_BOOST_FACTOR 150 /* % */ +#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */ +/* 12 MB sent * 500 ns RTT, 600 kB * 10 ms, 20 kB * 300 ms */ + +/* Ratio of buffer to bandwidth * delay product implying interactive traffic */ +#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. 
< 5% of buffer) */ + #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define CONN_IS_CLOSING(conn) \ @@ -401,10 +414,6 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = { "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED", }; -/* Listening sockets, used for automatic port forwarding in pasta mode only */ -static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS]; -static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; - /* Table of our guest side addresses with very low RTT (assumed to be local to * the host), LRU */ @@ -423,11 +432,13 @@ socklen_t tcp_info_size; sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size) /* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */ -#define snd_wnd_cap tcp_info_cap(snd_wnd) +#define snd_wnd_cap tcp_info_cap(snd_wnd) /* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */ -#define bytes_acked_cap tcp_info_cap(bytes_acked) +#define bytes_acked_cap tcp_info_cap(bytes_acked) /* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */ -#define min_rtt_cap tcp_info_cap(min_rtt) +#define min_rtt_cap tcp_info_cap(min_rtt) +/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */ +#define delivery_rate_cap tcp_info_cap(delivery_rate) /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -508,47 +519,30 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) /** * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events - * @c: Execution context * @conn: Connection pointer * * Return: 0 on success, negative error code on failure (not on deletion) */ -static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_epoll_ctl(struct tcp_tap_conn *conn) { - int m = flow_in_epoll(&conn->f) ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; - union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; - struct epoll_event ev = { .data.u64 = ref.u64 }; - int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) - : c->epollfd; + uint32_t events; if (conn->events == CLOSED) { - if (flow_in_epoll(&conn->f)) - epoll_del(epollfd, conn->sock); + int epollfd = flow_epollfd(&conn->f); + + epoll_del(epollfd, conn->sock); if (conn->timer != -1) epoll_del(epollfd, conn->timer); + return 0; } - ev.events = tcp_conn_epoll_events(conn->events, conn->flags); + events = tcp_conn_epoll_events(conn->events, conn->flags); - if (epoll_ctl(epollfd, m, conn->sock, &ev)) + if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, conn->sock, + !TAPSIDE(conn)) < 0) return -errno; - flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); - - if (conn->timer != -1) { - union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, - .fd = conn->sock, - .flow = FLOW_IDX(conn) }; - struct epoll_event ev_t = { .data.u64 = ref_t.u64, - .events = EPOLLIN | EPOLLET }; - - if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD, - conn->timer, &ev_t)) - return -errno; - } - return 0; } @@ -556,8 +550,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed * @c: Execution context * @conn: Connection pointer - * - * #syscalls timerfd_create timerfd_settime + * #syscalls timerfd_create timerfd_settime|timerfd_settime32 */ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { @@ -567,34 +560,38 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) return; if (conn->timer == -1) { - union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER, - .fd = conn->sock, - .flow = FLOW_IDX(conn) }; - struct epoll_event ev = { .data.u64 = ref.u64, - .events = EPOLLIN | EPOLLET }; - int epollfd = flow_epollfd(&conn->f); + union epoll_ref ref; int 
fd; fd = timerfd_create(CLOCK_MONOTONIC, 0); - if (fd == -1 || fd > FD_REF_MAX) { + if (fd == -1) { flow_dbg_perror(conn, "failed to get timer"); - if (fd > -1) - close(fd); - conn->timer = -1; return; } - conn->timer = fd; + if (fd > FD_REF_MAX) { + flow_dbg(conn, "timer fd overflow (%d > %d)", + fd, FD_REF_MAX); + close(fd); + return; + } - if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { - flow_dbg_perror(conn, "failed to add timer"); - close(conn->timer); - conn->timer = -1; + ref.type = EPOLL_TYPE_TCP_TIMER; + ref.flow = FLOW_IDX(conn); + ref.fd = fd; + if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET, + ref) < 0) { + flow_dbg(conn, "failed to add timer"); + close(fd); return; } + + conn->timer = fd; } if (conn->flags & ACK_TO_TAP_DUE) { - it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; + it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000); + it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) * + 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { int exp = conn->retries, timeout = RTO_INIT; if (!(conn->events & ESTABLISHED)) @@ -603,15 +600,23 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES); timeout <<= MAX(exp, 0); it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max); - } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { - it.it_value.tv_sec = FIN_TIMEOUT; } else { - it.it_value.tv_sec = ACT_TIMEOUT; + /* Disarm */ + it.it_value.tv_sec = 0; + it.it_value.tv_nsec = 0; } - flow_dbg(conn, "timer expires in %llu.%03llus", - (unsigned long long)it.it_value.tv_sec, - (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); + if (conn->flags & ACK_TO_TAP_DUE) { + flow_trace(conn, "timer expires in %llu.%02llums", + (unsigned long long)it.it_value.tv_sec * 1000 + + it.it_value.tv_nsec / 1000 / 1000, + (unsigned long long)it.it_value.tv_nsec + / 1000 / 10 % 100); + } else { + flow_dbg(conn, "timer expires in %llu.%03llus", + (unsigned long 
long)it.it_value.tv_sec, + (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); + } if (timerfd_settime(conn->timer, 0, &it, NULL)) flow_perror(conn, "failed to set timer"); @@ -657,7 +662,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, } if (flag == STALLED || flag == ~STALLED) - tcp_epoll_ctl(c, conn); + tcp_epoll_ctl(conn); if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || @@ -714,11 +719,8 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, } else { if (event == CLOSED) flow_hash_remove(c, TAP_SIDX(conn)); - tcp_epoll_ctl(c, conn); + tcp_epoll_ctl(conn); } - - if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) - tcp_timer_ctl(c, conn); } /** @@ -774,7 +776,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, } /** - * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage) + * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.75 usage) * @conn: Connection pointer */ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) @@ -789,11 +791,7 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) return; } - v = sndbuf; - if (v >= SNDBUF_BIG) - v /= 2; - else if (v > SNDBUF_SMALL) - v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2; + v = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL, SNDBUF_BIG, 75); SNDBUF_SET(conn, MIN(INT_MAX, v)); } @@ -940,7 +938,6 @@ static void tcp_fill_header(struct tcphdr *th, * tcp_fill_headers() - Fill 802.3, IP, TCP headers * @c: Execution context * @conn: Connection pointer - * @taph: tap backend specific header * @eh: Pointer to Ethernet header * @ip4h: Pointer to IPv4 header, or NULL * @ip6h: Pointer to IPv6 header, or NULL @@ -949,12 +946,15 @@ static void tcp_fill_header(struct tcphdr *th, * @ip4_check: IPv4 checksum, if already known * @seq: Sequence number for this segment * @no_tcp_csum: Do not set TCP checksum + * + * Return: frame length (including L2 
headers) */ -void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct ethhdr *eh, - struct iphdr *ip4h, struct ipv6hdr *ip6h, - struct tcphdr *th, struct iov_tail *payload, - const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum) +size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, + struct ethhdr *eh, + struct iphdr *ip4h, struct ipv6hdr *ip6h, + struct tcphdr *th, struct iov_tail *payload, + const uint16_t *ip4_check, uint32_t seq, + bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); size_t l4len = iov_tail_size(payload) + sizeof(*th); @@ -1020,7 +1020,36 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, else tcp_update_csum(psum, th, payload); - tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); + return MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN); +} + +/** + * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning + * @conn: Connection pointer + * @tinfo: tcp_info from kernel, must be pre-fetched + * + * Return: increased sending buffer to use as a limit for advertised window + */ +static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn, + const struct tcp_info_linux *tinfo) +{ + unsigned long bytes_rtt_product; + + if (!bytes_acked_cap) + return SNDBUF_GET(conn); + + /* This is *not* a bandwidth-delay product, but it's somewhat related: + * as we send more data (usually at the beginning of a connection), we + * try to make the sending buffer progressively grow, with the RTT as a + * factor (longer delay, bigger buffer needed). 
+ */ + bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked * + tinfo->tcpi_rtt / 1000 / 1000; + + return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product, + SNDBUF_BOOST_BYTES_RTT_LO, + SNDBUF_BOOST_BYTES_RTT_HI, + SNDBUF_BOOST_FACTOR); } /** @@ -1031,6 +1060,8 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, * @tinfo: tcp_info from kernel, can be NULL if not pre-fetched * * Return: 1 if sequence or window were updated, 0 otherwise + * + * #syscalls ioctl */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, bool force_seq, struct tcp_info_linux *tinfo) @@ -1041,6 +1072,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, socklen_t sl = sizeof(*tinfo); struct tcp_info_linux tinfo_new; uint32_t new_wnd_to_tap = prev_wnd_to_tap; + bool ack_everything = true; int s = conn->sock; /* At this point we could ack all the data we've accepted for forwarding @@ -1050,7 +1082,8 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, * control behaviour. * * For it to be possible and worth it we need: - * - The TCP_INFO Linux extension which gives us the peer acked bytes + * - The TCP_INFO Linux extensions which give us the peer acked bytes + * and the delivery rate (outbound bandwidth at receiver) * - Not to be told not to (force_seq) * - Not half-closed in the peer->guest direction * With no data coming from the peer, we might not get events which @@ -1060,13 +1093,19 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, * Data goes from socket to socket, with nothing meaningfully "in * flight". * - Not a pseudo-local connection (e.g. to a VM on the same host) - * - Large enough send buffer - * In these cases, there's not enough in flight to bother. + * If it is, there's not enough in flight to bother. 
+ * - Sending buffer significantly larger than bandwidth * delay product + * Meaning we're not bandwidth-bound and this is likely to be + * interactive traffic where we want to preserve transparent + * connection behaviour and latency. + * + * Otherwise, we probably want to maximise throughput, which needs + * sending buffer auto-tuning, triggered in turn by filling up the + * outbound socket queue. */ - if (bytes_acked_cap && !force_seq && + if (bytes_acked_cap && delivery_rate_cap && !force_seq && !CONN_IS_CLOSING(conn) && - !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn) && - (unsigned)SNDBUF_GET(conn) >= SNDBUF_SMALL) { + !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) { if (!tinfo) { tinfo = &tinfo_new; if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) @@ -1075,14 +1114,24 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, /* This trips a cppcheck bug in some versions, including * cppcheck 2.18.3. - * https://sourceforge.net/p/cppcheck/discussion/general/thread/fecde59085/ + * https://trac.cppcheck.net/ticket/14191 */ /* cppcheck-suppress [uninitvar,unmatchedSuppression] */ - conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + - conn->seq_init_from_tap; - } else { + if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt * + tinfo->tcpi_delivery_rate / + 1000 / 1000 * + SNDBUF_TO_BW_DELAY_INTERACTIVE) + ack_everything = false; + } + + if (ack_everything) { /* Fall back to acknowledging everything we got */ conn->seq_ack_to_tap = conn->seq_from_tap; + } else { + /* cppcheck bug 14191 again, see above */ + /* cppcheck-suppress [uninitvar,unmatchedSuppression] */ + conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + + conn->seq_init_from_tap; } /* It's occasionally possible for us to go from using the fallback above @@ -1113,9 +1162,54 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { new_wnd_to_tap = tinfo->tcpi_snd_wnd; } else { + unsigned rtt_ms_ceiling 
= DIV_ROUND_UP(tinfo->tcpi_rtt, 1000); + uint32_t sendq; + int limit; + + if (ioctl(s, SIOCOUTQ, &sendq)) { + debug_perror("SIOCOUTQ on socket %i, assuming 0", s); + sendq = 0; + } tcp_get_sndbuf(conn); - new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, - SNDBUF_GET(conn)); + + if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */ + limit = 0; + else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn)) + limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq; + else + limit = SNDBUF_GET(conn) - (int)sendq; + + /* If the sender uses mechanisms to prevent Silly Window + * Syndrome (SWS, described in RFC 813 Section 3) it's critical + * that, should the window ever become less than the MSS, we + * advertise a new value once it increases again to be above it. + * + * The mechanism to avoid SWS in the kernel is, implicitly, + * implemented by Nagle's algorithm (which was proposed after + * RFC 813). + * + * To this end, for simplicity, approximate a window value below + * the MSS to zero, as we already have mechanisms in place to + * force updates after the window becomes zero. This matches the + * suggestion from RFC 813, Section 4. 
+ * + * But don't do this if, either: + * + * - there's nothing in the outbound queue: the size of the + * sending buffer is limiting us, and it won't increase if we + * don't send data, so there's no point in waiting, or + * + * - we haven't sent data in a while (somewhat arbitrarily, ten + * times the RTT), as that might indicate that the receiver + * will only process data in batches that are large enough, + * but we won't send enough to fill one because we're stuck + * with pending data in the outbound queue + */ + if (limit < MSS_GET(conn) && sendq && + tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10) + limit = 0; + + new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit); } new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); @@ -1135,6 +1229,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, conn_flag(c, conn, ACK_TO_TAP_DUE); out: + /* Opportunistically store RTT approximation on valid TCP_INFO data */ + if (tinfo) + RTT_SET(conn, tinfo->tcpi_rtt); + return new_wnd_to_tap != prev_wnd_to_tap || conn->seq_ack_to_tap != prev_ack_to_tap; } @@ -1256,7 +1354,8 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, th->fin = !!(flags & FIN); if (th->ack) { - if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) + if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && + conn->wnd_to_tap) conn_flag(c, conn, ~ACK_TO_TAP_DUE); else conn_flag(c, conn, ACK_TO_TAP_DUE); @@ -1290,7 +1389,34 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, } /** - * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket + * tcp_sock_rst() - Close TCP connection forcing RST on socket side + * @c: Execution context + * @conn: Connection pointer + */ +static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn) +{ + const struct linger linger0 = { + .l_onoff = 1, + .l_linger = 0, + }; + + /* Force RST on socket to inform the peer + * + * We do this by setting SO_LINGER with 0 timeout, 
which means that + * close() will send an RST (unless the connection is already closed in + * both directions). + */ + if (setsockopt(conn->sock, SOL_SOCKET, + SO_LINGER, &linger0, sizeof(linger0)) < 0) { + flow_dbg_perror(conn, + "SO_LINGER failed, may not send RST to peer"); + } + + conn_event(c, conn, CLOSED); +} + +/** + * tcp_rst_do() - Reset a tap connection: send RST segment on both sides, close * @c: Execution context * @conn: Connection pointer */ @@ -1299,8 +1425,10 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->events == CLOSED) return; + /* Send RST on tap */ tcp_send_flag(c, conn, RST); - conn_event(c, conn, CLOSED); + + tcp_sock_rst(c, conn); } /** @@ -1543,7 +1671,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport); - if (!(tgt = flow_target(c, flow, IPPROTO_TCP))) + if (!(tgt = flow_target(c, flow, FWD_NO_HINT, IPPROTO_TCP))) goto cancel; if (flow->f.pif[TGTSIDE] != PIF_HOST) { @@ -1592,7 +1720,11 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, conn->sock = s; conn->timer = -1; - conn->listening_sock = -1; + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); + if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, TGTSIDE) < 0) { + flow_perror(flow, "Can't register with epoll"); + goto cancel; + } conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; @@ -1636,7 +1768,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, conn_event(c, conn, TAP_SYN_ACK_SENT); } - tcp_epoll_ctl(c, conn); + tcp_epoll_ctl(conn); if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ socklen_t sl = sizeof(sa); @@ -1771,7 +1903,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, return -1; if (th->rst) { - conn_event(c, conn, CLOSED); + tcp_sock_rst(c, conn); return 1; } @@ -1787,6 +1919,10 @@ static int tcp_data_from_tap(const struct ctx *c, struct 
tcp_tap_conn *conn, tcp_send_flag(c, conn, ACK); tcp_timer_ctl(c, conn); + if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE, + &((int){ 1 }), sizeof(int))) + flow_trace(conn, "failed to set SO_KEEPALIVE"); + if (p->count == 1) { tcp_tap_window_update(c, conn, ntohs(th->window)); @@ -1913,20 +2049,17 @@ eintr: goto eintr; if (errno == EAGAIN || errno == EWOULDBLOCK) { - tcp_send_flag(c, conn, ACK_IF_NEEDED); + tcp_send_flag(c, conn, ACK | DUP_ACK); return p->count - idx; } return -1; } - if (n < (int)(seq_from_tap - conn->seq_from_tap)) { + if (n < (int)(seq_from_tap - conn->seq_from_tap)) partial_send = 1; - conn->seq_from_tap += n; - tcp_send_flag(c, conn, ACK_IF_NEEDED); - } else { - conn->seq_from_tap += n; - } + + conn->seq_from_tap += n; out: if (keep != -1 || partial_send) { @@ -2134,10 +2267,13 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, flow_trace(conn, "packet length %zu from tap", l4len); if (th->rst) { - conn_event(c, conn, CLOSED); + tcp_sock_rst(c, conn); return 1; } + conn->inactive = false; + conn->tap_inactive = false; + if (th->ack && !(conn->events & ESTABLISHED)) tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); @@ -2166,7 +2302,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (th->fin) { conn->seq_from_tap++; - shutdown(conn->sock, SHUT_WR); + if (shutdown(conn->sock, SHUT_WR) < 0) { + flow_dbg_perror(conn, "shutdown() failed"); + goto reset; + } + tcp_send_flag(c, conn, ACK); conn_event(c, conn, SOCK_FIN_SENT); @@ -2241,7 +2381,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, socklen_t sl; struct tcp_info tinfo; - shutdown(conn->sock, SHUT_WR); + if (shutdown(conn->sock, SHUT_WR) < 0) { + flow_dbg_perror(conn, "shutdown() failed"); + goto reset; + } + conn_event(c, conn, SOCK_FIN_SENT); tcp_send_flag(c, conn, ACK); ack_due = 0; @@ -2315,6 +2459,15 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, conn->sock = s; conn->timer 
= -1; conn->ws_to_tap = conn->ws_from_tap = 0; + + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); + if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, INISIDE) < 0) { + flow_perror(flow, "Can't register with epoll"); + conn_flag(c, conn, CLOSING); + FLOW_ACTIVATE(conn); + return; + } + conn_event(c, conn, SOCK_ACCEPTED); hash = flow_hash_insert(c, TAP_SIDX(conn)); @@ -2341,7 +2494,6 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { - struct tcp_tap_conn *conn; union sockaddr_inany sa; socklen_t sl = sizeof(sa); struct flowside *ini; @@ -2357,17 +2509,14 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; - conn = (struct tcp_tap_conn *)flow; - conn->listening_sock = ref.fd; - tcp_sock_set_nodelay(s); /* FIXME: If useful: when the listening port has a specific bound * address, record that as our address, as implemented for vhost-user * mode only, below. 
*/ - ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, - NULL, ref.tcp_listen.port); + ini = flow_initiate_sa(flow, ref.listen.pif, &sa, + NULL, ref.listen.port); if (getsockname(s, &sa.sa, &sl) || inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0) @@ -2381,7 +2530,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, goto cancel; } - if (!flow_target(c, flow, IPPROTO_TCP)) + if (!flow_target(c, flow, ref.listen.rule, IPPROTO_TCP)) goto cancel; switch (flow->f.pif[TGTSIDE]) { @@ -2412,7 +2561,9 @@ cancel: * @c: Execution context * @ref: epoll reference of timer (not connection) * - * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 + * #syscalls timerfd_gettime|timerfd_gettime64 + * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64 + * #syscalls arm:timerfd_settime64 i686:timerfd_settime64 */ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) { @@ -2450,9 +2601,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) conn_flag(c, conn, SYN_RETRIED); tcp_timer_ctl(c, conn); } - } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { - flow_dbg(conn, "FIN timeout"); - tcp_rst(c, conn); } else if (conn->retries == TCP_MAX_RETRIES) { flow_dbg(conn, "retransmissions count exceeded"); tcp_rst(c, conn); @@ -2469,23 +2617,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) tcp_data_from_sock(c, conn); tcp_timer_ctl(c, conn); } - } else { - struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; - struct itimerspec old = { { 0 }, { 0 } }; - - /* Activity timeout: if it was already set, reset the - * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE - * or ACK_FROM_TAP_DUE, so just set the long timeout in that - * case. This avoids having to preemptively reset the timer on - * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. 
- */ - if (timerfd_settime(conn->timer, 0, &new, &old)) - flow_perror(conn, "failed to set timer"); - - if (old.it_value.tv_sec == ACT_TIMEOUT) { - flow_dbg(conn, "activity timeout"); - tcp_rst(c, conn); - } } } @@ -2511,6 +2642,8 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, return; } + conn->inactive = false; + if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) { conn_event(c, conn, CLOSED); return; @@ -2552,65 +2685,18 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, } /** - * tcp_sock_init_one() - Initialise listening socket for address and port - * @c: Execution context - * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE) - * @addr: Pointer to address for binding, NULL for dual stack any - * @ifname: Name of interface to bind to, NULL if not configured - * @port: Port, host order - * - * Return: fd for the new listening socket, negative error code on failure - * - * If pif == PIF_SPLICE, the caller must have already entered the guest ns. - */ -static int tcp_sock_init_one(const struct ctx *c, uint8_t pif, - const union inany_addr *addr, const char *ifname, - in_port_t port) -{ - union tcp_listen_epoll_ref tref = { - .port = port, - .pif = pif, - }; - const struct fwd_ports *fwd; - int s; - - if (pif == PIF_HOST) - fwd = &c->tcp.fwd_in; - else - fwd = &c->tcp.fwd_out; - - s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname, - port, tref.u32); - - if (fwd->mode == FWD_AUTO) { - int (*socks)[IP_VERSIONS] = pif == PIF_SPLICE ? - tcp_sock_ns : tcp_sock_init_ext; - - if (!addr || inany_v4(addr)) - socks[port][V4] = s < 0 ? -1 : s; - if (!addr || !inany_v4(addr)) - socks[port][V6] = s < 0 ? 
-1 : s; - } - - if (s < 0) - return s; - - return s; -} - -/** - * tcp_sock_init() - Create listening socket for a given host ("inbound") port + * tcp_listen() - Create listening socket * @c: Execution context * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE) - * @addr: Pointer to address for binding, NULL if not configured - * @ifname: Name of interface to bind to, NULL if not configured + * @rule: Index of relevant forwarding rule + * @addr: Pointer to address for binding, NULL for any + * @ifname: Name of interface to bind to, NULL for any * @port: Port, host order * - * Return: 0 on success, negative error code on failure + * Return: socket fd on success, negative error code on failure */ -int tcp_sock_init(const struct ctx *c, uint8_t pif, - const union inany_addr *addr, const char *ifname, - in_port_t port) +int tcp_listen(const struct ctx *c, uint8_t pif, unsigned rule, + const union inany_addr *addr, const char *ifname, in_port_t port) { int s; @@ -2621,69 +2707,19 @@ int tcp_sock_init(const struct ctx *c, uint8_t pif, /* Restrict to v6 only */ addr = &inany_any6; else if (inany_v4(addr)) - /* Nothing to do */ - return 0; + return -EAFNOSUPPORT; } if (!c->ifi6) { if (!addr) /* Restrict to v4 only */ addr = &inany_any4; else if (!inany_v4(addr)) - /* Nothing to do */ - return 0; - } - - s = tcp_sock_init_one(c, pif, addr, ifname, port); - if (s < 0) - return s; - if (s > FD_REF_MAX) - return -EIO; - - return 0; -} - -/** - * tcp_ns_sock_init() - Init socket to listen for spliced outbound connections - * @c: Execution context - * @port: Port, host order - */ -static void tcp_ns_sock_init(const struct ctx *c, in_port_t port) -{ - ASSERT(!c->no_tcp); - - if (!c->no_bindtodevice) { - tcp_sock_init(c, PIF_SPLICE, NULL, "lo", port); - return; + return -EAFNOSUPPORT; } - if (c->ifi4) - tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback4, NULL, port); - if (c->ifi6) - tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback6, NULL, port); -} - -/** - * 
tcp_ns_socks_init() - Bind sockets in namespace for outbound connections - * @arg: Execution context - * - * Return: 0 - */ -/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ -static int tcp_ns_socks_init(void *arg) -{ - const struct ctx *c = (const struct ctx *)arg; - unsigned port; - - ns_enter(c); - - for (port = 0; port < NUM_PORTS; port++) { - if (!bitmap_isset(c->tcp.fwd_out.map, port)) - continue; + s = pif_listen(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname, port, rule); - tcp_ns_sock_init(c, port); - } - - return 0; + return s; } /** @@ -2812,7 +2848,7 @@ static void tcp_get_rto_params(struct ctx *c) * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context * - * Return: 0, doesn't return on failure + * Return: 0 on success, -1 on failure */ int tcp_init(struct ctx *c) { @@ -2824,15 +2860,16 @@ int tcp_init(struct ctx *c) memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); - memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext)); - memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns)); tcp_sock_refill_init(c); + if (fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP) < 0) + return -1; if (c->mode == MODE_PASTA) { tcp_splice_init(c); - - NS_CALL(tcp_ns_socks_init, c); + if (fwd_listen_sync(c, &c->tcp.fwd_out, + PIF_SPLICE, IPPROTO_TCP) < 0) + return -1; } peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) && @@ -2842,7 +2879,7 @@ int tcp_init(struct ctx *c) tcp_info_size = tcp_probe_tcp_info(); #define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \ - STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ") + STRINGIFY(f_), tcp_info_cap(f_) ? 
"" : " not") dbg_tcpi(snd_wnd); dbg_tcpi(bytes_acked); dbg_tcpi(min_rtt); @@ -2852,74 +2889,59 @@ int tcp_init(struct ctx *c) } /** - * tcp_port_rebind() - Rebind ports to match forward maps - * @c: Execution context - * @outbound: True to remap outbound forwards, otherwise inbound - * - * Must be called in namespace context if @outbound is true. + * tcp_keepalive() - Send keepalives for connections which need it + * @c: Execution context */ -static void tcp_port_rebind(struct ctx *c, bool outbound) +static void tcp_keepalive(struct ctx *c, const struct timespec *now) { - const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map; - int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext; - unsigned port; + union flow *flow; - for (port = 0; port < NUM_PORTS; port++) { - if (!bitmap_isset(fmap, port)) { - if (socks[port][V4] >= 0) { - close(socks[port][V4]); - socks[port][V4] = -1; - } + if (now->tv_sec - c->tcp.keepalive_run < KEEPALIVE_INTERVAL) + return; - if (socks[port][V6] >= 0) { - close(socks[port][V6]); - socks[port][V6] = -1; - } + c->tcp.keepalive_run = now->tv_sec; - continue; - } + flow_foreach_of_type(flow, FLOW_TCP) { + struct tcp_tap_conn *conn = &flow->tcp; - if ((c->ifi4 && socks[port][V4] == -1) || - (c->ifi6 && socks[port][V6] == -1)) { - if (outbound) - tcp_ns_sock_init(c, port); - else - tcp_sock_init(c, PIF_HOST, NULL, NULL, port); + if (conn->tap_inactive) { + flow_dbg(conn, "No tap activity for at least %us, send keepalive", + KEEPALIVE_INTERVAL); + tcp_send_flag(c, conn, KEEPALIVE); } + + /* Ready to check for next interval */ + conn->tap_inactive = true; } } /** - * tcp_port_rebind_outbound() - Rebind ports in namespace - * @arg: Execution context - * - * Called with NS_CALL() - * - * Return: 0 + * tcp_inactivity() - Scan for and close long-inactive connections + * @c: Execution context */ -static int tcp_port_rebind_outbound(void *arg) +static void tcp_inactivity(struct ctx *c, const struct timespec *now) { - 
struct ctx *c = (struct ctx *)arg; + union flow *flow; - ns_enter(c); - tcp_port_rebind(c, true); + if (now->tv_sec - c->tcp.inactivity_run < INACTIVITY_INTERVAL) + return; - return 0; -} + debug("TCP inactivity scan"); + c->tcp.inactivity_run = now->tv_sec; -/** - * tcp_port_rebind_all() - Rebind ports to match forward maps (in host & ns) - * @c: Execution context - */ -void tcp_port_rebind_all(struct ctx *c) -{ - ASSERT(c->mode == MODE_PASTA && !c->no_tcp); + flow_foreach_of_type(flow, FLOW_TCP) { + struct tcp_tap_conn *conn = &flow->tcp; - if (c->tcp.fwd_out.mode == FWD_AUTO) - NS_CALL(tcp_port_rebind_outbound, c); + if (conn->inactive) { + /* No activity in this interval, reset */ + flow_dbg(conn, "Inactive for at least %us, resetting", + INACTIVITY_INTERVAL); + tcp_rst(c, conn); + } - if (c->tcp.fwd_in.mode == FWD_AUTO) - tcp_port_rebind(c, false); + /* Ready to check fot next interval */ + conn->inactive = true; + } } /** @@ -2927,13 +2949,14 @@ void tcp_port_rebind_all(struct ctx *c) * @c: Execution context * @now: Current timestamp */ -void tcp_timer(const struct ctx *c, const struct timespec *now) +void tcp_timer(struct ctx *c, const struct timespec *now) { - (void)now; - tcp_sock_refill_init(c); if (c->mode == MODE_PASTA) tcp_splice_refill(c); + + tcp_keepalive(c, now); + tcp_inactivity(c, now); } /** @@ -3420,7 +3443,7 @@ static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn, } /** - * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening + * tcp_flow_migrate_source() - Send data (flow table) for flow * @fd: Descriptor for state migration * @conn: Pointer to the TCP connection structure * @@ -3460,9 +3483,6 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) return rc; } - if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD)) - close(conn->listening_sock); - return 0; } @@ -3671,9 +3691,7 @@ static int tcp_flow_repair_connect(const struct ctx *c, return rc; } - flow_epollid_clear(&conn->f); 
conn->timer = -1; - conn->listening_sock = -1; return 0; } @@ -3731,14 +3749,19 @@ int tcp_flow_migrate_target(struct ctx *c, int fd) if ((rc = tcp_flow_repair_socket(c, conn))) { flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc)); - /* Can't leave the flow in an incomplete state */ - FLOW_ACTIVATE(conn); - return 0; + goto out; } + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); + if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->sock, + !TAPSIDE(conn))) + goto out; /* tcp_flow_migrate_target_ext() will clean this up */ + flow_hash_insert(c, TAP_SIDX(conn)); - FLOW_ACTIVATE(conn); +out: + /* Never leave the flow in an incomplete state */ + FLOW_ACTIVATE(conn); return 0; } @@ -3862,10 +3885,15 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd int v; v = TCP_SEND_QUEUE; - if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) + if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) { flow_perror(conn, "Selecting repair queue"); - else - shutdown(s, SHUT_WR); + } else { + if (shutdown(s, SHUT_WR) < 0) { + flow_perror(conn, + "Repair mode shutdown() failed"); + goto fail; + } + } } if (tcp_flow_repair_wnd(conn, &t)) @@ -3892,8 +3920,12 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to * TCP_FIN_WAIT1. 
*/ - if (t.tcpi_state == TCP_FIN_WAIT1) - shutdown(s, SHUT_WR); + if (t.tcpi_state == TCP_FIN_WAIT1) { + if (shutdown(s, SHUT_WR) < 0) { + flow_perror(conn, "Post-repair shutdown() failed"); + goto fail; + } + } if (tcp_set_peek_offset(conn, peek_offset)) goto fail; @@ -3901,7 +3933,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd tcp_send_flag(c, conn, ACK); tcp_data_from_sock(c, conn); - if ((rc = tcp_epoll_ctl(c, conn))) { + if ((rc = tcp_epoll_ctl(conn))) { flow_dbg(conn, "Failed to subscribe to epoll for migrated socket: %s", strerror_(-rc)); @@ -6,6 +6,14 @@ #ifndef TCP_H #define TCP_H +#include <stdbool.h> +#include <stdint.h> + +#include <netinet/in.h> +#include <sys/socket.h> + +#include "fwd.h" + #define TCP_TIMER_INTERVAL 1000 /* ms */ struct ctx; @@ -18,12 +26,10 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now); -int tcp_sock_init(const struct ctx *c, uint8_t pif, - const union inany_addr *addr, const char *ifname, - in_port_t port); +int tcp_listen(const struct ctx *c, uint8_t pif, unsigned rule, + const union inany_addr *addr, const char *ifname, in_port_t port); int tcp_init(struct ctx *c); -void tcp_port_rebind_all(struct ctx *c); -void tcp_timer(const struct ctx *c, const struct timespec *now); +void tcp_timer(struct ctx *c, const struct timespec *now); void tcp_defer_handler(struct ctx *c); void tcp_update_l2_buf(const unsigned char *eth_d); @@ -31,30 +37,6 @@ void tcp_update_l2_buf(const unsigned char *eth_d); extern bool peek_offset_cap; /** - * union tcp_epoll_ref - epoll reference portion for TCP connections - * @index: Index of connection in table - * @u32: Opaque u32 value of reference - */ -union tcp_epoll_ref { - uint32_t index:20; - uint32_t u32; -}; - -/** - * union tcp_listen_epoll_ref - epoll 
reference portion for TCP listening - * @port: Bound port number of the socket - * @pif: pif in which the socket is listening - * @u32: Opaque u32 value of reference - */ -union tcp_listen_epoll_ref { - struct { - in_port_t port; - uint8_t pif; - }; - uint32_t u32; -}; - -/** * struct tcp_ctx - Execution context for TCP routines * @port_to_tap: Ports bound host-side, packets to tap or spliced * @fwd_in: Port forwarding configuration for inbound packets @@ -64,6 +46,8 @@ union tcp_listen_epoll_ref { * @rto_max: Maximum retry timeout (in s) * @syn_retries: SYN retries using exponential backoff timeout * @syn_linear_timeouts: SYN retries before using exponential backoff timeout + * @keepalive_run: Time we last issued tap-side keepalives + * @inactivity_run: Time we last scanned for inactive connections */ struct tcp_ctx { struct fwd_ports fwd_in; @@ -73,6 +57,8 @@ struct tcp_ctx { int rto_max; uint8_t syn_retries; uint8_t syn_linear_timeouts; + time_t keepalive_run; + time_t inactivity_run; }; #endif /* TCP_H */ @@ -96,6 +96,7 @@ void tcp_sock_iov_init(const struct ctx *c) iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]); iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i]; + iov[TCP_IOV_ETH_PAD].iov_base = eth_pad; } } @@ -145,6 +146,22 @@ void tcp_payload_flush(const struct ctx *c) } /** + * tcp_l2_buf_pad() - Calculate padding to send out of padding (zero) buffer + * @iov: Pointer to iovec of frame parts we're about to send + */ +static void tcp_l2_buf_pad(struct iovec *iov) +{ + size_t l2len = iov[TCP_IOV_ETH].iov_len + + iov[TCP_IOV_IP].iov_len + + iov[TCP_IOV_PAYLOAD].iov_len; + + if (l2len < ETH_ZLEN) + iov[TCP_IOV_ETH_PAD].iov_len = ETH_ZLEN - l2len; + else + iov[TCP_IOV_ETH_PAD].iov_len = 0; +} + +/** * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer @@ -166,14 +183,16 @@ static void tcp_l2_buf_fill_headers(const struct ctx 
*c, struct ethhdr *eh = iov[TCP_IOV_ETH].iov_base; struct ipv6hdr *ip6h = NULL; struct iphdr *ip4h = NULL; + size_t l2len; if (a4) ip4h = iov[TCP_IOV_IP].iov_base; else ip6h = iov[TCP_IOV_IP].iov_base; - tcp_fill_headers(c, conn, taph, eh, ip4h, ip6h, th, &tail, - check, seq, no_tcp_csum); + l2len = tcp_fill_headers(c, conn, eh, ip4h, ip6h, th, &tail, check, seq, + no_tcp_csum); + tap_hdr_update(taph, l2len); } /** @@ -210,8 +229,14 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) tcp_frame_conns[tcp_payload_used++] = conn; l4len = optlen + sizeof(struct tcphdr); iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (flags & KEEPALIVE) + seq--; + tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false); + tcp_l2_buf_pad(iov); + if (flags & DUP_ACK) { struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used]; tcp_frame_conns[tcp_payload_used++] = conn; @@ -223,6 +248,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, l4len); dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + dup_iov[TCP_IOV_ETH_PAD].iov_len = iov[TCP_IOV_ETH_PAD].iov_len; } if (tcp_payload_used > TCP_FRAMES_MEM - 2) @@ -270,6 +296,9 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, payload->th.psh = push; iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr); tcp_l2_buf_fill_headers(c, conn, iov, check, seq, false); + + tcp_l2_buf_pad(iov); + if (++tcp_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); } @@ -384,6 +413,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } conn_event(c, conn, TAP_FIN_SENT); + conn_flag(c, conn, ACK_FROM_TAP_DUE); } return 0; @@ -9,6 +9,10 @@ #ifndef TCP_CONN_H #define TCP_CONN_H +#include <stdint.h> + +#include "flow.h" + /** * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) * @f: Generic flow information @@ -16,9 +20,10 @@ * @ws_from_tap: Window scaling factor 
advertised from tap/guest * @ws_to_tap: Window scaling factor advertised to tap/guest * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS + * @tapinactive: No tao activity within the current KEEPALIVE_INTERVAL + * @inactive: No activity within the current INACTIVITY_INTERVAL * @sock: Socket descriptor number * @events: Connection events, implying connection states - * @listening_sock: Listening socket this socket was accept()ed from, or -1 * @timer: timerfd descriptor for timeout events * @flags: Connection flags representing internal attributes * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS @@ -49,6 +54,18 @@ struct tcp_tap_conn { #define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS))) #define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS)) +#define RTT_EXP_BITS 4 + unsigned int rtt_exp :RTT_EXP_BITS; +#define RTT_EXP_MAX MAX_FROM_BITS(RTT_EXP_BITS) +#define RTT_STORE_MIN 100 /* us, minimum representable */ +#define RTT_STORE_MAX ((long)(RTT_STORE_MIN << RTT_EXP_MAX)) +#define RTT_SET(conn, rtt) \ + (conn->rtt_exp = MIN(RTT_EXP_MAX, ilog2(MAX(1, rtt / RTT_STORE_MIN)))) +#define RTT_GET(conn) (RTT_STORE_MIN << conn->rtt_exp) + + bool tap_inactive :1; + bool inactive :1; + int sock :FD_REF_BITS; uint8_t events; @@ -66,8 +83,6 @@ struct tcp_tap_conn { #define CONN_STATE_BITS /* Setting these clears other flags */ \ (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) - int listening_sock; - int timer :FD_REF_BITS; uint8_t flags; diff --git a/tcp_internal.h b/tcp_internal.h index f55025c..d940885 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -6,6 +6,11 @@ #ifndef TCP_INTERNAL_H #define TCP_INTERNAL_H +#include <stdint.h> +#include <netinet/tcp.h> + +#include "util.h" + #define MAX_WS 8 #define MAX_WINDOW (1 << (16 + (MAX_WS))) @@ -38,6 +43,8 @@ /* Flags for internal usage */ #define DUP_ACK (1 << 5) +#define KEEPALIVE (1 << 6) + #define OPT_EOL 0 #define OPT_NOP 1 #define OPT_MSS 2 @@ -57,12 +64,13 @@ #define 
CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr)) #define CONN_V6(conn) (!CONN_V4(conn)) -/* +/** * enum tcp_iov_parts - I/O vector parts for one TCP frame * @TCP_IOV_TAP tap backend specific header * @TCP_IOV_ETH Ethernet header * @TCP_IOV_IP IP (v4/v6) header * @TCP_IOV_PAYLOAD IP payload (TCP header + data) + * @TCP_IOV_ETH_PAD Ethernet (802.3) padding to 60 bytes * @TCP_NUM_IOVS the number of entries in the iovec array */ enum tcp_iov_parts { @@ -70,6 +78,7 @@ enum tcp_iov_parts { TCP_IOV_ETH = 1, TCP_IOV_IP = 2, TCP_IOV_PAYLOAD = 3, + TCP_IOV_ETH_PAD = 4, TCP_NUM_IOVS }; @@ -174,11 +183,12 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); struct tcp_info_linux; -void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct ethhdr *eh, - struct iphdr *ip4h, struct ipv6hdr *ip6h, - struct tcphdr *th, struct iov_tail *payload, - const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum); +size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, + struct ethhdr *eh, + struct iphdr *ip4h, struct ipv6hdr *ip6h, + struct tcphdr *th, struct iov_tail *payload, + const uint16_t *ip4_check, uint32_t seq, + bool no_tcp_csum); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, bool force_seq, struct tcp_info_linux *tinfo); diff --git a/tcp_splice.c b/tcp_splice.c index 717766a..d60981c 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -114,63 +114,44 @@ static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx) * @events: Connection event flags * @ev: Events to fill in, 0 is accepted socket, 1 is connecting socket */ -static void tcp_splice_conn_epoll_events(uint16_t events, - struct epoll_event ev[]) +static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei) { - unsigned sidei; - - flow_foreach_sidei(sidei) - ev[sidei].events = 0; + uint32_t e = 0; if (events & SPLICE_ESTABLISHED) { - flow_foreach_sidei(sidei) { - if (!(events & FIN_SENT(!sidei))) - 
ev[sidei].events = EPOLLIN | EPOLLRDHUP; - } - } else if (events & SPLICE_CONNECT) { - ev[1].events = EPOLLOUT; + if (!(events & FIN_SENT(!sidei))) + e = EPOLLIN | EPOLLRDHUP; + } else if (sidei == 1 && events & SPLICE_CONNECT) { + e = EPOLLOUT; } - flow_foreach_sidei(sidei) { - if (events & OUT_WAIT(sidei)) { - ev[sidei].events |= EPOLLOUT; - ev[!sidei].events &= ~EPOLLIN; - } - } + if (events & OUT_WAIT(sidei)) + e |= EPOLLOUT; + if (events & OUT_WAIT(!sidei)) + e &= ~EPOLLIN; + + return e; } /** * tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events - * @c: Execution context * @conn: Connection pointer * * Return: 0 on success, negative error code on failure (not on deletion) */ -static int tcp_splice_epoll_ctl(const struct ctx *c, - struct tcp_splice_conn *conn) +static int tcp_splice_epoll_ctl(struct tcp_splice_conn *conn) { - int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) - : c->epollfd; - int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; - const union epoll_ref ref[SIDES] = { - { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0], - .flowside = FLOW_SIDX(conn, 0) }, - { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1], - .flowside = FLOW_SIDX(conn, 1) } - }; - struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, - { .data.u64 = ref[1].u64 } }; - - tcp_splice_conn_epoll_events(conn->events, ev); + uint32_t events[2]; + events[0] = tcp_splice_conn_epoll_events(conn->events, 0); + events[1] = tcp_splice_conn_epoll_events(conn->events, 1); - if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) || - epoll_ctl(epollfd, m, conn->s[1], &ev[1])) { + if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events[0], conn->s[0], 0) || + flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events[1], conn->s[1], 1)) { int ret = -errno; flow_perror(conn, "ERROR on epoll_ctl()"); return ret; } - flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); return 0; } @@ -210,7 +191,7 @@ static void conn_flag_do(struct tcp_splice_conn *conn, } } -#define 
conn_flag(c, conn, flag) \ +#define conn_flag(conn, flag) \ do { \ flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ conn_flag_do(conn, flag); \ @@ -218,12 +199,10 @@ static void conn_flag_do(struct tcp_splice_conn *conn, /** * conn_event_do() - Set and log connection events, update epoll state - * @c: Execution context * @conn: Connection pointer * @event: Connection event */ -static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, - unsigned long event) +static void conn_event_do(struct tcp_splice_conn *conn, unsigned long event) { if (event & (event - 1)) { int flag_index = fls(~event); @@ -245,16 +224,47 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, flow_dbg(conn, "%s", tcp_splice_event_str[flag_index]); } - if (tcp_splice_epoll_ctl(c, conn)) - conn_flag(c, conn, CLOSING); + if (tcp_splice_epoll_ctl(conn)) + conn_flag(conn, CLOSING); } -#define conn_event(c, conn, event) \ +#define conn_event(conn, event) \ do { \ flow_trace(conn, "event at %s:%i",__func__, __LINE__); \ - conn_event_do(c, conn, event); \ + conn_event_do(conn, event); \ } while (0) +/** + * tcp_splice_rst() - Close spliced connection forcing RST on each side + * @conn: Connection pointer + */ +static void tcp_splice_rst(struct tcp_splice_conn *conn) +{ + const struct linger linger0 = { + .l_onoff = 1, + .l_linger = 0, + }; + unsigned sidei; + + if (conn->flags & CLOSING) + return; /* Nothing to do */ + + /* Force RST on sockets to inform the peer + * + * We do this by setting SO_LINGER with 0 timeout, which means that + * close() will send an RST (unless the connection is already closed in + * both directions). 
+ */ + flow_foreach_sidei(sidei) { + if (setsockopt(conn->s[sidei], SOL_SOCKET, + SO_LINGER, &linger0, sizeof(linger0)) < 0) { + flow_dbg_perror(conn, +"SO_LINGER failed, may not send RST to peer"); + } + } + + conn_flag(conn, CLOSING); +} /** * tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed) @@ -320,7 +330,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) { flow_perror(conn, "cannot create %d->%d pipe", sidei, !sidei); - conn_flag(c, conn, CLOSING); + tcp_splice_rst(conn); return -EIO; } @@ -334,7 +344,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, } if (!(conn->events & SPLICE_ESTABLISHED)) - conn_event(c, conn, SPLICE_ESTABLISHED); + conn_event(conn, SPLICE_ESTABLISHED); return 0; } @@ -381,16 +391,24 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) pif_sockaddr(c, &sa, tgtpif, &tgt->eaddr, tgt->eport); + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); + if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->s[0], 0) || + flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->s[1], 1)) { + int ret = -errno; + flow_perror(conn, "Cannot register to epollfd"); + return ret; + } + + conn_event(conn, SPLICE_CONNECT); + if (connect(conn->s[1], &sa.sa, socklen_inany(&sa))) { if (errno != EINPROGRESS) { flow_trace(conn, "Couldn't connect socket for splice: %s", strerror_(errno)); return -errno; } - - conn_event(c, conn, SPLICE_CONNECT); } else { - conn_event(c, conn, SPLICE_ESTABLISHED); + conn_event(conn, SPLICE_ESTABLISHED); return tcp_splice_connect_finish(c, conn); } @@ -450,7 +468,7 @@ void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0) flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); if (tcp_splice_connect(c, conn)) - conn_flag(c, conn, CLOSING); + tcp_splice_rst(conn); FLOW_ACTIVATE(conn); } @@ -487,26 +505,26 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, flow_trace(conn, 
"Error event on socket: %s", strerror_(err)); - goto close; + goto reset; } if (conn->events == SPLICE_CONNECT) { if (!(events & EPOLLOUT)) - goto close; + goto reset; if (tcp_splice_connect_finish(c, conn)) - goto close; + goto reset; } if (events & EPOLLOUT) { fromsidei = !evsidei; - conn_event(c, conn, ~OUT_WAIT(evsidei)); + conn_event(conn, ~OUT_WAIT(evsidei)); } else { fromsidei = evsidei; } if (events & EPOLLRDHUP) /* For side 0 this is fake, but implied */ - conn_event(c, conn, FIN_RCVD(evsidei)); + conn_event(conn, FIN_RCVD(evsidei)); swap: eof = 0; @@ -528,7 +546,7 @@ retry: while (readlen < 0 && errno == EINTR); if (readlen < 0 && errno != EAGAIN) - goto close; + goto reset; flow_trace(conn, "%zi from read-side call", readlen); @@ -541,7 +559,7 @@ retry: more = SPLICE_F_MORE; if (conn->flags & lowat_set_flag) - conn_flag(c, conn, lowat_act_flag); + conn_flag(conn, lowat_act_flag); } do @@ -552,7 +570,7 @@ retry: while (written < 0 && errno == EINTR); if (written < 0 && errno != EAGAIN) - goto close; + goto reset; flow_trace(conn, "%zi from write-side call (passed %zi)", written, c->tcp.pipe_size); @@ -573,8 +591,8 @@ retry: "Setting SO_RCVLOWAT %i: %s", lowat, strerror_(errno)); } else { - conn_flag(c, conn, lowat_set_flag); - conn_flag(c, conn, lowat_act_flag); + conn_flag(conn, lowat_set_flag); + conn_flag(conn, lowat_act_flag); } } @@ -588,7 +606,7 @@ retry: if (conn->read[fromsidei] == conn->written[fromsidei]) break; - conn_event(c, conn, OUT_WAIT(!fromsidei)); + conn_event(conn, OUT_WAIT(!fromsidei)); break; } @@ -609,14 +627,18 @@ retry: flow_foreach_sidei(sidei) { if ((conn->events & FIN_RCVD(sidei)) && !(conn->events & FIN_SENT(!sidei))) { - shutdown(conn->s[!sidei], SHUT_WR); - conn_event(c, conn, FIN_SENT(!sidei)); + if (shutdown(conn->s[!sidei], SHUT_WR) < 0) + goto reset; + conn_event(conn, FIN_SENT(!sidei)); } } } - if (CONN_HAS(conn, FIN_SENT(0) | FIN_SENT(1))) - goto close; + if (CONN_HAS(conn, FIN_SENT(0) | FIN_SENT(1))) { + /* Clean 
close, no reset */ + conn_flag(conn, CLOSING); + return; + } if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) { events = EPOLLIN; @@ -626,12 +648,12 @@ retry: } if (events & EPOLLHUP) - goto close; + goto reset; return; -close: - conn_flag(c, conn, CLOSING); +reset: + tcp_splice_rst(conn); } /** @@ -767,10 +789,10 @@ void tcp_splice_timer(struct tcp_splice_conn *conn) flow_trace(conn, "can't set SO_RCVLOWAT on %d", conn->s[sidei]); } - conn_flag(c, conn, ~RCVLOWAT_SET(sidei)); + conn_flag(conn, ~RCVLOWAT_SET(sidei)); } } flow_foreach_sidei(sidei) - conn_flag(c, conn, ~RCVLOWAT_ACT(sidei)); + conn_flag(conn, ~RCVLOWAT_ACT(sidei)); } diff --git a/tcp_splice.h b/tcp_splice.h index a20f3e2..dbfd55d 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -6,6 +6,8 @@ #ifndef TCP_SPLICE_H #define TCP_SPLICE_H +#include <stdint.h> + struct tcp_splice_conn; union sockaddr_inany; @@ -40,17 +40,16 @@ static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; static int head[VIRTQUEUE_MAX_SIZE + 1]; /** - * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP) + * tcp_vu_hdrlen() - Sum size of all headers, from TCP to virtio-net * @v6: Set for IPv6 packet * - * Return: return the size of the header + * Return: total size of virtio-net, Ethernet, IP, and TCP headers */ static size_t tcp_vu_hdrlen(bool v6) { size_t hdrlen; - hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) + - sizeof(struct ethhdr) + sizeof(struct tcphdr); + hdrlen = VNET_HLEN + sizeof(struct ethhdr) + sizeof(struct tcphdr); if (v6) hdrlen += sizeof(struct ipv6hdr); @@ -72,8 +71,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) { struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; - size_t optlen, hdrlen; struct vu_virtq_element flags_elem[2]; + size_t optlen, hdrlen, l2len; struct ipv6hdr *ip6h = NULL; struct iphdr *ip4h = NULL; struct iovec flags_iov[2]; @@ -91,14 +90,14 @@ int tcp_vu_send_flag(const struct ctx *c, struct 
tcp_tap_conn *conn, int flags) vu_set_element(&flags_elem[0], NULL, &flags_iov[0]); elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1, - hdrlen + sizeof(struct tcp_syn_opts), NULL); + MAX(hdrlen + sizeof(*opts), ETH_ZLEN + VNET_HLEN), NULL); if (elem_cnt != 1) return -1; ASSERT(flags_elem[0].in_sg[0].iov_len >= - hdrlen + sizeof(struct tcp_syn_opts)); + MAX(hdrlen + sizeof(*opts), ETH_ZLEN + VNET_HLEN)); - vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1); + vu_set_vnethdr(flags_elem[0].in_sg[0].iov_base, 1); eh = vu_eth(flags_elem[0].in_sg[0].iov_base); @@ -135,13 +134,17 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) flags_elem[0].in_sg[0].iov_len = hdrlen + optlen; payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen); - tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload, + if (flags & KEEPALIVE) + seq--; + + tcp_fill_headers(c, conn, eh, ip4h, ip6h, th, &payload, NULL, seq, !*c->pcap); - if (*c->pcap) { - pcap_iov(&flags_elem[0].in_sg[0], 1, - sizeof(struct virtio_net_hdr_mrg_rxbuf)); - } + l2len = optlen + hdrlen - VNET_HLEN; + vu_pad(&flags_elem[0].in_sg[0], l2len); + + if (*c->pcap) + pcap_iov(&flags_elem[0].in_sg[0], 1, VNET_HLEN); nb_ack = 1; if (flags & DUP_ACK) { @@ -157,10 +160,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) flags_elem[0].in_sg[0].iov_len); nb_ack++; - if (*c->pcap) { - pcap_iov(&flags_elem[1].in_sg[0], 1, - sizeof(struct virtio_net_hdr_mrg_rxbuf)); - } + if (*c->pcap) + pcap_iov(&flags_elem[1].in_sg[0], 1, VNET_HLEN); } } @@ -211,7 +212,8 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, cnt = vu_collect(vdev, vq, &elem[elem_cnt], VIRTQUEUE_MAX_SIZE - elem_cnt, - MIN(mss, fillsize) + hdrlen, &frame_size); + MAX(MIN(mss, fillsize) + hdrlen, ETH_ZLEN + VNET_HLEN), + &frame_size); if (cnt == 0) break; @@ -254,6 +256,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, len -= iov->iov_len; } + /* 
adjust head count */ while (*head_cnt > 0 && head[*head_cnt - 1] >= i) (*head_cnt)--; @@ -301,7 +304,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, struct ethhdr *eh; /* we guess the first iovec provided by the guest can embed - * all the headers needed by L2 frame + * all the headers needed by L2 frame, including any padding */ ASSERT(iov[0].iov_len >= hdrlen); @@ -331,7 +334,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, th->ack = 1; th->psh = push; - tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload, + tcp_fill_headers(c, conn, eh, ip4h, ip6h, th, &payload, *check, conn->seq_to_tap, no_tcp_csum); if (ip4h) *check = &ip4h->check; @@ -421,6 +424,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } conn_event(c, conn, TAP_FIN_SENT); + conn_flag(c, conn, ACK_FROM_TAP_DUE); } return 0; @@ -446,8 +450,9 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) int buf_cnt = head[i + 1] - head[i]; ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen; bool push = i == head_cnt - 1; + size_t l2len; - vu_set_vnethdr(vdev, iov->iov_base, buf_cnt); + vu_set_vnethdr(iov->iov_base, buf_cnt); /* The IPv4 header checksum varies only with dlen */ if (previous_dlen != dlen) @@ -456,10 +461,12 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push); - if (*c->pcap) { - pcap_iov(iov, buf_cnt, - sizeof(struct virtio_net_hdr_mrg_rxbuf)); - } + /* Pad first/single buffer only, it's at least ETH_ZLEN long */ + l2len = dlen + hdrlen - VNET_HLEN; + vu_pad(iov, l2len); + + if (*c->pcap) + pcap_iov(iov, buf_cnt, VNET_HLEN); conn->seq_to_tap += dlen; } diff --git a/test/Makefile b/test/Makefile index 5b5f0fc..6ed233a 100644 --- a/test/Makefile +++ b/test/Makefile @@ -164,82 +164,82 @@ realclean: clean # Debian downloads debian-8.11.0-openstack-%.qcow2: - $(WGET) -O $@ 
https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2 + -$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2 debian-10-nocloud-%.qcow2: - $(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2 + -$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2 debian-10-generic-ppc64el-20220911-1135.qcow2: - $(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/20220911-1135/debian-10-generic-ppc64el-20220911-1135.qcow2 + -$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/20220911-1135/debian-10-generic-ppc64el-20220911-1135.qcow2 debian-10-generic-%.qcow2: - $(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-generic-$*.qcow2 + -$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-generic-$*.qcow2 debian-11-nocloud-%.qcow2: - $(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-nocloud-$*.qcow2 + -$(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-nocloud-$*.qcow2 debian-11-generic-%.qcow2: - $(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-generic-$*.qcow2 + -$(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-generic-$*.qcow2 debian-11-generic-ppc64el-20250703-2162.qcow2: - $(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/20250703-2162/debian-11-generic-ppc64el-20250703-2162.qcow2 + -$(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/20250703-2162/debian-11-generic-ppc64el-20250703-2162.qcow2 debian-sid-nocloud-%-daily.qcow2: - $(WGET) -O $@ https://cloud.debian.org/images/cloud/sid/daily/latest/debian-sid-nocloud-$*-daily.qcow2 + -$(WGET) -O $@ https://cloud.debian.org/images/cloud/sid/daily/latest/debian-sid-nocloud-$*-daily.qcow2 # Fedora downloads Fedora-Cloud-Base-26-1.5.%.qcow2: - $(WGET) -O $@ 
http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/26/CloudImages/$*/images/Fedora-Cloud-Base-26-1.5.$*.qcow2 + -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/26/CloudImages/$*/images/Fedora-Cloud-Base-26-1.5.$*.qcow2 Fedora-Cloud-Base-27-1.6.%.qcow2: - $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/27/CloudImages/$*/images/Fedora-Cloud-Base-27-1.6.$*.qcow2 + -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/27/CloudImages/$*/images/Fedora-Cloud-Base-27-1.6.$*.qcow2 Fedora-Cloud-Base-28-1.1.%.qcow2: - $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/28/Cloud/$*/images/Fedora-Cloud-Base-28-1.1.$*.qcow2 + -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/28/Cloud/$*/images/Fedora-Cloud-Base-28-1.1.$*.qcow2 Fedora-Cloud-Base-29-1.2.%.qcow2: - $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/29/Cloud/$*/images/Fedora-Cloud-Base-29-1.2.$*.qcow2 + -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/29/Cloud/$*/images/Fedora-Cloud-Base-29-1.2.$*.qcow2 Fedora-Cloud-Base-30-1.2.%.qcow2: - $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/30/Cloud/$*/images/Fedora-Cloud-Base-30-1.2.$*.qcow2 + -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/30/Cloud/$*/images/Fedora-Cloud-Base-30-1.2.$*.qcow2 Fedora-Cloud-Base-31-1.9.%.qcow2: - $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/31/Cloud/$*/images/Fedora-Cloud-Base-31-1.9.$*.qcow2 + -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/31/Cloud/$*/images/Fedora-Cloud-Base-31-1.9.$*.qcow2 Fedora-Cloud-Base-32-1.6.%.qcow2: - $(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/32/Cloud/$*/images/Fedora-Cloud-Base-32-1.6.$*.qcow2 
+ -$(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/32/Cloud/$*/images/Fedora-Cloud-Base-32-1.6.$*.qcow2 Fedora-Cloud-Base-33-1.2.%.qcow2: - $(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/33/Cloud/$*/images/Fedora-Cloud-Base-33-1.2.$*.qcow2 + -$(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/33/Cloud/$*/images/Fedora-Cloud-Base-33-1.2.$*.qcow2 Fedora-Cloud-Base-34-1.2.%.qcow2: - $(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/34/Cloud/$*/images/Fedora-Cloud-Base-34-1.2.$*.qcow2 + -$(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/34/Cloud/$*/images/Fedora-Cloud-Base-34-1.2.$*.qcow2 Fedora-Cloud-Base-35-1.2.%.qcow2: - $(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/35/Cloud/$*/images/Fedora-Cloud-Base-35-1.2.$*.qcow2 + -$(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/35/Cloud/$*/images/Fedora-Cloud-Base-35-1.2.$*.qcow2 # OpenSuSE downloads openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2: - $(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.1/jeos/openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 + -$(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.1/jeos/openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2: - $(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.2/appliances/openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 + -$(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.2/appliances/openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2: - $(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.3/appliances/openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 + -$(WGET) -O $@ 
https://download.opensuse.org/distribution/leap/15.3/appliances/openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz: - $(WGET) -O $@ http://download.opensuse.org/ports/aarch64/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz + -$(WGET) -O $@ http://download.opensuse.org/ports/aarch64/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz: - $(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz + -$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz # Ubuntu downloads trusty-server-cloudimg-%-disk1.img: - $(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img + -$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img xenial-server-cloudimg-powerpc-disk1.img: - $(WGET) -O $@ https://cloud-images.ubuntu.com/xenial/current/xenial-server-cloudimg-powerpc-disk1.img + -$(WGET) -O $@ https://cloud-images.ubuntu.com/xenial/current/xenial-server-cloudimg-powerpc-disk1.img jammy-server-cloudimg-s390x.img: - $(WGET) -O $@ https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-s390x.img + -$(WGET) -O $@ https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-s390x.img diff --git a/test/lib/term b/test/lib/term index 79aa2b7..89e4fdb 100755 --- a/test/lib/term +++ b/test/lib/term @@ -203,7 +203,7 @@ pane_wait() { __done=0 while - __l="$(tail -1 ${LOGDIR}/pane_${__lc}.log | sed 's/[[][^a-zA-Z]*[a-zA-Z]//g')" + __l="$(tail -1 ${LOGDIR}/pane_${__lc}.log | sed 's/[[][^a-zA-Z]*[a-zA-Z]//g;s/[]][^]*[\]//g')" case ${__l} in *"$ " | *"# ") return ;; esac @@ -215,7 +215,7 @@ pane_wait() { pane_parse() { __pane_lc="$(echo "${1}" | tr [A-Z] [a-z])" - __buf="$(tail -n2 ${LOGDIR}/pane_${__pane_lc}.log | head -n1 | sed 
's/^[^\r]*\r\([^\r]\)/\1/' | tr -d '\r\n')" + __buf="$(tail -n2 ${LOGDIR}/pane_${__pane_lc}.log | head -n1 | sed 's/^[^\r]*\r\([^\r]\)/\1/;s/[]][^]*[\]//g' | tr -d '\r\n')" [ "# $(eval printf '%s' \"\$${1}_LAST_CMD\")" != "${__buf}" ] && \ [ "$ $(eval printf '%s' \"\$${1}_LAST_CMD\")" != "${__buf}" ] && @@ -707,7 +707,7 @@ term() { tmux set window-status-current-style 'bg=colour1 fg=colour233 bold' tmux set status-right '#(TZ="UTC" date -Iseconds)' - tmux set status-right-length 50 + tmux set status-right-length 64 tmux set status-right-style 'bg=colour1 fg=colour233 bold' tmux set history-limit 500000 diff --git a/test/passt.mbuto b/test/passt.mbuto index 598c254..de35c3c 100755 --- a/test/passt.mbuto +++ b/test/passt.mbuto @@ -24,6 +24,12 @@ for bin in /usr/lib/openssh/sshd-session /usr/lib/ssh/sshd-session \ command -v "${bin}" >/dev/null && PROGS="${PROGS} ${bin}" done +# OpenSSH 10 adds sshd-auth as well +for bin in /usr/lib/openssh/sshd-auth /usr/lib/ssh/sshd-auth \ + /usr/libexec/openssh/sshd-auth; do + command -v "${bin}" >/dev/null && PROGS="${PROGS} ${bin}" +done + KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}" LINKS="${LINKS:- @@ -26,7 +26,10 @@ * * We track pseudo-connections of this type as flow table entries of type * FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts, - * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds. + * and let the flow expire if there is no traffic for UDP_TIMEOUT seconds for + * unidirectional flows and flows with only one datagram and one reply, or + * UDP_TIMEOUT_STREAM seconds for bidirectional flows with more than one + * datagram on either side. * * NOTE: This won't handle multicast protocols, or some protocols with different * port usage. We'll need specific logic if we want to handle those. 
@@ -118,16 +121,19 @@ #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ +#define UDP_TIMEOUT "/proc/sys/net/netfilter/nf_conntrack_udp_timeout" +#define UDP_TIMEOUT_STREAM \ + "/proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream" + +#define UDP_TIMEOUT_DEFAULT 30 /* s */ +#define UDP_TIMEOUT_STREAM_DEFAULT 120 /* s */ + /* Maximum UDP data to be returned in ICMP messages */ #define ICMP4_MAX_DLEN 8 #define ICMP6_MAX_DLEN (IPV6_MIN_MTU \ - sizeof(struct udphdr) \ - sizeof(struct ipv6hdr)) -/* "Spliced" sockets indexed by bound port (host order) */ -static int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; -static int udp_splice_init[IP_VERSIONS][NUM_PORTS]; - /* Static buffers */ /* UDP header and data for inbound messages */ @@ -164,17 +170,19 @@ udp_meta[UDP_MAX_FRAMES]; /** * enum udp_iov_idx - Indices for the buffers making up a single UDP frame - * @UDP_IOV_TAP tap specific header - * @UDP_IOV_ETH Ethernet header - * @UDP_IOV_IP IP (v4/v6) header - * @UDP_IOV_PAYLOAD IP payload (UDP header + data) - * @UDP_NUM_IOVS the number of entries in the iovec array + * @UDP_IOV_TAP tap specific header + * @UDP_IOV_ETH Ethernet header + * @UDP_IOV_IP IP (v4/v6) header + * @UDP_IOV_PAYLOAD IP payload (UDP header + data) + * @UDP_IOV_ETH_PAD Ethernet (802.3) padding to 60 bytes + * @UDP_NUM_IOVS the number of entries in the iovec array */ enum udp_iov_idx { UDP_IOV_TAP, UDP_IOV_ETH, UDP_IOV_IP, UDP_IOV_PAYLOAD, + UDP_IOV_ETH_PAD, UDP_NUM_IOVS, }; @@ -192,19 +200,6 @@ static struct mmsghdr udp_mh_splice [UDP_MAX_FRAMES]; static struct iovec udp_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; /** - * udp_portmap_clear() - Clear UDP port map before configuration - */ -void udp_portmap_clear(void) -{ - unsigned i; - - for (i = 0; i < NUM_PORTS; i++) { - udp_splice_ns[V4][i] = udp_splice_ns[V6][i] = -1; - udp_splice_init[V4][i] = udp_splice_init[V6][i] = -1; - } -} - -/** * udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses * @eth_d: Ethernet 
destination address, NULL if unchanged */ @@ -239,6 +234,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp_eth_hdr[i]); tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); tiov[UDP_IOV_PAYLOAD].iov_base = payload; + tiov[UDP_IOV_ETH_PAD].iov_base = eth_pad; mh->msg_iov = siov; mh->msg_iovlen = 1; @@ -345,6 +341,22 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, } /** + * udp_tap_pad() - Calculate padding to send out of padding (zero) buffer + * @iov: Pointer to iovec of frame parts we're about to send + */ +static void udp_tap_pad(struct iovec *iov) +{ + size_t l2len = iov[UDP_IOV_ETH].iov_len + + iov[UDP_IOV_IP].iov_len + + iov[UDP_IOV_PAYLOAD].iov_len; + + if (l2len < ETH_ZLEN) + iov[UDP_IOV_ETH_PAD].iov_len = ETH_ZLEN - l2len; + else + iov[UDP_IOV_ETH_PAD].iov_len = 0; +} + +/** * udp_tap_prepare() - Convert one datagram into a tap frame * @mmh: Receiving mmsghdr array * @idx: Index of the datagram to prepare @@ -362,23 +374,31 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, struct ethhdr *eh = (*tap_iov)[UDP_IOV_ETH].iov_base; struct udp_payload_t *bp = &udp_payload[idx]; struct udp_meta_t *bm = &udp_meta[idx]; - size_t l4len; + size_t l4len, l2len; eth_update_mac(eh, NULL, tap_omac); if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) { l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len, no_udp_csum); - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + ETH_HLEN); + + l2len = MAX(l4len + sizeof(bm->ip6h) + ETH_HLEN, ETH_ZLEN); + tap_hdr_update(&bm->taph, l2len); + eh->h_proto = htons_constant(ETH_P_IPV6); (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); } else { l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len, no_udp_csum); - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + ETH_HLEN); + + l2len = MAX(l4len + sizeof(bm->ip4h) + ETH_HLEN, ETH_ZLEN); + tap_hdr_update(&bm->taph, l2len); + eh->h_proto = htons_constant(ETH_P_IP); 
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h); } (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; + + udp_tap_pad(*tap_iov); } /** @@ -828,12 +848,13 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n, * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket * @c: Execution context * @s: Socket to forward from + * @rule_hint: Forwarding rule to use, or -1 if unknown * @frompif: Interface to which @s belongs * @port: Our (local) port number of @s * @now: Current timestamp */ -void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, - in_port_t port, const struct timespec *now) +void udp_sock_fwd(const struct ctx *c, int s, int rule_hint, + uint8_t frompif, in_port_t port, const struct timespec *now) { union sockaddr_inany src; union inany_addr dst; @@ -858,7 +879,8 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, continue; } - tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now); + tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, + rule_hint, now); topif = pif_at_sidx(tosidx); if (pif_is_socket(topif)) { @@ -900,8 +922,10 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - if (events & (EPOLLERR | EPOLLIN)) - udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now); + if (events & (EPOLLERR | EPOLLIN)) { + udp_sock_fwd(c, ref.fd, ref.listen.rule, + ref.listen.pif, ref.listen.port, now); + } } /** @@ -939,7 +963,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, int s = ref.fd; flow_trace(uflow, "Received data on reply socket"); - uflow->ts = now->tv_sec; + udp_flow_activity(uflow, !tosidx.sidei, now); if (pif_is_socket(topif)) { udp_sock_to_sock(c, ref.fd, n, tosidx); @@ -1102,64 +1126,41 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, } /** - * udp_sock_init() - Initialise listening socket for a given port + * udp_listen() - Initialise listening socket for a given port * @c: Execution context * 
@pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE) + * @rule: Index of relevant forwarding rule * @addr: Pointer to address for binding, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured * @port: Port, host order * - * Return: 0 on success, negative error code on failure + * Return: socket fd on success, negative error code on failure */ -int udp_sock_init(const struct ctx *c, uint8_t pif, - const union inany_addr *addr, const char *ifname, - in_port_t port) +int udp_listen(const struct ctx *c, uint8_t pif, unsigned rule, + const union inany_addr *addr, const char *ifname, in_port_t port) { - union udp_listen_epoll_ref uref = { - .pif = pif, - .port = port, - }; - int (*socks)[NUM_PORTS]; int s; ASSERT(!c->no_udp); - ASSERT(pif_is_socket(pif)); - - if (pif == PIF_HOST) - socks = udp_splice_init; - else - socks = udp_splice_ns; if (!c->ifi4) { if (!addr) /* Restrict to v6 only */ addr = &inany_any6; else if (inany_v4(addr)) - /* Nothing to do */ - return 0; + return -EAFNOSUPPORT; } if (!c->ifi6) { if (!addr) /* Restrict to v4 only */ addr = &inany_any4; else if (!inany_v4(addr)) - /* Nothing to do */ - return 0; - } - - s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, pif, - addr, ifname, port, uref.u32); - if (s > FD_REF_MAX) { - close(s); - s = -EIO; + return -EAFNOSUPPORT; } - if (!addr || inany_v4(addr)) - socks[V4][port] = s < 0 ? -1 : s; - if (!addr || !inany_v4(addr)) - socks[V6][port] = s < 0 ? -1 : s; + s = pif_listen(c, EPOLL_TYPE_UDP_LISTEN, pif, addr, ifname, port, rule); - return s < 0 ? 
s : 0; + return s; } /** @@ -1183,112 +1184,45 @@ static void udp_splice_iov_init(void) } /** - * udp_ns_sock_init() - Init socket to listen for spliced outbound connections + * udp_get_timeout_params() - Get host kernel UDP timeout parameters * @c: Execution context - * @port: Port, host order */ -static void udp_ns_sock_init(const struct ctx *c, in_port_t port) +static void udp_get_timeout_params(struct ctx *c) { - ASSERT(!c->no_udp); + intmax_t v; - if (!c->no_bindtodevice) { - udp_sock_init(c, PIF_SPLICE, NULL, "lo", port); - return; - } + v = read_file_integer(UDP_TIMEOUT, UDP_TIMEOUT_DEFAULT); + c->udp.timeout = v; - if (c->ifi4) - udp_sock_init(c, PIF_SPLICE, &inany_loopback4, NULL, port); - if (c->ifi6) - udp_sock_init(c, PIF_SPLICE, &inany_loopback6, NULL, port); -} + v = read_file_integer(UDP_TIMEOUT_STREAM, UDP_TIMEOUT_STREAM_DEFAULT); + c->udp.stream_timeout = v; -/** - * udp_port_rebind() - Rebind ports to match forward maps - * @c: Execution context - * @outbound: True to remap outbound forwards, otherwise inbound - * - * Must be called in namespace context if @outbound is true. - */ -static void udp_port_rebind(struct ctx *c, bool outbound) -{ - int (*socks)[NUM_PORTS] = outbound ? udp_splice_ns : udp_splice_init; - const uint8_t *fmap - = outbound ? 
c->udp.fwd_out.map : c->udp.fwd_in.map; - unsigned port; - - for (port = 0; port < NUM_PORTS; port++) { - if (!bitmap_isset(fmap, port)) { - if (socks[V4][port] >= 0) { - close(socks[V4][port]); - socks[V4][port] = -1; - } - - if (socks[V6][port] >= 0) { - close(socks[V6][port]); - socks[V6][port] = -1; - } - - continue; - } - - if ((c->ifi4 && socks[V4][port] == -1) || - (c->ifi6 && socks[V6][port] == -1)) { - if (outbound) - udp_ns_sock_init(c, port); - else - udp_sock_init(c, PIF_HOST, NULL, NULL, port); - } - } -} - -/** - * udp_port_rebind_outbound() - Rebind ports in namespace - * @arg: Execution context - * - * Called with NS_CALL() - * - * Return: 0 - */ -static int udp_port_rebind_outbound(void *arg) -{ - struct ctx *c = (struct ctx *)arg; - - ns_enter(c); - udp_port_rebind(c, true); - - return 0; -} - -/** - * udp_port_rebind_all() - Rebind ports to match forward maps (in host & ns) - * @c: Execution context - */ -void udp_port_rebind_all(struct ctx *c) -{ - ASSERT(c->mode == MODE_PASTA && !c->no_udp); - - if (c->udp.fwd_out.mode == FWD_AUTO) - NS_CALL(udp_port_rebind_outbound, c); - - if (c->udp.fwd_in.mode == FWD_AUTO) - udp_port_rebind(c, false); + debug("Using UDP timeout parameters, timeout: %d, stream_timeout: %d", + c->udp.timeout, c->udp.stream_timeout); } /** * udp_init() - Initialise per-socket data, and sockets in namespace * @c: Execution context * - * Return: 0 + * Return: 0 on success, -1 on failure */ int udp_init(struct ctx *c) { ASSERT(!c->no_udp); + udp_get_timeout_params(c); + udp_iov_init(c); + if (fwd_listen_sync(c, &c->udp.fwd_in, PIF_HOST, IPPROTO_UDP) < 0) + return -1; + if (c->mode == MODE_PASTA) { udp_splice_iov_init(); - NS_CALL(udp_port_rebind_outbound, c); + if (fwd_listen_sync(c, &c->udp.fwd_out, + PIF_SPLICE, IPPROTO_UDP) < 0) + return -1; } return 0; @@ -6,7 +6,11 @@ #ifndef UDP_H #define UDP_H -void udp_portmap_clear(void); +#include <stdint.h> +#include <netinet/in.h> + +#include "fwd.h" + void 
udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); void udp_sock_handler(const struct ctx *c, union epoll_ref ref, @@ -15,38 +19,25 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, uint8_t ttl, const struct pool *p, int idx, const struct timespec *now); -int udp_sock_init(const struct ctx *c, uint8_t pif, - const union inany_addr *addr, const char *ifname, - in_port_t port); +int udp_listen(const struct ctx *c, uint8_t pif, unsigned rule, + const union inany_addr *addr, const char *ifname, in_port_t port); int udp_init(struct ctx *c); -void udp_port_rebind_all(struct ctx *c); void udp_update_l2_buf(const unsigned char *eth_d); /** - * union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets - * @port: Source port for connected sockets, bound port otherwise - * @pif: pif for this socket - * @u32: Opaque u32 value of reference - */ -union udp_listen_epoll_ref { - struct { - in_port_t port; - uint8_t pif; - }; - uint32_t u32; -}; - - -/** * struct udp_ctx - Execution context for UDP * @fwd_in: Port forwarding configuration for inbound packets * @fwd_out: Port forwarding configuration for outbound packets * @timer_run: Timestamp of most recent timer run + * @timeout: Timeout for unidirectional flows (in s) + * @stream_timeout: Timeout for stream-like flows (in s) */ struct udp_ctx { struct fwd_ports fwd_in; struct fwd_ports fwd_out; struct timespec timer_run; + int timeout; + int stream_timeout; }; #endif /* UDP_H */ @@ -17,8 +17,6 @@ #include "udp_internal.h" #include "epoll_ctl.h" -#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ - /** * udp_at_sidx() - Get UDP specific flow at given sidx * @sidx: Flow and side to retrieve @@ -74,11 +72,6 @@ static int udp_flow_sock(const struct ctx *c, { const struct flowside *side = &uflow->f.side[sidei]; uint8_t pif = uflow->f.pif[sidei]; - union { - flow_sidx_t sidx; - 
uint32_t data; - } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; - union epoll_ref ref; int rc; int s; @@ -88,14 +81,9 @@ static int udp_flow_sock(const struct ctx *c, return s; } - ref.type = EPOLL_TYPE_UDP; - ref.data = fref.data; - ref.fd = s; - flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT); - - rc = epoll_add(flow_epollfd(&uflow->f), EPOLLIN, ref); - if (rc < 0) { + if (flow_epoll_set(&uflow->f, EPOLL_CTL_ADD, EPOLLIN, s, sidei) < 0) { + rc = -errno; close(s); return rc; } @@ -109,6 +97,7 @@ static int udp_flow_sock(const struct ctx *c, flow_dbg_perror(uflow, "Couldn't connect flow socket"); return rc; } + uflow->s[sidei] = s; /* It's possible, if unlikely, that we could receive some packets in * between the bind() and connect() which may or may not be for this @@ -139,6 +128,7 @@ static int udp_flow_sock(const struct ctx *c, * udp_flow_new() - Common setup for a new UDP flow * @c: Execution context * @flow: Initiated flow + * @rule_hint: Index of forwarding rule, or -1 if unknown * @now: Timestamp * * Return: sidx for the target side of the new UDP flow, or FLOW_SIDX_NONE @@ -147,23 +137,25 @@ static int udp_flow_sock(const struct ctx *c, * #syscalls getsockname */ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, - const struct timespec *now) + int rule_hint, const struct timespec *now) { struct udp_flow *uflow = NULL; const struct flowside *tgt; unsigned sidei; - if (!(tgt = flow_target(c, flow, IPPROTO_UDP))) + if (!(tgt = flow_target(c, flow, rule_hint, IPPROTO_UDP))) goto cancel; uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); uflow->ts = now->tv_sec; uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1; uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0; + uflow->activity[INISIDE] = 1; + uflow->activity[TGTSIDE] = 0; flow_foreach_sidei(sidei) { if (pif_is_socket(uflow->f.pif[sidei])) - if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0) + if (udp_flow_sock(c, uflow, sidei) < 0) goto cancel; } @@ -216,6 +208,7 @@ cancel: * @dst: Our (local) 
address to which the datagram is arriving * @port: Our (local) port number to which the datagram is arriving * @s_in: Source socket address, filled in by recvmmsg() + * @rule_hint: Index of forwarding rule, or -1 if unknown * @now: Timestamp * * #syscalls fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64 @@ -226,7 +219,7 @@ cancel: flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, const union inany_addr *dst, in_port_t port, const union sockaddr_inany *s_in, - const struct timespec *now) + int rule_hint, const struct timespec *now) { const struct flowside *ini; struct udp_flow *uflow; @@ -235,7 +228,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port); if ((uflow = udp_at_sidx(sidx))) { - uflow->ts = now->tv_sec; + udp_flow_activity(uflow, sidx.sidei, now); return flow_sidx_opposite(sidx); } @@ -260,7 +253,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, return FLOW_SIDX_NONE; } - return udp_flow_new(c, flow, now); + return udp_flow_new(c, flow, rule_hint, now); } /** @@ -292,7 +285,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr, srcport, dstport); if ((uflow = udp_at_sidx(sidx))) { - uflow->ts = now->tv_sec; + udp_flow_activity(uflow, sidx.sidei, now); return flow_sidx_opposite(sidx); } @@ -316,7 +309,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, return FLOW_SIDX_NONE; } - return udp_flow_new(c, flow, now); + return udp_flow_new(c, flow, FWD_NO_HINT, now); } /** @@ -332,7 +325,7 @@ static void udp_flush_flow(const struct ctx *c, { /* We don't know exactly where the datagrams will come from, but we know * they'll have an interface and oport matching this flow */ - udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei], + udp_sock_fwd(c, uflow->s[sidei], -1, uflow->f.pif[sidei], uflow->f.side[sidei].oport, now); } @@ -369,9 +362,29 @@ bool udp_flow_defer(const struct ctx *c, 
struct udp_flow *uflow, bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, const struct timespec *now) { - if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT) + int timeout = c->udp.timeout; + + if (uflow->activity[TGTSIDE] && + (uflow->activity[INISIDE] > 1 || uflow->activity[TGTSIDE] > 1)) + timeout = c->udp.stream_timeout; + + if (now->tv_sec - uflow->ts <= timeout) return false; udp_flow_close(c, uflow); return true; } + +/** + * udp_flow_activity() - Track activity of a UDP flow + * @uflow: UDP flow + * @sidei: Side index of the flow (INISIDE or TGTSIDE) + * @now: Current timestamp + */ +void udp_flow_activity(struct udp_flow *uflow, unsigned int sidei, + const struct timespec *now) +{ + uflow->ts = now->tv_sec; + if (uflow->activity[sidei] < UINT8_MAX) + uflow->activity[sidei]++; +} @@ -7,6 +7,12 @@ #ifndef UDP_FLOW_H #define UDP_FLOW_H +#include <stdbool.h> +#include <stdint.h> +#include <netinet/in.h> + +#include "flow.h" + /** * struct udp_flow - Descriptor for a flow of UDP packets * @f: Generic flow information @@ -16,6 +22,7 @@ * @flush1: @s[1] may have datagrams queued for other flows * @ts: Activity timestamp * @s: Socket fd (or -1) for each side of the flow + * @activity: Packets seen from each side of the flow, up to UINT8_MAX */ struct udp_flow { /* Must be first element */ @@ -29,13 +36,14 @@ struct udp_flow { time_t ts; int s[SIDES]; + uint8_t activity[SIDES]; }; struct udp_flow *udp_at_sidx(flow_sidx_t sidx); flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, const union inany_addr *dst, in_port_t port, const union sockaddr_inany *s_in, - const struct timespec *now); + int rule_hint, const struct timespec *now); flow_sidx_t udp_flow_from_tap(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, @@ -46,5 +54,7 @@ bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow, const struct timespec *now); bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, const struct 
timespec *now); +void udp_flow_activity(struct udp_flow *uflow, unsigned int sidei, + const struct timespec *now); #endif /* UDP_FLOW_H */ diff --git a/udp_internal.h b/udp_internal.h index 96d11cf..64e4577 100644 --- a/udp_internal.h +++ b/udp_internal.h @@ -6,6 +6,9 @@ #ifndef UDP_INTERNAL_H #define UDP_INTERNAL_H +#include <netinet/in.h> +#include <netinet/udp.h> + #include "tap.h" /* needed by udp_meta_t */ /** @@ -26,9 +29,9 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen, bool no_udp_csum); size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, - const struct flowside *toside, size_t dlen, + const struct flowside *toside, size_t dlen, bool no_udp_csum); -void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, - in_port_t port, const struct timespec *now); +void udp_sock_fwd(const struct ctx *c, int s, int rule_hint, + uint8_t frompif, in_port_t port, const struct timespec *now); #endif /* UDP_INTERNAL_H */ @@ -37,17 +37,16 @@ static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE]; static struct vu_virtq_element elem [VIRTQUEUE_MAX_SIZE]; /** - * udp_vu_hdrlen() - return the size of the header in level 2 frame (UDP) + * udp_vu_hdrlen() - Sum size of all headers, from UDP to virtio-net * @v6: Set for IPv6 packet * - * Return: return the size of the header + * Return: total size of virtio-net, Ethernet, IP, and UDP headers */ static size_t udp_vu_hdrlen(bool v6) { size_t hdrlen; - hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) + - sizeof(struct ethhdr) + sizeof(struct udphdr); + hdrlen = VNET_HLEN + sizeof(struct ethhdr) + sizeof(struct udphdr); if (v6) hdrlen += sizeof(struct ipv6hdr); @@ -65,32 +64,40 @@ static size_t udp_vu_hdrlen(bool v6) * @v6: Set for IPv6 connections * @dlen: Size of received data (output) * - * Return: number of iov entries used to store the datagram + * Return: number of iov entries used to store the datagram, 0 if the datagram + * was discarded because 
the virtqueue is not ready, -1 on error */ static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s, bool v6, ssize_t *dlen) { const struct vu_dev *vdev = c->vdev; int iov_cnt, idx, iov_used; + size_t off, hdrlen, l2len; struct msghdr msg = { 0 }; - size_t off, hdrlen; ASSERT(!c->no_udp); + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + debug("Got UDP packet, but RX virtqueue not usable yet"); + + if (recvmsg(s, &msg, MSG_DONTWAIT) < 0) + debug_perror("Failed to discard datagram"); + + return 0; + } + /* compute L2 header length */ hdrlen = udp_vu_hdrlen(v6); vu_init_elem(elem, iov_vu, VIRTQUEUE_MAX_SIZE); iov_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, - IP_MAX_MTU + ETH_HLEN + - sizeof(struct virtio_net_hdr_mrg_rxbuf), - NULL); + IP_MAX_MTU + ETH_HLEN + VNET_HLEN, NULL); if (iov_cnt == 0) - return 0; + return -1; /* reserve space for the headers */ - ASSERT(iov_vu[0].iov_len >= hdrlen); + ASSERT(iov_vu[0].iov_len >= MAX(hdrlen, ETH_ZLEN + VNET_HLEN)); iov_vu[0].iov_base = (char *)iov_vu[0].iov_base + hdrlen; iov_vu[0].iov_len -= hdrlen; @@ -101,7 +108,7 @@ static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s, *dlen = recvmsg(s, &msg, 0); if (*dlen < 0) { vu_queue_rewind(vq, iov_cnt); - return 0; + return -1; } /* restore the pointer to the headers address */ @@ -116,7 +123,11 @@ static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s, iov_vu[idx].iov_len = off; iov_used = idx + !!off; - vu_set_vnethdr(vdev, iov_vu[0].iov_base, iov_used); + /* pad frame to 60 bytes: first buffer is at least ETH_ZLEN long */ + l2len = *dlen + hdrlen - VNET_HLEN; + vu_pad(&iov_vu[0], l2len); + + vu_set_vnethdr(iov_vu[0].iov_base, iov_used); /* release unused buffers */ vu_queue_rewind(vq, iov_cnt - iov_used); @@ -212,15 +223,16 @@ void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx) int iov_used; iov_used = udp_vu_sock_recv(c, vq, s, v6, &dlen); - if (iov_used <= 0) + if 
(iov_used < 0) break; - udp_vu_prepare(c, toside, dlen); - if (*c->pcap) { - udp_vu_csum(toside, iov_used); - pcap_iov(iov_vu, iov_used, - sizeof(struct virtio_net_hdr_mrg_rxbuf)); + if (iov_used > 0) { + udp_vu_prepare(c, toside, dlen); + if (*c->pcap) { + udp_vu_csum(toside, iov_used); + pcap_iov(iov_vu, iov_used, VNET_HLEN); + } + vu_flush(vdev, vq, elem, iov_used); } - vu_flush(vdev, vq, elem, iov_used); } } @@ -6,6 +6,8 @@ #ifndef UDP_VU_H #define UDP_VU_H +#include "flow.h" + void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, const struct timespec *now); void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx); @@ -25,6 +25,7 @@ #include <errno.h> #include <stdbool.h> #include <linux/errqueue.h> +#include <linux/in6.h> #include <getopt.h> #include "linux_dep.h" @@ -35,10 +36,14 @@ #include "log.h" #include "pcap.h" #include "epoll_ctl.h" +#include "pasta.h" #ifdef HAS_GETRANDOM #include <sys/random.h> #endif +/* Zero-filled buffer to pad 802.3 frames, up to 60 (ETH_ZLEN) bytes */ +uint8_t eth_pad[ETH_ZLEN] = { 0 }; + /** * sock_l4_() - Create and bind socket to socket address * @c: Execution context @@ -552,7 +557,7 @@ void pidfile_write(int fd, pid_t pid) if (write(fd, pid_buf, n) < 0) { perror("PID file write"); - _exit(EXIT_FAILURE); + passt_exit(EXIT_FAILURE); } close(fd); @@ -589,12 +594,12 @@ int __daemon(int pidfile_fd, int devnull_fd) if (pid == -1) { perror("fork"); - _exit(EXIT_FAILURE); + passt_exit(EXIT_FAILURE); } if (pid) { pidfile_write(pidfile_fd, pid); - _exit(EXIT_SUCCESS); + passt_exit(EXIT_SUCCESS); } if (setsid() < 0 || @@ -602,7 +607,7 @@ int __daemon(int pidfile_fd, int devnull_fd) dup2(devnull_fd, STDOUT_FILENO) < 0 || dup2(devnull_fd, STDERR_FILENO) < 0 || close(devnull_fd)) - _exit(EXIT_FAILURE); + passt_exit(EXIT_FAILURE); return 0; } @@ -611,6 +616,9 @@ int __daemon(int pidfile_fd, int devnull_fd) * fls() - Find last (most significant) bit set in word * @x: Word * + * Note: unlike ffs() and 
other implementations of fls(), notably the one from + * the Linux kernel, the starting position is 0 and not 1, that is, fls(1) = 0. + * * Return: position of most significant bit set, starting from 0, -1 if none */ int fls(unsigned long x) @@ -627,6 +635,17 @@ int fls(unsigned long x) } /** + * ilog2() - Integral part (floor) of binary logarithm (logarithm to the base 2) + * @x: Argument + * + * Return: integral part of binary logarithm of @x, -1 if undefined (if @x is 0) + */ +int ilog2(unsigned long x) +{ + return fls(x); +} + +/** * write_file() - Replace contents of file with a string * @path: File to write * @buf: String to write @@ -697,7 +716,7 @@ static ssize_t read_file(const char *path, char *buf, size_t buf_size) close(fd); - if (total_read == buf_size) { + if (total_read >= buf_size) { buf[buf_size - 1] = '\0'; return -ENOBUFS; } @@ -1141,7 +1160,7 @@ void raw_random(void *buf, size_t buflen) ret = getrandom((char *)buf + random_read, buflen - random_read, GRND_RANDOM); #else - ret = read(dev_random, (char *)buf + random_read, + ret = read(fd, (char *)buf + random_read, buflen - random_read); #endif @@ -1158,7 +1177,7 @@ void raw_random(void *buf, size_t buflen) } #ifndef HAS_GETRANDOM - close(dev_random); + close(fd); #endif if (random_read < buflen) @@ -1208,15 +1227,65 @@ void abort_with_msg(const char *fmt, ...) } /** - * fsync_pcap_and_log() - Flush pcap and log files as needed + * passt_exit() - Perform vital cleanup and exit + * + * We don't use exit(3) because on some C library versions it can do unexpected + * things that hit our seccomp profile (e.g. futex() calls). This is a bespoke + * wrapper around _exit(2) performing just the cleanup that we need. 
* * #syscalls fsync */ -void fsync_pcap_and_log(void) +void passt_exit(int status) { + /* Make sure we don't leave the pcap file truncated */ if (pcap_fd != -1 && fsync(pcap_fd)) warn_perror("Failed to flush pcap file, it might be truncated"); + /* Make sure we don't leave an incomplete log */ if (log_file != -1) (void)fsync(log_file); + + /* Make sure we don't leave any messages incomplete */ + (void)fflush(stderr); + (void)fflush(stdout); + + _exit(status); +} + +/** + * clamped_scale() - Scale @x from 100% to f% depending on @y's value + * @x: Value to scale + * @y: Value determining scaling + * @lo: Lower bound for @y (start of y-axis slope) + * @hi: Upper bound for @y (end of y-axis slope) + * @f: Scaling factor, percent (might be less or more than 100) + * + * Return: @x scaled by @f * linear interpolation of @y between @lo and @hi + * + * In pictures: + * + * f % -> ,---- * If @y < lo (for example, @y is y0), return @x + * /| | + * / | | * If @lo < @y < @hi (for example, @y is y1), + * / | | return @x scaled by a factor linearly + * (100 + f) / 2 % ->/ | | interpolated between 100% and f% depending on + * /| | | @y's position between @lo (100%) and @hi (f%) + * / | | | + * / | | | * If @y > @hi (for example, @y is y2), return + * 100 % -> -----' | | | @x * @f / 100 + * | | | | | + * y0 lo y1 hi y2 Example: @f = 150, @lo = 10, @hi = 20, @y = 15, + * @x = 1000 + * -> interpolated factor is 125% + * -> return 1250 + */ +long clamped_scale(long x, long y, long lo, long hi, long f) +{ + if (y < lo) + return x; + + if (y > hi) + return x * f / 100; + + return x - (x * (y - lo) / (hi - lo)) * (100 - f) / 100; } @@ -17,6 +17,7 @@ #include <arpa/inet.h> #include <unistd.h> #include <sys/syscall.h> +#include <net/ethernet.h> #include "log.h" @@ -152,6 +153,8 @@ void abort_with_msg(const char *fmt, ...) 
#define ntohll(x) (be64toh((x))) #define htonll(x) (htobe64((x))) +extern uint8_t eth_pad[ETH_ZLEN]; + /** * ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address * @p: Pointer to the BE value in memory @@ -230,6 +233,7 @@ int output_file_open(const char *path, int flags); void pidfile_write(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); +int ilog2(unsigned long x); int write_file(const char *path, const char *buf); intmax_t read_file_integer(const char *path, intmax_t fallback); int write_all_buf(int fd, const void *buf, size_t len); @@ -238,7 +242,7 @@ int read_all_buf(int fd, void *buf, size_t len); int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip); void close_open_files(int argc, char **argv); bool snprintf_check(char *str, size_t size, const char *format, ...); -void fsync_pcap_and_log(void); +long clamped_scale(long x, long y, long lo, long hi, long f); /** * af_name() - Return name of an address family diff --git a/vhost_user.c b/vhost_user.c index aa7c869..9fe1241 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -60,7 +60,7 @@ void vu_print_capabilities(void) info("{"); info(" \"type\": \"net\""); info("}"); - _exit(EXIT_SUCCESS); + passt_exit(EXIT_SUCCESS); } /** @@ -442,7 +442,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, for (i = 0; i < vdev->memory.nregions; i++) { struct vhost_user_memory_region *msg_region = &memory->regions[i]; struct vu_dev_region *dev_region = &vdev->memory.regions[i]; - void *mmap_addr; + const void *mmap_addr; debug("vhost-user region %d", i); debug(" guest_phys_addr: 0x%016"PRIx64, @@ -652,9 +652,9 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev, struct vhost_user_msg *vmsg) { /* We need to copy the payload to vhost_vring_addr structure - * to access index because address of vmsg->payload.addr - * can be unaligned as it is packed. 
- */ + * to access index because address of vmsg->payload.addr + * can be unaligned as it is packed. + */ struct vhost_vring_addr addr = vmsg->payload.addr; struct vu_virtq *vq = &vdev->vq[addr.index]; @@ -1202,7 +1202,7 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) !vdev->context->migrate_target) { if (vdev->context->migrate_exit) { info("Migration complete, exiting"); - _exit(EXIT_SUCCESS); + passt_exit(EXIT_SUCCESS); } info("Migration complete"); @@ -164,8 +164,8 @@ static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i) */ static inline uint16_t *virtq_used_event(const struct vu_virtq *vq) { - /* For backwards compat, used event index is at *end* of avail ring. */ - return &vq->vring.avail->ring[vq->vring.num]; + /* For backwards compat, used event index is at *end* of avail ring. */ + return &vq->vring.avail->ring[vq->vring.num]; } /** @@ -356,8 +356,8 @@ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) */ static inline uint16_t *virtq_avail_event(const struct vu_virtq *vq) { - /* For backwards compat, avail event index is at *end* of used ring. */ - return (uint16_t *)&vq->vring.used->ring[vq->vring.num]; + /* For backwards compat, avail event index is at *end* of used ring. */ + return (uint16_t *)&vq->vring.used->ring[vq->vring.num]; } /** @@ -15,6 +15,8 @@ /* Maximum size of a virtqueue */ #define VIRTQUEUE_MAX_SIZE 1024 +#define VNET_HLEN (sizeof(struct virtio_net_hdr_mrg_rxbuf)) + /** * struct vu_ring - Virtqueue rings * @num: Size of the queue diff --git a/vu_common.c b/vu_common.c index b13b7c3..5f2ce18 100644 --- a/vu_common.c +++ b/vu_common.c @@ -76,13 +76,13 @@ void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt * in the in_sg array. 
* @max_elem: Number of virtqueue elements in the array * @size: Maximum size of the data in the frame - * @frame_size: The total size of the buffers (output) + * @collected: Collected buffer length, up to @size, set on return * * Return: number of elements used to contain the frame */ int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq, struct vu_virtq_element *elem, int max_elem, - size_t size, size_t *frame_size) + size_t size, size_t *collected) { size_t current_size = 0; int elem_cnt = 0; @@ -113,25 +113,24 @@ int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq, break; } - if (frame_size) - *frame_size = current_size; + if (collected) + *collected = current_size; return elem_cnt; } /** * vu_set_vnethdr() - set virtio-net headers - * @vdev: vhost-user device * @vnethdr: Address of the header to set * @num_buffers: Number of guest buffers of the frame */ -void vu_set_vnethdr(const struct vu_dev *vdev, - struct virtio_net_hdr_mrg_rxbuf *vnethdr, - int num_buffers) +void vu_set_vnethdr(struct virtio_net_hdr_mrg_rxbuf *vnethdr, int num_buffers) { vnethdr->hdr = VU_HEADER; - if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) - vnethdr->num_buffers = htole16(num_buffers); + /* Note: if VIRTIO_NET_F_MRG_RXBUF is not negotiated, + * num_buffers must be 1 + */ + vnethdr->num_buffers = htole16(num_buffers); } /** @@ -261,7 +260,7 @@ int vu_send_single(const struct ctx *c, const void *buf, size_t size) vu_init_elem(elem, in_sg, VIRTQUEUE_MAX_SIZE); - size += sizeof(struct virtio_net_hdr_mrg_rxbuf); + size += VNET_HLEN; elem_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, size, &total); if (total < size) { debug("vu_send_single: no space to send the data " @@ -269,18 +268,15 @@ int vu_send_single(const struct ctx *c, const void *buf, size_t size) goto err; } - vu_set_vnethdr(vdev, in_sg[0].iov_base, elem_cnt); + vu_set_vnethdr(in_sg[0].iov_base, elem_cnt); - total -= sizeof(struct virtio_net_hdr_mrg_rxbuf); + total -= VNET_HLEN; /* copy data from the 
buffer to the iovec */ - iov_from_buf(in_sg, elem_cnt, sizeof(struct virtio_net_hdr_mrg_rxbuf), - buf, total); + iov_from_buf(in_sg, elem_cnt, VNET_HLEN, buf, total); - if (*c->pcap) { - pcap_iov(in_sg, elem_cnt, - sizeof(struct virtio_net_hdr_mrg_rxbuf)); - } + if (*c->pcap) + pcap_iov(in_sg, elem_cnt, VNET_HLEN); vu_flush(vdev, vq, elem, elem_cnt); @@ -293,3 +289,17 @@ err: return -1; } + +/** + * vu_pad() - Pad 802.3 frame to minimum length (60 bytes) if needed + * @iov: Buffer in iovec array where end of 802.3 frame is stored + * @l2len: Layer-2 length already filled in frame + */ +void vu_pad(struct iovec *iov, size_t l2len) +{ + if (l2len >= ETH_ZLEN) + return; + + memset((char *)iov->iov_base + iov->iov_len, 0, ETH_ZLEN - l2len); + iov->iov_len += ETH_ZLEN - l2len; +} diff --git a/vu_common.h b/vu_common.h index f538f23..865d977 100644 --- a/vu_common.h +++ b/vu_common.h @@ -7,11 +7,17 @@ #ifndef VU_COMMON_H #define VU_COMMON_H + +#include <stddef.h> + #include <linux/virtio_net.h> +#include "ip.h" +#include "virtio.h" + static inline void *vu_eth(void *base) { - return ((char *)base + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + return ((char *)base + VNET_HLEN); } static inline void *vu_ip(void *base) @@ -48,14 +54,13 @@ void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt); int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq, struct vu_virtq_element *elem, int max_elem, size_t size, - size_t *frame_size); -void vu_set_vnethdr(const struct vu_dev *vdev, - struct virtio_net_hdr_mrg_rxbuf *vnethdr, - int num_buffers); + size_t *collected); +void vu_set_vnethdr(struct virtio_net_hdr_mrg_rxbuf *vnethdr, int num_buffers); void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, struct vu_virtq_element *elem, int elem_cnt); void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, const struct timespec *now); int vu_send_single(const struct ctx *c, const void *buf, size_t size); +void vu_pad(struct iovec *iov, size_t 
l2len); #endif /* VU_COMMON_H */ |
