diff options
-rw-r--r-- | Makefile | 3 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | arp.c | 86 | ||||
-rw-r--r-- | arp.h | 2 | ||||
-rw-r--r-- | checksum.c | 8 | ||||
-rw-r--r-- | conf.c | 435 | ||||
-rw-r--r-- | conf.h | 1 | ||||
-rw-r--r-- | contrib/fedora/passt.spec | 42 | ||||
-rw-r--r-- | contrib/selinux/passt-repair.te | 16 | ||||
-rw-r--r-- | contrib/selinux/passt.te | 8 | ||||
-rw-r--r-- | contrib/selinux/pasta.fc | 10 | ||||
-rw-r--r-- | contrib/selinux/pasta.te | 48 | ||||
-rw-r--r-- | dhcp.c | 48 | ||||
-rw-r--r-- | dhcp.h | 2 | ||||
-rw-r--r-- | dhcpv6.c | 227 | ||||
-rw-r--r-- | dhcpv6.h | 2 | ||||
-rw-r--r-- | doc/platform-requirements/.gitignore | 1 | ||||
-rw-r--r-- | doc/platform-requirements/Makefile | 4 | ||||
-rw-r--r-- | doc/platform-requirements/common.h | 1 | ||||
-rw-r--r-- | doc/platform-requirements/listen-vs-repair.c | 128 | ||||
-rw-r--r-- | doc/platform-requirements/reuseaddr-priority.c | 6 | ||||
-rw-r--r-- | epoll_type.h | 4 | ||||
-rw-r--r-- | flow.c | 186 | ||||
-rw-r--r-- | flow.h | 3 | ||||
-rw-r--r-- | flow_table.h | 4 | ||||
-rw-r--r-- | fwd.c | 89 | ||||
-rw-r--r-- | fwd.h | 5 | ||||
-rw-r--r-- | icmp.c | 42 | ||||
-rw-r--r-- | icmp.h | 2 | ||||
-rw-r--r-- | inany.c | 4 | ||||
-rw-r--r-- | inany.h | 27 | ||||
-rw-r--r-- | iov.c | 133 | ||||
-rw-r--r-- | iov.h | 58 | ||||
-rw-r--r-- | ip.c | 33 | ||||
-rw-r--r-- | ip.h | 5 | ||||
-rw-r--r-- | isolation.c | 8 | ||||
-rw-r--r-- | lineread.c | 2 | ||||
-rw-r--r-- | linux_dep.h | 6 | ||||
-rw-r--r-- | log.c | 8 | ||||
-rw-r--r-- | log.h | 1 | ||||
-rw-r--r-- | migrate.c | 10 | ||||
-rw-r--r-- | ndp.c | 18 | ||||
-rw-r--r-- | ndp.h | 4 | ||||
-rw-r--r-- | netlink.c | 3 | ||||
-rw-r--r-- | packet.c | 162 | ||||
-rw-r--r-- | packet.h | 47 | ||||
-rw-r--r-- | passt-repair.1 | 6 | ||||
-rw-r--r-- | passt-repair.c | 108 | ||||
-rw-r--r-- | passt.1 | 29 | ||||
-rw-r--r-- | passt.c | 23 | ||||
-rw-r--r-- | passt.h | 13 | ||||
-rw-r--r-- | pasta.c | 26 | ||||
-rw-r--r-- | pcap.c | 56 | ||||
-rw-r--r-- | pcap.h | 2 | ||||
-rw-r--r-- | repair.c | 60 | ||||
-rw-r--r-- | repair.h | 3 | ||||
-rw-r--r-- | siphash.h | 2 | ||||
-rw-r--r-- | tap.c | 215 | ||||
-rw-r--r-- | tap.h | 35 | ||||
-rw-r--r-- | tcp.c | 654 | ||||
-rw-r--r-- | tcp.h | 1 | ||||
-rw-r--r-- | tcp_buf.c | 38 | ||||
-rw-r--r-- | tcp_conn.h | 5 | ||||
-rw-r--r-- | tcp_internal.h | 21 | ||||
-rw-r--r-- | tcp_splice.c | 47 | ||||
-rw-r--r-- | tcp_vu.c | 35 | ||||
-rw-r--r-- | test/.gitignore | 2 | ||||
-rw-r--r-- | test/Makefile | 29 | ||||
-rw-r--r-- | test/build/all | 61 | ||||
-rwxr-xr-x | test/build/build.py | 109 | ||||
-rw-r--r-- | test/build/clang_tidy | 17 | ||||
-rw-r--r-- | test/build/cppcheck | 17 | ||||
-rwxr-xr-x | test/build/static_checkers.sh | 26 | ||||
-rw-r--r-- | test/lib/exeter | 58 | ||||
-rwxr-xr-x | test/lib/setup | 4 | ||||
-rwxr-xr-x | test/lib/term | 7 | ||||
-rwxr-xr-x | test/lib/test | 7 | ||||
-rwxr-xr-x | test/passt.mbuto | 5 | ||||
-rw-r--r-- | test/pasta_options/log_to_file | 10 | ||||
-rwxr-xr-x | test/run | 24 | ||||
-rwxr-xr-x | test/smoke/smoke.sh | 33 | ||||
-rw-r--r-- | udp.c | 675 | ||||
-rw-r--r-- | udp.h | 7 | ||||
-rw-r--r-- | udp_flow.c | 232 | ||||
-rw-r--r-- | udp_flow.h | 18 | ||||
-rw-r--r-- | udp_internal.h | 6 | ||||
-rw-r--r-- | udp_vu.c | 145 | ||||
-rw-r--r-- | udp_vu.h | 8 | ||||
-rw-r--r-- | util.c | 65 | ||||
-rw-r--r-- | util.h | 43 | ||||
-rw-r--r-- | vhost_user.c | 316 | ||||
-rw-r--r-- | vhost_user.h | 6 | ||||
-rw-r--r-- | virtio.c | 34 | ||||
-rw-r--r-- | virtio.h | 32 | ||||
-rw-r--r-- | vu_common.c | 47 |
95 files changed, 3467 insertions, 1899 deletions
@@ -20,6 +20,7 @@ $(if $(TARGET),,$(error Failed to get target architecture)) # Get 'uname -m'-like architecture description for target TARGET_ARCH := $(firstword $(subst -, ,$(TARGET))) TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH)) +TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH)) TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH)) # On some systems enabling optimization also enables source fortification, @@ -29,7 +30,7 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > / FORTIFY_FLAG := -D_FORTIFY_SOURCE=2 endif -FLAGS := -Wall -Wextra -Wno-format-zero-length +FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) @@ -291,7 +291,7 @@ speeding up local connections, and usually requiring NAT. _pasta_: * ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted) * ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached * ✅ no external dependencies (other than a standard C library) -* ✅ restrictive seccomp profiles (30 syscalls allowed for _passt_, 41 for +* ✅ restrictive seccomp profiles (33 syscalls allowed for _passt_, 43 for _pasta_ on x86_64) * ✅ examples of [AppArmor](/passt/tree/contrib/apparmor) and [SELinux](/passt/tree/contrib/selinux) profiles available @@ -31,56 +31,84 @@ #include "tap.h" /** - * arp() - Check if this is a supported ARP message, reply as needed + * ignore_arp() - Check if we should ignore this ARP message * @c: Execution context - * @p: Packet pool, single packet with Ethernet buffer + * @ah: ARP header + * @am: ARP message * - * Return: 1 if handled, -1 on failure + * Return: true if the ARP message should be ignored, false otherwise */ -int arp(const struct ctx *c, const struct pool *p) +static bool ignore_arp(const struct ctx *c, + const struct arphdr *ah, const struct arpmsg *am) { - unsigned 
char swap[4]; - struct ethhdr *eh; - struct arphdr *ah; - struct arpmsg *am; - size_t l2len; - - eh = packet_get(p, 0, 0, sizeof(*eh), NULL); - ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL); - am = packet_get(p, 0, sizeof(*eh) + sizeof(*ah), sizeof(*am), NULL); - - if (!eh || !ah || !am) - return -1; - if (ah->ar_hrd != htons(ARPHRD_ETHER) || ah->ar_pro != htons(ETH_P_IP) || ah->ar_hln != ETH_ALEN || ah->ar_pln != 4 || ah->ar_op != htons(ARPOP_REQUEST)) - return 1; + return true; /* Discard announcements, but not 0.0.0.0 "probes" */ if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) && !memcmp(am->sip, am->tip, sizeof(am->sip))) - return 1; + return true; /* Don't resolve the guest's assigned address, either. */ if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip))) + return true; + + return false; +} + +/** + * arp() - Check if this is a supported ARP message, reply as needed + * @c: Execution context + * @data: Single packet with Ethernet buffer + * + * Return: 1 if handled, -1 on failure + */ +int arp(const struct ctx *c, struct iov_tail *data) +{ + struct { + struct ethhdr eh; + struct arphdr ah; + struct arpmsg am; + } __attribute__((__packed__)) resp; + struct arphdr ah_storage; + struct ethhdr eh_storage; + struct arpmsg am_storage; + const struct ethhdr *eh; + const struct arphdr *ah; + const struct arpmsg *am; + + eh = IOV_REMOVE_HEADER(data, eh_storage); + ah = IOV_REMOVE_HEADER(data, ah_storage); + am = IOV_REMOVE_HEADER(data, am_storage); + if (!eh || !ah || !am) + return -1; + + if (ignore_arp(c, ah, am)) return 1; - ah->ar_op = htons(ARPOP_REPLY); - memcpy(am->tha, am->sha, sizeof(am->tha)); - memcpy(am->sha, c->our_tap_mac, sizeof(am->sha)); + /* Ethernet header */ + resp.eh.h_proto = htons(ETH_P_ARP); + memcpy(resp.eh.h_dest, eh->h_source, sizeof(resp.eh.h_dest)); + memcpy(resp.eh.h_source, c->our_tap_mac, sizeof(resp.eh.h_source)); - memcpy(swap, am->tip, sizeof(am->tip)); - memcpy(am->tip, am->sip, sizeof(am->tip)); - memcpy(am->sip, swap, 
sizeof(am->sip)); + /* ARP header */ + resp.ah.ar_op = htons(ARPOP_REPLY); + resp.ah.ar_hrd = ah->ar_hrd; + resp.ah.ar_pro = ah->ar_pro; + resp.ah.ar_hln = ah->ar_hln; + resp.ah.ar_pln = ah->ar_pln; - l2len = sizeof(*eh) + sizeof(*ah) + sizeof(*am); - memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest)); - memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); + /* ARP message */ + memcpy(resp.am.sha, c->our_tap_mac, sizeof(resp.am.sha)); + memcpy(resp.am.sip, am->tip, sizeof(resp.am.sip)); + memcpy(resp.am.tha, am->sha, sizeof(resp.am.tha)); + memcpy(resp.am.tip, am->sip, sizeof(resp.am.tip)); - tap_send_single(c, eh, l2len); + tap_send_single(c, &resp, sizeof(resp)); return 1; } @@ -20,6 +20,6 @@ struct arpmsg { unsigned char tip[4]; } __attribute__((__packed__)); -int arp(const struct ctx *c, const struct pool *p); +int arp(const struct ctx *c, struct iov_tail *data); #endif /* ARP_H */ @@ -145,7 +145,7 @@ uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol, * @proto: Protocol number * @saddr: Source address * @daddr: Destination address - * Returns: Partial checksum of the IPv4 header + * Return: partial checksum of the IPv4 header */ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol, struct in_addr saddr, struct in_addr daddr) @@ -225,7 +225,7 @@ void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen) * @proto: Protocol number * @saddr: Source address * @daddr: Destination address - * Returns: Partial checksum of the IPv6 header + * Return: partial checksum of the IPv6 header */ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol, const struct in6_addr *saddr, @@ -452,7 +452,7 @@ less_than_128_bytes: } /** - * csum_unfolded - Calculate the unfolded checksum of a data buffer. + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. 
* * @buf: Input buffer * @len: Input length @@ -481,7 +481,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) } #else /* __AVX2__ */ /** - * csum_unfolded - Calculate the unfolded checksum of a data buffer. + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. * * @buf: Input buffer * @len: Input length @@ -16,6 +16,7 @@ #include <errno.h> #include <fcntl.h> #include <getopt.h> +#include <libgen.h> #include <string.h> #include <sched.h> #include <sys/types.h> @@ -64,11 +65,11 @@ const char *pasta_default_ifn = "tap0"; /** - * next_chunk - Return the next piece of a string delimited by a character + * next_chunk() - Return the next piece of a string delimited by a character * @s: String to search * @c: Delimiter character * - * Return: If another @c is found in @s, returns a pointer to the + * Return: if another @c is found in @s, returns a pointer to the * character *after* the delimiter, if no further @c is in @s, * return NULL */ @@ -79,7 +80,7 @@ static char *next_chunk(const char *s, char c) } /** - * port_range - Represents a non-empty range of ports + * port_range() - Represents a non-empty range of ports * @first: First port number in the range * @last: Last port number in the range (inclusive) * @@ -124,6 +125,75 @@ static int parse_port_range(const char *s, char **endptr, } /** + * conf_ports_range_except() - Set up forwarding for a range of ports minus a + * bitmap of exclusions + * @c: Execution context + * @optname: Short option name, t, T, u, or U + * @optarg: Option argument (port specification) + * @fwd: Pointer to @fwd_ports to be updated + * @addr: Listening address + * @ifname: Listening interface + * @first: First port to forward + * @last: Last port to forward + * @exclude: Bitmap of ports to exclude + * @to: Port to translate @first to when forwarding + * @weak: Ignore errors, as long as at least one port is mapped + */ +static void conf_ports_range_except(const struct ctx *c, char optname, + const char 
*optarg, struct fwd_ports *fwd, + const union inany_addr *addr, + const char *ifname, + uint16_t first, uint16_t last, + const uint8_t *exclude, uint16_t to, + bool weak) +{ + bool bound_one = false; + unsigned i; + int ret; + + if (first == 0) { + die("Can't forward port 0 for option '-%c %s'", + optname, optarg); + } + + for (i = first; i <= last; i++) { + if (bitmap_isset(exclude, i)) + continue; + + if (bitmap_isset(fwd->map, i)) { + warn( +"Altering mapping of already mapped port number: %s", optarg); + } + + bitmap_set(fwd->map, i); + fwd->delta[i] = to - first; + + if (optname == 't') + ret = tcp_sock_init(c, addr, ifname, i); + else if (optname == 'u') + ret = udp_sock_init(c, 0, addr, ifname, i); + else + /* No way to check in advance for -T and -U */ + ret = 0; + + if (ret == -ENFILE || ret == -EMFILE) { + die("Can't open enough sockets for port specifier: %s", + optarg); + } + + if (!ret) { + bound_one = true; + } else if (!weak) { + die("Failed to bind port %u (%s) for option '-%c %s'", + i, strerror_(-ret), optname, optarg); + } + } + + if (!bound_one) + die("Failed to bind any port for '-%c %s'", optname, optarg); +} + +/** * conf_ports() - Parse port configuration options, initialise UDP/TCP sockets * @c: Execution context * @optname: Short option name, t, T, u, or U @@ -135,10 +205,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, { union inany_addr addr_buf = inany_any6, *addr = &addr_buf; char buf[BUFSIZ], *spec, *ifname = NULL, *p; - bool exclude_only = true, bound_one = false; uint8_t exclude[PORT_BITMAP_SIZE] = { 0 }; + bool exclude_only = true; unsigned i; - int ret; if (!strcmp(optarg, "none")) { if (fwd->mode) @@ -173,32 +242,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, fwd->mode = FWD_ALL; - /* Skip port 0. It has special meaning for many socket APIs, so - * trying to bind it is not really safe. 
- */ - for (i = 1; i < NUM_PORTS; i++) { + /* Exclude ephemeral ports */ + for (i = 0; i < NUM_PORTS; i++) if (fwd_port_is_ephemeral(i)) - continue; - - bitmap_set(fwd->map, i); - if (optname == 't') { - ret = tcp_sock_init(c, NULL, NULL, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } else if (optname == 'u') { - ret = udp_sock_init(c, 0, NULL, NULL, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } - } - - if (!bound_one) - goto bind_all_fail; + bitmap_set(exclude, i); + conf_ports_range_except(c, optname, optarg, fwd, + NULL, NULL, + 1, NUM_PORTS - 1, exclude, + 1, true); return; } @@ -275,37 +327,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, } while ((p = next_chunk(p, ','))); if (exclude_only) { - /* Skip port 0. It has special meaning for many socket APIs, so - * trying to bind it is not really safe. - */ - for (i = 1; i < NUM_PORTS; i++) { - if (fwd_port_is_ephemeral(i) || - bitmap_isset(exclude, i)) - continue; - - bitmap_set(fwd->map, i); - - if (optname == 't') { - ret = tcp_sock_init(c, addr, ifname, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } else if (optname == 'u') { - ret = udp_sock_init(c, 0, addr, ifname, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } else { - /* No way to check in advance for -T and -U */ - bound_one = true; - } - } - - if (!bound_one) - goto bind_all_fail; + /* Exclude ephemeral ports */ + for (i = 0; i < NUM_PORTS; i++) + if (fwd_port_is_ephemeral(i)) + bitmap_set(exclude, i); + conf_ports_range_except(c, optname, optarg, fwd, + addr, ifname, + 1, NUM_PORTS - 1, exclude, + 1, true); return; } @@ -334,40 +364,18 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, if ((*p != '\0') && (*p != ',')) /* Garbage after the ranges */ goto bad; - for (i = orig_range.first; i <= 
orig_range.last; i++) { - if (bitmap_isset(fwd->map, i)) - warn( -"Altering mapping of already mapped port number: %s", optarg); - - if (bitmap_isset(exclude, i)) - continue; - - bitmap_set(fwd->map, i); - - fwd->delta[i] = mapped_range.first - orig_range.first; - - ret = 0; - if (optname == 't') - ret = tcp_sock_init(c, addr, ifname, i); - else if (optname == 'u') - ret = udp_sock_init(c, 0, addr, ifname, i); - if (ret) - goto bind_fail; - } + conf_ports_range_except(c, optname, optarg, fwd, + addr, ifname, + orig_range.first, orig_range.last, + exclude, + mapped_range.first, false); } while ((p = next_chunk(p, ','))); return; -enfile: - die("Can't open enough sockets for port specifier: %s", optarg); bad: die("Invalid port specifier %s", optarg); mode_conflict: die("Port forwarding mode '%s' conflicts with previous mode", optarg); -bind_fail: - die("Failed to bind port %u (%s) for option '-%c %s', exiting", - i, strerror_(-ret), optname, optarg); -bind_all_fail: - die("Failed to bind any port for '-%c %s', exiting", optname, optarg); } /** @@ -376,7 +384,7 @@ bind_all_fail: * @addr: Guest nameserver IPv4 address * @idx: Index of free entry in array of IPv4 resolvers * - * Return: Number of entries added (0 or 1) + * Return: number of entries added (0 or 1) */ static unsigned add_dns4(struct ctx *c, const struct in_addr *addr, unsigned idx) @@ -394,7 +402,7 @@ static unsigned add_dns4(struct ctx *c, const struct in_addr *addr, * @addr: Guest nameserver IPv6 address * @idx: Index of free entry in array of IPv6 resolvers * - * Return: Number of entries added (0 or 1) + * Return: number of entries added (0 or 1) */ static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr, unsigned idx) @@ -407,6 +415,76 @@ static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr, } /** + * add_dns_resolv4() - Possibly add one IPv4 nameserver from host's resolv.conf + * @c: Execution context + * @ns: Nameserver address + * @idx: Pointer to index of current IPv4 
resolver entry, set on return + */ +static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx) +{ + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)) + c->ip4.dns_host = *ns; + + /* Special handling if guest or container can only access local + * addresses via redirect, or if the host gateway is also a resolver and + * we shadow its address + */ + if (IN4_IS_ADDR_LOOPBACK(ns) || + IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) + return; /* Address unreachable */ + + *ns = c->ip4.map_host_loopback; + c->ip4.dns_match = c->ip4.map_host_loopback; + } else { + /* No general host mapping, but requested for DNS + * (--dns-forward and --no-map-gw): advertise resolver + * address from --dns-forward, and map that to loopback + */ + *ns = c->ip4.dns_match; + } + } + + *idx += add_dns4(c, ns, *idx); +} + +/** + * add_dns_resolv6() - Possibly add one IPv6 nameserver from host's resolv.conf + * @c: Execution context + * @ns: Nameserver address + * @idx: Pointer to index of current IPv6 resolver entry, set on return + */ +static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx) +{ + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) + c->ip6.dns_host = *ns; + + /* Special handling if guest or container can only access local + * addresses via redirect, or if the host gateway is also a resolver and + * we shadow its address + */ + if (IN6_IS_ADDR_LOOPBACK(ns) || + IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) + return; /* Address unreachable */ + + *ns = c->ip6.map_host_loopback; + c->ip6.dns_match = c->ip6.map_host_loopback; + } else { + /* No general host mapping, but requested for DNS + * (--dns-forward and --no-map-gw): advertise resolver + * address from --dns-forward, and map that to loopback + */ + *ns = 
c->ip6.dns_match; + } + } + + *idx += add_dns6(c, ns, *idx); +} + +/** * add_dns_resolv() - Possibly add ns from host resolv.conf to configuration * @c: Execution context * @nameserver: Nameserver address string from /etc/resolv.conf @@ -422,48 +500,11 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver, struct in6_addr ns6; struct in_addr ns4; - if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) { - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)) - c->ip4.dns_host = ns4; + if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) + add_dns_resolv4(c, &ns4, idx4); - /* Special handling if guest or container can only access local - * addresses via redirect, or if the host gateway is also a - * resolver and we shadow its address - */ - if (IN4_IS_ADDR_LOOPBACK(&ns4) || - IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) { - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) - return; - - ns4 = c->ip4.map_host_loopback; - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) - c->ip4.dns_match = c->ip4.map_host_loopback; - } - - *idx4 += add_dns4(c, &ns4, *idx4); - } - - if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) { - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) - c->ip6.dns_host = ns6; - - /* Special handling if guest or container can only access local - * addresses via redirect, or if the host gateway is also a - * resolver and we shadow its address - */ - if (IN6_IS_ADDR_LOOPBACK(&ns6) || - IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) { - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) - return; - - ns6 = c->ip6.map_host_loopback; - - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) - c->ip6.dns_match = c->ip6.map_host_loopback; - } - - *idx6 += add_dns6(c, &ns6, *idx6); - } + if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) + add_dns_resolv6(c, &ns6, idx6); } /** @@ -615,7 +656,7 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns, /** conf_ip4_prefix() - Parse an IPv4 prefix length or netmask * @arg: Netmask 
in dotted decimal or prefix length * - * Return: Validated prefix length on success, -1 on failure + * Return: validated prefix length on success, -1 on failure */ static int conf_ip4_prefix(const char *arg) { @@ -642,7 +683,7 @@ static int conf_ip4_prefix(const char *arg) * @ifi: Host interface to attempt (0 to determine one) * @ip4: IPv4 context (will be written) * - * Return: Interface index for IPv4, or 0 on failure. + * Return: interface index for IPv4, or 0 on failure. */ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4) { @@ -714,7 +755,7 @@ static void conf_ip4_local(struct ip4_ctx *ip4) * @ifi: Host interface to attempt (0 to determine one) * @ip6: IPv6 context (will be written) * - * Return: Interface index for IPv6, or 0 on failure. + * Return: interface index for IPv6, or 0 on failure. */ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6) { @@ -823,6 +864,14 @@ static void usage(const char *name, FILE *f, int status) FPRINTF(f, " --repair-path PATH path for passt-repair(1)\n" " default: append '.repair' to UNIX domain path\n"); + FPRINTF(f, + " --migrate-exit DEPRECATED:\n" + " source quits after migration\n" + " default: source keeps running after migration\n"); + FPRINTF(f, + " --migrate-no-linger DEPRECATED:\n" + " close sockets on migration\n" + " default: keep sockets open, ignore events\n"); } FPRINTF(f, @@ -934,6 +983,7 @@ static void usage(const char *name, FILE *f, int status) " SPEC is as described for TCP above\n" " default: none\n"); + (void)fflush(f); _exit(status); pasta_opts: @@ -988,10 +1038,50 @@ pasta_opts: " --ns-mac-addr ADDR Set MAC address on tap interface\n" " --no-splice Disable inbound socket splicing\n"); + (void)fflush(f); _exit(status); } /** + * conf_mode() - Determine passt/pasta's operating mode from command line + * @argc: Argument count + * @argv: Command line arguments + * + * Return: mode to operate in, PASTA or PASST + */ +enum passt_modes conf_mode(int argc, char *argv[]) +{ + int 
vhost_user = 0; + const struct option optvu[] = { + {"vhost-user", no_argument, &vhost_user, 1 }, + { 0 }, + }; + char argv0[PATH_MAX], *basearg0; + int name; + + optind = 0; + do { + name = getopt_long(argc, argv, "-:", optvu, NULL); + } while (name != -1); + + if (vhost_user) + return MODE_VU; + + if (argc < 1) + die("Cannot determine argv[0]"); + + strncpy(argv0, argv[0], PATH_MAX - 1); + basearg0 = basename(argv0); + if (strstr(basearg0, "pasta")) + return MODE_PASTA; + + if (strstr(basearg0, "passt")) + return MODE_PASST; + + die("Cannot determine mode, invoke as \"passt\" or \"pasta\""); +} + +/** * conf_print() - Print fundamental configuration parameters * @c: Execution context */ @@ -1225,6 +1315,8 @@ static void conf_nat(const char *arg, struct in_addr *addr4, *addr6 = in6addr_any; if (no_map_gw) *no_map_gw = 1; + + return; } if (inet_pton(AF_INET6, arg, addr6) && @@ -1276,7 +1368,7 @@ static void conf_open_files(struct ctx *c) } /** - * parse_mac - Parse a MAC address from a string + * parse_mac() - Parse a MAC address from a string * @mac: Binary MAC address, initialised on success * @str: String to parse * @@ -1386,18 +1478,21 @@ void conf(struct ctx *c, int argc, char **argv) {"socket-path", required_argument, NULL, 's' }, {"fqdn", required_argument, NULL, 27 }, {"repair-path", required_argument, NULL, 28 }, + {"migrate-exit", no_argument, NULL, 29 }, + {"migrate-no-linger", no_argument, NULL, 30 }, { 0 }, }; + const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:"; const char *logname = (c->mode == MODE_PASTA) ? 
"pasta" : "passt"; char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; bool copy_addrs_opt = false, copy_routes_opt = false; enum fwd_ports_mode fwd_default = FWD_NONE; bool v4_only = false, v6_only = false; unsigned dns4_idx = 0, dns6_idx = 0; + unsigned long max_mtu = IP_MAX_MTU; struct fqdn *dnss = c->dns_search; unsigned int ifi4 = 0, ifi6 = 0; const char *logfile = NULL; - const char *optstring; size_t logsize = 0; char *runas = NULL; long fd_tap_opt; @@ -1408,12 +1503,11 @@ void conf(struct ctx *c, int argc, char **argv) if (c->mode == MODE_PASTA) { c->no_dhcp_dns = c->no_dhcp_dns_search = 1; fwd_default = FWD_AUTO; - optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:"; - } else { - optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:"; } - c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)); + if (tap_l2_max_len(c) - ETH_HLEN < max_mtu) + max_mtu = tap_l2_max_len(c) - ETH_HLEN; + c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t)); c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET; c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET; memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN); @@ -1512,6 +1606,7 @@ void conf(struct ctx *c, int argc, char **argv) FPRINTF(stdout, c->mode == MODE_PASTA ? 
"pasta " : "passt "); FPRINTF(stdout, VERSION_BLOB); + (void)fflush(stdout); _exit(EXIT_SUCCESS); case 15: ret = snprintf(c->ip4.ifname_out, @@ -1581,9 +1676,8 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid host nameserver address: %s", optarg); case 25: - if (c->mode == MODE_PASTA) - die("--vhost-user is for passt mode only"); - c->mode = MODE_VU; + /* Already handled in conf_mode() */ + ASSERT(c->mode == MODE_VU); break; case 26: vu_print_capabilities(); @@ -1594,7 +1688,26 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid FQDN: %s", optarg); break; case 28: - /* Handle this once we checked --vhost-user */ + if (c->mode != MODE_VU && strcmp(optarg, "none")) + die("--repair-path is for vhost-user mode only"); + + if (snprintf_check(c->repair_path, + sizeof(c->repair_path), "%s", + optarg)) + die("Invalid passt-repair path: %s", optarg); + + break; + case 29: + if (c->mode != MODE_VU) + die("--migrate-exit is for vhost-user mode only"); + c->migrate_exit = true; + + break; + case 30: + if (c->mode != MODE_VU) + die("--migrate-no-linger is for vhost-user mode only"); + c->migrate_no_linger = true; + break; case 'd': c->debug = 1; @@ -1614,6 +1727,9 @@ void conf(struct ctx *c, int argc, char **argv) c->foreground = 1; break; case 's': + if (c->mode == MODE_PASTA) + die("-s is for passt / vhost-user mode only"); + ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->sock_path)) @@ -1626,7 +1742,8 @@ void conf(struct ctx *c, int argc, char **argv) fd_tap_opt = strtol(optarg, NULL, 0); if (errno || - fd_tap_opt <= STDERR_FILENO || fd_tap_opt > INT_MAX) + (fd_tap_opt != STDIN_FILENO && fd_tap_opt <= STDERR_FILENO) || + fd_tap_opt > INT_MAX) die("Invalid --fd: %s", optarg); c->fd_tap = fd_tap_opt; @@ -1634,6 +1751,9 @@ void conf(struct ctx *c, int argc, char **argv) *c->sock_path = 0; break; case 'I': + if (c->mode != MODE_PASTA) + die("-I is for pasta mode only"); + ret = 
snprintf(c->pasta_ifn, IFNAMSIZ, "%s", optarg); if (ret <= 0 || ret >= IFNAMSIZ) @@ -1663,9 +1783,9 @@ void conf(struct ctx *c, int argc, char **argv) if (errno || *e) die("Invalid MTU: %s", optarg); - if (mtu > ETH_MAX_MTU) { - die("MTU %lu too large (max %u)", - mtu, ETH_MAX_MTU); + if (mtu > max_mtu) { + die("MTU %lu too large (max %lu)", + mtu, max_mtu); } c->mtu = mtu; @@ -1790,11 +1910,16 @@ void conf(struct ctx *c, int argc, char **argv) break; case 't': case 'u': - case 'T': - case 'U': case 'D': /* Handle these later, once addresses are configured */ break; + case 'T': + case 'U': + if (c->mode != MODE_PASTA) + die("-%c is for pasta mode only", name); + + /* Handle properly later, once addresses are configured */ + break; case 'h': usage(argv[0], stdout, EXIT_SUCCESS); break; @@ -1883,8 +2008,8 @@ void conf(struct ctx *c, int argc, char **argv) if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw)) c->no_dhcp = 1; - /* Inbound port options, DNS, and --repair-path can be parsed now, after - * IPv4/IPv6 settings and --vhost-user. 
+ /* Inbound port options and DNS can be parsed now, after IPv4/IPv6 + * settings */ fwd_probe_ephemeral(); udp_portmap_clear(); @@ -1930,16 +2055,6 @@ void conf(struct ctx *c, int argc, char **argv) } die("Cannot use DNS address %s", optarg); - } else if (name == 28) { - if (c->mode != MODE_VU && strcmp(optarg, "none")) - die("--repair-path is for vhost-user mode only"); - - if (snprintf_check(c->repair_path, - sizeof(c->repair_path), "%s", - optarg)) - die("Invalid passt-repair path: %s", optarg); - - break; } } while (name != -1); @@ -6,6 +6,7 @@ #ifndef CONF_H #define CONF_H +enum passt_modes conf_mode(int argc, char *argv[]); void conf(struct ctx *c, int argc, char **argv); #endif /* CONF_H */ diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec index 745cf01..663289f 100644 --- a/contrib/fedora/passt.spec +++ b/contrib/fedora/passt.spec @@ -9,6 +9,7 @@ %global git_hash {{{ git_head }}} %global selinuxtype targeted +%global selinux_policy_version 41.41 Name: passt Version: {{{ git_version }}} @@ -33,15 +34,19 @@ for network namespaces: traffic is forwarded using a tap interface inside the namespace, without the need to create further interfaces on the host, hence not requiring any capabilities or privileges. 
-%package selinux -BuildArch: noarch -Summary: SELinux support for passt and pasta -Requires: %{name} = %{version}-%{release} -Requires: selinux-policy -Requires(post): %{name} -Requires(post): policycoreutils -Requires(preun): %{name} -Requires(preun): policycoreutils +%package selinux +BuildArch: noarch +Summary: SELinux support for passt and pasta +Requires: selinux-policy-%{selinuxtype} +Requires: container-selinux +Requires(post): selinux-policy-%{selinuxtype} +Requires(post): container-selinux +Requires(post): policycoreutils +Requires(post): libselinux-utils +Requires(preun): policycoreutils +BuildRequires: selinux-policy-devel +BuildRequires: pkgconfig(systemd) +Recommends: selinux-policy-%{selinuxtype} >= %{selinux_policy_version} %description selinux This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1). @@ -89,19 +94,26 @@ popd %selinux_relabel_pre -s %{selinuxtype} %post selinux -%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp -%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp -%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp +%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %postun selinux if [ $1 -eq 0 ]; then - %selinux_modules_uninstall -s %{selinuxtype} passt - %selinux_modules_uninstall -s %{selinuxtype} pasta - %selinux_modules_uninstall -s %{selinuxtype} passt-repair + %selinux_modules_uninstall -s %{selinuxtype} passt pasta passt-repair fi %posttrans selinux %selinux_relabel_post -s %{selinuxtype} +# %selinux_relabel_post calls fixfiles(8) with the previous file_contexts file +# (see selabel_file(5)) in order to restore only the file contexts which +# actually changed. 
However, as file_contexts doesn't support %{USERID} +# substitutions, this will not work for specific file contexts that pasta needs +# to have under /run/user. +# +# Restore those explicitly, hiding errors from restorecon(8): we can't pass a +# path that's more specific than this, but at the same time /run/user often +# contains FUSE mountpoints that can't be accessed as root, leading to +# "Permission denied" messages, but not failures. +restorecon -R /run/user 2>/dev/null %files %license LICENSES/{GPL-2.0-or-later.txt,BSD-3-Clause.txt} diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te index f171be6..7157dfb 100644 --- a/contrib/selinux/passt-repair.te +++ b/contrib/selinux/passt-repair.te @@ -61,11 +61,11 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write }; allow passt_repair_t passt_t:unix_stream_socket { connectto read write }; allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write }; -allow passt_repair_t user_tmp_t:dir search; +allow passt_repair_t user_tmp_t:dir { getattr read search watch }; -allow passt_repair_t unconfined_t:sock_file { read write }; -allow passt_repair_t passt_t:sock_file { read write }; -allow passt_repair_t user_tmp_t:sock_file { read write }; +allow passt_repair_t unconfined_t:sock_file { getattr read write }; +allow passt_repair_t passt_t:sock_file { getattr read write }; +allow passt_repair_t user_tmp_t:sock_file { getattr read write }; allow passt_repair_t unconfined_t:tcp_socket { read setopt write }; allow passt_repair_t passt_t:tcp_socket { read setopt write }; @@ -80,8 +80,8 @@ allow passt_repair_t passt_t:tcp_socket { read setopt write }; allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write }; allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write }; -allow passt_repair_t qemu_var_run_t:dir search; -allow passt_repair_t virt_var_run_t:dir search; +allow passt_repair_t qemu_var_run_t:dir { getattr read 
search watch }; +allow passt_repair_t virt_var_run_t:dir { getattr read search watch }; -allow passt_repair_t qemu_var_run_t:sock_file { read write }; -allow passt_repair_t virt_var_run_t:sock_file { read write }; +allow passt_repair_t qemu_var_run_t:sock_file { getattr read write }; +allow passt_repair_t virt_var_run_t:sock_file { getattr read write }; diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index f8ea672..6995df8 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -49,7 +49,7 @@ require { type proc_net_t; type node_t; class tcp_socket { create accept listen name_bind name_connect getattr ioctl }; - class udp_socket { create accept listen }; + class udp_socket { create accept listen getattr }; class icmp_socket { bind create name_bind node_bind setopt read write }; class sock_file { create unlink write }; @@ -110,8 +110,6 @@ allow passt_t self:user_namespace create; auth_read_passwd(passt_t) allow passt_t proc_net_t:file read; -allow passt_t net_conf_t:file { open read }; -allow passt_t net_conf_t:lnk_file read; allow passt_t tmp_t:sock_file { create unlink write }; allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt }; kernel_search_network_sysctl(passt_t) @@ -129,11 +127,13 @@ corenet_tcp_connect_all_ports(passt_t) corenet_tcp_sendrecv_all_ports(passt_t) corenet_udp_sendrecv_all_ports(passt_t) +sysnet_read_config(passt_t) + allow passt_t node_t:icmp_socket { name_bind node_bind }; allow passt_t port_t:icmp_socket name_bind; allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl }; -allow passt_t self:udp_socket { create getopt setopt connect bind read write }; +allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr }; allow passt_t self:icmp_socket { bind create setopt read write }; allow passt_t user_tmp_t:dir { add_name write }; diff --git a/contrib/selinux/pasta.fc b/contrib/selinux/pasta.fc index 
41ee46d..e4aefc4 100644 --- a/contrib/selinux/pasta.fc +++ b/contrib/selinux/pasta.fc @@ -8,7 +8,9 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -/usr/bin/pasta system_u:object_r:pasta_exec_t:s0 -/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0 -/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0 -/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0 +/usr/bin/pasta system_u:object_r:pasta_exec_t:s0 +/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0 +/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0 +/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0 +/run/user/%{USERID}/netns system_u:object_r:ifconfig_var_run_t:s0 +/run/user/%{USERID}/containers/networks/rootless-netns system_u:object_r:ifconfig_var_run_t:s0 diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te index 89c8043..c0a1e9b 100644 --- a/contrib/selinux/pasta.te +++ b/contrib/selinux/pasta.te @@ -89,6 +89,15 @@ require { class capability { sys_tty_config setuid setgid }; class cap_userns { setpcap sys_admin sys_ptrace net_bind_service net_admin }; class user_namespace create; + + # Container requires + attribute_role usernetctl_roles; + role container_user_r; + role staff_r; + role user_r; + type container_runtime_t; + type container_t; + type systemd_user_runtimedir_t; } type pasta_t; @@ -113,6 +122,9 @@ init_daemon_domain(pasta_t, pasta_exec_t) allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read_search net_admin sys_resource setuid setgid }; allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service }; +# pasta only calls setuid and setgid with the current UID and GID, so this +# denial is harmless. 
See https://bugzilla.redhat.com/show_bug.cgi?id=2330512#c10 +dontaudit pasta_t self:cap_userns { setgid setuid }; allow pasta_t self:user_namespace create; auth_read_passwd(pasta_t) @@ -130,7 +142,7 @@ allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_tr allow pasta_t user_home_dir_t:dir { search getattr open add_name read write }; allow pasta_t user_home_dir_t:file { create open read write }; allow pasta_t tmp_t:dir { add_name mounton remove_name write }; -allow pasta_t tmpfs_t:filesystem mount; +allow pasta_t tmpfs_t:filesystem { getattr mount }; allow pasta_t fs_t:filesystem unmount; allow pasta_t root_t:dir mounton; manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t) @@ -147,15 +159,21 @@ logging_send_syslog_msg(pasta_t) allow syslogd_t self:cap_userns sys_ptrace; allow pasta_t proc_net_t:file { open read }; -allow pasta_t net_conf_t:file { open read }; allow pasta_t self:netlink_route_socket { bind create nlmsg_read nlmsg_write setopt read write }; kernel_search_network_sysctl(pasta_t) +sysnet_read_config(pasta_t) + allow pasta_t tmp_t:sock_file { create unlink write }; allow pasta_t self:tcp_socket create_stream_socket_perms; corenet_tcp_sendrecv_generic_node(pasta_t) corenet_tcp_bind_generic_node(pasta_t) +allow pasta_t container_runtime_t:dir { open read search }; +allow pasta_t container_runtime_t:fifo_file { getattr write }; +allow pasta_t container_runtime_t:file read; +allow pasta_t container_runtime_t:lnk_file read; +allow pasta_t container_t:lnk_file read; allow pasta_t pasta_port_t:tcp_socket { name_bind name_connect }; allow pasta_t pasta_port_t:udp_socket { name_bind }; allow pasta_t http_port_t:tcp_socket { name_bind name_connect }; @@ -204,7 +222,6 @@ allow pasta_t kernel_t:system module_request; allow pasta_t proc_t:dir mounton; allow pasta_t proc_t:filesystem mount; -allow pasta_t net_conf_t:lnk_file read; allow pasta_t proc_net_t:lnk_file read; allow pasta_t unconfined_t:process { noatsecure rlimitinh siginh }; 
@@ -213,3 +230,28 @@ allow pasta_t netutils_t:process { noatsecure rlimitinh siginh }; allow pasta_t ping_t:process { noatsecure rlimitinh siginh }; allow pasta_t user_tty_device_t:chr_file { append read write }; allow pasta_t user_devpts_t:chr_file { append read write }; + +# Allow network administration commands for non-privileged users +roleattribute container_user_r usernetctl_roles; +roleattribute staff_r usernetctl_roles; +roleattribute user_r usernetctl_roles; +role usernetctl_roles types pasta_t; + +# Make pasta in a container run under the pasta_t context +type_transition container_runtime_t pasta_exec_t : process pasta_t; +allow container_runtime_t pasta_t:process transition; + +# Label the user network namespace files +type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns"; +type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns"; +allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write }; +allow pasta_t ifconfig_var_run_t:file { create open write }; +allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir; + +# Allow pasta to bind to any port +bool pasta_bind_all_ports true; +if (pasta_bind_all_ports) { + allow pasta_t port_type:icmp_socket { accept getopt name_bind }; + allow pasta_t port_type:tcp_socket { accept getopt name_bind name_connect }; + allow pasta_t port_type:udp_socket { accept getopt name_bind }; +} @@ -296,33 +296,35 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len) /** * dhcp() - Check if this is a DHCP message, reply as needed * @c: Execution context - * @p: Packet pool, single packet with Ethernet buffer + * @data: Single packet with Ethernet buffer * * Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure */ -int dhcp(const struct ctx *c, const struct pool *p) +int dhcp(const struct ctx *c, struct iov_tail *data) { - size_t mlen, dlen, offset = 0, opt_len, opt_off = 0; char macstr[ETH_ADDRSTRLEN]; + size_t mlen, dlen, opt_len; 
struct in_addr mask, dst; + struct ethhdr eh_storage; + struct iphdr iph_storage; + struct udphdr uh_storage; const struct ethhdr *eh; const struct iphdr *iph; const struct udphdr *uh; struct msg const *m; struct msg reply; unsigned int i; + struct msg m_storage; - eh = packet_get(p, 0, offset, sizeof(*eh), NULL); - offset += sizeof(*eh); - - iph = packet_get(p, 0, offset, sizeof(*iph), NULL); + eh = IOV_REMOVE_HEADER(data, eh_storage); + iph = IOV_PEEK_HEADER(data, iph_storage); if (!eh || !iph) return -1; - offset += iph->ihl * 4UL; - uh = packet_get(p, 0, offset, sizeof(*uh), &mlen); - offset += sizeof(*uh); + if (!iov_drop_header(data, iph->ihl * 4UL)) + return -1; + uh = IOV_REMOVE_HEADER(data, uh_storage); if (!uh) return -1; @@ -332,7 +334,10 @@ int dhcp(const struct ctx *c, const struct pool *p) if (c->no_dhcp) return 1; - m = packet_get(p, 0, offset, offsetof(struct msg, o), &opt_len); + mlen = iov_tail_size(data); + m = (struct msg const *)iov_remove_header_(data, &m_storage, + offsetof(struct msg, o), + __alignof__(struct msg)); if (!m || mlen != ntohs(uh->len) - sizeof(*uh) || mlen < offsetof(struct msg, o) || @@ -355,27 +360,28 @@ int dhcp(const struct ctx *c, const struct pool *p) memset(&reply.file, 0, sizeof(reply.file)); reply.magic = m->magic; - offset += offsetof(struct msg, o); - for (i = 0; i < ARRAY_SIZE(opts); i++) opts[i].clen = -1; - while (opt_off + 2 < opt_len) { - const uint8_t *olen, *val; + opt_len = iov_tail_size(data); + while (opt_len >= 2) { + uint8_t olen_storage, type_storage; + const uint8_t *olen; uint8_t *type; - type = packet_get(p, 0, offset + opt_off, 1, NULL); - olen = packet_get(p, 0, offset + opt_off + 1, 1, NULL); + type = IOV_REMOVE_HEADER(data, type_storage); + olen = IOV_REMOVE_HEADER(data, olen_storage); if (!type || !olen) return -1; - val = packet_get(p, 0, offset + opt_off + 2, *olen, NULL); - if (!val) + opt_len = iov_tail_size(data); + if (opt_len < *olen) return -1; - memcpy(&opts[*type].c, val, *olen); + 
iov_to_buf(&data->iov[0], data->cnt, data->off, &opts[*type].c, *olen); opts[*type].clen = *olen; - opt_off += *olen + 2; + iov_drop_header(data, *olen); + opt_len -= *olen; } opts[80].slen = -1; @@ -6,7 +6,7 @@ #ifndef DHCP_H #define DHCP_H -int dhcp(const struct ctx *c, const struct pool *p); +int dhcp(const struct ctx *c, struct iov_tail *data); void dhcp_init(void); #endif /* DHCP_H */ @@ -54,14 +54,14 @@ struct opt_hdr { uint16_t l; } __attribute__((packed)); +#define UDP_MSG_HDR_SIZE (sizeof(struct udphdr) + sizeof(struct msg_hdr)) # define OPT_SIZE_CONV(x) (htons_constant(x)) #define OPT_SIZE(x) OPT_SIZE_CONV(sizeof(struct opt_##x) - \ sizeof(struct opt_hdr)) #define OPT_VSIZE(x) (sizeof(struct opt_##x) - \ sizeof(struct opt_hdr)) #define OPT_MAX_SIZE IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \ - sizeof(struct udphdr) + \ - sizeof(struct msg_hdr)) + UDP_MSG_HDR_SIZE) /** * struct opt_client_id - DHCPv6 Client Identifier option @@ -144,7 +144,9 @@ struct opt_ia_addr { struct opt_status_code { struct opt_hdr hdr; uint16_t code; - char status_msg[sizeof(STR_NOTONLINK) - 1]; + /* "nonstring" is only supported since clang 23 */ + /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ + __attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1]; } __attribute__((packed)); /** @@ -278,82 +280,132 @@ static struct resp_not_on_link_t { /** * dhcpv6_opt() - Get option from DHCPv6 message - * @p: Packet pool, single packet with UDP header - * @offset: Offset to look at, 0: end of header, set to option start + * @data: Buffer with options, set to matching option on return * @type: Option type to look up, network order * - * Return: pointer to option header, or NULL on malformed or missing option + * Return: true if found and @data points to the option header, + * or false on malformed or missing option and @data is + * unmodified. 
*/ -static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset, - uint16_t type) +static bool dhcpv6_opt(struct iov_tail *data, uint16_t type) { - struct opt_hdr *o; - size_t left; + struct iov_tail head = *data; + struct opt_hdr o_storage; + const struct opt_hdr *o; - if (!*offset) - *offset = sizeof(struct udphdr) + sizeof(struct msg_hdr); - - while ((o = packet_get_try(p, 0, *offset, sizeof(*o), &left))) { + while ((o = IOV_PEEK_HEADER(data, o_storage))) { unsigned int opt_len = ntohs(o->l) + sizeof(*o); - if (ntohs(o->l) > left) - return NULL; + if (opt_len > iov_tail_size(data)) + break; if (o->t == type) - return o; + return true; - *offset += opt_len; + iov_drop_header(data, opt_len); } - return NULL; + *data = head; + return false; } /** * dhcpv6_ia_notonlink() - Check if any IA contains non-appropriate addresses - * @p: Packet pool, single packet starting from UDP header + * @data: Data to look at, packet starting from UDP header (input/output) * @la: Address we want to lease to the client * - * Return: pointer to non-appropriate IA_NA or IA_TA, if any, NULL otherwise + * Return: true and @data points to non-appropriate IA_NA or IA_TA, if any, + * false otherwise and @data is unmodified */ -static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p, - struct in6_addr *la) +static bool dhcpv6_ia_notonlink(struct iov_tail *data, + struct in6_addr *la) { int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type; + struct opt_ia_addr opt_addr_storage; const struct opt_ia_addr *opt_addr; + struct iov_tail current, ia_base; + struct opt_ia_na ia_storage; char buf[INET6_ADDRSTRLEN]; + const struct opt_ia_na *ia; struct in6_addr req_addr; + struct opt_hdr h_storage; const struct opt_hdr *h; - struct opt_hdr *ia; - size_t offset; foreach(ia_type, ia_types) { - offset = 0; - while ((ia = dhcpv6_opt(p, &offset, *ia_type))) { - if (ntohs(ia->l) < OPT_VSIZE(ia_na)) - return NULL; - - offset += sizeof(struct opt_ia_na); + current = *data; + while 
(dhcpv6_opt(&current, *ia_type)) { + ia_base = current; + ia = IOV_REMOVE_HEADER(&current, ia_storage); + if (!ia || ntohs(ia->hdr.l) < OPT_VSIZE(ia_na)) + goto notfound; + + while (dhcpv6_opt(&current, OPT_IAAADR)) { + h = IOV_PEEK_HEADER(&current, h_storage); + if (!h || ntohs(h->l) != OPT_VSIZE(ia_addr)) + goto notfound; + + opt_addr = IOV_REMOVE_HEADER(&current, + opt_addr_storage); + if (!opt_addr) + goto notfound; - while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) { - if (ntohs(h->l) != OPT_VSIZE(ia_addr)) - return NULL; - - opt_addr = (const struct opt_ia_addr *)h; req_addr = opt_addr->addr; if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) - goto err; - - offset += sizeof(struct opt_ia_addr); + goto notonlink; } } } - return NULL; +notfound: + return false; -err: +notonlink: info("DHCPv6: requested address %s not on link", inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf))); - return ia; + *data = ia_base; + return true; +} + +/** + * dhcpv6_send_ia_notonlink() - Send NotOnLink status + * @c: Execution context + * @ia_base: Non-appropriate IA_NA or IA_TA base + * @client_id_base: Client ID message option base + * @len: Client ID length + * @xid: Transaction ID for message exchange + */ +static void dhcpv6_send_ia_notonlink(struct ctx *c, + const struct iov_tail *ia_base, + const struct iov_tail *client_id_base, + int len, uint32_t xid) +{ + const struct in6_addr *src = &c->ip6.our_tap_ll; + struct opt_hdr *ia = (struct opt_hdr *)resp_not_on_link.var; + size_t n; + + info("DHCPv6: received CONFIRM with inappropriate IA," + " sending NotOnLink status in REPLY"); + + n = sizeof(struct opt_ia_na); + iov_to_buf(&ia_base->iov[0], ia_base->cnt, ia_base->off, + resp_not_on_link.var, n); + ia->l = htons(OPT_VSIZE(ia_na) + sizeof(sc_not_on_link)); + memcpy(resp_not_on_link.var + n, &sc_not_on_link, + sizeof(sc_not_on_link)); + + n += sizeof(sc_not_on_link); + iov_to_buf(&client_id_base->iov[0], client_id_base->cnt, + client_id_base->off, resp_not_on_link.var + n, + sizeof(struct opt_hdr) + len); + + n += 
sizeof(struct opt_hdr) + len; + + n = offsetof(struct resp_not_on_link_t, var) + n; + + resp_not_on_link.hdr.xid = xid; + + tap_udp6_send(c, src, 547, tap_ip6_daddr(c, src), 546, + xid, &resp_not_on_link, n); } /** @@ -435,17 +487,19 @@ search: /** * dhcpv6_client_fqdn_fill() - Fill in client FQDN option + * @data: Data to look at * @c: Execution context * @buf: Response message buffer where options will be appended * @offset: Offset in message buffer for new options * * Return: updated length of response message buffer. */ -static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c, +static size_t dhcpv6_client_fqdn_fill(const struct iov_tail *data, + const struct ctx *c, char *buf, int offset) { - struct opt_client_fqdn const *req_opt; + struct iov_tail current = *data; struct opt_client_fqdn *o; size_t opt_len; @@ -463,13 +517,16 @@ static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c, } o = (struct opt_client_fqdn *)(buf + offset); + o->flags = 0x00; encode_domain_name(o->domain_name, c->fqdn); - req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 }, - OPT_CLIENT_FQDN); - if (req_opt && req_opt->flags & 0x01 /* S flag */) - o->flags = 0x02 /* O flag */; - else - o->flags = 0x00; + if (dhcpv6_opt(&current, OPT_CLIENT_FQDN)) { + struct opt_client_fqdn req_opt_storage; + struct opt_client_fqdn const *req_opt; + + req_opt = IOV_PEEK_HEADER(&current, req_opt_storage); + if (req_opt && req_opt->flags & 0x01 /* S flag */) + o->flags = 0x02 /* O flag */; + } opt_len++; @@ -482,23 +539,30 @@ static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c, /** * dhcpv6() - Check if this is a DHCPv6 message, reply as needed * @c: Execution context - * @p: Packet pool, single packet starting from UDP header + * @data: Single packet starting from UDP header * @saddr: Source IPv6 address of original message * @daddr: Destination IPv6 address of original message * * Return: 0 if it's not a DHCPv6 message, 1 if 
handled, -1 on failure */ -int dhcpv6(struct ctx *c, const struct pool *p, +int dhcpv6(struct ctx *c, struct iov_tail *data, const struct in6_addr *saddr, const struct in6_addr *daddr) { - const struct opt_hdr *client_id, *server_id, *ia; + const struct opt_server_id *server_id = NULL; + const struct opt_hdr *client_id = NULL; + struct opt_server_id server_id_storage; + struct iov_tail opt, client_id_base; + const struct opt_ia_na *ia = NULL; + struct opt_hdr client_id_storage; + struct opt_ia_na ia_storage; const struct in6_addr *src; + struct msg_hdr mh_storage; const struct msg_hdr *mh; + struct udphdr uh_storage; const struct udphdr *uh; - struct opt_hdr *bad_ia; size_t mlen, n; - uh = packet_get(p, 0, 0, sizeof(*uh), &mlen); + uh = IOV_REMOVE_HEADER(data, uh_storage); if (!uh) return -1; @@ -511,6 +575,7 @@ int dhcpv6(struct ctx *c, const struct pool *p, if (!IN6_IS_ADDR_MULTICAST(daddr)) return -1; + mlen = iov_tail_size(data); if (mlen + sizeof(*uh) != ntohs(uh->len) || mlen < sizeof(*mh)) return -1; @@ -518,20 +583,26 @@ int dhcpv6(struct ctx *c, const struct pool *p, src = &c->ip6.our_tap_ll; - mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL); + mh = IOV_REMOVE_HEADER(data, mh_storage); if (!mh) return -1; - client_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_CLIENTID); + client_id_base = *data; + if (dhcpv6_opt(&client_id_base, OPT_CLIENTID)) + client_id = IOV_PEEK_HEADER(&client_id_base, client_id_storage); if (!client_id || ntohs(client_id->l) > OPT_VSIZE(client_id)) return -1; - server_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_SERVERID); - if (server_id && ntohs(server_id->l) != OPT_VSIZE(server_id)) + opt = *data; + if (dhcpv6_opt(&opt, OPT_SERVERID)) + server_id = IOV_PEEK_HEADER(&opt, server_id_storage); + if (server_id && ntohs(server_id->hdr.l) != OPT_VSIZE(server_id)) return -1; - ia = dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_NA); - if (ia && ntohs(ia->l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta))) + opt = *data; + if (dhcpv6_opt(&opt, OPT_IA_NA)) + ia = 
IOV_PEEK_HEADER(&opt, ia_storage); + if (ia && ntohs(ia->hdr.l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta))) return -1; resp.hdr.type = TYPE_REPLY; @@ -546,29 +617,10 @@ int dhcpv6(struct ctx *c, const struct pool *p, if (mh->type == TYPE_CONFIRM && server_id) return -1; - if ((bad_ia = dhcpv6_ia_notonlink(p, &c->ip6.addr))) { - info("DHCPv6: received CONFIRM with inappropriate IA," - " sending NotOnLink status in REPLY"); - - bad_ia->l = htons(OPT_VSIZE(ia_na) + - sizeof(sc_not_on_link)); - n = sizeof(struct opt_ia_na); - memcpy(resp_not_on_link.var, bad_ia, n); - - memcpy(resp_not_on_link.var + n, - &sc_not_on_link, sizeof(sc_not_on_link)); - n += sizeof(sc_not_on_link); - - memcpy(resp_not_on_link.var + n, client_id, - sizeof(struct opt_hdr) + ntohs(client_id->l)); - n += sizeof(struct opt_hdr) + ntohs(client_id->l); - - n = offsetof(struct resp_not_on_link_t, var) + n; - - resp_not_on_link.hdr.xid = mh->xid; + if (dhcpv6_ia_notonlink(data, &c->ip6.addr)) { - tap_udp6_send(c, src, 547, tap_ip6_daddr(c, src), 546, - mh->xid, &resp_not_on_link, n); + dhcpv6_send_ia_notonlink(c, data, &client_id_base, + ntohs(client_id->l), mh->xid); return 1; } @@ -580,7 +632,7 @@ int dhcpv6(struct ctx *c, const struct pool *p, memcmp(&resp.server_id, server_id, sizeof(resp.server_id))) return -1; - if (ia || dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_TA)) + if (ia || dhcpv6_opt(data, OPT_IA_TA)) return -1; info("DHCPv6: received INFORMATION_REQUEST, sending REPLY"); @@ -606,13 +658,14 @@ int dhcpv6(struct ctx *c, const struct pool *p, if (ia) resp.ia_na.iaid = ((struct opt_ia_na *)ia)->iaid; - memcpy(&resp.client_id, client_id, - ntohs(client_id->l) + sizeof(struct opt_hdr)); + iov_to_buf(&client_id_base.iov[0], client_id_base.cnt, + client_id_base.off, &resp.client_id, + ntohs(client_id->l) + sizeof(struct opt_hdr)); n = offsetof(struct resp_t, client_id) + sizeof(struct opt_hdr) + ntohs(client_id->l); n = dhcpv6_dns_fill(c, (char *)&resp, n); - n = dhcpv6_client_fqdn_fill(p, c, (char 
*)&resp, n); + n = dhcpv6_client_fqdn_fill(data, c, (char *)&resp, n); resp.hdr.xid = mh->xid; @@ -6,7 +6,7 @@ #ifndef DHCPV6_H #define DHCPV6_H -int dhcpv6(struct ctx *c, const struct pool *p, +int dhcpv6(struct ctx *c, struct iov_tail *data, struct in6_addr *saddr, struct in6_addr *daddr); void dhcpv6_init(const struct ctx *c); diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore index 3b5a10a..f6272cf 100644 --- a/doc/platform-requirements/.gitignore +++ b/doc/platform-requirements/.gitignore @@ -1,3 +1,4 @@ +/listen-vs-repair /reuseaddr-priority /recv-zero /udp-close-dup diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile index 6a7d374..83930ef 100644 --- a/doc/platform-requirements/Makefile +++ b/doc/platform-requirements/Makefile @@ -3,8 +3,8 @@ # Copyright Red Hat # Author: David Gibson <david@gibson.dropbear.id.au> -TARGETS = reuseaddr-priority recv-zero udp-close-dup -SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c +TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair +SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c CFLAGS = -Wall all: cppcheck clang-tidy $(TARGETS:%=check-%) diff --git a/doc/platform-requirements/common.h b/doc/platform-requirements/common.h index 8844b1e..e85fc2b 100644 --- a/doc/platform-requirements/common.h +++ b/doc/platform-requirements/common.h @@ -15,6 +15,7 @@ #include <stdio.h> #include <stdlib.h> +__attribute__((format(printf, 1, 2), noreturn)) static inline void die(const char *fmt, ...) 
{ va_list ap; diff --git a/doc/platform-requirements/listen-vs-repair.c b/doc/platform-requirements/listen-vs-repair.c new file mode 100644 index 0000000..d31fe3f --- /dev/null +++ b/doc/platform-requirements/listen-vs-repair.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* listen-vs-repair.c + * + * Do listening sockets have address conflicts with sockets under repair + * ==================================================================== + * + * When we accept() an incoming connection the accept()ed socket will have the + * same local address as the listening socket. This can be a complication on + * migration. On the migration target we've already set up listening sockets + * according to the command line. However to restore connections that we're + * migrating in we need to bind the new sockets to the same address, which would + * be an address conflict on the face of it. This test program verifies that + * enabling repair mode before bind() correctly suppresses that conflict. 
+ * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */ +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <errno.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "common.h" + +#define PORT 13256U +#define CPORT 13257U + +/* 127.0.0.1:PORT */ +static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT); + +/* 127.0.0.1:CPORT */ +static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT); + +/* Put ourselves into a network sandbox */ +static void net_sandbox(void) +{ + /* NOLINTNEXTLINE(altera-struct-pack-align) */ + const struct req_t { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + } __attribute__((packed)) req = { + .nlh.nlmsg_type = RTM_NEWLINK, + .nlh.nlmsg_flags = NLM_F_REQUEST, + .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_seq = 1, + .ifm.ifi_family = AF_UNSPEC, + .ifm.ifi_index = 1, + .ifm.ifi_flags = IFF_UP, + .ifm.ifi_change = IFF_UP, + }; + int nl; + + if (unshare(CLONE_NEWUSER | CLONE_NEWNET)) + die("unshare(): %s\n", strerror(errno)); + + /* Bring up lo in the new netns */ + nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (nl < 0) + die("Can't create netlink socket: %s\n", strerror(errno)); + + if (send(nl, &req, sizeof(req), 0) < 0) + die("Netlink send(): %s\n", strerror(errno)); + close(nl); +} + +static void check(void) +{ + int s1, s2, op; + + s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s1 < 0) + die("socket() 1: %s\n", strerror(errno)); + + if (bind(s1, (struct sockaddr *)&addr, sizeof(addr))) + die("bind() 1: %s\n", strerror(errno)); + + if (listen(s1, 0)) + die("listen(): %s\n", strerror(errno)); + + s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); 
+ if (s2 < 0) + die("socket() 2: %s\n", strerror(errno)); + + op = TCP_REPAIR_ON; + if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op))) + die("TCP_REPAIR: %s\n", strerror(errno)); + + if (bind(s2, (struct sockaddr *)&addr, sizeof(addr))) + die("bind() 2: %s\n", strerror(errno)); + + if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr))) + die("connect(): %s\n", strerror(errno)); + + op = TCP_REPAIR_OFF_NO_WP; + if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op))) + die("TCP_REPAIR: %s\n", strerror(errno)); + + close(s1); + close(s2); +} + +int main(int argc, char *argv[]) +{ + (void)argc; + (void)argv; + + net_sandbox(); + + check(); + + printf("Repair mode appears to properly suppress conflicts with listening sockets\n"); + + exit(0); +} diff --git a/doc/platform-requirements/reuseaddr-priority.c b/doc/platform-requirements/reuseaddr-priority.c index 701b6ff..af39a39 100644 --- a/doc/platform-requirements/reuseaddr-priority.c +++ b/doc/platform-requirements/reuseaddr-priority.c @@ -46,13 +46,13 @@ /* Different cases for receiving socket configuration */ enum sock_type { /* Socket is bound to 0.0.0.0:DSTPORT and not connected */ - SOCK_BOUND_ANY = 0, + SOCK_BOUND_ANY, /* Socket is bound to 127.0.0.1:DSTPORT and not connected */ - SOCK_BOUND_LO = 1, + SOCK_BOUND_LO, /* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */ - SOCK_CONNECTED = 2, + SOCK_CONNECTED, NUM_SOCK_TYPES, }; diff --git a/epoll_type.h b/epoll_type.h index 7f2a121..12ac59b 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -22,8 +22,8 @@ enum epoll_type { EPOLL_TYPE_TCP_TIMER, /* UDP "listening" sockets */ EPOLL_TYPE_UDP_LISTEN, - /* UDP socket for replies on a specific flow */ - EPOLL_TYPE_UDP_REPLY, + /* UDP socket for a specific flow */ + EPOLL_TYPE_UDP, /* ICMP/ICMPv6 ping sockets */ EPOLL_TYPE_PING, /* inotify fd watching for end of netns (pasta) */ @@ -81,7 +81,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, * * Free cluster list * flow_first_free 
gives the index of the first (lowest index) free cluster. - * Each free cluster has the index of the next free cluster, or MAX_FLOW if + * Each free cluster has the index of the next free cluster, or FLOW_MAX if * it is the last free cluster. Together these form a linked list of free * clusters, in strictly increasing order of index. * @@ -396,18 +396,27 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, * @flow: Flow to change state * @pif: pif of the initiating side * @ssa: Source socket address + * @daddr: Destination address (may be NULL) * @dport: Destination port * * Return: pointer to the initiating flowside information */ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, const union sockaddr_inany *ssa, + const union inany_addr *daddr, in_port_t dport) { struct flowside *ini = &flow->f.side[INISIDE]; - inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa); - if (inany_v4(&ini->eaddr)) + if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) { + char str[SOCKADDR_STRLEN]; + + ASSERT_WITH_MSG(0, "Bad socket address %s", + sockaddr_ntop(ssa, str, sizeof(str))); + } + if (daddr) + ini->oaddr = *daddr; + else if (inany_v4(&ini->eaddr)) ini->oaddr = inany_any4; else ini->oaddr = inany_any6; @@ -471,7 +480,9 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow, /** * flow_set_type() - Set type and move to TYPED * @flow: Flow to change state - * @pif: pif of the initiating side + * @type: New flow type to assign + * + * Return: pointer to the modified flow structure. */ union flow *flow_set_type(union flow *flow, enum flow_type type) { @@ -616,7 +627,7 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx) * @hash: Raw hash value for flow & side * @sidx: Flow and side to find bucket for * - * Return: If @sidx is in the hash table, its current bucket, otherwise a + * Return: if @sidx is in the hash table, its current bucket, otherwise a * suitable free bucket for it. 
*/ static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx) @@ -636,7 +647,7 @@ static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx) * @c: Execution context * @sidx: Flow and side to find bucket for * - * Return: If @sidx is in the hash table, its current bucket, otherwise a + * Return: if @sidx is in the hash table, its current bucket, otherwise a * suitable free bucket for it. */ static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx) @@ -751,19 +762,30 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, * @proto: Protocol of the flow (IP L4 protocol number) * @pif: Interface of the flow * @esa: Socket address of the endpoint + * @oaddr: Our address (may be NULL) * @oport: Our port number * * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found */ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, - const void *esa, in_port_t oport) + const void *esa, + const union inany_addr *oaddr, in_port_t oport) { struct flowside side = { .oport = oport, }; - inany_from_sockaddr(&side.eaddr, &side.eport, esa); - if (inany_v4(&side.eaddr)) + if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) { + char str[SOCKADDR_STRLEN]; + + warn("Flow lookup on bad socket address %s", + sockaddr_ntop(esa, str, sizeof(str))); + return FLOW_SIDX_NONE; + } + + if (oaddr) + side.oaddr = *oaddr; + else if (inany_v4(&side.eaddr)) side.oaddr = inany_any4; else side.oaddr = inany_any6; @@ -780,6 +802,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) { struct flow_free_cluster *free_head = NULL; unsigned *last_next = &flow_first_free; + bool to_free[FLOW_MAX] = { 0 }; bool timer = false; union flow *flow; @@ -790,9 +813,44 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */ - flow_foreach_slot(flow) { + /* Check which flows we might need to close first, but don't free them + * yet 
as it's not safe to do that in the middle of flow_foreach(). + */ + flow_foreach(flow) { bool closed = false; + switch (flow->f.type) { + case FLOW_TYPE_NONE: + ASSERT(false); + break; + case FLOW_TCP: + closed = tcp_flow_defer(&flow->tcp); + break; + case FLOW_TCP_SPLICE: + closed = tcp_splice_flow_defer(&flow->tcp_splice); + if (!closed && timer) + tcp_splice_timer(c, &flow->tcp_splice); + break; + case FLOW_PING4: + case FLOW_PING6: + if (timer) + closed = icmp_ping_timer(c, &flow->ping, now); + break; + case FLOW_UDP: + closed = udp_flow_defer(c, &flow->udp, now); + if (!closed && timer) + closed = udp_flow_timer(c, &flow->udp, now); + break; + default: + /* Assume other flow types don't need any handling */ + ; + } + + to_free[FLOW_IDX(flow)] = closed; + } + + /* Second step: actually free the flows */ + flow_foreach_slot(flow) { switch (flow->f.state) { case FLOW_STATE_FREE: { unsigned skip = flow->free.n; @@ -825,60 +883,31 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) break; case FLOW_STATE_ACTIVE: - /* Nothing to do */ + if (to_free[FLOW_IDX(flow)]) { + flow_set_state(&flow->f, FLOW_STATE_FREE); + memset(flow, 0, sizeof(*flow)); + + if (free_head) { + /* Add slot to current free cluster */ + ASSERT(FLOW_IDX(flow) == + FLOW_IDX(free_head) + free_head->n); + free_head->n++; + flow->free.n = flow->free.next = 0; + } else { + /* Create new free cluster */ + free_head = &flow->free; + free_head->n = 1; + *last_next = FLOW_IDX(flow); + last_next = &free_head->next; + } + } else { + free_head = NULL; + } break; default: ASSERT(false); } - - switch (flow->f.type) { - case FLOW_TYPE_NONE: - ASSERT(false); - break; - case FLOW_TCP: - closed = tcp_flow_defer(&flow->tcp); - break; - case FLOW_TCP_SPLICE: - closed = tcp_splice_flow_defer(&flow->tcp_splice); - if (!closed && timer) - tcp_splice_timer(c, &flow->tcp_splice); - break; - case FLOW_PING4: - case FLOW_PING6: - if (timer) - closed = icmp_ping_timer(c, &flow->ping, now); - break; - 
case FLOW_UDP: - closed = udp_flow_defer(&flow->udp); - if (!closed && timer) - closed = udp_flow_timer(c, &flow->udp, now); - break; - default: - /* Assume other flow types don't need any handling */ - ; - } - - if (closed) { - flow_set_state(&flow->f, FLOW_STATE_FREE); - memset(flow, 0, sizeof(*flow)); - - if (free_head) { - /* Add slot to current free cluster */ - ASSERT(FLOW_IDX(flow) == - FLOW_IDX(free_head) + free_head->n); - free_head->n++; - flow->free.n = flow->free.next = 0; - } else { - /* Create new free cluster */ - free_head = &flow->free; - free_head->n = 1; - *last_next = FLOW_IDX(flow); - last_next = &free_head->next; - } - } else { - free_head = NULL; - } } *last_next = FLOW_MAX; @@ -912,6 +941,21 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret) } /** + * flow_migrate_need_repair() - Do we need to set repair mode for any flow? + * + * Return: true if repair mode is needed, false otherwise + */ +static bool flow_migrate_need_repair(void) +{ + union flow *flow; + + foreach_established_tcp_flow(flow) + return true; + + return false; +} + +/** * flow_migrate_repair_all() - Turn repair mode on or off for all flows * @c: Execution context * @enable: Switch repair mode on if set, off otherwise @@ -966,6 +1010,9 @@ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, (void)stage; (void)fd; + if (flow_migrate_need_repair()) + repair_wait(c); + if ((rc = flow_migrate_repair_all(c, true))) return -rc; @@ -1019,8 +1066,8 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, foreach_established_tcp_flow(flow) { rc = tcp_flow_migrate_source(fd, &flow->tcp); if (rc) { - err("Can't send data, flow %u: %s", FLOW_IDX(flow), - strerror_(-rc)); + flow_err(flow, "Can't send data: %s", + strerror_(-rc)); if (!first) die("Inconsistent migration state, exiting"); @@ -1044,10 +1091,10 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, * as EIO). 
*/ foreach_established_tcp_flow(flow) { - rc = tcp_flow_migrate_source_ext(fd, &flow->tcp); + rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp); if (rc) { - err("Extended data for flow %u: %s", FLOW_IDX(flow), - strerror_(-rc)); + flow_err(flow, "Can't send extended data: %s", + strerror_(-rc)); if (rc == -EIO) die("Inconsistent migration state, exiting"); @@ -1083,6 +1130,9 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, if (!count) return 0; + if ((rc = repair_wait(c))) + return -rc; + if ((rc = flow_migrate_repair_all(c, true))) return -rc; @@ -1092,8 +1142,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, for (i = 0; i < count; i++) { rc = tcp_flow_migrate_target(c, fd); if (rc) { - debug("Migration data failure at flow %u: %s, abort", - i, strerror_(-rc)); + flow_dbg(FLOW(i), "Migration data failure, abort: %s", + strerror_(-rc)); return -rc; } } @@ -1103,8 +1153,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, for (i = 0; i < count; i++) { rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd); if (rc) { - debug("Migration data failure at flow %u: %s, abort", - i, strerror_(-rc)); + flow_dbg(FLOW(i), "Migration data failure, abort: %s", + strerror_(-rc)); return -rc; } } @@ -243,7 +243,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, const void *eaddr, const void *oaddr, in_port_t eport, in_port_t oport); flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, - const void *esa, in_port_t oport); + const void *esa, + const union inany_addr *oaddr, in_port_t oport); union flow; diff --git a/flow_table.h b/flow_table.h index fd2c57b..5ee13ac 100644 --- a/flow_table.h +++ b/flow_table.h @@ -93,6 +93,7 @@ extern union flow flowtab[]; */ static inline unsigned flow_idx(const struct flow_common *f) { + /* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */ return (union flow *)f - flowtab; } @@ -139,7 +140,7 @@ static inline uint8_t 
pif_at_sidx(flow_sidx_t sidx) /** flowside_at_sidx() - Retrieve a specific flowside * @sidx: Flow & side index * - * Return: Flowside for the flow & side given by @sidx + * Return: flowside for the flow & side given by @sidx */ static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx) { @@ -199,6 +200,7 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, const void *daddr, in_port_t dport); struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, const union sockaddr_inany *ssa, + const union inany_addr *daddr, in_port_t dport); const struct flowside *flow_target_af(union flow *flow, uint8_t pif, sa_family_t af, @@ -324,6 +324,30 @@ static bool fwd_guest_accessible(const struct ctx *c, } /** + * nat_outbound() - Apply address translation for outbound (TAP to HOST) + * @c: Execution context + * @addr: Input address (as seen on TAP interface) + * @translated: Output address (as seen on HOST interface) + * + * Only handles translations that depend *only* on the address. Anything + * related to specific ports or flows is handled elsewhere. 
+ */ +static void nat_outbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated) +{ + if (inany_equals4(addr, &c->ip4.map_host_loopback)) + *translated = inany_loopback4; + else if (inany_equals6(addr, &c->ip6.map_host_loopback)) + *translated = inany_loopback6; + else if (inany_equals4(addr, &c->ip4.map_guest_addr)) + *translated = inany_from_v4(c->ip4.addr); + else if (inany_equals6(addr, &c->ip6.map_guest_addr)) + translated->a6 = c->ip6.addr; + else + *translated = *addr; +} + +/** * fwd_nat_from_tap() - Determine to forward a flow from the tap interface * @c: Execution context * @proto: Protocol (IP L4 protocol number) @@ -342,16 +366,8 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, else if (is_dns_flow(proto, ini) && inany_equals6(&ini->oaddr, &c->ip6.dns_match)) tgt->eaddr.a6 = c->ip6.dns_host; - else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback)) - tgt->eaddr = inany_loopback4; - else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback)) - tgt->eaddr = inany_loopback6; - else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr)) - tgt->eaddr = inany_from_v4(c->ip4.addr); - else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr)) - tgt->eaddr.a6 = c->ip6.addr; else - tgt->eaddr = ini->oaddr; + nat_outbound(c, &ini->oaddr, &tgt->eaddr); tgt->eport = ini->oport; @@ -402,7 +418,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, else tgt->eaddr = inany_loopback6; - /* Preserve the specific loopback adddress used, but let the kernel pick + /* Preserve the specific loopback address used, but let the kernel pick * a source port on the target side */ tgt->oaddr = ini->eaddr; @@ -424,6 +440,42 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, } /** + * nat_inbound() - Apply address translation for inbound (HOST to TAP) + * @c: Execution context + * @addr: Input address (as seen on HOST interface) + * @translated: Output address (as seen on TAP interface) + * + 
* Return: true on success, false if it couldn't translate the address + * + * Only handles translations that depend *only* on the address. Anything + * related to specific ports or flows is handled elsewhere. + */ +bool nat_inbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated) +{ + if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) && + inany_equals4(addr, &in4addr_loopback)) { + /* Specifically 127.0.0.1, not 127.0.0.0/8 */ + *translated = inany_from_v4(c->ip4.map_host_loopback); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) && + inany_equals6(addr, &in6addr_loopback)) { + translated->a6 = c->ip6.map_host_loopback; + } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) && + inany_equals4(addr, &c->ip4.addr)) { + *translated = inany_from_v4(c->ip4.map_guest_addr); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) && + inany_equals6(addr, &c->ip6.addr)) { + translated->a6 = c->ip6.map_guest_addr; + } else if (fwd_guest_accessible(c, addr)) { + *translated = *addr; + } else { + return false; + } + + return true; +} + +/** * fwd_nat_from_host() - Determine to forward a flow from the host interface * @c: Execution context * @proto: Protocol (IP L4 protocol number) @@ -479,20 +531,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, return PIF_SPLICE; } - if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) && - inany_equals4(&ini->eaddr, &in4addr_loopback)) { - /* Specifically 127.0.0.1, not 127.0.0.0/8 */ - tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback); - } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) && - inany_equals6(&ini->eaddr, &in6addr_loopback)) { - tgt->oaddr.a6 = c->ip6.map_host_loopback; - } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) && - inany_equals4(&ini->eaddr, &c->ip4.addr)) { - tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr); - } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) && - inany_equals6(&ini->eaddr, 
&c->ip6.addr)) { - tgt->oaddr.a6 = c->ip6.map_guest_addr; - } else if (!fwd_guest_accessible(c, &ini->eaddr)) { + if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) { if (inany_v4(&ini->eaddr)) { if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr)) /* No source address we can use */ @@ -501,8 +540,6 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, } else { tgt->oaddr.a6 = c->ip6.our_tap_ll; } - } else { - tgt->oaddr = ini->eaddr; } tgt->oport = ini->eport; @@ -7,6 +7,7 @@ #ifndef FWD_H #define FWD_H +union inany_addr; struct flowside; /* Number of ports for both TCP and UDP */ @@ -26,7 +27,7 @@ enum fwd_ports_mode { #define PORT_BITMAP_SIZE DIV_ROUND_UP(NUM_PORTS, 8) /** - * fwd_ports - Describes port forwarding for one protocol and direction + * fwd_ports() - Describes port forwarding for one protocol and direction * @mode: Overall forwarding mode (all, none, auto, specific ports) * @scan4: /proc/net fd to scan for IPv4 ports when in AUTO mode * @scan6: /proc/net fd to scan for IPv6 ports when in AUTO mode @@ -47,6 +48,8 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev, const struct fwd_ports *tcp_rev); void fwd_scan_ports_init(struct ctx *c); +bool nat_inbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated); uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, const struct flowside *ini, struct flowside *tgt); uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, @@ -44,6 +44,7 @@ #define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ #define ICMP_NUM_IDS (1U << 16) +#define MAX_IOV_ICMP 16 /* Arbitrary, should be enough */ /** * ping_at_sidx() - Get ping specific flow at given sidx @@ -163,7 +164,7 @@ static void icmp_ping_close(const struct ctx *c, * @saddr: Source address * @daddr: Destination address * - * Return: Newly opened ping flow, or NULL on failure + * Return: newly opened ping flow, or NULL on failure */ static struct icmp_ping_flow 
*icmp_ping_new(const struct ctx *c, sa_family_t af, uint16_t id, @@ -229,37 +230,36 @@ cancel: * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address - * @p: Packet pool, single packet with ICMP/ICMPv6 header + * @data: Single packet with ICMP/ICMPv6 header * @now: Current timestamp * * Return: count of consumed packets (always 1, even if malformed) */ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, - const struct pool *p, const struct timespec *now) + struct iov_tail *data, const struct timespec *now) { + struct iovec iov[MAX_IOV_ICMP]; struct icmp_ping_flow *pingf; const struct flowside *tgt; union sockaddr_inany sa; - size_t dlen, l4len; + struct msghdr msh; uint16_t id, seq; union flow *flow; uint8_t proto; - socklen_t sl; - void *pkt; + int cnt; (void)saddr; ASSERT(pif == PIF_TAP); if (af == AF_INET) { + struct icmphdr ih_storage; const struct icmphdr *ih; - if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen))) + ih = IOV_PEEK_HEADER(data, ih_storage); + if (!ih) return 1; - ih = (struct icmphdr *)pkt; - l4len = dlen + sizeof(*ih); - if (ih->type != ICMP_ECHO) return 1; @@ -267,14 +267,13 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, id = ntohs(ih->un.echo.id); seq = ntohs(ih->un.echo.sequence); } else if (af == AF_INET6) { + struct icmp6hdr ih_storage; const struct icmp6hdr *ih; - if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen))) + ih = IOV_PEEK_HEADER(data, ih_storage); + if (!ih) return 1; - ih = (struct icmp6hdr *)pkt; - l4len = dlen + sizeof(*ih); - if (ih->icmp6_type != ICMPV6_ECHO_REQUEST) return 1; @@ -285,6 +284,10 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, ASSERT(0); } + cnt = iov_tail_clone(&iov[0], MAX_IOV_ICMP, data); + if (cnt < 0) + return 1; + flow = flow_at_sidx(flow_lookup_af(c, proto, PIF_TAP, af, saddr, daddr, id, id)); @@ -298,8 +301,15 @@ int icmp_tap_handler(const struct 
ctx *c, uint8_t pif, sa_family_t af, ASSERT(flow_proto[pingf->f.type] == proto); pingf->ts = now->tv_sec; - pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0); - if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) { + pif_sockaddr(c, &sa, &msh.msg_namelen, PIF_HOST, &tgt->eaddr, 0); + msh.msg_name = &sa; + msh.msg_iov = iov; + msh.msg_iovlen = cnt; + msh.msg_control = NULL; + msh.msg_controllen = 0; + msh.msg_flags = 0; + + if (sendmsg(pingf->sock, &msh, MSG_NOSIGNAL) < 0) { flow_dbg_perror(pingf, "failed to relay request to socket"); } else { flow_dbg(pingf, @@ -14,7 +14,7 @@ struct icmp_ping_flow; void icmp_sock_handler(const struct ctx *c, union epoll_ref ref); int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, - const struct pool *p, const struct timespec *now); + struct iov_tail *data, const struct timespec *now); void icmp_init(void); /** @@ -25,7 +25,7 @@ const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT); * @dst: output buffer, minimum INANY_ADDRSTRLEN bytes * @size: size of buffer at @dst * - * Return: On success, a non-null pointer to @dst, NULL on failure + * Return: on success, a non-null pointer to @dst, NULL on failure */ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size) { @@ -41,7 +41,7 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size) * @src: IPv[46] address * @dst: output buffer, filled with parsed address * - * Return: On success, 1, if no parseable address is found, 0 + * Return: on success, 1, if no parseable address is found, 0 */ int inany_pton(const char *src, union inany_addr *dst) { @@ -237,23 +237,30 @@ static inline void inany_from_af(union inany_addr *aa, } /** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr - * @aa: Pointer to store IPv[46] address + * @dst: Pointer to store IPv[46] address (output) * @port: Pointer to store port number, host order - * @addr: 
AF_INET or AF_INET6 socket address + * @addr: Socket address + * + * Return: 0 on success, -1 on error (bad address family) */ -static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port, - const union sockaddr_inany *sa) +static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port, + const void *addr) { + const union sockaddr_inany *sa = (const union sockaddr_inany *)addr; + if (sa->sa_family == AF_INET6) { - inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr); + inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr); *port = ntohs(sa->sa6.sin6_port); - } else if (sa->sa_family == AF_INET) { - inany_from_af(aa, AF_INET, &sa->sa4.sin_addr); + return 0; + } + + if (sa->sa_family == AF_INET) { + inany_from_af(dst, AF_INET, &sa->sa4.sin_addr); *port = ntohs(sa->sa4.sin_port); - } else { - /* Not valid to call with other address families */ - ASSERT(0); + return 0; } + + return -1; } /** inany_siphash_feed- Fold IPv[46] address into an in-progress siphash @@ -26,7 +26,8 @@ #include "iov.h" -/* iov_skip_bytes() - Skip leading bytes of an IO vector +/** + * iov_skip_bytes() - Skip leading bytes of an IO vector * @iov: IO vector * @n: Number of entries in @iov * @skip: Number of leading bytes of @iov to skip @@ -56,8 +57,8 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, } /** - * iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec) - * efficiently. + * iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec) + * efficiently. * * @iov: Pointer to the array of struct iovec describing the * scatter/gather I/O vector. @@ -66,7 +67,7 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, * @buf: Pointer to the source buffer containing the data to copy. * @bytes: Total number of bytes to copy from buf to iov. * - * Returns: The number of bytes successfully copied. + * Return: the number of bytes successfully copied. 
*/ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, const void *buf, size_t bytes) @@ -96,8 +97,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, } /** - * iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to - * a buffer efficiently. + * iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to + * a buffer efficiently. * * @iov: Pointer to the array of struct iovec describing the scatter/gather * I/O vector. @@ -106,9 +107,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, * @buf: Pointer to the destination buffer where data will be copied. * @bytes: Total number of bytes to copy from iov to buf. * - * Returns: The number of bytes successfully copied. + * Return: the number of bytes successfully copied. */ -/* cppcheck-suppress unusedFunction */ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, void *buf, size_t bytes) { @@ -126,6 +126,7 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, /* copying data */ for (copied = 0; copied < bytes && i < iov_cnt; i++) { size_t len = MIN(iov[i].iov_len - offset, bytes - copied); + ASSERT(iov[i].iov_base); memcpy((char *)buf + copied, (char *)iov[i].iov_base + offset, len); copied += len; @@ -136,14 +137,14 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, } /** - * iov_size - Calculate the total size of a scatter/gather I/O vector - * (struct iovec). + * iov_size() - Calculate the total size of a scatter/gather I/O vector + * (struct iovec). * * @iov: Pointer to the array of struct iovec describing the * scatter/gather I/O vector. * @iov_cnt: Number of elements in the iov array. * - * Returns: The total size in bytes. + * Return: the total size in bytes. */ size_t iov_size(const struct iovec *iov, size_t iov_cnt) { @@ -166,7 +167,7 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt) * includes buffers that are actually needed. 
This will avoid stepping through * unnecessary elements of the underlying IO vector on future operations. * - * Return: true if the tail still contains any bytes, otherwise false + * Return: true if the tail still contains any bytes, otherwise false */ bool iov_tail_prune(struct iov_tail *tail) { @@ -180,10 +181,10 @@ bool iov_tail_prune(struct iov_tail *tail) } /** - * iov_tail_size - Calculate the total size of an IO vector tail + * iov_tail_size() - Calculate the total size of an IO vector tail * @tail: IO vector tail * - * Returns: The total size in bytes. + * Return: the total size in bytes. */ size_t iov_tail_size(struct iov_tail *tail) { @@ -192,18 +193,32 @@ size_t iov_tail_size(struct iov_tail *tail) } /** - * iov_peek_header_() - Get pointer to a header from an IOV tail + * iov_drop_header() - Discard a header from an IOV tail + * @tail: IO vector tail + * @len: length to move the head of the tail + * + * Return: true if the item still contains any bytes, otherwise false + */ +bool iov_drop_header(struct iov_tail *tail, size_t len) +{ + tail->off = tail->off + len; + + return iov_tail_prune(tail); +} + +/** + * iov_check_header() - Check if a header can be accessed * @tail: IOV tail to get header from * @len: Length of header to get, in bytes * @align: Required alignment of header, in bytes * * @tail may be pruned, but will represent the same bytes as before. * - * Returns: Pointer to the first @len logical bytes of the tail, NULL if that - * overruns the IO vector, is not contiguous or doesn't have the - * requested alignment. + * Return: pointer to the first @len logical bytes of the tail, NULL if that + * overruns the IO vector, is not contiguous or doesn't have the + * requested alignment. 
*/ -void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align) +static void *iov_check_header(struct iov_tail *tail, size_t len, size_t align) { char *p; @@ -224,25 +239,95 @@ void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align) } /** + * iov_peek_header_() - Get pointer to a header from an IOV tail + * @tail: IOV tail to get header from + * @v: Temporary memory to use if the memory in @tail + * is discontinuous + * @len: Length of header to get, in bytes + * @align: Required alignment of header, in bytes + * + * @tail may be pruned, but will represent the same bytes as before. + * + * Return: pointer to the first @len logical bytes of the tail, or to + * a copy if that overruns the IO vector, is not contiguous or + * doesn't have the requested alignment. NULL if that overruns the + * IO vector. + */ +/* cppcheck-suppress [staticFunction,unmatchedSuppression] */ +void *iov_peek_header_(struct iov_tail *tail, void *v, size_t len, size_t align) +{ + char *p = iov_check_header(tail, len, align); + size_t l; + + if (p) + return p; + + l = iov_to_buf(tail->iov, tail->cnt, tail->off, v, len); + if (l != len) + return NULL; + + return v; +} + +/** * iov_remove_header_() - Remove a header from an IOV tail * @tail: IOV tail to remove header from (modified) + * @v: Temporary memory to use if the memory in @tail + * is discontinuous * @len: Length of header to remove, in bytes * @align: Required alignment of header, in bytes * * On success, @tail is updated so that it longer includes the bytes of the * returned header. * - * Returns: Pointer to the first @len logical bytes of the tail, NULL if that - * overruns the IO vector, is not contiguous or doesn't have the - * requested alignment. + * Return: pointer to the first @len logical bytes of the tail, or to + * a copy if that overruns the IO vector, is not contiguous or + * doesn't have the requested alignment. NULL if that overruns the + * IO vector. 
*/ -void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align) +void *iov_remove_header_(struct iov_tail *tail, void *v, size_t len, size_t align) { - char *p = iov_peek_header_(tail, len, align); + char *p = iov_peek_header_(tail, v, len, align); if (!p) return NULL; tail->off = tail->off + len; + return p; } + +/** + * iov_tail_clone() - Clone an iov tail into a new iovec array + * + * @dst_iov: Pointer to the destination array of struct iovec describing + * the scatter/gather I/O vector to shallow copy to. + * @dst_iov_cnt: Maximum number of elements in the destination iov array. + * @tail: Pointer to the source iov_tail + * + * Return: the number of elements successfully referenced from the destination + * iov array, a negative value if there is not enough room in the + * destination iov array + */ +ssize_t iov_tail_clone(struct iovec *dst_iov, size_t dst_iov_cnt, + struct iov_tail *tail) +{ + const struct iovec *iov = &tail->iov[0]; + size_t iov_cnt = tail->cnt; + size_t offset = tail->off; + unsigned int i, j; + + i = iov_skip_bytes(iov, iov_cnt, offset, &offset); + + /* assign iov references referencing a subset of the source one */ + for (j = 0; i < iov_cnt && j < dst_iov_cnt; i++, j++) { + dst_iov[j].iov_base = (char *)iov[i].iov_base + offset; + dst_iov[j].iov_len = iov[i].iov_len - offset; + offset = 0; + } + + if (j == dst_iov_cnt && i != iov_cnt) + return -1; + + return j; +} @@ -70,38 +70,68 @@ struct iov_tail { #define IOV_TAIL(iov_, cnt_, off_) \ (struct iov_tail){ .iov = (iov_), .cnt = (cnt_), .off = (off_) } +/** + * IOV_TAIL_FROM_BUF() - Create a new IOV tail from a buffer + * @buf_: Buffer address to use in the iovec + * @len_: Buffer size + * @off_: Byte offset in the buffer where the tail begins + */ +#define IOV_TAIL_FROM_BUF(buf_, len_, off_) \ + IOV_TAIL((&(const struct iovec){ .iov_base = (buf_), \ + .iov_len = (len_) }), \ + 1, \ + (off_)) + bool iov_tail_prune(struct iov_tail *tail); size_t iov_tail_size(struct iov_tail 
*tail); -void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align); -void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align); +bool iov_drop_header(struct iov_tail *tail, size_t len); +void *iov_peek_header_(struct iov_tail *tail, void *v, size_t len, size_t align); +void *iov_remove_header_(struct iov_tail *tail, void *v, size_t len, size_t align); +ssize_t iov_tail_clone(struct iovec *dst_iov, size_t dst_iov_cnt, + struct iov_tail *tail); /** * IOV_PEEK_HEADER() - Get typed pointer to a header from an IOV tail * @tail_: IOV tail to get header from - * @type_: Data type of the header + * @var_: Temporary buffer of the type of the header to use if + * the memory in the iovec array is not contiguous. * * @tail_ may be pruned, but will represent the same bytes as before. * - * Returns: Pointer of type (@type_ *) located at the start of @tail_, NULL if - * we can't get a contiguous and aligned pointer. + * Return: pointer of type (@type_ *) located at the start of @tail_ + * or to @var_ if iovec memory is not contiguous, NULL if + * that overruns the iovec. */ -#define IOV_PEEK_HEADER(tail_, type_) \ - ((type_ *)(iov_peek_header_((tail_), \ - sizeof(type_), __alignof__(type_)))) + +#define IOV_PEEK_HEADER(tail_, var_) \ + ((__typeof__(var_) *)(iov_peek_header_((tail_), &(var_), \ + sizeof(var_), \ + __alignof__(var_)))) /** * IOV_REMOVE_HEADER() - Remove and return typed header from an IOV tail * @tail_: IOV tail to remove header from (modified) - * @type_: Data type of the header to remove + * @var_: Temporary buffer of the type of the header to use if + * the memory in the iovec array is not contiguous. * * On success, @tail_ is updated so that it longer includes the bytes of the * returned header. * - * Returns: Pointer of type (@type_ *) located at the old start of @tail_, NULL - * if we can't get a contiguous and aligned pointer. 
+ * Return: pointer of type (@type_ *) located at the start of @tail_ + * or to @var_ if iovec memory is not contiguous, NULL if + * that overruns the iovec. + */ + +#define IOV_REMOVE_HEADER(tail_, var_) \ + ((__typeof__(var_) *)(iov_remove_header_((tail_), &(var_), \ + sizeof(var_), __alignof__(var_)))) + +/** IOV_DROP_HEADER() - Remove a typed header from an IOV tail + * @tail_: IOV tail to remove header from (modified) + * @type_: Data type of the header to remove + * + * Return: true if the tail still contains any bytes, otherwise false */ -#define IOV_REMOVE_HEADER(tail_, type_) \ - ((type_ *)(iov_remove_header_((tail_), \ - sizeof(type_), __alignof__(type_)))) +#define IOV_DROP_HEADER(tail_, type_) iov_drop_header((tail_), sizeof(type_)) #endif /* IOVEC_H */ @@ -23,50 +23,47 @@ /** * ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol - * @p: Packet pool, packet number @idx has IPv6 header at @offset - * @idx: Index of packet in pool - * @offset: Pre-calculated IPv6 header offset + * @data: IPv6 packet * @proto: Filled with L4 protocol number * @dlen: Data length (payload excluding header extensions), set on return * - * Return: pointer to L4 header, NULL if not found + * Return: true if the L4 header is found and @data, @proto, @dlen are set, + * false on error. Outputs are indeterminate on failure. 
*/ -char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto, - size_t *dlen) +bool ipv6_l4hdr(struct iov_tail *data, uint8_t *proto, size_t *dlen) { + struct ipv6_opt_hdr o_storage; const struct ipv6_opt_hdr *o; + struct ipv6hdr ip6h_storage; const struct ipv6hdr *ip6h; - char *base; int hdrlen; uint8_t nh; - base = packet_get(p, idx, 0, 0, NULL); - ip6h = packet_get(p, idx, offset, sizeof(*ip6h), dlen); + ip6h = IOV_REMOVE_HEADER(data, ip6h_storage); if (!ip6h) - return NULL; - - offset += sizeof(*ip6h); + return false; nh = ip6h->nexthdr; if (!IPV6_NH_OPT(nh)) goto found; - while ((o = packet_get_try(p, idx, offset, sizeof(*o), dlen))) { + while ((o = IOV_PEEK_HEADER(data, o_storage))) { nh = o->nexthdr; hdrlen = (o->hdrlen + 1) * 8; if (IPV6_NH_OPT(nh)) - offset += hdrlen; + iov_drop_header(data, hdrlen); else goto found; } - return NULL; + return false; found: - if (nh == 59) - return NULL; + if (nh == IPPROTO_NONE) + return false; + *dlen = iov_tail_size(data); *proto = nh; - return base + offset; + return true; } @@ -115,10 +115,9 @@ static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h) ip6h->flow_lbl[2]; } -char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto, - size_t *dlen); +bool ipv6_l4hdr(struct iov_tail *data, uint8_t *proto, size_t *dlen); -/* IPv6 link-local all-nodes multicast adddress, ff02::1 */ +/* IPv6 link-local all-nodes multicast address, ff02::1 */ static const struct in6_addr in6addr_ll_all_nodes = { .s6_addr = { 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, diff --git a/isolation.c b/isolation.c index c944fb3..bbcd23b 100644 --- a/isolation.c +++ b/isolation.c @@ -129,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep) * additional layer of protection. Executing this requires * CAP_SETPCAP, which we will have within our userns. 
* - * Note that dropping capabilites from the bounding set limits + * Note that dropping capabilities from the bounding set limits * exec()ed processes, but does not remove them from the effective or * permitted sets, so it doesn't reduce our own capabilities. */ @@ -174,8 +174,8 @@ static void clamp_caps(void) * Should: * - drop unneeded capabilities * - close all open files except for standard streams and the one from --fd - * Musn't: - * - remove filesytem access (we need to access files during setup) + * Mustn't: + * - remove filesystem access (we need to access files during setup) */ void isolate_initial(int argc, char **argv) { @@ -194,7 +194,7 @@ void isolate_initial(int argc, char **argv) * * It's debatable whether it's useful to drop caps when we * retain SETUID and SYS_ADMIN, but we might as well. We drop - * further capabilites in isolate_user() and + * further capabilities in isolate_user() and * isolate_prefork(). */ keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) | @@ -70,7 +70,7 @@ static ssize_t peek_line(struct lineread *lr, bool eof) * @lr: Line reader state structure * @line: Place a pointer to the next line in this variable * - * Return: Length of line read on success, 0 on EOF, negative on error + * Return: length of line read on success, 0 on EOF, negative on error */ ssize_t lineread_get(struct lineread *lr, char **line) { diff --git a/linux_dep.h b/linux_dep.h index 240f50a..1d9e166 100644 --- a/linux_dep.h +++ b/linux_dep.h @@ -135,6 +135,12 @@ struct tcp_info_linux { #define CLOSE_RANGE_UNSHARE (1U << 1) #endif +#ifndef TCP_REPAIR_ON +#define TCP_REPAIR_ON 1 +#define TCP_REPAIR_OFF 0 +#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ +#endif + __attribute__ ((weak)) /* cppcheck-suppress funcArgNamesDifferent */ int close_range(unsigned int first, unsigned int last, int flags) { @@ -35,7 +35,7 @@ static int log_sock = -1; /* Optional socket to system logger */ static char log_ident[BUFSIZ]; /* 
Identifier string for openlog() */ static int log_mask; /* Current log priority mask */ -static int log_file = -1; /* Optional log file descriptor */ +int log_file = -1; /* Optional log file descriptor */ static size_t log_size; /* Maximum log file size in bytes */ static size_t log_written; /* Currently used bytes in log file */ static size_t log_cut_size; /* Bytes to cut at start on rotation */ @@ -54,7 +54,8 @@ bool log_stderr = true; /* Not daemonised, no shell spawned */ * logtime() - Get the current time for logging purposes * @ts: Buffer into which to store the timestamp * - * Return: pointer to @now, or NULL if there was an error retrieving the time + * Return: pointer to @ts on success, or NULL if there was + * an error retrieving the time */ static const struct timespec *logtime(struct timespec *ts) { @@ -281,6 +282,7 @@ static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap) * @format: Message * @ap: Variable argument list */ +/* cppcheck-suppress [staticFunction,unmatchedSuppression] */ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap) { bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1; @@ -401,7 +403,7 @@ void __setlogmask(int mask) * logfile_init() - Open log file and write header with PID, version, path * @name: Identifier for header: passt or pasta * @path: Path to log file - * @size: Maximum size of log file: log_cut_size is calculatd here + * @size: Maximum size of log file: log_cut_size is calculated here */ void logfile_init(const char *name, const char *path, size_t size) { @@ -41,6 +41,7 @@ void logmsg_perror(int pri, const char *format, ...) 
_exit(EXIT_FAILURE); \ } while (0) +extern int log_file; extern int log_trace; extern bool log_conf_parsed; extern bool log_stderr; @@ -96,8 +96,8 @@ static int seen_addrs_target_v1(struct ctx *c, return 0; } -/* Stages for version 1 */ -static const struct migrate_stage stages_v1[] = { +/* Stages for version 2 */ +static const struct migrate_stage stages_v2[] = { { .name = "observed addresses", .source = seen_addrs_source_v1, @@ -118,7 +118,11 @@ static const struct migrate_stage stages_v1[] = { /* Supported encoding versions, from latest (most preferred) to oldest */ static const struct migrate_version versions[] = { - { 1, stages_v1, }, + { 2, stages_v2, }, + /* v1 was released, but not widely used. It had bad endianness for the + * MSS and omitted timestamps, which meant it usually wouldn't work. + * Therefore we don't attempt to support compatibility with it. + */ { 0 }, }; @@ -328,21 +328,28 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN); + /* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */ ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra); } /** * ndp() - Check for NDP solicitations, reply as needed * @c: Execution context - * @ih: ICMPv6 header * @saddr: Source IPv6 address - * @p: Packet pool + * @data: Single packet with ICMPv6 header * * Return: 0 if not handled here, 1 if handled, -1 on failure */ -int ndp(const struct ctx *c, const struct icmp6hdr *ih, - const struct in6_addr *saddr, const struct pool *p) +int ndp(const struct ctx *c, const struct in6_addr *saddr, + struct iov_tail *data) { + struct icmp6hdr ih_storage; + const struct icmp6hdr *ih; + + ih = IOV_PEEK_HEADER(data, ih_storage); + if (!ih) + return -1; + if (ih->icmp6_type < RS || ih->icmp6_type > NA) return 0; @@ -350,9 +357,10 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih, return 1; if (ih->icmp6_type == NS) { + struct ndp_ns ns_storage; const struct ndp_ns *ns; - ns = packet_get(p, 0, 0, 
sizeof(struct ndp_ns), NULL); + ns = IOV_REMOVE_HEADER(data, ns_storage); if (!ns) return -1; @@ -8,8 +8,8 @@ struct icmp6hdr; -int ndp(const struct ctx *c, const struct icmp6hdr *ih, - const struct in6_addr *saddr, const struct pool *p); +int ndp(const struct ctx *c, const struct in6_addr *saddr, + struct iov_tail *data); void ndp_timer(const struct ctx *c, const struct timespec *now); #endif /* NDP_H */ @@ -199,7 +199,7 @@ static struct nlmsghdr *nl_next(int s, char *buf, struct nlmsghdr *nh, ssize_t * } /** - * nl_foreach - 'for' type macro to step through netlink response messages + * nl_foreach() - 'for' type macro to step through netlink response messages * nl_foreach_oftype - as above, but only messages of expected type * @nh: Steps through each response header (struct nlmsghdr *) * @status: When loop exits indicates if there was an error (ssize_t) @@ -1024,7 +1024,6 @@ int nl_link_get_mac(int s, unsigned int ifi, void *mac) /** * nl_link_set_mac() - Set link MAC address * @s: Netlink socket - * @ns: Use netlink socket in namespace * @ifi: Interface index * @mac: MAC address to set * @@ -23,6 +23,20 @@ #include "log.h" /** + * get_vdev_memory() - Return a pointer to the memory regions of the pool + * @p: Packet pool + * + * Return: NULL if p->buf_size is non-zero, otherwise p->buf as a pointer to struct vdev_memory + */ +static struct vdev_memory *get_vdev_memory(const struct pool *p) +{ + if (p->buf_size) + return NULL; + + return (struct vdev_memory *)p->buf; +} + +/** * packet_check_range() - Check if a memory range is valid for a pool * @p: Packet pool * @ptr: Start of desired data range @@ -35,26 +49,41 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len, const char *func, int line) { - if (p->buf_size == 0) { + struct vdev_memory *memory; + + if (len > PACKET_MAX_LEN) { + debug("packet range length %zu (max %zu), %s:%i", + len, PACKET_MAX_LEN, func, line); + return -1; + } + + memory = get_vdev_memory(p); + if (memory) { int ret; - ret =
vu_packet_check_range((void *)p->buf, ptr, len); + ret = vu_packet_check_range(memory, ptr, len); if (ret == -1) - trace("cannot find region, %s:%i", func, line); + debug("cannot find region, %s:%i", func, line); return ret; } if (ptr < p->buf) { - trace("packet range start %p before buffer start %p, %s:%i", + debug("packet range start %p before buffer start %p, %s:%i", (void *)ptr, (void *)p->buf, func, line); return -1; } - if (ptr + len > p->buf + p->buf_size) { - trace("packet range end %p after buffer end %p, %s:%i", - (void *)(ptr + len), (void *)(p->buf + p->buf_size), + if (len > p->buf_size) { + debug("packet range length %zu larger than buffer %zu, %s:%i", + len, p->buf_size, func, line); + return -1; + } + + if ((size_t)(ptr - p->buf) > p->buf_size - len) { + debug("packet range %p, len %zu after buffer end %p, %s:%i", + (void *)ptr, len, (void *)(p->buf + p->buf_size), func, line); return -1; } @@ -62,89 +91,110 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len, return 0; } /** + * pool_can_fit() - Can a new packet fit in the pool? 
+ * @p: Pointer to packet pool + * @data: check data can fit in the pool + * + * Return: true if @data can be added, false otherwise + */ +bool pool_can_fit(const struct pool *p, struct iov_tail *data) +{ + iov_tail_prune(data); + + return p->count + data->cnt + (data->cnt > 1) <= p->size; +} + +/** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool - * @len: Length of new descriptor - * @start: Start of data - * @func: For tracing: name of calling function, NULL means no trace() + * @data: Data to add + * @func: For tracing: name of calling function * @line: For tracing: caller line of function call */ -void packet_add_do(struct pool *p, size_t len, const char *start, +void packet_add_do(struct pool *p, struct iov_tail *data, const char *func, int line) { - size_t idx = p->count; + size_t idx = p->count, i, offset; - if (idx >= p->size) { - trace("add packet index %zu to pool with size %zu, %s:%i", + if (!pool_can_fit(p, data)) { + debug("add packet index %zu to pool with size %zu, %s:%i", idx, p->size, func, line); return; } - if (packet_check_range(p, start, len, func, line)) + if (!iov_tail_prune(data)) return; - if (len > UINT16_MAX) { - trace("add packet length %zu, %s:%i", len, func, line); - return; + if (data->cnt > 1) { + p->pkt[idx].iov_base = NULL; + p->pkt[idx].iov_len = data->cnt; + idx++; } - p->pkt[idx].iov_base = (void *)start; - p->pkt[idx].iov_len = len; + offset = data->off; + for (i = 0; i < data->cnt; i++) { + const char *start; + size_t len; + + len = data->iov[i].iov_len - offset; + start = (char *)data->iov[i].iov_base + offset; + offset = 0; + + if (packet_check_range(p, start, len, func, line)) + return; + + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len; + idx++; + } - p->count++; + p->count = idx; } /** * packet_get_do() - Get data range from packet descriptor from given pool * @p: Packet pool * @idx: Index of packet descriptor in pool - * @offset: Offset of data range in packet 
descriptor - * @len: Length of desired data range - * @left: Length of available data after range, set on return, can be NULL + * @data: IOV tail to store the address of the data (output) * @func: For tracing: name of calling function, NULL means no trace() * @line: For tracing: caller line of function call * - * Return: pointer to start of data range, NULL on invalid range or descriptor + * Return: false if packet index is invalid, true otherwise. + * If something wrong with @data, don't return at all (assert). */ -void *packet_get_do(const struct pool *p, size_t idx, size_t offset, - size_t len, size_t *left, const char *func, int line) +bool packet_get_do(const struct pool *p, size_t idx, + struct iov_tail *data, + const char *func, int line) { - char *ptr; + size_t i; - if (idx >= p->size || idx >= p->count) { - if (func) { - trace("packet %zu from pool size: %zu, count: %zu, " - "%s:%i", idx, p->size, p->count, func, line); - } - return NULL; - } + ASSERT_WITH_MSG(p->count <= p->size, + "Corrupted pool count: %zu, size: %zu, %s:%i", + p->count, p->size, func, line); - if (len > UINT16_MAX) { - if (func) { - trace("packet data length %zu, %s:%i", - len, func, line); - } - return NULL; + if (idx >= p->count) { + debug("packet %zu from pool size: %zu, count: %zu, " + "%s:%i", idx, p->size, p->count, func, line); + return false; } - if (len + offset > p->pkt[idx].iov_len) { - if (func) { - trace("data length %zu, offset %zu from length %zu, " - "%s:%i", len, offset, p->pkt[idx].iov_len, - func, line); - } - return NULL; + if (p->pkt[idx].iov_base) { + data->cnt = 1; + data->iov = &p->pkt[idx]; + } else { + data->cnt = p->pkt[idx].iov_len; + data->iov = &p->pkt[idx + 1]; } + data->off = 0; - ptr = (char *)p->pkt[idx].iov_base + offset; - - if (packet_check_range(p, ptr, len, func, line)) - return NULL; - - if (left) - *left = p->pkt[idx].iov_len - offset - len; + for (i = 0; i < data->cnt; i++) { + ASSERT_WITH_MSG(!packet_check_range(p, data->iov[i].iov_base, + 
data->iov[i].iov_len, + func, line), + "Corrupt packet pool, %s:%i", func, line); + } - return ptr; + return true; } /** @@ -6,10 +6,17 @@ #ifndef PACKET_H #define PACKET_H +#include <stdbool.h> +#include "iov.h" +#include "virtio.h" + +/* Maximum size of a single packet stored in pool, including headers */ +#define PACKET_MAX_LEN ((size_t)UINT16_MAX) + /** * struct pool - Generic pool of packets stored in a buffer * @buf: Buffer storing packet descriptors, - * a struct vu_dev_region array for passt vhost-user mode + * a struct vdev_region for passt vhost-user mode * @buf_size: Total size of buffer, * 0 for passt vhost-user mode * @size: Number of usable descriptors for the pool @@ -24,24 +31,21 @@ struct pool { struct iovec pkt[]; }; -int vu_packet_check_range(void *buf, const char *ptr, size_t len); -void packet_add_do(struct pool *p, size_t len, const char *start, +int vu_packet_check_range(struct vdev_memory *memory, + const char *ptr, size_t len); +void packet_add_do(struct pool *p, struct iov_tail *data, const char *func, int line); -void *packet_get_do(const struct pool *p, const size_t idx, - size_t offset, size_t len, size_t *left, - const char *func, int line); +bool packet_get_do(const struct pool *p, const size_t idx, + struct iov_tail *data, const char *func, int line); +bool pool_can_fit(const struct pool *p, struct iov_tail *data); void pool_flush(struct pool *p); -#define packet_add(p, len, start) \ - packet_add_do(p, len, start, __func__, __LINE__) - -#define packet_get(p, idx, offset, len, left) \ - packet_get_do(p, idx, offset, len, left, __func__, __LINE__) - -#define packet_get_try(p, idx, offset, len, left) \ - packet_get_do(p, idx, offset, len, left, NULL, 0) +#define packet_add(p, data) \ + packet_add_do(p, data, __func__, __LINE__) +#define packet_get(p, idx, data) \ + packet_get_do(p, idx, data, __func__, __LINE__) -#define PACKET_POOL_DECL(_name, _size, _buf) \ +#define PACKET_POOL_DECL(_name, _size) \ struct _name ## _t { \ char *buf; \ 
size_t buf_size; \ @@ -57,19 +61,10 @@ struct _name ## _t { \ .size = _size, \ } -#define PACKET_POOL(name, size, buf, buf_size) \ - PACKET_POOL_DECL(name, size, buf) name = \ - PACKET_POOL_INIT_NOCAST(size, buf, buf_size) - #define PACKET_INIT(name, size, buf, buf_size) \ (struct name ## _t) PACKET_POOL_INIT_NOCAST(size, buf, buf_size) -#define PACKET_POOL_NOINIT(name, size, buf) \ - PACKET_POOL_DECL(name, size, buf) name ## _storage; \ +#define PACKET_POOL_NOINIT(name, size) \ + PACKET_POOL_DECL(name, size) name ## _storage; \ static struct pool *name = (struct pool *)&name ## _storage - -#define PACKET_POOL_P(name, size, buf, buf_size) \ - PACKET_POOL(name ## _storage, size, buf, buf_size); \ - struct pool *name = (struct pool *)&name ## _storage - #endif /* PACKET_H */ diff --git a/passt-repair.1 b/passt-repair.1 index 7c1b140..e65aadd 100644 --- a/passt-repair.1 +++ b/passt-repair.1 @@ -16,13 +16,17 @@ .B passt-repair is a privileged helper setting and clearing repair mode on TCP sockets on behalf of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain -socket, specified by \fIPATH\fR. +socket. It can be used to migrate TCP connections between guests without granting additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections, \fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR capability (see \fBcapabilities\fR(7)) to be set or cleared. +If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to +connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file +ending with \fI.repair\fR appears in it, and then attempts to connect to it. + .SH PROTOCOL \fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via diff --git a/passt-repair.c b/passt-repair.c index e0c366e..c3c140f 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -16,11 +16,14 @@ * off. Reply by echoing the command. Exit on EOF. 
*/ +#include <sys/inotify.h> #include <sys/prctl.h> #include <sys/types.h> #include <sys/socket.h> +#include <sys/stat.h> #include <sys/un.h> #include <errno.h> +#include <stdbool.h> #include <stddef.h> #include <stdio.h> #include <stdlib.h> @@ -37,8 +40,11 @@ #include <linux/seccomp.h> #include "seccomp_repair.h" +#include "linux_dep.h" #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ +#define REPAIR_EXT ".repair" +#define REPAIR_EXT_LEN strlen(REPAIR_EXT) /** * main() - Entry point and whole program with loop @@ -51,6 +57,9 @@ * #syscalls:repair socket s390x:socketcall i686:socketcall * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv * #syscalls:repair sendto sendmsg arm:send ppc64le:send + * #syscalls:repair stat|statx stat64|statx statx + * #syscalls:repair fstat|fstat64 newfstatat|fstatat64 + * #syscalls:repair inotify_init1 inotify_add_watch */ int main(int argc, char **argv) { @@ -58,12 +67,14 @@ int main(int argc, char **argv) __attribute__ ((aligned(__alignof__(struct cmsghdr)))); struct sockaddr_un a = { AF_UNIX, "" }; int fds[SCM_MAX_FD], s, ret, i, n = 0; + bool inotify_dir = false; struct sock_fprog prog; int8_t cmd = INT8_MAX; struct cmsghdr *cmsg; struct msghdr msg; struct iovec iov; size_t cmsg_len; + struct stat sb; int op; prctl(PR_SET_DUMPABLE, 0); @@ -73,7 +84,7 @@ int main(int argc, char **argv) prog.filter = filter_repair; if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { - fprintf(stderr, "Failed to apply seccomp filter"); + fprintf(stderr, "Failed to apply seccomp filter\n"); _exit(1); } @@ -90,19 +101,96 @@ int main(int argc, char **argv) _exit(2); } - ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); + if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno); + _exit(1); + } + + if ((stat(argv[1], &sb))) { + fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno); + _exit(1); + } + 
+ if ((sb.st_mode & S_IFMT) == S_IFDIR) { + char buf[sizeof(struct inotify_event) + NAME_MAX + 1] + __attribute__ ((aligned(__alignof__(struct inotify_event)))); + const struct inotify_event *ev = NULL; + char path[PATH_MAX + 1]; + bool found = false; + ssize_t n; + int fd; + + if ((fd = inotify_init1(IN_CLOEXEC)) < 0) { + fprintf(stderr, "inotify_init1: %i\n", errno); + _exit(1); + } + + if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) { + fprintf(stderr, "inotify_add_watch: %i\n", errno); + _exit(1); + } + + do { + char *p; + + n = read(fd, buf, sizeof(buf)); + if (n < 0) { + fprintf(stderr, "inotify read: %i\n", errno); + _exit(1); + } + buf[n - 1] = '\0'; + + if (n < (ssize_t)sizeof(*ev)) { + fprintf(stderr, "Short inotify read: %zi\n", n); + continue; + } + + for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) { + ev = (const struct inotify_event *)p; + + if (ev->len >= REPAIR_EXT_LEN && + !memcmp(ev->name + + strnlen(ev->name, ev->len) - + REPAIR_EXT_LEN, + REPAIR_EXT, REPAIR_EXT_LEN)) { + found = true; + break; + } + } + } while (!found); + + if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') { + fprintf(stderr, "Invalid filename from inotify\n"); + _exit(1); + } + + snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name); + if ((stat(path, &sb))) { + fprintf(stderr, "Can't stat() %s: %i\n", path, errno); + _exit(1); + } + + ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path); + inotify_dir = true; + } else { + ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); + } + if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) { - fprintf(stderr, "Invalid socket path: %s\n", argv[1]); + fprintf(stderr, "Invalid socket path\n"); _exit(2); } - if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { - fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno); - _exit(1); + if ((sb.st_mode & S_IFMT) != S_IFSOCK) { + fprintf(stderr, "%s is not a socket\n", a.sun_path); + _exit(2); } - if (connect(s, (struct sockaddr *)&a, sizeof(a))) { - 
fprintf(stderr, "Failed to connect to %s: %s\n", argv[1], + while (connect(s, (struct sockaddr *)&a, sizeof(a))) { + if (inotify_dir && errno == ECONNREFUSED) + continue; + + fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path, strerror(errno)); _exit(1); } @@ -158,8 +246,8 @@ loop: for (i = 0; i < n; i++) { if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &op, sizeof(op))) { fprintf(stderr, - "Setting TCP_REPAIR to %i on socket %i: %s", op, - fds[i], strerror(errno)); + "Setting TCP_REPAIR to %i on socket %i: %s\n", + op, fds[i], strerror(errno)); _exit(1); } @@ -440,6 +440,30 @@ chosen for the hypervisor UNIX domain socket. No socket is created if not in \-\-vhost-user mode. .TP +.BR \-\-migrate-exit (DEPRECATED) +Exit after a completed migration as source. By default, \fBpasst\fR keeps +running and the migrated guest can continue using its connection, or a new guest +can connect. + +Note that this configuration option is \fBdeprecated\fR and will be removed in a +future version. It is not expected to be of any use, and it simply reflects a +legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR +below. + +.TP +.BR \-\-migrate-no-linger (DEPRECATED) +Close TCP sockets on the source instance once migration completes. + +By default, sockets are kept open, and events on data sockets are ignored, so +that any further message reaching sockets after the source migrated is silently +ignored, to avoid connection resets in case data is received after migration. + +Note that this configuration option is \fBdeprecated\fR and will be removed in a +future version. It is not expected to be of any use, and it simply reflects a +legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR +below. + +.TP .BR \-F ", " \-\-fd " " \fIFD Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened in the parent process and \fBpasst\fR inherits it when run as a child. This @@ -454,6 +478,11 @@ is closed. 
Quit after handling a single client connection, that is, once the client closes the socket, or once we get a socket error. +\fBNote\fR: this option has no effect after \fBpasst\fR completes a migration as +source, because, in that case, exiting would close sockets for active +connections, which would in turn cause connection resets if any further data is +received. See also the description of \fI\-\-migrate-no-linger\fR. + .TP .BR \-t ", " \-\-tcp-ports " " \fIspec Configure TCP port forwarding to guest. \fIspec\fR can be one of: @@ -68,7 +68,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket", [EPOLL_TYPE_TCP_TIMER] = "TCP timer", [EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket", - [EPOLL_TYPE_UDP_REPLY] = "UDP reply socket", + [EPOLL_TYPE_UDP] = "UDP flow socket", [EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket", [EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch", [EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch", @@ -170,6 +170,7 @@ static void exit_handler(int signal) { (void)signal; + fsync_pcap_and_log(); _exit(EXIT_SUCCESS); } @@ -191,7 +192,6 @@ int main(int argc, char **argv) { struct epoll_event events[EPOLL_EVENTS]; int nfds, i, devnull_fd = -1; - char argv0[PATH_MAX], *name; struct ctx c = { 0 }; struct rlimit limit; struct timespec now; @@ -213,27 +213,18 @@ int main(int argc, char **argv) sigaction(SIGTERM, &sa, NULL); sigaction(SIGQUIT, &sa, NULL); - if (argc < 1) - _exit(EXIT_FAILURE); + c.mode = conf_mode(argc, argv); - strncpy(argv0, argv[0], PATH_MAX - 1); - name = basename(argv0); - if (strstr(name, "pasta")) { + if (c.mode == MODE_PASTA) { sa.sa_handler = pasta_child_handler; if (sigaction(SIGCHLD, &sa, NULL)) die_perror("Couldn't install signal handlers"); - - c.mode = MODE_PASTA; - } else if (strstr(name, "passt")) { - c.mode = MODE_PASST; - } else { - _exit(EXIT_FAILURE); } if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) die_perror("Couldn't set disposition for SIGPIPE"); - madvise(pkt_buf, TAP_BUF_BYTES, 
MADV_HUGEPAGE); + madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE); c.epollfd = epoll_create1(EPOLL_CLOEXEC); if (c.epollfd == -1) @@ -349,8 +340,8 @@ loop: case EPOLL_TYPE_UDP_LISTEN: udp_listen_sock_handler(&c, ref, eventmask, &now); break; - case EPOLL_TYPE_UDP_REPLY: - udp_reply_sock_handler(&c, ref, eventmask, &now); + case EPOLL_TYPE_UDP: + udp_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); @@ -69,12 +69,9 @@ union epoll_ref { static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), "epoll_ref must have same size as epoll_data"); -#define TAP_BUF_BYTES \ - ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE) -#define TAP_MSGS \ - DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) +/* Large enough for ~128 maximum size frames */ +#define PKT_BUF_BYTES (8UL << 20) -#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0) extern char pkt_buf [PKT_BUF_BYTES]; extern char *epoll_type_str[]; @@ -211,7 +208,7 @@ struct ip6_ctx { * @guest_mac: MAC address of guest or namespace, seen or configured * @hash_secret: 128-bit secret for siphash functions * @ifi4: Template interface for IPv4, -1: none, 0: IPv4 disabled - * @ip: IPv4 configuration + * @ip4: IPv4 configuration * @dns_search: DNS search list * @hostname: Guest hostname * @fqdn: Guest FQDN @@ -244,6 +241,8 @@ struct ip6_ctx { * @device_state_fd: Device state migration channel * @device_state_result: Device state migration result * @migrate_target: Are we the target, on the next migration request? 
+ * @migrate_no_linger: Close sockets as we migrate them + * @migrate_exit: Exit (on source) once migration is complete */ struct ctx { enum passt_modes mode; @@ -321,6 +320,8 @@ struct ctx { int device_state_fd; int device_state_result; bool migrate_target; + bool migrate_no_linger; + bool migrate_exit; }; void proto_update_l2_buf(const unsigned char *eth_d, @@ -57,21 +57,21 @@ int pasta_child_pid; /** * pasta_child_handler() - Exit once shell exits (if we started it), reap clones - * @signal: Unused, handler deals with SIGCHLD only + * @signal: Signal number; this handler deals with SIGCHLD only */ void pasta_child_handler(int signal) { int errno_save = errno; siginfo_t infop; - (void)signal; - if (signal != SIGCHLD) return; if (pasta_child_pid && !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) { if (infop.si_pid == pasta_child_pid) { + fsync_pcap_and_log(); + if (infop.si_code == CLD_EXITED) _exit(infop.si_status); @@ -498,17 +498,23 @@ void pasta_netns_quit_init(const struct ctx *c) */ void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd) { - char buf[sizeof(struct inotify_event) + NAME_MAX + 1]; - const struct inotify_event *in_ev = (struct inotify_event *)buf; + char buf[sizeof(struct inotify_event) + NAME_MAX + 1] + __attribute__ ((aligned(__alignof__(struct inotify_event)))); + const struct inotify_event *ev; + ssize_t n; + char *p; - if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev)) + if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev)) return; - if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base))) - return; + for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) { + ev = (const struct inotify_event *)p; - info("Namespace %s is gone, exiting", c->netns_base); - _exit(EXIT_SUCCESS); + if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) { + info("Namespace %s is gone, exiting", c->netns_base); + _exit(EXIT_SUCCESS); + } + } } /** @@ -33,32 +33,11 @@ #include "log.h" #include 
"pcap.h" #include "iov.h" +#include "tap.h" #define PCAP_VERSION_MINOR 4 -static int pcap_fd = -1; - -/* See pcap.h from libpcap, or pcap-savefile(5) */ -static const struct { - uint32_t magic; -#define PCAP_MAGIC 0xa1b2c3d4 - - uint16_t major; -#define PCAP_VERSION_MAJOR 2 - - uint16_t minor; -#define PCAP_VERSION_MINOR 4 - - int32_t thiszone; - uint32_t sigfigs; - uint32_t snaplen; - - uint32_t linktype; -#define PCAP_LINKTYPE_ETHERNET 1 -} pcap_hdr = { - PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU, - PCAP_LINKTYPE_ETHERNET -}; +int pcap_fd = -1; struct pcap_pkthdr { uint32_t tv_sec; @@ -73,8 +52,6 @@ struct pcap_pkthdr { * @iovcnt: Number of buffers (@iov entries) in frame * @offset: Byte offset of the L2 headers within @iov * @now: Timestamp - * - * Returns: 0 on success, -errno on error writing to the file */ static void pcap_frame(const struct iovec *iov, size_t iovcnt, size_t offset, const struct timespec *now) @@ -97,6 +74,7 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt, * @pkt: Pointer to data buffer, including L2 headers * @l2len: L2 frame length */ +/* cppcheck-suppress unusedFunction */ void pcap(const char *pkt, size_t l2len) { struct iovec iov = { (char *)pkt, l2len }; @@ -134,10 +112,9 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, pcap_frame(iov + i * frame_parts, frame_parts, offset, &now); } -/* - * pcap_iov - Write packet data described by an I/O vector +/** + * pcap_iov() - Write packet data described by an I/O vector * to a pcap file descriptor. 
- * * @iov: Pointer to the array of struct iovec describing the I/O vector * containing packet data to write, including L2 header * @iovcnt: Number of buffers (@iov entries) @@ -162,6 +139,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) */ void pcap_init(struct ctx *c) { + /* See pcap.h from libpcap, or pcap-savefile(5) */ +#define PCAP_MAGIC 0xa1b2c3d4 +#define PCAP_VERSION_MAJOR 2 +#define PCAP_VERSION_MINOR 4 +#define PCAP_LINKTYPE_ETHERNET 1 + const struct { + uint32_t magic; + uint16_t major; + uint16_t minor; + + int32_t thiszone; + uint32_t sigfigs; + uint32_t snaplen; + + uint32_t linktype; + } pcap_hdr = { + .magic = PCAP_MAGIC, + .major = PCAP_VERSION_MAJOR, + .minor = PCAP_VERSION_MINOR, + .snaplen = tap_l2_max_len(c), + .linktype = PCAP_LINKTYPE_ETHERNET + }; + if (pcap_fd != -1) return; @@ -6,6 +6,8 @@ #ifndef PCAP_H #define PCAP_H +extern int pcap_fd; + void pcap(const char *pkt, size_t l2len); void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, size_t offset); @@ -27,6 +27,10 @@ #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ +/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */ +#define REPAIR_ACCEPT_TIMEOUT_MS 10 +#define REPAIR_ACCEPT_TIMEOUT_US (REPAIR_ACCEPT_TIMEOUT_MS * 1000) + /* Pending file descriptors for next repair_flush() call, or command change */ static int repair_fds[SCM_MAX_FD]; @@ -64,18 +68,21 @@ void repair_sock_init(const struct ctx *c) * repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket * @c: Execution context * @events: epoll events + * + * Return: 0 on valid event with new connected socket, error code on failure */ -void repair_listen_handler(struct ctx *c, uint32_t events) +int repair_listen_handler(struct ctx *c, uint32_t events) { union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR }; struct epoll_event ev = { 0 }; struct ucred ucred; socklen_t len; + int rc; if (events != EPOLLIN) { 
debug("Spurious event 0x%04x on TCP_REPAIR helper socket", events); - return; + return EINVAL; } len = sizeof(ucred); @@ -86,18 +93,19 @@ void repair_listen_handler(struct ctx *c, uint32_t events) SOCK_NONBLOCK); if (discard == -1) - return; + return errno; if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len)) info("Discarding TCP_REPAIR helper, PID %i", ucred.pid); close(discard); - return; + return EEXIST; } if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) { + rc = errno; debug_perror("accept4() on TCP_REPAIR helper listening socket"); - return; + return rc; } if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len)) @@ -107,10 +115,14 @@ void repair_listen_handler(struct ctx *c, uint32_t events) ev.events = EPOLLHUP | EPOLLET; ev.data.u64 = ref.u64; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) { + rc = errno; debug_perror("epoll_ctl() on TCP_REPAIR helper socket"); close(c->fd_repair); c->fd_repair = -1; + return rc; } + + return 0; } /** @@ -139,6 +151,44 @@ void repair_handler(struct ctx *c, uint32_t events) } /** + * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect + * @c: Execution context + * + * Return: 0 on success or if already connected, error code on failure + */ +int repair_wait(struct ctx *c) +{ + struct timeval tv = { .tv_sec = 0, + .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) }; + int rc; + + static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000, + ".tv_usec is greater than 1000 * 1000"); + + if (c->fd_repair >= 0) + return 0; + + if (c->fd_repair_listen == -1) + return ENOENT; + + if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO, + &tv, sizeof(tv))) { + rc = errno; + err_perror("Set timeout on TCP_REPAIR listening socket"); + return rc; + } + + rc = repair_listen_handler(c, EPOLLIN); + + tv.tv_usec = 0; + if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO, + &tv, sizeof(tv))) + err_perror("Clear timeout on TCP_REPAIR listening socket"); + + return rc; 
+} + +/** * repair_flush() - Flush current set of sockets to helper, with current command * @c: Execution context * @@ -7,9 +7,10 @@ #define REPAIR_H void repair_sock_init(const struct ctx *c); -void repair_listen_handler(struct ctx *c, uint32_t events); +int repair_listen_handler(struct ctx *c, uint32_t events); void repair_handler(struct ctx *c, uint32_t events); void repair_close(struct ctx *c); +int repair_wait(struct ctx *c); int repair_flush(struct ctx *c); int repair_set(struct ctx *c, int s, int cmd); @@ -99,7 +99,7 @@ static inline void siphash_feed(struct siphash_state *state, uint64_t in) } /** - * siphash_final - Finalize SipHash calculations + * siphash_final() - Finalize SipHash calculations * @v: siphash state (4 x 64-bit integers) * @len: Total length of input data * @tail: Final data for the hash (<= 7 bytes) @@ -62,14 +62,67 @@ #include "vhost_user.h" #include "vu_common.h" +/* Maximum allowed frame lengths (including L2 header) */ + +/* Verify that an L2 frame length limit is large enough to contain the header, + * but small enough to fit in the packet pool + */ +#define CHECK_FRAME_LEN(len) \ + static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \ + #len " has bad value") + +CHECK_FRAME_LEN(L2_MAX_LEN_PASTA); +CHECK_FRAME_LEN(L2_MAX_LEN_PASST); +CHECK_FRAME_LEN(L2_MAX_LEN_VU); + +/* We try size the packet pools so that we can use a single batch for the entire + * packet buffer. This might be exceeded for vhost-user, though, which uses its + * own buffers rather than pkt_buf. + * + * This is just a tuning parameter, the code will work with slightly more + * overhead if it's incorrect. So, we estimate based on the minimum practical + * frame size - an empty UDP datagram - rather than the minimum theoretical + * frame size. 
+ * + * FIXME: Profile to work out how big this actually needs to be to amortise + * per-batch syscall overheads + */ +#define TAP_MSGS_IP4 \ + DIV_ROUND_UP(sizeof(pkt_buf), \ + ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr)) +#define TAP_MSGS_IP6 \ + DIV_ROUND_UP(sizeof(pkt_buf), \ + ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr)) + /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ -static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); -static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); +static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4); +static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ /** + * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode + * @c: Execution context + */ +unsigned long tap_l2_max_len(const struct ctx *c) +{ + /* NOLINTBEGIN(bugprone-branch-clone): values can be the same */ + switch (c->mode) { + case MODE_PASST: + return L2_MAX_LEN_PASST; + case MODE_PASTA: + return L2_MAX_LEN_PASTA; + case MODE_VU: + return L2_MAX_LEN_VU; + } + /* NOLINTEND(bugprone-branch-clone) */ + ASSERT(0); + + return 0; /* Unreachable, for cppcheck's sake */ +} + +/** * tap_send_single() - Send a single frame * @c: Execution context * @data: Packet buffer @@ -502,12 +555,13 @@ void eth_update_mac(struct ethhdr *eh, memcpy(eh->h_source, eth_s, sizeof(eh->h_source)); } -PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); +PACKET_POOL_DECL(pool_l4, UIO_MAXIOV); /** * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4 * @msgs: Count of messages in sequence * @protocol: Protocol number + * @ttl: Time to live * @source: Source port * @dest: Destination port * @saddr: Source address @@ -516,6 +570,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); */ static struct tap4_l4_t { uint8_t protocol; + uint8_t ttl; uint16_t source; uint16_t dest; @@ 
-535,6 +590,7 @@ static struct tap4_l4_t { * @dest: Destination port * @saddr: Source address * @daddr: Destination address + * @hop_limit: Hop limit * @msg: Array of messages that can be handled in a single call */ static struct tap6_l4_t { @@ -547,6 +603,8 @@ static struct tap6_l4_t { struct in6_addr saddr; struct in6_addr daddr; + uint8_t hop_limit; + struct pool_l4_t p; } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */]; @@ -648,26 +706,31 @@ static int tap4_handler(struct ctx *c, const struct pool *in, i = 0; resume: for (seq_count = 0, seq = NULL; i < in->count; i++) { - size_t l2len, l3len, hlen, l4len; + size_t l3len, hlen, l4len; + struct ethhdr eh_storage; + struct iphdr iph_storage; + struct udphdr uh_storage; const struct ethhdr *eh; const struct udphdr *uh; + struct iov_tail data; struct iphdr *iph; - const char *l4h; - packet_get(in, i, 0, 0, &l2len); + if (!packet_get(in, i, &data)) + continue; - eh = packet_get(in, i, 0, sizeof(*eh), &l3len); + eh = IOV_PEEK_HEADER(&data, eh_storage); if (!eh) continue; if (ntohs(eh->h_proto) == ETH_P_ARP) { - PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); - - packet_add(pkt, l2len, (char *)eh); - arp(c, pkt); + arp(c, &data); continue; } - iph = packet_get(in, i, sizeof(*eh), sizeof(*iph), NULL); + if (!iov_drop_header(&data, sizeof(*eh))) + continue; + l3len = iov_tail_size(&data); + + iph = IOV_PEEK_HEADER(&data, iph_storage); if (!iph) continue; @@ -695,34 +758,32 @@ resume: if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr) c->ip4.addr_seen.s_addr = iph->saddr; - l4h = packet_get(in, i, sizeof(*eh) + hlen, l4len, NULL); - if (!l4h) + if (!iov_drop_header(&data, hlen)) + continue; + if (iov_tail_size(&data) != l4len) continue; if (iph->protocol == IPPROTO_ICMP) { - PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); - if (c->no_icmp) continue; tap_packet_debug(iph, NULL, NULL, 0, NULL, 1); - packet_add(pkt, l4len, l4h); icmp_tap_handler(c, PIF_TAP, AF_INET, &iph->saddr, &iph->daddr, - pkt, 
now); + &data, now); continue; } - uh = packet_get(in, i, sizeof(*eh) + hlen, sizeof(*uh), NULL); + uh = IOV_PEEK_HEADER(&data, uh_storage); if (!uh) continue; if (iph->protocol == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); + struct iov_tail eh_data; - packet_add(pkt, l2len, (char *)eh); - if (dhcp(c, pkt)) + packet_get(in, i, &eh_data); + if (dhcp(c, &eh_data)) continue; } @@ -735,7 +796,8 @@ resume: #define L4_MATCH(iph, uh, seq) \ ((seq)->protocol == (iph)->protocol && \ (seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \ - (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr) + (seq)->saddr.s_addr == (iph)->saddr && \ + (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl) #define L4_SET(iph, uh, seq) \ do { \ @@ -744,6 +806,7 @@ resume: (seq)->dest = (uh)->dest; \ (seq)->saddr.s_addr = (iph)->saddr; \ (seq)->daddr.s_addr = (iph)->daddr; \ + (seq)->ttl = (iph)->ttl; \ } while (0) if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV) @@ -770,7 +833,7 @@ resume: #undef L4_SET append: - packet_add((struct pool *)&seq->p, l4len, l4h); + packet_add((struct pool *)&seq->p, &data); } for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) { @@ -792,7 +855,7 @@ append: for (k = 0; k < p->count; ) k += udp_tap_handler(c, PIF_TAP, AF_INET, &seq->saddr, &seq->daddr, - p, k, now); + seq->ttl, p, k, now); } } @@ -824,20 +887,28 @@ resume: for (seq_count = 0, seq = NULL; i < in->count; i++) { size_t l4len, plen, check; struct in6_addr *saddr, *daddr; + struct ipv6hdr ip6h_storage; + struct ethhdr eh_storage; + struct udphdr uh_storage; const struct ethhdr *eh; const struct udphdr *uh; + struct iov_tail data; struct ipv6hdr *ip6h; uint8_t proto; - char *l4h; - eh = packet_get(in, i, 0, sizeof(*eh), NULL); + if (!packet_get(in, i, &data)) + return -1; + + eh = IOV_REMOVE_HEADER(&data, eh_storage); if (!eh) continue; - ip6h = packet_get(in, i, sizeof(*eh), sizeof(*ip6h), &check); + ip6h = 
IOV_PEEK_HEADER(&data, ip6h_storage); if (!ip6h) continue; + check = iov_tail_size(&data) - sizeof(*ip6h); + saddr = &ip6h->saddr; daddr = &ip6h->daddr; @@ -845,7 +916,7 @@ resume: if (plen != check) continue; - if (!(l4h = ipv6_l4hdr(in, i, sizeof(*eh), &proto, &l4len))) + if (!ipv6_l4hdr(&data, &proto, &l4len)) continue; if (IN6_IS_ADDR_LOOPBACK(saddr) || IN6_IS_ADDR_LOOPBACK(daddr)) { @@ -871,7 +942,7 @@ resume: } if (proto == IPPROTO_ICMPV6) { - PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); + struct iov_tail ndp_data; if (c->no_icmp) continue; @@ -879,28 +950,27 @@ resume: if (l4len < sizeof(struct icmp6hdr)) continue; - packet_add(pkt, l4len, l4h); - - if (ndp(c, (struct icmp6hdr *)l4h, saddr, pkt)) + ndp_data = data; + if (ndp(c, saddr, &ndp_data)) continue; tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); icmp_tap_handler(c, PIF_TAP, AF_INET6, - saddr, daddr, pkt, now); + saddr, daddr, &data, now); continue; } if (l4len < sizeof(*uh)) continue; - uh = (struct udphdr *)l4h; + uh = IOV_PEEK_HEADER(&data, uh_storage); + if (!uh) + continue; if (proto == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); - - packet_add(pkt, l4len, l4h); + struct iov_tail uh_data = data; - if (dhcpv6(c, pkt, saddr, daddr)) + if (dhcpv6(c, &uh_data, saddr, daddr)) continue; } @@ -915,7 +985,8 @@ resume: (seq)->dest == (uh)->dest && \ (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \ IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \ - IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)) + IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \ + (seq)->hop_limit == (ip6h)->hop_limit) #define L4_SET(ip6h, proto, uh, seq) \ do { \ @@ -925,6 +996,7 @@ resume: (seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \ (seq)->saddr = *saddr; \ (seq)->daddr = *daddr; \ + (seq)->hop_limit = (ip6h)->hop_limit; \ } while (0) if (seq && L4_MATCH(ip6h, proto, uh, seq) && @@ -952,7 +1024,7 @@ resume: #undef L4_SET append: - packet_add((struct pool *)&seq->p, l4len, l4h); + packet_add((struct pool *)&seq->p, &data); } for 
(j = 0, seq = tap6_l4; j < seq_count; j++, seq++) { @@ -975,7 +1047,7 @@ append: for (k = 0; k < p->count; ) k += udp_tap_handler(c, PIF_TAP, AF_INET6, &seq->saddr, &seq->daddr, - p, k, now); + seq->hop_limit, p, k, now); } } @@ -1008,16 +1080,20 @@ void tap_handler(struct ctx *c, const struct timespec *now) /** * tap_add_packet() - Queue/capture packet, update notion of guest MAC address * @c: Execution context - * @l2len: Total L2 packet length - * @p: Packet buffer + * @data: Packet to add to the pool + * @now: Current timestamp */ -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) +void tap_add_packet(struct ctx *c, struct iov_tail *data, + const struct timespec *now) { + struct ethhdr eh_storage; const struct ethhdr *eh; - pcap(p, l2len); + pcap_iov(data->iov, data->cnt, data->off); - eh = (struct ethhdr *)p; + eh = IOV_PEEK_HEADER(data, eh_storage); + if (!eh) + return; if (memcmp(c->guest_mac, eh->h_source, ETH_ALEN)) { memcpy(c->guest_mac, eh->h_source, ETH_ALEN); @@ -1027,10 +1103,18 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) switch (ntohs(eh->h_proto)) { case ETH_P_ARP: case ETH_P_IP: - packet_add(pool_tap4, l2len, p); + if (!pool_can_fit(pool_tap4, data)) { + tap4_handler(c, pool_tap4, now); + pool_flush(pool_tap4); + } + packet_add(pool_tap4, data); break; case ETH_P_IPV6: - packet_add(pool_tap6, l2len, p); + if (!pool_can_fit(pool_tap6, data)) { + tap6_handler(c, pool_tap6, now); + pool_flush(pool_tap6); + } + packet_add(pool_tap6, data); break; default: break; @@ -1045,8 +1129,10 @@ void tap_sock_reset(struct ctx *c) { info("Client connection closed%s", c->one_off ? 
", exiting" : ""); - if (c->one_off) + if (c->one_off) { + fsync_pcap_and_log(); _exit(EXIT_SUCCESS); + } /* Close the connected socket, wait for a new connection */ epoll_del(c, c->fd_tap); @@ -1080,7 +1166,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) do { n = recv(c->fd_tap, pkt_buf + partial_len, - TAP_BUF_BYTES - partial_len, MSG_DONTWAIT); + sizeof(pkt_buf) - partial_len, MSG_DONTWAIT); } while ((n < 0) && errno == EINTR); if (n < 0) { @@ -1096,8 +1182,9 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) while (n >= (ssize_t)sizeof(uint32_t)) { uint32_t l2len = ntohl_unaligned(p); + struct iov_tail data; - if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) { + if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) { err("Bad frame size from guest, resetting connection"); tap_sock_reset(c); return; @@ -1110,7 +1197,8 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) p += sizeof(uint32_t); n -= sizeof(uint32_t); - tap_add_packet(c, l2len, p); + data = IOV_TAIL_FROM_BUF(p, l2len, 0); + tap_add_packet(c, &data, now); p += l2len; n -= l2len; @@ -1151,8 +1239,12 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) tap_flush_pools(); - for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) { - len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); + for (n = 0; + n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA); + n += len) { + struct iov_tail data; + + len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA); if (len == 0) { die("EOF on tap device, exiting"); @@ -1170,10 +1262,11 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) /* Ignore frames of bad length */ if (len < (ssize_t)sizeof(struct ethhdr) || - len > (ssize_t)ETH_MAX_MTU) + len > (ssize_t)L2_MAX_LEN_PASTA) continue; - tap_add_packet(c, len, pkt_buf + n); + data = IOV_TAIL_FROM_BUF(pkt_buf + n, len, 0); + tap_add_packet(c, &data, now); } tap_handler(c, now); @@ 
-1367,12 +1460,12 @@ static void tap_sock_tun_init(struct ctx *c) * @base: Buffer base * @size Buffer size */ -void tap_sock_update_pool(void *base, size_t size) +static void tap_sock_update_pool(void *base, size_t size) { int i; - pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size); - pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size); + pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size); + pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size); for (i = 0; i < TAP_SEQS; i++) { tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size); @@ -1388,8 +1481,8 @@ void tap_sock_update_pool(void *base, size_t size) void tap_backend_init(struct ctx *c) { if (c->mode == MODE_VU) { - tap_sock_update_pool(NULL, 0); vu_init(c); + tap_sock_update_pool(&c->vdev->memory, 0); } else { tap_sock_update_pool(pkt_buf, sizeof(pkt_buf)); } @@ -6,6 +6,31 @@ #ifndef TAP_H #define TAP_H +/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header) + * + * The kernel tuntap device imposes a maximum frame size of 65535 including + * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode). + */ +#define L2_MAX_LEN_PASTA USHRT_MAX + +/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header) + * + * The only structural limit the QEMU socket protocol imposes on frames is + * (2^32-1) bytes, but that would be ludicrously long in practice. For now, + * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate + * limit with more precision. + */ +#define L2_MAX_LEN_PASST USHRT_MAX + +/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header) + * + * vhost-user allows multiple buffers per frame, each of which can be quite + * large, so the inherent frame size limit is rather large. Much larger than is + * actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME: + * Work out an appropriate limit with more precision. 
+ */ +#define L2_MAX_LEN_VU USHRT_MAX + struct udphdr; /** @@ -21,8 +46,8 @@ struct tap_hdr { * @c: Execution context * @taph: Pointer to tap specific header buffer * - * Returns: A struct iovec covering the correct portion of @taph to use as the - * tap specific header in the current configuration. + * Return: a struct iovec covering the correct portion of @taph to use as the + * tap specific header in the current configuration. */ static inline struct iovec tap_hdr_iov(const struct ctx *c, struct tap_hdr *thdr) @@ -44,6 +69,7 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = htonl(l2len); } +unsigned long tap_l2_max_len(const struct ctx *c); void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, struct in_addr dst, size_t l4len, uint8_t proto); @@ -89,10 +115,9 @@ void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now); int tap_sock_unix_open(char *sock_path); void tap_sock_reset(struct ctx *c); -void tap_sock_update_pool(void *base, size_t size); void tap_backend_init(struct ctx *c); void tap_flush_pools(void); void tap_handler(struct ctx *c, const struct timespec *now); -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p); - +void tap_add_packet(struct ctx *c, struct iov_tail *data, + const struct timespec *now); #endif /* TAP_H */ @@ -310,6 +310,16 @@ #include "tcp_buf.h" #include "tcp_vu.h" +/* + * The size of TCP header (including options) is given by doff (Data Offset) + * that is a 4-bit value specifying the number of 32-bit words in the header. + * The maximum value of doff is 15 [(1 << 4) - 1]. + * The maximum length in bytes of options is 15 minus the number of 32-bit + * words in the minimal TCP header (5) multiplied by the length of a 32-bit + * word (4). 
+ */ +#define OPTLEN_MAX (((1UL << 4) - 1 - 5) * 4UL) + #ifndef __USE_MISC /* From Linux UAPI, missing in netinet/tcp.h provided by musl */ struct tcp_repair_opt { @@ -389,7 +399,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; -char tcp_buf_discard [MAX_WINDOW]; +char tcp_buf_discard [BUF_DISCARD_SIZE]; /* Does the kernel support TCP_PEEK_OFF? */ bool peek_offset_cap; @@ -434,19 +444,20 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx) } /** - * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported - * @s: Socket to update + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported + * @conn: Pointer to the TCP connection structure * @offset: Offset in bytes * - * Return: -1 when it fails, 0 otherwise. + * Return: -1 when it fails, 0 otherwise. */ -int tcp_set_peek_offset(int s, int offset) +int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset) { if (!peek_offset_cap) return 0; - if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) { - err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s); + if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF, + &offset, sizeof(offset))) { + flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset); return -1; } return 0; @@ -455,7 +466,7 @@ int tcp_set_peek_offset(int s, int offset) /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events - * @conn_flags Connection flags + * @conn_flags: Connection flags * * Return: epoll events mask corresponding to implied connection state */ @@ -1078,7 +1089,7 @@ out: * tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters * @c: Execution context * @conn: Connection pointer - * @seq Current ACK sequence, host order + * @seq: Current ACK sequence, host order */ static void tcp_update_seqack_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, uint32_t seq) @@ 
-1097,12 +1108,32 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, } /** + * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK + * @c: Execution context + * @conn: Connection pointer + * + * Return: 0 on success, -1 on failure, with connection reset + */ +static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn) +{ + conn->seq_to_tap = conn->seq_ack_from_tap; + conn->events &= ~TAP_FIN_SENT; + + if (tcp_set_peek_offset(conn, 0)) { + tcp_rst(c, conn); + return -1; + } + + return 0; +} + +/** * tcp_prepare_flags() - Prepare header for flags-only segment (no payload) * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due * @th: TCP header to update - * @data: buffer to store TCP option + * @opts: TCP option buffer (output parameter) * @optlen: size of the TCP option buffer (output parameter) * * Return: < 0 error code on connection reset, @@ -1165,6 +1196,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, th->doff = (sizeof(*th) + *optlen) / 4; th->ack = !!(flags & ACK); + th->psh = !!(flags & PSH); th->rst = !!(flags & RST); th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); @@ -1236,30 +1268,41 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, /** * tcp_tap_window_update() - Process an updated window from tap side + * @c: Execution context * @conn: Connection pointer - * @window: Window value, host order, unscaled + * @wnd: Window value, host order, unscaled + * + * Return: false on zero window (not stored to wnd_from_tap), true otherwise */ -static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) +static bool tcp_tap_window_update(const struct ctx *c, + struct tcp_tap_conn *conn, unsigned wnd) { wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); /* Work-around for bug introduced in peer kernel code, commit - * e2142825c120 ("net: tcp: send zero-window ACK when no memory"). 
- * We don't update if window shrank to zero. + * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't + * update the window if it shrank to zero, so that we'll eventually + * retry to send data, but rewind the sequence as that obviously implies + * that no data beyond the updated window will be acknowledged. */ - if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) - return; + if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) { + tcp_rewind_seq(c, conn); + return false; + } conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); /* FIXME: reflect the tap-side receiver's window back to the sock-side * sender by adjusting SO_RCVBUF? */ + return true; } /** * tcp_init_seq() - Calculate initial sequence number according to RFC 6528 * @hash: Hash of connection details * @now: Current timestamp + * + * Return: the calculated 32-bit initial sequence number. */ static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now) { @@ -1316,7 +1359,7 @@ static int tcp_conn_new_sock(sa_family_t af) * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace * @af: Address family (AF_INET or AF_INET6) * - * Return: Socket fd on success, -errno on failure + * Return: socket fd on success, -errno on failure */ int tcp_conn_sock(sa_family_t af) { @@ -1545,9 +1588,8 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ sl = sizeof(sa); - if (!getsockname(s, &sa.sa, &sl)) - inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa); - else + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0) err_perror("Can't get local address for socket %i", s); } @@ -1611,6 +1653,23 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } /** + * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet + * @th: Pointer to TCP header + * @l4len: TCP packet length, including 
TCP header + * + * Return: data length of TCP packet, -1 on invalid value of Data Offset field + */ +static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len) +{ + size_t off = th->doff * 4UL; + + if (off < sizeof(*th) || off > l4len) + return -1; + + return l4len - off; +} + +/** * tcp_data_from_tap() - tap/guest data for established connection * @c: Execution context * @conn: Connection pointer @@ -1639,16 +1698,22 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, for (i = idx, iov_i = 0; i < (int)p->count; i++) { uint32_t seq, seq_offset, ack_seq; + struct tcphdr th_storage; const struct tcphdr *th; - char *data; - size_t off; + struct iov_tail data; + size_t off, size; + int count; - th = packet_get(p, i, 0, sizeof(*th), &len); + if (!packet_get(p, i, &data)) + return -1; + + th = IOV_PEEK_HEADER(&data, th_storage); if (!th) return -1; - len += sizeof(*th); + len = iov_tail_size(&data); off = th->doff * 4UL; + if (off < sizeof(*th) || off > len) return -1; @@ -1658,9 +1723,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, } len -= off; - data = packet_get(p, i, off, len, NULL); - if (!data) - continue; + iov_drop_header(&data, off); seq = ntohl(th->seq); if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) { @@ -1672,7 +1735,8 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp_timer_ctl(c, conn); if (p->count == 1) { - tcp_tap_window_update(conn, ntohs(th->window)); + tcp_tap_window_update(c, conn, + ntohs(th->window)); return 1; } @@ -1691,6 +1755,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, ack_seq == max_ack_seq && ntohs(th->window) == max_ack_seq_wnd; + /* See tcp_tap_window_update() for details. On + * top of that, we also need to check here if a + * zero-window update is contained in a batch of + * packets that includes a non-zero window as + * well. 
+ */ + if (!ntohs(th->window)) + tcp_rewind_seq(c, conn); + max_ack_seq_wnd = ntohs(th->window); max_ack_seq = ack_seq; } @@ -1734,10 +1807,14 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, continue; } - tcp_iov[iov_i].iov_base = data + seq_offset; - tcp_iov[iov_i].iov_len = len - seq_offset; - seq_from_tap += tcp_iov[iov_i].iov_len; - iov_i++; + iov_drop_header(&data, seq_offset); + size = len - seq_offset; + count = iov_tail_clone(&tcp_iov[iov_i], UIO_MAXIOV - iov_i, + &data); + if (count < 0) + break; + seq_from_tap += size; + iov_i += count; if (keep == i) keep = -1; @@ -1750,17 +1827,16 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, if (ack && !tcp_sock_consume(conn, max_ack_seq)) tcp_update_seqack_from_tap(c, conn, max_ack_seq); - tcp_tap_window_update(conn, max_ack_seq_wnd); + tcp_tap_window_update(c, conn, max_ack_seq_wnd); if (retr) { flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", - max_ack_seq, conn->seq_to_tap); - conn->seq_to_tap = max_ack_seq; - if (tcp_set_peek_offset(conn->sock, 0)) { - tcp_rst(c, conn); + conn->seq_ack_from_tap, conn->seq_to_tap); + + if (tcp_rewind_seq(c, conn)) return -1; - } + tcp_data_from_sock(c, conn); } @@ -1840,7 +1916,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, const struct tcphdr *th, const char *opts, size_t optlen) { - tcp_tap_window_update(conn, ntohs(th->window)); + tcp_tap_window_update(c, conn, ntohs(th->window)); tcp_get_tap_ws(conn, opts, optlen); /* First value is not scaled */ @@ -1854,7 +1930,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn->sock, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); return; } @@ -1955,8 +2031,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const struct pool *p, int idx, const struct timespec *now) { struct 
tcp_tap_conn *conn; + struct tcphdr th_storage; const struct tcphdr *th; - size_t optlen, len; + char optsc[OPTLEN_MAX]; + struct iov_tail data; + size_t optlen, l4len; const char *opts; union flow *flow; flow_sidx_t sidx; @@ -1965,15 +2044,19 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, (void)pif; - th = packet_get(p, idx, 0, sizeof(*th), &len); + if (!packet_get(p, idx, &data)) + return 1; + + l4len = iov_tail_size(&data); + + th = IOV_REMOVE_HEADER(&data, th_storage); if (!th) return 1; - len += sizeof(*th); optlen = th->doff * 4UL - sizeof(*th); /* Static checkers might fail to see this: */ - optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); - opts = packet_get(p, idx, sizeof(*th), optlen, NULL); + optlen = MIN(optlen, OPTLEN_MAX); + opts = (char *)iov_remove_header_(&data, &optsc[0], optlen, 1); sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr, ntohs(th->source), ntohs(th->dest)); @@ -1985,7 +2068,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, tcp_conn_from_tap(c, af, saddr, daddr, th, opts, optlen, now); else - tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, len); + tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, l4len); return 1; } @@ -1993,7 +2076,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, ASSERT(pif_at_sidx(sidx) == PIF_TAP); conn = &flow->tcp; - flow_trace(conn, "packet length %zu from tap", len); + flow_trace(conn, "packet length %zu from tap", l4len); if (th->rst) { conn_event(c, conn, CLOSED); @@ -2022,7 +2105,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn->sock, 0)) + if (tcp_set_peek_offset(conn, 0)) goto reset; if (th->fin) { @@ -2038,9 +2121,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (!th->ack) goto reset; - tcp_tap_window_update(conn, ntohs(th->window)); - - tcp_data_from_sock(c, 
conn); + if (tcp_tap_window_update(c, conn, ntohs(th->window))) + tcp_data_from_sock(c, conn); if (p->count - idx == 1) return 1; @@ -2048,13 +2130,38 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { - tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); - tcp_tap_window_update(conn, ntohs(th->window)); - tcp_data_from_sock(c, conn); + bool retr; - if (conn->events & SOCK_FIN_RCVD && - conn->seq_ack_from_tap == conn->seq_to_tap) - conn_event(c, conn, CLOSED); + retr = th->ack && !tcp_packet_data_len(th, l4len) && !th->fin && + ntohl(th->ack_seq) == conn->seq_ack_from_tap && + ntohs(th->window) == conn->wnd_from_tap; + + /* On socket flush failure, pretend there was no ACK, try again + * later + */ + if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq))) + tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + + if (retr) { + flow_trace(conn, + "fast re-transmit, ACK: %u, previous sequence: %u", + ntohl(th->ack_seq), conn->seq_to_tap); + + if (tcp_rewind_seq(c, conn)) + return -1; + } + + if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr) + tcp_data_from_sock(c, conn); + + if (conn->seq_ack_from_tap == conn->seq_to_tap) { + if (th->ack && conn->events & TAP_FIN_SENT) + conn_event(c, conn, TAP_FIN_ACKED); + + if (conn->events & SOCK_FIN_RCVD && + conn->events & TAP_FIN_ACKED) + conn_event(c, conn, CLOSED); + } return 1; } @@ -2199,12 +2306,11 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, * mode only, below. 
*/ ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, - ref.tcp_listen.port); + NULL, ref.tcp_listen.port); if (c->mode == MODE_VU) { /* Rebind to same address after migration */ - if (!getsockname(s, &sa.sa, &sl)) - inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa); - else + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0) err_perror("Can't get local address for socket %i", s); } @@ -2282,16 +2388,16 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) tcp_rst(c, conn); } else { flow_dbg(conn, "ACK timeout, retry"); - conn->retrans++; - conn->seq_to_tap = conn->seq_ack_from_tap; + if (!conn->wnd_from_tap) conn->wnd_from_tap = 1; /* Zero-window probe */ - if (tcp_set_peek_offset(conn->sock, 0)) { - tcp_rst(c, conn); - } else { - tcp_data_from_sock(c, conn); - tcp_timer_ctl(c, conn); - } + + conn->retrans++; + if (tcp_rewind_seq(c, conn)) + return; + + tcp_data_from_sock(c, conn); + tcp_timer_ctl(c, conn); } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -2335,7 +2441,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, return; } - if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) { + if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) { conn_event(c, conn, CLOSED); return; } @@ -2604,7 +2710,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) /** * tcp_probe_tcp_info() - Check what data TCP_INFO reports * - * Return: Number of bytes returned by TCP_INFO getsockopt() + * Return: number of bytes returned by TCP_INFO getsockopt() */ static socklen_t tcp_probe_tcp_info(void) { @@ -2810,20 +2916,21 @@ int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn) /** * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options - * @c: Execution context + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_dump_tinfo(int s, 
struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { struct tcp_info tinfo; socklen_t sl; sl = sizeof(tinfo); - if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { + if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) { int rc = -errno; - err_perror("Querying TCP_INFO, socket %i", s); + flow_perror(conn, "Querying TCP_INFO"); return rc; } @@ -2837,39 +2944,95 @@ static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t) /** * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG - * @c: Execution context + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { socklen_t sl = sizeof(t->mss); + int val; - if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) { + if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) { int rc = -errno; - err_perror("Getting MSS, socket %i", s); + flow_perror(conn, "Getting MSS"); return rc; } + t->mss = (uint32_t)val; + + return 0; +} + + +/** + * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data (tcpi_options must be populated) + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) +{ + int val = 0; + + if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) { + socklen_t sl = sizeof(val); + + if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) { + int rc = -errno; + flow_perror(conn, "Getting RFC 7323 timestamp"); + return rc; + } + } + + t->timestamp = (uint32_t)val; + return 0; +} + +/** + * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via 
TCP_TIMESTAMP + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn, + const struct tcp_tap_transfer_ext *t) +{ + int val = (int)t->timestamp; + + if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) { + if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, + &val, sizeof(val))) { + int rc = -errno; + flow_perror(conn, "Setting RFC 7323 timestamp"); + return rc; + } + } + return 0; } /** * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters - * @c: Execution context + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { struct tcp_repair_window wnd; socklen_t sl = sizeof(wnd); - if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) { + if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) { int rc = -errno; - err_perror("Getting window repair data, socket %i", s); + flow_perror(conn, "Getting window repair data"); return rc; } @@ -2893,12 +3056,13 @@ static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t) /** * tcp_flow_repair_wnd() - Restore window parameters from extended data - * @c: Execution context + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t) +static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn, + const struct tcp_tap_transfer_ext *t) { struct tcp_repair_window wnd; @@ -2908,9 +3072,10 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t) wnd.rcv_wnd = t->rcv_wnd; wnd.rcv_wup = t->rcv_wup; - if 
(setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd))) { + if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, + &wnd, sizeof(wnd))) { int rc = -errno; - err_perror("Setting window data, socket %i", s); + flow_perror(conn, "Setting window data"); return rc; } @@ -2919,16 +3084,17 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t) /** * tcp_flow_select_queue() - Select queue (receive or send) for next operation - * @s: Socket + * @conn: Connection to select queue for * @queue: TCP_RECV_QUEUE or TCP_SEND_QUEUE * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_select_queue(int s, int queue) +static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue) { - if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) { + if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE, + &queue, sizeof(queue))) { int rc = -errno; - err_perror("Selecting TCP_SEND_QUEUE, socket %i", s); + flow_perror(conn, "Selecting TCP_SEND_QUEUE"); return rc; } @@ -2937,26 +3103,28 @@ static int tcp_flow_select_queue(int s, int queue) /** * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data - * @s: Socket + * @conn: Connection to dump queue for * @t: Extended migration data * * Return: 0 on success, negative error code on failure * * #syscalls:vu ioctl */ -static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { + int s = conn->sock; ssize_t rc; if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) { rc = -errno; - err_perror("Getting send queue size, socket %i", s); + flow_perror(conn, "Getting send queue size"); return rc; } if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) { rc = -errno; - err_perror("Getting not sent count, socket %i", s); + flow_perror(conn, "Getting not sent count"); return rc; } @@ -2975,14 +3143,16 @@ static int tcp_flow_dump_sndqueue(int s, struct 
tcp_tap_transfer_ext *t) } if (t->notsent > t->sndq) { - err("Invalid notsent count socket %i, send: %u, not sent: %u", - s, t->sndq, t->notsent); + flow_err(conn, + "Invalid notsent count socket %i, send: %u, not sent: %u", + s, t->sndq, t->notsent); return -EINVAL; } if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) { - err("Send queue too large to migrate socket %i: %u bytes", - s, t->sndq); + flow_err(conn, + "Send queue too large to migrate socket %i: %u bytes", + s, t->sndq); return -ENOBUFS; } @@ -2993,13 +3163,13 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t) rc = 0; } else { rc = -errno; - err_perror("Can't read send queue, socket %i", s); + flow_perror(conn, "Can't read send queue"); return rc; } } if ((uint32_t)rc < t->sndq) { - err("Short read migrating send queue"); + flow_err(conn, "Short read migrating send queue"); return -ENXIO; } @@ -3010,19 +3180,20 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t) /** * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue - * @s: Socket + * @conn: Connection to repair queue for * @len: Length of data to be restored * @buf: Buffer with content of pending data queue * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf) +static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn, + size_t len, uint8_t *buf) { size_t chunk = len; uint8_t *p = buf; while (len > 0) { - ssize_t rc = send(s, p, MIN(len, chunk), 0); + ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0); if (rc < 0) { if ((errno == ENOBUFS || errno == ENOMEM) && @@ -3032,7 +3203,7 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf) } rc = -errno; - err_perror("Can't write queue, socket %i", s); + flow_perror(conn, "Can't write queue"); return rc; } @@ -3045,18 +3216,18 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf) /** * tcp_flow_dump_seq() - Dump current sequence of 
pre-selected queue - * @s: Socket + * @conn: Pointer to the TCP connection structure * @v: Sequence value, set on return * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_dump_seq(int s, uint32_t *v) +static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v) { socklen_t sl = sizeof(*v); - if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) { + if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) { int rc = -errno; - err_perror("Dumping sequence, socket %i", s); + flow_perror(conn, "Dumping sequence"); return rc; } @@ -3065,16 +3236,17 @@ static int tcp_flow_dump_seq(int s, uint32_t *v) /** * tcp_flow_repair_seq() - Restore sequence for pre-selected queue - * @s: Socket + * @conn: Connection to repair sequences for * @v: Sequence value to be set * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_repair_seq(int s, const uint32_t *v) +static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn, + const uint32_t *v) { - if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) { + if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) { int rc = -errno; - err_perror("Setting sequence, socket %i", s); + flow_perror(conn, "Setting sequence"); return rc; } @@ -3083,15 +3255,17 @@ static int tcp_flow_repair_seq(int s, const uint32_t *v) /** * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it - * @s: Socket + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure * * #syscalls:vu ioctl */ -static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) +static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) { + int s = conn->sock; ssize_t rc; if (ioctl(s, SIOCINQ, &t->rcvq) < 0) { @@ -3111,8 +3285,9 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) t->rcvq--; if (t->rcvq > 
TCP_MIGRATE_RCV_QUEUE_MAX) { - err("Receive queue too large to migrate socket %i: %u bytes", - s, t->rcvq); + flow_err(conn, + "Receive queue too large to migrate socket: %u bytes", + t->rcvq); return -ENOBUFS; } @@ -3122,13 +3297,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) rc = 0; } else { rc = -errno; - err_perror("Can't read receive queue for socket %i", s); + flow_perror(conn, "Can't read receive queue"); return rc; } } if ((uint32_t)rc < t->rcvq) { - err("Short read migrating receive queue"); + flow_err(conn, "Short read migrating receive queue"); return -ENXIO; } @@ -3137,12 +3312,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) /** * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps) - * @s: Socket + * @conn: Pointer to the TCP connection structure * @t: Extended migration data * * Return: 0 on success, negative error code on failure */ -static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t) +static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn, + const struct tcp_tap_transfer_ext *t) { const struct tcp_repair_opt opts[] = { { TCPOPT_WINDOW, t->snd_ws + (t->rcv_ws << 16) }, @@ -3156,9 +3332,9 @@ static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t) !!(t->tcpi_options & TCPI_OPT_SACK) + !!(t->tcpi_options & TCPI_OPT_TIMESTAMPS)); - if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) { + if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) { int rc = -errno; - err_perror("Setting repair options, socket %i", s); + flow_perror(conn, "Setting repair options"); return rc; } @@ -3214,12 +3390,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) /** * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data + * @c: Execution context * @fd: Descriptor for state migration * @conn: Pointer to the TCP connection structure * * Return: 0 on success, negative (not -EIO) on 
failure, -EIO on sending failure */ -int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) +int tcp_flow_migrate_source_ext(const struct ctx *c, + int fd, const struct tcp_tap_conn *conn) { uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)]; @@ -3229,39 +3407,45 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) /* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode * weird. */ - if (tcp_set_peek_offset(s, -1)) { + if (tcp_set_peek_offset(conn, -1)) { rc = -errno; goto fail; } - if ((rc = tcp_flow_dump_tinfo(s, t))) + if ((rc = tcp_flow_dump_tinfo(conn, t))) goto fail; - if ((rc = tcp_flow_dump_mss(s, t))) + if ((rc = tcp_flow_dump_mss(conn, t))) goto fail; - if ((rc = tcp_flow_dump_wnd(s, t))) + if ((rc = tcp_flow_dump_timestamp(conn, t))) goto fail; - if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE))) + if ((rc = tcp_flow_dump_wnd(conn, t))) goto fail; - if ((rc = tcp_flow_dump_sndqueue(s, t))) + if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE))) goto fail; - if ((rc = tcp_flow_dump_seq(s, &t->seq_snd))) + if ((rc = tcp_flow_dump_sndqueue(conn, t))) goto fail; - if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE))) + if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd))) goto fail; - if ((rc = tcp_flow_dump_rcvqueue(s, t))) + if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE))) goto fail; - if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv))) + if ((rc = tcp_flow_dump_rcvqueue(conn, t))) goto fail; - close(s); + if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv))) + goto fail; + + if (c->migrate_no_linger) + close(s); + else + epoll_del(c, s); /* Adjustments unrelated to FIN segments: sequence numbers we dumped are * based on the end of the queues. 
@@ -3269,14 +3453,14 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) t->seq_rcv -= t->rcvq; t->seq_snd -= t->sndq; - debug("Extended migration data, socket %i sequences send %u receive %u", - s, t->seq_snd, t->seq_rcv); - debug(" pending queues: send %u not sent %u receive %u", - t->sndq, t->notsent, t->rcvq); - debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", - t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup); - debug(" SO_PEEK_OFF %s offset=%"PRIu32, - peek_offset_cap ? "enabled" : "disabled", peek_offset); + flow_dbg(conn, "Extended migration data, socket %i sequences send %u receive %u", + s, t->seq_snd, t->seq_rcv); + flow_dbg(conn, " pending queues: send %u not sent %u receive %u", + t->sndq, t->notsent, t->rcvq); + flow_dbg(conn, " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup); + flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? 
"enabled" : "disabled", peek_offset); /* Endianness fix-ups */ t->seq_snd = htonl(t->seq_snd); @@ -3284,6 +3468,8 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) t->sndq = htonl(t->sndq); t->notsent = htonl(t->notsent); t->rcvq = htonl(t->rcvq); + t->mss = htonl(t->mss); + t->timestamp = htonl(t->timestamp); t->snd_wl1 = htonl(t->snd_wl1); t->snd_wnd = htonl(t->snd_wnd); @@ -3292,17 +3478,17 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) t->rcv_wup = htonl(t->rcv_wup); if (write_all_buf(fd, t, sizeof(*t))) { - err_perror("Failed to write extended data, socket %i", s); + flow_perror(conn, "Failed to write extended data"); return -EIO; } if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) { - err_perror("Failed to write send queue data, socket %i", s); + flow_perror(conn, "Failed to write send queue data"); return -EIO; } if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) { - err_perror("Failed to write receive queue data, socket %i", s); + flow_perror(conn, "Failed to write receive queue data"); return -EIO; } @@ -3317,7 +3503,7 @@ fail: t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */ if (write_all_buf(fd, t, sizeof(*t))) { - err_perror("Failed to write extended data, socket %i", s); + flow_perror(conn, "Failed to write extended data"); return -EIO; } @@ -3337,32 +3523,22 @@ fail: static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) { sa_family_t af = CONN_V4(conn) ? 
AF_INET : AF_INET6; - const struct flowside *sockside = HOSTFLOW(conn); - union sockaddr_inany a; - socklen_t sl; int s, rc; - pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport); - if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP)) < 0) { rc = -errno; - err_perror("Failed to create socket for migrated flow"); + flow_perror(conn, "Failed to create socket for migrated flow"); return rc; } s = conn->sock; if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int))) - debug_perror("Setting SO_REUSEADDR on socket %i", s); + flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i", + s); tcp_sock_set_nodelay(s); - if (bind(s, &a.sa, sizeof(a))) { - rc = -errno; - err_perror("Failed to bind socket for migrated flow"); - goto err; - } - if ((rc = tcp_flow_repair_on(c, conn))) goto err; @@ -3375,6 +3551,30 @@ err: } /** + * tcp_flow_repair_bind() - Bind socket in repair mode + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn) +{ + const struct flowside *sockside = HOSTFLOW(conn); + union sockaddr_inany a; + socklen_t sl; + + pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport); + + if (bind(conn->sock, &a.sa, sizeof(a))) { + int rc = -errno; + flow_perror(conn, "Failed to bind socket for migrated flow"); + return rc; + } + + return 0; +} + +/** * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off * @c: Execution context * @conn: Pointer to the TCP connection structure @@ -3390,7 +3590,7 @@ static int tcp_flow_repair_connect(const struct ctx *c, rc = flowside_connect(c, conn->sock, PIF_HOST, tgt); if (rc) { rc = -errno; - err_perror("Failed to connect migrated socket %i", conn->sock); + flow_perror(conn, "Failed to connect migrated socket"); return rc; } @@ -3421,8 +3621,8 @@ int 
tcp_flow_migrate_target(struct ctx *c, int fd) } if (read_all_buf(fd, &t, sizeof(t))) { + flow_perror(flow, "Failed to receive migration data"); flow_alloc_cancel(flow); - err_perror("Failed to receive migration data"); return -errno; } @@ -3481,7 +3681,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd if (read_all_buf(fd, &t, sizeof(t))) { rc = -errno; - err_perror("Failed to read extended data for socket %i", s); + flow_perror(conn, "Failed to read extended data"); return rc; } @@ -3496,6 +3696,8 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd t.sndq = ntohl(t.sndq); t.notsent = ntohl(t.notsent); t.rcvq = ntohl(t.rcvq); + t.mss = ntohl(t.mss); + t.timestamp = ntohl(t.timestamp); t.snd_wl1 = ntohl(t.snd_wl1); t.snd_wnd = ntohl(t.snd_wnd); @@ -3503,31 +3705,34 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd t.rcv_wnd = ntohl(t.rcv_wnd); t.rcv_wup = ntohl(t.rcv_wup); - debug("Extended migration data, socket %i sequences send %u receive %u", - s, t.seq_snd, t.seq_rcv); - debug(" pending queues: send %u not sent %u receive %u", - t.sndq, t.notsent, t.rcvq); - debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", - t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup); - debug(" SO_PEEK_OFF %s offset=%"PRIu32, - peek_offset_cap ? "enabled" : "disabled", peek_offset); + flow_dbg(conn, + "Extended migration data, socket %i sequences send %u receive %u", + s, t.seq_snd, t.seq_rcv); + flow_dbg(conn, " pending queues: send %u not sent %u receive %u", + t.sndq, t.notsent, t.rcvq); + flow_dbg(conn, + " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup); + flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? 
"enabled" : "disabled", peek_offset); if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq || t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) { - err("Bad queues socket %i, send: %u, not sent: %u, receive: %u", - s, t.sndq, t.notsent, t.rcvq); + flow_err(conn, + "Bad queues socket %i, send: %u, not sent: %u, receive: %u", + s, t.sndq, t.notsent, t.rcvq); return -EINVAL; } if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) { rc = -errno; - err_perror("Failed to read send queue data, socket %i", s); + flow_perror(conn, "Failed to read send queue data"); return rc; } if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) { rc = -errno; - err_perror("Failed to read receive queue data, socket %i", s); + flow_perror(conn, "Failed to read receive queue data"); return rc; } @@ -3535,32 +3740,38 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd /* We weren't able to create the socket, discard flow */ goto fail; - if (tcp_flow_select_queue(s, TCP_SEND_QUEUE)) + if (tcp_flow_repair_bind(c, conn)) + goto fail; + + if (tcp_flow_repair_timestamp(conn, &t)) goto fail; - if (tcp_flow_repair_seq(s, &t.seq_snd)) + if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE)) goto fail; - if (tcp_flow_select_queue(s, TCP_RECV_QUEUE)) + if (tcp_flow_repair_seq(conn, &t.seq_snd)) goto fail; - if (tcp_flow_repair_seq(s, &t.seq_rcv)) + if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE)) + goto fail; + + if (tcp_flow_repair_seq(conn, &t.seq_rcv)) goto fail; if (tcp_flow_repair_connect(c, conn)) goto fail; - if (tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue)) + if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue)) goto fail; - if (tcp_flow_select_queue(s, TCP_SEND_QUEUE)) + if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE)) goto fail; - if (tcp_flow_repair_queue(s, t.sndq - t.notsent, + if (tcp_flow_repair_queue(conn, t.sndq - t.notsent, tcp_migrate_snd_queue)) goto fail; - if (tcp_flow_repair_opt(s, &t)) + if (tcp_flow_repair_opt(conn, &t)) goto fail; /* If 
we sent a FIN sent and it was acknowledged (TCP_FIN_WAIT2), don't @@ -3575,19 +3786,19 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd v = TCP_SEND_QUEUE; if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) - debug_perror("Selecting repair queue, socket %i", s); + flow_perror(conn, "Selecting repair queue"); else shutdown(s, SHUT_WR); } - if (tcp_flow_repair_wnd(s, &t)) + if (tcp_flow_repair_wnd(conn, &t)) goto fail; tcp_flow_repair_off(c, conn); repair_flush(c); if (t.notsent) { - if (tcp_flow_repair_queue(s, t.notsent, + if (tcp_flow_repair_queue(conn, t.notsent, tcp_migrate_snd_queue + (t.sndq - t.notsent))) { /* This sometimes seems to fail for unclear reasons. @@ -3607,15 +3818,16 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd if (t.tcpi_state == TCP_FIN_WAIT1) shutdown(s, SHUT_WR); - if (tcp_set_peek_offset(conn->sock, peek_offset)) + if (tcp_set_peek_offset(conn, peek_offset)) goto fail; tcp_send_flag(c, conn, ACK); tcp_data_from_sock(c, conn); if ((rc = tcp_epoll_ctl(c, conn))) { - debug("Failed to subscribe to epoll for migrated socket %i: %s", - conn->sock, strerror_(-rc)); + flow_dbg(conn, + "Failed to subscribe to epoll for migrated socket: %s", + strerror_(-rc)); goto fail; } @@ -3632,3 +3844,67 @@ fail: return 0; } + +/** + * tcp_prepare_iov() - Prepare iov according to kernel capability + * @msg: Message header to update + * @iov: iovec to receive TCP payload and data to discard + * @already_sent: Bytes sent after the last acknowledged one + * @payload_iov_cnt: Number of TCP payload iovec entries + * + * Return: 0 on success, -1 if already_sent cannot be discarded fully + */ +int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov, + uint32_t already_sent, int payload_iov_cnt) +{ + /* + * IOV layout + * |- tcp_buf_discard -|---------- TCP data slots ------------| + * + * with discarded data: + * |------ddddddddddddd|ttttttttttttt-------------------------| + * ^ + * | 
+ * msg_iov + * + * without discarded data: + * |-------------------|ttttttttttttt-------------------------| + * ^ + * | + * msg_iov + * d: discard data + * t: TCP data + */ + if (peek_offset_cap) { + msg->msg_iov = iov + DISCARD_IOV_NUM; + msg->msg_iovlen = payload_iov_cnt; + } else { + int discard_cnt, discard_iov_rem; + struct iovec *iov_start; + int i; + + discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE); + if (discard_cnt > DISCARD_IOV_NUM) { + debug("Failed to discard %u already sent bytes", + already_sent); + return -1; + } + + discard_iov_rem = already_sent % BUF_DISCARD_SIZE; + + iov_start = iov + (DISCARD_IOV_NUM - discard_cnt); + + /* Multiple iov entries pointing to the same buffer */ + for (i = 0; i < discard_cnt; i++) { + iov_start[i].iov_base = tcp_buf_discard; + iov_start[i].iov_len = BUF_DISCARD_SIZE; + } + if (discard_iov_rem) + iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem; + + msg->msg_iov = iov_start; + msg->msg_iovlen = discard_cnt + payload_iov_cnt; + } + + return 0; +} @@ -25,7 +25,6 @@ void tcp_timer(struct ctx *c, const struct timespec *now); void tcp_defer_handler(struct ctx *c); void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); -int tcp_set_peek_offset(int s, int offset); extern bool peek_offset_cap; @@ -60,7 +60,7 @@ static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM]; static unsigned int tcp_payload_used; /* recvmsg()/sendmsg() data for tap */ -static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; +static struct iovec iov_sock [TCP_FRAMES_MEM + DISCARD_IOV_NUM]; static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS]; @@ -104,7 +104,7 @@ void tcp_sock_iov_init(const struct ctx *c) /** * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission - * @ctx: Execution context + * @c: Execution context * @conns: Array of connection pointers corresponding to queued frames * @frames: Two-dimensional array containing queued frames with sub-iovs * @num_frames: Number of 
entries in the two arrays to be compared @@ -125,7 +125,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, conn->seq_to_tap = seq; peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; - if (tcp_set_peek_offset(conn->sock, peek_offset)) + if (tcp_set_peek_offset(conn, peek_offset)) tcp_rst(c, conn); } } @@ -148,7 +148,7 @@ void tcp_payload_flush(const struct ctx *c) } /** - * tcp_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers + * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * @conn: Connection pointer * @iov: Pointer to an array of iovec of TCP pre-cooked buffers * @check: Checksum, if already known @@ -160,7 +160,7 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, uint32_t seq, bool no_tcp_csum) { struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0); - struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr); + struct tcphdr th_storage, *th = IOV_REMOVE_HEADER(&tail, th_storage); struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base; const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *a4 = inany_v4(&tapside->oaddr); @@ -209,13 +209,14 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) if (ret <= 0) return ret; - tcp_payload_used++; + tcp_frame_conns[tcp_payload_used++] = conn; l4len = optlen + sizeof(struct tcphdr); iov[TCP_IOV_PAYLOAD].iov_len = l4len; tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false); if (flags & DUP_ACK) { - struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++]; + struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used]; + tcp_frame_conns[tcp_payload_used++] = conn; memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_len); @@ -304,7 +305,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; - if 
(tcp_set_peek_offset(s, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); return -1; } @@ -326,15 +327,9 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) iov_rem = (wnd_scaled - already_sent) % mss; } - /* Prepare iov according to kernel capability */ - if (!peek_offset_cap) { - mh_sock.msg_iov = iov_sock; - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; - mh_sock.msg_iovlen = fill_bufs + 1; - } else { - mh_sock.msg_iov = &iov_sock[1]; - mh_sock.msg_iovlen = fill_bufs; + if (tcp_prepare_iov(&mh_sock, iov_sock, already_sent, fill_bufs)) { + tcp_rst(c, conn); + return -1; } if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) { @@ -344,12 +339,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) tcp_payload_used = 0; } - for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { + for (i = 0, iov = iov_sock + DISCARD_IOV_NUM; i < fill_bufs; i++, iov++) { iov->iov_base = &tcp_payload[tcp_payload_used + i].data; iov->iov_len = mss; } if (iov_rem) - iov_sock[fill_bufs].iov_len = iov_rem; + iov_sock[fill_bufs + DISCARD_IOV_NUM - 1].iov_len = iov_rem; /* Receive into buffers, don't dequeue until acknowledged by guest. 
*/ do @@ -369,7 +364,10 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } if (!len) { - if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + if (already_sent) { + conn_flag(c, conn, STALLED); + } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == + SOCK_FIN_RCVD) { int ret = tcp_buf_send_flag(c, conn, FIN | ACK); if (ret) { tcp_rst(c, conn); @@ -152,6 +152,7 @@ struct tcp_tap_transfer { * @notsent: Part of pending send queue that wasn't sent out yet * @rcvq: Length of pending receive queue * @mss: Socket-side MSS clamp + * @timestamp: RFC 7323 timestamp * @snd_wl1: Next sequence used in window probe (next sequence - 1) * @snd_wnd: Socket-side sending window * @max_window: Window clamp @@ -171,6 +172,7 @@ struct tcp_tap_transfer_ext { uint32_t rcvq; uint32_t mss; + uint32_t timestamp; /* We can't just use struct tcp_repair_window: we need network order */ uint32_t snd_wl1; @@ -234,7 +236,8 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn); int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn); int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn); -int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn); +int tcp_flow_migrate_source_ext(const struct ctx *c, int fd, + const struct tcp_tap_conn *conn); int tcp_flow_migrate_target(struct ctx *c, int fd); int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd); diff --git a/tcp_internal.h b/tcp_internal.h index 6f5e054..5cb6cba 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -9,6 +9,9 @@ #define MAX_WS 8 #define MAX_WINDOW (1 << (16 + (MAX_WS))) +#define BUF_DISCARD_SIZE (1 << 20) +#define DISCARD_IOV_NUM DIV_ROUND_UP(MAX_WINDOW, BUF_DISCARD_SIZE) + #define MSS4 ROUND_DOWN(IP_MAX_MTU - \ sizeof(struct tcphdr) - \ sizeof(struct iphdr), \ @@ -18,14 +21,19 @@ sizeof(struct ipv6hdr), \ sizeof(uint32_t)) -#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) -#define SEQ_LT(a, 
b) ((b) - (a) - 1 < MAX_WINDOW) -#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) -#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) +#define SEQ_LE(a, b) \ + ((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW) +#define SEQ_LT(a, b) \ + ((uint32_t)(b) - (uint32_t)(a) - 1 < MAX_WINDOW) +#define SEQ_GE(a, b) \ + ((uint32_t)(a) - (uint32_t)(b) < MAX_WINDOW) +#define SEQ_GT(a, b) \ + ((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW) #define FIN (1 << 0) #define SYN (1 << 1) #define RST (1 << 2) +#define PSH (1 << 3) #define ACK (1 << 4) /* Flags for internal usage */ @@ -138,7 +146,7 @@ struct tcp_syn_opts { .ws = TCP_OPT_WS(ws_), \ }) -extern char tcp_buf_discard [MAX_WINDOW]; +extern char tcp_buf_discard [BUF_DISCARD_SIZE]; void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, unsigned long flag); @@ -177,5 +185,8 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, size_t *optlen); +int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset); +int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov, + uint32_t already_sent, int payload_iov_cnt); #endif /* TCP_INTERNAL_H */ diff --git a/tcp_splice.c b/tcp_splice.c index 0d10e3d..26cb630 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -95,7 +95,7 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af); * conn_at_sidx() - Get spliced TCP connection specific flow at given sidx * @sidx: Flow and side to retrieve * - * Return: Spliced TCP connection at @sidx, or NULL of @sidx is invalid. + * Return: spliced TCP connection at @sidx, or NULL of @sidx is invalid. * Asserts if the flow at @sidx is not FLOW_TCP_SPLICE. 
*/ static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx) @@ -402,7 +402,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) * @c: Execution context * @af: Address family (AF_INET or AF_INET6) * - * Return: Socket fd in the namespace on success, -errno on failure + * Return: socket fd in the namespace on success, -errno on failure */ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af) { @@ -520,20 +520,21 @@ swap: int more = 0; retry: - readlen = splice(conn->s[fromsidei], NULL, - conn->pipe[fromsidei][1], NULL, - c->tcp.pipe_size, - SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + do + readlen = splice(conn->s[fromsidei], NULL, + conn->pipe[fromsidei][1], NULL, + c->tcp.pipe_size, + SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + while (readlen < 0 && errno == EINTR); + + if (readlen < 0 && errno != EAGAIN) + goto close; + flow_trace(conn, "%zi from read-side call", readlen); - if (readlen < 0) { - if (errno == EINTR) - goto retry; - if (errno != EAGAIN) - goto close; - } else if (!readlen) { + if (!readlen) { eof = 1; - } else { + } else if (readlen > 0) { never_read = 0; if (readlen >= (long)c->tcp.pipe_size * 90 / 100) @@ -543,10 +544,16 @@ retry: conn_flag(c, conn, lowat_act_flag); } -eintr: - written = splice(conn->pipe[fromsidei][0], NULL, - conn->s[!fromsidei], NULL, c->tcp.pipe_size, - SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + do + written = splice(conn->pipe[fromsidei][0], NULL, + conn->s[!fromsidei], NULL, + c->tcp.pipe_size, + SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + while (written < 0 && errno == EINTR); + + if (written < 0 && errno != EAGAIN) + goto close; + flow_trace(conn, "%zi from write-side call (passed %zi)", written, c->tcp.pipe_size); @@ -578,12 +585,6 @@ eintr: conn->written[fromsidei] += written > 0 ? 
written : 0; if (written < 0) { - if (errno == EINTR) - goto eintr; - - if (errno != EAGAIN) - goto close; - if (conn->read[fromsidei] == conn->written[fromsidei]) break; @@ -35,7 +35,7 @@ #include "vu_common.h" #include <time.h> -static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1]; +static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM]; static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; static int head[VIRTQUEUE_MAX_SIZE + 1]; @@ -43,7 +43,7 @@ static int head[VIRTQUEUE_MAX_SIZE + 1]; * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP) * @v6: Set for IPv6 packet * - * Return: Return the size of the header + * Return: return the size of the header */ static size_t tcp_vu_hdrlen(bool v6) { @@ -171,21 +171,23 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) /** tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers * @c: Execution context + * @vq: virtqueue to use to receive data * @conn: Connection pointer * @v6: Set for IPv6 connections * @already_sent: Number of bytes already sent * @fillsize: Maximum bytes to fill in guest-side receiving window * @iov_cnt: number of iov (output) + * @head_cnt: Pointer to store the count of head iov entries (output) * - * Return: Number of iov entries used to store the data or negative error code + * Return: number of bytes received from the socket, or a negative error code + * on failure. 
*/ -static ssize_t tcp_vu_sock_recv(const struct ctx *c, +static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, const struct tcp_tap_conn *conn, bool v6, uint32_t already_sent, size_t fillsize, int *iov_cnt, int *head_cnt) { - struct vu_dev *vdev = c->vdev; - struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + const struct vu_dev *vdev = c->vdev; struct msghdr mh_sock = { 0 }; uint16_t mss = MSS_GET(conn); int s = conn->sock; @@ -198,7 +200,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, hdrlen = tcp_vu_hdrlen(v6); - vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE); + vu_init_elem(elem, &iov_vu[DISCARD_IOV_NUM], VIRTQUEUE_MAX_SIZE); elem_cnt = 0; *head_cnt = 0; @@ -226,16 +228,9 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, elem_cnt += cnt; } - if (peek_offset_cap) { - mh_sock.msg_iov = iov_vu + 1; - mh_sock.msg_iovlen = elem_cnt; - } else { - iov_vu[0].iov_base = tcp_buf_discard; - iov_vu[0].iov_len = already_sent; - - mh_sock.msg_iov = iov_vu; - mh_sock.msg_iovlen = elem_cnt + 1; - } + if (tcp_prepare_iov(&mh_sock, iov_vu, already_sent, elem_cnt)) + /* Expect caller to do a TCP reset */ + return -1; do ret = recvmsg(s, &mh_sock, MSG_PEEK); @@ -349,7 +344,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, * @c: Execution context * @conn: Connection pointer * - * Return: Negative on connection reset, 0 otherwise + * Return: negative on connection reset, 0 otherwise */ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { @@ -376,7 +371,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; - if (tcp_set_peek_offset(conn->sock, 0)) { + if (tcp_set_peek_offset(conn, 0)) { tcp_rst(c, conn); return -1; } @@ -396,7 +391,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) /* collect the buffers from vhost-user and fill them 
with the * data from the socket */ - len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, + len = tcp_vu_sock_recv(c, vq, conn, v6, already_sent, fillsize, &iov_cnt, &head_cnt); if (len < 0) { if (len != -EAGAIN && len != -EWOULDBLOCK) { diff --git a/test/.gitignore b/test/.gitignore index 3573444..9412f0d 100644 --- a/test/.gitignore +++ b/test/.gitignore @@ -11,3 +11,5 @@ nstool rampstream guest-key guest-key.pub +/exeter/ +*.bats diff --git a/test/Makefile b/test/Makefile index bf63db8..4938827 100644 --- a/test/Makefile +++ b/test/Makefile @@ -5,6 +5,8 @@ # Copyright Red Hat # Author: David Gibson <david@gibson.dropbear.id.au> +BATS = bats -j $(shell nproc) +EXETOOL = exeter/exetool/exetool WGET = wget -c DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \ @@ -13,7 +15,7 @@ DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \ debian-10-generic-ppc64el-20220911-1135.qcow2 \ debian-11-nocloud-amd64.qcow2 \ debian-11-generic-arm64.qcow2 \ - debian-11-generic-ppc64el.qcow2 \ + debian-11-generic-ppc64el-20250703-2162.qcow2 \ debian-sid-nocloud-amd64-daily.qcow2 \ debian-sid-nocloud-arm64-daily.qcow2 \ debian-sid-nocloud-ppc64el-daily.qcow2 @@ -50,18 +52,24 @@ UBUNTU_NEW_IMGS = xenial-server-cloudimg-powerpc-disk1.img \ jammy-server-cloudimg-s390x.img UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS) -DOWNLOAD_ASSETS = mbuto podman \ +DOWNLOAD_ASSETS = exeter mbuto podman \ $(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS) TESTDATA_ASSETS = small.bin big.bin medium.bin \ rampstream LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \ $(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \ $(UBUNTU_NEW_IMGS:%=prepared-%) \ - nstool guest-key guest-key.pub \ + nstool guest-key guest-key.pub $(EXETOOL) \ $(TESTDATA_ASSETS) ASSETS = $(DOWNLOAD_ASSETS) $(LOCAL_ASSETS) +EXETER_PYPATH = exeter/py3 +EXETER_BATS = smoke/smoke.sh.bats \ + build/build.py.bats build/static_checkers.sh.bats +BATS_FILES = $(EXETER_BATS) \ + 
podman/test/system/505-networking-pasta.bats + CFLAGS = -Wall -Werror -Wextra -pedantic -std=c99 assets: $(ASSETS) @@ -70,6 +78,11 @@ assets: $(ASSETS) pull-%: % git -C $* pull +exeter: + git clone https://gitlab.com/dgibson/exeter.git + +exeter/exetool/exetool: pull-exeter + mbuto: git clone git://mbuto.sh/mbuto @@ -115,6 +128,12 @@ medium.bin: big.bin: dd if=/dev/urandom bs=1M count=10 of=$@ +$(EXETER_BATS): %.bats: % $(EXETOOL) + PYTHONPATH=$(EXETER_PYPATH) $(EXETOOL) bats -- $< > $@ + +bats: $(BATS_FILES) pull-podman + PYTHONPATH=$(EXETER_PYPATH) CONTAINERS_HELPER_BINARY_DIR=.. $(BATS) $(BATS_FILES) + check: assets ./run @@ -124,6 +143,7 @@ debug: assets clean: rm -f perf.js *~ rm -f $(LOCAL_ASSETS) + rm -f $(EXETER_BATS) rm -rf test_logs rm -f prepared-*.qcow2 prepared-*.img @@ -149,6 +169,9 @@ debian-11-nocloud-%.qcow2: debian-11-generic-%.qcow2: $(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-generic-$*.qcow2 +debian-11-generic-ppc64el-20250703-2162.qcow2: + $(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/20250703-2162/debian-11-generic-ppc64el-20250703-2162.qcow2 + debian-sid-nocloud-%-daily.qcow2: $(WGET) -O $@ https://cloud.debian.org/images/cloud/sid/daily/latest/debian-sid-nocloud-$*-daily.qcow2 diff --git a/test/build/all b/test/build/all deleted file mode 100644 index 1f79e0d..0000000 --- a/test/build/all +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-or-later -# -# PASST - Plug A Simple Socket Transport -# for qemu/UNIX domain socket mode -# -# PASTA - Pack A Subtle Tap Abstraction -# for network namespace/tap device mode -# -# test/build/all - Build targets, one by one, then all together, check output -# -# Copyright (c) 2021 Red Hat GmbH -# Author: Stefano Brivio <sbrivio@redhat.com> - -htools make cc rm uname getconf mkdir cp rm man - -test Build passt -host make clean -check ! 
[ -e passt ] -host CFLAGS="-Werror" make passt -check [ -f passt ] - -test Build pasta -host make clean -check ! [ -e pasta ] -host CFLAGS="-Werror" make pasta -check [ -h pasta ] - -test Build qrap -host make clean -check ! [ -e qrap ] -host CFLAGS="-Werror" make qrap -check [ -f qrap ] - -test Build all -host make clean -check ! [ -e passt ] -check ! [ -e pasta ] -check ! [ -e qrap ] -host CFLAGS="-Werror" make -check [ -f passt ] -check [ -h pasta ] -check [ -f qrap ] - -test Install -host mkdir __STATEDIR__/prefix -host prefix=__STATEDIR__/prefix make install -check [ -f __STATEDIR__/prefix/bin/passt ] -check [ -h __STATEDIR__/prefix/bin/pasta ] -check [ -f __STATEDIR__/prefix/bin/qrap ] -check man -M __STATEDIR__/prefix/share/man -W passt -check man -M __STATEDIR__/prefix/share/man -W pasta -check man -M __STATEDIR__/prefix/share/man -W qrap - -test Uninstall -host prefix=__STATEDIR__/prefix make uninstall -check ! [ -f __STATEDIR__/prefix/bin/passt ] -check ! [ -h __STATEDIR__/prefix/bin/pasta ] -check ! [ -f __STATEDIR__/prefix/bin/qrap ] -check ! man -M __STATEDIR__/prefix/share/man -W passt 2>/dev/null -check ! man -M __STATEDIR__/prefix/share/man -W pasta 2>/dev/null -check ! man -M __STATEDIR__/prefix/share/man -W qrap 2>/dev/null diff --git a/test/build/build.py b/test/build/build.py new file mode 100755 index 0000000..e49287c --- /dev/null +++ b/test/build/build.py @@ -0,0 +1,109 @@ +#! 
/usr/bin/env python3 +# +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/build/build.py - Test build and install targets +# +# Copyright Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> + +import contextlib +import os +from pathlib import Path +import subprocess +import tempfile +from typing import Iterable, Iterator + +import exeter + +def sh(cmd): + """Run given command in a shell""" + subprocess.run(cmd, shell=True) + + +@contextlib.contextmanager +def clone_sources() -> Iterator[str]: + """Create a temporary copy of the passt sources. + + When the context enters create a temporary directory and copy the + passt sources into it. Clean it up when the context exits. + """ + + os.chdir('..') # Move from test/ to repo base + with tempfile.TemporaryDirectory(ignore_cleanup_errors=False) as tmpdir: + sh(f"cp --parents -d $(git ls-files) {tmpdir}") + os.chdir(tmpdir) + yield tmpdir + + +def test_make(target: str, expected_files: list[str]) -> None: + """Test `make {target}` + + Arguments: + target -- make target to invoke + expected_files -- files make is expected to create + + Verifies that + 1) `make target` completes successfully + 2) expected_files care created by `make target` + 3) expected_files are removed by `make clean` + """ + + ex_paths = [Path(f) for f in expected_files] + with clone_sources(): + for p in ex_paths: + assert not p.exists(), f"{p} existed before make" + sh(f'make {target} CFLAGS="-Werror"') + for p in ex_paths: + assert p.exists(), f"{p} wasn't made" + sh('make clean') + for p in ex_paths: + assert not p.exists(), f"{p} existed after make clean" + + +exeter.register('make_passt', test_make, 'passt', ['passt']) +exeter.register('make_pasta', test_make, 'pasta', ['pasta']) +exeter.register('make_qrap', test_make, 'qrap', ['qrap']) +exeter.register('make_all', 
test_make, 'all', ['passt', 'pasta', 'qrap']) + + +@exeter.test +def test_install_uninstall() -> None: + """Test `make install` and `make uninstall` + + Tests that `make install` installs the expected files to the + install prefix, and that `make uninstall` removes them again. + """ + + with clone_sources(): + with tempfile.TemporaryDirectory(ignore_cleanup_errors=False) \ + as prefix: + bindir = Path(prefix) / 'bin' + mandir = Path(prefix) / 'share/man' + progs = ['passt', 'pasta', 'qrap'] + + # Install + sh(f'make install CFLAGS="-Werror" prefix={prefix}') + + for prog in progs: + exe = bindir / prog + assert exe.is_file(), f"{exe} does not exist as a regular file" + sh(f'man -M {mandir} -W {prog}') + + # Uninstall + sh(f'make uninstall prefix={prefix}') + + for prog in progs: + exe = bindir / prog + assert not exe.exists(), f"{exe} exists after uninstall" + sh(f'! man -M {mandir} -W {prog}') + + +if __name__ == '__main__': + exeter.main() diff --git a/test/build/clang_tidy b/test/build/clang_tidy deleted file mode 100644 index 40573bf..0000000 --- a/test/build/clang_tidy +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-or-later -# -# PASST - Plug A Simple Socket Transport -# for qemu/UNIX domain socket mode -# -# PASTA - Pack A Subtle Tap Abstraction -# for network namespace/tap device mode -# -# test/build/clang_tidy - Run source through clang-tidy(1) linter -# -# Copyright (c) 2021 Red Hat GmbH -# Author: Stefano Brivio <sbrivio@redhat.com> - -htools clang-tidy - -test Run clang-tidy -host make clang-tidy diff --git a/test/build/cppcheck b/test/build/cppcheck deleted file mode 100644 index 0e1dbce..0000000 --- a/test/build/cppcheck +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-or-later -# -# PASST - Plug A Simple Socket Transport -# for qemu/UNIX domain socket mode -# -# PASTA - Pack A Subtle Tap Abstraction -# for network namespace/tap device mode -# -# test/build/cppcheck - Run source through cppcheck(1) linter -# -# 
Copyright (c) 2021 Red Hat GmbH -# Author: Stefano Brivio <sbrivio@redhat.com> - -htools cppcheck - -test Run cppcheck -host make cppcheck diff --git a/test/build/static_checkers.sh b/test/build/static_checkers.sh new file mode 100755 index 0000000..42806e7 --- /dev/null +++ b/test/build/static_checkers.sh @@ -0,0 +1,26 @@ +#! /bin/sh +# +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/build/static_checkers.sh - Run static checkers +# +# Copyright Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> + +. $(dirname $0)/../exeter/sh/exeter.sh + +exeter_register cppcheck make -C .. cppcheck +exeter_set_description cppcheck "passt sources pass cppcheck" + +exeter_register clang_tidy make -C .. clang-tidy +exeter_set_description clang_tidy "passt sources pass clang-tidy" + +exeter_main "$@" + + diff --git a/test/lib/exeter b/test/lib/exeter new file mode 100644 index 0000000..3b19bea --- /dev/null +++ b/test/lib/exeter @@ -0,0 +1,58 @@ +#!/bin/sh +# +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/lib/exeter - Run exeter tests within the rest of passt's tests +# +# Copyright Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> + +EXETOOL="$BASEPATH/exeter/exetool/exetool" + +# is_exeter() - Determine if a test file is an exeter program +# $@: Command line to invoke test program +is_exeter() { + $EXETOOL probe -- "$@" +} + +# exeter() - Run each test in an exeter program, logging each test separately +# $@: Command line to invoke exeter test program +exeter() { + STATESETUP="${STATEBASE}/$1" + mkdir -p "${STATESETUP}" + + context_setup_host host + layout_host + + cd test + + __ntests=$($EXETOOL list -- "$@" | wc 
-l) + if [ $? != 0 ]; then + info "Failed to get exeter manifest for $@" + pause_continue \ + "Press any key to pause test session" \ + "Resuming in " \ + "Paused, press any key to continue" \ + 5 + return + fi + + status_file_start "$* (exeter)" ${__ntests} + [ ${CI} -eq 1 ] && video_link "${1}" + + for __testid in $($EXETOOL list -- "$@"); do + __desc="$($EXETOOL desc -- "$@" -- ${__testid})" + status_test_start "${__desc}" + context_run host "$@" "${__testid}" && status_test_ok || status_test_fail + done + + cd .. + + teardown_context_watch ${PANE_HOST} host +} diff --git a/test/lib/setup b/test/lib/setup index 575bc21..5994598 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -350,7 +350,7 @@ setup_migrate() { sleep 1 - __opts="--vhost-user" + __opts="--vhost-user --migrate-exit --migrate-no-linger" [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" @@ -360,7 +360,7 @@ setup_migrate() { context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair" - __opts="--vhost-user" + __opts="--vhost-user --migrate-exit --migrate-no-linger" [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" diff --git a/test/lib/term b/test/lib/term index ed690de..089364c 100755 --- a/test/lib/term +++ b/test/lib/term @@ -19,6 +19,7 @@ STATUS_FILE_INDEX=0 STATUS_COLS= STATUS_PASS=0 STATUS_FAIL=0 +STATUS_SKIPPED=0 PR_RED='\033[1;31m' PR_GREEN='\033[1;32m' @@ -439,19 +440,21 @@ info_layout() { # status_test_ok() - Update counter of passed tests, log and display message status_test_ok() { STATUS_PASS=$((STATUS_PASS + 1)) - tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)" + tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)" info_passed } 
# status_test_fail() - Update counter of failed tests, log and display message status_test_fail() { STATUS_FAIL=$((STATUS_FAIL + 1)) - tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)" + tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)" info_failed } # status_test_fail() - Update counter of failed tests, log and display message status_test_skip() { + STATUS_SKIPPED=$((STATUS_SKIPPED + 1)) + tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)" info_skipped } diff --git a/test/lib/test b/test/lib/test index 758250a..7349674 100755 --- a/test/lib/test +++ b/test/lib/test @@ -20,10 +20,7 @@ test_iperf3s() { __sctx="${1}" __port="${2}" - pane_or_context_run_bg "${__sctx}" \ - 'iperf3 -s -p'${__port}' & echo $! > s.pid' \ - - sleep 1 # Wait for server to be ready + pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid' } # test_iperf3k() - Kill iperf3 server @@ -31,7 +28,7 @@ test_iperf3s() { test_iperf3k() { __sctx="${1}" - pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid' + pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)' sleep 1 # Wait for kernel to free up ports } diff --git a/test/passt.mbuto b/test/passt.mbuto index 5e00132..176cf3f 100755 --- a/test/passt.mbuto +++ b/test/passt.mbuto @@ -28,7 +28,10 @@ KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}" LINKS="${LINKS:- ash,dash,bash /init - ash,dash,bash /bin/sh}" + ash,dash,bash /bin/sh + sshd /usr/sbin/sshd + dhclient /usr/sbin/dhclient + sysctl /usr/sbin/sysctl}" DIRS="${DIRS} /tmp /usr/sbin /usr/share /var/log /var/lib /etc/ssh /run/sshd /root/.ssh" diff --git a/test/pasta_options/log_to_file b/test/pasta_options/log_to_file index 3ead06c..db78b04 100644 --- a/test/pasta_options/log_to_file +++ b/test/pasta_options/log_to_file @@ -30,19 +30,19 @@ endef test Log 
creation -set PORTS -t 10001,10002 -u 10001,10002 +set PORTS -t 10001,10002 -u 10001,10002 -T none -U none set LOG_FILE __STATEDIR__/pasta.log -passt ./pasta -l __LOG_FILE__ -- /bin/true +passt ./pasta __PORTS__ -l __LOG_FILE__ -- /bin/true check [ -s __LOG_FILE__ ] test Log truncated on creation -passt ./pasta -l __LOG_FILE__ -- /bin/true & wait +passt ./pasta __PORTS__ -l __LOG_FILE__ -- /bin/true & wait pout PID2 echo $! check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$' test Maximum log size -passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done' +passtb ./pasta __PORTS__ --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done' sleep 1 flood_log_client @@ -67,7 +67,7 @@ passt unshare -rUm passt mkdir __STATEDIR__/t passt mount -t tmpfs none __STATEDIR__/t set LOG_FILE __STATEDIR__/t/log -passt ./pasta --config-net -d -l __LOG_FILE__ --log-size $((100 * 1024)) +passt ./pasta __PORTS__ --config-net -d -l __LOG_FILE__ --log-size $((100 * 1024)) flood_log_server flood_log_client @@ -43,6 +43,9 @@ KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"} COMMIT="$(git log --oneline --no-decorate -1)" +# Let exeter tests written in Python find their modules +export PYTHONPATH=${BASEPATH}/exeter/py3 + . lib/util . lib/context . lib/setup @@ -53,6 +56,7 @@ COMMIT="$(git log --oneline --no-decorate -1)" . lib/layout_ugly . lib/test . lib/video +. 
lib/exeter # cleanup() - Remove temporary files cleanup() { @@ -67,11 +71,9 @@ run() { perf_init [ ${CI} -eq 1 ] && video_start ci - setup build - test build/all - test build/cppcheck - test build/clang_tidy - teardown build + exeter smoke/smoke.sh + exeter build/build.py + exeter build/static_checkers.sh setup pasta test pasta/ndp @@ -202,7 +204,7 @@ skip_distro() { perf_finish [ ${CI} -eq 1 ] && video_stop - log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}" + log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}" pause_continue \ "Press any key to keep test session open" \ @@ -223,6 +225,10 @@ run_selected() { __setup= for __test; do + if is_exeter "test/${__test}"; then + exeter "${__test}" + continue + fi # HACK: the migrate tests need the setup repeated for # each test if [ "${__test%%/*}" != "${__setup}" -o \ @@ -234,9 +240,9 @@ run_selected() { test "${__test}" done - teardown "${__setup}" + [ -n "${__setup}" ] && teardown "${__setup}" - log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}" + log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}" pause_continue \ "Press any key to keep test session open" \ @@ -307,4 +313,4 @@ fi tail -n1 ${LOGFILE} echo "Log at ${LOGFILE}" -exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\)$/\1/p') +exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\),.*$/\1/p') diff --git a/test/smoke/smoke.sh b/test/smoke/smoke.sh new file mode 100755 index 0000000..a642fb9 --- /dev/null +++ b/test/smoke/smoke.sh @@ -0,0 +1,33 @@ +#! /bin/sh +# +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/smoke/smoke.sh - Basic smoke tests +# +# Copyright Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> + +. 
$(dirname $0)/../exeter/sh/exeter.sh + +PASST=$(dirname $0)/../../passt +PASTA=$(dirname $0)/../../pasta + +exeter_register passt_version $PASST --version +exeter_set_description passt_version "Check passt --version works" + +exeter_register pasta_version $PASTA --version +exeter_set_description pasta_version "Check pasta --version works" + +exeter_register passt_help $PASST --help +exeter_set_description passt_help "Check passt --help works" + +exeter_register pasta_help $PASTA --help +exeter_set_description pasta_help "Check pasta --help works" + +exeter_main "$@" @@ -39,27 +39,30 @@ * could receive packets from multiple flows, so we use a hash table match to * find the specific flow for a datagram. * - * When a UDP flow is initiated from a listening socket we take a duplicate of - * the socket and store it in uflow->s[INISIDE]. This will last for the - * lifetime of the flow, even if the original listening socket is closed due to - * port auto-probing. The duplicate is used to deliver replies back to the - * originating side. + * Flow sockets + * ============ * - * Reply sockets - * ============= - * - * When a UDP flow targets a socket, we create a "reply" socket in + * When a UDP flow targets a socket, we create a "flow" socket in * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive * replies on the target side. This socket is both bound and connected and has - * EPOLL_TYPE_UDP_REPLY. The connect() means it will only receive datagrams + * EPOLL_TYPE_UDP. The connect() means it will only receive datagrams * associated with this flow, so the epoll reference directly points to the flow * and we don't need a hash lookup. * - * NOTE: it's possible that the reply socket could have a bound address - * overlapping with an unrelated listening socket. We assume datagrams for the - * flow will come to the reply socket in preference to a listening socket. 
The - * sample program doc/platform-requirements/reuseaddr-priority.c documents and - * tests that assumption. + * When a flow is initiated from a listening socket, we create a "flow" socket + * with the same bound address as the listening socket, but also connect()ed to + * the flow's peer. This is stored in uflow->s[INISIDE] and will last for the + * lifetime of the flow, even if the original listening socket is closed due to + * port auto-probing. The duplicate is used to deliver replies back to the + * originating side. + * + * NOTE: A flow socket can have a bound address overlapping with a listening + * socket. That will happen naturally for flows initiated from a socket, but is + * also possible (though unlikely) for tap initiated flows, depending on the + * source port. We assume datagrams for the flow will come to a connect()ed + * socket in preference to a listening socket. The sample program + * doc/platform-requirements/reuseaddr-priority.c documents and tests that + * assumption. * * "Spliced" flows * =============== @@ -71,8 +74,7 @@ * actually used; it doesn't make sense for datagrams and instead a pair of * recvmmsg() and sendmmsg() is used to forward the datagrams. * - * Note that a spliced flow will have *both* a duplicated listening socket and a - * reply socket (see above). + * Note that a spliced flow will have two flow sockets (see above). 
*/ #include <sched.h> @@ -114,6 +116,8 @@ #include "udp_internal.h" #include "udp_vu.h" +#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ + /* Maximum UDP data to be returned in ICMP messages */ #define ICMP4_MAX_DLEN 8 #define ICMP6_MAX_DLEN (IPV6_MIN_MTU \ @@ -136,26 +140,31 @@ static struct ethhdr udp4_eth_hdr; static struct ethhdr udp6_eth_hdr; /** - * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets + * struct udp_meta_t - Pre-cooked headers for UDP packets * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) * @taph: Tap backend specific header - * @s_in: Source socket address, filled in by recvmmsg() - * @tosidx: sidx for the destination side of this datagram's flow */ static struct udp_meta_t { struct ipv6hdr ip6h; struct iphdr ip4h; struct tap_hdr taph; - - union sockaddr_inany s_in; - flow_sidx_t tosidx; } #ifdef __AVX2__ __attribute__ ((aligned(32))) #endif udp_meta[UDP_MAX_FRAMES]; +#define PKTINFO_SPACE \ + MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \ + CMSG_SPACE(sizeof(struct in6_pktinfo))) + +#define RECVERR_SPACE \ + MAX(CMSG_SPACE(sizeof(struct sock_extended_err) + \ + sizeof(struct sockaddr_in)), \ + CMSG_SPACE(sizeof(struct sock_extended_err) + \ + sizeof(struct sockaddr_in6))) + /** * enum udp_iov_idx - Indices for the buffers making up a single UDP frame * @UDP_IOV_TAP tap specific header @@ -232,8 +241,6 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); tiov[UDP_IOV_PAYLOAD].iov_base = payload; - mh->msg_name = &meta->s_in; - mh->msg_namelen = sizeof(meta->s_in); mh->msg_iov = siov; mh->msg_iovlen = 1; } @@ -254,41 +261,6 @@ static void udp_iov_init(const struct ctx *c) } /** - * udp_splice_prepare() - Prepare one datagram for splicing - * @mmh: Receiving mmsghdr array - * @idx: Index of the datagram to prepare - */ -static void udp_splice_prepare(struct mmsghdr 
*mmh, unsigned idx) -{ - udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len; -} - -/** - * udp_splice_send() - Send a batch of datagrams from socket to socket - * @c: Execution context - * @start: Index of batch's first datagram in udp[46]_l2_buf - * @n: Number of datagrams in batch - * @src: Source port for datagram (target side) - * @dst: Destination port for datagrams (target side) - * @ref: epoll reference for origin socket - * @now: Timestamp - */ -static void udp_splice_send(const struct ctx *c, size_t start, size_t n, - flow_sidx_t tosidx) -{ - const struct flowside *toside = flowside_at_sidx(tosidx); - const struct udp_flow *uflow = udp_at_sidx(tosidx); - uint8_t topif = pif_at_sidx(tosidx); - int s = uflow->s[tosidx.sidei]; - socklen_t sl; - - pif_sockaddr(c, &udp_splice_to, &sl, topif, - &toside->eaddr, toside->eport); - - sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL); -} - -/** * udp_update_hdr4() - Update headers for one IPv4 datagram * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) * @bp: Pointer to udp_payload_t to update @@ -411,7 +383,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, } /** - * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer + * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer * @c: Execution context * @ee: Extended error descriptor * @toside: Destination side of flow @@ -419,11 +391,11 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, * @in: First bytes (max 8) of original UDP message body * @dlen: Length of the read part of original UDP message body */ -static void udp_send_conn_fail_icmp4(const struct ctx *c, - const struct sock_extended_err *ee, - const struct flowside *toside, - struct in_addr saddr, - const void *in, size_t dlen) +static void udp_send_tap_icmp4(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + struct in_addr saddr, + const void *in, size_t dlen) { struct in_addr oaddr = 
toside->oaddr.v4mapped.a4; struct in_addr eaddr = toside->eaddr.v4mapped.a4; @@ -455,7 +427,7 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c, /** - * udp_send_conn_fail_icmp6() - Construct and send ICMPv6 to local peer + * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer * @c: Execution context * @ee: Extended error descriptor * @toside: Destination side of flow @@ -464,11 +436,11 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c, * @dlen: Length of the read part of original UDP message body * @flow: IPv6 flow identifier */ -static void udp_send_conn_fail_icmp6(const struct ctx *c, - const struct sock_extended_err *ee, - const struct flowside *toside, - const struct in6_addr *saddr, - void *in, size_t dlen, uint32_t flow) +static void udp_send_tap_icmp6(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + const struct in6_addr *saddr, + void *in, size_t dlen, uint32_t flow) { const struct in6_addr *oaddr = &toside->oaddr.a6; const struct in6_addr *eaddr = &toside->eaddr.a6; @@ -499,35 +471,83 @@ static void udp_send_conn_fail_icmp6(const struct ctx *c, } /** + * udp_pktinfo() - Retrieve packet destination address from cmsg + * @msg: msghdr into which message has been received + * @dst: (Local) destination address of message in @msg (output) + * + * Return: 0 on success, -1 if the information was missing (@dst is set to + * inany_any6). 
+ */ +static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst) +{ + struct cmsghdr *hdr; + + for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) { + if (hdr->cmsg_level == IPPROTO_IP && + hdr->cmsg_type == IP_PKTINFO) { + const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr); + + *dst = inany_from_v4(i4->ipi_addr); + return 0; + } + + if (hdr->cmsg_level == IPPROTO_IPV6 && + hdr->cmsg_type == IPV6_PKTINFO) { + const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr); + + dst->a6 = i6->ipi6_addr; + return 0; + } + } + + debug("Missing PKTINFO cmsg on datagram"); + *dst = inany_any6; + return -1; +} + +/** * udp_sock_recverr() - Receive and clear an error from a socket * @c: Execution context - * @ref: epoll reference + * @s: Socket to receive errors from + * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown + * @pif: Interface on which the error occurred + * (only used if @sidx == FLOW_SIDX_NONE) + * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) * * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 * if there was an error reading the queue * * #syscalls recvmsg */ -static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) +static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, + uint8_t pif, in_port_t port) { + char buf[PKTINFO_SPACE + RECVERR_SPACE]; const struct sock_extended_err *ee; - const struct cmsghdr *hdr; - union sockaddr_inany saddr; - char buf[CMSG_SPACE(sizeof(*ee))]; char data[ICMP6_MAX_DLEN]; - int s = ref.fd; + struct cmsghdr *hdr; struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) }; + union sockaddr_inany src; struct msghdr mh = { - .msg_name = &saddr, - .msg_namelen = sizeof(saddr), + .msg_name = &src, + .msg_namelen = sizeof(src), .msg_iov = &iov, .msg_iovlen = 1, .msg_control = buf, .msg_controllen = sizeof(buf), }; + const struct flowside *fromside, *toside; + union inany_addr offender, otap; + char astr[INANY_ADDRSTRLEN]; + 
char sastr[SOCKADDR_STRLEN]; + const struct in_addr *o4; + in_port_t offender_port; + struct udp_flow *uflow; + uint8_t topif; + size_t dlen; ssize_t rc; rc = recvmsg(s, &mh, MSG_ERRQUEUE); @@ -544,61 +564,111 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) return -1; } - hdr = CMSG_FIRSTHDR(&mh); - if (!((hdr->cmsg_level == IPPROTO_IP && - hdr->cmsg_type == IP_RECVERR) || - (hdr->cmsg_level == IPPROTO_IPV6 && - hdr->cmsg_type == IPV6_RECVERR))) { - err("Unexpected cmsg reading error queue"); + for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) { + if ((hdr->cmsg_level == IPPROTO_IP && + hdr->cmsg_type == IP_RECVERR) || + (hdr->cmsg_level == IPPROTO_IPV6 && + hdr->cmsg_type == IPV6_RECVERR)) + break; + } + + if (!hdr) { + err("Missing RECVERR cmsg in error queue"); return -1; } ee = (const struct sock_extended_err *)CMSG_DATA(hdr); - if (ref.type == EPOLL_TYPE_UDP_REPLY) { - flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); - const struct flowside *toside = flowside_at_sidx(sidx); - size_t dlen = rc; - - if (hdr->cmsg_level == IPPROTO_IP) { - dlen = MIN(dlen, ICMP4_MAX_DLEN); - udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr, - data, dlen); - } else if (hdr->cmsg_level == IPPROTO_IPV6) { - udp_send_conn_fail_icmp6(c, ee, toside, - &saddr.sa6.sin6_addr, - data, dlen, sidx.flowi); + + debug("%s error on UDP socket %i: %s", + str_ee_origin(ee), s, strerror_(ee->ee_errno)); + + if (!flow_sidx_valid(sidx)) { + /* No hint from the socket, determine flow from addresses */ + union inany_addr dst; + + if (udp_pktinfo(&mh, &dst) < 0) { + debug("Missing PKTINFO on UDP error"); + return 1; + } + + sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port); + if (!flow_sidx_valid(sidx)) { + debug("Ignoring UDP error without flow"); + return 1; } } else { - trace("Ignoring received IP_RECVERR cmsg on listener socket"); + pif = pif_at_sidx(sidx); } - debug("%s error on UDP socket %i: %s", - str_ee_origin(ee), s, 
strerror_(ee->ee_errno)); + uflow = udp_at_sidx(sidx); + ASSERT(uflow); + fromside = &uflow->f.side[sidx.sidei]; + toside = &uflow->f.side[!sidx.sidei]; + topif = uflow->f.pif[!sidx.sidei]; + dlen = rc; + + if (inany_from_sockaddr(&offender, &offender_port, + SO_EE_OFFENDER(ee)) < 0) + goto fail; + + if (pif != PIF_HOST || topif != PIF_TAP) + /* XXX Can we support any other cases? */ + goto fail; + + /* If the offender *is* the endpoint, make sure our translation is + * consistent with the flow's translation. This matters if the flow + * endpoint has a port specific translation (like --dns-match). + */ + if (inany_equals(&offender, &fromside->eaddr)) + otap = toside->oaddr; + else if (!nat_inbound(c, &offender, &otap)) + goto fail; + + if (hdr->cmsg_level == IPPROTO_IP && + (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) { + dlen = MIN(dlen, ICMP4_MAX_DLEN); + udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen); + return 1; + } + + if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) { + udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen, + FLOW_IDX(uflow)); + return 1; + } + +fail: + flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s", + str_ee_origin(ee), + pif_name(pif), + sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)), + pif_name(topif), + inany_ntop(&toside->eaddr, astr, sizeof(astr))); return 1; } /** * udp_sock_errs() - Process errors on a socket * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap + * @s: Socket to receive errors from + * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown + * @pif: Interface on which the error occurred + * (only used if @sidx == FLOW_SIDX_NONE) + * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) * - * Return: Number of errors handled, or < 0 if we have an unrecoverable error + * Return: number of errors handled, or < 0 if we have an unrecoverable error */ -int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t 
events) +static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx, + uint8_t pif, in_port_t port) { unsigned n_err = 0; socklen_t errlen; - int s = ref.fd; int rc, err; ASSERT(!c->no_udp); - if (!(events & EPOLLERR)) - return 0; /* Nothing to do */ - /* Empty the error queue */ - while ((rc = udp_sock_recverr(c, ref)) > 0) + while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0) n_err += rc; if (rc < 0) @@ -626,36 +696,61 @@ int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events) } /** + * udp_peek_addr() - Get source address for next packet + * @s: Socket to get information from + * @src: Socket address (output) + * @dst: (Local) destination address (output) + * + * Return: 0 if no more packets, 1 on success, -ve error code on error + */ +static int udp_peek_addr(int s, union sockaddr_inany *src, + union inany_addr *dst) +{ + char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN]; + char cmsg[PKTINFO_SPACE]; + struct msghdr msg = { + .msg_name = src, + .msg_namelen = sizeof(*src), + .msg_control = cmsg, + .msg_controllen = sizeof(cmsg), + }; + int rc; + + rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); + if (rc < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + return -errno; + } + + udp_pktinfo(&msg, dst); + + trace("Peeked UDP datagram: %s -> %s", + sockaddr_ntop(src, sastr, sizeof(sastr)), + inany_ntop(dst, dstr, sizeof(dstr))); + + return 1; +} + +/** * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context * @s: Socket to receive from - * @events: epoll events bitmap - * @mmh mmsghdr array to receive into + * @mmh: mmsghdr array to receive into + * @n: Maximum number of datagrams to receive * - * Return: Number of datagrams received + * Return: number of datagrams received * * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64 */ -static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, - struct mmsghdr *mmh) +static int udp_sock_recv(const struct ctx *c, int s, 
struct mmsghdr *mmh, int n) { - /* For not entirely clear reasons (data locality?) pasta gets better - * throughput if we receive tap datagrams one at a atime. For small - * splice datagrams throughput is slightly better if we do batch, but - * it's slightly worse for large splice datagrams. Since we don't know - * before we receive whether we'll use tap or splice, always go one at a - * time for pasta mode. - */ - int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); - ASSERT(!c->no_udp); - if (!(events & EPOLLIN)) - return 0; - n = recvmmsg(s, mmh, n, 0, NULL); if (n < 0) { - err_perror("Error receiving datagrams"); + trace("Error receiving datagrams: %s", strerror_(errno)); + /* Bail out and let the EPOLLERR handler deal with it */ return 0; } @@ -663,78 +758,121 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, } /** - * udp_buf_listen_sock_handler() - Handle new data from socket + * udp_sock_to_sock() - Forward datagrams from socket to socket * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap - * @now: Current timestamp + * @from_s: Socket to receive datagrams from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward datagrams to * - * #syscalls recvmmsg + * #syscalls sendmmsg */ -static void udp_buf_listen_sock_handler(const struct ctx *c, - union epoll_ref ref, uint32_t events, - const struct timespec *now) +static void udp_sock_to_sock(const struct ctx *c, int from_s, int n, + flow_sidx_t tosidx) { - const socklen_t sasize = sizeof(udp_meta[0].s_in); - int n, i; + const struct flowside *toside = flowside_at_sidx(tosidx); + const struct udp_flow *uflow = udp_at_sidx(tosidx); + uint8_t topif = pif_at_sidx(tosidx); + int to_s = uflow->s[tosidx.sidei]; + socklen_t sl; + int i; - if (udp_sock_errs(c, ref, events) < 0) { - err("UDP: Unrecoverable error on listening socket:" - " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); - /* FIXME: what now? close/re-open socket? 
*/ + if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0) return; + + for (i = 0; i < n; i++) { + udp_mh_splice[i].msg_hdr.msg_iov->iov_len + = udp_mh_recv[i].msg_len; } - if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0) + pif_sockaddr(c, &udp_splice_to, &sl, topif, + &toside->eaddr, toside->eport); + + sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL); +} + +/** + * udp_buf_sock_to_tap() - Forward datagrams from socket to tap + * @c: Execution context + * @s: Socket to read data from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward data from @s to + */ +static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n, + flow_sidx_t tosidx) +{ + const struct flowside *toside = flowside_at_sidx(tosidx); + int i; + + if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0) return; - /* We divide datagrams into batches based on how we need to send them, - * determined by udp_meta[i].tosidx. To avoid either two passes through - * the array, or recalculating tosidx for a single entry, we have to - * populate it one entry *ahead* of the loop counter. 
- */ - udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now); - udp_mh_recv[0].msg_hdr.msg_namelen = sasize; - for (i = 0; i < n; ) { - flow_sidx_t batchsidx = udp_meta[i].tosidx; - uint8_t batchpif = pif_at_sidx(batchsidx); - int batchstart = i; - - do { - if (pif_is_socket(batchpif)) { - udp_splice_prepare(udp_mh_recv, i); - } else if (batchpif == PIF_TAP) { - udp_tap_prepare(udp_mh_recv, i, - flowside_at_sidx(batchsidx), - false); + for (i = 0; i < n; i++) + udp_tap_prepare(udp_mh_recv, i, toside, false); + + tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n); +} + +/** + * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket + * @c: Execution context + * @s: Socket to forward from + * @frompif: Interface to which @s belongs + * @port: Our (local) port number of @s + * @now: Current timestamp + */ +void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, + in_port_t port, const struct timespec *now) +{ + union sockaddr_inany src; + union inany_addr dst; + int rc; + + while ((rc = udp_peek_addr(s, &src, &dst)) != 0) { + bool discard = false; + flow_sidx_t tosidx; + uint8_t topif; + + if (rc < 0) { + trace("Error peeking at socket address: %s", + strerror_(-rc)); + /* Clear errors & carry on */ + if (udp_sock_errs(c, s, FLOW_SIDX_NONE, + frompif, port) < 0) { + err( +"UDP: Unrecoverable error on listening socket: (%s port %hu)", + pif_name(frompif), port); + /* FIXME: what now? close/re-open socket? 
*/ } + continue; + } + + tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now); + topif = pif_at_sidx(tosidx); - if (++i >= n) - break; - - udp_meta[i].tosidx = udp_flow_from_sock(c, ref, - &udp_meta[i].s_in, - now); - udp_mh_recv[i].msg_hdr.msg_namelen = sasize; - } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx)); - - if (pif_is_socket(batchpif)) { - udp_splice_send(c, batchstart, i - batchstart, - batchsidx); - } else if (batchpif == PIF_TAP) { - tap_send_frames(c, &udp_l2_iov[batchstart][0], - UDP_NUM_IOVS, i - batchstart); - } else if (flow_sidx_valid(batchsidx)) { - flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx); - struct udp_flow *uflow = udp_at_sidx(batchsidx); + if (pif_is_socket(topif)) { + udp_sock_to_sock(c, s, 1, tosidx); + } else if (topif == PIF_TAP) { + if (c->mode == MODE_VU) + udp_vu_sock_to_tap(c, s, 1, tosidx); + else + udp_buf_sock_to_tap(c, s, 1, tosidx); + } else if (flow_sidx_valid(tosidx)) { + struct udp_flow *uflow = udp_at_sidx(tosidx); flow_err(uflow, "No support for forwarding UDP from %s to %s", - pif_name(pif_at_sidx(fromsidx)), - pif_name(batchpif)); + pif_name(frompif), pif_name(topif)); + discard = true; } else { - debug("Discarding %d datagrams without flow", - i - batchstart); + debug("Discarding datagram without flow"); + discard = true; + } + + if (discard) { + struct msghdr msg = { 0 }; + + if (recvmsg(s, &msg, MSG_DONTWAIT) < 0) + debug_perror("Failed to discard datagram"); } } } @@ -750,87 +888,69 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - if (c->mode == MODE_VU) { - udp_vu_listen_sock_handler(c, ref, events, now); - return; - } - - udp_buf_listen_sock_handler(c, ref, events, now); + if (events & (EPOLLERR | EPOLLIN)) + udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now); } /** - * udp_buf_reply_sock_handler() - Handle new data from flow specific socket + * udp_sock_handler() - Handle new data from flow specific socket * @c: 
Execution context * @ref: epoll reference * @events: epoll events bitmap * @now: Current timestamp - * - * #syscalls recvmmsg */ -static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, - const struct timespec *now) +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) { - flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); - const struct flowside *toside = flowside_at_sidx(tosidx); struct udp_flow *uflow = udp_at_sidx(ref.flowside); - uint8_t topif = pif_at_sidx(tosidx); - int n, i, from_s; ASSERT(!c->no_udp && uflow); - from_s = uflow->s[ref.flowside.sidei]; - - if (udp_sock_errs(c, ref, events) < 0) { - flow_err(uflow, "Unrecoverable error on reply socket"); - flow_err_details(uflow); - udp_flow_close(c, uflow); - return; - } - - if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0) - return; - - flow_trace(uflow, "Received %d datagrams on reply socket", n); - uflow->ts = now->tv_sec; - - for (i = 0; i < n; i++) { - if (pif_is_socket(topif)) - udp_splice_prepare(udp_mh_recv, i); - else if (topif == PIF_TAP) - udp_tap_prepare(udp_mh_recv, i, toside, false); - /* Restore sockaddr length clobbered by recvmsg() */ - udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in); - } - - if (pif_is_socket(topif)) { - udp_splice_send(c, 0, n, tosidx); - } else if (topif == PIF_TAP) { - tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n); - } else { - uint8_t frompif = pif_at_sidx(ref.flowside); - - flow_err(uflow, "No support for forwarding UDP from %s to %s", - pif_name(frompif), pif_name(topif)); + if (events & EPOLLERR) { + if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) { + flow_err(uflow, "Unrecoverable error on flow socket"); + goto fail; + } } -} -/** - * udp_reply_sock_handler() - Handle new data from flow specific socket - * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap - * @now: Current timestamp - 
*/ -void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) -{ - if (c->mode == MODE_VU) { - udp_vu_reply_sock_handler(c, ref, events, now); - return; + if (events & EPOLLIN) { + /* For not entirely clear reasons (data locality?) pasta gets + * better throughput if we receive tap datagrams one at a + * time. For small splice datagrams throughput is slightly + * better if we do batch, but it's slightly worse for large + * splice datagrams. Since we don't know the size before we + * receive, always go one at a time for pasta mode. + */ + size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); + uint8_t topif = pif_at_sidx(tosidx); + int s = ref.fd; + + flow_trace(uflow, "Received data on reply socket"); + uflow->ts = now->tv_sec; + + if (pif_is_socket(topif)) { + udp_sock_to_sock(c, ref.fd, n, tosidx); + } else if (topif == PIF_TAP) { + if (c->mode == MODE_VU) { + udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES, + tosidx); + } else { + udp_buf_sock_to_tap(c, s, n, tosidx); + } + } else { + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(pif_at_sidx(ref.flowside)), + pif_name(topif)); + goto fail; + } } + return; - udp_buf_reply_sock_handler(c, ref, events, now); +fail: + flow_err_details(uflow); + udp_flow_close(c, uflow); } /** @@ -840,6 +960,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address + * @ttl: TTL or hop limit for packets to be sent in this call * @p: Pool of UDP packets, with UDP headers * @idx: Index of first packet to process * @now: Current timestamp @@ -850,15 +971,18 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, */ int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, - const struct pool *p, int idx, const struct 
timespec *now) + uint8_t ttl, const struct pool *p, int idx, + const struct timespec *now) { const struct flowside *toside; struct mmsghdr mm[UIO_MAXIOV]; union sockaddr_inany to_sa; struct iovec m[UIO_MAXIOV]; + struct udphdr uh_storage; const struct udphdr *uh; struct udp_flow *uflow; - int i, s, count = 0; + int i, j, s, count = 0; + struct iov_tail data; flow_sidx_t tosidx; in_port_t src, dst; uint8_t topif; @@ -866,7 +990,10 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, ASSERT(!c->no_udp); - uh = packet_get(p, idx, 0, sizeof(*uh), NULL); + if (!packet_get(p, idx, &data)) + return 1; + + uh = IOV_PEEK_HEADER(&data, uh_storage); if (!uh) return 1; @@ -898,28 +1025,34 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, } toside = flowside_at_sidx(tosidx); - s = udp_at_sidx(tosidx)->s[tosidx.sidei]; + s = uflow->s[tosidx.sidei]; ASSERT(s >= 0); pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport); - for (i = 0; i < (int)p->count - idx; i++) { - struct udphdr *uh_send; - size_t len; + for (i = 0, j = 0; i < (int)p->count - idx && j < UIO_MAXIOV; i++) { + const struct udphdr *uh_send; - uh_send = packet_get(p, idx + i, 0, sizeof(*uh), &len); + if (!packet_get(p, idx + i, &data)) + return p->count - idx; + + uh_send = IOV_REMOVE_HEADER(&data, uh_storage); if (!uh_send) return p->count - idx; mm[i].msg_hdr.msg_name = &to_sa; mm[i].msg_hdr.msg_namelen = sl; - if (len) { - m[i].iov_base = (char *)(uh_send + 1); - m[i].iov_len = len; + if (data.cnt) { + int cnt; + + cnt = iov_tail_clone(&m[j], UIO_MAXIOV - j, &data); + if (cnt < 0) + return p->count - idx; - mm[i].msg_hdr.msg_iov = m + i; - mm[i].msg_hdr.msg_iovlen = 1; + mm[i].msg_hdr.msg_iov = &m[j]; + mm[i].msg_hdr.msg_iovlen = cnt; + j += cnt; } else { mm[i].msg_hdr.msg_iov = NULL; mm[i].msg_hdr.msg_iovlen = 0; @@ -929,6 +1062,24 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, mm[i].msg_hdr.msg_controllen = 0; mm[i].msg_hdr.msg_flags = 0; + if (ttl != uflow->ttl[tosidx.sidei]) { 
+ uflow->ttl[tosidx.sidei] = ttl; + if (af == AF_INET) { + if (setsockopt(s, IPPROTO_IP, IP_TTL, + &ttl, sizeof(ttl)) < 0) + flow_perror(uflow, + "setsockopt IP_TTL"); + } else { + /* IPv6 hop_limit cannot be only 1 byte */ + int hop_limit = ttl; + + if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS, + &hop_limit, sizeof(hop_limit)) < 0) + flow_perror(uflow, + "setsockopt IPV6_UNICAST_HOPS"); + } + } + count++; } @@ -11,11 +11,12 @@ void udp_portmap_clear(void); void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); -void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now); +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, - const struct pool *p, int idx, const struct timespec *now); + uint8_t ttl, const struct pool *p, int idx, + const struct timespec *now); int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, const char *ifname, in_port_t port); int udp_init(struct ctx *c); @@ -9,10 +9,12 @@ #include <fcntl.h> #include <sys/uio.h> #include <unistd.h> +#include <netinet/udp.h> #include "util.h" #include "passt.h" #include "flow_table.h" +#include "udp_internal.h" #define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ @@ -34,123 +36,153 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx) return &flow->udp; } -/* +/** * udp_flow_close() - Close and clean up UDP flow * @c: Execution context * @uflow: UDP flow */ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) { + unsigned sidei; + if (uflow->closed) return; /* Nothing to do */ - if (uflow->s[INISIDE] >= 0) { - /* The listening socket needs to stay in epoll */ - close(uflow->s[INISIDE]); - uflow->s[INISIDE] = -1; + flow_foreach_sidei(sidei) { + flow_hash_remove(c, 
FLOW_SIDX(uflow, sidei)); + if (uflow->s[sidei] >= 0) { + epoll_del(c, uflow->s[sidei]); + close(uflow->s[sidei]); + uflow->s[sidei] = -1; + } + } + + uflow->closed = true; +} + +/** + * udp_flow_sock() - Create, bind and connect a flow specific UDP socket + * @c: Execution context + * @uflow: UDP flow to open socket for + * @sidei: Side of @uflow to open socket for + * + * Return: fd of new socket on success, -ve error code on failure + */ +static int udp_flow_sock(const struct ctx *c, + struct udp_flow *uflow, unsigned sidei) +{ + const struct flowside *side = &uflow->f.side[sidei]; + uint8_t pif = uflow->f.pif[sidei]; + union { + flow_sidx_t sidx; + uint32_t data; + } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; + int s; + + s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data); + if (s < 0) { + flow_dbg_perror(uflow, "Couldn't open flow specific socket"); + return s; } - if (uflow->s[TGTSIDE] >= 0) { - /* But the flow specific one needs to be removed */ - epoll_del(c, uflow->s[TGTSIDE]); - close(uflow->s[TGTSIDE]); - uflow->s[TGTSIDE] = -1; + if (flowside_connect(c, s, pif, side) < 0) { + int rc = -errno; + + epoll_del(c, s); + close(s); + + flow_dbg_perror(uflow, "Couldn't connect flow socket"); + return rc; } - flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); - if (!pif_is_socket(uflow->f.pif[TGTSIDE])) - flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE)); - uflow->closed = true; + /* It's possible, if unlikely, that we could receive some packets in + * between the bind() and connect() which may or may not be for this + * flow. Being UDP we could just discard them, but it's not ideal. + * + * There's also a tricky case if a bunch of datagrams for a new flow + * arrive in rapid succession, the first going to the original listening + * socket and later ones going to this new socket. If we forwarded the + * datagrams from the new socket immediately here they would go before + * the datagram which established the flow. 
Again, not strictly wrong + * for UDP, but not ideal. + * + * So, we flag that the new socket is in a transient state where it + * might have datagrams for a different flow queued. Before the next + * epoll cycle, udp_flow_defer() will flush out any such datagrams, and + * thereafter everything on the new socket should be strictly for this + * flow. + */ + if (sidei) + uflow->flush1 = true; + else + uflow->flush0 = true; + + return s; } /** * udp_flow_new() - Common setup for a new UDP flow * @c: Execution context * @flow: Initiated flow - * @s_ini: Initiating socket (or -1) * @now: Timestamp * - * Return: UDP specific flow, if successful, NULL on failure + * Return: sidx for the target side of the new UDP flow, or FLOW_SIDX_NONE + * on failure. + * + * #syscalls getsockname */ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, - int s_ini, const struct timespec *now) + const struct timespec *now) { struct udp_flow *uflow = NULL; const struct flowside *tgt; - uint8_t tgtpif; + unsigned sidei; if (!(tgt = flow_target(c, flow, IPPROTO_UDP))) goto cancel; - tgtpif = flow->f.pif[TGTSIDE]; uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); uflow->ts = now->tv_sec; uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1; + uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0; - if (s_ini >= 0) { - /* When using auto port-scanning the listening port could go - * away, so we need to duplicate the socket - */ - uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0); - if (uflow->s[INISIDE] < 0) { - flow_perror(uflow, - "Couldn't duplicate listening socket"); - goto cancel; - } + flow_foreach_sidei(sidei) { + if (pif_is_socket(uflow->f.pif[sidei])) + if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0) + goto cancel; } - if (pif_is_socket(tgtpif)) { - struct mmsghdr discard[UIO_MAXIOV] = { 0 }; - union { - flow_sidx_t sidx; - uint32_t data; - } fref = { - .sidx = FLOW_SIDX(flow, TGTSIDE), - }; - int rc; - - uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, - 
tgtpif, tgt, fref.data); - if (uflow->s[TGTSIDE] < 0) { - flow_dbg_perror(uflow, - "Couldn't open socket for spliced flow"); + if (uflow->s[TGTSIDE] >= 0 && inany_is_unspecified(&tgt->oaddr)) { + /* When we target a socket, we connect() it, but might not + * always bind(), leaving the kernel to pick our address. In + * that case connect() will implicitly bind() the socket, but we + * need to determine its local address so that we can match + * reply packets back to the correct flow. Update the flow with + * the information from getsockname() */ + union sockaddr_inany sa; + socklen_t sl = sizeof(sa); + in_port_t port; + + if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0 || + inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr, + &port, &sa) < 0) { + flow_perror(uflow, "Unable to determine local address"); goto cancel; } - - if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) { - flow_dbg_perror(uflow, "Couldn't connect flow socket"); - goto cancel; - } - - /* It's possible, if unlikely, that we could receive some - * unrelated packets in between the bind() and connect() of this - * socket. For now we just discard these. We could consider - * trying to redirect these to an appropriate handler, if we - * need to. - */ - rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard), - MSG_DONTWAIT, NULL); - if (rc >= ARRAY_SIZE(discard)) { - flow_dbg(uflow, - "Too many (%d) spurious reply datagrams", rc); + if (port != tgt->oport) { + flow_err(uflow, "Unexpected local port"); goto cancel; - } else if (rc > 0) { - flow_trace(uflow, - "Discarded %d spurious reply datagrams", rc); - } else if (errno != EAGAIN) { - flow_perror(uflow, - "Unexpected error discarding datagrams"); } } - flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE)); - - /* If the target side is a socket, it will be a reply socket that knows - * its own flowside. But if it's tap, then we need to look it up by - * hash. + /* Tap sides always need to be looked up by hash. 
Socket sides don't + * always, but sometimes do (receiving packets on a socket not specific + * to one flow). Unconditionally hash both sides so all our bases are + * covered */ - if (!pif_is_socket(tgtpif)) - flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE)); + flow_foreach_sidei(sidei) + flow_hash_insert(c, FLOW_SIDX(uflow, sidei)); + FLOW_ACTIVATE(uflow); return FLOW_SIDX(uflow, TGTSIDE); @@ -163,9 +195,11 @@ cancel: } /** - * udp_flow_from_sock() - Find or create UDP flow for "listening" socket + * udp_flow_from_sock() - Find or create UDP flow for incoming datagram * @c: Execution context - * @ref: epoll reference of the receiving socket + * @pif: Interface the datagram is arriving from + * @dst: Our (local) address to which the datagram is arriving + * @port: Our (local) port number to which the datagram is arriving * @s_in: Source socket address, filled in by recvmmsg() * @now: Timestamp * @@ -174,7 +208,8 @@ cancel: * Return: sidx for the destination side of the flow for this packet, or * FLOW_SIDX_NONE if we couldn't find or create a flow. 
*/ -flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, +flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, + const union inany_addr *dst, in_port_t port, const union sockaddr_inany *s_in, const struct timespec *now) { @@ -183,9 +218,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, union flow *flow; flow_sidx_t sidx; - ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN); - - sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, s_in, ref.udp.port); + sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port); if ((uflow = udp_at_sidx(sidx))) { uflow->ts = now->tv_sec; return flow_sidx_opposite(sidx); @@ -195,12 +228,11 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, char sastr[SOCKADDR_STRLEN]; debug("Couldn't allocate flow for UDP datagram from %s %s", - pif_name(ref.udp.pif), - sockaddr_ntop(s_in, sastr, sizeof(sastr))); + pif_name(pif), sockaddr_ntop(s_in, sastr, sizeof(sastr))); return FLOW_SIDX_NONE; } - ini = flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port); + ini = flow_initiate_sa(flow, pif, s_in, dst, port); if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || ini->oport == 0) { @@ -213,7 +245,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, return FLOW_SIDX_NONE; } - return udp_flow_new(c, flow, ref.fd, now); + return udp_flow_new(c, flow, now); } /** @@ -269,17 +301,45 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, return FLOW_SIDX_NONE; } - return udp_flow_new(c, flow, -1, now); + return udp_flow_new(c, flow, now); +} + +/** + * udp_flush_flow() - Flush datagrams that might not be for this flow + * @c: Execution context + * @uflow: Flow to handle + * @sidei: Side of the flow to flush + * @now: Current timestamp + */ +static void udp_flush_flow(const struct ctx *c, + const struct udp_flow *uflow, unsigned sidei, + const struct timespec *now) +{ + /* We don't know exactly where the datagrams will come from, but we know + * they'll 
have an interface and oport matching this flow */ + udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei], + uflow->f.side[sidei].oport, now); } /** * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows) + * @c: Execution context * @uflow: Flow to handle + * @now: Current timestamp * * Return: true if the connection is ready to free, false otherwise */ -bool udp_flow_defer(const struct udp_flow *uflow) +bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now) { + if (uflow->flush0) { + udp_flush_flow(c, uflow, INISIDE, now); + uflow->flush0 = false; + } + if (uflow->flush1) { + udp_flush_flow(c, uflow, TGTSIDE, now); + uflow->flush1 = false; + } return uflow->closed; } @@ -8,9 +8,12 @@ #define UDP_FLOW_H /** - * struct udp - Descriptor for a flow of UDP packets + * struct udp_flow - Descriptor for a flow of UDP packets * @f: Generic flow information + * @ttl: TTL or hop_limit for both sides * @closed: Flow is already closed + * @flush0: @s[0] may have datagrams queued for other flows + * @flush1: @s[1] may have datagrams queued for other flows * @ts: Activity timestamp * @s: Socket fd (or -1) for each side of the flow */ @@ -18,13 +21,19 @@ struct udp_flow { /* Must be first element */ struct flow_common f; - bool closed :1; + uint8_t ttl[SIDES]; + + bool closed :1, + flush0 :1, + flush1 :1; + time_t ts; int s[SIDES]; }; struct udp_flow *udp_at_sidx(flow_sidx_t sidx); -flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, +flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, + const union inany_addr *dst, in_port_t port, const union sockaddr_inany *s_in, const struct timespec *now); flow_sidx_t udp_flow_from_tap(const struct ctx *c, @@ -33,7 +42,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, in_port_t srcport, in_port_t dstport, const struct timespec *now); void udp_flow_close(const struct ctx *c, struct udp_flow *uflow); -bool udp_flow_defer(const struct udp_flow 
*uflow); +bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now); bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, const struct timespec *now); diff --git a/udp_internal.h b/udp_internal.h index 3b081f5..96d11cf 100644 --- a/udp_internal.h +++ b/udp_internal.h @@ -8,8 +8,6 @@ #include "tap.h" /* needed by udp_meta_t */ -#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ - /** * struct udp_payload_t - UDP header and data for inbound messages * @uh: UDP header @@ -30,5 +28,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen, bool no_udp_csum); -int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events); +void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, + in_port_t port, const struct timespec *now); + #endif /* UDP_INTERNAL_H */ @@ -40,7 +40,7 @@ static struct vu_virtq_element elem [VIRTQUEUE_MAX_SIZE]; * udp_vu_hdrlen() - return the size of the header in level 2 frame (UDP) * @v6: Set for IPv6 packet * - * Return: Return the size of the header + * Return: return the size of the header */ static size_t udp_vu_hdrlen(bool v6) { @@ -58,46 +58,25 @@ static size_t udp_vu_hdrlen(bool v6) } /** - * udp_vu_sock_info() - get socket information - * @s: Socket to get information from - * @s_in: Socket address (output) - * - * Return: 0 if socket address can be read, -1 otherwise - */ -static int udp_vu_sock_info(int s, union sockaddr_inany *s_in) -{ - struct msghdr msg = { - .msg_name = s_in, - .msg_namelen = sizeof(union sockaddr_inany), - }; - - return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); -} - -/** * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers * @c: Execution context + * @vq: virtqueue to use to receive data * @s: Socket to receive from - * @events: epoll events bitmap * @v6: Set for IPv6 connections * 
@dlen: Size of received data (output) * - * Return: Number of iov entries used to store the datagram + * Return: number of iov entries used to store the datagram */ -static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events, +static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s, bool v6, ssize_t *dlen) { - struct vu_dev *vdev = c->vdev; - struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + const struct vu_dev *vdev = c->vdev; int iov_cnt, idx, iov_used; struct msghdr msg = { 0 }; size_t off, hdrlen; ASSERT(!c->no_udp); - if (!(events & EPOLLIN)) - return 0; - /* compute L2 header length */ hdrlen = udp_vu_hdrlen(v6); @@ -214,125 +193,27 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used) } /** - * udp_vu_listen_sock_handler() - Handle new data from socket + * udp_vu_sock_to_tap() - Forward datagrams from socket to tap * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap - * @now: Current timestamp + * @s: Socket to read data from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward data from @s to */ -void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) +void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx) { - struct vu_dev *vdev = c->vdev; - struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; - int i; - - if (udp_sock_errs(c, ref, events) < 0) { - err("UDP: Unrecoverable error on listening socket:" - " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); - return; - } - - for (i = 0; i < UDP_MAX_FRAMES; i++) { - const struct flowside *toside; - union sockaddr_inany s_in; - flow_sidx_t sidx; - uint8_t pif; - ssize_t dlen; - int iov_used; - bool v6; - - if (udp_vu_sock_info(ref.fd, &s_in) < 0) - break; - - sidx = udp_flow_from_sock(c, ref, &s_in, now); - pif = pif_at_sidx(sidx); - - if (pif != PIF_TAP) { - if (flow_sidx_valid(sidx)) { - flow_sidx_t 
fromsidx = flow_sidx_opposite(sidx); - struct udp_flow *uflow = udp_at_sidx(sidx); - - flow_err(uflow, - "No support for forwarding UDP from %s to %s", - pif_name(pif_at_sidx(fromsidx)), - pif_name(pif)); - } else { - debug("Discarding 1 datagram without flow"); - } - - continue; - } - - toside = flowside_at_sidx(sidx); - - v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); - - iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen); - if (iov_used <= 0) - break; - - udp_vu_prepare(c, toside, dlen); - if (*c->pcap) { - udp_vu_csum(toside, iov_used); - pcap_iov(iov_vu, iov_used, - sizeof(struct virtio_net_hdr_mrg_rxbuf)); - } - vu_flush(vdev, vq, elem, iov_used); - } -} - -/** - * udp_vu_reply_sock_handler() - Handle new data from flow specific socket - * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap - * @now: Current timestamp - */ -void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) -{ - flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); - struct udp_flow *uflow = udp_at_sidx(ref.flowside); - int from_s = uflow->s[ref.flowside.sidei]; + bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; int i; - ASSERT(!c->no_udp); - - if (udp_sock_errs(c, ref, events) < 0) { - flow_err(uflow, "Unrecoverable error on reply socket"); - flow_err_details(uflow); - udp_flow_close(c, uflow); - return; - } - - for (i = 0; i < UDP_MAX_FRAMES; i++) { - uint8_t topif = pif_at_sidx(tosidx); + for (i = 0; i < n; i++) { ssize_t dlen; int iov_used; - bool v6; - - ASSERT(uflow); - - if (topif != PIF_TAP) { - uint8_t frompif = pif_at_sidx(ref.flowside); - - flow_err(uflow, - "No support for forwarding UDP from %s to %s", - pif_name(frompif), pif_name(topif)); - continue; - } - - v6 = !(inany_v4(&toside->eaddr) && 
inany_v4(&toside->oaddr)); - iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen); + iov_used = udp_vu_sock_recv(c, vq, s, v6, &dlen); if (iov_used <= 0) break; - flow_trace(uflow, "Received 1 datagram on reply socket"); - uflow->ts = now->tv_sec; udp_vu_prepare(c, toside, dlen); if (*c->pcap) { @@ -6,8 +6,8 @@ #ifndef UDP_VU_H #define UDP_VU_H -void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now); -void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now); +void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, + const struct timespec *now); +void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx); + #endif /* UDP_VU_H */ @@ -34,6 +34,7 @@ #include "passt.h" #include "packet.h" #include "log.h" +#include "pcap.h" #ifdef HAS_GETRANDOM #include <sys/random.h> #endif @@ -71,7 +72,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, case EPOLL_TYPE_UDP_LISTEN: freebind = c->freebind; /* fallthrough */ - case EPOLL_TYPE_UDP_REPLY: + case EPOLL_TYPE_UDP: proto = IPPROTO_UDP; socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; @@ -109,11 +110,15 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, debug("Failed to set SO_REUSEADDR on socket %i", fd); if (proto == IPPROTO_UDP) { + int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO; + int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR; int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; - int opt = af == AF_INET ? 
IP_RECVERR : IPV6_RECVERR; - if (setsockopt(fd, level, opt, &y, sizeof(y))) + if (setsockopt(fd, level, recverr, &y, sizeof(y))) die_perror("Failed to set RECVERR on socket %i", fd); + + if (setsockopt(fd, level, pktinfo, &y, sizeof(y))) + die_perror("Failed to set PKTINFO on socket %i", fd); } if (ifname && *ifname) { @@ -360,7 +365,7 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b) dst[i] = a[i] | b[i]; } -/* +/** * ns_enter() - Enter configured user (unless already joined) and network ns * @c: Execution context * @@ -495,7 +500,8 @@ int output_file_open(const char *path, int flags) * @pidfile_fd: Open PID file descriptor * @devnull_fd: Open file descriptor for /dev/null * - * Return: child PID on success, won't return on failure + * Return: 0 in the child process on success. The parent process exits. + * Does not return in either process on failure (calls _exit). */ int __daemon(int pidfile_fd, int devnull_fd) { @@ -603,7 +609,8 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, #endif } -/* write_all_buf() - write all of a buffer to an fd +/** + * write_all_buf() - write all of a buffer to an fd * @fd: File descriptor * @buf: Pointer to base of buffer * @len: Length of buffer @@ -633,7 +640,8 @@ int write_all_buf(int fd, const void *buf, size_t len) return 0; } -/* write_remainder() - write the tail of an IO vector to an fd +/** + * write_remainder() - write the tail of an IO vector to an fd * @fd: File descriptor * @iov: IO vector * @iovcnt: Number of entries in @iov @@ -757,7 +765,7 @@ int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip) * @dst: output buffer, minimum SOCKADDR_STRLEN bytes * @size: size of buffer at @dst * - * Return: On success, a non-null pointer to @dst, NULL on failure + * Return: on success, a non-null pointer to @dst, NULL on failure */ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size) { @@ -817,7 +825,7 @@ const char 
*sockaddr_ntop(const void *sa, char *dst, socklen_t size) * @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes * @size: Size of buffer at @dst * - * Return: On success, a non-null pointer to @dst, NULL on failure + * Return: on success, a non-null pointer to @dst, NULL on failure */ const char *eth_ntop(const unsigned char *mac, char *dst, size_t size) { @@ -834,7 +842,7 @@ const char *eth_ntop(const unsigned char *mac, char *dst, size_t size) /** str_ee_origin() - Convert socket extended error origin to a string * @ee: Socket extended error structure * - * Return: Static string describing error origin + * Return: static string describing error origin */ const char *str_ee_origin(const struct sock_extended_err *ee) { @@ -871,7 +879,9 @@ void close_open_files(int argc, char **argv) errno = 0; fd = strtol(optarg, NULL, 0); - if (errno || fd <= STDERR_FILENO || fd > INT_MAX) + if (errno || + (fd != STDIN_FILENO && fd <= STDERR_FILENO) || + fd > INT_MAX) die("Invalid --fd: %s", optarg); } } while (name != -1); @@ -1017,3 +1027,36 @@ void encode_domain_name(char *buf, const char *domain_name) } p[i] = 0L; } + +/** + * abort_with_msg() - Print error message and abort + * @fmt: Format string + * @...: Format parameters + */ +void abort_with_msg(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vlogmsg(true, false, LOG_CRIT, fmt, ap); + va_end(ap); + + /* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp, + * but that will still get the job done. 
+ */ + abort(); +} + +/** + * fsync_pcap_and_log() - Flush pcap and log files as needed + * + * #syscalls fsync + */ +void fsync_pcap_and_log(void) +{ + if (pcap_fd != -1 && fsync(pcap_fd)) + warn_perror("Failed to flush pcap file, it might be truncated"); + + if (log_file != -1) + (void)fsync(log_file); +} @@ -31,9 +31,6 @@ #ifndef SECCOMP_RET_KILL_PROCESS #define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL #endif -#ifndef ETH_MAX_MTU -#define ETH_MAX_MTU USHRT_MAX -#endif #ifndef IP_MAX_MTU #define IP_MAX_MTU USHRT_MAX #endif @@ -64,27 +61,22 @@ #define STRINGIFY(x) #x #define STR(x) STRINGIFY(x) -#ifdef CPPCHECK_6936 +void abort_with_msg(const char *fmt, ...) + __attribute__((format(printf, 1, 2), noreturn)); + /* Some cppcheck versions get confused by aborts inside a loop, causing * it to give false positive uninitialised variable warnings later in * the function, because it doesn't realise the non-initialising path * already exited. See https://trac.cppcheck.net/ticket/13227 + * + * Therefore, avoid using the usual do while wrapper we use to force the macro + * to act like a single statement requiring a ';'. */ -#define ASSERT(expr) \ - ((expr) ? (void)0 : abort()) -#else +#define ASSERT_WITH_MSG(expr, ...) \ + ((expr) ? 
(void)0 : abort_with_msg(__VA_ARGS__)) #define ASSERT(expr) \ - do { \ - if (!(expr)) { \ - err("ASSERTION FAILED in %s (%s:%d): %s", \ - __func__, __FILE__, __LINE__, STRINGIFY(expr)); \ - /* This may actually SIGSYS, due to seccomp, \ - * but that will still get the job done \ - */ \ - abort(); \ - } \ - } while (0) -#endif + ASSERT_WITH_MSG((expr), "ASSERTION FAILED in %s (%s:%d): %s", \ + __func__, __FILE__, __LINE__, STRINGIFY(expr)) #ifdef P_tmpdir #define TMPDIR P_tmpdir @@ -160,7 +152,7 @@ * ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address * @p: Pointer to the BE value in memory * - * Returns: Host-order value of 32-bit BE quantity at @p + * Return: host-order value of 32-bit BE quantity at @p */ static inline uint32_t ntohl_unaligned(const void *p) { @@ -234,12 +226,13 @@ int read_all_buf(int fd, void *buf, size_t len); int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip); void close_open_files(int argc, char **argv); bool snprintf_check(char *str, size_t size, const char *format, ...); +void fsync_pcap_and_log(void); /** * af_name() - Return name of an address family * @af: Address/protocol family (AF_INET or AF_INET6) * - * Returns: Name of the protocol family as a string + * Return: name of the protocol family as a string */ static inline const char *af_name(sa_family_t af) { @@ -379,6 +372,16 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr, #define accept4(s, addr, addrlen, flags) \ wrap_accept4((s), (addr), (addrlen), (flags)) +static inline int wrap_getsockname(int sockfd, struct sockaddr *addr, +/* cppcheck-suppress constParameterPointer */ + socklen_t *addrlen) +{ + sa_init(addr, addrlen); + return getsockname(sockfd, addr, addrlen); +} +#define getsockname(s, addr, addrlen) \ + wrap_getsockname((s), (addr), (addrlen)) + #define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */ void encode_domain_name(char *buf, const char *domain_name); diff --git a/vhost_user.c 
b/vhost_user.c index 105f77a..fa343a8 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -137,8 +137,8 @@ static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr) unsigned int i; /* Find matching memory region. */ - for (i = 0; i < dev->nregions; i++) { - const struct vu_dev_region *r = &dev->regions[i]; + for (i = 0; i < dev->memory.nregions; i++) { + const struct vu_dev_region *r = &dev->memory.regions[i]; if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ @@ -183,7 +183,7 @@ static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val) * @conn_fd: vhost-user command socket * @vmsg: vhost-user message * - * Return: 0 if recvmsg() has been interrupted or if there's no data to read, + * Return: 0 if recvmsg() has been interrupted or if there's no data to read, * 1 if a message has been received */ static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg) @@ -302,13 +302,13 @@ static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg) * @conn_fd: vhost-user command socket * @vmsg: vhost-user message */ -static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg) +static void vu_send_reply(int conn_fd, struct vhost_user_msg *vmsg) { - msg->hdr.flags &= ~VHOST_USER_VERSION_MASK; - msg->hdr.flags |= VHOST_USER_VERSION; - msg->hdr.flags |= VHOST_USER_REPLY_MASK; + vmsg->hdr.flags &= ~VHOST_USER_VERSION_MASK; + vmsg->hdr.flags |= VHOST_USER_VERSION; + vmsg->hdr.flags |= VHOST_USER_REPLY_MASK; - vu_message_write(conn_fd, msg); + vu_message_write(conn_fd, vmsg); } /** @@ -316,10 +316,10 @@ static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg) * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: True as a reply is requested + * Return: true as a reply is requested */ static bool vu_get_features_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { uint64_t features = 1ULL << VIRTIO_F_VERSION_1 | @@ 
-329,9 +329,9 @@ static bool vu_get_features_exec(struct vu_dev *vdev, (void)vdev; - vmsg_set_reply_u64(msg, features); + vmsg_set_reply_u64(vmsg, features); - debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64); + debug("Sending back to guest u64: 0x%016"PRIx64, vmsg->payload.u64); return true; } @@ -345,7 +345,7 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable) { uint16_t i; - for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + for (i = 0; i < VHOST_USER_MAX_VQS; i++) vdev->vq[i].enable = enable; } @@ -354,14 +354,14 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable) * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool vu_set_features_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - debug("u64: 0x%016"PRIx64, msg->payload.u64); + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); - vdev->features = msg->payload.u64; + vdev->features = vmsg->payload.u64; /* We only support devices conforming to VIRTIO 1.0 or * later */ @@ -379,13 +379,13 @@ static bool vu_set_features_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool vu_set_owner_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { (void)vdev; - (void)msg; + (void)vmsg; return false; } @@ -396,7 +396,7 @@ static bool vu_set_owner_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vq: Virtqueue * - * Return: True if ring cannot be mapped to our address space + * Return: true if ring cannot be mapped to our address space */ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) { @@ -418,18 +418,18 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no 
reply is requested + * Return: false as no reply is requested * * #syscalls:vu mmap|mmap2 munmap */ static bool vu_set_mem_table_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - struct vhost_user_memory m = msg->payload.memory, *memory = &m; + struct vhost_user_memory m = vmsg->payload.memory, *memory = &m; unsigned int i; - for (i = 0; i < vdev->nregions; i++) { - const struct vu_dev_region *r = &vdev->regions[i]; + for (i = 0; i < vdev->memory.nregions; i++) { + const struct vu_dev_region *r = &vdev->memory.regions[i]; if (r->mmap_addr) { /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ @@ -437,12 +437,12 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, r->size + r->mmap_offset); } } - vdev->nregions = memory->nregions; + vdev->memory.nregions = memory->nregions; debug("vhost-user nregions: %u", memory->nregions); - for (i = 0; i < vdev->nregions; i++) { + for (i = 0; i < vdev->memory.nregions; i++) { struct vhost_user_memory_region *msg_region = &memory->regions[i]; - struct vu_dev_region *dev_region = &vdev->regions[i]; + struct vu_dev_region *dev_region = &vdev->memory.regions[i]; void *mmap_addr; debug("vhost-user region %d", i); @@ -465,7 +465,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, */ mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, PROT_READ | PROT_WRITE, MAP_SHARED | - MAP_NORESERVE, msg->fds[i], 0); + MAP_NORESERVE, vmsg->fds[i], 0); if (mmap_addr == MAP_FAILED) die_perror("vhost-user region mmap error"); @@ -474,23 +474,17 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, debug(" mmap_addr: 0x%016"PRIx64, dev_region->mmap_addr); - close(msg->fds[i]); + close(vmsg->fds[i]); } - for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + for (i = 0; i < VHOST_USER_MAX_VQS; i++) { if (vdev->vq[i].vring.desc) { if (map_ring(vdev, &vdev->vq[i])) die("remapping queue %d during setmemtable", i); } } - /* As vu_packet_check_range() has no access to the number of - * memory regions, 
mark the end of the array with mmap_addr = 0 - */ - ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1); - vdev->regions[vdev->nregions].mmap_addr = 0; - - tap_sock_update_pool(vdev->regions, 0); + ASSERT(vdev->memory.nregions < VHOST_USER_MAX_RAM_SLOTS); return false; } @@ -541,7 +535,7 @@ static void vu_log_page(uint8_t *log_table, uint64_t page) /** * vu_log_write() - Log memory write - * @dev: vhost-user device + * @vdev: vhost-user device * @address: Memory address * @length: Memory size */ @@ -566,23 +560,23 @@ void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length) * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: true as a reply is requested * * #syscalls:vu mmap|mmap2 munmap */ static bool vu_set_log_base_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { uint64_t log_mmap_size, log_mmap_offset; void *base; int fd; - if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log)) + if (vmsg->fd_num != 1 || vmsg->hdr.size != sizeof(vmsg->payload.log)) die("vhost-user: Invalid log_base message"); - fd = msg->fds[0]; - log_mmap_offset = msg->payload.log.mmap_offset; - log_mmap_size = msg->payload.log.mmap_size; + fd = vmsg->fds[0]; + log_mmap_offset = vmsg->payload.log.mmap_offset; + log_mmap_size = vmsg->payload.log.mmap_size; debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset); debug("vhost-user log mmap_size: %"PRId64, log_mmap_size); @@ -599,8 +593,8 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev, vdev->log_table = base; vdev->log_size = log_mmap_size; - msg->hdr.size = sizeof(msg->payload.u64); - msg->fd_num = 0; + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->fd_num = 0; return true; } @@ -610,18 +604,18 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested 
*/ static bool vu_set_log_fd_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - if (msg->fd_num != 1) + if (vmsg->fd_num != 1) die("Invalid log_fd message"); if (vdev->log_call_fd != -1) close(vdev->log_call_fd); - vdev->log_call_fd = msg->fds[0]; + vdev->log_call_fd = vmsg->fds[0]; debug("Got log_call_fd: %d", vdev->log_call_fd); @@ -633,13 +627,13 @@ static bool vu_set_log_fd_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool vu_set_vring_num_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int idx = msg->payload.state.index; - unsigned int num = msg->payload.state.num; + unsigned int idx = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; trace("State.index: %u", idx); trace("State.num: %u", num); @@ -653,16 +647,16 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool vu_set_vring_addr_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { /* We need to copy the payload to vhost_vring_addr structure - * to access index because address of msg->payload.addr + * to access index because address of vmsg->payload.addr * can be unaligned as it is packed. 
*/ - struct vhost_vring_addr addr = msg->payload.addr; + struct vhost_vring_addr addr = vmsg->payload.addr; struct vu_virtq *vq = &vdev->vq[addr.index]; debug("vhost_vring_addr:"); @@ -677,7 +671,7 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev, debug(" log_guest_addr: 0x%016" PRIx64, (uint64_t)addr.log_guest_addr); - vq->vra = msg->payload.addr; + vq->vra = vmsg->payload.addr; vq->vring.flags = addr.flags; vq->vring.log_guest_addr = addr.log_guest_addr; @@ -699,13 +693,13 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool vu_set_vring_base_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int idx = msg->payload.state.index; - unsigned int num = msg->payload.state.num; + unsigned int idx = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; debug("State.index: %u", idx); debug("State.num: %u", num); @@ -720,16 +714,16 @@ static bool vu_set_vring_base_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: True as a reply is requested + * Return: true as a reply is requested */ static bool vu_get_vring_base_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int idx = msg->payload.state.index; + unsigned int idx = vmsg->payload.state.index; debug("State.index: %u", idx); - msg->payload.state.num = vdev->vq[idx].last_avail_idx; - msg->hdr.size = sizeof(msg->payload.state); + vmsg->payload.state.num = vdev->vq[idx].last_avail_idx; + vmsg->hdr.size = sizeof(vmsg->payload.state); vdev->vq[idx].started = false; vdev->vq[idx].vring.avail = 0; @@ -771,21 +765,21 @@ static void vu_set_watch(const struct vu_dev *vdev, int idx) * close fds if NOFD bit is set * @vmsg: vhost-user message */ -static void vu_check_queue_msg_file(struct vhost_user_msg 
*msg) +static void vu_check_queue_msg_file(struct vhost_user_msg *vmsg) { - bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; - int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - if (idx >= VHOST_USER_MAX_QUEUES) + if (idx >= VHOST_USER_MAX_VQS) die("Invalid vhost-user queue index: %u", idx); if (nofd) { - vmsg_close_fds(msg); + vmsg_close_fds(vmsg); return; } - if (msg->fd_num != 1) - die("Invalid fds in vhost-user request: %d", msg->hdr.request); + if (vmsg->fd_num != 1) + die("Invalid fds in vhost-user request: %d", vmsg->hdr.request); } /** @@ -794,17 +788,17 @@ static void vu_check_queue_msg_file(struct vhost_user_msg *msg) * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool vu_set_vring_kick_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; - int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - debug("u64: 0x%016"PRIx64, msg->payload.u64); + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); - vu_check_queue_msg_file(msg); + vu_check_queue_msg_file(vmsg); if (vdev->vq[idx].kick_fd != -1) { epoll_del(vdev->context, vdev->vq[idx].kick_fd); @@ -813,7 +807,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev, } if (!nofd) - vdev->vq[idx].kick_fd = msg->fds[0]; + vdev->vq[idx].kick_fd = vmsg->fds[0]; debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx); @@ -834,17 +828,17 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool 
vu_set_vring_call_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; - int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - debug("u64: 0x%016"PRIx64, msg->payload.u64); + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); - vu_check_queue_msg_file(msg); + vu_check_queue_msg_file(vmsg); if (vdev->vq[idx].call_fd != -1) { close(vdev->vq[idx].call_fd); @@ -852,11 +846,11 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev, } if (!nofd) - vdev->vq[idx].call_fd = msg->fds[0]; + vdev->vq[idx].call_fd = vmsg->fds[0]; /* in case of I/O hang after reconnecting */ if (vdev->vq[idx].call_fd != -1) - eventfd_write(msg->fds[0], 1); + eventfd_write(vmsg->fds[0], 1); debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx); @@ -869,17 +863,17 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool vu_set_vring_err_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; - int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - debug("u64: 0x%016"PRIx64, msg->payload.u64); + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); - vu_check_queue_msg_file(msg); + vu_check_queue_msg_file(vmsg); if (vdev->vq[idx].err_fd != -1) { close(vdev->vq[idx].err_fd); @@ -887,7 +881,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev, } if (!nofd) - vdev->vq[idx].err_fd = msg->fds[0]; + vdev->vq[idx].err_fd = vmsg->fds[0]; return false; } @@ -898,10 +892,10 @@ static bool 
vu_set_vring_err_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: True as a reply is requested + * Return: true as a reply is requested */ static bool vu_get_protocol_features_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | @@ -909,7 +903,7 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev, 1ULL << VHOST_USER_PROTOCOL_F_RARP; (void)vdev; - vmsg_set_reply_u64(msg, features); + vmsg_set_reply_u64(vmsg, features); return true; } @@ -919,16 +913,16 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool vu_set_protocol_features_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - uint64_t features = msg->payload.u64; + uint64_t features = vmsg->payload.u64; debug("u64: 0x%016"PRIx64, features); - vdev->protocol_features = msg->payload.u64; + vdev->protocol_features = vmsg->payload.u64; return false; } @@ -938,14 +932,16 @@ static bool vu_set_protocol_features_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: True as a reply is requested + * Return: true as a reply is requested */ static bool vu_get_queue_num_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { (void)vdev; - vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES); + vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_VQS / 2); + + debug("VHOST_USER_MAX_VQS %u", VHOST_USER_MAX_VQS / 2); return true; } @@ -955,18 +951,18 @@ static bool vu_get_queue_num_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool 
vu_set_vring_enable_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int enable = msg->payload.state.num; - unsigned int idx = msg->payload.state.index; + unsigned int enable = vmsg->payload.state.num; + unsigned int idx = vmsg->payload.state.index; debug("State.index: %u", idx); debug("State.enable: %u", enable); - if (idx >= VHOST_USER_MAX_QUEUES) + if (idx >= VHOST_USER_MAX_VQS) die("Invalid vring_enable index: %u", idx); vdev->vq[idx].enable = enable; @@ -974,17 +970,17 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev, } /** - * vu_set_send_rarp_exec() - vhost-user specification says: "Broadcast a fake - * RARP to notify the migration is terminated", - * but passt doesn't need to update any ARP table, - * so do nothing to silence QEMU bogus error message + * vu_send_rarp_exec() - vhost-user specification says: "Broadcast a fake + * RARP to notify the migration is terminated", + * but passt doesn't need to update any ARP table, + * so do nothing to silence QEMU bogus error message * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: False as no reply is requested + * Return: false as no reply is requested */ static bool vu_send_rarp_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { char macstr[ETH_ADDRSTRLEN]; @@ -993,7 +989,7 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev, /* ignore the command */ debug("Ignore command VHOST_USER_SEND_RARP for %s", - eth_ntop((unsigned char *)&msg->payload.u64, macstr, + eth_ntop((unsigned char *)&vmsg->payload.u64, macstr, sizeof(macstr))); return false; @@ -1004,16 +1000,16 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: True as the reply contains 0 to indicate success + * Return: true as the reply contains 0 to indicate success * and set bit 8 as we don't provide our own fd. 
*/ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - unsigned int direction = msg->payload.transfer_state.direction; - unsigned int phase = msg->payload.transfer_state.phase; + unsigned int direction = vmsg->payload.transfer_state.direction; + unsigned int phase = vmsg->payload.transfer_state.phase; - if (msg->fd_num != 1) + if (vmsg->fd_num != 1) die("Invalid device_state_fd message"); if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED) @@ -1021,13 +1017,13 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE && direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD) - die("Invalide device_state_fd direction: %d", direction); + die("Invalid device_state_fd direction: %d", direction); - migrate_request(vdev->context, msg->fds[0], + migrate_request(vdev->context, vmsg->fds[0], direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD); /* We don't provide a new fd for the data transfer */ - vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK); + vmsg_set_reply_u64(vmsg, VHOST_USER_VRING_NOFD_MASK); return true; } @@ -1037,13 +1033,13 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, * @vdev: vhost-user device * @vmsg: vhost-user message * - * Return: True as the reply contains the migration result + * Return: true as the reply contains the migration result */ /* cppcheck-suppress constParameterCallback */ static bool vu_check_device_state_exec(struct vu_dev *vdev, - struct vhost_user_msg *msg) + struct vhost_user_msg *vmsg) { - vmsg_set_reply_u64(msg, vdev->context->device_state_result); + vmsg_set_reply_u64(vmsg, vdev->context->device_state_result); return true; } @@ -1051,7 +1047,6 @@ static bool vu_check_device_state_exec(struct vu_dev *vdev, /** * vu_init() - Initialize vhost-user device structure * @c: execution context - * @vdev: vhost-user device */ void vu_init(struct ctx *c) { @@ -1059,7 +1054,7 @@ void 
vu_init(struct ctx *c) c->vdev = &vdev_storage; c->vdev->context = c; - for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + for (i = 0; i < VHOST_USER_MAX_VQS; i++) { c->vdev->vq[i] = (struct vu_virtq){ .call_fd = -1, .kick_fd = -1, @@ -1082,7 +1077,7 @@ void vu_cleanup(struct vu_dev *vdev) { unsigned int i; - for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + for (i = 0; i < VHOST_USER_MAX_VQS; i++) { struct vu_virtq *vq = &vdev->vq[i]; vq->started = false; @@ -1107,8 +1102,8 @@ void vu_cleanup(struct vu_dev *vdev) vq->vring.avail = 0; } - for (i = 0; i < vdev->nregions; i++) { - const struct vu_dev_region *r = &vdev->regions[i]; + for (i = 0; i < vdev->memory.nregions; i++) { + const struct vu_dev_region *r = &vdev->memory.regions[i]; if (r->mmap_addr) { /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ @@ -1116,7 +1111,7 @@ void vu_cleanup(struct vu_dev *vdev) r->size + r->mmap_offset); } } - vdev->nregions = 0; + vdev->memory.nregions = 0; vu_close_log(vdev); @@ -1134,7 +1129,7 @@ static void vu_sock_reset(struct vu_dev *vdev) } static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, - struct vhost_user_msg *msg) = { + struct vhost_user_msg *vmsg) = { [VHOST_USER_GET_FEATURES] = vu_get_features_exec, [VHOST_USER_SET_FEATURES] = vu_set_features_exec, [VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec, @@ -1165,7 +1160,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, */ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) { - struct vhost_user_msg msg = { 0 }; + struct vhost_user_msg vmsg = { 0 }; bool need_reply, reply_requested; int ret; @@ -1174,41 +1169,46 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) return; } - ret = vu_message_read_default(fd, &msg); + ret = vu_message_read_default(fd, &vmsg); if (ret == 0) { vu_sock_reset(vdev); return; } debug("================ Vhost user message ================"); - debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request), - 
msg.hdr.request); - debug("Flags: 0x%x", msg.hdr.flags); - debug("Size: %u", msg.hdr.size); + debug("Request: %s (%d)", vu_request_to_string(vmsg.hdr.request), + vmsg.hdr.request); + debug("Flags: 0x%x", vmsg.hdr.flags); + debug("Size: %u", vmsg.hdr.size); - need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; + need_reply = vmsg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; - if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX && - vu_handle[msg.hdr.request]) - reply_requested = vu_handle[msg.hdr.request](vdev, &msg); + if (vmsg.hdr.request >= 0 && vmsg.hdr.request < VHOST_USER_MAX && + vu_handle[vmsg.hdr.request]) + reply_requested = vu_handle[vmsg.hdr.request](vdev, &vmsg); else - die("Unhandled request: %d", msg.hdr.request); + die("Unhandled request: %d", vmsg.hdr.request); /* cppcheck-suppress legacyUninitvar */ if (!reply_requested && need_reply) { - msg.payload.u64 = 0; - msg.hdr.flags = 0; - msg.hdr.size = sizeof(msg.payload.u64); - msg.fd_num = 0; + vmsg.payload.u64 = 0; + vmsg.hdr.flags = 0; + vmsg.hdr.size = sizeof(vmsg.payload.u64); + vmsg.fd_num = 0; reply_requested = true; } if (reply_requested) - vu_send_reply(fd, &msg); + vu_send_reply(fd, &vmsg); - if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE && + if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE && vdev->context->device_state_result == 0 && !vdev->context->migrate_target) { - info("Migration complete, exiting"); - _exit(EXIT_SUCCESS); + if (vdev->context->migrate_exit) { + info("Migration complete, exiting"); + _exit(EXIT_SUCCESS); + } + + info("Migration complete"); + vdev->context->one_off = false; } } diff --git a/vhost_user.h b/vhost_user.h index 1daacd1..e806a9e 100644 --- a/vhost_user.h +++ b/vhost_user.h @@ -184,7 +184,7 @@ union vhost_user_payload { }; /** - * struct vhost_user_msg - vhost-use message + * struct vhost_user_msg - vhost-user message * @hdr: Message header * @payload: Message payload * @fds: File descriptors associated with the message @@ -217,7 +217,7 @@ 
struct vhost_user_msg { }) /** - * vu_queue_enabled - Return state of a virtqueue + * vu_queue_enabled() - Return state of a virtqueue * @vq: virtqueue to check * * Return: true if the virqueue is enabled, false otherwise @@ -228,7 +228,7 @@ static inline bool vu_queue_enabled(const struct vu_virtq *vq) } /** - * vu_queue_started - Return state of a virtqueue + * vu_queue_started() - Return state of a virtqueue * @vq: virtqueue to check * * Return: true if the virqueue is started, false otherwise @@ -102,8 +102,8 @@ static void *vu_gpa_to_va(const struct vu_dev *dev, uint64_t *plen, return NULL; /* Find matching memory region. */ - for (i = 0; i < dev->nregions; i++) { - const struct vu_dev_region *r = &dev->regions[i]; + for (i = 0; i < dev->memory.nregions; i++) { + const struct vu_dev_region *r = &dev->memory.regions[i]; if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { @@ -156,9 +156,9 @@ static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i) } /** - * virtq_used_event - Get location of used event indices + * virtq_used_event() - Get location of used event indices * (only with VIRTIO_F_EVENT_IDX) - * @vq Virtqueue + * @vq: Virtqueue * * Return: return the location of the used event index */ @@ -170,7 +170,7 @@ static inline uint16_t *virtq_used_event(const struct vu_virtq *vq) /** * vring_get_used_event() - Get the used event from the available ring - * @vq Virtqueue + * @vq: Virtqueue * * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set) * used_event is a performant alternative where the driver @@ -235,6 +235,7 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev, memcpy(desc, orig_desc, read_len); len -= read_len; addr += read_len; + /* NOLINTNEXTLINE(bugprone-sizeof-expression,cert-arr39-c) */ desc += read_len / sizeof(struct vring_desc); } @@ -243,9 +244,9 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev, /** * enum virtqueue_read_desc_state - State in the 
descriptor chain - * @VIRTQUEUE_READ_DESC_ERROR Found an invalid descriptor - * @VIRTQUEUE_READ_DESC_DONE No more descriptors in the chain - * @VIRTQUEUE_READ_DESC_MORE there are more descriptors in the chain + * @VIRTQUEUE_READ_DESC_ERROR: Found an invalid descriptor + * @VIRTQUEUE_READ_DESC_DONE: No more descriptors in the chain + * @VIRTQUEUE_READ_DESC_MORE: there are more descriptors in the chain */ enum virtqueue_read_desc_state { VIRTQUEUE_READ_DESC_ERROR = -1, @@ -346,8 +347,9 @@ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) die_perror("Error writing vhost-user queue eventfd"); } -/* virtq_avail_event() - Get location of available event indices - * (only with VIRTIO_F_EVENT_IDX) +/** + * virtq_avail_event() - Get location of available event indices + * (only with VIRTIO_F_EVENT_IDX) * @vq: Virtqueue * * Return: return the location of the available event index @@ -420,8 +422,8 @@ static bool virtqueue_map_desc(const struct vu_dev *dev, } /** - * vu_queue_map_desc - Map the virtqueue descriptor ring into our virtual - * address space + * vu_queue_map_desc() - Map the virtqueue descriptor ring into our virtual + * address space * @dev: Vhost-user device * @vq: Virtqueue * @idx: First descriptor ring entry to map @@ -504,7 +506,7 @@ static int vu_queue_map_desc(const struct vu_dev *dev, * vu_queue_pop() - Pop an entry from the virtqueue * @dev: Vhost-user device * @vq: Virtqueue - * @elem: Virtqueue element to file with the entry information + * @elem: Virtqueue element to fill with the entry information * * Return: -1 if there is an error, 0 otherwise */ @@ -544,7 +546,7 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq, } /** - * vu_queue_detach_element() - Detach an element from the virqueue + * vu_queue_detach_element() - Detach an element from the virtqueue * @vq: Virtqueue */ void vu_queue_detach_element(struct vu_virtq *vq) @@ -554,7 +556,7 @@ void vu_queue_detach_element(struct vu_virtq *vq) } /** - * 
vu_queue_unpop() - Push back the previously popped element from the virqueue + * vu_queue_unpop() - Push back the previously popped element from the virtqueue * @vq: Virtqueue */ /* cppcheck-suppress unusedFunction */ @@ -568,6 +570,8 @@ void vu_queue_unpop(struct vu_virtq *vq) * vu_queue_rewind() - Push back a given number of popped elements * @vq: Virtqueue * @num: Number of element to unpop + * + * Return: true on success, false if not */ bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num) { @@ -88,7 +88,7 @@ struct vu_dev_region { uint64_t mmap_addr; }; -#define VHOST_USER_MAX_QUEUES 2 +#define VHOST_USER_MAX_VQS 2 /* * Set a reasonable maximum number of ram slots, which will be supported by @@ -97,10 +97,21 @@ struct vu_dev_region { #define VHOST_USER_MAX_RAM_SLOTS 32 /** + * struct vdev_memory - Describes the shared memory regions for a vhost-user + * device + * @nregions: Number of shared memory regions + * @regions: Guest shared memory regions + */ +struct vdev_memory { + uint32_t nregions; + struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; +}; + +/** * struct vu_dev - vhost-user device information * @context: Execution context - * @nregions: Number of shared memory regions - * @regions: Guest shared memory regions + * @memory: Shared memory regions + * @vq: Virtqueues of the device * @features: Vhost-user features * @protocol_features: Vhost-user protocol features * @log_call_fd: Eventfd to report logging update @@ -109,9 +120,8 @@ struct vu_dev_region { */ struct vu_dev { struct ctx *context; - uint32_t nregions; - struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; - struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; + struct vdev_memory memory; + struct vu_virtq vq[VHOST_USER_MAX_VQS]; uint64_t features; uint64_t protocol_features; int log_call_fd; @@ -140,7 +150,7 @@ struct vu_virtq_element { * @features: Features set * @fb: Feature bit to check * - * Return: True if the feature bit is set + * Return: true if the feature bit is set */ static 
inline bool has_feature(uint64_t features, unsigned int fbit) { @@ -150,9 +160,9 @@ static inline bool has_feature(uint64_t features, unsigned int fbit) /** * vu_has_feature() - Check if a virtio-net feature is available * @vdev: Vhost-user device - * @bit: Feature to check + * @fbit: Feature to check * - * Return: True if the feature is available + * Return: true if the feature is available */ static inline bool vu_has_feature(const struct vu_dev *vdev, unsigned int fbit) @@ -163,9 +173,9 @@ static inline bool vu_has_feature(const struct vu_dev *vdev, /** * vu_has_protocol_feature() - Check if a vhost-user feature is available * @vdev: Vhost-user device - * @bit: Feature to check + * @fbit: Feature to check * - * Return: True if the feature is available + * Return: true if the feature is available */ /* cppcheck-suppress unusedFunction */ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, diff --git a/vu_common.c b/vu_common.c index 686a09b..b716070 100644 --- a/vu_common.c +++ b/vu_common.c @@ -25,22 +25,28 @@ /** * vu_packet_check_range() - Check if a given memory zone is contained in * a mapped guest memory region - * @buf: Array of the available memory regions + * @memory: Array of the available memory regions * @ptr: Start of desired data range - * @size: Length of desired data range + * @len: Length of desired data range * * Return: 0 if the zone is in a mapped memory region, -1 otherwise */ -int vu_packet_check_range(void *buf, const char *ptr, size_t len) +int vu_packet_check_range(struct vdev_memory *memory, + const char *ptr, size_t len) { - struct vu_dev_region *dev_region; + struct vu_dev_region *dev_region = memory->regions; + unsigned int i; - for (dev_region = buf; dev_region->mmap_addr; dev_region++) { + for (i = 0; i < memory->nregions; i++) { + uintptr_t base_addr = dev_region[i].mmap_addr + + dev_region[i].mmap_offset; /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ - char *m = (char *)(uintptr_t)dev_region->mmap_addr; + const 
char *base = (const char *)base_addr; - if (m <= ptr && - ptr + len <= m + dev_region->mmap_offset + dev_region->size) + ASSERT(base_addr >= dev_region[i].mmap_addr); + + if (len <= dev_region[i].size && base <= ptr && + (size_t)(ptr - base) <= dev_region[i].size - len) return 0; } @@ -159,7 +165,6 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; struct vu_virtq *vq = &vdev->vq[index]; - int hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); int out_sg_count; int count; @@ -172,6 +177,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, while (count < VIRTQUEUE_MAX_SIZE && out_sg_count + VU_MAX_TX_BUFFER_NB <= VIRTQUEUE_MAX_SIZE) { int ret; + struct iov_tail data; elem[count].out_num = VU_MAX_TX_BUFFER_NB; elem[count].out_sg = &out_sg[out_sg_count]; @@ -187,25 +193,10 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, warn("virtio-net transmit queue contains no out buffers"); break; } - if (elem[count].out_num == 1) { - tap_add_packet(vdev->context, - elem[count].out_sg[0].iov_len - hdrlen, - (char *)elem[count].out_sg[0].iov_base + - hdrlen); - } else { - /* vnet header can be in a separate iovec */ - if (elem[count].out_num != 2) { - debug("virtio-net transmit queue contains more than one buffer ([%d]: %u)", - count, elem[count].out_num); - } else if (elem[count].out_sg[0].iov_len != (size_t)hdrlen) { - debug("virtio-net transmit queue entry not aligned on hdrlen ([%d]: %d != %zu)", - count, hdrlen, elem[count].out_sg[0].iov_len); - } else { - tap_add_packet(vdev->context, - elem[count].out_sg[1].iov_len, - (char *)elem[count].out_sg[1].iov_base); - } - } + + data = IOV_TAIL(elem[count].out_sg, elem[count].out_num, 0); + if (IOV_DROP_HEADER(&data, struct virtio_net_hdr_mrg_rxbuf)) + tap_add_packet(vdev->context, &data, now); count++; } |