-rw-r--r--  Makefile | 3
-rw-r--r--  README.md | 2
-rw-r--r--  arp.c | 86
-rw-r--r--  arp.h | 2
-rw-r--r--  checksum.c | 8
-rw-r--r--  conf.c | 435
-rw-r--r--  conf.h | 1
-rw-r--r--  contrib/fedora/passt.spec | 42
-rw-r--r--  contrib/selinux/passt-repair.te | 16
-rw-r--r--  contrib/selinux/passt.te | 8
-rw-r--r--  contrib/selinux/pasta.fc | 10
-rw-r--r--  contrib/selinux/pasta.te | 48
-rw-r--r--  dhcp.c | 48
-rw-r--r--  dhcp.h | 2
-rw-r--r--  dhcpv6.c | 227
-rw-r--r--  dhcpv6.h | 2
-rw-r--r--  doc/platform-requirements/.gitignore | 1
-rw-r--r--  doc/platform-requirements/Makefile | 4
-rw-r--r--  doc/platform-requirements/common.h | 1
-rw-r--r--  doc/platform-requirements/listen-vs-repair.c | 128
-rw-r--r--  doc/platform-requirements/reuseaddr-priority.c | 6
-rw-r--r--  epoll_type.h | 4
-rw-r--r--  flow.c | 186
-rw-r--r--  flow.h | 3
-rw-r--r--  flow_table.h | 4
-rw-r--r--  fwd.c | 89
-rw-r--r--  fwd.h | 5
-rw-r--r--  icmp.c | 42
-rw-r--r--  icmp.h | 2
-rw-r--r--  inany.c | 4
-rw-r--r--  inany.h | 27
-rw-r--r--  iov.c | 133
-rw-r--r--  iov.h | 58
-rw-r--r--  ip.c | 33
-rw-r--r--  ip.h | 5
-rw-r--r--  isolation.c | 8
-rw-r--r--  lineread.c | 2
-rw-r--r--  linux_dep.h | 6
-rw-r--r--  log.c | 8
-rw-r--r--  log.h | 1
-rw-r--r--  migrate.c | 10
-rw-r--r--  ndp.c | 18
-rw-r--r--  ndp.h | 4
-rw-r--r--  netlink.c | 3
-rw-r--r--  packet.c | 162
-rw-r--r--  packet.h | 47
-rw-r--r--  passt-repair.1 | 6
-rw-r--r--  passt-repair.c | 108
-rw-r--r--  passt.1 | 29
-rw-r--r--  passt.c | 23
-rw-r--r--  passt.h | 13
-rw-r--r--  pasta.c | 26
-rw-r--r--  pcap.c | 56
-rw-r--r--  pcap.h | 2
-rw-r--r--  repair.c | 60
-rw-r--r--  repair.h | 3
-rw-r--r--  siphash.h | 2
-rw-r--r--  tap.c | 215
-rw-r--r--  tap.h | 35
-rw-r--r--  tcp.c | 654
-rw-r--r--  tcp.h | 1
-rw-r--r--  tcp_buf.c | 38
-rw-r--r--  tcp_conn.h | 5
-rw-r--r--  tcp_internal.h | 21
-rw-r--r--  tcp_splice.c | 47
-rw-r--r--  tcp_vu.c | 35
-rw-r--r--  test/.gitignore | 2
-rw-r--r--  test/Makefile | 29
-rw-r--r--  test/build/all | 61
-rwxr-xr-x  test/build/build.py | 109
-rw-r--r--  test/build/clang_tidy | 17
-rw-r--r--  test/build/cppcheck | 17
-rwxr-xr-x  test/build/static_checkers.sh | 26
-rw-r--r--  test/lib/exeter | 58
-rwxr-xr-x  test/lib/setup | 4
-rwxr-xr-x  test/lib/term | 7
-rwxr-xr-x  test/lib/test | 7
-rwxr-xr-x  test/passt.mbuto | 5
-rw-r--r--  test/pasta_options/log_to_file | 10
-rwxr-xr-x  test/run | 24
-rwxr-xr-x  test/smoke/smoke.sh | 33
-rw-r--r--  udp.c | 675
-rw-r--r--  udp.h | 7
-rw-r--r--  udp_flow.c | 232
-rw-r--r--  udp_flow.h | 18
-rw-r--r--  udp_internal.h | 6
-rw-r--r--  udp_vu.c | 145
-rw-r--r--  udp_vu.h | 8
-rw-r--r--  util.c | 65
-rw-r--r--  util.h | 43
-rw-r--r--  vhost_user.c | 316
-rw-r--r--  vhost_user.h | 6
-rw-r--r--  virtio.c | 34
-rw-r--r--  virtio.h | 32
-rw-r--r--  vu_common.c | 47
95 files changed, 3467 insertions(+), 1899 deletions(-)
diff --git a/Makefile b/Makefile
index f2ac8e5..3328f83 100644
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,7 @@ $(if $(TARGET),,$(error Failed to get target architecture))
# Get 'uname -m'-like architecture description for target
TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))
+TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH))
TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH))
# On some systems enabling optimization also enables source fortification,
@@ -29,7 +30,7 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /
FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
endif
-FLAGS := -Wall -Wextra -Wno-format-zero-length
+FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
diff --git a/README.md b/README.md
index 54fed07..8f188f4 100644
--- a/README.md
+++ b/README.md
@@ -291,7 +291,7 @@ speeding up local connections, and usually requiring NAT. _pasta_:
* ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted)
* ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached
* ✅ no external dependencies (other than a standard C library)
-* ✅ restrictive seccomp profiles (30 syscalls allowed for _passt_, 41 for
+* ✅ restrictive seccomp profiles (33 syscalls allowed for _passt_, 43 for
_pasta_ on x86_64)
* ✅ examples of [AppArmor](/passt/tree/contrib/apparmor) and
[SELinux](/passt/tree/contrib/selinux) profiles available
diff --git a/arp.c b/arp.c
index fc482bb..44677ad 100644
--- a/arp.c
+++ b/arp.c
@@ -31,56 +31,84 @@
#include "tap.h"
/**
- * arp() - Check if this is a supported ARP message, reply as needed
+ * ignore_arp() - Check if we should ignore this ARP message
* @c: Execution context
- * @p: Packet pool, single packet with Ethernet buffer
+ * @ah: ARP header
+ * @am: ARP message
*
- * Return: 1 if handled, -1 on failure
+ * Return: true if the ARP message should be ignored, false otherwise
*/
-int arp(const struct ctx *c, const struct pool *p)
+static bool ignore_arp(const struct ctx *c,
+ const struct arphdr *ah, const struct arpmsg *am)
{
- unsigned char swap[4];
- struct ethhdr *eh;
- struct arphdr *ah;
- struct arpmsg *am;
- size_t l2len;
-
- eh = packet_get(p, 0, 0, sizeof(*eh), NULL);
- ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL);
- am = packet_get(p, 0, sizeof(*eh) + sizeof(*ah), sizeof(*am), NULL);
-
- if (!eh || !ah || !am)
- return -1;
-
if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
ah->ar_pro != htons(ETH_P_IP) ||
ah->ar_hln != ETH_ALEN ||
ah->ar_pln != 4 ||
ah->ar_op != htons(ARPOP_REQUEST))
- return 1;
+ return true;
/* Discard announcements, but not 0.0.0.0 "probes" */
if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
!memcmp(am->sip, am->tip, sizeof(am->sip)))
- return 1;
+ return true;
/* Don't resolve the guest's assigned address, either. */
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
+ return true;
+
+ return false;
+}
+
+/**
+ * arp() - Check if this is a supported ARP message, reply as needed
+ * @c: Execution context
+ * @data: Single packet with Ethernet buffer
+ *
+ * Return: 1 if handled, -1 on failure
+ */
+int arp(const struct ctx *c, struct iov_tail *data)
+{
+ struct {
+ struct ethhdr eh;
+ struct arphdr ah;
+ struct arpmsg am;
+ } __attribute__((__packed__)) resp;
+ struct arphdr ah_storage;
+ struct ethhdr eh_storage;
+ struct arpmsg am_storage;
+ const struct ethhdr *eh;
+ const struct arphdr *ah;
+ const struct arpmsg *am;
+
+ eh = IOV_REMOVE_HEADER(data, eh_storage);
+ ah = IOV_REMOVE_HEADER(data, ah_storage);
+ am = IOV_REMOVE_HEADER(data, am_storage);
+ if (!eh || !ah || !am)
+ return -1;
+
+ if (ignore_arp(c, ah, am))
return 1;
- ah->ar_op = htons(ARPOP_REPLY);
- memcpy(am->tha, am->sha, sizeof(am->tha));
- memcpy(am->sha, c->our_tap_mac, sizeof(am->sha));
+ /* Ethernet header */
+ resp.eh.h_proto = htons(ETH_P_ARP);
+ memcpy(resp.eh.h_dest, eh->h_source, sizeof(resp.eh.h_dest));
+ memcpy(resp.eh.h_source, c->our_tap_mac, sizeof(resp.eh.h_source));
- memcpy(swap, am->tip, sizeof(am->tip));
- memcpy(am->tip, am->sip, sizeof(am->tip));
- memcpy(am->sip, swap, sizeof(am->sip));
+ /* ARP header */
+ resp.ah.ar_op = htons(ARPOP_REPLY);
+ resp.ah.ar_hrd = ah->ar_hrd;
+ resp.ah.ar_pro = ah->ar_pro;
+ resp.ah.ar_hln = ah->ar_hln;
+ resp.ah.ar_pln = ah->ar_pln;
- l2len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
- memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest));
- memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
+ /* ARP message */
+ memcpy(resp.am.sha, c->our_tap_mac, sizeof(resp.am.sha));
+ memcpy(resp.am.sip, am->tip, sizeof(resp.am.sip));
+ memcpy(resp.am.tha, am->sha, sizeof(resp.am.tha));
+ memcpy(resp.am.tip, am->sip, sizeof(resp.am.tip));
- tap_send_single(c, eh, l2len);
+ tap_send_single(c, &resp, sizeof(resp));
return 1;
}
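
The new arp() pulls each header off an iov_tail with IOV_REMOVE_HEADER(), which copies the header into caller-provided storage when it isn't contiguous in the vector, advances the tail, and returns NULL when the data runs short. Below is a minimal, self-contained sketch of that pattern; the struct layout and helper names are assumptions for illustration, not the definitions this series adds to iov.h.

/* Sketch of the "remove a header from an iov tail" pattern (assumed shapes,
 * not the exact passt definitions)
 */
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>

struct iov_tail {
	const struct iovec *iov;	/* remaining scatter-gather vector */
	size_t cnt;			/* number of entries in @iov */
	size_t off;			/* byte offset into the first entry */
};

/* Total number of bytes left in the tail */
static size_t iov_tail_size(const struct iov_tail *t)
{
	size_t i, len = 0;

	for (i = 0; i < t->cnt; i++)
		len += t->iov[i].iov_len;
	return len - t->off;
}

/* Copy @len bytes from the head of @t into @buf and advance the tail.
 * Return @buf on success, NULL if the tail is too short.
 */
static void *tail_remove(struct iov_tail *t, void *buf, size_t len)
{
	char *p = buf;

	if (iov_tail_size(t) < len)
		return NULL;

	while (len) {
		size_t n = t->iov[0].iov_len - t->off;

		if (n > len)
			n = len;
		memcpy(p, (const char *)t->iov[0].iov_base + t->off, n);
		p += n;
		len -= n;
		t->off += n;
		if (t->off == t->iov[0].iov_len) {
			t->iov++;
			t->cnt--;
			t->off = 0;
		}
	}
	return buf;
}

int main(void)
{
	/* An Ethernet header split across two buffers, as a vhost-user
	 * back-end might present it
	 */
	unsigned char part1[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
	unsigned char part2[] = { 0x02, 0, 0, 0, 0, 0x01, 0x08, 0x06 };
	struct iovec iov[] = {
		{ part1, sizeof(part1) }, { part2, sizeof(part2) },
	};
	struct iov_tail data = { iov, 2, 0 };
	struct ethhdr eh_storage;
	const struct ethhdr *eh;

	eh = tail_remove(&data, &eh_storage, sizeof(eh_storage));
	if (!eh)
		return 1;

	printf("EtherType: 0x%04x, %zu byte(s) left\n",
	       ntohs(eh->h_proto), iov_tail_size(&data));
	return 0;
}

Building the reply in a separate local struct (resp above) instead of rewriting the received frame in place is what lets arp() operate on read-only, possibly scattered guest buffers.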
diff --git a/arp.h b/arp.h
index ac5cd16..86bcbf8 100644
--- a/arp.h
+++ b/arp.h
@@ -20,6 +20,6 @@ struct arpmsg {
unsigned char tip[4];
} __attribute__((__packed__));
-int arp(const struct ctx *c, const struct pool *p);
+int arp(const struct ctx *c, struct iov_tail *data);
#endif /* ARP_H */
diff --git a/checksum.c b/checksum.c
index 0894eca..0c3837c 100644
--- a/checksum.c
+++ b/checksum.c
@@ -145,7 +145,7 @@ uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
* @proto: Protocol number
* @saddr: Source address
* @daddr: Destination address
- * Returns: Partial checksum of the IPv4 header
+ * Return: partial checksum of the IPv4 header
*/
uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
struct in_addr saddr, struct in_addr daddr)
@@ -225,7 +225,7 @@ void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen)
* @proto: Protocol number
* @saddr: Source address
* @daddr: Destination address
- * Returns: Partial checksum of the IPv6 header
+ * Return: partial checksum of the IPv6 header
*/
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
const struct in6_addr *saddr,
@@ -452,7 +452,7 @@ less_than_128_bytes:
}
/**
- * csum_unfolded - Calculate the unfolded checksum of a data buffer.
+ * csum_unfolded() - Calculate the unfolded checksum of a data buffer.
*
* @buf: Input buffer
* @len: Input length
@@ -481,7 +481,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
}
#else /* __AVX2__ */
/**
- * csum_unfolded - Calculate the unfolded checksum of a data buffer.
+ * csum_unfolded() - Calculate the unfolded checksum of a data buffer.
*
* @buf: Input buffer
* @len: Input length
diff --git a/conf.c b/conf.c
index 065e720..f47f48e 100644
--- a/conf.c
+++ b/conf.c
@@ -16,6 +16,7 @@
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
+#include <libgen.h>
#include <string.h>
#include <sched.h>
#include <sys/types.h>
@@ -64,11 +65,11 @@
const char *pasta_default_ifn = "tap0";
/**
- * next_chunk - Return the next piece of a string delimited by a character
+ * next_chunk() - Return the next piece of a string delimited by a character
* @s: String to search
* @c: Delimiter character
*
- * Return: If another @c is found in @s, returns a pointer to the
+ * Return: if another @c is found in @s, returns a pointer to the
* character *after* the delimiter, if no further @c is in @s,
* return NULL
*/
@@ -79,7 +80,7 @@ static char *next_chunk(const char *s, char c)
}
/**
- * port_range - Represents a non-empty range of ports
+ * port_range() - Represents a non-empty range of ports
* @first: First port number in the range
* @last: Last port number in the range (inclusive)
*
@@ -124,6 +125,75 @@ static int parse_port_range(const char *s, char **endptr,
}
/**
+ * conf_ports_range_except() - Set up forwarding for a range of ports minus a
+ * bitmap of exclusions
+ * @c: Execution context
+ * @optname: Short option name, t, T, u, or U
+ * @optarg: Option argument (port specification)
+ * @fwd: Pointer to @fwd_ports to be updated
+ * @addr: Listening address
+ * @ifname: Listening interface
+ * @first: First port to forward
+ * @last: Last port to forward
+ * @exclude: Bitmap of ports to exclude
+ * @to: Port to translate @first to when forwarding
+ * @weak: Ignore errors, as long as at least one port is mapped
+ */
+static void conf_ports_range_except(const struct ctx *c, char optname,
+ const char *optarg, struct fwd_ports *fwd,
+ const union inany_addr *addr,
+ const char *ifname,
+ uint16_t first, uint16_t last,
+ const uint8_t *exclude, uint16_t to,
+ bool weak)
+{
+ bool bound_one = false;
+ unsigned i;
+ int ret;
+
+ if (first == 0) {
+ die("Can't forward port 0 for option '-%c %s'",
+ optname, optarg);
+ }
+
+ for (i = first; i <= last; i++) {
+ if (bitmap_isset(exclude, i))
+ continue;
+
+ if (bitmap_isset(fwd->map, i)) {
+ warn(
+"Altering mapping of already mapped port number: %s", optarg);
+ }
+
+ bitmap_set(fwd->map, i);
+ fwd->delta[i] = to - first;
+
+ if (optname == 't')
+ ret = tcp_sock_init(c, addr, ifname, i);
+ else if (optname == 'u')
+ ret = udp_sock_init(c, 0, addr, ifname, i);
+ else
+ /* No way to check in advance for -T and -U */
+ ret = 0;
+
+ if (ret == -ENFILE || ret == -EMFILE) {
+ die("Can't open enough sockets for port specifier: %s",
+ optarg);
+ }
+
+ if (!ret) {
+ bound_one = true;
+ } else if (!weak) {
+ die("Failed to bind port %u (%s) for option '-%c %s'",
+ i, strerror_(-ret), optname, optarg);
+ }
+ }
+
+ if (!bound_one)
+ die("Failed to bind any port for '-%c %s'", optname, optarg);
+}
+
+/**
* conf_ports() - Parse port configuration options, initialise UDP/TCP sockets
* @c: Execution context
* @optname: Short option name, t, T, u, or U
@@ -135,10 +205,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
{
union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
char buf[BUFSIZ], *spec, *ifname = NULL, *p;
- bool exclude_only = true, bound_one = false;
uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
+ bool exclude_only = true;
unsigned i;
- int ret;
if (!strcmp(optarg, "none")) {
if (fwd->mode)
@@ -173,32 +242,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
fwd->mode = FWD_ALL;
- /* Skip port 0. It has special meaning for many socket APIs, so
- * trying to bind it is not really safe.
- */
- for (i = 1; i < NUM_PORTS; i++) {
+ /* Exclude ephemeral ports */
+ for (i = 0; i < NUM_PORTS; i++)
if (fwd_port_is_ephemeral(i))
- continue;
-
- bitmap_set(fwd->map, i);
- if (optname == 't') {
- ret = tcp_sock_init(c, NULL, NULL, i);
- if (ret == -ENFILE || ret == -EMFILE)
- goto enfile;
- if (!ret)
- bound_one = true;
- } else if (optname == 'u') {
- ret = udp_sock_init(c, 0, NULL, NULL, i);
- if (ret == -ENFILE || ret == -EMFILE)
- goto enfile;
- if (!ret)
- bound_one = true;
- }
- }
-
- if (!bound_one)
- goto bind_all_fail;
+ bitmap_set(exclude, i);
+ conf_ports_range_except(c, optname, optarg, fwd,
+ NULL, NULL,
+ 1, NUM_PORTS - 1, exclude,
+ 1, true);
return;
}
@@ -275,37 +327,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
} while ((p = next_chunk(p, ',')));
if (exclude_only) {
- /* Skip port 0. It has special meaning for many socket APIs, so
- * trying to bind it is not really safe.
- */
- for (i = 1; i < NUM_PORTS; i++) {
- if (fwd_port_is_ephemeral(i) ||
- bitmap_isset(exclude, i))
- continue;
-
- bitmap_set(fwd->map, i);
-
- if (optname == 't') {
- ret = tcp_sock_init(c, addr, ifname, i);
- if (ret == -ENFILE || ret == -EMFILE)
- goto enfile;
- if (!ret)
- bound_one = true;
- } else if (optname == 'u') {
- ret = udp_sock_init(c, 0, addr, ifname, i);
- if (ret == -ENFILE || ret == -EMFILE)
- goto enfile;
- if (!ret)
- bound_one = true;
- } else {
- /* No way to check in advance for -T and -U */
- bound_one = true;
- }
- }
-
- if (!bound_one)
- goto bind_all_fail;
+ /* Exclude ephemeral ports */
+ for (i = 0; i < NUM_PORTS; i++)
+ if (fwd_port_is_ephemeral(i))
+ bitmap_set(exclude, i);
+ conf_ports_range_except(c, optname, optarg, fwd,
+ addr, ifname,
+ 1, NUM_PORTS - 1, exclude,
+ 1, true);
return;
}
@@ -334,40 +364,18 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
if ((*p != '\0') && (*p != ',')) /* Garbage after the ranges */
goto bad;
- for (i = orig_range.first; i <= orig_range.last; i++) {
- if (bitmap_isset(fwd->map, i))
- warn(
-"Altering mapping of already mapped port number: %s", optarg);
-
- if (bitmap_isset(exclude, i))
- continue;
-
- bitmap_set(fwd->map, i);
-
- fwd->delta[i] = mapped_range.first - orig_range.first;
-
- ret = 0;
- if (optname == 't')
- ret = tcp_sock_init(c, addr, ifname, i);
- else if (optname == 'u')
- ret = udp_sock_init(c, 0, addr, ifname, i);
- if (ret)
- goto bind_fail;
- }
+ conf_ports_range_except(c, optname, optarg, fwd,
+ addr, ifname,
+ orig_range.first, orig_range.last,
+ exclude,
+ mapped_range.first, false);
} while ((p = next_chunk(p, ',')));
return;
-enfile:
- die("Can't open enough sockets for port specifier: %s", optarg);
bad:
die("Invalid port specifier %s", optarg);
mode_conflict:
die("Port forwarding mode '%s' conflicts with previous mode", optarg);
-bind_fail:
- die("Failed to bind port %u (%s) for option '-%c %s', exiting",
- i, strerror_(-ret), optname, optarg);
-bind_all_fail:
- die("Failed to bind any port for '-%c %s', exiting", optname, optarg);
}
/**
@@ -376,7 +384,7 @@ bind_all_fail:
* @addr: Guest nameserver IPv4 address
* @idx: Index of free entry in array of IPv4 resolvers
*
- * Return: Number of entries added (0 or 1)
+ * Return: number of entries added (0 or 1)
*/
static unsigned add_dns4(struct ctx *c, const struct in_addr *addr,
unsigned idx)
@@ -394,7 +402,7 @@ static unsigned add_dns4(struct ctx *c, const struct in_addr *addr,
* @addr: Guest nameserver IPv6 address
* @idx: Index of free entry in array of IPv6 resolvers
*
- * Return: Number of entries added (0 or 1)
+ * Return: number of entries added (0 or 1)
*/
static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr,
unsigned idx)
@@ -407,6 +415,76 @@ static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr,
}
/**
+ * add_dns_resolv4() - Possibly add one IPv4 nameserver from host's resolv.conf
+ * @c: Execution context
+ * @ns: Nameserver address
+ * @idx: Pointer to index of current IPv4 resolver entry, set on return
+ */
+static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx)
+{
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
+ c->ip4.dns_host = *ns;
+
+ /* Special handling if guest or container can only access local
+ * addresses via redirect, or if the host gateway is also a resolver and
+ * we shadow its address
+ */
+ if (IN4_IS_ADDR_LOOPBACK(ns) ||
+ IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) {
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) {
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
+ return; /* Address unreachable */
+
+ *ns = c->ip4.map_host_loopback;
+ c->ip4.dns_match = c->ip4.map_host_loopback;
+ } else {
+ /* No general host mapping, but requested for DNS
+ * (--dns-forward and --no-map-gw): advertise resolver
+ * address from --dns-forward, and map that to loopback
+ */
+ *ns = c->ip4.dns_match;
+ }
+ }
+
+ *idx += add_dns4(c, ns, *idx);
+}
+
+/**
+ * add_dns_resolv6() - Possibly add one IPv6 nameserver from host's resolv.conf
+ * @c: Execution context
+ * @ns: Nameserver address
+ * @idx: Pointer to index of current IPv6 resolver entry, set on return
+ */
+static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx)
+{
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
+ c->ip6.dns_host = *ns;
+
+ /* Special handling if guest or container can only access local
+ * addresses via redirect, or if the host gateway is also a resolver and
+ * we shadow its address
+ */
+ if (IN6_IS_ADDR_LOOPBACK(ns) ||
+ IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
+ return; /* Address unreachable */
+
+ *ns = c->ip6.map_host_loopback;
+ c->ip6.dns_match = c->ip6.map_host_loopback;
+ } else {
+ /* No general host mapping, but requested for DNS
+ * (--dns-forward and --no-map-gw): advertise resolver
+ * address from --dns-forward, and map that to loopback
+ */
+ *ns = c->ip6.dns_match;
+ }
+ }
+
+ *idx += add_dns6(c, ns, *idx);
+}
+
+/**
* add_dns_resolv() - Possibly add ns from host resolv.conf to configuration
* @c: Execution context
* @nameserver: Nameserver address string from /etc/resolv.conf
@@ -422,48 +500,11 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver,
struct in6_addr ns6;
struct in_addr ns4;
- if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) {
- if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
- c->ip4.dns_host = ns4;
+ if (idx4 && inet_pton(AF_INET, nameserver, &ns4))
+ add_dns_resolv4(c, &ns4, idx4);
- /* Special handling if guest or container can only access local
- * addresses via redirect, or if the host gateway is also a
- * resolver and we shadow its address
- */
- if (IN4_IS_ADDR_LOOPBACK(&ns4) ||
- IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) {
- if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
- return;
-
- ns4 = c->ip4.map_host_loopback;
- if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
- c->ip4.dns_match = c->ip4.map_host_loopback;
- }
-
- *idx4 += add_dns4(c, &ns4, *idx4);
- }
-
- if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) {
- if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
- c->ip6.dns_host = ns6;
-
- /* Special handling if guest or container can only access local
- * addresses via redirect, or if the host gateway is also a
- * resolver and we shadow its address
- */
- if (IN6_IS_ADDR_LOOPBACK(&ns6) ||
- IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) {
- if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
- return;
-
- ns6 = c->ip6.map_host_loopback;
-
- if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
- c->ip6.dns_match = c->ip6.map_host_loopback;
- }
-
- *idx6 += add_dns6(c, &ns6, *idx6);
- }
+ if (idx6 && inet_pton(AF_INET6, nameserver, &ns6))
+ add_dns_resolv6(c, &ns6, idx6);
}
/**
@@ -615,7 +656,7 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns,
/** conf_ip4_prefix() - Parse an IPv4 prefix length or netmask
* @arg: Netmask in dotted decimal or prefix length
*
- * Return: Validated prefix length on success, -1 on failure
+ * Return: validated prefix length on success, -1 on failure
*/
static int conf_ip4_prefix(const char *arg)
{
@@ -642,7 +683,7 @@ static int conf_ip4_prefix(const char *arg)
* @ifi: Host interface to attempt (0 to determine one)
* @ip4: IPv4 context (will be written)
*
- * Return: Interface index for IPv4, or 0 on failure.
+ * Return: interface index for IPv4, or 0 on failure.
*/
static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
{
@@ -714,7 +755,7 @@ static void conf_ip4_local(struct ip4_ctx *ip4)
* @ifi: Host interface to attempt (0 to determine one)
* @ip6: IPv6 context (will be written)
*
- * Return: Interface index for IPv6, or 0 on failure.
+ * Return: interface index for IPv6, or 0 on failure.
*/
static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
{
@@ -823,6 +864,14 @@ static void usage(const char *name, FILE *f, int status)
FPRINTF(f,
" --repair-path PATH path for passt-repair(1)\n"
" default: append '.repair' to UNIX domain path\n");
+ FPRINTF(f,
+ " --migrate-exit DEPRECATED:\n"
+ " source quits after migration\n"
+ " default: source keeps running after migration\n");
+ FPRINTF(f,
+ " --migrate-no-linger DEPRECATED:\n"
+ " close sockets on migration\n"
+ " default: keep sockets open, ignore events\n");
}
FPRINTF(f,
@@ -934,6 +983,7 @@ static void usage(const char *name, FILE *f, int status)
" SPEC is as described for TCP above\n"
" default: none\n");
+ (void)fflush(f);
_exit(status);
pasta_opts:
@@ -988,10 +1038,50 @@ pasta_opts:
" --ns-mac-addr ADDR Set MAC address on tap interface\n"
" --no-splice Disable inbound socket splicing\n");
+ (void)fflush(f);
_exit(status);
}
/**
+ * conf_mode() - Determine passt/pasta's operating mode from command line
+ * @argc: Argument count
+ * @argv: Command line arguments
+ *
+ * Return: mode to operate in, PASTA or PASST
+ */
+enum passt_modes conf_mode(int argc, char *argv[])
+{
+ int vhost_user = 0;
+ const struct option optvu[] = {
+ {"vhost-user", no_argument, &vhost_user, 1 },
+ { 0 },
+ };
+ char argv0[PATH_MAX], *basearg0;
+ int name;
+
+ optind = 0;
+ do {
+ name = getopt_long(argc, argv, "-:", optvu, NULL);
+ } while (name != -1);
+
+ if (vhost_user)
+ return MODE_VU;
+
+ if (argc < 1)
+ die("Cannot determine argv[0]");
+
+ strncpy(argv0, argv[0], PATH_MAX - 1);
+ basearg0 = basename(argv0);
+ if (strstr(basearg0, "pasta"))
+ return MODE_PASTA;
+
+ if (strstr(basearg0, "passt"))
+ return MODE_PASST;
+
+ die("Cannot determine mode, invoke as \"passt\" or \"pasta\"");
+}
+
+/**
* conf_print() - Print fundamental configuration parameters
* @c: Execution context
*/
@@ -1225,6 +1315,8 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
*addr6 = in6addr_any;
if (no_map_gw)
*no_map_gw = 1;
+
+ return;
}
if (inet_pton(AF_INET6, arg, addr6) &&
@@ -1276,7 +1368,7 @@ static void conf_open_files(struct ctx *c)
}
/**
- * parse_mac - Parse a MAC address from a string
+ * parse_mac() - Parse a MAC address from a string
* @mac: Binary MAC address, initialised on success
* @str: String to parse
*
@@ -1386,18 +1478,21 @@ void conf(struct ctx *c, int argc, char **argv)
{"socket-path", required_argument, NULL, 's' },
{"fqdn", required_argument, NULL, 27 },
{"repair-path", required_argument, NULL, 28 },
+ {"migrate-exit", no_argument, NULL, 29 },
+ {"migrate-no-linger", no_argument, NULL, 30 },
{ 0 },
};
+ const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
bool copy_addrs_opt = false, copy_routes_opt = false;
enum fwd_ports_mode fwd_default = FWD_NONE;
bool v4_only = false, v6_only = false;
unsigned dns4_idx = 0, dns6_idx = 0;
+ unsigned long max_mtu = IP_MAX_MTU;
struct fqdn *dnss = c->dns_search;
unsigned int ifi4 = 0, ifi6 = 0;
const char *logfile = NULL;
- const char *optstring;
size_t logsize = 0;
char *runas = NULL;
long fd_tap_opt;
@@ -1408,12 +1503,11 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->mode == MODE_PASTA) {
c->no_dhcp_dns = c->no_dhcp_dns_search = 1;
fwd_default = FWD_AUTO;
- optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:";
- } else {
- optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
}
- c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
+ if (tap_l2_max_len(c) - ETH_HLEN < max_mtu)
+ max_mtu = tap_l2_max_len(c) - ETH_HLEN;
+ c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t));
c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
@@ -1512,6 +1606,7 @@ void conf(struct ctx *c, int argc, char **argv)
FPRINTF(stdout,
c->mode == MODE_PASTA ? "pasta " : "passt ");
FPRINTF(stdout, VERSION_BLOB);
+ (void)fflush(stdout);
_exit(EXIT_SUCCESS);
case 15:
ret = snprintf(c->ip4.ifname_out,
@@ -1581,9 +1676,8 @@ void conf(struct ctx *c, int argc, char **argv)
die("Invalid host nameserver address: %s", optarg);
case 25:
- if (c->mode == MODE_PASTA)
- die("--vhost-user is for passt mode only");
- c->mode = MODE_VU;
+ /* Already handled in conf_mode() */
+ ASSERT(c->mode == MODE_VU);
break;
case 26:
vu_print_capabilities();
@@ -1594,7 +1688,26 @@ void conf(struct ctx *c, int argc, char **argv)
die("Invalid FQDN: %s", optarg);
break;
case 28:
- /* Handle this once we checked --vhost-user */
+ if (c->mode != MODE_VU && strcmp(optarg, "none"))
+ die("--repair-path is for vhost-user mode only");
+
+ if (snprintf_check(c->repair_path,
+ sizeof(c->repair_path), "%s",
+ optarg))
+ die("Invalid passt-repair path: %s", optarg);
+
+ break;
+ case 29:
+ if (c->mode != MODE_VU)
+ die("--migrate-exit is for vhost-user mode only");
+ c->migrate_exit = true;
+
+ break;
+ case 30:
+ if (c->mode != MODE_VU)
+ die("--migrate-no-linger is for vhost-user mode only");
+ c->migrate_no_linger = true;
+
break;
case 'd':
c->debug = 1;
@@ -1614,6 +1727,9 @@ void conf(struct ctx *c, int argc, char **argv)
c->foreground = 1;
break;
case 's':
+ if (c->mode == MODE_PASTA)
+ die("-s is for passt / vhost-user mode only");
+
ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s",
optarg);
if (ret <= 0 || ret >= (int)sizeof(c->sock_path))
@@ -1626,7 +1742,8 @@ void conf(struct ctx *c, int argc, char **argv)
fd_tap_opt = strtol(optarg, NULL, 0);
if (errno ||
- fd_tap_opt <= STDERR_FILENO || fd_tap_opt > INT_MAX)
+ (fd_tap_opt != STDIN_FILENO && fd_tap_opt <= STDERR_FILENO) ||
+ fd_tap_opt > INT_MAX)
die("Invalid --fd: %s", optarg);
c->fd_tap = fd_tap_opt;
@@ -1634,6 +1751,9 @@ void conf(struct ctx *c, int argc, char **argv)
*c->sock_path = 0;
break;
case 'I':
+ if (c->mode != MODE_PASTA)
+ die("-I is for pasta mode only");
+
ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s",
optarg);
if (ret <= 0 || ret >= IFNAMSIZ)
@@ -1663,9 +1783,9 @@ void conf(struct ctx *c, int argc, char **argv)
if (errno || *e)
die("Invalid MTU: %s", optarg);
- if (mtu > ETH_MAX_MTU) {
- die("MTU %lu too large (max %u)",
- mtu, ETH_MAX_MTU);
+ if (mtu > max_mtu) {
+ die("MTU %lu too large (max %lu)",
+ mtu, max_mtu);
}
c->mtu = mtu;
@@ -1790,11 +1910,16 @@ void conf(struct ctx *c, int argc, char **argv)
break;
case 't':
case 'u':
- case 'T':
- case 'U':
case 'D':
/* Handle these later, once addresses are configured */
break;
+ case 'T':
+ case 'U':
+ if (c->mode != MODE_PASTA)
+ die("-%c is for pasta mode only", name);
+
+ /* Handle properly later, once addresses are configured */
+ break;
case 'h':
usage(argv[0], stdout, EXIT_SUCCESS);
break;
@@ -1883,8 +2008,8 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
c->no_dhcp = 1;
- /* Inbound port options, DNS, and --repair-path can be parsed now, after
- * IPv4/IPv6 settings and --vhost-user.
+ /* Inbound port options and DNS can be parsed now, after IPv4/IPv6
+ * settings
*/
fwd_probe_ephemeral();
udp_portmap_clear();
@@ -1930,16 +2055,6 @@ void conf(struct ctx *c, int argc, char **argv)
}
die("Cannot use DNS address %s", optarg);
- } else if (name == 28) {
- if (c->mode != MODE_VU && strcmp(optarg, "none"))
- die("--repair-path is for vhost-user mode only");
-
- if (snprintf_check(c->repair_path,
- sizeof(c->repair_path), "%s",
- optarg))
- die("Invalid passt-repair path: %s", optarg);
-
- break;
}
} while (name != -1);
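
conf_ports_range_except() centralises the loop that the three removed copies of this logic used to duplicate: walk the requested range, skip any port set in the exclusion bitmap (ports excluded with '~' or detected as ephemeral), mark the forwarded port, record its translation delta, and bind it. A minimal stand-alone sketch of that bitmap-and-delta walk follows; bitmap_set()/bitmap_isset() and struct fwd_ports are passt internals, and the simplified stand-ins here are assumptions for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NUM_PORTS	(UINT16_MAX + 1)

static void bitmap_set(uint8_t *map, unsigned bit)
{
	map[bit / 8] |= 1 << (bit % 8);
}

static int bitmap_isset(const uint8_t *map, unsigned bit)
{
	return map[bit / 8] & (1 << (bit % 8));
}

int main(void)
{
	static uint8_t exclude[NUM_PORTS / 8];	/* ports to skip */
	static uint8_t map[NUM_PORTS / 8];	/* ports actually forwarded */
	static int16_t delta[NUM_PORTS];	/* per-port translation */
	uint16_t first = 2000, last = 2009, to = 3000;
	unsigned i;

	/* Ports we don't want to touch, e.g. excluded with '~' or ephemeral */
	bitmap_set(exclude, 2003);
	bitmap_set(exclude, 2007);

	for (i = first; i <= last; i++) {
		if (bitmap_isset(exclude, i))
			continue;

		bitmap_set(map, i);
		delta[i] = to - first;	/* forward i to i + delta[i] */
	}

	for (i = first; i <= last; i++) {
		if (bitmap_isset(map, i))
			printf("port %u -> %u\n", i, i + delta[i]);
	}
	return 0;
}

In the real function, the weak flag distinguishes "all ports" specifiers, where a failed bind() on an individual port is tolerated as long as at least one port is bound, from explicit ranges, where any failed bind() is fatal.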
diff --git a/conf.h b/conf.h
index 9d2143d..b45ad74 100644
--- a/conf.h
+++ b/conf.h
@@ -6,6 +6,7 @@
#ifndef CONF_H
#define CONF_H
+enum passt_modes conf_mode(int argc, char *argv[]);
void conf(struct ctx *c, int argc, char **argv);
#endif /* CONF_H */
diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec
index 745cf01..663289f 100644
--- a/contrib/fedora/passt.spec
+++ b/contrib/fedora/passt.spec
@@ -9,6 +9,7 @@
%global git_hash {{{ git_head }}}
%global selinuxtype targeted
+%global selinux_policy_version 41.41
Name: passt
Version: {{{ git_version }}}
@@ -33,15 +34,19 @@ for network namespaces: traffic is forwarded using a tap interface inside the
namespace, without the need to create further interfaces on the host, hence not
requiring any capabilities or privileges.
-%package selinux
-BuildArch: noarch
-Summary: SELinux support for passt and pasta
-Requires: %{name} = %{version}-%{release}
-Requires: selinux-policy
-Requires(post): %{name}
-Requires(post): policycoreutils
-Requires(preun): %{name}
-Requires(preun): policycoreutils
+%package selinux
+BuildArch: noarch
+Summary: SELinux support for passt and pasta
+Requires: selinux-policy-%{selinuxtype}
+Requires: container-selinux
+Requires(post): selinux-policy-%{selinuxtype}
+Requires(post): container-selinux
+Requires(post): policycoreutils
+Requires(post): libselinux-utils
+Requires(preun): policycoreutils
+BuildRequires: selinux-policy-devel
+BuildRequires: pkgconfig(systemd)
+Recommends: selinux-policy-%{selinuxtype} >= %{selinux_policy_version}
%description selinux
This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
@@ -89,19 +94,26 @@ popd
%selinux_relabel_pre -s %{selinuxtype}
%post selinux
-%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
-%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
-%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
+%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
%postun selinux
if [ $1 -eq 0 ]; then
- %selinux_modules_uninstall -s %{selinuxtype} passt
- %selinux_modules_uninstall -s %{selinuxtype} pasta
- %selinux_modules_uninstall -s %{selinuxtype} passt-repair
+ %selinux_modules_uninstall -s %{selinuxtype} passt pasta passt-repair
fi
%posttrans selinux
%selinux_relabel_post -s %{selinuxtype}
+# %selinux_relabel_post calls fixfiles(8) with the previous file_contexts file
+# (see selabel_file(5)) in order to restore only the file contexts which
+# actually changed. However, as file_contexts doesn't support %{USERID}
+# substitutions, this will not work for specific file contexts that pasta needs
+# to have under /run/user.
+#
+# Restore those explicitly, hiding errors from restorecon(8): we can't pass a
+# path that's more specific than this, but at the same time /run/user often
+# contains FUSE mountpoints that can't be accessed as root, leading to
+# "Permission denied" messages, but not failures.
+restorecon -R /run/user 2>/dev/null
%files
%license LICENSES/{GPL-2.0-or-later.txt,BSD-3-Clause.txt}
diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te
index f171be6..7157dfb 100644
--- a/contrib/selinux/passt-repair.te
+++ b/contrib/selinux/passt-repair.te
@@ -61,11 +61,11 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
-allow passt_repair_t user_tmp_t:dir search;
+allow passt_repair_t user_tmp_t:dir { getattr read search watch };
-allow passt_repair_t unconfined_t:sock_file { read write };
-allow passt_repair_t passt_t:sock_file { read write };
-allow passt_repair_t user_tmp_t:sock_file { read write };
+allow passt_repair_t unconfined_t:sock_file { getattr read write };
+allow passt_repair_t passt_t:sock_file { getattr read write };
+allow passt_repair_t user_tmp_t:sock_file { getattr read write };
allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
allow passt_repair_t passt_t:tcp_socket { read setopt write };
@@ -80,8 +80,8 @@ allow passt_repair_t passt_t:tcp_socket { read setopt write };
allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
-allow passt_repair_t qemu_var_run_t:dir search;
-allow passt_repair_t virt_var_run_t:dir search;
+allow passt_repair_t qemu_var_run_t:dir { getattr read search watch };
+allow passt_repair_t virt_var_run_t:dir { getattr read search watch };
-allow passt_repair_t qemu_var_run_t:sock_file { read write };
-allow passt_repair_t virt_var_run_t:sock_file { read write };
+allow passt_repair_t qemu_var_run_t:sock_file { getattr read write };
+allow passt_repair_t virt_var_run_t:sock_file { getattr read write };
diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index f8ea672..6995df8 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -49,7 +49,7 @@ require {
type proc_net_t;
type node_t;
class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
- class udp_socket { create accept listen };
+ class udp_socket { create accept listen getattr };
class icmp_socket { bind create name_bind node_bind setopt read write };
class sock_file { create unlink write };
@@ -110,8 +110,6 @@ allow passt_t self:user_namespace create;
auth_read_passwd(passt_t)
allow passt_t proc_net_t:file read;
-allow passt_t net_conf_t:file { open read };
-allow passt_t net_conf_t:lnk_file read;
allow passt_t tmp_t:sock_file { create unlink write };
allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
kernel_search_network_sysctl(passt_t)
@@ -129,11 +127,13 @@ corenet_tcp_connect_all_ports(passt_t)
corenet_tcp_sendrecv_all_ports(passt_t)
corenet_udp_sendrecv_all_ports(passt_t)
+sysnet_read_config(passt_t)
+
allow passt_t node_t:icmp_socket { name_bind node_bind };
allow passt_t port_t:icmp_socket name_bind;
allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
-allow passt_t self:udp_socket { create getopt setopt connect bind read write };
+allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr };
allow passt_t self:icmp_socket { bind create setopt read write };
allow passt_t user_tmp_t:dir { add_name write };
diff --git a/contrib/selinux/pasta.fc b/contrib/selinux/pasta.fc
index 41ee46d..e4aefc4 100644
--- a/contrib/selinux/pasta.fc
+++ b/contrib/selinux/pasta.fc
@@ -8,7 +8,9 @@
# Copyright (c) 2022 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
-/usr/bin/pasta system_u:object_r:pasta_exec_t:s0
-/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0
-/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0
-/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0
+/usr/bin/pasta system_u:object_r:pasta_exec_t:s0
+/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0
+/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0
+/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0
+/run/user/%{USERID}/netns system_u:object_r:ifconfig_var_run_t:s0
+/run/user/%{USERID}/containers/networks/rootless-netns system_u:object_r:ifconfig_var_run_t:s0
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 89c8043..c0a1e9b 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -89,6 +89,15 @@ require {
class capability { sys_tty_config setuid setgid };
class cap_userns { setpcap sys_admin sys_ptrace net_bind_service net_admin };
class user_namespace create;
+
+ # Container requires
+ attribute_role usernetctl_roles;
+ role container_user_r;
+ role staff_r;
+ role user_r;
+ type container_runtime_t;
+ type container_t;
+ type systemd_user_runtimedir_t;
}
type pasta_t;
@@ -113,6 +122,9 @@ init_daemon_domain(pasta_t, pasta_exec_t)
allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read_search net_admin sys_resource setuid setgid };
allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service };
+# pasta only calls setuid and setgid with the current UID and GID, so this
+# denial is harmless. See https://bugzilla.redhat.com/show_bug.cgi?id=2330512#c10
+dontaudit pasta_t self:cap_userns { setgid setuid };
allow pasta_t self:user_namespace create;
auth_read_passwd(pasta_t)
@@ -130,7 +142,7 @@ allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_tr
allow pasta_t user_home_dir_t:dir { search getattr open add_name read write };
allow pasta_t user_home_dir_t:file { create open read write };
allow pasta_t tmp_t:dir { add_name mounton remove_name write };
-allow pasta_t tmpfs_t:filesystem mount;
+allow pasta_t tmpfs_t:filesystem { getattr mount };
allow pasta_t fs_t:filesystem unmount;
allow pasta_t root_t:dir mounton;
manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
@@ -147,15 +159,21 @@ logging_send_syslog_msg(pasta_t)
allow syslogd_t self:cap_userns sys_ptrace;
allow pasta_t proc_net_t:file { open read };
-allow pasta_t net_conf_t:file { open read };
allow pasta_t self:netlink_route_socket { bind create nlmsg_read nlmsg_write setopt read write };
kernel_search_network_sysctl(pasta_t)
+sysnet_read_config(pasta_t)
+
allow pasta_t tmp_t:sock_file { create unlink write };
allow pasta_t self:tcp_socket create_stream_socket_perms;
corenet_tcp_sendrecv_generic_node(pasta_t)
corenet_tcp_bind_generic_node(pasta_t)
+allow pasta_t container_runtime_t:dir { open read search };
+allow pasta_t container_runtime_t:fifo_file { getattr write };
+allow pasta_t container_runtime_t:file read;
+allow pasta_t container_runtime_t:lnk_file read;
+allow pasta_t container_t:lnk_file read;
allow pasta_t pasta_port_t:tcp_socket { name_bind name_connect };
allow pasta_t pasta_port_t:udp_socket { name_bind };
allow pasta_t http_port_t:tcp_socket { name_bind name_connect };
@@ -204,7 +222,6 @@ allow pasta_t kernel_t:system module_request;
allow pasta_t proc_t:dir mounton;
allow pasta_t proc_t:filesystem mount;
-allow pasta_t net_conf_t:lnk_file read;
allow pasta_t proc_net_t:lnk_file read;
allow pasta_t unconfined_t:process { noatsecure rlimitinh siginh };
@@ -213,3 +230,28 @@ allow pasta_t netutils_t:process { noatsecure rlimitinh siginh };
allow pasta_t ping_t:process { noatsecure rlimitinh siginh };
allow pasta_t user_tty_device_t:chr_file { append read write };
allow pasta_t user_devpts_t:chr_file { append read write };
+
+# Allow network administration commands for non-privileged users
+roleattribute container_user_r usernetctl_roles;
+roleattribute staff_r usernetctl_roles;
+roleattribute user_r usernetctl_roles;
+role usernetctl_roles types pasta_t;
+
+# Make pasta in a container run under the pasta_t context
+type_transition container_runtime_t pasta_exec_t : process pasta_t;
+allow container_runtime_t pasta_t:process transition;
+
+# Label the user network namespace files
+type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns";
+type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns";
+allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write };
+allow pasta_t ifconfig_var_run_t:file { create open write };
+allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir;
+
+# Allow pasta to bind to any port
+bool pasta_bind_all_ports true;
+if (pasta_bind_all_ports) {
+ allow pasta_t port_type:icmp_socket { accept getopt name_bind };
+ allow pasta_t port_type:tcp_socket { accept getopt name_bind name_connect };
+ allow pasta_t port_type:udp_socket { accept getopt name_bind };
+}
diff --git a/dhcp.c b/dhcp.c
index b0de04b..012cec6 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -296,33 +296,35 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
/**
* dhcp() - Check if this is a DHCP message, reply as needed
* @c: Execution context
- * @p: Packet pool, single packet with Ethernet buffer
+ * @data: Single packet with Ethernet buffer
*
* Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure
*/
-int dhcp(const struct ctx *c, const struct pool *p)
+int dhcp(const struct ctx *c, struct iov_tail *data)
{
- size_t mlen, dlen, offset = 0, opt_len, opt_off = 0;
char macstr[ETH_ADDRSTRLEN];
+ size_t mlen, dlen, opt_len;
struct in_addr mask, dst;
+ struct ethhdr eh_storage;
+ struct iphdr iph_storage;
+ struct udphdr uh_storage;
const struct ethhdr *eh;
const struct iphdr *iph;
const struct udphdr *uh;
struct msg const *m;
struct msg reply;
unsigned int i;
+ struct msg m_storage;
- eh = packet_get(p, 0, offset, sizeof(*eh), NULL);
- offset += sizeof(*eh);
-
- iph = packet_get(p, 0, offset, sizeof(*iph), NULL);
+ eh = IOV_REMOVE_HEADER(data, eh_storage);
+ iph = IOV_PEEK_HEADER(data, iph_storage);
if (!eh || !iph)
return -1;
- offset += iph->ihl * 4UL;
- uh = packet_get(p, 0, offset, sizeof(*uh), &mlen);
- offset += sizeof(*uh);
+ if (!iov_drop_header(data, iph->ihl * 4UL))
+ return -1;
+ uh = IOV_REMOVE_HEADER(data, uh_storage);
if (!uh)
return -1;
@@ -332,7 +334,10 @@ int dhcp(const struct ctx *c, const struct pool *p)
if (c->no_dhcp)
return 1;
- m = packet_get(p, 0, offset, offsetof(struct msg, o), &opt_len);
+ mlen = iov_tail_size(data);
+ m = (struct msg const *)iov_remove_header_(data, &m_storage,
+ offsetof(struct msg, o),
+ __alignof__(struct msg));
if (!m ||
mlen != ntohs(uh->len) - sizeof(*uh) ||
mlen < offsetof(struct msg, o) ||
@@ -355,27 +360,28 @@ int dhcp(const struct ctx *c, const struct pool *p)
memset(&reply.file, 0, sizeof(reply.file));
reply.magic = m->magic;
- offset += offsetof(struct msg, o);
-
for (i = 0; i < ARRAY_SIZE(opts); i++)
opts[i].clen = -1;
- while (opt_off + 2 < opt_len) {
- const uint8_t *olen, *val;
+ opt_len = iov_tail_size(data);
+ while (opt_len >= 2) {
+ uint8_t olen_storage, type_storage;
+ const uint8_t *olen;
uint8_t *type;
- type = packet_get(p, 0, offset + opt_off, 1, NULL);
- olen = packet_get(p, 0, offset + opt_off + 1, 1, NULL);
+ type = IOV_REMOVE_HEADER(data, type_storage);
+ olen = IOV_REMOVE_HEADER(data, olen_storage);
if (!type || !olen)
return -1;
- val = packet_get(p, 0, offset + opt_off + 2, *olen, NULL);
- if (!val)
+ opt_len = iov_tail_size(data);
+ if (opt_len < *olen)
return -1;
- memcpy(&opts[*type].c, val, *olen);
+ iov_to_buf(&data->iov[0], data->cnt, data->off, &opts[*type].c, *olen);
opts[*type].clen = *olen;
- opt_off += *olen + 2;
+ iov_drop_header(data, *olen);
+ opt_len -= *olen;
}
opts[80].slen = -1;
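
The rewritten option loop in dhcp() is a plain TLV walk: take a one-byte type and a one-byte length from the tail, then copy that many value bytes into the option table, stopping once fewer than two bytes remain. A flat-buffer sketch of the same walk follows; struct dhcp_opt here is a simplified stand-in for passt's option table, and a truncated option simply ends parsing here instead of failing the whole message as the real code does.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dhcp_opt {
	int clen;		/* length seen in the message, -1 if absent */
	uint8_t c[255];		/* value bytes */
};

int main(void)
{
	/* Option 53 (message type) = DISCOVER, option 55 (parameter list),
	 * then the end marker
	 */
	const uint8_t opts_buf[] = { 53, 1, 1, 55, 3, 1, 3, 6, 255 };
	static struct dhcp_opt opts[256];
	size_t left = sizeof(opts_buf), off = 0;
	unsigned i;

	for (i = 0; i < 256; i++)
		opts[i].clen = -1;

	while (left >= 2) {
		uint8_t type = opts_buf[off];
		uint8_t olen = opts_buf[off + 1];

		off += 2;
		left -= 2;

		if (left < olen)	/* truncated option: stop parsing */
			break;

		memcpy(opts[type].c, &opts_buf[off], olen);
		opts[type].clen = olen;
		off += olen;
		left -= olen;
	}

	for (i = 0; i < 256; i++) {
		if (opts[i].clen >= 0)
			printf("option %u: %d byte(s)\n", i, opts[i].clen);
	}
	return 0;
}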
diff --git a/dhcp.h b/dhcp.h
index 87aeecd..cd50c99 100644
--- a/dhcp.h
+++ b/dhcp.h
@@ -6,7 +6,7 @@
#ifndef DHCP_H
#define DHCP_H
-int dhcp(const struct ctx *c, const struct pool *p);
+int dhcp(const struct ctx *c, struct iov_tail *data);
void dhcp_init(void);
#endif /* DHCP_H */
diff --git a/dhcpv6.c b/dhcpv6.c
index 373a988..c1a27ab 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -54,14 +54,14 @@ struct opt_hdr {
uint16_t l;
} __attribute__((packed));
+#define UDP_MSG_HDR_SIZE (sizeof(struct udphdr) + sizeof(struct msg_hdr))
# define OPT_SIZE_CONV(x) (htons_constant(x))
#define OPT_SIZE(x) OPT_SIZE_CONV(sizeof(struct opt_##x) - \
sizeof(struct opt_hdr))
#define OPT_VSIZE(x) (sizeof(struct opt_##x) - \
sizeof(struct opt_hdr))
#define OPT_MAX_SIZE IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \
- sizeof(struct udphdr) + \
- sizeof(struct msg_hdr))
+ UDP_MSG_HDR_SIZE)
/**
* struct opt_client_id - DHCPv6 Client Identifier option
@@ -144,7 +144,9 @@ struct opt_ia_addr {
struct opt_status_code {
struct opt_hdr hdr;
uint16_t code;
- char status_msg[sizeof(STR_NOTONLINK) - 1];
+ /* "nonstring" is only supported since clang 23 */
+ /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
+ __attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1];
} __attribute__((packed));
/**
@@ -278,82 +280,132 @@ static struct resp_not_on_link_t {
/**
* dhcpv6_opt() - Get option from DHCPv6 message
- * @p: Packet pool, single packet with UDP header
- * @offset: Offset to look at, 0: end of header, set to option start
+ * @data: Buffer with options, set to matching option on return
* @type: Option type to look up, network order
*
- * Return: pointer to option header, or NULL on malformed or missing option
+ * Return: true if found and @data points to the option header,
+ * or false on malformed or missing option and @data is
+ * unmodified.
*/
-static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
- uint16_t type)
+static bool dhcpv6_opt(struct iov_tail *data, uint16_t type)
{
- struct opt_hdr *o;
- size_t left;
+ struct iov_tail head = *data;
+ struct opt_hdr o_storage;
+ const struct opt_hdr *o;
- if (!*offset)
- *offset = sizeof(struct udphdr) + sizeof(struct msg_hdr);
-
- while ((o = packet_get_try(p, 0, *offset, sizeof(*o), &left))) {
+ while ((o = IOV_PEEK_HEADER(data, o_storage))) {
unsigned int opt_len = ntohs(o->l) + sizeof(*o);
- if (ntohs(o->l) > left)
- return NULL;
+ if (opt_len > iov_tail_size(data))
+ break;
if (o->t == type)
- return o;
+ return true;
- *offset += opt_len;
+ iov_drop_header(data, opt_len);
}
- return NULL;
+ *data = head;
+ return false;
}
/**
* dhcpv6_ia_notonlink() - Check if any IA contains non-appropriate addresses
- * @p: Packet pool, single packet starting from UDP header
+ * @data: Data to look at, packet starting from UDP header (input/output)
* @la: Address we want to lease to the client
*
- * Return: pointer to non-appropriate IA_NA or IA_TA, if any, NULL otherwise
+ * Return: true and @data points to non-appropriate IA_NA or IA_TA, if any,
+ * false otherwise and @data is unmodified
*/
-static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
- struct in6_addr *la)
+static bool dhcpv6_ia_notonlink(struct iov_tail *data,
+ struct in6_addr *la)
{
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
+ struct opt_ia_addr opt_addr_storage;
const struct opt_ia_addr *opt_addr;
+ struct iov_tail current, ia_base;
+ struct opt_ia_na ia_storage;
char buf[INET6_ADDRSTRLEN];
+ const struct opt_ia_na *ia;
struct in6_addr req_addr;
+ struct opt_hdr h_storage;
const struct opt_hdr *h;
- struct opt_hdr *ia;
- size_t offset;
foreach(ia_type, ia_types) {
- offset = 0;
- while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
- if (ntohs(ia->l) < OPT_VSIZE(ia_na))
- return NULL;
-
- offset += sizeof(struct opt_ia_na);
+ current = *data;
+ while (dhcpv6_opt(&current, *ia_type)) {
+ ia_base = current;
+ ia = IOV_REMOVE_HEADER(&current, ia_storage);
+ if (!ia || ntohs(ia->hdr.l) < OPT_VSIZE(ia_na))
+ goto notfound;
+
+ while (dhcpv6_opt(&current, OPT_IAAADR)) {
+ h = IOV_PEEK_HEADER(&current, h_storage);
+ if (!h || ntohs(h->l) != OPT_VSIZE(ia_addr))
+ goto notfound;
+
+ opt_addr = IOV_REMOVE_HEADER(&current,
+ opt_addr_storage);
+ if (!opt_addr)
+ goto notfound;
- while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
- if (ntohs(h->l) != OPT_VSIZE(ia_addr))
- return NULL;
-
- opt_addr = (const struct opt_ia_addr *)h;
req_addr = opt_addr->addr;
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
- goto err;
-
- offset += sizeof(struct opt_ia_addr);
+ goto notonlink;
}
}
}
- return NULL;
+notfound:
+ return false;
-err:
+notonlink:
info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
- return ia;
+ *data = ia_base;
+ return true;
+}
+
+/**
+ * dhcpv6_send_ia_notonlink() - Send NotOnLink status
+ * @c: Execution context
+ * @ia_base: Non-appropriate IA_NA or IA_TA base
+ * @client_id_base: Client ID message option base
+ * @len: Client ID length
+ * @xid: Transaction ID for message exchange
+ */
+static void dhcpv6_send_ia_notonlink(struct ctx *c,
+ const struct iov_tail *ia_base,
+ const struct iov_tail *client_id_base,
+ int len, uint32_t xid)
+{
+ const struct in6_addr *src = &c->ip6.our_tap_ll;
+ struct opt_hdr *ia = (struct opt_hdr *)resp_not_on_link.var;
+ size_t n;
+
+ info("DHCPv6: received CONFIRM with inappropriate IA,"
+ " sending NotOnLink status in REPLY");
+
+ n = sizeof(struct opt_ia_na);
+ iov_to_buf(&ia_base->iov[0], ia_base->cnt, ia_base->off,
+ resp_not_on_link.var, n);
+ ia->l = htons(OPT_VSIZE(ia_na) + sizeof(sc_not_on_link));
+ memcpy(resp_not_on_link.var + n, &sc_not_on_link,
+ sizeof(sc_not_on_link));
+
+ n += sizeof(sc_not_on_link);
+ iov_to_buf(&client_id_base->iov[0], client_id_base->cnt,
+ client_id_base->off, resp_not_on_link.var + n,
+ sizeof(struct opt_hdr) + len);
+
+ n += sizeof(struct opt_hdr) + len;
+
+ n = offsetof(struct resp_not_on_link_t, var) + n;
+
+ resp_not_on_link.hdr.xid = xid;
+
+ tap_udp6_send(c, src, 547, tap_ip6_daddr(c, src), 546,
+ xid, &resp_not_on_link, n);
}
/**
@@ -435,17 +487,19 @@ search:
/**
* dhcpv6_client_fqdn_fill() - Fill in client FQDN option
+ * @data: Data to look at
* @c: Execution context
* @buf: Response message buffer where options will be appended
* @offset: Offset in message buffer for new options
*
* Return: updated length of response message buffer.
*/
-static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
+static size_t dhcpv6_client_fqdn_fill(const struct iov_tail *data,
+ const struct ctx *c,
char *buf, int offset)
{
- struct opt_client_fqdn const *req_opt;
+ struct iov_tail current = *data;
struct opt_client_fqdn *o;
size_t opt_len;
@@ -463,13 +517,16 @@ static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
}
o = (struct opt_client_fqdn *)(buf + offset);
+ o->flags = 0x00;
encode_domain_name(o->domain_name, c->fqdn);
- req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 },
- OPT_CLIENT_FQDN);
- if (req_opt && req_opt->flags & 0x01 /* S flag */)
- o->flags = 0x02 /* O flag */;
- else
- o->flags = 0x00;
+ if (dhcpv6_opt(&current, OPT_CLIENT_FQDN)) {
+ struct opt_client_fqdn req_opt_storage;
+ struct opt_client_fqdn const *req_opt;
+
+ req_opt = IOV_PEEK_HEADER(&current, req_opt_storage);
+ if (req_opt && req_opt->flags & 0x01 /* S flag */)
+ o->flags = 0x02 /* O flag */;
+ }
opt_len++;
@@ -482,23 +539,30 @@ static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
/**
* dhcpv6() - Check if this is a DHCPv6 message, reply as needed
* @c: Execution context
- * @p: Packet pool, single packet starting from UDP header
+ * @data: Single packet starting from UDP header
* @saddr: Source IPv6 address of original message
* @daddr: Destination IPv6 address of original message
*
* Return: 0 if it's not a DHCPv6 message, 1 if handled, -1 on failure
*/
-int dhcpv6(struct ctx *c, const struct pool *p,
+int dhcpv6(struct ctx *c, struct iov_tail *data,
const struct in6_addr *saddr, const struct in6_addr *daddr)
{
- const struct opt_hdr *client_id, *server_id, *ia;
+ const struct opt_server_id *server_id = NULL;
+ const struct opt_hdr *client_id = NULL;
+ struct opt_server_id server_id_storage;
+ struct iov_tail opt, client_id_base;
+ const struct opt_ia_na *ia = NULL;
+ struct opt_hdr client_id_storage;
+ struct opt_ia_na ia_storage;
const struct in6_addr *src;
+ struct msg_hdr mh_storage;
const struct msg_hdr *mh;
+ struct udphdr uh_storage;
const struct udphdr *uh;
- struct opt_hdr *bad_ia;
size_t mlen, n;
- uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
+ uh = IOV_REMOVE_HEADER(data, uh_storage);
if (!uh)
return -1;
@@ -511,6 +575,7 @@ int dhcpv6(struct ctx *c, const struct pool *p,
if (!IN6_IS_ADDR_MULTICAST(daddr))
return -1;
+ mlen = iov_tail_size(data);
if (mlen + sizeof(*uh) != ntohs(uh->len) || mlen < sizeof(*mh))
return -1;
@@ -518,20 +583,26 @@ int dhcpv6(struct ctx *c, const struct pool *p,
src = &c->ip6.our_tap_ll;
- mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL);
+ mh = IOV_REMOVE_HEADER(data, mh_storage);
if (!mh)
return -1;
- client_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_CLIENTID);
+ client_id_base = *data;
+ if (dhcpv6_opt(&client_id_base, OPT_CLIENTID))
+ client_id = IOV_PEEK_HEADER(&client_id_base, client_id_storage);
if (!client_id || ntohs(client_id->l) > OPT_VSIZE(client_id))
return -1;
- server_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_SERVERID);
- if (server_id && ntohs(server_id->l) != OPT_VSIZE(server_id))
+ opt = *data;
+ if (dhcpv6_opt(&opt, OPT_SERVERID))
+ server_id = IOV_PEEK_HEADER(&opt, server_id_storage);
+ if (server_id && ntohs(server_id->hdr.l) != OPT_VSIZE(server_id))
return -1;
- ia = dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_NA);
- if (ia && ntohs(ia->l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta)))
+ opt = *data;
+ if (dhcpv6_opt(&opt, OPT_IA_NA))
+ ia = IOV_PEEK_HEADER(&opt, ia_storage);
+ if (ia && ntohs(ia->hdr.l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta)))
return -1;
resp.hdr.type = TYPE_REPLY;
@@ -546,29 +617,10 @@ int dhcpv6(struct ctx *c, const struct pool *p,
if (mh->type == TYPE_CONFIRM && server_id)
return -1;
- if ((bad_ia = dhcpv6_ia_notonlink(p, &c->ip6.addr))) {
- info("DHCPv6: received CONFIRM with inappropriate IA,"
- " sending NotOnLink status in REPLY");
-
- bad_ia->l = htons(OPT_VSIZE(ia_na) +
- sizeof(sc_not_on_link));
- n = sizeof(struct opt_ia_na);
- memcpy(resp_not_on_link.var, bad_ia, n);
-
- memcpy(resp_not_on_link.var + n,
- &sc_not_on_link, sizeof(sc_not_on_link));
- n += sizeof(sc_not_on_link);
-
- memcpy(resp_not_on_link.var + n, client_id,
- sizeof(struct opt_hdr) + ntohs(client_id->l));
- n += sizeof(struct opt_hdr) + ntohs(client_id->l);
-
- n = offsetof(struct resp_not_on_link_t, var) + n;
-
- resp_not_on_link.hdr.xid = mh->xid;
+ if (dhcpv6_ia_notonlink(data, &c->ip6.addr)) {
- tap_udp6_send(c, src, 547, tap_ip6_daddr(c, src), 546,
- mh->xid, &resp_not_on_link, n);
+ dhcpv6_send_ia_notonlink(c, data, &client_id_base,
+ ntohs(client_id->l), mh->xid);
return 1;
}
@@ -580,7 +632,7 @@ int dhcpv6(struct ctx *c, const struct pool *p,
memcmp(&resp.server_id, server_id, sizeof(resp.server_id)))
return -1;
- if (ia || dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_TA))
+ if (ia || dhcpv6_opt(data, OPT_IA_TA))
return -1;
info("DHCPv6: received INFORMATION_REQUEST, sending REPLY");
@@ -606,13 +658,14 @@ int dhcpv6(struct ctx *c, const struct pool *p,
if (ia)
resp.ia_na.iaid = ((struct opt_ia_na *)ia)->iaid;
- memcpy(&resp.client_id, client_id,
- ntohs(client_id->l) + sizeof(struct opt_hdr));
+ iov_to_buf(&client_id_base.iov[0], client_id_base.cnt,
+ client_id_base.off, &resp.client_id,
+ ntohs(client_id->l) + sizeof(struct opt_hdr));
n = offsetof(struct resp_t, client_id) +
sizeof(struct opt_hdr) + ntohs(client_id->l);
n = dhcpv6_dns_fill(c, (char *)&resp, n);
- n = dhcpv6_client_fqdn_fill(p, c, (char *)&resp, n);
+ n = dhcpv6_client_fqdn_fill(data, c, (char *)&resp, n);
resp.hdr.xid = mh->xid;
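
dhcpv6_opt() performs the same kind of walk over the option area, except that DHCPv6 options carry 16-bit type and length fields in network order, and the helper leaves the caller's cursor on the matching option (restoring it on failure). A flat-buffer sketch of that lookup follows; the search function and its host-order type argument are illustrative stand-ins, while struct opt_hdr mirrors the layout shown above.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct opt_hdr {
	uint16_t t;	/* option type, network order */
	uint16_t l;	/* option length, network order, excludes header */
} __attribute__((packed));

/* Return the offset of option @type in @buf, or -1 if absent or malformed */
static long dhcpv6_opt_find(const uint8_t *buf, size_t len, uint16_t type)
{
	size_t off = 0;

	while (off + sizeof(struct opt_hdr) <= len) {
		struct opt_hdr o;
		size_t opt_len;

		memcpy(&o, buf + off, sizeof(o));
		opt_len = ntohs(o.l) + sizeof(o);

		if (opt_len > len - off)
			break;		/* truncated option */

		if (o.t == htons(type))
			return (long)off;

		off += opt_len;
	}
	return -1;
}

int main(void)
{
	/* CLIENTID (1), 4 bytes, then ORO (6), 2 bytes */
	const uint8_t msg[] = { 0, 1, 0, 4, 0xde, 0xad, 0xbe, 0xef,
				0, 6, 0, 2, 0, 23 };

	printf("OPT_ORO at offset %ld\n", dhcpv6_opt_find(msg, sizeof(msg), 6));
	printf("OPT_IA_NA at offset %ld\n", dhcpv6_opt_find(msg, sizeof(msg), 3));
	return 0;
}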
diff --git a/dhcpv6.h b/dhcpv6.h
index 5809988..c706dfd 100644
--- a/dhcpv6.h
+++ b/dhcpv6.h
@@ -6,7 +6,7 @@
#ifndef DHCPV6_H
#define DHCPV6_H
-int dhcpv6(struct ctx *c, const struct pool *p,
+int dhcpv6(struct ctx *c, struct iov_tail *data,
struct in6_addr *saddr, struct in6_addr *daddr);
void dhcpv6_init(const struct ctx *c);
diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore
index 3b5a10a..f6272cf 100644
--- a/doc/platform-requirements/.gitignore
+++ b/doc/platform-requirements/.gitignore
@@ -1,3 +1,4 @@
+/listen-vs-repair
/reuseaddr-priority
/recv-zero
/udp-close-dup
diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile
index 6a7d374..83930ef 100644
--- a/doc/platform-requirements/Makefile
+++ b/doc/platform-requirements/Makefile
@@ -3,8 +3,8 @@
# Copyright Red Hat
# Author: David Gibson <david@gibson.dropbear.id.au>
-TARGETS = reuseaddr-priority recv-zero udp-close-dup
-SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
+TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair
+SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c
CFLAGS = -Wall
all: cppcheck clang-tidy $(TARGETS:%=check-%)
diff --git a/doc/platform-requirements/common.h b/doc/platform-requirements/common.h
index 8844b1e..e85fc2b 100644
--- a/doc/platform-requirements/common.h
+++ b/doc/platform-requirements/common.h
@@ -15,6 +15,7 @@
#include <stdio.h>
#include <stdlib.h>
+__attribute__((format(printf, 1, 2), noreturn))
static inline void die(const char *fmt, ...)
{
va_list ap;
diff --git a/doc/platform-requirements/listen-vs-repair.c b/doc/platform-requirements/listen-vs-repair.c
new file mode 100644
index 0000000..d31fe3f
--- /dev/null
+++ b/doc/platform-requirements/listen-vs-repair.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* listen-vs-repair.c
+ *
+ * Do listening sockets have address conflicts with sockets under repair
+ * ====================================================================
+ *
+ * When we accept() an incoming connection the accept()ed socket will have the
+ * same local address as the listening socket. This can be a complication on
+ * migration. On the migration target we've already set up listening sockets
+ * according to the command line. However to restore connections that we're
+ * migrating in we need to bind the new sockets to the same address, which would
+ * be an address conflict on the face of it. This test program verifies that
+ * enabling repair mode before bind() correctly suppresses that conflict.
+ *
+ * Copyright Red Hat
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ */
+
+/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "common.h"
+
+#define PORT 13256U
+#define CPORT 13257U
+
+/* 127.0.0.1:PORT */
+static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT);
+
+/* 127.0.0.1:CPORT */
+static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT);
+
+/* Put ourselves into a network sandbox */
+static void net_sandbox(void)
+{
+ /* NOLINTNEXTLINE(altera-struct-pack-align) */
+ const struct req_t {
+ struct nlmsghdr nlh;
+ struct ifinfomsg ifm;
+ } __attribute__((packed)) req = {
+ .nlh.nlmsg_type = RTM_NEWLINK,
+ .nlh.nlmsg_flags = NLM_F_REQUEST,
+ .nlh.nlmsg_len = sizeof(req),
+ .nlh.nlmsg_seq = 1,
+ .ifm.ifi_family = AF_UNSPEC,
+ .ifm.ifi_index = 1,
+ .ifm.ifi_flags = IFF_UP,
+ .ifm.ifi_change = IFF_UP,
+ };
+ int nl;
+
+ if (unshare(CLONE_NEWUSER | CLONE_NEWNET))
+ die("unshare(): %s\n", strerror(errno));
+
+ /* Bring up lo in the new netns */
+ nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+ if (nl < 0)
+ die("Can't create netlink socket: %s\n", strerror(errno));
+
+ if (send(nl, &req, sizeof(req), 0) < 0)
+ die("Netlink send(): %s\n", strerror(errno));
+ close(nl);
+}
+
+static void check(void)
+{
+ int s1, s2, op;
+
+ s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+ if (s1 < 0)
+ die("socket() 1: %s\n", strerror(errno));
+
+ if (bind(s1, (struct sockaddr *)&addr, sizeof(addr)))
+ die("bind() 1: %s\n", strerror(errno));
+
+ if (listen(s1, 0))
+ die("listen(): %s\n", strerror(errno));
+
+ s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+ if (s2 < 0)
+ die("socket() 2: %s\n", strerror(errno));
+
+ op = TCP_REPAIR_ON;
+ if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
+ die("TCP_REPAIR: %s\n", strerror(errno));
+
+ if (bind(s2, (struct sockaddr *)&addr, sizeof(addr)))
+ die("bind() 2: %s\n", strerror(errno));
+
+ if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr)))
+ die("connect(): %s\n", strerror(errno));
+
+ op = TCP_REPAIR_OFF_NO_WP;
+ if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
+ die("TCP_REPAIR: %s\n", strerror(errno));
+
+ close(s1);
+ close(s2);
+}
+
+int main(int argc, char *argv[])
+{
+ (void)argc;
+ (void)argv;
+
+ net_sandbox();
+
+ check();
+
+ printf("Repair mode appears to properly suppress conflicts with listening sockets\n");
+
+ exit(0);
+}
diff --git a/doc/platform-requirements/reuseaddr-priority.c b/doc/platform-requirements/reuseaddr-priority.c
index 701b6ff..af39a39 100644
--- a/doc/platform-requirements/reuseaddr-priority.c
+++ b/doc/platform-requirements/reuseaddr-priority.c
@@ -46,13 +46,13 @@
/* Different cases for receiving socket configuration */
enum sock_type {
/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
- SOCK_BOUND_ANY = 0,
+ SOCK_BOUND_ANY,
/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
- SOCK_BOUND_LO = 1,
+ SOCK_BOUND_LO,
/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
- SOCK_CONNECTED = 2,
+ SOCK_CONNECTED,
NUM_SOCK_TYPES,
};
diff --git a/epoll_type.h b/epoll_type.h
index 7f2a121..12ac59b 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -22,8 +22,8 @@ enum epoll_type {
EPOLL_TYPE_TCP_TIMER,
/* UDP "listening" sockets */
EPOLL_TYPE_UDP_LISTEN,
- /* UDP socket for replies on a specific flow */
- EPOLL_TYPE_UDP_REPLY,
+ /* UDP socket for a specific flow */
+ EPOLL_TYPE_UDP,
/* ICMP/ICMPv6 ping sockets */
EPOLL_TYPE_PING,
/* inotify fd watching for end of netns (pasta) */
diff --git a/flow.c b/flow.c
index 749c498..feefda3 100644
--- a/flow.c
+++ b/flow.c
@@ -81,7 +81,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
*
* Free cluster list
* flow_first_free gives the index of the first (lowest index) free cluster.
- * Each free cluster has the index of the next free cluster, or MAX_FLOW if
+ * Each free cluster has the index of the next free cluster, or FLOW_MAX if
* it is the last free cluster. Together these form a linked list of free
* clusters, in strictly increasing order of index.
*
@@ -396,18 +396,27 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
* @flow: Flow to change state
* @pif: pif of the initiating side
* @ssa: Source socket address
+ * @daddr: Destination address (may be NULL)
* @dport: Destination port
*
* Return: pointer to the initiating flowside information
*/
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
const union sockaddr_inany *ssa,
+ const union inany_addr *daddr,
in_port_t dport)
{
struct flowside *ini = &flow->f.side[INISIDE];
- inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
- if (inany_v4(&ini->eaddr))
+ if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) {
+ char str[SOCKADDR_STRLEN];
+
+ ASSERT_WITH_MSG(0, "Bad socket address %s",
+ sockaddr_ntop(ssa, str, sizeof(str)));
+ }
+ if (daddr)
+ ini->oaddr = *daddr;
+ else if (inany_v4(&ini->eaddr))
ini->oaddr = inany_any4;
else
ini->oaddr = inany_any6;
@@ -471,7 +480,9 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
/**
* flow_set_type() - Set type and move to TYPED
* @flow: Flow to change state
- * @pif: pif of the initiating side
+ * @type: New flow type to assign
+ *
+ * Return: pointer to the modified flow structure.
*/
union flow *flow_set_type(union flow *flow, enum flow_type type)
{
@@ -616,7 +627,7 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
* @hash: Raw hash value for flow & side
* @sidx: Flow and side to find bucket for
*
- * Return: If @sidx is in the hash table, its current bucket, otherwise a
+ * Return: if @sidx is in the hash table, its current bucket, otherwise a
* suitable free bucket for it.
*/
static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
@@ -636,7 +647,7 @@ static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
* @c: Execution context
* @sidx: Flow and side to find bucket for
*
- * Return: If @sidx is in the hash table, its current bucket, otherwise a
+ * Return: if @sidx is in the hash table, its current bucket, otherwise a
* suitable free bucket for it.
*/
static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
@@ -751,19 +762,30 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
* @proto: Protocol of the flow (IP L4 protocol number)
* @pif: Interface of the flow
* @esa: Socket address of the endpoint
+ * @oaddr: Our address (may be NULL)
* @oport: Our port number
*
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
*/
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
- const void *esa, in_port_t oport)
+ const void *esa,
+ const union inany_addr *oaddr, in_port_t oport)
{
struct flowside side = {
.oport = oport,
};
- inany_from_sockaddr(&side.eaddr, &side.eport, esa);
- if (inany_v4(&side.eaddr))
+ if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) {
+ char str[SOCKADDR_STRLEN];
+
+ warn("Flow lookup on bad socket address %s",
+ sockaddr_ntop(esa, str, sizeof(str)));
+ return FLOW_SIDX_NONE;
+ }
+
+ if (oaddr)
+ side.oaddr = *oaddr;
+ else if (inany_v4(&side.eaddr))
side.oaddr = inany_any4;
else
side.oaddr = inany_any6;
@@ -780,6 +802,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
{
struct flow_free_cluster *free_head = NULL;
unsigned *last_next = &flow_first_free;
+ bool to_free[FLOW_MAX] = { 0 };
bool timer = false;
union flow *flow;
@@ -790,9 +813,44 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
- flow_foreach_slot(flow) {
+ /* Check which flows we might need to close first, but don't free them
+ * yet as it's not safe to do that in the middle of flow_foreach().
+ */
+ flow_foreach(flow) {
bool closed = false;
+ switch (flow->f.type) {
+ case FLOW_TYPE_NONE:
+ ASSERT(false);
+ break;
+ case FLOW_TCP:
+ closed = tcp_flow_defer(&flow->tcp);
+ break;
+ case FLOW_TCP_SPLICE:
+ closed = tcp_splice_flow_defer(&flow->tcp_splice);
+ if (!closed && timer)
+ tcp_splice_timer(c, &flow->tcp_splice);
+ break;
+ case FLOW_PING4:
+ case FLOW_PING6:
+ if (timer)
+ closed = icmp_ping_timer(c, &flow->ping, now);
+ break;
+ case FLOW_UDP:
+ closed = udp_flow_defer(c, &flow->udp, now);
+ if (!closed && timer)
+ closed = udp_flow_timer(c, &flow->udp, now);
+ break;
+ default:
+ /* Assume other flow types don't need any handling */
+ ;
+ }
+
+ to_free[FLOW_IDX(flow)] = closed;
+ }
+
+ /* Second step: actually free the flows */
+ flow_foreach_slot(flow) {
switch (flow->f.state) {
case FLOW_STATE_FREE: {
unsigned skip = flow->free.n;
@@ -825,60 +883,31 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
break;
case FLOW_STATE_ACTIVE:
- /* Nothing to do */
+ if (to_free[FLOW_IDX(flow)]) {
+ flow_set_state(&flow->f, FLOW_STATE_FREE);
+ memset(flow, 0, sizeof(*flow));
+
+ if (free_head) {
+ /* Add slot to current free cluster */
+ ASSERT(FLOW_IDX(flow) ==
+ FLOW_IDX(free_head) + free_head->n);
+ free_head->n++;
+ flow->free.n = flow->free.next = 0;
+ } else {
+ /* Create new free cluster */
+ free_head = &flow->free;
+ free_head->n = 1;
+ *last_next = FLOW_IDX(flow);
+ last_next = &free_head->next;
+ }
+ } else {
+ free_head = NULL;
+ }
break;
default:
ASSERT(false);
}
-
- switch (flow->f.type) {
- case FLOW_TYPE_NONE:
- ASSERT(false);
- break;
- case FLOW_TCP:
- closed = tcp_flow_defer(&flow->tcp);
- break;
- case FLOW_TCP_SPLICE:
- closed = tcp_splice_flow_defer(&flow->tcp_splice);
- if (!closed && timer)
- tcp_splice_timer(c, &flow->tcp_splice);
- break;
- case FLOW_PING4:
- case FLOW_PING6:
- if (timer)
- closed = icmp_ping_timer(c, &flow->ping, now);
- break;
- case FLOW_UDP:
- closed = udp_flow_defer(&flow->udp);
- if (!closed && timer)
- closed = udp_flow_timer(c, &flow->udp, now);
- break;
- default:
- /* Assume other flow types don't need any handling */
- ;
- }
-
- if (closed) {
- flow_set_state(&flow->f, FLOW_STATE_FREE);
- memset(flow, 0, sizeof(*flow));
-
- if (free_head) {
- /* Add slot to current free cluster */
- ASSERT(FLOW_IDX(flow) ==
- FLOW_IDX(free_head) + free_head->n);
- free_head->n++;
- flow->free.n = flow->free.next = 0;
- } else {
- /* Create new free cluster */
- free_head = &flow->free;
- free_head->n = 1;
- *last_next = FLOW_IDX(flow);
- last_next = &free_head->next;
- }
- } else {
- free_head = NULL;
- }
}
*last_next = FLOW_MAX;
@@ -912,6 +941,21 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
}
/**
+ * flow_migrate_need_repair() - Do we need to set repair mode for any flow?
+ *
+ * Return: true if repair mode is needed, false otherwise
+ */
+static bool flow_migrate_need_repair(void)
+{
+ union flow *flow;
+
+ foreach_established_tcp_flow(flow)
+ return true;
+
+ return false;
+}
+
+/**
* flow_migrate_repair_all() - Turn repair mode on or off for all flows
* @c: Execution context
* @enable: Switch repair mode on if set, off otherwise
@@ -966,6 +1010,9 @@ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
(void)stage;
(void)fd;
+ if (flow_migrate_need_repair())
+ repair_wait(c);
+
if ((rc = flow_migrate_repair_all(c, true)))
return -rc;
@@ -1019,8 +1066,8 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
foreach_established_tcp_flow(flow) {
rc = tcp_flow_migrate_source(fd, &flow->tcp);
if (rc) {
- err("Can't send data, flow %u: %s", FLOW_IDX(flow),
- strerror_(-rc));
+ flow_err(flow, "Can't send data: %s",
+ strerror_(-rc));
if (!first)
die("Inconsistent migration state, exiting");
@@ -1044,10 +1091,10 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
* as EIO).
*/
foreach_established_tcp_flow(flow) {
- rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
+ rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp);
if (rc) {
- err("Extended data for flow %u: %s", FLOW_IDX(flow),
- strerror_(-rc));
+ flow_err(flow, "Can't send extended data: %s",
+ strerror_(-rc));
if (rc == -EIO)
die("Inconsistent migration state, exiting");
@@ -1083,6 +1130,9 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
if (!count)
return 0;
+ if ((rc = repair_wait(c)))
+ return -rc;
+
if ((rc = flow_migrate_repair_all(c, true)))
return -rc;
@@ -1092,8 +1142,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
for (i = 0; i < count; i++) {
rc = tcp_flow_migrate_target(c, fd);
if (rc) {
- debug("Migration data failure at flow %u: %s, abort",
- i, strerror_(-rc));
+ flow_dbg(FLOW(i), "Migration data failure, abort: %s",
+ strerror_(-rc));
return -rc;
}
}
@@ -1103,8 +1153,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
for (i = 0; i < count; i++) {
rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
if (rc) {
- debug("Migration data failure at flow %u: %s, abort",
- i, strerror_(-rc));
+ flow_dbg(FLOW(i), "Migration data failure, abort: %s",
+ strerror_(-rc));
return -rc;
}
}
diff --git a/flow.h b/flow.h
index dcf7645..cac618a 100644
--- a/flow.h
+++ b/flow.h
@@ -243,7 +243,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
const void *eaddr, const void *oaddr,
in_port_t eport, in_port_t oport);
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
- const void *esa, in_port_t oport);
+ const void *esa,
+ const union inany_addr *oaddr, in_port_t oport);
union flow;
diff --git a/flow_table.h b/flow_table.h
index fd2c57b..5ee13ac 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -93,6 +93,7 @@ extern union flow flowtab[];
*/
static inline unsigned flow_idx(const struct flow_common *f)
{
+ /* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
return (union flow *)f - flowtab;
}
@@ -139,7 +140,7 @@ static inline uint8_t pif_at_sidx(flow_sidx_t sidx)
/** flowside_at_sidx() - Retrieve a specific flowside
* @sidx: Flow & side index
*
- * Return: Flowside for the flow & side given by @sidx
+ * Return: flowside for the flow & side given by @sidx
*/
static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
{
@@ -199,6 +200,7 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
const void *daddr, in_port_t dport);
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
const union sockaddr_inany *ssa,
+ const union inany_addr *daddr,
in_port_t dport);
const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
sa_family_t af,
diff --git a/fwd.c b/fwd.c
index 2829cd2..250cf56 100644
--- a/fwd.c
+++ b/fwd.c
@@ -324,6 +324,30 @@ static bool fwd_guest_accessible(const struct ctx *c,
}
/**
+ * nat_outbound() - Apply address translation for outbound (TAP to HOST)
+ * @c: Execution context
+ * @addr: Input address (as seen on TAP interface)
+ * @translated: Output address (as seen on HOST interface)
+ *
+ * Only handles translations that depend *only* on the address. Anything
+ * related to specific ports or flows is handled elsewhere.
+ */
+static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated)
+{
+ if (inany_equals4(addr, &c->ip4.map_host_loopback))
+ *translated = inany_loopback4;
+ else if (inany_equals6(addr, &c->ip6.map_host_loopback))
+ *translated = inany_loopback6;
+ else if (inany_equals4(addr, &c->ip4.map_guest_addr))
+ *translated = inany_from_v4(c->ip4.addr);
+ else if (inany_equals6(addr, &c->ip6.map_guest_addr))
+ translated->a6 = c->ip6.addr;
+ else
+ *translated = *addr;
+}
+
+/**
 * fwd_nat_from_tap() - Determine how to forward a flow from the tap interface
* @c: Execution context
* @proto: Protocol (IP L4 protocol number)
@@ -342,16 +366,8 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
else if (is_dns_flow(proto, ini) &&
inany_equals6(&ini->oaddr, &c->ip6.dns_match))
tgt->eaddr.a6 = c->ip6.dns_host;
- else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
- tgt->eaddr = inany_loopback4;
- else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
- tgt->eaddr = inany_loopback6;
- else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
- tgt->eaddr = inany_from_v4(c->ip4.addr);
- else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
- tgt->eaddr.a6 = c->ip6.addr;
else
- tgt->eaddr = ini->oaddr;
+ nat_outbound(c, &ini->oaddr, &tgt->eaddr);
tgt->eport = ini->oport;
@@ -402,7 +418,7 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
else
tgt->eaddr = inany_loopback6;
- /* Preserve the specific loopback adddress used, but let the kernel pick
+ /* Preserve the specific loopback address used, but let the kernel pick
* a source port on the target side
*/
tgt->oaddr = ini->eaddr;
@@ -424,6 +440,42 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
}
/**
+ * nat_inbound() - Apply address translation for inbound (HOST to TAP)
+ * @c: Execution context
+ * @addr: Input address (as seen on HOST interface)
+ * @translated: Output address (as seen on TAP interface)
+ *
+ * Return: true on success, false if it couldn't translate the address
+ *
+ * Only handles translations that depend *only* on the address. Anything
+ * related to specific ports or flows is handled elsewhere.
+ */
+bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated)
+{
+ if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
+ inany_equals4(addr, &in4addr_loopback)) {
+ /* Specifically 127.0.0.1, not 127.0.0.0/8 */
+ *translated = inany_from_v4(c->ip4.map_host_loopback);
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
+ inany_equals6(addr, &in6addr_loopback)) {
+ translated->a6 = c->ip6.map_host_loopback;
+ } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
+ inany_equals4(addr, &c->ip4.addr)) {
+ *translated = inany_from_v4(c->ip4.map_guest_addr);
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
+ inany_equals6(addr, &c->ip6.addr)) {
+ translated->a6 = c->ip6.map_guest_addr;
+ } else if (fwd_guest_accessible(c, addr)) {
+ *translated = *addr;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/**
 * fwd_nat_from_host() - Determine how to forward a flow from the host interface
* @c: Execution context
* @proto: Protocol (IP L4 protocol number)
@@ -479,20 +531,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
return PIF_SPLICE;
}
- if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
- inany_equals4(&ini->eaddr, &in4addr_loopback)) {
- /* Specifically 127.0.0.1, not 127.0.0.0/8 */
- tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
- } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
- inany_equals6(&ini->eaddr, &in6addr_loopback)) {
- tgt->oaddr.a6 = c->ip6.map_host_loopback;
- } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
- inany_equals4(&ini->eaddr, &c->ip4.addr)) {
- tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
- } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
- inany_equals6(&ini->eaddr, &c->ip6.addr)) {
- tgt->oaddr.a6 = c->ip6.map_guest_addr;
- } else if (!fwd_guest_accessible(c, &ini->eaddr)) {
+ if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) {
if (inany_v4(&ini->eaddr)) {
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
/* No source address we can use */
@@ -501,8 +540,6 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
} else {
tgt->oaddr.a6 = c->ip6.our_tap_ll;
}
- } else {
- tgt->oaddr = ini->eaddr;
}
tgt->oport = ini->eport;
diff --git a/fwd.h b/fwd.h
index 3562f3c..65c7c96 100644
--- a/fwd.h
+++ b/fwd.h
@@ -7,6 +7,7 @@
#ifndef FWD_H
#define FWD_H
+union inany_addr;
struct flowside;
/* Number of ports for both TCP and UDP */
@@ -26,7 +27,7 @@ enum fwd_ports_mode {
#define PORT_BITMAP_SIZE DIV_ROUND_UP(NUM_PORTS, 8)
/**
- * fwd_ports - Describes port forwarding for one protocol and direction
+ * fwd_ports() - Describes port forwarding for one protocol and direction
* @mode: Overall forwarding mode (all, none, auto, specific ports)
* @scan4: /proc/net fd to scan for IPv4 ports when in AUTO mode
* @scan6: /proc/net fd to scan for IPv6 ports when in AUTO mode
@@ -47,6 +48,8 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
const struct fwd_ports *tcp_rev);
void fwd_scan_ports_init(struct ctx *c);
+bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated);
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
diff --git a/icmp.c b/icmp.c
index 7e2b342..6dffafb 100644
--- a/icmp.c
+++ b/icmp.c
@@ -44,6 +44,7 @@
#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */
#define ICMP_NUM_IDS (1U << 16)
+#define MAX_IOV_ICMP 16 /* Arbitrary, should be enough */
/**
* ping_at_sidx() - Get ping specific flow at given sidx
@@ -163,7 +164,7 @@ static void icmp_ping_close(const struct ctx *c,
* @saddr: Source address
* @daddr: Destination address
*
- * Return: Newly opened ping flow, or NULL on failure
+ * Return: newly opened ping flow, or NULL on failure
*/
static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
sa_family_t af, uint16_t id,
@@ -229,37 +230,36 @@ cancel:
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
- * @p: Packet pool, single packet with ICMP/ICMPv6 header
+ * @data: Single packet with ICMP/ICMPv6 header
* @now: Current timestamp
*
* Return: count of consumed packets (always 1, even if malformed)
*/
int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
- const struct pool *p, const struct timespec *now)
+ struct iov_tail *data, const struct timespec *now)
{
+ struct iovec iov[MAX_IOV_ICMP];
struct icmp_ping_flow *pingf;
const struct flowside *tgt;
union sockaddr_inany sa;
- size_t dlen, l4len;
+ struct msghdr msh;
uint16_t id, seq;
union flow *flow;
uint8_t proto;
- socklen_t sl;
- void *pkt;
+ int cnt;
(void)saddr;
ASSERT(pif == PIF_TAP);
if (af == AF_INET) {
+ struct icmphdr ih_storage;
const struct icmphdr *ih;
- if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen)))
+ ih = IOV_PEEK_HEADER(data, ih_storage);
+ if (!ih)
return 1;
- ih = (struct icmphdr *)pkt;
- l4len = dlen + sizeof(*ih);
-
if (ih->type != ICMP_ECHO)
return 1;
@@ -267,14 +267,13 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
id = ntohs(ih->un.echo.id);
seq = ntohs(ih->un.echo.sequence);
} else if (af == AF_INET6) {
+ struct icmp6hdr ih_storage;
const struct icmp6hdr *ih;
- if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen)))
+ ih = IOV_PEEK_HEADER(data, ih_storage);
+ if (!ih)
return 1;
- ih = (struct icmp6hdr *)pkt;
- l4len = dlen + sizeof(*ih);
-
if (ih->icmp6_type != ICMPV6_ECHO_REQUEST)
return 1;
@@ -285,6 +284,10 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
ASSERT(0);
}
+ cnt = iov_tail_clone(&iov[0], MAX_IOV_ICMP, data);
+ if (cnt < 0)
+ return 1;
+
flow = flow_at_sidx(flow_lookup_af(c, proto, PIF_TAP,
af, saddr, daddr, id, id));
@@ -298,8 +301,15 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
ASSERT(flow_proto[pingf->f.type] == proto);
pingf->ts = now->tv_sec;
- pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
- if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
+ pif_sockaddr(c, &sa, &msh.msg_namelen, PIF_HOST, &tgt->eaddr, 0);
+ msh.msg_name = &sa;
+ msh.msg_iov = iov;
+ msh.msg_iovlen = cnt;
+ msh.msg_control = NULL;
+ msh.msg_controllen = 0;
+ msh.msg_flags = 0;
+
+ if (sendmsg(pingf->sock, &msh, MSG_NOSIGNAL) < 0) {
flow_dbg_perror(pingf, "failed to relay request to socket");
} else {
flow_dbg(pingf,
diff --git a/icmp.h b/icmp.h
index 5ce22b5..d1cecb2 100644
--- a/icmp.h
+++ b/icmp.h
@@ -14,7 +14,7 @@ struct icmp_ping_flow;
void icmp_sock_handler(const struct ctx *c, union epoll_ref ref);
int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
- const struct pool *p, const struct timespec *now);
+ struct iov_tail *data, const struct timespec *now);
void icmp_init(void);
/**
diff --git a/inany.c b/inany.c
index f5483bf..65a39f9 100644
--- a/inany.c
+++ b/inany.c
@@ -25,7 +25,7 @@ const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
* @dst: output buffer, minimum INANY_ADDRSTRLEN bytes
* @size: size of buffer at @dst
*
- * Return: On success, a non-null pointer to @dst, NULL on failure
+ * Return: on success, a non-null pointer to @dst, NULL on failure
*/
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
{
@@ -41,7 +41,7 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
* @src: IPv[46] address
* @dst: output buffer, filled with parsed address
*
- * Return: On success, 1, if no parseable address is found, 0
+ * Return: on success, 1, if no parseable address is found, 0
*/
int inany_pton(const char *src, union inany_addr *dst)
{
diff --git a/inany.h b/inany.h
index 6a12c29..7ca5cbd 100644
--- a/inany.h
+++ b/inany.h
@@ -237,23 +237,30 @@ static inline void inany_from_af(union inany_addr *aa,
}
/** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr
- * @aa: Pointer to store IPv[46] address
+ * @dst: Pointer to store IPv[46] address (output)
* @port: Pointer to store port number, host order
- * @addr: AF_INET or AF_INET6 socket address
+ * @addr: Socket address
+ *
+ * Return: 0 on success, -1 on error (bad address family)
*/
-static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
- const union sockaddr_inany *sa)
+static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port,
+ const void *addr)
{
+ const union sockaddr_inany *sa = (const union sockaddr_inany *)addr;
+
if (sa->sa_family == AF_INET6) {
- inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr);
+ inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr);
*port = ntohs(sa->sa6.sin6_port);
- } else if (sa->sa_family == AF_INET) {
- inany_from_af(aa, AF_INET, &sa->sa4.sin_addr);
+ return 0;
+ }
+
+ if (sa->sa_family == AF_INET) {
+ inany_from_af(dst, AF_INET, &sa->sa4.sin_addr);
*port = ntohs(sa->sa4.sin_port);
- } else {
- /* Not valid to call with other address families */
- ASSERT(0);
+ return 0;
}
+
+ return -1;
}
/** inany_siphash_feed- Fold IPv[46] address into an in-progress siphash
diff --git a/iov.c b/iov.c
index 3b12272..8c74a59 100644
--- a/iov.c
+++ b/iov.c
@@ -26,7 +26,8 @@
#include "iov.h"
-/* iov_skip_bytes() - Skip leading bytes of an IO vector
+/**
+ * iov_skip_bytes() - Skip leading bytes of an IO vector
* @iov: IO vector
* @n: Number of entries in @iov
* @skip: Number of leading bytes of @iov to skip
@@ -56,8 +57,8 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
}
/**
- * iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec)
- * efficiently.
+ * iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec)
+ * efficiently.
*
* @iov: Pointer to the array of struct iovec describing the
* scatter/gather I/O vector.
@@ -66,7 +67,7 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
* @buf: Pointer to the source buffer containing the data to copy.
* @bytes: Total number of bytes to copy from buf to iov.
*
- * Returns: The number of bytes successfully copied.
+ * Return: the number of bytes successfully copied.
*/
size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
size_t offset, const void *buf, size_t bytes)
@@ -96,8 +97,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
}
/**
- * iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to
- * a buffer efficiently.
+ * iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to
+ * a buffer efficiently.
*
* @iov: Pointer to the array of struct iovec describing the scatter/gather
* I/O vector.
@@ -106,9 +107,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
* @buf: Pointer to the destination buffer where data will be copied.
* @bytes: Total number of bytes to copy from iov to buf.
*
- * Returns: The number of bytes successfully copied.
+ * Return: the number of bytes successfully copied.
*/
-/* cppcheck-suppress unusedFunction */
size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
size_t offset, void *buf, size_t bytes)
{
@@ -126,6 +126,7 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
/* copying data */
for (copied = 0; copied < bytes && i < iov_cnt; i++) {
size_t len = MIN(iov[i].iov_len - offset, bytes - copied);
+ ASSERT(iov[i].iov_base);
memcpy((char *)buf + copied, (char *)iov[i].iov_base + offset,
len);
copied += len;
@@ -136,14 +137,14 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
}
/**
- * iov_size - Calculate the total size of a scatter/gather I/O vector
- * (struct iovec).
+ * iov_size() - Calculate the total size of a scatter/gather I/O vector
+ * (struct iovec).
*
* @iov: Pointer to the array of struct iovec describing the
* scatter/gather I/O vector.
* @iov_cnt: Number of elements in the iov array.
*
- * Returns: The total size in bytes.
+ * Return: the total size in bytes.
*/
size_t iov_size(const struct iovec *iov, size_t iov_cnt)
{
@@ -166,7 +167,7 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)
* includes buffers that are actually needed. This will avoid stepping through
* unnecessary elements of the underlying IO vector on future operations.
*
- * Return: true if the tail still contains any bytes, otherwise false
+ * Return: true if the tail still contains any bytes, otherwise false
*/
bool iov_tail_prune(struct iov_tail *tail)
{
@@ -180,10 +181,10 @@ bool iov_tail_prune(struct iov_tail *tail)
}
/**
- * iov_tail_size - Calculate the total size of an IO vector tail
+ * iov_tail_size() - Calculate the total size of an IO vector tail
* @tail: IO vector tail
*
- * Returns: The total size in bytes.
+ * Return: the total size in bytes.
*/
size_t iov_tail_size(struct iov_tail *tail)
{
@@ -192,18 +193,32 @@ size_t iov_tail_size(struct iov_tail *tail)
}
/**
- * iov_peek_header_() - Get pointer to a header from an IOV tail
+ * iov_drop_header() - Discard a header from an IOV tail
+ * @tail: IO vector tail
+ * @len: Number of bytes to drop from the head of the tail
+ *
+ * Return: true if the tail still contains any bytes, otherwise false
+ */
+bool iov_drop_header(struct iov_tail *tail, size_t len)
+{
+ tail->off = tail->off + len;
+
+ return iov_tail_prune(tail);
+}
+
+/**
+ * iov_check_header() - Check if a header can be accessed
* @tail: IOV tail to get header from
* @len: Length of header to get, in bytes
* @align: Required alignment of header, in bytes
*
* @tail may be pruned, but will represent the same bytes as before.
*
- * Returns: Pointer to the first @len logical bytes of the tail, NULL if that
- * overruns the IO vector, is not contiguous or doesn't have the
- * requested alignment.
+ * Return: pointer to the first @len logical bytes of the tail, NULL if that
+ * overruns the IO vector, is not contiguous or doesn't have the
+ * requested alignment.
*/
-void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
+static void *iov_check_header(struct iov_tail *tail, size_t len, size_t align)
{
char *p;
@@ -224,25 +239,95 @@ void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
}
/**
+ * iov_peek_header_() - Get pointer to a header from an IOV tail
+ * @tail: IOV tail to get header from
+ * @v: Temporary memory to use if the memory in @tail
+ * is discontinuous
+ * @len: Length of header to get, in bytes
+ * @align: Required alignment of header, in bytes
+ *
+ * @tail may be pruned, but will represent the same bytes as before.
+ *
+ * Return: pointer to the first @len logical bytes of the tail, or to a
+ * copy in @v if they are not contiguous or don't have the requested
+ * alignment; NULL if @len overruns the IO vector.
+ */
+/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
+void *iov_peek_header_(struct iov_tail *tail, void *v, size_t len, size_t align)
+{
+ char *p = iov_check_header(tail, len, align);
+ size_t l;
+
+ if (p)
+ return p;
+
+ l = iov_to_buf(tail->iov, tail->cnt, tail->off, v, len);
+ if (l != len)
+ return NULL;
+
+ return v;
+}
+
+/**
* iov_remove_header_() - Remove a header from an IOV tail
* @tail: IOV tail to remove header from (modified)
+ * @v: Temporary memory to use if the memory in @tail
+ * is discontinuous
* @len: Length of header to remove, in bytes
* @align: Required alignment of header, in bytes
*
 * On success, @tail is updated so that it no longer includes the bytes of the
* returned header.
*
- * Returns: Pointer to the first @len logical bytes of the tail, NULL if that
- * overruns the IO vector, is not contiguous or doesn't have the
- * requested alignment.
+ * Return: pointer to the first @len logical bytes of the tail, or to a
+ * copy in @v if they are not contiguous or don't have the requested
+ * alignment; NULL if @len overruns the IO vector.
*/
-void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align)
+void *iov_remove_header_(struct iov_tail *tail, void *v, size_t len, size_t align)
{
- char *p = iov_peek_header_(tail, len, align);
+ char *p = iov_peek_header_(tail, v, len, align);
if (!p)
return NULL;
tail->off = tail->off + len;
+
return p;
}
+
+/**
+ * iov_tail_clone() - Clone an iov tail into a new iovec array
+ *
+ * @dst_iov: Pointer to the destination array of struct iovec describing
+ * the scatter/gather I/O vector to shallow copy to.
+ * @dst_iov_cnt: Maximum number of elements in the destination iov array.
+ * @tail: Pointer to the source iov_tail
+ *
+ * Return: the number of destination iov entries filled, each referencing
+ * part of @tail, or a negative value if there is not enough room in the
+ * destination iov array
+ */
+ssize_t iov_tail_clone(struct iovec *dst_iov, size_t dst_iov_cnt,
+ struct iov_tail *tail)
+{
+ const struct iovec *iov = &tail->iov[0];
+ size_t iov_cnt = tail->cnt;
+ size_t offset = tail->off;
+ unsigned int i, j;
+
+ i = iov_skip_bytes(iov, iov_cnt, offset, &offset);
+
+ /* fill destination entries referencing a subset of the source iovec */
+ for (j = 0; i < iov_cnt && j < dst_iov_cnt; i++, j++) {
+ dst_iov[j].iov_base = (char *)iov[i].iov_base + offset;
+ dst_iov[j].iov_len = iov[i].iov_len - offset;
+ offset = 0;
+ }
+
+ if (j == dst_iov_cnt && i != iov_cnt)
+ return -1;
+
+ return j;
+}
diff --git a/iov.h b/iov.h
index 9855bf0..ba1fda5 100644
--- a/iov.h
+++ b/iov.h
@@ -70,38 +70,68 @@ struct iov_tail {
#define IOV_TAIL(iov_, cnt_, off_) \
(struct iov_tail){ .iov = (iov_), .cnt = (cnt_), .off = (off_) }
+/**
+ * IOV_TAIL_FROM_BUF() - Create a new IOV tail from a buffer
+ * @buf_: Buffer address to use in the iovec
+ * @len_: Buffer size
+ * @off_: Byte offset in the buffer where the tail begins
+ */
+#define IOV_TAIL_FROM_BUF(buf_, len_, off_) \
+ IOV_TAIL((&(const struct iovec){ .iov_base = (buf_), \
+ .iov_len = (len_) }), \
+ 1, \
+ (off_))
+
bool iov_tail_prune(struct iov_tail *tail);
size_t iov_tail_size(struct iov_tail *tail);
-void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align);
-void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align);
+bool iov_drop_header(struct iov_tail *tail, size_t len);
+void *iov_peek_header_(struct iov_tail *tail, void *v, size_t len, size_t align);
+void *iov_remove_header_(struct iov_tail *tail, void *v, size_t len, size_t align);
+ssize_t iov_tail_clone(struct iovec *dst_iov, size_t dst_iov_cnt,
+ struct iov_tail *tail);
/**
* IOV_PEEK_HEADER() - Get typed pointer to a header from an IOV tail
* @tail_: IOV tail to get header from
- * @type_: Data type of the header
+ * @var_: Temporary buffer of the type of the header to use if
+ * the memory in the iovec array is not contiguous.
*
* @tail_ may be pruned, but will represent the same bytes as before.
*
- * Returns: Pointer of type (@type_ *) located at the start of @tail_, NULL if
- * we can't get a contiguous and aligned pointer.
+ * Return: pointer of the same type as @var_ located at the start of
+ * @tail_, or to @var_ if the iovec memory is not contiguous; NULL if
+ * the header overruns the iovec.
*/
-#define IOV_PEEK_HEADER(tail_, type_) \
- ((type_ *)(iov_peek_header_((tail_), \
- sizeof(type_), __alignof__(type_))))
+
+#define IOV_PEEK_HEADER(tail_, var_) \
+ ((__typeof__(var_) *)(iov_peek_header_((tail_), &(var_), \
+ sizeof(var_), \
+ __alignof__(var_))))
/**
* IOV_REMOVE_HEADER() - Remove and return typed header from an IOV tail
* @tail_: IOV tail to remove header from (modified)
- * @type_: Data type of the header to remove
+ * @var_: Temporary buffer of the type of the header to use if
+ * the memory in the iovec array is not contiguous.
*
 * On success, @tail_ is updated so that it no longer includes the bytes of the
* returned header.
*
- * Returns: Pointer of type (@type_ *) located at the old start of @tail_, NULL
- * if we can't get a contiguous and aligned pointer.
+ * Return: pointer of the same type as @var_ located at the old start of
+ * @tail_, or to @var_ if the iovec memory is not contiguous; NULL if
+ * the header overruns the iovec.
+ */
+
+#define IOV_REMOVE_HEADER(tail_, var_) \
+ ((__typeof__(var_) *)(iov_remove_header_((tail_), &(var_), \
+ sizeof(var_), __alignof__(var_))))
+
+/** IOV_DROP_HEADER() - Remove a typed header from an IOV tail
+ * @tail_: IOV tail to remove header from (modified)
+ * @type_: Data type of the header to remove
+ *
+ * Return: true if the tail still contains any bytes, otherwise false
*/
-#define IOV_REMOVE_HEADER(tail_, type_) \
- ((type_ *)(iov_remove_header_((tail_), \
- sizeof(type_), __alignof__(type_))))
+#define IOV_DROP_HEADER(tail_, type_) iov_drop_header((tail_), sizeof(type_))
#endif /* IOVEC_H */
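The reworked IOV_PEEK_HEADER() and IOV_REMOVE_HEADER() macros take a caller-supplied storage variable instead of a type, so a header that straddles iovec entries, or is misaligned, can still be returned via a bounce copy. A minimal sketch of the new calling convention, with a hypothetical caller not taken from this patch:

	struct iov_tail data = IOV_TAIL_FROM_BUF(buf, len, 0);
	struct icmphdr ih_storage;
	const struct icmphdr *ih;

	/* Pointer into the tail if the header is contiguous and aligned,
	 * into ih_storage if a copy was needed, NULL if the header
	 * overruns the tail
	 */
	ih = IOV_PEEK_HEADER(&data, ih_storage);
	if (!ih)
		return -1;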
diff --git a/ip.c b/ip.c
index 2cc7f65..9a7f4c5 100644
--- a/ip.c
+++ b/ip.c
@@ -23,50 +23,47 @@
/**
* ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol
- * @p: Packet pool, packet number @idx has IPv6 header at @offset
- * @idx: Index of packet in pool
- * @offset: Pre-calculated IPv6 header offset
+ * @data: IPv6 packet
* @proto: Filled with L4 protocol number
* @dlen: Data length (payload excluding header extensions), set on return
*
- * Return: pointer to L4 header, NULL if not found
+ * Return: true if the L4 header is found and @data, @proto, @dlen are set,
+ * false on error. Outputs are indeterminate on failure.
*/
-char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
- size_t *dlen)
+bool ipv6_l4hdr(struct iov_tail *data, uint8_t *proto, size_t *dlen)
{
+ struct ipv6_opt_hdr o_storage;
const struct ipv6_opt_hdr *o;
+ struct ipv6hdr ip6h_storage;
const struct ipv6hdr *ip6h;
- char *base;
int hdrlen;
uint8_t nh;
- base = packet_get(p, idx, 0, 0, NULL);
- ip6h = packet_get(p, idx, offset, sizeof(*ip6h), dlen);
+ ip6h = IOV_REMOVE_HEADER(data, ip6h_storage);
if (!ip6h)
- return NULL;
-
- offset += sizeof(*ip6h);
+ return false;
nh = ip6h->nexthdr;
if (!IPV6_NH_OPT(nh))
goto found;
- while ((o = packet_get_try(p, idx, offset, sizeof(*o), dlen))) {
+ while ((o = IOV_PEEK_HEADER(data, o_storage))) {
nh = o->nexthdr;
hdrlen = (o->hdrlen + 1) * 8;
if (IPV6_NH_OPT(nh))
- offset += hdrlen;
+ iov_drop_header(data, hdrlen);
else
goto found;
}
- return NULL;
+ return false;
found:
- if (nh == 59)
- return NULL;
+ if (nh == IPPROTO_NONE)
+ return false;
+ *dlen = iov_tail_size(data);
*proto = nh;
- return base + offset;
+ return true;
}
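Since ipv6_l4hdr() now works on an iov_tail rather than a packet pool entry, callers no longer get a raw pointer back: the tail is advanced past the IPv6 header, and the L4 protocol and remaining length come back through the output arguments. A rough sketch of a hypothetical caller, assuming data is a struct iov_tail holding the IPv6 packet, not taken from this patch:

	uint8_t proto;
	size_t dlen;

	if (!ipv6_l4hdr(&data, &proto, &dlen))
		return -1;	/* no usable L4 header */

	/* proto holds the L4 protocol number, dlen the length of the
	 * data left in the tail
	 */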
diff --git a/ip.h b/ip.h
index 471c57e..5830b92 100644
--- a/ip.h
+++ b/ip.h
@@ -115,10 +115,9 @@ static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
ip6h->flow_lbl[2];
}
-char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
- size_t *dlen);
+bool ipv6_l4hdr(struct iov_tail *data, uint8_t *proto, size_t *dlen);
-/* IPv6 link-local all-nodes multicast adddress, ff02::1 */
+/* IPv6 link-local all-nodes multicast address, ff02::1 */
static const struct in6_addr in6addr_ll_all_nodes = {
.s6_addr = {
0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
diff --git a/isolation.c b/isolation.c
index c944fb3..bbcd23b 100644
--- a/isolation.c
+++ b/isolation.c
@@ -129,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep)
* additional layer of protection. Executing this requires
* CAP_SETPCAP, which we will have within our userns.
*
- * Note that dropping capabilites from the bounding set limits
+ * Note that dropping capabilities from the bounding set limits
* exec()ed processes, but does not remove them from the effective or
* permitted sets, so it doesn't reduce our own capabilities.
*/
@@ -174,8 +174,8 @@ static void clamp_caps(void)
* Should:
* - drop unneeded capabilities
* - close all open files except for standard streams and the one from --fd
- * Musn't:
- * - remove filesytem access (we need to access files during setup)
+ * Mustn't:
+ * - remove filesystem access (we need to access files during setup)
*/
void isolate_initial(int argc, char **argv)
{
@@ -194,7 +194,7 @@ void isolate_initial(int argc, char **argv)
*
* It's debatable whether it's useful to drop caps when we
* retain SETUID and SYS_ADMIN, but we might as well. We drop
- * further capabilites in isolate_user() and
+ * further capabilities in isolate_user() and
* isolate_prefork().
*/
keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
diff --git a/lineread.c b/lineread.c
index 0387f4a..4225de6 100644
--- a/lineread.c
+++ b/lineread.c
@@ -70,7 +70,7 @@ static ssize_t peek_line(struct lineread *lr, bool eof)
* @lr: Line reader state structure
* @line: Place a pointer to the next line in this variable
*
- * Return: Length of line read on success, 0 on EOF, negative on error
+ * Return: length of line read on success, 0 on EOF, negative on error
*/
ssize_t lineread_get(struct lineread *lr, char **line)
{
diff --git a/linux_dep.h b/linux_dep.h
index 240f50a..1d9e166 100644
--- a/linux_dep.h
+++ b/linux_dep.h
@@ -135,6 +135,12 @@ struct tcp_info_linux {
#define CLOSE_RANGE_UNSHARE (1U << 1)
#endif
+#ifndef TCP_REPAIR_ON
+#define TCP_REPAIR_ON 1
+#define TCP_REPAIR_OFF 0
+#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */
+#endif
+
__attribute__ ((weak))
/* cppcheck-suppress funcArgNamesDifferent */
int close_range(unsigned int first, unsigned int last, int flags) {
diff --git a/log.c b/log.c
index 6eda4c4..21e3673 100644
--- a/log.c
+++ b/log.c
@@ -35,7 +35,7 @@ static int log_sock = -1; /* Optional socket to system logger */
static char log_ident[BUFSIZ]; /* Identifier string for openlog() */
static int log_mask; /* Current log priority mask */
-static int log_file = -1; /* Optional log file descriptor */
+int log_file = -1; /* Optional log file descriptor */
static size_t log_size; /* Maximum log file size in bytes */
static size_t log_written; /* Currently used bytes in log file */
static size_t log_cut_size; /* Bytes to cut at start on rotation */
@@ -54,7 +54,8 @@ bool log_stderr = true; /* Not daemonised, no shell spawned */
* logtime() - Get the current time for logging purposes
* @ts: Buffer into which to store the timestamp
*
- * Return: pointer to @now, or NULL if there was an error retrieving the time
+ * Return: pointer to @ts on success, or NULL if there was
+ * an error retrieving the time
*/
static const struct timespec *logtime(struct timespec *ts)
{
@@ -281,6 +282,7 @@ static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
* @format: Message
* @ap: Variable argument list
*/
+/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
{
bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
@@ -401,7 +403,7 @@ void __setlogmask(int mask)
* logfile_init() - Open log file and write header with PID, version, path
* @name: Identifier for header: passt or pasta
* @path: Path to log file
- * @size: Maximum size of log file: log_cut_size is calculatd here
+ * @size: Maximum size of log file: log_cut_size is calculated here
*/
void logfile_init(const char *name, const char *path, size_t size)
{
diff --git a/log.h b/log.h
index 08aa88c..c8473c0 100644
--- a/log.h
+++ b/log.h
@@ -41,6 +41,7 @@ void logmsg_perror(int pri, const char *format, ...)
_exit(EXIT_FAILURE); \
} while (0)
+extern int log_file;
extern int log_trace;
extern bool log_conf_parsed;
extern bool log_stderr;
diff --git a/migrate.c b/migrate.c
index 0fca77b..48d63a0 100644
--- a/migrate.c
+++ b/migrate.c
@@ -96,8 +96,8 @@ static int seen_addrs_target_v1(struct ctx *c,
return 0;
}
-/* Stages for version 1 */
-static const struct migrate_stage stages_v1[] = {
+/* Stages for version 2 */
+static const struct migrate_stage stages_v2[] = {
{
.name = "observed addresses",
.source = seen_addrs_source_v1,
@@ -118,7 +118,11 @@ static const struct migrate_stage stages_v1[] = {
/* Supported encoding versions, from latest (most preferred) to oldest */
static const struct migrate_version versions[] = {
- { 1, stages_v1, },
+ { 2, stages_v2, },
+ /* v1 was released, but not widely used. It had bad endianness for the
+ * MSS and omitted timestamps, which meant it usually wouldn't work.
+ * Therefore we don't attempt to support compatibility with it.
+ */
{ 0 },
};
diff --git a/ndp.c b/ndp.c
index ded2081..eb090cd 100644
--- a/ndp.c
+++ b/ndp.c
@@ -328,21 +328,28 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
+ /* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);
}
/**
* ndp() - Check for NDP solicitations, reply as needed
* @c: Execution context
- * @ih: ICMPv6 header
* @saddr: Source IPv6 address
- * @p: Packet pool
+ * @data: Single packet with ICMPv6 header
*
* Return: 0 if not handled here, 1 if handled, -1 on failure
*/
-int ndp(const struct ctx *c, const struct icmp6hdr *ih,
- const struct in6_addr *saddr, const struct pool *p)
+int ndp(const struct ctx *c, const struct in6_addr *saddr,
+ struct iov_tail *data)
{
+ struct icmp6hdr ih_storage;
+ const struct icmp6hdr *ih;
+
+ ih = IOV_PEEK_HEADER(data, ih_storage);
+ if (!ih)
+ return -1;
+
if (ih->icmp6_type < RS || ih->icmp6_type > NA)
return 0;
@@ -350,9 +357,10 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih,
return 1;
if (ih->icmp6_type == NS) {
+ struct ndp_ns ns_storage;
const struct ndp_ns *ns;
- ns = packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
+ ns = IOV_REMOVE_HEADER(data, ns_storage);
if (!ns)
return -1;
diff --git a/ndp.h b/ndp.h
index 41c2000..b1dd5e8 100644
--- a/ndp.h
+++ b/ndp.h
@@ -8,8 +8,8 @@
struct icmp6hdr;
-int ndp(const struct ctx *c, const struct icmp6hdr *ih,
- const struct in6_addr *saddr, const struct pool *p);
+int ndp(const struct ctx *c, const struct in6_addr *saddr,
+ struct iov_tail *data);
void ndp_timer(const struct ctx *c, const struct timespec *now);
#endif /* NDP_H */
diff --git a/netlink.c b/netlink.c
index a052504..8f82e73 100644
--- a/netlink.c
+++ b/netlink.c
@@ -199,7 +199,7 @@ static struct nlmsghdr *nl_next(int s, char *buf, struct nlmsghdr *nh, ssize_t *
}
/**
- * nl_foreach - 'for' type macro to step through netlink response messages
+ * nl_foreach() - 'for' type macro to step through netlink response messages
* nl_foreach_oftype - as above, but only messages of expected type
* @nh: Steps through each response header (struct nlmsghdr *)
* @status: When loop exits indicates if there was an error (ssize_t)
@@ -1024,7 +1024,6 @@ int nl_link_get_mac(int s, unsigned int ifi, void *mac)
/**
* nl_link_set_mac() - Set link MAC address
* @s: Netlink socket
- * @ns: Use netlink socket in namespace
* @ifi: Interface index
* @mac: MAC address to set
*
diff --git a/packet.c b/packet.c
index 0330b54..890561b 100644
--- a/packet.c
+++ b/packet.c
@@ -23,6 +23,20 @@
#include "log.h"
/**
+ * get_vdev_memory() - Return a pointer to the memory regions of the pool
+ * @p: Packet pool
+ *
+ * Return: NULL if none, otherwise a pointer to the vdev_memory structure
+ */
+static struct vdev_memory *get_vdev_memory(const struct pool *p)
+{
+ if (p->buf_size)
+ return NULL;
+
+ return (struct vdev_memory *)p->buf;
+}
+
+/**
* packet_check_range() - Check if a memory range is valid for a pool
* @p: Packet pool
* @ptr: Start of desired data range
@@ -35,26 +49,41 @@
static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
const char *func, int line)
{
- if (p->buf_size == 0) {
+ struct vdev_memory *memory;
+
+ if (len > PACKET_MAX_LEN) {
+ debug("packet range length %zu (max %zu), %s:%i",
+ len, PACKET_MAX_LEN, func, line);
+ return -1;
+ }
+
+ memory = get_vdev_memory(p);
+ if (memory) {
int ret;
- ret = vu_packet_check_range((void *)p->buf, ptr, len);
+ ret = vu_packet_check_range(memory, ptr, len);
if (ret == -1)
- trace("cannot find region, %s:%i", func, line);
+ debug("cannot find region, %s:%i", func, line);
return ret;
}
if (ptr < p->buf) {
- trace("packet range start %p before buffer start %p, %s:%i",
+ debug("packet range start %p before buffer start %p, %s:%i",
(void *)ptr, (void *)p->buf, func, line);
return -1;
}
- if (ptr + len > p->buf + p->buf_size) {
- trace("packet range end %p after buffer end %p, %s:%i",
- (void *)(ptr + len), (void *)(p->buf + p->buf_size),
+ if (len > p->buf_size) {
+ debug("packet range length %zu larger than buffer %zu, %s:%i",
+ len, p->buf_size, func, line);
+ return -1;
+ }
+
+ if ((size_t)(ptr - p->buf) > p->buf_size - len) {
+ debug("packet range %p, len %zu after buffer end %p, %s:%i",
+ (void *)ptr, len, (void *)(p->buf + p->buf_size),
func, line);
return -1;
}
@@ -62,89 +91,110 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
return 0;
}
/**
+ * pool_can_fit() - Can a new packet fit in the pool?
+ * @p: Pointer to packet pool
+ * @data: Data to check for fit in the pool
+ *
+ * Return: true if @data can be added, false otherwise
+ */
+bool pool_can_fit(const struct pool *p, struct iov_tail *data)
+{
+ iov_tail_prune(data);
+
+ return p->count + data->cnt + (data->cnt > 1) <= p->size;
+}
+
+/**
* packet_add_do() - Add data as packet descriptor to given pool
* @p: Existing pool
- * @len: Length of new descriptor
- * @start: Start of data
- * @func: For tracing: name of calling function, NULL means no trace()
+ * @data: Data to add
+ * @func: For tracing: name of calling function
* @line: For tracing: caller line of function call
*/
-void packet_add_do(struct pool *p, size_t len, const char *start,
+void packet_add_do(struct pool *p, struct iov_tail *data,
const char *func, int line)
{
- size_t idx = p->count;
+ size_t idx = p->count, i, offset;
- if (idx >= p->size) {
- trace("add packet index %zu to pool with size %zu, %s:%i",
+ if (!pool_can_fit(p, data)) {
+ debug("add packet index %zu to pool with size %zu, %s:%i",
idx, p->size, func, line);
return;
}
- if (packet_check_range(p, start, len, func, line))
+ if (!iov_tail_prune(data))
return;
- if (len > UINT16_MAX) {
- trace("add packet length %zu, %s:%i", len, func, line);
- return;
+ if (data->cnt > 1) {
+ p->pkt[idx].iov_base = NULL;
+ p->pkt[idx].iov_len = data->cnt;
+ idx++;
}
- p->pkt[idx].iov_base = (void *)start;
- p->pkt[idx].iov_len = len;
+ offset = data->off;
+ for (i = 0; i < data->cnt; i++) {
+ const char *start;
+ size_t len;
+
+ len = data->iov[i].iov_len - offset;
+ start = (char *)data->iov[i].iov_base + offset;
+ offset = 0;
+
+ if (packet_check_range(p, start, len, func, line))
+ return;
+
+ p->pkt[idx].iov_base = (void *)start;
+ p->pkt[idx].iov_len = len;
+ idx++;
+ }
- p->count++;
+ p->count = idx;
}
/**
* packet_get_do() - Get data range from packet descriptor from given pool
* @p: Packet pool
* @idx: Index of packet descriptor in pool
- * @offset: Offset of data range in packet descriptor
- * @len: Length of desired data range
- * @left: Length of available data after range, set on return, can be NULL
+ * @data: IOV tail to store the address of the data (output)
* @func: For tracing: name of calling function, NULL means no trace()
* @line: For tracing: caller line of function call
*
- * Return: pointer to start of data range, NULL on invalid range or descriptor
+ * Return: false if packet index is invalid, true otherwise.
+ * If something is wrong with @data, we don't return at all (assert).
*/
-void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
- size_t len, size_t *left, const char *func, int line)
+bool packet_get_do(const struct pool *p, size_t idx,
+ struct iov_tail *data,
+ const char *func, int line)
{
- char *ptr;
+ size_t i;
- if (idx >= p->size || idx >= p->count) {
- if (func) {
- trace("packet %zu from pool size: %zu, count: %zu, "
- "%s:%i", idx, p->size, p->count, func, line);
- }
- return NULL;
- }
+ ASSERT_WITH_MSG(p->count <= p->size,
+ "Corrupted pool count: %zu, size: %zu, %s:%i",
+ p->count, p->size, func, line);
- if (len > UINT16_MAX) {
- if (func) {
- trace("packet data length %zu, %s:%i",
- len, func, line);
- }
- return NULL;
+ if (idx >= p->count) {
+ debug("packet %zu from pool size: %zu, count: %zu, "
+ "%s:%i", idx, p->size, p->count, func, line);
+ return false;
}
- if (len + offset > p->pkt[idx].iov_len) {
- if (func) {
- trace("data length %zu, offset %zu from length %zu, "
- "%s:%i", len, offset, p->pkt[idx].iov_len,
- func, line);
- }
- return NULL;
+ if (p->pkt[idx].iov_base) {
+ data->cnt = 1;
+ data->iov = &p->pkt[idx];
+ } else {
+ data->cnt = p->pkt[idx].iov_len;
+ data->iov = &p->pkt[idx + 1];
}
+ data->off = 0;
- ptr = (char *)p->pkt[idx].iov_base + offset;
-
- if (packet_check_range(p, ptr, len, func, line))
- return NULL;
-
- if (left)
- *left = p->pkt[idx].iov_len - offset - len;
+ for (i = 0; i < data->cnt; i++) {
+ ASSERT_WITH_MSG(!packet_check_range(p, data->iov[i].iov_base,
+ data->iov[i].iov_len,
+ func, line),
+ "Corrupt packet pool, %s:%i", func, line);
+ }
- return ptr;
+ return true;
}
/**
diff --git a/packet.h b/packet.h
index bdc07fe..ba8d5c2 100644
--- a/packet.h
+++ b/packet.h
@@ -6,10 +6,17 @@
#ifndef PACKET_H
#define PACKET_H
+#include <stdbool.h>
+#include "iov.h"
+#include "virtio.h"
+
+/* Maximum size of a single packet stored in pool, including headers */
+#define PACKET_MAX_LEN ((size_t)UINT16_MAX)
+
/**
* struct pool - Generic pool of packets stored in a buffer
* @buf: Buffer storing packet descriptors,
- * a struct vu_dev_region array for passt vhost-user mode
+ * a struct vdev_memory for passt vhost-user mode
* @buf_size: Total size of buffer,
* 0 for passt vhost-user mode
* @size: Number of usable descriptors for the pool
@@ -24,24 +31,21 @@ struct pool {
struct iovec pkt[];
};
-int vu_packet_check_range(void *buf, const char *ptr, size_t len);
-void packet_add_do(struct pool *p, size_t len, const char *start,
+int vu_packet_check_range(struct vdev_memory *memory,
+ const char *ptr, size_t len);
+void packet_add_do(struct pool *p, struct iov_tail *data,
const char *func, int line);
-void *packet_get_do(const struct pool *p, const size_t idx,
- size_t offset, size_t len, size_t *left,
- const char *func, int line);
+bool packet_get_do(const struct pool *p, const size_t idx,
+ struct iov_tail *data, const char *func, int line);
+bool pool_can_fit(const struct pool *p, struct iov_tail *data);
void pool_flush(struct pool *p);
-#define packet_add(p, len, start) \
- packet_add_do(p, len, start, __func__, __LINE__)
-
-#define packet_get(p, idx, offset, len, left) \
- packet_get_do(p, idx, offset, len, left, __func__, __LINE__)
-
-#define packet_get_try(p, idx, offset, len, left) \
- packet_get_do(p, idx, offset, len, left, NULL, 0)
+#define packet_add(p, data) \
+ packet_add_do(p, data, __func__, __LINE__)
+#define packet_get(p, idx, data) \
+ packet_get_do(p, idx, data, __func__, __LINE__)
-#define PACKET_POOL_DECL(_name, _size, _buf) \
+#define PACKET_POOL_DECL(_name, _size) \
struct _name ## _t { \
char *buf; \
size_t buf_size; \
@@ -57,19 +61,10 @@ struct _name ## _t { \
.size = _size, \
}
-#define PACKET_POOL(name, size, buf, buf_size) \
- PACKET_POOL_DECL(name, size, buf) name = \
- PACKET_POOL_INIT_NOCAST(size, buf, buf_size)
-
#define PACKET_INIT(name, size, buf, buf_size) \
(struct name ## _t) PACKET_POOL_INIT_NOCAST(size, buf, buf_size)
-#define PACKET_POOL_NOINIT(name, size, buf) \
- PACKET_POOL_DECL(name, size, buf) name ## _storage; \
+#define PACKET_POOL_NOINIT(name, size) \
+ PACKET_POOL_DECL(name, size) name ## _storage; \
static struct pool *name = (struct pool *)&name ## _storage
-
-#define PACKET_POOL_P(name, size, buf, buf_size) \
- PACKET_POOL(name ## _storage, size, buf, buf_size); \
- struct pool *name = (struct pool *)&name ## _storage
-
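+/* Sketch of declaring and filling a pool with the updated macros, mirroring
+ * tap_add_packet() in tap.c; POOL_SIZE is a placeholder, and process_pool()
+ * stands for whatever handler drains the pool (tap4_handler() in tap.c):
+ *
+ *	static PACKET_POOL_NOINIT(pool, POOL_SIZE);
+ *	...
+ *	struct iov_tail data = IOV_TAIL_FROM_BUF(buf, l2len, 0);
+ *
+ *	if (!pool_can_fit(pool, &data)) {
+ *		process_pool(c, pool, now);
+ *		pool_flush(pool);
+ *	}
+ *	packet_add(pool, &data);
+ */
+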
#endif /* PACKET_H */
diff --git a/passt-repair.1 b/passt-repair.1
index 7c1b140..e65aadd 100644
--- a/passt-repair.1
+++ b/passt-repair.1
@@ -16,13 +16,17 @@
.B passt-repair
is a privileged helper setting and clearing repair mode on TCP sockets on behalf
of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain
-socket, specified by \fIPATH\fR.
+socket.
It can be used to migrate TCP connections between guests without granting
additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections,
\fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR
capability (see \fBcapabilities\fR(7)) to be set or cleared.
+If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to
+connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file
+ending with \fI.repair\fR appears in it, and then attempts to connect to it.
+
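+For example, with purely illustrative paths:
+
+.nf
+	passt-repair /run/user/1000/passt.repair
+	passt-repair /run/user/1000/
+.fi
+
+The first form connects to the given socket directly; the second waits for a
+socket whose name ends in \fI.repair\fR to appear in the directory, then
+connects to it.
+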
.SH PROTOCOL
\fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via
diff --git a/passt-repair.c b/passt-repair.c
index e0c366e..c3c140f 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -16,11 +16,14 @@
* off. Reply by echoing the command. Exit on EOF.
*/
+#include <sys/inotify.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/socket.h>
+#include <sys/stat.h>
#include <sys/un.h>
#include <errno.h>
+#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
@@ -37,8 +40,11 @@
#include <linux/seccomp.h>
#include "seccomp_repair.h"
+#include "linux_dep.h"
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+#define REPAIR_EXT ".repair"
+#define REPAIR_EXT_LEN strlen(REPAIR_EXT)
/**
* main() - Entry point and whole program with loop
@@ -51,6 +57,9 @@
* #syscalls:repair socket s390x:socketcall i686:socketcall
* #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
* #syscalls:repair sendto sendmsg arm:send ppc64le:send
+ * #syscalls:repair stat|statx stat64|statx statx
+ * #syscalls:repair fstat|fstat64 newfstatat|fstatat64
+ * #syscalls:repair inotify_init1 inotify_add_watch
*/
int main(int argc, char **argv)
{
@@ -58,12 +67,14 @@ int main(int argc, char **argv)
__attribute__ ((aligned(__alignof__(struct cmsghdr))));
struct sockaddr_un a = { AF_UNIX, "" };
int fds[SCM_MAX_FD], s, ret, i, n = 0;
+ bool inotify_dir = false;
struct sock_fprog prog;
int8_t cmd = INT8_MAX;
struct cmsghdr *cmsg;
struct msghdr msg;
struct iovec iov;
size_t cmsg_len;
+ struct stat sb;
int op;
prctl(PR_SET_DUMPABLE, 0);
@@ -73,7 +84,7 @@ int main(int argc, char **argv)
prog.filter = filter_repair;
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
- fprintf(stderr, "Failed to apply seccomp filter");
+ fprintf(stderr, "Failed to apply seccomp filter\n");
_exit(1);
}
@@ -90,19 +101,96 @@ int main(int argc, char **argv)
_exit(2);
}
- ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
+ if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+ fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
+ _exit(1);
+ }
+
+ if ((stat(argv[1], &sb))) {
+ fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno);
+ _exit(1);
+ }
+
+ if ((sb.st_mode & S_IFMT) == S_IFDIR) {
+ char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
+ __attribute__ ((aligned(__alignof__(struct inotify_event))));
+ const struct inotify_event *ev = NULL;
+ char path[PATH_MAX + 1];
+ bool found = false;
+ ssize_t n;
+ int fd;
+
+ if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
+ fprintf(stderr, "inotify_init1: %i\n", errno);
+ _exit(1);
+ }
+
+ if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) {
+ fprintf(stderr, "inotify_add_watch: %i\n", errno);
+ _exit(1);
+ }
+
+ do {
+ char *p;
+
+ n = read(fd, buf, sizeof(buf));
+ if (n < 0) {
+ fprintf(stderr, "inotify read: %i\n", errno);
+ _exit(1);
+ }
+ buf[n - 1] = '\0';
+
+ if (n < (ssize_t)sizeof(*ev)) {
+ fprintf(stderr, "Short inotify read: %zi\n", n);
+ continue;
+ }
+
+ for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
+ ev = (const struct inotify_event *)p;
+
+ if (ev->len >= REPAIR_EXT_LEN &&
+ !memcmp(ev->name +
+ strnlen(ev->name, ev->len) -
+ REPAIR_EXT_LEN,
+ REPAIR_EXT, REPAIR_EXT_LEN)) {
+ found = true;
+ break;
+ }
+ }
+ } while (!found);
+
+ if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
+ fprintf(stderr, "Invalid filename from inotify\n");
+ _exit(1);
+ }
+
+ snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);
+ if ((stat(path, &sb))) {
+ fprintf(stderr, "Can't stat() %s: %i\n", path, errno);
+ _exit(1);
+ }
+
+ ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path);
+ inotify_dir = true;
+ } else {
+ ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
+ }
+
if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
- fprintf(stderr, "Invalid socket path: %s\n", argv[1]);
+ fprintf(stderr, "Invalid socket path\n");
_exit(2);
}
- if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
- fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
- _exit(1);
+ if ((sb.st_mode & S_IFMT) != S_IFSOCK) {
+ fprintf(stderr, "%s is not a socket\n", a.sun_path);
+ _exit(2);
}
- if (connect(s, (struct sockaddr *)&a, sizeof(a))) {
- fprintf(stderr, "Failed to connect to %s: %s\n", argv[1],
+ while (connect(s, (struct sockaddr *)&a, sizeof(a))) {
+ if (inotify_dir && errno == ECONNREFUSED)
+ continue;
+
+ fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path,
strerror(errno));
_exit(1);
}
@@ -158,8 +246,8 @@ loop:
for (i = 0; i < n; i++) {
if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &op, sizeof(op))) {
fprintf(stderr,
- "Setting TCP_REPAIR to %i on socket %i: %s", op,
- fds[i], strerror(errno));
+ "Setting TCP_REPAIR to %i on socket %i: %s\n",
+ op, fds[i], strerror(errno));
_exit(1);
}
diff --git a/passt.1 b/passt.1
index 60066c2..cef98b2 100644
--- a/passt.1
+++ b/passt.1
@@ -440,6 +440,30 @@ chosen for the hypervisor UNIX domain socket. No socket is created if not in
\-\-vhost-user mode.
.TP
+.BR \-\-migrate-exit (DEPRECATED)
+Exit after a completed migration as source. By default, \fBpasst\fR keeps
+running and the migrated guest can continue using its connection, or a new guest
+can connect.
+
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
+future version. It is not expected to be of any use, and it simply reflects a
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
+below.
+
+.TP
+.BR \-\-migrate-no-linger (DEPRECATED)
+Close TCP sockets on the source instance once migration completes.
+
+By default, sockets are kept open and events on data sockets are ignored, so
+that anything reaching those sockets after the source has migrated is silently
+discarded; this avoids connection resets in case data is received after
+migration.
+
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
+future version. It is not expected to be of any use, and it simply reflects a
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
+below.
+
+.TP
.BR \-F ", " \-\-fd " " \fIFD
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
in the parent process and \fBpasst\fR inherits it when run as a child. This
@@ -454,6 +478,11 @@ is closed.
Quit after handling a single client connection, that is, once the client closes
the socket, or once we get a socket error.
+\fBNote\fR: this option has no effect after \fBpasst\fR completes a migration as
+source, because, in that case, exiting would close sockets for active
+connections, which would in turn cause connection resets if any further data is
+received. See also the description of \fI\-\-migrate-no-linger\fR.
+
.TP
.BR \-t ", " \-\-tcp-ports " " \fIspec
Configure TCP port forwarding to guest. \fIspec\fR can be one of:
diff --git a/passt.c b/passt.c
index 868842b..a4ec115 100644
--- a/passt.c
+++ b/passt.c
@@ -68,7 +68,7 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
- [EPOLL_TYPE_UDP_REPLY] = "UDP reply socket",
+ [EPOLL_TYPE_UDP] = "UDP flow socket",
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
@@ -170,6 +170,7 @@ static void exit_handler(int signal)
{
(void)signal;
+ fsync_pcap_and_log();
_exit(EXIT_SUCCESS);
}
@@ -191,7 +192,6 @@ int main(int argc, char **argv)
{
struct epoll_event events[EPOLL_EVENTS];
int nfds, i, devnull_fd = -1;
- char argv0[PATH_MAX], *name;
struct ctx c = { 0 };
struct rlimit limit;
struct timespec now;
@@ -213,27 +213,18 @@ int main(int argc, char **argv)
sigaction(SIGTERM, &sa, NULL);
sigaction(SIGQUIT, &sa, NULL);
- if (argc < 1)
- _exit(EXIT_FAILURE);
+ c.mode = conf_mode(argc, argv);
- strncpy(argv0, argv[0], PATH_MAX - 1);
- name = basename(argv0);
- if (strstr(name, "pasta")) {
+ if (c.mode == MODE_PASTA) {
sa.sa_handler = pasta_child_handler;
if (sigaction(SIGCHLD, &sa, NULL))
die_perror("Couldn't install signal handlers");
-
- c.mode = MODE_PASTA;
- } else if (strstr(name, "passt")) {
- c.mode = MODE_PASST;
- } else {
- _exit(EXIT_FAILURE);
}
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
die_perror("Couldn't set disposition for SIGPIPE");
- madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
+ madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
if (c.epollfd == -1)
@@ -349,8 +340,8 @@ loop:
case EPOLL_TYPE_UDP_LISTEN:
udp_listen_sock_handler(&c, ref, eventmask, &now);
break;
- case EPOLL_TYPE_UDP_REPLY:
- udp_reply_sock_handler(&c, ref, eventmask, &now);
+ case EPOLL_TYPE_UDP:
+ udp_sock_handler(&c, ref, eventmask, &now);
break;
case EPOLL_TYPE_PING:
icmp_sock_handler(&c, ref);
diff --git a/passt.h b/passt.h
index 28d1389..3ffc19f 100644
--- a/passt.h
+++ b/passt.h
@@ -69,12 +69,9 @@ union epoll_ref {
static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
"epoll_ref must have same size as epoll_data");
-#define TAP_BUF_BYTES \
- ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
-#define TAP_MSGS \
- DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
+/* Large enough for ~128 maximum size frames */
+#define PKT_BUF_BYTES (8UL << 20)
-#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0)
extern char pkt_buf [PKT_BUF_BYTES];
extern char *epoll_type_str[];
@@ -211,7 +208,7 @@ struct ip6_ctx {
* @guest_mac: MAC address of guest or namespace, seen or configured
* @hash_secret: 128-bit secret for siphash functions
* @ifi4: Template interface for IPv4, -1: none, 0: IPv4 disabled
- * @ip: IPv4 configuration
+ * @ip4: IPv4 configuration
* @dns_search: DNS search list
* @hostname: Guest hostname
* @fqdn: Guest FQDN
@@ -244,6 +241,8 @@ struct ip6_ctx {
* @device_state_fd: Device state migration channel
* @device_state_result: Device state migration result
* @migrate_target: Are we the target, on the next migration request?
+ * @migrate_no_linger: Close sockets as we migrate them
+ * @migrate_exit: Exit (on source) once migration is complete
*/
struct ctx {
enum passt_modes mode;
@@ -321,6 +320,8 @@ struct ctx {
int device_state_fd;
int device_state_result;
bool migrate_target;
+ bool migrate_no_linger;
+ bool migrate_exit;
};
void proto_update_l2_buf(const unsigned char *eth_d,
diff --git a/pasta.c b/pasta.c
index fa3e7de..687406b 100644
--- a/pasta.c
+++ b/pasta.c
@@ -57,21 +57,21 @@ int pasta_child_pid;
/**
* pasta_child_handler() - Exit once shell exits (if we started it), reap clones
- * @signal: Unused, handler deals with SIGCHLD only
+ * @signal: Signal number; this handler deals with SIGCHLD only
*/
void pasta_child_handler(int signal)
{
int errno_save = errno;
siginfo_t infop;
- (void)signal;
-
if (signal != SIGCHLD)
return;
if (pasta_child_pid &&
!waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) {
if (infop.si_pid == pasta_child_pid) {
+ fsync_pcap_and_log();
+
if (infop.si_code == CLD_EXITED)
_exit(infop.si_status);
@@ -498,17 +498,23 @@ void pasta_netns_quit_init(const struct ctx *c)
*/
void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
{
- char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
- const struct inotify_event *in_ev = (struct inotify_event *)buf;
+ char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
+ __attribute__ ((aligned(__alignof__(struct inotify_event))));
+ const struct inotify_event *ev;
+ ssize_t n;
+ char *p;
- if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev))
+ if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev))
return;
- if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base)))
- return;
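+	/* A single read() can return several variable-length events: each is
+	 * a struct inotify_event header followed by ev->len bytes holding the
+	 * name.
+	 */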
+ for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
+ ev = (const struct inotify_event *)p;
- info("Namespace %s is gone, exiting", c->netns_base);
- _exit(EXIT_SUCCESS);
+ if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) {
+ info("Namespace %s is gone, exiting", c->netns_base);
+ _exit(EXIT_SUCCESS);
+ }
+ }
}
/**
diff --git a/pcap.c b/pcap.c
index 3d623cf..54fba5c 100644
--- a/pcap.c
+++ b/pcap.c
@@ -33,32 +33,11 @@
#include "log.h"
#include "pcap.h"
#include "iov.h"
+#include "tap.h"
#define PCAP_VERSION_MINOR 4
-static int pcap_fd = -1;
-
-/* See pcap.h from libpcap, or pcap-savefile(5) */
-static const struct {
- uint32_t magic;
-#define PCAP_MAGIC 0xa1b2c3d4
-
- uint16_t major;
-#define PCAP_VERSION_MAJOR 2
-
- uint16_t minor;
-#define PCAP_VERSION_MINOR 4
-
- int32_t thiszone;
- uint32_t sigfigs;
- uint32_t snaplen;
-
- uint32_t linktype;
-#define PCAP_LINKTYPE_ETHERNET 1
-} pcap_hdr = {
- PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU,
- PCAP_LINKTYPE_ETHERNET
-};
+int pcap_fd = -1;
struct pcap_pkthdr {
uint32_t tv_sec;
@@ -73,8 +52,6 @@ struct pcap_pkthdr {
* @iovcnt: Number of buffers (@iov entries) in frame
* @offset: Byte offset of the L2 headers within @iov
* @now: Timestamp
- *
- * Returns: 0 on success, -errno on error writing to the file
*/
static void pcap_frame(const struct iovec *iov, size_t iovcnt,
size_t offset, const struct timespec *now)
@@ -97,6 +74,7 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
* @pkt: Pointer to data buffer, including L2 headers
* @l2len: L2 frame length
*/
+/* cppcheck-suppress unusedFunction */
void pcap(const char *pkt, size_t l2len)
{
struct iovec iov = { (char *)pkt, l2len };
@@ -134,10 +112,9 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
}
-/*
- * pcap_iov - Write packet data described by an I/O vector
+/**
+ * pcap_iov() - Write packet data described by an I/O vector
* to a pcap file descriptor.
- *
* @iov: Pointer to the array of struct iovec describing the I/O vector
* containing packet data to write, including L2 header
* @iovcnt: Number of buffers (@iov entries)
@@ -162,6 +139,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
*/
void pcap_init(struct ctx *c)
{
+ /* See pcap.h from libpcap, or pcap-savefile(5) */
+#define PCAP_MAGIC 0xa1b2c3d4
+#define PCAP_VERSION_MAJOR 2
+#define PCAP_VERSION_MINOR 4
+#define PCAP_LINKTYPE_ETHERNET 1
+ const struct {
+ uint32_t magic;
+ uint16_t major;
+ uint16_t minor;
+
+ int32_t thiszone;
+ uint32_t sigfigs;
+ uint32_t snaplen;
+
+ uint32_t linktype;
+ } pcap_hdr = {
+ .magic = PCAP_MAGIC,
+ .major = PCAP_VERSION_MAJOR,
+ .minor = PCAP_VERSION_MINOR,
+ .snaplen = tap_l2_max_len(c),
+ .linktype = PCAP_LINKTYPE_ETHERNET
+ };
+
if (pcap_fd != -1)
return;
diff --git a/pcap.h b/pcap.h
index 9795f2e..2aeb53e 100644
--- a/pcap.h
+++ b/pcap.h
@@ -6,6 +6,8 @@
#ifndef PCAP_H
#define PCAP_H
+extern int pcap_fd;
+
void pcap(const char *pkt, size_t l2len);
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
size_t offset);
diff --git a/repair.c b/repair.c
index 3ee089f..f6b1bf3 100644
--- a/repair.c
+++ b/repair.c
@@ -27,6 +27,10 @@
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */
+#define REPAIR_ACCEPT_TIMEOUT_MS 10
+#define REPAIR_ACCEPT_TIMEOUT_US (REPAIR_ACCEPT_TIMEOUT_MS * 1000)
+
/* Pending file descriptors for next repair_flush() call, or command change */
static int repair_fds[SCM_MAX_FD];
@@ -64,18 +68,21 @@ void repair_sock_init(const struct ctx *c)
* repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
* @c: Execution context
* @events: epoll events
+ *
+ * Return: 0 on valid event with new connected socket, error code on failure
*/
-void repair_listen_handler(struct ctx *c, uint32_t events)
+int repair_listen_handler(struct ctx *c, uint32_t events)
{
union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
struct epoll_event ev = { 0 };
struct ucred ucred;
socklen_t len;
+ int rc;
if (events != EPOLLIN) {
debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
events);
- return;
+ return EINVAL;
}
len = sizeof(ucred);
@@ -86,18 +93,19 @@ void repair_listen_handler(struct ctx *c, uint32_t events)
SOCK_NONBLOCK);
if (discard == -1)
- return;
+ return errno;
if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
close(discard);
- return;
+ return EEXIST;
}
if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) {
+ rc = errno;
debug_perror("accept4() on TCP_REPAIR helper listening socket");
- return;
+ return rc;
}
if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
@@ -107,10 +115,14 @@ void repair_listen_handler(struct ctx *c, uint32_t events)
ev.events = EPOLLHUP | EPOLLET;
ev.data.u64 = ref.u64;
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) {
+ rc = errno;
debug_perror("epoll_ctl() on TCP_REPAIR helper socket");
close(c->fd_repair);
c->fd_repair = -1;
+ return rc;
}
+
+ return 0;
}
/**
@@ -139,6 +151,44 @@ void repair_handler(struct ctx *c, uint32_t events)
}
/**
+ * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
+ * @c: Execution context
+ *
+ * Return: 0 on success or if already connected, error code on failure
+ */
+int repair_wait(struct ctx *c)
+{
+ struct timeval tv = { .tv_sec = 0,
+ .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) };
+ int rc;
+
+ static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000,
+ ".tv_usec is greater than 1000 * 1000");
+
+ if (c->fd_repair >= 0)
+ return 0;
+
+ if (c->fd_repair_listen == -1)
+ return ENOENT;
+
+ if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+ &tv, sizeof(tv))) {
+ rc = errno;
+ err_perror("Set timeout on TCP_REPAIR listening socket");
+ return rc;
+ }
+
+ rc = repair_listen_handler(c, EPOLLIN);
+
+ tv.tv_usec = 0;
+ if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+ &tv, sizeof(tv)))
+ err_perror("Clear timeout on TCP_REPAIR listening socket");
+
+ return rc;
+}
+
+/**
* repair_flush() - Flush current set of sockets to helper, with current command
* @c: Execution context
*
diff --git a/repair.h b/repair.h
index de279d6..ab27e67 100644
--- a/repair.h
+++ b/repair.h
@@ -7,9 +7,10 @@
#define REPAIR_H
void repair_sock_init(const struct ctx *c);
-void repair_listen_handler(struct ctx *c, uint32_t events);
+int repair_listen_handler(struct ctx *c, uint32_t events);
void repair_handler(struct ctx *c, uint32_t events);
void repair_close(struct ctx *c);
+int repair_wait(struct ctx *c);
int repair_flush(struct ctx *c);
int repair_set(struct ctx *c, int s, int cmd);
diff --git a/siphash.h b/siphash.h
index a2ca2a9..e760236 100644
--- a/siphash.h
+++ b/siphash.h
@@ -99,7 +99,7 @@ static inline void siphash_feed(struct siphash_state *state, uint64_t in)
}
/**
- * siphash_final - Finalize SipHash calculations
+ * siphash_final() - Finalize SipHash calculations
* @v: siphash state (4 x 64-bit integers)
* @len: Total length of input data
* @tail: Final data for the hash (<= 7 bytes)
diff --git a/tap.c b/tap.c
index 4541f51..7ba6399 100644
--- a/tap.c
+++ b/tap.c
@@ -62,14 +62,67 @@
#include "vhost_user.h"
#include "vu_common.h"
+/* Maximum allowed frame lengths (including L2 header) */
+
+/* Verify that an L2 frame length limit is large enough to contain the header,
+ * but small enough to fit in the packet pool
+ */
+#define CHECK_FRAME_LEN(len) \
+ static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \
+ #len " has bad value")
+
+CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
+CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
+CHECK_FRAME_LEN(L2_MAX_LEN_VU);
+
+/* We try to size the packet pools so that we can use a single batch for the
+ * entire packet buffer. This might be exceeded for vhost-user, though, which
+ * uses its own buffers rather than pkt_buf.
+ *
+ * This is just a tuning parameter: the code will work with slightly more
+ * overhead if it's incorrect. So, we estimate based on the minimum practical
+ * frame size - an empty UDP datagram - rather than the minimum theoretical
+ * frame size.
+ *
+ * FIXME: Profile to work out how big this actually needs to be to amortise
+ * per-batch syscall overheads
+ */
+#define TAP_MSGS_IP4 \
+ DIV_ROUND_UP(sizeof(pkt_buf), \
+ ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))
+#define TAP_MSGS_IP6 \
+ DIV_ROUND_UP(sizeof(pkt_buf), \
+ ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr))
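+
+/* As a rough illustration only: with the 8 MiB pkt_buf defined in passt.h,
+ * these work out to about 200,000 IPv4 and 135,000 IPv6 descriptors
+ * (8 MiB divided by 42- and 62-byte minimal UDP frames, respectively).
+ */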
+
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
-static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
-static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
+static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4);
+static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6);
#define TAP_SEQS 128 /* Different L4 tuples in one batch */
#define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */
/**
+ * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
+ * @c: Execution context
+ *
+ * Return: maximum frame length, in bytes, for the current mode
+ */
+unsigned long tap_l2_max_len(const struct ctx *c)
+{
+ /* NOLINTBEGIN(bugprone-branch-clone): values can be the same */
+ switch (c->mode) {
+ case MODE_PASST:
+ return L2_MAX_LEN_PASST;
+ case MODE_PASTA:
+ return L2_MAX_LEN_PASTA;
+ case MODE_VU:
+ return L2_MAX_LEN_VU;
+ }
+ /* NOLINTEND(bugprone-branch-clone) */
+ ASSERT(0);
+
+ return 0; /* Unreachable, for cppcheck's sake */
+}
+
+/**
* tap_send_single() - Send a single frame
* @c: Execution context
* @data: Packet buffer
@@ -502,12 +555,13 @@ void eth_update_mac(struct ethhdr *eh,
memcpy(eh->h_source, eth_s, sizeof(eh->h_source));
}
-PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
+PACKET_POOL_DECL(pool_l4, UIO_MAXIOV);
/**
* struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
* @msgs: Count of messages in sequence
* @protocol: Protocol number
+ * @ttl: Time to live
* @source: Source port
* @dest: Destination port
* @saddr: Source address
@@ -516,6 +570,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
*/
static struct tap4_l4_t {
uint8_t protocol;
+ uint8_t ttl;
uint16_t source;
uint16_t dest;
@@ -535,6 +590,7 @@ static struct tap4_l4_t {
* @dest: Destination port
* @saddr: Source address
* @daddr: Destination address
+ * @hop_limit: Hop limit
* @msg: Array of messages that can be handled in a single call
*/
static struct tap6_l4_t {
@@ -547,6 +603,8 @@ static struct tap6_l4_t {
struct in6_addr saddr;
struct in6_addr daddr;
+ uint8_t hop_limit;
+
struct pool_l4_t p;
} tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
@@ -648,26 +706,31 @@ static int tap4_handler(struct ctx *c, const struct pool *in,
i = 0;
resume:
for (seq_count = 0, seq = NULL; i < in->count; i++) {
- size_t l2len, l3len, hlen, l4len;
+ size_t l3len, hlen, l4len;
+ struct ethhdr eh_storage;
+ struct iphdr iph_storage;
+ struct udphdr uh_storage;
const struct ethhdr *eh;
const struct udphdr *uh;
+ struct iov_tail data;
struct iphdr *iph;
- const char *l4h;
- packet_get(in, i, 0, 0, &l2len);
+ if (!packet_get(in, i, &data))
+ continue;
- eh = packet_get(in, i, 0, sizeof(*eh), &l3len);
+ eh = IOV_PEEK_HEADER(&data, eh_storage);
if (!eh)
continue;
if (ntohs(eh->h_proto) == ETH_P_ARP) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
-
- packet_add(pkt, l2len, (char *)eh);
- arp(c, pkt);
+ arp(c, &data);
continue;
}
- iph = packet_get(in, i, sizeof(*eh), sizeof(*iph), NULL);
+ if (!iov_drop_header(&data, sizeof(*eh)))
+ continue;
+ l3len = iov_tail_size(&data);
+
+ iph = IOV_PEEK_HEADER(&data, iph_storage);
if (!iph)
continue;
@@ -695,34 +758,32 @@ resume:
if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr)
c->ip4.addr_seen.s_addr = iph->saddr;
- l4h = packet_get(in, i, sizeof(*eh) + hlen, l4len, NULL);
- if (!l4h)
+ if (!iov_drop_header(&data, hlen))
+ continue;
+ if (iov_tail_size(&data) != l4len)
continue;
if (iph->protocol == IPPROTO_ICMP) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
-
if (c->no_icmp)
continue;
tap_packet_debug(iph, NULL, NULL, 0, NULL, 1);
- packet_add(pkt, l4len, l4h);
icmp_tap_handler(c, PIF_TAP, AF_INET,
&iph->saddr, &iph->daddr,
- pkt, now);
+ &data, now);
continue;
}
- uh = packet_get(in, i, sizeof(*eh) + hlen, sizeof(*uh), NULL);
+ uh = IOV_PEEK_HEADER(&data, uh_storage);
if (!uh)
continue;
if (iph->protocol == IPPROTO_UDP) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
+ struct iov_tail eh_data;
- packet_add(pkt, l2len, (char *)eh);
- if (dhcp(c, pkt))
+ packet_get(in, i, &eh_data);
+ if (dhcp(c, &eh_data))
continue;
}
@@ -735,7 +796,8 @@ resume:
#define L4_MATCH(iph, uh, seq) \
((seq)->protocol == (iph)->protocol && \
(seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \
- (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
+ (seq)->saddr.s_addr == (iph)->saddr && \
+ (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
#define L4_SET(iph, uh, seq) \
do { \
@@ -744,6 +806,7 @@ resume:
(seq)->dest = (uh)->dest; \
(seq)->saddr.s_addr = (iph)->saddr; \
(seq)->daddr.s_addr = (iph)->daddr; \
+ (seq)->ttl = (iph)->ttl; \
} while (0)
if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
@@ -770,7 +833,7 @@ resume:
#undef L4_SET
append:
- packet_add((struct pool *)&seq->p, l4len, l4h);
+ packet_add((struct pool *)&seq->p, &data);
}
for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) {
@@ -792,7 +855,7 @@ append:
for (k = 0; k < p->count; )
k += udp_tap_handler(c, PIF_TAP, AF_INET,
&seq->saddr, &seq->daddr,
- p, k, now);
+ seq->ttl, p, k, now);
}
}
@@ -824,20 +887,28 @@ resume:
for (seq_count = 0, seq = NULL; i < in->count; i++) {
size_t l4len, plen, check;
struct in6_addr *saddr, *daddr;
+ struct ipv6hdr ip6h_storage;
+ struct ethhdr eh_storage;
+ struct udphdr uh_storage;
const struct ethhdr *eh;
const struct udphdr *uh;
+ struct iov_tail data;
struct ipv6hdr *ip6h;
uint8_t proto;
- char *l4h;
- eh = packet_get(in, i, 0, sizeof(*eh), NULL);
+ if (!packet_get(in, i, &data))
+ return -1;
+
+ eh = IOV_REMOVE_HEADER(&data, eh_storage);
if (!eh)
continue;
- ip6h = packet_get(in, i, sizeof(*eh), sizeof(*ip6h), &check);
+ ip6h = IOV_PEEK_HEADER(&data, ip6h_storage);
if (!ip6h)
continue;
+ check = iov_tail_size(&data) - sizeof(*ip6h);
+
saddr = &ip6h->saddr;
daddr = &ip6h->daddr;
@@ -845,7 +916,7 @@ resume:
if (plen != check)
continue;
- if (!(l4h = ipv6_l4hdr(in, i, sizeof(*eh), &proto, &l4len)))
+ if (!ipv6_l4hdr(&data, &proto, &l4len))
continue;
if (IN6_IS_ADDR_LOOPBACK(saddr) || IN6_IS_ADDR_LOOPBACK(daddr)) {
@@ -871,7 +942,7 @@ resume:
}
if (proto == IPPROTO_ICMPV6) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
+ struct iov_tail ndp_data;
if (c->no_icmp)
continue;
@@ -879,28 +950,27 @@ resume:
if (l4len < sizeof(struct icmp6hdr))
continue;
- packet_add(pkt, l4len, l4h);
-
- if (ndp(c, (struct icmp6hdr *)l4h, saddr, pkt))
+ ndp_data = data;
+ if (ndp(c, saddr, &ndp_data))
continue;
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
icmp_tap_handler(c, PIF_TAP, AF_INET6,
- saddr, daddr, pkt, now);
+ saddr, daddr, &data, now);
continue;
}
if (l4len < sizeof(*uh))
continue;
- uh = (struct udphdr *)l4h;
+ uh = IOV_PEEK_HEADER(&data, uh_storage);
+ if (!uh)
+ continue;
if (proto == IPPROTO_UDP) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
-
- packet_add(pkt, l4len, l4h);
+ struct iov_tail uh_data = data;
- if (dhcpv6(c, pkt, saddr, daddr))
+ if (dhcpv6(c, &uh_data, saddr, daddr))
continue;
}
@@ -915,7 +985,8 @@ resume:
(seq)->dest == (uh)->dest && \
(seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \
IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \
- IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
+ IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \
+ (seq)->hop_limit == (ip6h)->hop_limit)
#define L4_SET(ip6h, proto, uh, seq) \
do { \
@@ -925,6 +996,7 @@ resume:
(seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \
(seq)->saddr = *saddr; \
(seq)->daddr = *daddr; \
+ (seq)->hop_limit = (ip6h)->hop_limit; \
} while (0)
if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
@@ -952,7 +1024,7 @@ resume:
#undef L4_SET
append:
- packet_add((struct pool *)&seq->p, l4len, l4h);
+ packet_add((struct pool *)&seq->p, &data);
}
for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) {
@@ -975,7 +1047,7 @@ append:
for (k = 0; k < p->count; )
k += udp_tap_handler(c, PIF_TAP, AF_INET6,
&seq->saddr, &seq->daddr,
- p, k, now);
+ seq->hop_limit, p, k, now);
}
}
@@ -1008,16 +1080,20 @@ void tap_handler(struct ctx *c, const struct timespec *now)
/**
* tap_add_packet() - Queue/capture packet, update notion of guest MAC address
* @c: Execution context
- * @l2len: Total L2 packet length
- * @p: Packet buffer
+ * @data: Packet to add to the pool
+ * @now: Current timestamp
*/
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
+void tap_add_packet(struct ctx *c, struct iov_tail *data,
+ const struct timespec *now)
{
+ struct ethhdr eh_storage;
const struct ethhdr *eh;
- pcap(p, l2len);
+ pcap_iov(data->iov, data->cnt, data->off);
- eh = (struct ethhdr *)p;
+ eh = IOV_PEEK_HEADER(data, eh_storage);
+ if (!eh)
+ return;
if (memcmp(c->guest_mac, eh->h_source, ETH_ALEN)) {
memcpy(c->guest_mac, eh->h_source, ETH_ALEN);
@@ -1027,10 +1103,18 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
case ETH_P_IP:
- packet_add(pool_tap4, l2len, p);
+ if (!pool_can_fit(pool_tap4, data)) {
+ tap4_handler(c, pool_tap4, now);
+ pool_flush(pool_tap4);
+ }
+ packet_add(pool_tap4, data);
break;
case ETH_P_IPV6:
- packet_add(pool_tap6, l2len, p);
+ if (!pool_can_fit(pool_tap6, data)) {
+ tap6_handler(c, pool_tap6, now);
+ pool_flush(pool_tap6);
+ }
+ packet_add(pool_tap6, data);
break;
default:
break;
@@ -1045,8 +1129,10 @@ void tap_sock_reset(struct ctx *c)
{
info("Client connection closed%s", c->one_off ? ", exiting" : "");
- if (c->one_off)
+ if (c->one_off) {
+ fsync_pcap_and_log();
_exit(EXIT_SUCCESS);
+ }
/* Close the connected socket, wait for a new connection */
epoll_del(c, c->fd_tap);
@@ -1080,7 +1166,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
do {
n = recv(c->fd_tap, pkt_buf + partial_len,
- TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
+ sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
} while ((n < 0) && errno == EINTR);
if (n < 0) {
@@ -1096,8 +1182,9 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
while (n >= (ssize_t)sizeof(uint32_t)) {
uint32_t l2len = ntohl_unaligned(p);
+ struct iov_tail data;
- if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
+ if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) {
err("Bad frame size from guest, resetting connection");
tap_sock_reset(c);
return;
@@ -1110,7 +1197,8 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
p += sizeof(uint32_t);
n -= sizeof(uint32_t);
- tap_add_packet(c, l2len, p);
+ data = IOV_TAIL_FROM_BUF(p, l2len, 0);
+ tap_add_packet(c, &data, now);
p += l2len;
n -= l2len;
@@ -1151,8 +1239,12 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
tap_flush_pools();
- for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
- len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
+ for (n = 0;
+ n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA);
+ n += len) {
+ struct iov_tail data;
+
+ len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);
if (len == 0) {
die("EOF on tap device, exiting");
@@ -1170,10 +1262,11 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
/* Ignore frames of bad length */
if (len < (ssize_t)sizeof(struct ethhdr) ||
- len > (ssize_t)ETH_MAX_MTU)
+ len > (ssize_t)L2_MAX_LEN_PASTA)
continue;
- tap_add_packet(c, len, pkt_buf + n);
+ data = IOV_TAIL_FROM_BUF(pkt_buf + n, len, 0);
+ tap_add_packet(c, &data, now);
}
tap_handler(c, now);
@@ -1367,12 +1460,12 @@ static void tap_sock_tun_init(struct ctx *c)
* @base: Buffer base
* @size: Buffer size
*/
-void tap_sock_update_pool(void *base, size_t size)
+static void tap_sock_update_pool(void *base, size_t size)
{
int i;
- pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size);
- pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size);
+ pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size);
+ pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size);
for (i = 0; i < TAP_SEQS; i++) {
tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
@@ -1388,8 +1481,8 @@ void tap_sock_update_pool(void *base, size_t size)
void tap_backend_init(struct ctx *c)
{
if (c->mode == MODE_VU) {
- tap_sock_update_pool(NULL, 0);
vu_init(c);
+ tap_sock_update_pool(&c->vdev->memory, 0);
} else {
tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
}
diff --git a/tap.h b/tap.h
index a2c3b87..21db4d2 100644
--- a/tap.h
+++ b/tap.h
@@ -6,6 +6,31 @@
#ifndef TAP_H
#define TAP_H
+/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
+ *
+ * The kernel tuntap device imposes a maximum frame size of 65535 including
+ * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
+ */
+#define L2_MAX_LEN_PASTA USHRT_MAX
+
+/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
+ *
+ * The only structural limit the QEMU socket protocol imposes on frames is
+ * (2^32-1) bytes, but that would be ludicrously long in practice. For now,
+ * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate
+ * limit with more precision.
+ */
+#define L2_MAX_LEN_PASST USHRT_MAX
+
+/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
+ *
+ * vhost-user allows multiple buffers per frame, each of which can be quite
+ * large, so the inherent frame size limit is rather large. Much larger than is
+ * actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME:
+ * Work out an appropriate limit with more precision.
+ */
+#define L2_MAX_LEN_VU USHRT_MAX
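+
+/* All three limits are currently USHRT_MAX (65535 bytes); tap.c checks each of
+ * them against ETH_HLEN and PACKET_MAX_LEN at build time (CHECK_FRAME_LEN()).
+ */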
+
struct udphdr;
/**
@@ -21,8 +46,8 @@ struct tap_hdr {
* @c: Execution context
* @taph: Pointer to tap specific header buffer
*
- * Returns: A struct iovec covering the correct portion of @taph to use as the
- * tap specific header in the current configuration.
+ * Return: a struct iovec covering the correct portion of @taph to use as the
+ * tap specific header in the current configuration.
*/
static inline struct iovec tap_hdr_iov(const struct ctx *c,
struct tap_hdr *thdr)
@@ -44,6 +69,7 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
thdr->vnet_len = htonl(l2len);
}
+unsigned long tap_l2_max_len(const struct ctx *c);
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto);
@@ -89,10 +115,9 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now);
int tap_sock_unix_open(char *sock_path);
void tap_sock_reset(struct ctx *c);
-void tap_sock_update_pool(void *base, size_t size);
void tap_backend_init(struct ctx *c);
void tap_flush_pools(void);
void tap_handler(struct ctx *c, const struct timespec *now);
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
-
+void tap_add_packet(struct ctx *c, struct iov_tail *data,
+ const struct timespec *now);
#endif /* TAP_H */
diff --git a/tcp.c b/tcp.c
index 32a08bd..48b1ef2 100644
--- a/tcp.c
+++ b/tcp.c
@@ -310,6 +310,16 @@
#include "tcp_buf.h"
#include "tcp_vu.h"
+/*
+ * The size of the TCP header (including options) is given by doff (Data
+ * Offset), a 4-bit value specifying the number of 32-bit words in the header.
+ * The maximum value of doff is 15 [(1 << 4) - 1].
+ * The maximum length of the options, in bytes, is therefore the maximum doff
+ * (15) minus the number of 32-bit words in the minimal TCP header (5),
+ * multiplied by the length of a 32-bit word (4): (15 - 5) * 4 = 40 bytes.
+ */
+#define OPTLEN_MAX (((1UL << 4) - 1 - 5) * 4UL)
+
#ifndef __USE_MISC
/* From Linux UAPI, missing in netinet/tcp.h provided by musl */
struct tcp_repair_opt {
@@ -389,7 +399,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
-char tcp_buf_discard [MAX_WINDOW];
+char tcp_buf_discard [BUF_DISCARD_SIZE];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
@@ -434,19 +444,20 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx)
}
/**
- * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
- * @s: Socket to update
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported
+ * @conn: Pointer to the TCP connection structure
* @offset: Offset in bytes
*
- * Return: -1 when it fails, 0 otherwise.
+ * Return: -1 when it fails, 0 otherwise.
*/
-int tcp_set_peek_offset(int s, int offset)
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset)
{
if (!peek_offset_cap)
return 0;
- if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) {
- err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+ if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF,
+ &offset, sizeof(offset))) {
+ flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset);
return -1;
}
return 0;
@@ -455,7 +466,7 @@ int tcp_set_peek_offset(int s, int offset)
/**
* tcp_conn_epoll_events() - epoll events mask for given connection state
* @events: Current connection events
- * @conn_flags Connection flags
+ * @conn_flags: Connection flags
*
* Return: epoll events mask corresponding to implied connection state
*/
@@ -1078,7 +1089,7 @@ out:
* tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters
* @c: Execution context
* @conn: Connection pointer
- * @seq Current ACK sequence, host order
+ * @seq: Current ACK sequence, host order
*/
static void tcp_update_seqack_from_tap(const struct ctx *c,
struct tcp_tap_conn *conn, uint32_t seq)
@@ -1097,12 +1108,32 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
}
/**
+ * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: 0 on success, -1 on failure, with connection reset
+ */
+static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ conn->seq_to_tap = conn->seq_ack_from_tap;
+ conn->events &= ~TAP_FIN_SENT;
+
+ if (tcp_set_peek_offset(conn, 0)) {
+ tcp_rst(c, conn);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
* tcp_prepare_flags() - Prepare header for flags-only segment (no payload)
* @c: Execution context
* @conn: Connection pointer
* @flags: TCP flags: if not set, send segment only if ACK is due
* @th: TCP header to update
- * @data: buffer to store TCP option
+ * @opts: TCP option buffer (output parameter)
* @optlen: size of the TCP option buffer (output parameter)
*
* Return: < 0 error code on connection reset,
@@ -1165,6 +1196,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
th->doff = (sizeof(*th) + *optlen) / 4;
th->ack = !!(flags & ACK);
+ th->psh = !!(flags & PSH);
th->rst = !!(flags & RST);
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
@@ -1236,30 +1268,41 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
/**
* tcp_tap_window_update() - Process an updated window from tap side
+ * @c: Execution context
* @conn: Connection pointer
- * @window: Window value, host order, unscaled
+ * @wnd: Window value, host order, unscaled
+ *
+ * Return: false on zero window (not stored to wnd_from_tap), true otherwise
*/
-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
+static bool tcp_tap_window_update(const struct ctx *c,
+ struct tcp_tap_conn *conn, unsigned wnd)
{
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
/* Work-around for bug introduced in peer kernel code, commit
- * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
- * We don't update if window shrank to zero.
+ * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't
+ * update the window if it shrank to zero, so that we'll eventually retry
+ * sending data, but rewind the sequence, as a zero window obviously
+ * implies that no data beyond it will be acknowledged.
*/
- if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
- return;
+ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
+ tcp_rewind_seq(c, conn);
+ return false;
+ }
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
/* FIXME: reflect the tap-side receiver's window back to the sock-side
* sender by adjusting SO_RCVBUF? */
+ return true;
}
/**
* tcp_init_seq() - Calculate initial sequence number according to RFC 6528
* @hash: Hash of connection details
* @now: Current timestamp
+ *
+ * Return: the calculated 32-bit initial sequence number.
*/
static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now)
{
@@ -1316,7 +1359,7 @@ static int tcp_conn_new_sock(sa_family_t af)
* tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
* @af: Address family (AF_INET or AF_INET6)
*
- * Return: Socket fd on success, -errno on failure
+ * Return: socket fd on success, -errno on failure
*/
int tcp_conn_sock(sa_family_t af)
{
@@ -1545,9 +1588,8 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
sl = sizeof(sa);
- if (!getsockname(s, &sa.sa, &sl))
- inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa);
- else
+ if (getsockname(s, &sa.sa, &sl) ||
+ inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0)
err_perror("Can't get local address for socket %i", s);
}
@@ -1611,6 +1653,23 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
}
/**
+ * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet
+ * @th: Pointer to TCP header
+ * @l4len: TCP packet length, including TCP header
+ *
+ * Return: data length of TCP packet, -1 on invalid value of Data Offset field
+ */
+static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len)
+{
+ size_t off = th->doff * 4UL;
+
+ if (off < sizeof(*th) || off > l4len)
+ return -1;
+
+ return l4len - off;
+}
+
+/**
* tcp_data_from_tap() - tap/guest data for established connection
* @c: Execution context
* @conn: Connection pointer
@@ -1639,16 +1698,22 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
for (i = idx, iov_i = 0; i < (int)p->count; i++) {
uint32_t seq, seq_offset, ack_seq;
+ struct tcphdr th_storage;
const struct tcphdr *th;
- char *data;
- size_t off;
+ struct iov_tail data;
+ size_t off, size;
+ int count;
- th = packet_get(p, i, 0, sizeof(*th), &len);
+ if (!packet_get(p, i, &data))
+ return -1;
+
+ th = IOV_PEEK_HEADER(&data, th_storage);
if (!th)
return -1;
- len += sizeof(*th);
+ len = iov_tail_size(&data);
off = th->doff * 4UL;
+
if (off < sizeof(*th) || off > len)
return -1;
@@ -1658,9 +1723,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
}
len -= off;
- data = packet_get(p, i, off, len, NULL);
- if (!data)
- continue;
+ iov_drop_header(&data, off);
seq = ntohl(th->seq);
if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
@@ -1672,7 +1735,8 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_timer_ctl(c, conn);
if (p->count == 1) {
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn,
+ ntohs(th->window));
return 1;
}
@@ -1691,6 +1755,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
ack_seq == max_ack_seq &&
ntohs(th->window) == max_ack_seq_wnd;
+ /* See tcp_tap_window_update() for details. On
+ * top of that, we also need to check here if a
+ * zero-window update is contained in a batch of
+ * packets that includes a non-zero window as
+ * well.
+ */
+ if (!ntohs(th->window))
+ tcp_rewind_seq(c, conn);
+
max_ack_seq_wnd = ntohs(th->window);
max_ack_seq = ack_seq;
}
@@ -1734,10 +1807,14 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
continue;
}
- tcp_iov[iov_i].iov_base = data + seq_offset;
- tcp_iov[iov_i].iov_len = len - seq_offset;
- seq_from_tap += tcp_iov[iov_i].iov_len;
- iov_i++;
+ iov_drop_header(&data, seq_offset);
+ size = len - seq_offset;
+ count = iov_tail_clone(&tcp_iov[iov_i], UIO_MAXIOV - iov_i,
+ &data);
+ if (count < 0)
+ break;
+ seq_from_tap += size;
+ iov_i += count;
if (keep == i)
keep = -1;
@@ -1750,17 +1827,16 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (ack && !tcp_sock_consume(conn, max_ack_seq))
tcp_update_seqack_from_tap(c, conn, max_ack_seq);
- tcp_tap_window_update(conn, max_ack_seq_wnd);
+ tcp_tap_window_update(c, conn, max_ack_seq_wnd);
if (retr) {
flow_trace(conn,
"fast re-transmit, ACK: %u, previous sequence: %u",
- max_ack_seq, conn->seq_to_tap);
- conn->seq_to_tap = max_ack_seq;
- if (tcp_set_peek_offset(conn->sock, 0)) {
- tcp_rst(c, conn);
+ conn->seq_ack_from_tap, conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
return -1;
- }
+
tcp_data_from_sock(c, conn);
}
@@ -1840,7 +1916,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_get_tap_ws(conn, opts, optlen);
/* First value is not scaled */
@@ -1854,7 +1930,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
conn->seq_ack_to_tap = conn->seq_from_tap;
conn_event(c, conn, ESTABLISHED);
- if (tcp_set_peek_offset(conn->sock, 0)) {
+ if (tcp_set_peek_offset(conn, 0)) {
tcp_rst(c, conn);
return;
}
@@ -1955,8 +2031,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const struct pool *p, int idx, const struct timespec *now)
{
struct tcp_tap_conn *conn;
+ struct tcphdr th_storage;
const struct tcphdr *th;
- size_t optlen, len;
+ char optsc[OPTLEN_MAX];
+ struct iov_tail data;
+ size_t optlen, l4len;
const char *opts;
union flow *flow;
flow_sidx_t sidx;
@@ -1965,15 +2044,19 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
(void)pif;
- th = packet_get(p, idx, 0, sizeof(*th), &len);
+ if (!packet_get(p, idx, &data))
+ return 1;
+
+ l4len = iov_tail_size(&data);
+
+ th = IOV_REMOVE_HEADER(&data, th_storage);
if (!th)
return 1;
- len += sizeof(*th);
optlen = th->doff * 4UL - sizeof(*th);
/* Static checkers might fail to see this: */
- optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL);
- opts = packet_get(p, idx, sizeof(*th), optlen, NULL);
+ optlen = MIN(optlen, OPTLEN_MAX);
+ opts = (char *)iov_remove_header_(&data, &optsc[0], optlen, 1);
sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr,
ntohs(th->source), ntohs(th->dest));
@@ -1985,7 +2068,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
tcp_conn_from_tap(c, af, saddr, daddr, th,
opts, optlen, now);
else
- tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, len);
+ tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, l4len);
return 1;
}
@@ -1993,7 +2076,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
ASSERT(pif_at_sidx(sidx) == PIF_TAP);
conn = &flow->tcp;
- flow_trace(conn, "packet length %zu from tap", len);
+ flow_trace(conn, "packet length %zu from tap", l4len);
if (th->rst) {
conn_event(c, conn, CLOSED);
@@ -2022,7 +2105,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
goto reset;
conn_event(c, conn, ESTABLISHED);
- if (tcp_set_peek_offset(conn->sock, 0))
+ if (tcp_set_peek_offset(conn, 0))
goto reset;
if (th->fin) {
@@ -2038,9 +2121,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (!th->ack)
goto reset;
- tcp_tap_window_update(conn, ntohs(th->window));
-
- tcp_data_from_sock(c, conn);
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+ tcp_data_from_sock(c, conn);
if (p->count - idx == 1)
return 1;
@@ -2048,13 +2130,38 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
/* Established connections not accepting data from tap */
if (conn->events & TAP_FIN_RCVD) {
- tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
- tcp_tap_window_update(conn, ntohs(th->window));
- tcp_data_from_sock(c, conn);
+ bool retr;
- if (conn->events & SOCK_FIN_RCVD &&
- conn->seq_ack_from_tap == conn->seq_to_tap)
- conn_event(c, conn, CLOSED);
+ retr = th->ack && !tcp_packet_data_len(th, l4len) && !th->fin &&
+ ntohl(th->ack_seq) == conn->seq_ack_from_tap &&
+ ntohs(th->window) == conn->wnd_from_tap;
+
+ /* On socket flush failure, pretend there was no ACK, try again
+ * later
+ */
+ if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq)))
+ tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+
+ if (retr) {
+ flow_trace(conn,
+ "fast re-transmit, ACK: %u, previous sequence: %u",
+ ntohl(th->ack_seq), conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
+ return -1;
+ }
+
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr)
+ tcp_data_from_sock(c, conn);
+
+ if (conn->seq_ack_from_tap == conn->seq_to_tap) {
+ if (th->ack && conn->events & TAP_FIN_SENT)
+ conn_event(c, conn, TAP_FIN_ACKED);
+
+ if (conn->events & SOCK_FIN_RCVD &&
+ conn->events & TAP_FIN_ACKED)
+ conn_event(c, conn, CLOSED);
+ }
return 1;
}
@@ -2199,12 +2306,11 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
* mode only, below.
*/
ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
- ref.tcp_listen.port);
+ NULL, ref.tcp_listen.port);
if (c->mode == MODE_VU) { /* Rebind to same address after migration */
- if (!getsockname(s, &sa.sa, &sl))
- inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa);
- else
+ if (getsockname(s, &sa.sa, &sl) ||
+ inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0)
err_perror("Can't get local address for socket %i", s);
}
@@ -2282,16 +2388,16 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
tcp_rst(c, conn);
} else {
flow_dbg(conn, "ACK timeout, retry");
- conn->retrans++;
- conn->seq_to_tap = conn->seq_ack_from_tap;
+
if (!conn->wnd_from_tap)
conn->wnd_from_tap = 1; /* Zero-window probe */
- if (tcp_set_peek_offset(conn->sock, 0)) {
- tcp_rst(c, conn);
- } else {
- tcp_data_from_sock(c, conn);
- tcp_timer_ctl(c, conn);
- }
+
+ conn->retrans++;
+ if (tcp_rewind_seq(c, conn))
+ return;
+
+ tcp_data_from_sock(c, conn);
+ tcp_timer_ctl(c, conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -2335,7 +2441,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
return;
}
- if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
+ if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
conn_event(c, conn, CLOSED);
return;
}
@@ -2604,7 +2710,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
/**
* tcp_probe_tcp_info() - Check what data TCP_INFO reports
*
- * Return: Number of bytes returned by TCP_INFO getsockopt()
+ * Return: number of bytes returned by TCP_INFO getsockopt()
*/
static socklen_t tcp_probe_tcp_info(void)
{
@@ -2810,20 +2916,21 @@ int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
/**
* tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
- * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
struct tcp_info tinfo;
socklen_t sl;
sl = sizeof(tinfo);
- if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+ if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
int rc = -errno;
- err_perror("Querying TCP_INFO, socket %i", s);
+ flow_perror(conn, "Querying TCP_INFO");
return rc;
}
@@ -2837,39 +2944,95 @@ static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
- * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
socklen_t sl = sizeof(t->mss);
+ int val;
- if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) {
+ if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) {
int rc = -errno;
- err_perror("Getting MSS, socket %i", s);
+ flow_perror(conn, "Getting MSS");
return rc;
}
+ t->mss = (uint32_t)val;
+
+ return 0;
+}
+
+/**
+ * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data (tcpi_options must be populated)
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
+{
+ int val = 0;
+
+ if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+ socklen_t sl = sizeof(val);
+
+ if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) {
+ int rc = -errno;
+ flow_perror(conn, "Getting RFC 7323 timestamp");
+ return rc;
+ }
+ }
+
+ t->timestamp = (uint32_t)val;
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn,
+ const struct tcp_tap_transfer_ext *t)
+{
+ int val = (int)t->timestamp;
+
+ if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+ if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP,
+ &val, sizeof(val))) {
+ int rc = -errno;
+ flow_perror(conn, "Setting RFC 7323 timestamp");
+ return rc;
+ }
+ }
+
return 0;
}
/**
* tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
- * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
struct tcp_repair_window wnd;
socklen_t sl = sizeof(wnd);
- if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
+ if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
int rc = -errno;
- err_perror("Getting window repair data, socket %i", s);
+ flow_perror(conn, "Getting window repair data");
return rc;
}
@@ -2893,12 +3056,13 @@ static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_repair_wnd() - Restore window parameters from extended data
- * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn,
+ const struct tcp_tap_transfer_ext *t)
{
struct tcp_repair_window wnd;
@@ -2908,9 +3072,10 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
wnd.rcv_wnd = t->rcv_wnd;
wnd.rcv_wup = t->rcv_wup;
- if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd))) {
+ if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW,
+ &wnd, sizeof(wnd))) {
int rc = -errno;
- err_perror("Setting window data, socket %i", s);
+ flow_perror(conn, "Setting window data");
return rc;
}
@@ -2919,16 +3084,17 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_select_queue() - Select queue (receive or send) for next operation
- * @s: Socket
+ * @conn: Connection to select queue for
* @queue: TCP_RECV_QUEUE or TCP_SEND_QUEUE
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_select_queue(int s, int queue)
+static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue)
{
- if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) {
+ if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE,
+ &queue, sizeof(queue))) {
int rc = -errno;
- err_perror("Selecting TCP_SEND_QUEUE, socket %i", s);
+ flow_perror(conn, "Selecting TCP_SEND_QUEUE");
return rc;
}
@@ -2937,26 +3103,28 @@ static int tcp_flow_select_queue(int s, int queue)
/**
* tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
- * @s: Socket
+ * @conn: Connection to dump queue for
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*
* #syscalls:vu ioctl
*/
-static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
+ int s = conn->sock;
ssize_t rc;
if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
rc = -errno;
- err_perror("Getting send queue size, socket %i", s);
+ flow_perror(conn, "Getting send queue size");
return rc;
}
if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
rc = -errno;
- err_perror("Getting not sent count, socket %i", s);
+ flow_perror(conn, "Getting not sent count");
return rc;
}
@@ -2975,14 +3143,16 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
}
if (t->notsent > t->sndq) {
- err("Invalid notsent count socket %i, send: %u, not sent: %u",
- s, t->sndq, t->notsent);
+ flow_err(conn,
+ "Invalid notsent count socket %i, send: %u, not sent: %u",
+ s, t->sndq, t->notsent);
return -EINVAL;
}
if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
- err("Send queue too large to migrate socket %i: %u bytes",
- s, t->sndq);
+ flow_err(conn,
+ "Send queue too large to migrate socket %i: %u bytes",
+ s, t->sndq);
return -ENOBUFS;
}
@@ -2993,13 +3163,13 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
rc = 0;
} else {
rc = -errno;
- err_perror("Can't read send queue, socket %i", s);
+ flow_perror(conn, "Can't read send queue");
return rc;
}
}
if ((uint32_t)rc < t->sndq) {
- err("Short read migrating send queue");
+ flow_err(conn, "Short read migrating send queue");
return -ENXIO;
}
@@ -3010,19 +3180,20 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
- * @s: Socket
+ * @conn: Connection to repair queue for
* @len: Length of data to be restored
* @buf: Buffer with content of pending data queue
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
+static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn,
+ size_t len, uint8_t *buf)
{
size_t chunk = len;
uint8_t *p = buf;
while (len > 0) {
- ssize_t rc = send(s, p, MIN(len, chunk), 0);
+ ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0);
if (rc < 0) {
if ((errno == ENOBUFS || errno == ENOMEM) &&
@@ -3032,7 +3203,7 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
}
rc = -errno;
- err_perror("Can't write queue, socket %i", s);
+ flow_perror(conn, "Can't write queue");
return rc;
}
@@ -3045,18 +3216,18 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
/**
* tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
- * @s: Socket
+ * @conn: Pointer to the TCP connection structure
* @v: Sequence value, set on return
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_dump_seq(int s, uint32_t *v)
+static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v)
{
socklen_t sl = sizeof(*v);
- if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
+ if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
int rc = -errno;
- err_perror("Dumping sequence, socket %i", s);
+ flow_perror(conn, "Dumping sequence");
return rc;
}
@@ -3065,16 +3236,17 @@ static int tcp_flow_dump_seq(int s, uint32_t *v)
/**
* tcp_flow_repair_seq() - Restore sequence for pre-selected queue
- * @s: Socket
+ * @conn: Connection to repair sequences for
* @v: Sequence value to be set
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_repair_seq(int s, const uint32_t *v)
+static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn,
+ const uint32_t *v)
{
- if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+ if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
int rc = -errno;
- err_perror("Setting sequence, socket %i", s);
+ flow_perror(conn, "Setting sequence");
return rc;
}
@@ -3083,15 +3255,17 @@ static int tcp_flow_repair_seq(int s, const uint32_t *v)
/**
* tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
- * @s: Socket
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*
* #syscalls:vu ioctl
*/
-static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
+ int s = conn->sock;
ssize_t rc;
if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
@@ -3111,8 +3285,9 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
t->rcvq--;
if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
- err("Receive queue too large to migrate socket %i: %u bytes",
- s, t->rcvq);
+ flow_err(conn,
+ "Receive queue too large to migrate socket: %u bytes",
+ t->rcvq);
return -ENOBUFS;
}
@@ -3122,13 +3297,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
rc = 0;
} else {
rc = -errno;
- err_perror("Can't read receive queue for socket %i", s);
+ flow_perror(conn, "Can't read receive queue");
return rc;
}
}
if ((uint32_t)rc < t->rcvq) {
- err("Short read migrating receive queue");
+ flow_err(conn, "Short read migrating receive queue");
return -ENXIO;
}
@@ -3137,12 +3312,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
- * @s: Socket
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
+ const struct tcp_tap_transfer_ext *t)
{
const struct tcp_repair_opt opts[] = {
{ TCPOPT_WINDOW, t->snd_ws + (t->rcv_ws << 16) },
@@ -3156,9 +3332,9 @@ static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
!!(t->tcpi_options & TCPI_OPT_SACK) +
!!(t->tcpi_options & TCPI_OPT_TIMESTAMPS));
- if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+ if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
int rc = -errno;
- err_perror("Setting repair options, socket %i", s);
+ flow_perror(conn, "Setting repair options");
return rc;
}
@@ -3214,12 +3390,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
/**
* tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @c: Execution context
* @fd: Descriptor for state migration
* @conn: Pointer to the TCP connection structure
*
* Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
*/
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
+int tcp_flow_migrate_source_ext(const struct ctx *c,
+ int fd, const struct tcp_tap_conn *conn)
{
uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
@@ -3229,39 +3407,45 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
/* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode
* weird.
*/
- if (tcp_set_peek_offset(s, -1)) {
+ if (tcp_set_peek_offset(conn, -1)) {
rc = -errno;
goto fail;
}
- if ((rc = tcp_flow_dump_tinfo(s, t)))
+ if ((rc = tcp_flow_dump_tinfo(conn, t)))
goto fail;
- if ((rc = tcp_flow_dump_mss(s, t)))
+ if ((rc = tcp_flow_dump_mss(conn, t)))
goto fail;
- if ((rc = tcp_flow_dump_wnd(s, t)))
+ if ((rc = tcp_flow_dump_timestamp(conn, t)))
goto fail;
- if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE)))
+ if ((rc = tcp_flow_dump_wnd(conn, t)))
goto fail;
- if ((rc = tcp_flow_dump_sndqueue(s, t)))
+ if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE)))
goto fail;
- if ((rc = tcp_flow_dump_seq(s, &t->seq_snd)))
+ if ((rc = tcp_flow_dump_sndqueue(conn, t)))
goto fail;
- if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE)))
+ if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd)))
goto fail;
- if ((rc = tcp_flow_dump_rcvqueue(s, t)))
+ if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE)))
goto fail;
- if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv)))
+ if ((rc = tcp_flow_dump_rcvqueue(conn, t)))
goto fail;
- close(s);
+ if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
+ goto fail;
+
+ if (c->migrate_no_linger)
+ close(s);
+ else
+ epoll_del(c, s);
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
* based on the end of the queues.
@@ -3269,14 +3453,14 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
t->seq_rcv -= t->rcvq;
t->seq_snd -= t->sndq;
- debug("Extended migration data, socket %i sequences send %u receive %u",
- s, t->seq_snd, t->seq_rcv);
- debug(" pending queues: send %u not sent %u receive %u",
- t->sndq, t->notsent, t->rcvq);
- debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
- t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
- debug(" SO_PEEK_OFF %s offset=%"PRIu32,
- peek_offset_cap ? "enabled" : "disabled", peek_offset);
+ flow_dbg(conn, "Extended migration data, socket %i sequences send %u receive %u",
+ s, t->seq_snd, t->seq_rcv);
+ flow_dbg(conn, " pending queues: send %u not sent %u receive %u",
+ t->sndq, t->notsent, t->rcvq);
+ flow_dbg(conn, " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+ t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+ flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32,
+ peek_offset_cap ? "enabled" : "disabled", peek_offset);
/* Endianness fix-ups */
t->seq_snd = htonl(t->seq_snd);
@@ -3284,6 +3468,8 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
t->sndq = htonl(t->sndq);
t->notsent = htonl(t->notsent);
t->rcvq = htonl(t->rcvq);
+ t->mss = htonl(t->mss);
+ t->timestamp = htonl(t->timestamp);
t->snd_wl1 = htonl(t->snd_wl1);
t->snd_wnd = htonl(t->snd_wnd);
@@ -3292,17 +3478,17 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
t->rcv_wup = htonl(t->rcv_wup);
if (write_all_buf(fd, t, sizeof(*t))) {
- err_perror("Failed to write extended data, socket %i", s);
+ flow_perror(conn, "Failed to write extended data");
return -EIO;
}
if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) {
- err_perror("Failed to write send queue data, socket %i", s);
+ flow_perror(conn, "Failed to write send queue data");
return -EIO;
}
if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) {
- err_perror("Failed to write receive queue data, socket %i", s);
+ flow_perror(conn, "Failed to write receive queue data");
return -EIO;
}
@@ -3317,7 +3503,7 @@ fail:
t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */
if (write_all_buf(fd, t, sizeof(*t))) {
- err_perror("Failed to write extended data, socket %i", s);
+ flow_perror(conn, "Failed to write extended data");
return -EIO;
}
@@ -3337,32 +3523,22 @@ fail:
static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
{
sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
- const struct flowside *sockside = HOSTFLOW(conn);
- union sockaddr_inany a;
- socklen_t sl;
int s, rc;
- pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
-
if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
IPPROTO_TCP)) < 0) {
rc = -errno;
- err_perror("Failed to create socket for migrated flow");
+ flow_perror(conn, "Failed to create socket for migrated flow");
return rc;
}
s = conn->sock;
if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
- debug_perror("Setting SO_REUSEADDR on socket %i", s);
+ flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i",
+ s);
tcp_sock_set_nodelay(s);
- if (bind(s, &a.sa, sizeof(a))) {
- rc = -errno;
- err_perror("Failed to bind socket for migrated flow");
- goto err;
- }
-
if ((rc = tcp_flow_repair_on(c, conn)))
goto err;
@@ -3375,6 +3551,30 @@ err:
}
/**
+ * tcp_flow_repair_bind() - Bind socket in repair mode
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ const struct flowside *sockside = HOSTFLOW(conn);
+ union sockaddr_inany a;
+ socklen_t sl;
+
+ pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+ if (bind(conn->sock, &a.sa, sizeof(a))) {
+ int rc = -errno;
+ flow_perror(conn, "Failed to bind socket for migrated flow");
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
* tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
* @c: Execution context
* @conn: Pointer to the TCP connection structure
@@ -3390,7 +3590,7 @@ static int tcp_flow_repair_connect(const struct ctx *c,
rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
if (rc) {
rc = -errno;
- err_perror("Failed to connect migrated socket %i", conn->sock);
+ flow_perror(conn, "Failed to connect migrated socket");
return rc;
}
@@ -3421,8 +3621,8 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
}
if (read_all_buf(fd, &t, sizeof(t))) {
+ flow_perror(flow, "Failed to receive migration data");
flow_alloc_cancel(flow);
- err_perror("Failed to receive migration data");
return -errno;
}
@@ -3481,7 +3681,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
if (read_all_buf(fd, &t, sizeof(t))) {
rc = -errno;
- err_perror("Failed to read extended data for socket %i", s);
+ flow_perror(conn, "Failed to read extended data");
return rc;
}
@@ -3496,6 +3696,8 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
t.sndq = ntohl(t.sndq);
t.notsent = ntohl(t.notsent);
t.rcvq = ntohl(t.rcvq);
+ t.mss = ntohl(t.mss);
+ t.timestamp = ntohl(t.timestamp);
t.snd_wl1 = ntohl(t.snd_wl1);
t.snd_wnd = ntohl(t.snd_wnd);
@@ -3503,31 +3705,34 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
t.rcv_wnd = ntohl(t.rcv_wnd);
t.rcv_wup = ntohl(t.rcv_wup);
- debug("Extended migration data, socket %i sequences send %u receive %u",
- s, t.seq_snd, t.seq_rcv);
- debug(" pending queues: send %u not sent %u receive %u",
- t.sndq, t.notsent, t.rcvq);
- debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
- t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
- debug(" SO_PEEK_OFF %s offset=%"PRIu32,
- peek_offset_cap ? "enabled" : "disabled", peek_offset);
+ flow_dbg(conn,
+ "Extended migration data, socket %i sequences send %u receive %u",
+ s, t.seq_snd, t.seq_rcv);
+ flow_dbg(conn, " pending queues: send %u not sent %u receive %u",
+ t.sndq, t.notsent, t.rcvq);
+ flow_dbg(conn,
+ " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+ t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+ flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32,
+ peek_offset_cap ? "enabled" : "disabled", peek_offset);
if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
- err("Bad queues socket %i, send: %u, not sent: %u, receive: %u",
- s, t.sndq, t.notsent, t.rcvq);
+ flow_err(conn,
+ "Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+ s, t.sndq, t.notsent, t.rcvq);
return -EINVAL;
}
if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
rc = -errno;
- err_perror("Failed to read send queue data, socket %i", s);
+ flow_perror(conn, "Failed to read send queue data");
return rc;
}
if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
rc = -errno;
- err_perror("Failed to read receive queue data, socket %i", s);
+ flow_perror(conn, "Failed to read receive queue data");
return rc;
}
@@ -3535,32 +3740,38 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
/* We weren't able to create the socket, discard flow */
goto fail;
- if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+ if (tcp_flow_repair_bind(c, conn))
+ goto fail;
+
+ if (tcp_flow_repair_timestamp(conn, &t))
goto fail;
- if (tcp_flow_repair_seq(s, &t.seq_snd))
+ if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
goto fail;
- if (tcp_flow_select_queue(s, TCP_RECV_QUEUE))
+ if (tcp_flow_repair_seq(conn, &t.seq_snd))
goto fail;
- if (tcp_flow_repair_seq(s, &t.seq_rcv))
+ if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE))
+ goto fail;
+
+ if (tcp_flow_repair_seq(conn, &t.seq_rcv))
goto fail;
if (tcp_flow_repair_connect(c, conn))
goto fail;
- if (tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue))
+ if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue))
goto fail;
- if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+ if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
goto fail;
- if (tcp_flow_repair_queue(s, t.sndq - t.notsent,
+ if (tcp_flow_repair_queue(conn, t.sndq - t.notsent,
tcp_migrate_snd_queue))
goto fail;
- if (tcp_flow_repair_opt(s, &t))
+ if (tcp_flow_repair_opt(conn, &t))
goto fail;
	/* If we sent a FIN and it was acknowledged (TCP_FIN_WAIT2), don't
@@ -3575,19 +3786,19 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
v = TCP_SEND_QUEUE;
if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
- debug_perror("Selecting repair queue, socket %i", s);
+ flow_perror(conn, "Selecting repair queue");
else
shutdown(s, SHUT_WR);
}
- if (tcp_flow_repair_wnd(s, &t))
+ if (tcp_flow_repair_wnd(conn, &t))
goto fail;
tcp_flow_repair_off(c, conn);
repair_flush(c);
if (t.notsent) {
- if (tcp_flow_repair_queue(s, t.notsent,
+ if (tcp_flow_repair_queue(conn, t.notsent,
tcp_migrate_snd_queue +
(t.sndq - t.notsent))) {
/* This sometimes seems to fail for unclear reasons.
@@ -3607,15 +3818,16 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
if (t.tcpi_state == TCP_FIN_WAIT1)
shutdown(s, SHUT_WR);
- if (tcp_set_peek_offset(conn->sock, peek_offset))
+ if (tcp_set_peek_offset(conn, peek_offset))
goto fail;
tcp_send_flag(c, conn, ACK);
tcp_data_from_sock(c, conn);
if ((rc = tcp_epoll_ctl(c, conn))) {
- debug("Failed to subscribe to epoll for migrated socket %i: %s",
- conn->sock, strerror_(-rc));
+ flow_dbg(conn,
+ "Failed to subscribe to epoll for migrated socket: %s",
+ strerror_(-rc));
goto fail;
}
@@ -3632,3 +3844,67 @@ fail:
return 0;
}
+
+/**
+ * tcp_prepare_iov() - Prepare iov according to kernel capability
+ * @msg: Message header to update
+ * @iov: iovec to receive TCP payload and data to discard
+ * @already_sent: Bytes sent after the last acknowledged one
+ * @payload_iov_cnt: Number of TCP payload iovec entries
+ *
+ * Return: 0 on success, -1 if already_sent cannot be discarded fully
+ */
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+ uint32_t already_sent, int payload_iov_cnt)
+{
+ /*
+ * IOV layout
+ * |- tcp_buf_discard -|---------- TCP data slots ------------|
+ *
+ * with discarded data:
+ * |------ddddddddddddd|ttttttttttttt-------------------------|
+ * ^
+ * |
+ * msg_iov
+ *
+ * without discarded data:
+ * |-------------------|ttttttttttttt-------------------------|
+ * ^
+ * |
+ * msg_iov
+ * d: discard data
+ * t: TCP data
+ */
+ if (peek_offset_cap) {
+ msg->msg_iov = iov + DISCARD_IOV_NUM;
+ msg->msg_iovlen = payload_iov_cnt;
+ } else {
+ int discard_cnt, discard_iov_rem;
+ struct iovec *iov_start;
+ int i;
+
+ discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE);
+ if (discard_cnt > DISCARD_IOV_NUM) {
+ debug("Failed to discard %u already sent bytes",
+ already_sent);
+ return -1;
+ }
+
+ discard_iov_rem = already_sent % BUF_DISCARD_SIZE;
+
+ iov_start = iov + (DISCARD_IOV_NUM - discard_cnt);
+
+ /* Multiple iov entries pointing to the same buffer */
+ for (i = 0; i < discard_cnt; i++) {
+ iov_start[i].iov_base = tcp_buf_discard;
+ iov_start[i].iov_len = BUF_DISCARD_SIZE;
+ }
+ if (discard_iov_rem)
+ iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem;
+
+ msg->msg_iov = iov_start;
+ msg->msg_iovlen = discard_cnt + payload_iov_cnt;
+ }
+
+ return 0;
+}
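As a worked example of the discard arithmetic above (illustrative byte counts only, using BUF_DISCARD_SIZE = 1 MiB and DISCARD_IOV_NUM = 16 as defined in tcp_internal.h later in this series):

/* Illustrative only: already_sent = 2621440 (2.5 MiB), BUF_DISCARD_SIZE = 1 MiB
 *
 *   discard_cnt     = DIV_ROUND_UP(2621440, 1048576) = 3
 *   discard_iov_rem = 2621440 % 1048576              = 524288
 *
 * Three discard entries point at tcp_buf_discard, the last one shortened to
 * 524288 bytes, and msg_iov starts at iov + (DISCARD_IOV_NUM - 3) = iov + 13,
 * so the MSG_PEEK re-read of the already_sent bytes lands in the discard
 * buffer and only new data reaches the payload buffers.
 */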
diff --git a/tcp.h b/tcp.h
index 9142eca..234a803 100644
--- a/tcp.h
+++ b/tcp.h
@@ -25,7 +25,6 @@ void tcp_timer(struct ctx *c, const struct timespec *now);
void tcp_defer_handler(struct ctx *c);
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
-int tcp_set_peek_offset(int s, int offset);
extern bool peek_offset_cap;
diff --git a/tcp_buf.c b/tcp_buf.c
index 72d99c5..a493b5a 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -60,7 +60,7 @@ static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp_payload_used;
/* recvmsg()/sendmsg() data for tap */
-static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
+static struct iovec iov_sock [TCP_FRAMES_MEM + DISCARD_IOV_NUM];
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
@@ -104,7 +104,7 @@ void tcp_sock_iov_init(const struct ctx *c)
/**
* tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
- * @ctx: Execution context
+ * @c: Execution context
* @conns: Array of connection pointers corresponding to queued frames
* @frames: Two-dimensional array containing queued frames with sub-iovs
* @num_frames: Number of entries in the two arrays to be compared
@@ -125,7 +125,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
conn->seq_to_tap = seq;
peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
- if (tcp_set_peek_offset(conn->sock, peek_offset))
+ if (tcp_set_peek_offset(conn, peek_offset))
tcp_rst(c, conn);
}
}
@@ -148,7 +148,7 @@ void tcp_payload_flush(const struct ctx *c)
}
/**
- * tcp_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
+ * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
* @conn: Connection pointer
* @iov: Pointer to an array of iovec of TCP pre-cooked buffers
* @check: Checksum, if already known
@@ -160,7 +160,7 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
uint32_t seq, bool no_tcp_csum)
{
struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
- struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr);
+ struct tcphdr th_storage, *th = IOV_REMOVE_HEADER(&tail, th_storage);
struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
const struct flowside *tapside = TAPFLOW(conn);
const struct in_addr *a4 = inany_v4(&tapside->oaddr);
@@ -209,13 +209,14 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (ret <= 0)
return ret;
- tcp_payload_used++;
+ tcp_frame_conns[tcp_payload_used++] = conn;
l4len = optlen + sizeof(struct tcphdr);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
if (flags & DUP_ACK) {
- struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
+ struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used];
+ tcp_frame_conns[tcp_payload_used++] = conn;
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_TAP].iov_len);
@@ -304,7 +305,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
conn->seq_ack_from_tap, conn->seq_to_tap);
conn->seq_to_tap = conn->seq_ack_from_tap;
already_sent = 0;
- if (tcp_set_peek_offset(s, 0)) {
+ if (tcp_set_peek_offset(conn, 0)) {
tcp_rst(c, conn);
return -1;
}
@@ -326,15 +327,9 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
iov_rem = (wnd_scaled - already_sent) % mss;
}
- /* Prepare iov according to kernel capability */
- if (!peek_offset_cap) {
- mh_sock.msg_iov = iov_sock;
- iov_sock[0].iov_base = tcp_buf_discard;
- iov_sock[0].iov_len = already_sent;
- mh_sock.msg_iovlen = fill_bufs + 1;
- } else {
- mh_sock.msg_iov = &iov_sock[1];
- mh_sock.msg_iovlen = fill_bufs;
+ if (tcp_prepare_iov(&mh_sock, iov_sock, already_sent, fill_bufs)) {
+ tcp_rst(c, conn);
+ return -1;
}
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
@@ -344,12 +339,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
tcp_payload_used = 0;
}
- for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
+ for (i = 0, iov = iov_sock + DISCARD_IOV_NUM; i < fill_bufs; i++, iov++) {
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
- iov_sock[fill_bufs].iov_len = iov_rem;
+ iov_sock[fill_bufs + DISCARD_IOV_NUM - 1].iov_len = iov_rem;
/* Receive into buffers, don't dequeue until acknowledged by guest. */
do
@@ -369,7 +364,10 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
}
if (!len) {
- if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
+ if (already_sent) {
+ conn_flag(c, conn, STALLED);
+ } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
+ SOCK_FIN_RCVD) {
int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
if (ret) {
tcp_rst(c, conn);
diff --git a/tcp_conn.h b/tcp_conn.h
index 9126a36..38b5c54 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -152,6 +152,7 @@ struct tcp_tap_transfer {
* @notsent: Part of pending send queue that wasn't sent out yet
* @rcvq: Length of pending receive queue
* @mss: Socket-side MSS clamp
+ * @timestamp: RFC 7323 timestamp
* @snd_wl1: Next sequence used in window probe (next sequence - 1)
* @snd_wnd: Socket-side sending window
* @max_window: Window clamp
@@ -171,6 +172,7 @@ struct tcp_tap_transfer_ext {
uint32_t rcvq;
uint32_t mss;
+ uint32_t timestamp;
/* We can't just use struct tcp_repair_window: we need network order */
uint32_t snd_wl1;
@@ -234,7 +236,8 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(const struct ctx *c, int fd,
+ const struct tcp_tap_conn *conn);
int tcp_flow_migrate_target(struct ctx *c, int fd);
int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
diff --git a/tcp_internal.h b/tcp_internal.h
index 6f5e054..5cb6cba 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -9,6 +9,9 @@
#define MAX_WS 8
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
+#define BUF_DISCARD_SIZE (1 << 20)
+#define DISCARD_IOV_NUM DIV_ROUND_UP(MAX_WINDOW, BUF_DISCARD_SIZE)
+
#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
sizeof(struct tcphdr) - \
sizeof(struct iphdr), \
@@ -18,14 +21,19 @@
sizeof(struct ipv6hdr), \
sizeof(uint32_t))
-#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
-#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
-#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
-#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
+#define SEQ_LE(a, b) \
+ ((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW)
+#define SEQ_LT(a, b) \
+ ((uint32_t)(b) - (uint32_t)(a) - 1 < MAX_WINDOW)
+#define SEQ_GE(a, b) \
+ ((uint32_t)(a) - (uint32_t)(b) < MAX_WINDOW)
+#define SEQ_GT(a, b) \
+ ((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW)
#define FIN (1 << 0)
#define SYN (1 << 1)
#define RST (1 << 2)
+#define PSH (1 << 3)
#define ACK (1 << 4)
/* Flags for internal usage */
@@ -138,7 +146,7 @@ struct tcp_syn_opts {
.ws = TCP_OPT_WS(ws_), \
})
-extern char tcp_buf_discard [MAX_WINDOW];
+extern char tcp_buf_discard [BUF_DISCARD_SIZE];
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag);
@@ -177,5 +185,8 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
size_t *optlen);
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+ uint32_t already_sent, int payload_iov_cnt);
#endif /* TCP_INTERNAL_H */
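For the SEQ_*() changes above, a small illustration of why the explicit uint32_t casts matter (values picked arbitrarily):

/* Illustrative only: with MAX_WINDOW = 1 << 24,
 *
 *   SEQ_GT(0x00000010, 0xfffffff0)
 *     -> (uint32_t)0x00000010 - (uint32_t)0xfffffff0 - 1 < MAX_WINDOW
 *     -> 0x0000001f < 0x01000000   (true)
 *
 * that is, 0x10 is correctly treated as "after" 0xfffffff0 across the 32-bit
 * wrap. Without the casts, arguments that aren't already uint32_t (literals,
 * wider types) wouldn't wrap at 32 bits and the comparison could go wrong.
 */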
diff --git a/tcp_splice.c b/tcp_splice.c
index 0d10e3d..26cb630 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -95,7 +95,7 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
* conn_at_sidx() - Get spliced TCP connection specific flow at given sidx
* @sidx: Flow and side to retrieve
*
- * Return: Spliced TCP connection at @sidx, or NULL of @sidx is invalid.
+ * Return: spliced TCP connection at @sidx, or NULL if @sidx is invalid.
* Asserts if the flow at @sidx is not FLOW_TCP_SPLICE.
*/
static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
@@ -402,7 +402,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
* @c: Execution context
* @af: Address family (AF_INET or AF_INET6)
*
- * Return: Socket fd in the namespace on success, -errno on failure
+ * Return: socket fd in the namespace on success, -errno on failure
*/
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af)
{
@@ -520,20 +520,21 @@ swap:
int more = 0;
retry:
- readlen = splice(conn->s[fromsidei], NULL,
- conn->pipe[fromsidei][1], NULL,
- c->tcp.pipe_size,
- SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+ do
+ readlen = splice(conn->s[fromsidei], NULL,
+ conn->pipe[fromsidei][1], NULL,
+ c->tcp.pipe_size,
+ SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+ while (readlen < 0 && errno == EINTR);
+
+ if (readlen < 0 && errno != EAGAIN)
+ goto close;
+
flow_trace(conn, "%zi from read-side call", readlen);
- if (readlen < 0) {
- if (errno == EINTR)
- goto retry;
- if (errno != EAGAIN)
- goto close;
- } else if (!readlen) {
+ if (!readlen) {
eof = 1;
- } else {
+ } else if (readlen > 0) {
never_read = 0;
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
@@ -543,10 +544,16 @@ retry:
conn_flag(c, conn, lowat_act_flag);
}
-eintr:
- written = splice(conn->pipe[fromsidei][0], NULL,
- conn->s[!fromsidei], NULL, c->tcp.pipe_size,
- SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+ do
+ written = splice(conn->pipe[fromsidei][0], NULL,
+ conn->s[!fromsidei], NULL,
+ c->tcp.pipe_size,
+ SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+ while (written < 0 && errno == EINTR);
+
+ if (written < 0 && errno != EAGAIN)
+ goto close;
+
flow_trace(conn, "%zi from write-side call (passed %zi)",
written, c->tcp.pipe_size);
@@ -578,12 +585,6 @@ eintr:
conn->written[fromsidei] += written > 0 ? written : 0;
if (written < 0) {
- if (errno == EINTR)
- goto eintr;
-
- if (errno != EAGAIN)
- goto close;
-
if (conn->read[fromsidei] == conn->written[fromsidei])
break;
diff --git a/tcp_vu.c b/tcp_vu.c
index 6891ed1..ebd3a1e 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -35,7 +35,7 @@
#include "vu_common.h"
#include <time.h>
-static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
+static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM];
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
static int head[VIRTQUEUE_MAX_SIZE + 1];
@@ -43,7 +43,7 @@ static int head[VIRTQUEUE_MAX_SIZE + 1];
* tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP)
* @v6: Set for IPv6 packet
*
- * Return: Return the size of the header
+ * Return: return the size of the header
*/
static size_t tcp_vu_hdrlen(bool v6)
{
@@ -171,21 +171,23 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
/** tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers
* @c: Execution context
+ * @vq: virtqueue to use to receive data
* @conn: Connection pointer
* @v6: Set for IPv6 connections
* @already_sent: Number of bytes already sent
* @fillsize: Maximum bytes to fill in guest-side receiving window
* @iov_cnt: number of iov (output)
+ * @head_cnt: Pointer to store the count of head iov entries (output)
*
- * Return: Number of iov entries used to store the data or negative error code
+ * Return: number of bytes received from the socket, or a negative error code
+ * on failure.
*/
-static ssize_t tcp_vu_sock_recv(const struct ctx *c,
+static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
const struct tcp_tap_conn *conn, bool v6,
uint32_t already_sent, size_t fillsize,
int *iov_cnt, int *head_cnt)
{
- struct vu_dev *vdev = c->vdev;
- struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+ const struct vu_dev *vdev = c->vdev;
struct msghdr mh_sock = { 0 };
uint16_t mss = MSS_GET(conn);
int s = conn->sock;
@@ -198,7 +200,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
hdrlen = tcp_vu_hdrlen(v6);
- vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
+ vu_init_elem(elem, &iov_vu[DISCARD_IOV_NUM], VIRTQUEUE_MAX_SIZE);
elem_cnt = 0;
*head_cnt = 0;
@@ -226,16 +228,9 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
elem_cnt += cnt;
}
- if (peek_offset_cap) {
- mh_sock.msg_iov = iov_vu + 1;
- mh_sock.msg_iovlen = elem_cnt;
- } else {
- iov_vu[0].iov_base = tcp_buf_discard;
- iov_vu[0].iov_len = already_sent;
-
- mh_sock.msg_iov = iov_vu;
- mh_sock.msg_iovlen = elem_cnt + 1;
- }
+ if (tcp_prepare_iov(&mh_sock, iov_vu, already_sent, elem_cnt))
+ /* Expect caller to do a TCP reset */
+ return -1;
do
ret = recvmsg(s, &mh_sock, MSG_PEEK);
@@ -349,7 +344,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
* @c: Execution context
* @conn: Connection pointer
*
- * Return: Negative on connection reset, 0 otherwise
+ * Return: negative on connection reset, 0 otherwise
*/
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
@@ -376,7 +371,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
conn->seq_ack_from_tap, conn->seq_to_tap);
conn->seq_to_tap = conn->seq_ack_from_tap;
already_sent = 0;
- if (tcp_set_peek_offset(conn->sock, 0)) {
+ if (tcp_set_peek_offset(conn, 0)) {
tcp_rst(c, conn);
return -1;
}
@@ -396,7 +391,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
/* collect the buffers from vhost-user and fill them with the
* data from the socket
*/
- len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize,
+ len = tcp_vu_sock_recv(c, vq, conn, v6, already_sent, fillsize,
&iov_cnt, &head_cnt);
if (len < 0) {
if (len != -EAGAIN && len != -EWOULDBLOCK) {
diff --git a/test/.gitignore b/test/.gitignore
index 3573444..9412f0d 100644
--- a/test/.gitignore
+++ b/test/.gitignore
@@ -11,3 +11,5 @@ nstool
rampstream
guest-key
guest-key.pub
+/exeter/
+*.bats
diff --git a/test/Makefile b/test/Makefile
index bf63db8..4938827 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -5,6 +5,8 @@
# Copyright Red Hat
# Author: David Gibson <david@gibson.dropbear.id.au>
+BATS = bats -j $(shell nproc)
+EXETOOL = exeter/exetool/exetool
WGET = wget -c
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
@@ -13,7 +15,7 @@ DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
debian-10-generic-ppc64el-20220911-1135.qcow2 \
debian-11-nocloud-amd64.qcow2 \
debian-11-generic-arm64.qcow2 \
- debian-11-generic-ppc64el.qcow2 \
+ debian-11-generic-ppc64el-20250703-2162.qcow2 \
debian-sid-nocloud-amd64-daily.qcow2 \
debian-sid-nocloud-arm64-daily.qcow2 \
debian-sid-nocloud-ppc64el-daily.qcow2
@@ -50,18 +52,24 @@ UBUNTU_NEW_IMGS = xenial-server-cloudimg-powerpc-disk1.img \
jammy-server-cloudimg-s390x.img
UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)
-DOWNLOAD_ASSETS = mbuto podman \
+DOWNLOAD_ASSETS = exeter mbuto podman \
$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
TESTDATA_ASSETS = small.bin big.bin medium.bin \
rampstream
LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \
$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
$(UBUNTU_NEW_IMGS:%=prepared-%) \
- nstool guest-key guest-key.pub \
+ nstool guest-key guest-key.pub $(EXETOOL) \
$(TESTDATA_ASSETS)
ASSETS = $(DOWNLOAD_ASSETS) $(LOCAL_ASSETS)
+EXETER_PYPATH = exeter/py3
+EXETER_BATS = smoke/smoke.sh.bats \
+ build/build.py.bats build/static_checkers.sh.bats
+BATS_FILES = $(EXETER_BATS) \
+ podman/test/system/505-networking-pasta.bats
+
CFLAGS = -Wall -Werror -Wextra -pedantic -std=c99
assets: $(ASSETS)
@@ -70,6 +78,11 @@ assets: $(ASSETS)
pull-%: %
git -C $* pull
+exeter:
+ git clone https://gitlab.com/dgibson/exeter.git
+
+exeter/exetool/exetool: pull-exeter
+
mbuto:
git clone git://mbuto.sh/mbuto
@@ -115,6 +128,12 @@ medium.bin:
big.bin:
dd if=/dev/urandom bs=1M count=10 of=$@
+$(EXETER_BATS): %.bats: % $(EXETOOL)
+ PYTHONPATH=$(EXETER_PYPATH) $(EXETOOL) bats -- $< > $@
+
+bats: $(BATS_FILES) pull-podman
+ PYTHONPATH=$(EXETER_PYPATH) CONTAINERS_HELPER_BINARY_DIR=.. $(BATS) $(BATS_FILES)
+
check: assets
./run
@@ -124,6 +143,7 @@ debug: assets
clean:
rm -f perf.js *~
rm -f $(LOCAL_ASSETS)
+ rm -f $(EXETER_BATS)
rm -rf test_logs
rm -f prepared-*.qcow2 prepared-*.img
@@ -149,6 +169,9 @@ debian-11-nocloud-%.qcow2:
debian-11-generic-%.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-generic-$*.qcow2
+debian-11-generic-ppc64el-20250703-2162.qcow2:
+ $(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/20250703-2162/debian-11-generic-ppc64el-20250703-2162.qcow2
+
debian-sid-nocloud-%-daily.qcow2:
$(WGET) -O $@ https://cloud.debian.org/images/cloud/sid/daily/latest/debian-sid-nocloud-$*-daily.qcow2
diff --git a/test/build/all b/test/build/all
deleted file mode 100644
index 1f79e0d..0000000
--- a/test/build/all
+++ /dev/null
@@ -1,61 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-# for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-# for network namespace/tap device mode
-#
-# test/build/all - Build targets, one by one, then all together, check output
-#
-# Copyright (c) 2021 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-htools make cc rm uname getconf mkdir cp rm man
-
-test Build passt
-host make clean
-check ! [ -e passt ]
-host CFLAGS="-Werror" make passt
-check [ -f passt ]
-
-test Build pasta
-host make clean
-check ! [ -e pasta ]
-host CFLAGS="-Werror" make pasta
-check [ -h pasta ]
-
-test Build qrap
-host make clean
-check ! [ -e qrap ]
-host CFLAGS="-Werror" make qrap
-check [ -f qrap ]
-
-test Build all
-host make clean
-check ! [ -e passt ]
-check ! [ -e pasta ]
-check ! [ -e qrap ]
-host CFLAGS="-Werror" make
-check [ -f passt ]
-check [ -h pasta ]
-check [ -f qrap ]
-
-test Install
-host mkdir __STATEDIR__/prefix
-host prefix=__STATEDIR__/prefix make install
-check [ -f __STATEDIR__/prefix/bin/passt ]
-check [ -h __STATEDIR__/prefix/bin/pasta ]
-check [ -f __STATEDIR__/prefix/bin/qrap ]
-check man -M __STATEDIR__/prefix/share/man -W passt
-check man -M __STATEDIR__/prefix/share/man -W pasta
-check man -M __STATEDIR__/prefix/share/man -W qrap
-
-test Uninstall
-host prefix=__STATEDIR__/prefix make uninstall
-check ! [ -f __STATEDIR__/prefix/bin/passt ]
-check ! [ -h __STATEDIR__/prefix/bin/pasta ]
-check ! [ -f __STATEDIR__/prefix/bin/qrap ]
-check ! man -M __STATEDIR__/prefix/share/man -W passt 2>/dev/null
-check ! man -M __STATEDIR__/prefix/share/man -W pasta 2>/dev/null
-check ! man -M __STATEDIR__/prefix/share/man -W qrap 2>/dev/null
diff --git a/test/build/build.py b/test/build/build.py
new file mode 100755
index 0000000..e49287c
--- /dev/null
+++ b/test/build/build.py
@@ -0,0 +1,109 @@
+#! /usr/bin/env python3
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/build/build.py - Test build and install targets
+#
+# Copyright Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
+
+import contextlib
+import os
+from pathlib import Path
+import subprocess
+import tempfile
+from typing import Iterable, Iterator
+
+import exeter
+
+def sh(cmd):
+ """Run given command in a shell"""
+ subprocess.run(cmd, shell=True)
+
+
+@contextlib.contextmanager
+def clone_sources() -> Iterator[str]:
+ """Create a temporary copy of the passt sources.
+
+ When the context enters create a temporary directory and copy the
+ passt sources into it. Clean it up when the context exits.
+ """
+
+ os.chdir('..') # Move from test/ to repo base
+ with tempfile.TemporaryDirectory(ignore_cleanup_errors=False) as tmpdir:
+ sh(f"cp --parents -d $(git ls-files) {tmpdir}")
+ os.chdir(tmpdir)
+ yield tmpdir
+
+
+def test_make(target: str, expected_files: list[str]) -> None:
+ """Test `make {target}`
+
+ Arguments:
+ target -- make target to invoke
+ expected_files -- files make is expected to create
+
+ Verifies that
+ 1) `make target` completes successfully
+    2) expected_files are created by `make target`
+ 3) expected_files are removed by `make clean`
+ """
+
+ ex_paths = [Path(f) for f in expected_files]
+ with clone_sources():
+ for p in ex_paths:
+ assert not p.exists(), f"{p} existed before make"
+ sh(f'make {target} CFLAGS="-Werror"')
+ for p in ex_paths:
+ assert p.exists(), f"{p} wasn't made"
+ sh('make clean')
+ for p in ex_paths:
+ assert not p.exists(), f"{p} existed after make clean"
+
+
+exeter.register('make_passt', test_make, 'passt', ['passt'])
+exeter.register('make_pasta', test_make, 'pasta', ['pasta'])
+exeter.register('make_qrap', test_make, 'qrap', ['qrap'])
+exeter.register('make_all', test_make, 'all', ['passt', 'pasta', 'qrap'])
+
+
+@exeter.test
+def test_install_uninstall() -> None:
+ """Test `make install` and `make uninstall`
+
+ Tests that `make install` installs the expected files to the
+ install prefix, and that `make uninstall` removes them again.
+ """
+
+ with clone_sources():
+ with tempfile.TemporaryDirectory(ignore_cleanup_errors=False) \
+ as prefix:
+ bindir = Path(prefix) / 'bin'
+ mandir = Path(prefix) / 'share/man'
+ progs = ['passt', 'pasta', 'qrap']
+
+ # Install
+ sh(f'make install CFLAGS="-Werror" prefix={prefix}')
+
+ for prog in progs:
+ exe = bindir / prog
+ assert exe.is_file(), f"{exe} does not exist as a regular file"
+ sh(f'man -M {mandir} -W {prog}')
+
+ # Uninstall
+ sh(f'make uninstall prefix={prefix}')
+
+ for prog in progs:
+ exe = bindir / prog
+ assert not exe.exists(), f"{exe} exists after uninstall"
+ sh(f'! man -M {mandir} -W {prog}')
+
+
+if __name__ == '__main__':
+ exeter.main()
diff --git a/test/build/clang_tidy b/test/build/clang_tidy
deleted file mode 100644
index 40573bf..0000000
--- a/test/build/clang_tidy
+++ /dev/null
@@ -1,17 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-# for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-# for network namespace/tap device mode
-#
-# test/build/clang_tidy - Run source through clang-tidy(1) linter
-#
-# Copyright (c) 2021 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-htools clang-tidy
-
-test Run clang-tidy
-host make clang-tidy
diff --git a/test/build/cppcheck b/test/build/cppcheck
deleted file mode 100644
index 0e1dbce..0000000
--- a/test/build/cppcheck
+++ /dev/null
@@ -1,17 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-# for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-# for network namespace/tap device mode
-#
-# test/build/cppcheck - Run source through cppcheck(1) linter
-#
-# Copyright (c) 2021 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-htools cppcheck
-
-test Run cppcheck
-host make cppcheck
diff --git a/test/build/static_checkers.sh b/test/build/static_checkers.sh
new file mode 100755
index 0000000..42806e7
--- /dev/null
+++ b/test/build/static_checkers.sh
@@ -0,0 +1,26 @@
+#! /bin/sh
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/build/static_checkers.sh - Run static checkers
+#
+# Copyright Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
+
+. $(dirname $0)/../exeter/sh/exeter.sh
+
+exeter_register cppcheck make -C .. cppcheck
+exeter_set_description cppcheck "passt sources pass cppcheck"
+
+exeter_register clang_tidy make -C .. clang-tidy
+exeter_set_description clang_tidy "passt sources pass clang-tidy"
+
+exeter_main "$@"
+
+
diff --git a/test/lib/exeter b/test/lib/exeter
new file mode 100644
index 0000000..3b19bea
--- /dev/null
+++ b/test/lib/exeter
@@ -0,0 +1,58 @@
+#!/bin/sh
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/lib/exeter - Run exeter tests within the rest of passt's tests
+#
+# Copyright Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
+
+EXETOOL="$BASEPATH/exeter/exetool/exetool"
+
+# is_exeter() - Determine if a test file is an exeter program
+# $@: Command line to invoke test program
+is_exeter() {
+ $EXETOOL probe -- "$@"
+}
+
+# exeter() - Run each test in an exeter program, logging each test separately
+# $@: Command line to invoke exeter test program
+exeter() {
+ STATESETUP="${STATEBASE}/$1"
+ mkdir -p "${STATESETUP}"
+
+ context_setup_host host
+ layout_host
+
+ cd test
+
+ __ntests=$($EXETOOL list -- "$@" | wc -l)
+ if [ $? != 0 ]; then
+ info "Failed to get exeter manifest for $@"
+ pause_continue \
+ "Press any key to pause test session" \
+ "Resuming in " \
+ "Paused, press any key to continue" \
+ 5
+ return
+ fi
+
+ status_file_start "$* (exeter)" ${__ntests}
+ [ ${CI} -eq 1 ] && video_link "${1}"
+
+ for __testid in $($EXETOOL list -- "$@"); do
+ __desc="$($EXETOOL desc -- "$@" -- ${__testid})"
+ status_test_start "${__desc}"
+ context_run host "$@" "${__testid}" && status_test_ok || status_test_fail
+ done
+
+ cd ..
+
+ teardown_context_watch ${PANE_HOST} host
+}
diff --git a/test/lib/setup b/test/lib/setup
index 575bc21..5994598 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -350,7 +350,7 @@ setup_migrate() {
sleep 1
- __opts="--vhost-user"
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
@@ -360,7 +360,7 @@ setup_migrate() {
context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
- __opts="--vhost-user"
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
diff --git a/test/lib/term b/test/lib/term
index ed690de..089364c 100755
--- a/test/lib/term
+++ b/test/lib/term
@@ -19,6 +19,7 @@ STATUS_FILE_INDEX=0
STATUS_COLS=
STATUS_PASS=0
STATUS_FAIL=0
+STATUS_SKIPPED=0
PR_RED='\033[1;31m'
PR_GREEN='\033[1;32m'
@@ -439,19 +440,21 @@ info_layout() {
# status_test_ok() - Update counter of passed tests, log and display message
status_test_ok() {
STATUS_PASS=$((STATUS_PASS + 1))
- tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
+ tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
info_passed
}
# status_test_fail() - Update counter of failed tests, log and display message
status_test_fail() {
STATUS_FAIL=$((STATUS_FAIL + 1))
- tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
+ tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
info_failed
}
# status_test_skip() - Update counter of skipped tests, log and display message
status_test_skip() {
+ STATUS_SKIPPED=$((STATUS_SKIPPED + 1))
+ tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
info_skipped
}
diff --git a/test/lib/test b/test/lib/test
index 758250a..7349674 100755
--- a/test/lib/test
+++ b/test/lib/test
@@ -20,10 +20,7 @@ test_iperf3s() {
__sctx="${1}"
__port="${2}"
- pane_or_context_run_bg "${__sctx}" \
- 'iperf3 -s -p'${__port}' & echo $! > s.pid' \
-
- sleep 1 # Wait for server to be ready
+ pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid'
}
# test_iperf3k() - Kill iperf3 server
@@ -31,7 +28,7 @@ test_iperf3s() {
test_iperf3k() {
__sctx="${1}"
- pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
+ pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)'
sleep 1 # Wait for kernel to free up ports
}
diff --git a/test/passt.mbuto b/test/passt.mbuto
index 5e00132..176cf3f 100755
--- a/test/passt.mbuto
+++ b/test/passt.mbuto
@@ -28,7 +28,10 @@ KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}"
LINKS="${LINKS:-
ash,dash,bash /init
- ash,dash,bash /bin/sh}"
+ ash,dash,bash /bin/sh
+ sshd /usr/sbin/sshd
+ dhclient /usr/sbin/dhclient
+ sysctl /usr/sbin/sysctl}"
DIRS="${DIRS} /tmp /usr/sbin /usr/share /var/log /var/lib /etc/ssh /run/sshd /root/.ssh"
diff --git a/test/pasta_options/log_to_file b/test/pasta_options/log_to_file
index 3ead06c..db78b04 100644
--- a/test/pasta_options/log_to_file
+++ b/test/pasta_options/log_to_file
@@ -30,19 +30,19 @@ endef
test Log creation
-set PORTS -t 10001,10002 -u 10001,10002
+set PORTS -t 10001,10002 -u 10001,10002 -T none -U none
set LOG_FILE __STATEDIR__/pasta.log
-passt ./pasta -l __LOG_FILE__ -- /bin/true
+passt ./pasta __PORTS__ -l __LOG_FILE__ -- /bin/true
check [ -s __LOG_FILE__ ]
test Log truncated on creation
-passt ./pasta -l __LOG_FILE__ -- /bin/true & wait
+passt ./pasta __PORTS__ -l __LOG_FILE__ -- /bin/true & wait
pout PID2 echo $!
check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$'
test Maximum log size
-passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done'
+passtb ./pasta __PORTS__ --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done'
sleep 1
flood_log_client
@@ -67,7 +67,7 @@ passt unshare -rUm
passt mkdir __STATEDIR__/t
passt mount -t tmpfs none __STATEDIR__/t
set LOG_FILE __STATEDIR__/t/log
-passt ./pasta --config-net -d -l __LOG_FILE__ --log-size $((100 * 1024))
+passt ./pasta __PORTS__ --config-net -d -l __LOG_FILE__ --log-size $((100 * 1024))
flood_log_server
flood_log_client
diff --git a/test/run b/test/run
index 4e86f30..f858e55 100755
--- a/test/run
+++ b/test/run
@@ -43,6 +43,9 @@ KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"}
COMMIT="$(git log --oneline --no-decorate -1)"
+# Let exeter tests written in Python find their modules
+export PYTHONPATH=${BASEPATH}/exeter/py3
+
. lib/util
. lib/context
. lib/setup
@@ -53,6 +56,7 @@ COMMIT="$(git log --oneline --no-decorate -1)"
. lib/layout_ugly
. lib/test
. lib/video
+. lib/exeter
# cleanup() - Remove temporary files
cleanup() {
@@ -67,11 +71,9 @@ run() {
perf_init
[ ${CI} -eq 1 ] && video_start ci
- setup build
- test build/all
- test build/cppcheck
- test build/clang_tidy
- teardown build
+ exeter smoke/smoke.sh
+ exeter build/build.py
+ exeter build/static_checkers.sh
setup pasta
test pasta/ndp
@@ -202,7 +204,7 @@ skip_distro() {
perf_finish
[ ${CI} -eq 1 ] && video_stop
- log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
+ log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"
pause_continue \
"Press any key to keep test session open" \
@@ -223,6 +225,10 @@ run_selected() {
__setup=
for __test; do
+ if is_exeter "test/${__test}"; then
+ exeter "${__test}"
+ continue
+ fi
# HACK: the migrate tests need the setup repeated for
# each test
if [ "${__test%%/*}" != "${__setup}" -o \
@@ -234,9 +240,9 @@ run_selected() {
test "${__test}"
done
- teardown "${__setup}"
+ [ -n "${__setup}" ] && teardown "${__setup}"
- log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
+ log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"
pause_continue \
"Press any key to keep test session open" \
@@ -307,4 +313,4 @@ fi
tail -n1 ${LOGFILE}
echo "Log at ${LOGFILE}"
-exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\)$/\1/p')
+exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\),.*$/\1/p')
diff --git a/test/smoke/smoke.sh b/test/smoke/smoke.sh
new file mode 100755
index 0000000..a642fb9
--- /dev/null
+++ b/test/smoke/smoke.sh
@@ -0,0 +1,33 @@
+#! /bin/sh
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/smoke/smoke.sh - Basic smoke tests
+#
+# Copyright Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
+
+. $(dirname $0)/../exeter/sh/exeter.sh
+
+PASST=$(dirname $0)/../../passt
+PASTA=$(dirname $0)/../../pasta
+
+exeter_register passt_version $PASST --version
+exeter_set_description passt_version "Check passt --version works"
+
+exeter_register pasta_version $PASTA --version
+exeter_set_description pasta_version "Check pasta --version works"
+
+exeter_register passt_help $PASST --help
+exeter_set_description passt_help "Check passt --help works"
+
+exeter_register pasta_help $PASTA --help
+exeter_set_description pasta_help "Check pasta --help works"
+
+exeter_main "$@"
diff --git a/udp.c b/udp.c
index 80520cb..86585b7 100644
--- a/udp.c
+++ b/udp.c
@@ -39,27 +39,30 @@
* could receive packets from multiple flows, so we use a hash table match to
* find the specific flow for a datagram.
*
- * When a UDP flow is initiated from a listening socket we take a duplicate of
- * the socket and store it in uflow->s[INISIDE]. This will last for the
- * lifetime of the flow, even if the original listening socket is closed due to
- * port auto-probing. The duplicate is used to deliver replies back to the
- * originating side.
+ * Flow sockets
+ * ============
*
- * Reply sockets
- * =============
- *
- * When a UDP flow targets a socket, we create a "reply" socket in
+ * When a UDP flow targets a socket, we create a "flow" socket in
* uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
* replies on the target side. This socket is both bound and connected and has
- * EPOLL_TYPE_UDP_REPLY. The connect() means it will only receive datagrams
+ * EPOLL_TYPE_UDP. The connect() means it will only receive datagrams
* associated with this flow, so the epoll reference directly points to the flow
* and we don't need a hash lookup.
*
- * NOTE: it's possible that the reply socket could have a bound address
- * overlapping with an unrelated listening socket. We assume datagrams for the
- * flow will come to the reply socket in preference to a listening socket. The
- * sample program doc/platform-requirements/reuseaddr-priority.c documents and
- * tests that assumption.
+ * When a flow is initiated from a listening socket, we create a "flow" socket
+ * with the same bound address as the listening socket, but also connect()ed to
+ * the flow's peer. This is stored in uflow->s[INISIDE] and will last for the
+ * lifetime of the flow, even if the original listening socket is closed due to
+ * port auto-probing. The duplicate is used to deliver replies back to the
+ * originating side.
+ *
+ * NOTE: A flow socket can have a bound address overlapping with a listening
+ * socket. That will happen naturally for flows initiated from a socket, but is
+ * also possible (though unlikely) for tap initiated flows, depending on the
+ * source port. We assume datagrams for the flow will come to a connect()ed
+ * socket in preference to a listening socket. The sample program
+ * doc/platform-requirements/reuseaddr-priority.c documents and tests that
+ * assumption.
*
* "Spliced" flows
* ===============
@@ -71,8 +74,7 @@
* actually used; it doesn't make sense for datagrams and instead a pair of
* recvmmsg() and sendmmsg() is used to forward the datagrams.
*
- * Note that a spliced flow will have *both* a duplicated listening socket and a
- * reply socket (see above).
+ * Note that a spliced flow will have two flow sockets (see above).
*/
#include <sched.h>
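
The "flow socket" scheme above relies on ordinary UDP socket semantics: a socket that is both bound and connected only receives datagrams from its connected peer, and the kernel prefers it over a plain listening socket bound to the same address. A minimal standalone sketch (not passt code; IPv4 only, hypothetical helper name) of creating such a socket:

#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

/* Sketch: open a UDP socket bound to the same local address as a listening
 * socket, and connected to the flow's peer, so it only sees this flow.
 */
static int flow_socket_sketch(const struct sockaddr_in *local,
			      const struct sockaddr_in *peer)
{
	int one = 1;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return -1;

	/* Allow sharing the local address with the listening socket */
	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	if (bind(s, (const struct sockaddr *)local, sizeof(*local)) < 0 ||
	    connect(s, (const struct sockaddr *)peer, sizeof(*peer)) < 0) {
		close(s);
		return -1;
	}

	/* From here on, recv() on s only returns datagrams from *peer */
	return s;
}

The "kernel prefers the connected socket" part is exactly the assumption that doc/platform-requirements/reuseaddr-priority.c exercises.
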
@@ -114,6 +116,8 @@
#include "udp_internal.h"
#include "udp_vu.h"
+#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
+
/* Maximum UDP data to be returned in ICMP messages */
#define ICMP4_MAX_DLEN 8
#define ICMP6_MAX_DLEN (IPV6_MIN_MTU \
@@ -136,26 +140,31 @@ static struct ethhdr udp4_eth_hdr;
static struct ethhdr udp6_eth_hdr;
/**
- * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets
+ * struct udp_meta_t - Pre-cooked headers for UDP packets
* @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
* @taph: Tap backend specific header
- * @s_in: Source socket address, filled in by recvmmsg()
- * @tosidx: sidx for the destination side of this datagram's flow
*/
static struct udp_meta_t {
struct ipv6hdr ip6h;
struct iphdr ip4h;
struct tap_hdr taph;
-
- union sockaddr_inany s_in;
- flow_sidx_t tosidx;
}
#ifdef __AVX2__
__attribute__ ((aligned(32)))
#endif
udp_meta[UDP_MAX_FRAMES];
+#define PKTINFO_SPACE \
+ MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \
+ CMSG_SPACE(sizeof(struct in6_pktinfo)))
+
+#define RECVERR_SPACE \
+ MAX(CMSG_SPACE(sizeof(struct sock_extended_err) + \
+ sizeof(struct sockaddr_in)), \
+ CMSG_SPACE(sizeof(struct sock_extended_err) + \
+ sizeof(struct sockaddr_in6)))
+
/**
* enum udp_iov_idx - Indices for the buffers making up a single UDP frame
* @UDP_IOV_TAP tap specific header
@@ -232,8 +241,6 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
tiov[UDP_IOV_PAYLOAD].iov_base = payload;
- mh->msg_name = &meta->s_in;
- mh->msg_namelen = sizeof(meta->s_in);
mh->msg_iov = siov;
mh->msg_iovlen = 1;
}
@@ -254,41 +261,6 @@ static void udp_iov_init(const struct ctx *c)
}
/**
- * udp_splice_prepare() - Prepare one datagram for splicing
- * @mmh: Receiving mmsghdr array
- * @idx: Index of the datagram to prepare
- */
-static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx)
-{
- udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len;
-}
-
-/**
- * udp_splice_send() - Send a batch of datagrams from socket to socket
- * @c: Execution context
- * @start: Index of batch's first datagram in udp[46]_l2_buf
- * @n: Number of datagrams in batch
- * @src: Source port for datagram (target side)
- * @dst: Destination port for datagrams (target side)
- * @ref: epoll reference for origin socket
- * @now: Timestamp
- */
-static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
- flow_sidx_t tosidx)
-{
- const struct flowside *toside = flowside_at_sidx(tosidx);
- const struct udp_flow *uflow = udp_at_sidx(tosidx);
- uint8_t topif = pif_at_sidx(tosidx);
- int s = uflow->s[tosidx.sidei];
- socklen_t sl;
-
- pif_sockaddr(c, &udp_splice_to, &sl, topif,
- &toside->eaddr, toside->eport);
-
- sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL);
-}
-
-/**
* udp_update_hdr4() - Update headers for one IPv4 datagram
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
* @bp: Pointer to udp_payload_t to update
@@ -411,7 +383,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
}
/**
- * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer
+ * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
* @c: Execution context
* @ee: Extended error descriptor
* @toside: Destination side of flow
@@ -419,11 +391,11 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
* @in: First bytes (max 8) of original UDP message body
* @dlen: Length of the read part of original UDP message body
*/
-static void udp_send_conn_fail_icmp4(const struct ctx *c,
- const struct sock_extended_err *ee,
- const struct flowside *toside,
- struct in_addr saddr,
- const void *in, size_t dlen)
+static void udp_send_tap_icmp4(const struct ctx *c,
+ const struct sock_extended_err *ee,
+ const struct flowside *toside,
+ struct in_addr saddr,
+ const void *in, size_t dlen)
{
struct in_addr oaddr = toside->oaddr.v4mapped.a4;
struct in_addr eaddr = toside->eaddr.v4mapped.a4;
@@ -455,7 +427,7 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
/**
- * udp_send_conn_fail_icmp6() - Construct and send ICMPv6 to local peer
+ * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
* @c: Execution context
* @ee: Extended error descriptor
* @toside: Destination side of flow
@@ -464,11 +436,11 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
* @dlen: Length of the read part of original UDP message body
* @flow: IPv6 flow identifier
*/
-static void udp_send_conn_fail_icmp6(const struct ctx *c,
- const struct sock_extended_err *ee,
- const struct flowside *toside,
- const struct in6_addr *saddr,
- void *in, size_t dlen, uint32_t flow)
+static void udp_send_tap_icmp6(const struct ctx *c,
+ const struct sock_extended_err *ee,
+ const struct flowside *toside,
+ const struct in6_addr *saddr,
+ void *in, size_t dlen, uint32_t flow)
{
const struct in6_addr *oaddr = &toside->oaddr.a6;
const struct in6_addr *eaddr = &toside->eaddr.a6;
@@ -499,35 +471,83 @@ static void udp_send_conn_fail_icmp6(const struct ctx *c,
}
/**
+ * udp_pktinfo() - Retrieve packet destination address from cmsg
+ * @msg: msghdr into which message has been received
+ * @dst: (Local) destination address of message in @msg (output)
+ *
+ * Return: 0 on success, -1 if the information was missing (@dst is set to
+ * inany_any6).
+ */
+static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
+{
+ struct cmsghdr *hdr;
+
+ for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) {
+ if (hdr->cmsg_level == IPPROTO_IP &&
+ hdr->cmsg_type == IP_PKTINFO) {
+ const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr);
+
+ *dst = inany_from_v4(i4->ipi_addr);
+ return 0;
+ }
+
+ if (hdr->cmsg_level == IPPROTO_IPV6 &&
+ hdr->cmsg_type == IPV6_PKTINFO) {
+ const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr);
+
+ dst->a6 = i6->ipi6_addr;
+ return 0;
+ }
+ }
+
+ debug("Missing PKTINFO cmsg on datagram");
+ *dst = inany_any6;
+ return -1;
+}
+
+/**
* udp_sock_recverr() - Receive and clear an error from a socket
* @c: Execution context
- * @ref: epoll reference
+ * @s: Socket to receive errors from
+ * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif: Interface on which the error occurred
+ * (only used if @sidx == FLOW_SIDX_NONE)
+ * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
*
* Return: 1 if error received and processed, 0 if no more errors in queue, < 0
* if there was an error reading the queue
*
* #syscalls recvmsg
*/
-static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
+static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
+ uint8_t pif, in_port_t port)
{
+ char buf[PKTINFO_SPACE + RECVERR_SPACE];
const struct sock_extended_err *ee;
- const struct cmsghdr *hdr;
- union sockaddr_inany saddr;
- char buf[CMSG_SPACE(sizeof(*ee))];
char data[ICMP6_MAX_DLEN];
- int s = ref.fd;
+ struct cmsghdr *hdr;
struct iovec iov = {
.iov_base = data,
.iov_len = sizeof(data)
};
+ union sockaddr_inany src;
struct msghdr mh = {
- .msg_name = &saddr,
- .msg_namelen = sizeof(saddr),
+ .msg_name = &src,
+ .msg_namelen = sizeof(src),
.msg_iov = &iov,
.msg_iovlen = 1,
.msg_control = buf,
.msg_controllen = sizeof(buf),
};
+ const struct flowside *fromside, *toside;
+ union inany_addr offender, otap;
+ char astr[INANY_ADDRSTRLEN];
+ char sastr[SOCKADDR_STRLEN];
+ const struct in_addr *o4;
+ in_port_t offender_port;
+ struct udp_flow *uflow;
+ uint8_t topif;
+ size_t dlen;
ssize_t rc;
rc = recvmsg(s, &mh, MSG_ERRQUEUE);
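
udp_pktinfo() above reads the datagram's local destination address from ancillary data; that only works if IP_PKTINFO / IPV6_RECVPKTINFO has been enabled on the socket (see the sock_l4_sa() change in util.c below). A rough IPv4-only sketch of the same Linux-specific technique, with a hypothetical helper name:

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Sketch: receive one datagram and report which local address it was sent
 * to, assuming setsockopt(s, IPPROTO_IP, IP_PKTINFO, ...) was done earlier.
 */
static ssize_t recv_with_dst(int s, void *buf, size_t len, struct in_addr *dst)
{
	char cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr mh = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *hdr;
	ssize_t rc = recvmsg(s, &mh, 0);

	if (rc < 0)
		return rc;

	for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
		if (hdr->cmsg_level == IPPROTO_IP &&
		    hdr->cmsg_type == IP_PKTINFO) {
			struct in_pktinfo pi;

			memcpy(&pi, CMSG_DATA(hdr), sizeof(pi));
			*dst = pi.ipi_addr;
			break;
		}
	}

	return rc;
}
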
@@ -544,61 +564,111 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
return -1;
}
- hdr = CMSG_FIRSTHDR(&mh);
- if (!((hdr->cmsg_level == IPPROTO_IP &&
- hdr->cmsg_type == IP_RECVERR) ||
- (hdr->cmsg_level == IPPROTO_IPV6 &&
- hdr->cmsg_type == IPV6_RECVERR))) {
- err("Unexpected cmsg reading error queue");
+ for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
+ if ((hdr->cmsg_level == IPPROTO_IP &&
+ hdr->cmsg_type == IP_RECVERR) ||
+ (hdr->cmsg_level == IPPROTO_IPV6 &&
+ hdr->cmsg_type == IPV6_RECVERR))
+ break;
+ }
+
+ if (!hdr) {
+ err("Missing RECVERR cmsg in error queue");
return -1;
}
ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
- if (ref.type == EPOLL_TYPE_UDP_REPLY) {
- flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
- const struct flowside *toside = flowside_at_sidx(sidx);
- size_t dlen = rc;
-
- if (hdr->cmsg_level == IPPROTO_IP) {
- dlen = MIN(dlen, ICMP4_MAX_DLEN);
- udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
- data, dlen);
- } else if (hdr->cmsg_level == IPPROTO_IPV6) {
- udp_send_conn_fail_icmp6(c, ee, toside,
- &saddr.sa6.sin6_addr,
- data, dlen, sidx.flowi);
+
+ debug("%s error on UDP socket %i: %s",
+ str_ee_origin(ee), s, strerror_(ee->ee_errno));
+
+ if (!flow_sidx_valid(sidx)) {
+ /* No hint from the socket, determine flow from addresses */
+ union inany_addr dst;
+
+ if (udp_pktinfo(&mh, &dst) < 0) {
+ debug("Missing PKTINFO on UDP error");
+ return 1;
+ }
+
+ sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port);
+ if (!flow_sidx_valid(sidx)) {
+ debug("Ignoring UDP error without flow");
+ return 1;
}
} else {
- trace("Ignoring received IP_RECVERR cmsg on listener socket");
+ pif = pif_at_sidx(sidx);
}
- debug("%s error on UDP socket %i: %s",
- str_ee_origin(ee), s, strerror_(ee->ee_errno));
+ uflow = udp_at_sidx(sidx);
+ ASSERT(uflow);
+ fromside = &uflow->f.side[sidx.sidei];
+ toside = &uflow->f.side[!sidx.sidei];
+ topif = uflow->f.pif[!sidx.sidei];
+ dlen = rc;
+
+ if (inany_from_sockaddr(&offender, &offender_port,
+ SO_EE_OFFENDER(ee)) < 0)
+ goto fail;
+
+ if (pif != PIF_HOST || topif != PIF_TAP)
+ /* XXX Can we support any other cases? */
+ goto fail;
+
+ /* If the offender *is* the endpoint, make sure our translation is
+ * consistent with the flow's translation. This matters if the flow
+ * endpoint has a port specific translation (like --dns-match).
+ */
+ if (inany_equals(&offender, &fromside->eaddr))
+ otap = toside->oaddr;
+ else if (!nat_inbound(c, &offender, &otap))
+ goto fail;
+
+ if (hdr->cmsg_level == IPPROTO_IP &&
+ (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) {
+ dlen = MIN(dlen, ICMP4_MAX_DLEN);
+ udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
+ return 1;
+ }
+
+ if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
+ udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen,
+ FLOW_IDX(uflow));
+ return 1;
+ }
+
+fail:
+ flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s",
+ str_ee_origin(ee),
+ pif_name(pif),
+ sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)),
+ pif_name(topif),
+ inany_ntop(&toside->eaddr, astr, sizeof(astr)));
return 1;
}
/**
* udp_sock_errs() - Process errors on a socket
* @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
+ * @s: Socket to receive errors from
+ * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif: Interface on which the error occurred
+ * (only used if @sidx == FLOW_SIDX_NONE)
+ * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
*
- * Return: Number of errors handled, or < 0 if we have an unrecoverable error
+ * Return: number of errors handled, or < 0 if we have an unrecoverable error
*/
-int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
+static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx,
+ uint8_t pif, in_port_t port)
{
unsigned n_err = 0;
socklen_t errlen;
- int s = ref.fd;
int rc, err;
ASSERT(!c->no_udp);
- if (!(events & EPOLLERR))
- return 0; /* Nothing to do */
-
/* Empty the error queue */
- while ((rc = udp_sock_recverr(c, ref)) > 0)
+ while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0)
n_err += rc;
if (rc < 0)
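
The reworked udp_sock_recverr() is built on the Linux socket error queue: with IP_RECVERR / IPV6_RECVERR set, failed transmissions come back through recvmsg(MSG_ERRQUEUE) together with a sock_extended_err control message and the address of the node that reported the error. A trimmed, IPv4-only sketch of the mechanism (illustrative only, hypothetical helper name):

#include <linux/errqueue.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Sketch: drain one entry from the error queue of a UDP socket on which
 * IP_RECVERR was enabled, and print who reported the error.
 */
static int drain_one_error(int s)
{
	char cbuf[CMSG_SPACE(sizeof(struct sock_extended_err) +
			     sizeof(struct sockaddr_in))];
	char data[8];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct sockaddr_in src;
	struct msghdr mh = {
		.msg_name = &src,
		.msg_namelen = sizeof(src),
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *hdr;

	if (recvmsg(s, &mh, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
		return 0;	/* queue empty, or a real failure */

	for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
		if (hdr->cmsg_level == IPPROTO_IP &&
		    hdr->cmsg_type == IP_RECVERR) {
			struct sock_extended_err *ee = (void *)CMSG_DATA(hdr);
			struct sockaddr *offender = SO_EE_OFFENDER(ee);

			printf("errno %d, origin %d, offender family %d\n",
			       ee->ee_errno, ee->ee_origin,
			       offender->sa_family);
		}
	}

	return 1;
}
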
@@ -626,36 +696,61 @@ int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
}
/**
+ * udp_peek_addr() - Get source address for next packet
+ * @s: Socket to get information from
+ * @src: Socket address (output)
+ * @dst: (Local) destination address (output)
+ *
+ * Return: 0 if no more packets, 1 on success, -ve error code on error
+ */
+static int udp_peek_addr(int s, union sockaddr_inany *src,
+ union inany_addr *dst)
+{
+ char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN];
+ char cmsg[PKTINFO_SPACE];
+ struct msghdr msg = {
+ .msg_name = src,
+ .msg_namelen = sizeof(*src),
+ .msg_control = cmsg,
+ .msg_controllen = sizeof(cmsg),
+ };
+ int rc;
+
+ rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
+ if (rc < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ return 0;
+ return -errno;
+ }
+
+ udp_pktinfo(&msg, dst);
+
+ trace("Peeked UDP datagram: %s -> %s",
+ sockaddr_ntop(src, sastr, sizeof(sastr)),
+ inany_ntop(dst, dstr, sizeof(dstr)));
+
+ return 1;
+}
+
+/**
* udp_sock_recv() - Receive datagrams from a socket
* @c: Execution context
* @s: Socket to receive from
- * @events: epoll events bitmap
- * @mmh mmsghdr array to receive into
+ * @mmh: mmsghdr array to receive into
+ * @n: Maximum number of datagrams to receive
*
- * Return: Number of datagrams received
+ * Return: number of datagrams received
*
* #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
*/
-static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
- struct mmsghdr *mmh)
+static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
{
- /* For not entirely clear reasons (data locality?) pasta gets better
- * throughput if we receive tap datagrams one at a atime. For small
- * splice datagrams throughput is slightly better if we do batch, but
- * it's slightly worse for large splice datagrams. Since we don't know
- * before we receive whether we'll use tap or splice, always go one at a
- * time for pasta mode.
- */
- int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
-
ASSERT(!c->no_udp);
- if (!(events & EPOLLIN))
- return 0;
-
n = recvmmsg(s, mmh, n, 0, NULL);
if (n < 0) {
- err_perror("Error receiving datagrams");
+ trace("Error receiving datagrams: %s", strerror_(errno));
+ /* Bail out and let the EPOLLERR handler deal with it */
return 0;
}
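
udp_peek_addr() leans on MSG_PEEK: it reads the source address (and, via the PKTINFO cmsg, the local destination) of the next queued datagram without consuming it, so the flow can be resolved before choosing how to actually receive the data. The core of the trick as a standalone IPv4 sketch (hypothetical helper name):

#include <errno.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Sketch: look at the source address of the next datagram queued on s
 * without dequeuing it.
 */
static int peek_src(int s, struct sockaddr_in *src)
{
	struct msghdr mh = {
		.msg_name = src,
		.msg_namelen = sizeof(*src),
	};

	if (recvmsg(s, &mh, MSG_PEEK | MSG_DONTWAIT) < 0) {
		if (errno == EAGAIN || errno == EWOULDBLOCK)
			return 0;	/* nothing queued */
		return -errno;
	}

	return 1;	/* *src is filled in; the datagram is still queued */
}
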
@@ -663,78 +758,121 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
}
/**
- * udp_buf_listen_sock_handler() - Handle new data from socket
+ * udp_sock_to_sock() - Forward datagrams from socket to socket
* @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
+ * @from_s: Socket to receive datagrams from
+ * @n: Maximum number of datagrams to forward
+ * @tosidx: Flow & side to forward datagrams to
*
- * #syscalls recvmmsg
+ * #syscalls sendmmsg
*/
-static void udp_buf_listen_sock_handler(const struct ctx *c,
- union epoll_ref ref, uint32_t events,
- const struct timespec *now)
+static void udp_sock_to_sock(const struct ctx *c, int from_s, int n,
+ flow_sidx_t tosidx)
{
- const socklen_t sasize = sizeof(udp_meta[0].s_in);
- int n, i;
+ const struct flowside *toside = flowside_at_sidx(tosidx);
+ const struct udp_flow *uflow = udp_at_sidx(tosidx);
+ uint8_t topif = pif_at_sidx(tosidx);
+ int to_s = uflow->s[tosidx.sidei];
+ socklen_t sl;
+ int i;
- if (udp_sock_errs(c, ref, events) < 0) {
- err("UDP: Unrecoverable error on listening socket:"
- " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
- /* FIXME: what now? close/re-open socket? */
+ if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0)
return;
+
+ for (i = 0; i < n; i++) {
+ udp_mh_splice[i].msg_hdr.msg_iov->iov_len
+ = udp_mh_recv[i].msg_len;
}
- if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
+ pif_sockaddr(c, &udp_splice_to, &sl, topif,
+ &toside->eaddr, toside->eport);
+
+ sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL);
+}
+
+/**
+ * udp_buf_sock_to_tap() - Forward datagrams from socket to tap
+ * @c: Execution context
+ * @s: Socket to read data from
+ * @n: Maximum number of datagrams to forward
+ * @tosidx: Flow & side to forward data from @s to
+ */
+static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
+ flow_sidx_t tosidx)
+{
+ const struct flowside *toside = flowside_at_sidx(tosidx);
+ int i;
+
+ if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
return;
- /* We divide datagrams into batches based on how we need to send them,
- * determined by udp_meta[i].tosidx. To avoid either two passes through
- * the array, or recalculating tosidx for a single entry, we have to
- * populate it one entry *ahead* of the loop counter.
- */
- udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
- udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
- for (i = 0; i < n; ) {
- flow_sidx_t batchsidx = udp_meta[i].tosidx;
- uint8_t batchpif = pif_at_sidx(batchsidx);
- int batchstart = i;
-
- do {
- if (pif_is_socket(batchpif)) {
- udp_splice_prepare(udp_mh_recv, i);
- } else if (batchpif == PIF_TAP) {
- udp_tap_prepare(udp_mh_recv, i,
- flowside_at_sidx(batchsidx),
- false);
+ for (i = 0; i < n; i++)
+ udp_tap_prepare(udp_mh_recv, i, toside, false);
+
+ tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
+}
+
+/**
+ * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket
+ * @c: Execution context
+ * @s: Socket to forward from
+ * @frompif: Interface to which @s belongs
+ * @port: Our (local) port number of @s
+ * @now: Current timestamp
+ */
+void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+ in_port_t port, const struct timespec *now)
+{
+ union sockaddr_inany src;
+ union inany_addr dst;
+ int rc;
+
+ while ((rc = udp_peek_addr(s, &src, &dst)) != 0) {
+ bool discard = false;
+ flow_sidx_t tosidx;
+ uint8_t topif;
+
+ if (rc < 0) {
+ trace("Error peeking at socket address: %s",
+ strerror_(-rc));
+ /* Clear errors & carry on */
+ if (udp_sock_errs(c, s, FLOW_SIDX_NONE,
+ frompif, port) < 0) {
+ err(
+"UDP: Unrecoverable error on listening socket: (%s port %hu)",
+ pif_name(frompif), port);
+ /* FIXME: what now? close/re-open socket? */
}
+ continue;
+ }
+
+ tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now);
+ topif = pif_at_sidx(tosidx);
- if (++i >= n)
- break;
-
- udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
- &udp_meta[i].s_in,
- now);
- udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
- } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));
-
- if (pif_is_socket(batchpif)) {
- udp_splice_send(c, batchstart, i - batchstart,
- batchsidx);
- } else if (batchpif == PIF_TAP) {
- tap_send_frames(c, &udp_l2_iov[batchstart][0],
- UDP_NUM_IOVS, i - batchstart);
- } else if (flow_sidx_valid(batchsidx)) {
- flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx);
- struct udp_flow *uflow = udp_at_sidx(batchsidx);
+ if (pif_is_socket(topif)) {
+ udp_sock_to_sock(c, s, 1, tosidx);
+ } else if (topif == PIF_TAP) {
+ if (c->mode == MODE_VU)
+ udp_vu_sock_to_tap(c, s, 1, tosidx);
+ else
+ udp_buf_sock_to_tap(c, s, 1, tosidx);
+ } else if (flow_sidx_valid(tosidx)) {
+ struct udp_flow *uflow = udp_at_sidx(tosidx);
flow_err(uflow,
"No support for forwarding UDP from %s to %s",
- pif_name(pif_at_sidx(fromsidx)),
- pif_name(batchpif));
+ pif_name(frompif), pif_name(topif));
+ discard = true;
} else {
- debug("Discarding %d datagrams without flow",
- i - batchstart);
+ debug("Discarding datagram without flow");
+ discard = true;
+ }
+
+ if (discard) {
+ struct msghdr msg = { 0 };
+
+ if (recvmsg(s, &msg, MSG_DONTWAIT) < 0)
+ debug_perror("Failed to discard datagram");
}
}
}
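
Stripped of the flow and pif bookkeeping, the socket-to-socket path in udp_sock_to_sock() is a recvmmsg()/sendmmsg() pair: receive a batch, shrink each iovec to the received length, point every message at the target address, send the batch. An illustrative sketch only (fixed IPv4 destination, hypothetical helper name, needs _GNU_SOURCE):

#define _GNU_SOURCE
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

#define BATCH	32
#define DGRAM	65535

/* Sketch: move up to BATCH datagrams from one UDP socket to another,
 * preserving datagram boundaries. Buffers are static to keep the stack small.
 */
static void splice_batch(int from_s, int to_s, struct sockaddr_in *to)
{
	static char bufs[BATCH][DGRAM];
	struct mmsghdr mh[BATCH];
	struct iovec iov[BATCH];
	int i, n;

	memset(mh, 0, sizeof(mh));
	for (i = 0; i < BATCH; i++) {
		iov[i].iov_base = bufs[i];
		iov[i].iov_len = DGRAM;
		mh[i].msg_hdr.msg_iov = &iov[i];
		mh[i].msg_hdr.msg_iovlen = 1;
	}

	n = recvmmsg(from_s, mh, BATCH, MSG_DONTWAIT, NULL);
	if (n <= 0)
		return;

	for (i = 0; i < n; i++) {
		/* Send exactly as many bytes as each datagram carried */
		iov[i].iov_len = mh[i].msg_len;
		mh[i].msg_hdr.msg_name = to;
		mh[i].msg_hdr.msg_namelen = sizeof(*to);
	}

	sendmmsg(to_s, mh, n, MSG_NOSIGNAL);
}
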
@@ -750,87 +888,69 @@ void udp_listen_sock_handler(const struct ctx *c,
union epoll_ref ref, uint32_t events,
const struct timespec *now)
{
- if (c->mode == MODE_VU) {
- udp_vu_listen_sock_handler(c, ref, events, now);
- return;
- }
-
- udp_buf_listen_sock_handler(c, ref, events, now);
+ if (events & (EPOLLERR | EPOLLIN))
+ udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now);
}
/**
- * udp_buf_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_sock_handler() - Handle new data from flow specific socket
* @c: Execution context
* @ref: epoll reference
* @events: epoll events bitmap
* @now: Current timestamp
- *
- * #syscalls recvmmsg
*/
-static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events,
- const struct timespec *now)
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events, const struct timespec *now)
{
- flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
- const struct flowside *toside = flowside_at_sidx(tosidx);
struct udp_flow *uflow = udp_at_sidx(ref.flowside);
- uint8_t topif = pif_at_sidx(tosidx);
- int n, i, from_s;
ASSERT(!c->no_udp && uflow);
- from_s = uflow->s[ref.flowside.sidei];
-
- if (udp_sock_errs(c, ref, events) < 0) {
- flow_err(uflow, "Unrecoverable error on reply socket");
- flow_err_details(uflow);
- udp_flow_close(c, uflow);
- return;
- }
-
- if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
- return;
-
- flow_trace(uflow, "Received %d datagrams on reply socket", n);
- uflow->ts = now->tv_sec;
-
- for (i = 0; i < n; i++) {
- if (pif_is_socket(topif))
- udp_splice_prepare(udp_mh_recv, i);
- else if (topif == PIF_TAP)
- udp_tap_prepare(udp_mh_recv, i, toside, false);
- /* Restore sockaddr length clobbered by recvmsg() */
- udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
- }
-
- if (pif_is_socket(topif)) {
- udp_splice_send(c, 0, n, tosidx);
- } else if (topif == PIF_TAP) {
- tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
- } else {
- uint8_t frompif = pif_at_sidx(ref.flowside);
-
- flow_err(uflow, "No support for forwarding UDP from %s to %s",
- pif_name(frompif), pif_name(topif));
+ if (events & EPOLLERR) {
+ if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) {
+ flow_err(uflow, "Unrecoverable error on flow socket");
+ goto fail;
+ }
}
-}
-/**
- * udp_reply_sock_handler() - Handle new data from flow specific socket
- * @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
- */
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now)
-{
- if (c->mode == MODE_VU) {
- udp_vu_reply_sock_handler(c, ref, events, now);
- return;
+ if (events & EPOLLIN) {
+ /* For not entirely clear reasons (data locality?) pasta gets
+ * better throughput if we receive tap datagrams one at a
+ * time. For small splice datagrams throughput is slightly
+ * better if we do batch, but it's slightly worse for large
+ * splice datagrams. Since we don't know the size before we
+ * receive, always go one at a time for pasta mode.
+ */
+ size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
+ flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+ uint8_t topif = pif_at_sidx(tosidx);
+ int s = ref.fd;
+
+ flow_trace(uflow, "Received data on reply socket");
+ uflow->ts = now->tv_sec;
+
+ if (pif_is_socket(topif)) {
+ udp_sock_to_sock(c, ref.fd, n, tosidx);
+ } else if (topif == PIF_TAP) {
+ if (c->mode == MODE_VU) {
+ udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES,
+ tosidx);
+ } else {
+ udp_buf_sock_to_tap(c, s, n, tosidx);
+ }
+ } else {
+ flow_err(uflow,
+ "No support for forwarding UDP from %s to %s",
+ pif_name(pif_at_sidx(ref.flowside)),
+ pif_name(topif));
+ goto fail;
+ }
}
+ return;
- udp_buf_reply_sock_handler(c, ref, events, now);
+fail:
+ flow_err_details(uflow);
+ udp_flow_close(c, uflow);
}
/**
@@ -840,6 +960,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
+ * @ttl: TTL or hop limit for packets to be sent in this call
* @p: Pool of UDP packets, with UDP headers
* @idx: Index of first packet to process
* @now: Current timestamp
@@ -850,15 +971,18 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
*/
int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr,
- const struct pool *p, int idx, const struct timespec *now)
+ uint8_t ttl, const struct pool *p, int idx,
+ const struct timespec *now)
{
const struct flowside *toside;
struct mmsghdr mm[UIO_MAXIOV];
union sockaddr_inany to_sa;
struct iovec m[UIO_MAXIOV];
+ struct udphdr uh_storage;
const struct udphdr *uh;
struct udp_flow *uflow;
- int i, s, count = 0;
+ int i, j, s, count = 0;
+ struct iov_tail data;
flow_sidx_t tosidx;
in_port_t src, dst;
uint8_t topif;
@@ -866,7 +990,10 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
ASSERT(!c->no_udp);
- uh = packet_get(p, idx, 0, sizeof(*uh), NULL);
+ if (!packet_get(p, idx, &data))
+ return 1;
+
+ uh = IOV_PEEK_HEADER(&data, uh_storage);
if (!uh)
return 1;
@@ -898,28 +1025,34 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
}
toside = flowside_at_sidx(tosidx);
- s = udp_at_sidx(tosidx)->s[tosidx.sidei];
+ s = uflow->s[tosidx.sidei];
ASSERT(s >= 0);
pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport);
- for (i = 0; i < (int)p->count - idx; i++) {
- struct udphdr *uh_send;
- size_t len;
+ for (i = 0, j = 0; i < (int)p->count - idx && j < UIO_MAXIOV; i++) {
+ const struct udphdr *uh_send;
- uh_send = packet_get(p, idx + i, 0, sizeof(*uh), &len);
+ if (!packet_get(p, idx + i, &data))
+ return p->count - idx;
+
+ uh_send = IOV_REMOVE_HEADER(&data, uh_storage);
if (!uh_send)
return p->count - idx;
mm[i].msg_hdr.msg_name = &to_sa;
mm[i].msg_hdr.msg_namelen = sl;
- if (len) {
- m[i].iov_base = (char *)(uh_send + 1);
- m[i].iov_len = len;
+ if (data.cnt) {
+ int cnt;
+
+ cnt = iov_tail_clone(&m[j], UIO_MAXIOV - j, &data);
+ if (cnt < 0)
+ return p->count - idx;
- mm[i].msg_hdr.msg_iov = m + i;
- mm[i].msg_hdr.msg_iovlen = 1;
+ mm[i].msg_hdr.msg_iov = &m[j];
+ mm[i].msg_hdr.msg_iovlen = cnt;
+ j += cnt;
} else {
mm[i].msg_hdr.msg_iov = NULL;
mm[i].msg_hdr.msg_iovlen = 0;
@@ -929,6 +1062,24 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
mm[i].msg_hdr.msg_controllen = 0;
mm[i].msg_hdr.msg_flags = 0;
+ if (ttl != uflow->ttl[tosidx.sidei]) {
+ uflow->ttl[tosidx.sidei] = ttl;
+ if (af == AF_INET) {
+ if (setsockopt(s, IPPROTO_IP, IP_TTL,
+ &ttl, sizeof(ttl)) < 0)
+ flow_perror(uflow,
+ "setsockopt IP_TTL");
+ } else {
+ /* IPv6 hop_limit cannot be only 1 byte */
+ int hop_limit = ttl;
+
+ if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS,
+ &hop_limit, sizeof(hop_limit)) < 0)
+ flow_perror(uflow,
+ "setsockopt IPV6_UNICAST_HOPS");
+ }
+ }
+
count++;
}
diff --git a/udp.h b/udp.h
index de2df6d..8f8531a 100644
--- a/udp.h
+++ b/udp.h
@@ -11,11 +11,12 @@
void udp_portmap_clear(void);
void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now);
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events, const struct timespec *now);
int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr,
- const struct pool *p, int idx, const struct timespec *now);
+ uint8_t ttl, const struct pool *p, int idx,
+ const struct timespec *now);
int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
const char *ifname, in_port_t port);
int udp_init(struct ctx *c);
diff --git a/udp_flow.c b/udp_flow.c
index c6b8630..cef3fb5 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -9,10 +9,12 @@
#include <fcntl.h>
#include <sys/uio.h>
#include <unistd.h>
+#include <netinet/udp.h>
#include "util.h"
#include "passt.h"
#include "flow_table.h"
+#include "udp_internal.h"
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
@@ -34,123 +36,153 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
return &flow->udp;
}
-/*
+/**
* udp_flow_close() - Close and clean up UDP flow
* @c: Execution context
* @uflow: UDP flow
*/
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
{
+ unsigned sidei;
+
if (uflow->closed)
return; /* Nothing to do */
- if (uflow->s[INISIDE] >= 0) {
- /* The listening socket needs to stay in epoll */
- close(uflow->s[INISIDE]);
- uflow->s[INISIDE] = -1;
+ flow_foreach_sidei(sidei) {
+ flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
+ if (uflow->s[sidei] >= 0) {
+ epoll_del(c, uflow->s[sidei]);
+ close(uflow->s[sidei]);
+ uflow->s[sidei] = -1;
+ }
+ }
+
+ uflow->closed = true;
+}
+
+/**
+ * udp_flow_sock() - Create, bind and connect a flow specific UDP socket
+ * @c: Execution context
+ * @uflow: UDP flow to open socket for
+ * @sidei: Side of @uflow to open socket for
+ *
+ * Return: fd of new socket on success, -ve error code on failure
+ */
+static int udp_flow_sock(const struct ctx *c,
+ struct udp_flow *uflow, unsigned sidei)
+{
+ const struct flowside *side = &uflow->f.side[sidei];
+ uint8_t pif = uflow->f.pif[sidei];
+ union {
+ flow_sidx_t sidx;
+ uint32_t data;
+ } fref = { .sidx = FLOW_SIDX(uflow, sidei) };
+ int s;
+
+ s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data);
+ if (s < 0) {
+ flow_dbg_perror(uflow, "Couldn't open flow specific socket");
+ return s;
}
- if (uflow->s[TGTSIDE] >= 0) {
- /* But the flow specific one needs to be removed */
- epoll_del(c, uflow->s[TGTSIDE]);
- close(uflow->s[TGTSIDE]);
- uflow->s[TGTSIDE] = -1;
+ if (flowside_connect(c, s, pif, side) < 0) {
+ int rc = -errno;
+
+ epoll_del(c, s);
+ close(s);
+
+ flow_dbg_perror(uflow, "Couldn't connect flow socket");
+ return rc;
}
- flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
- if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
- flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
- uflow->closed = true;
+ /* It's possible, if unlikely, that we could receive some packets in
+ * between the bind() and connect() which may or may not be for this
+ * flow. Being UDP we could just discard them, but it's not ideal.
+ *
+ * There's also a tricky case if a bunch of datagrams for a new flow
+ * arrive in rapid succession, the first going to the original listening
+ * socket and later ones going to this new socket. If we forwarded the
+ * datagrams from the new socket immediately here they would go before
+ * the datagram which established the flow. Again, not strictly wrong
+ * for UDP, but not ideal.
+ *
+ * So, we flag that the new socket is in a transient state where it
+ * might have datagrams for a different flow queued. Before the next
+ * epoll cycle, udp_flow_defer() will flush out any such datagrams, and
+ * thereafter everything on the new socket should be strictly for this
+ * flow.
+ */
+ if (sidei)
+ uflow->flush1 = true;
+ else
+ uflow->flush0 = true;
+
+ return s;
}
/**
* udp_flow_new() - Common setup for a new UDP flow
* @c: Execution context
* @flow: Initiated flow
- * @s_ini: Initiating socket (or -1)
* @now: Timestamp
*
- * Return: UDP specific flow, if successful, NULL on failure
+ * Return: sidx for the target side of the new UDP flow, or FLOW_SIDX_NONE
+ * on failure.
+ *
+ * #syscalls getsockname
*/
static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
- int s_ini, const struct timespec *now)
+ const struct timespec *now)
{
struct udp_flow *uflow = NULL;
const struct flowside *tgt;
- uint8_t tgtpif;
+ unsigned sidei;
if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
goto cancel;
- tgtpif = flow->f.pif[TGTSIDE];
uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
uflow->ts = now->tv_sec;
uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
+ uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
- if (s_ini >= 0) {
- /* When using auto port-scanning the listening port could go
- * away, so we need to duplicate the socket
- */
- uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
- if (uflow->s[INISIDE] < 0) {
- flow_perror(uflow,
- "Couldn't duplicate listening socket");
- goto cancel;
- }
+ flow_foreach_sidei(sidei) {
+ if (pif_is_socket(uflow->f.pif[sidei]))
+ if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0)
+ goto cancel;
}
- if (pif_is_socket(tgtpif)) {
- struct mmsghdr discard[UIO_MAXIOV] = { 0 };
- union {
- flow_sidx_t sidx;
- uint32_t data;
- } fref = {
- .sidx = FLOW_SIDX(flow, TGTSIDE),
- };
- int rc;
-
- uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY,
- tgtpif, tgt, fref.data);
- if (uflow->s[TGTSIDE] < 0) {
- flow_dbg_perror(uflow,
- "Couldn't open socket for spliced flow");
+ if (uflow->s[TGTSIDE] >= 0 && inany_is_unspecified(&tgt->oaddr)) {
+ /* When we target a socket, we connect() it, but might not
+ * always bind(), leaving the kernel to pick our address. In
+ * that case connect() will implicitly bind() the socket, but we
+ * need to determine its local address so that we can match
+ * reply packets back to the correct flow. Update the flow with
+ * the information from getsockname() */
+ union sockaddr_inany sa;
+ socklen_t sl = sizeof(sa);
+ in_port_t port;
+
+ if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0 ||
+ inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
+ &port, &sa) < 0) {
+ flow_perror(uflow, "Unable to determine local address");
goto cancel;
}
-
- if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
- flow_dbg_perror(uflow, "Couldn't connect flow socket");
- goto cancel;
- }
-
- /* It's possible, if unlikely, that we could receive some
- * unrelated packets in between the bind() and connect() of this
- * socket. For now we just discard these. We could consider
- * trying to redirect these to an appropriate handler, if we
- * need to.
- */
- rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard),
- MSG_DONTWAIT, NULL);
- if (rc >= ARRAY_SIZE(discard)) {
- flow_dbg(uflow,
- "Too many (%d) spurious reply datagrams", rc);
+ if (port != tgt->oport) {
+ flow_err(uflow, "Unexpected local port");
goto cancel;
- } else if (rc > 0) {
- flow_trace(uflow,
- "Discarded %d spurious reply datagrams", rc);
- } else if (errno != EAGAIN) {
- flow_perror(uflow,
- "Unexpected error discarding datagrams");
}
}
- flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
-
- /* If the target side is a socket, it will be a reply socket that knows
- * its own flowside. But if it's tap, then we need to look it up by
- * hash.
+ /* Tap sides always need to be looked up by hash. Socket sides don't
+ * always, but sometimes do (receiving packets on a socket not specific
+ * to one flow). Unconditionally hash both sides so all our bases are
+ * covered
*/
- if (!pif_is_socket(tgtpif))
- flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
+ flow_foreach_sidei(sidei)
+ flow_hash_insert(c, FLOW_SIDX(uflow, sidei));
+
FLOW_ACTIVATE(uflow);
return FLOW_SIDX(uflow, TGTSIDE);
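
The getsockname() added to udp_flow_new() covers the case where connect() performed the implicit bind: the kernel chose the local address and port, and the flow table needs both to match replies. The underlying pattern as a small IPv4 sketch (hypothetical helper name):

#include <netinet/in.h>
#include <sys/socket.h>

/* Sketch: after connect() on an unbound UDP socket, ask the kernel which
 * local address and port it picked.
 */
static int learn_local(int s, const struct sockaddr_in *peer,
		       struct sockaddr_in *local)
{
	socklen_t sl = sizeof(*local);

	if (connect(s, (const struct sockaddr *)peer, sizeof(*peer)) < 0)
		return -1;

	/* connect() bound the socket implicitly; recover the chosen address */
	if (getsockname(s, (struct sockaddr *)local, &sl) < 0)
		return -1;

	return 0;
}
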
@@ -163,9 +195,11 @@ cancel:
}
/**
- * udp_flow_from_sock() - Find or create UDP flow for "listening" socket
+ * udp_flow_from_sock() - Find or create UDP flow for incoming datagram
* @c: Execution context
- * @ref: epoll reference of the receiving socket
+ * @pif: Interface the datagram is arriving from
+ * @dst: Our (local) address to which the datagram is arriving
+ * @port: Our (local) port number to which the datagram is arriving
* @s_in: Source socket address, filled in by recvmmsg()
* @now: Timestamp
*
@@ -174,7 +208,8 @@ cancel:
* Return: sidx for the destination side of the flow for this packet, or
* FLOW_SIDX_NONE if we couldn't find or create a flow.
*/
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
+ const union inany_addr *dst, in_port_t port,
const union sockaddr_inany *s_in,
const struct timespec *now)
{
@@ -183,9 +218,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
union flow *flow;
flow_sidx_t sidx;
- ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN);
-
- sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, s_in, ref.udp.port);
+ sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port);
if ((uflow = udp_at_sidx(sidx))) {
uflow->ts = now->tv_sec;
return flow_sidx_opposite(sidx);
@@ -195,12 +228,11 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
char sastr[SOCKADDR_STRLEN];
debug("Couldn't allocate flow for UDP datagram from %s %s",
- pif_name(ref.udp.pif),
- sockaddr_ntop(s_in, sastr, sizeof(sastr)));
+ pif_name(pif), sockaddr_ntop(s_in, sastr, sizeof(sastr)));
return FLOW_SIDX_NONE;
}
- ini = flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port);
+ ini = flow_initiate_sa(flow, pif, s_in, dst, port);
if (!inany_is_unicast(&ini->eaddr) ||
ini->eport == 0 || ini->oport == 0) {
@@ -213,7 +245,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
return FLOW_SIDX_NONE;
}
- return udp_flow_new(c, flow, ref.fd, now);
+ return udp_flow_new(c, flow, now);
}
/**
@@ -269,17 +301,45 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
return FLOW_SIDX_NONE;
}
- return udp_flow_new(c, flow, -1, now);
+ return udp_flow_new(c, flow, now);
+}
+
+/**
+ * udp_flush_flow() - Flush datagrams that might not be for this flow
+ * @c: Execution context
+ * @uflow: Flow to handle
+ * @sidei: Side of the flow to flush
+ * @now: Current timestamp
+ */
+static void udp_flush_flow(const struct ctx *c,
+ const struct udp_flow *uflow, unsigned sidei,
+ const struct timespec *now)
+{
+ /* We don't know exactly where the datagrams will come from, but we know
+ * they'll have an interface and oport matching this flow */
+ udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei],
+ uflow->f.side[sidei].oport, now);
}
/**
* udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
+ * @c: Execution context
* @uflow: Flow to handle
+ * @now: Current timestamp
*
* Return: true if the connection is ready to free, false otherwise
*/
-bool udp_flow_defer(const struct udp_flow *uflow)
+bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
+ const struct timespec *now)
{
+ if (uflow->flush0) {
+ udp_flush_flow(c, uflow, INISIDE, now);
+ uflow->flush0 = false;
+ }
+ if (uflow->flush1) {
+ udp_flush_flow(c, uflow, TGTSIDE, now);
+ uflow->flush1 = false;
+ }
return uflow->closed;
}
diff --git a/udp_flow.h b/udp_flow.h
index 9a1b059..4c528e9 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -8,9 +8,12 @@
#define UDP_FLOW_H
/**
- * struct udp - Descriptor for a flow of UDP packets
+ * struct udp_flow - Descriptor for a flow of UDP packets
* @f: Generic flow information
+ * @ttl: TTL or hop_limit, one per side of the flow
* @closed: Flow is already closed
+ * @flush0: @s[0] may have datagrams queued for other flows
+ * @flush1: @s[1] may have datagrams queued for other flows
* @ts: Activity timestamp
* @s: Socket fd (or -1) for each side of the flow
*/
@@ -18,13 +21,19 @@ struct udp_flow {
/* Must be first element */
struct flow_common f;
- bool closed :1;
+ uint8_t ttl[SIDES];
+
+ bool closed :1,
+ flush0 :1,
+ flush1 :1;
+
time_t ts;
int s[SIDES];
};
struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
+ const union inany_addr *dst, in_port_t port,
const union sockaddr_inany *s_in,
const struct timespec *now);
flow_sidx_t udp_flow_from_tap(const struct ctx *c,
@@ -33,7 +42,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
in_port_t srcport, in_port_t dstport,
const struct timespec *now);
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
-bool udp_flow_defer(const struct udp_flow *uflow);
+bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
+ const struct timespec *now);
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now);
diff --git a/udp_internal.h b/udp_internal.h
index 3b081f5..96d11cf 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -8,8 +8,6 @@
#include "tap.h" /* needed by udp_meta_t */
-#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
-
/**
* struct udp_payload_t - UDP header and data for inbound messages
* @uh: UDP header
@@ -30,5 +28,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen,
bool no_udp_csum);
-int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events);
+void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+ in_port_t port, const struct timespec *now);
+
#endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index c26a223..099677f 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -40,7 +40,7 @@ static struct vu_virtq_element elem [VIRTQUEUE_MAX_SIZE];
* udp_vu_hdrlen() - return the size of the header in level 2 frame (UDP)
* @v6: Set for IPv6 packet
*
- * Return: Return the size of the header
+ * Return: size of the header
*/
static size_t udp_vu_hdrlen(bool v6)
{
@@ -58,46 +58,25 @@ static size_t udp_vu_hdrlen(bool v6)
}
/**
- * udp_vu_sock_info() - get socket information
- * @s: Socket to get information from
- * @s_in: Socket address (output)
- *
- * Return: 0 if socket address can be read, -1 otherwise
- */
-static int udp_vu_sock_info(int s, union sockaddr_inany *s_in)
-{
- struct msghdr msg = {
- .msg_name = s_in,
- .msg_namelen = sizeof(union sockaddr_inany),
- };
-
- return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
-}
-
-/**
* udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
* @c: Execution context
+ * @vq: virtqueue to use to receive data
* @s: Socket to receive from
- * @events: epoll events bitmap
* @v6: Set for IPv6 connections
* @dlen: Size of received data (output)
*
- * Return: Number of iov entries used to store the datagram
+ * Return: number of iov entries used to store the datagram
*/
-static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
+static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s,
bool v6, ssize_t *dlen)
{
- struct vu_dev *vdev = c->vdev;
- struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+ const struct vu_dev *vdev = c->vdev;
int iov_cnt, idx, iov_used;
struct msghdr msg = { 0 };
size_t off, hdrlen;
ASSERT(!c->no_udp);
- if (!(events & EPOLLIN))
- return 0;
-
/* compute L2 header length */
hdrlen = udp_vu_hdrlen(v6);
@@ -214,125 +193,27 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used)
}
/**
- * udp_vu_listen_sock_handler() - Handle new data from socket
+ * udp_vu_sock_to_tap() - Forward datagrams from socket to tap
* @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
+ * @s: Socket to read data from
+ * @n: Maximum number of datagrams to forward
+ * @tosidx: Flow & side to forward data from @s to
*/
-void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now)
+void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx)
{
- struct vu_dev *vdev = c->vdev;
- struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
- int i;
-
- if (udp_sock_errs(c, ref, events) < 0) {
- err("UDP: Unrecoverable error on listening socket:"
- " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
- return;
- }
-
- for (i = 0; i < UDP_MAX_FRAMES; i++) {
- const struct flowside *toside;
- union sockaddr_inany s_in;
- flow_sidx_t sidx;
- uint8_t pif;
- ssize_t dlen;
- int iov_used;
- bool v6;
-
- if (udp_vu_sock_info(ref.fd, &s_in) < 0)
- break;
-
- sidx = udp_flow_from_sock(c, ref, &s_in, now);
- pif = pif_at_sidx(sidx);
-
- if (pif != PIF_TAP) {
- if (flow_sidx_valid(sidx)) {
- flow_sidx_t fromsidx = flow_sidx_opposite(sidx);
- struct udp_flow *uflow = udp_at_sidx(sidx);
-
- flow_err(uflow,
- "No support for forwarding UDP from %s to %s",
- pif_name(pif_at_sidx(fromsidx)),
- pif_name(pif));
- } else {
- debug("Discarding 1 datagram without flow");
- }
-
- continue;
- }
-
- toside = flowside_at_sidx(sidx);
-
- v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
-
- iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen);
- if (iov_used <= 0)
- break;
-
- udp_vu_prepare(c, toside, dlen);
- if (*c->pcap) {
- udp_vu_csum(toside, iov_used);
- pcap_iov(iov_vu, iov_used,
- sizeof(struct virtio_net_hdr_mrg_rxbuf));
- }
- vu_flush(vdev, vq, elem, iov_used);
- }
-}
-
-/**
- * udp_vu_reply_sock_handler() - Handle new data from flow specific socket
- * @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
- */
-void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now)
-{
- flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
const struct flowside *toside = flowside_at_sidx(tosidx);
- struct udp_flow *uflow = udp_at_sidx(ref.flowside);
- int from_s = uflow->s[ref.flowside.sidei];
+ bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
struct vu_dev *vdev = c->vdev;
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
int i;
- ASSERT(!c->no_udp);
-
- if (udp_sock_errs(c, ref, events) < 0) {
- flow_err(uflow, "Unrecoverable error on reply socket");
- flow_err_details(uflow);
- udp_flow_close(c, uflow);
- return;
- }
-
- for (i = 0; i < UDP_MAX_FRAMES; i++) {
- uint8_t topif = pif_at_sidx(tosidx);
+ for (i = 0; i < n; i++) {
ssize_t dlen;
int iov_used;
- bool v6;
-
- ASSERT(uflow);
-
- if (topif != PIF_TAP) {
- uint8_t frompif = pif_at_sidx(ref.flowside);
-
- flow_err(uflow,
- "No support for forwarding UDP from %s to %s",
- pif_name(frompif), pif_name(topif));
- continue;
- }
-
- v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
- iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen);
+ iov_used = udp_vu_sock_recv(c, vq, s, v6, &dlen);
if (iov_used <= 0)
break;
- flow_trace(uflow, "Received 1 datagram on reply socket");
- uflow->ts = now->tv_sec;
udp_vu_prepare(c, toside, dlen);
if (*c->pcap) {
diff --git a/udp_vu.h b/udp_vu.h
index ba7018d..576b0e7 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -6,8 +6,8 @@
#ifndef UDP_VU_H
#define UDP_VU_H
-void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now);
-void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now);
+void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+ const struct timespec *now);
+void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx);
+
#endif /* UDP_VU_H */
diff --git a/util.c b/util.c
index 656e86a..c492f90 100644
--- a/util.c
+++ b/util.c
@@ -34,6 +34,7 @@
#include "passt.h"
#include "packet.h"
#include "log.h"
+#include "pcap.h"
#ifdef HAS_GETRANDOM
#include <sys/random.h>
#endif
@@ -71,7 +72,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
case EPOLL_TYPE_UDP_LISTEN:
freebind = c->freebind;
/* fallthrough */
- case EPOLL_TYPE_UDP_REPLY:
+ case EPOLL_TYPE_UDP:
proto = IPPROTO_UDP;
socktype = SOCK_DGRAM | SOCK_NONBLOCK;
break;
@@ -109,11 +110,15 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
debug("Failed to set SO_REUSEADDR on socket %i", fd);
if (proto == IPPROTO_UDP) {
+ int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
+ int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
- int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
- if (setsockopt(fd, level, opt, &y, sizeof(y)))
+ if (setsockopt(fd, level, recverr, &y, sizeof(y)))
die_perror("Failed to set RECVERR on socket %i", fd);
+
+ if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
+ die_perror("Failed to set PKTINFO on socket %i", fd);
}
if (ifname && *ifname) {
@@ -360,7 +365,7 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b)
dst[i] = a[i] | b[i];
}
-/*
+/**
* ns_enter() - Enter configured user (unless already joined) and network ns
* @c: Execution context
*
@@ -495,7 +500,8 @@ int output_file_open(const char *path, int flags)
* @pidfile_fd: Open PID file descriptor
* @devnull_fd: Open file descriptor for /dev/null
*
- * Return: child PID on success, won't return on failure
+ * Return: 0 in the child process on success. The parent process exits.
+ * Does not return in either process on failure (calls _exit).
*/
int __daemon(int pidfile_fd, int devnull_fd)
{
@@ -603,7 +609,8 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
#endif
}
-/* write_all_buf() - write all of a buffer to an fd
+/**
+ * write_all_buf() - write all of a buffer to an fd
* @fd: File descriptor
* @buf: Pointer to base of buffer
* @len: Length of buffer
@@ -633,7 +640,8 @@ int write_all_buf(int fd, const void *buf, size_t len)
return 0;
}
-/* write_remainder() - write the tail of an IO vector to an fd
+/**
+ * write_remainder() - write the tail of an IO vector to an fd
* @fd: File descriptor
* @iov: IO vector
* @iovcnt: Number of entries in @iov
@@ -757,7 +765,7 @@ int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip)
* @dst: output buffer, minimum SOCKADDR_STRLEN bytes
* @size: size of buffer at @dst
*
- * Return: On success, a non-null pointer to @dst, NULL on failure
+ * Return: on success, a non-null pointer to @dst, NULL on failure
*/
const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size)
{
@@ -817,7 +825,7 @@ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size)
* @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes
* @size: Size of buffer at @dst
*
- * Return: On success, a non-null pointer to @dst, NULL on failure
+ * Return: on success, a non-null pointer to @dst, NULL on failure
*/
const char *eth_ntop(const unsigned char *mac, char *dst, size_t size)
{
@@ -834,7 +842,7 @@ const char *eth_ntop(const unsigned char *mac, char *dst, size_t size)
/** str_ee_origin() - Convert socket extended error origin to a string
* @ee: Socket extended error structure
*
- * Return: Static string describing error origin
+ * Return: static string describing error origin
*/
const char *str_ee_origin(const struct sock_extended_err *ee)
{
@@ -871,7 +879,9 @@ void close_open_files(int argc, char **argv)
errno = 0;
fd = strtol(optarg, NULL, 0);
- if (errno || fd <= STDERR_FILENO || fd > INT_MAX)
+ if (errno ||
+ (fd != STDIN_FILENO && fd <= STDERR_FILENO) ||
+ fd > INT_MAX)
die("Invalid --fd: %s", optarg);
}
} while (name != -1);
@@ -1017,3 +1027,36 @@ void encode_domain_name(char *buf, const char *domain_name)
}
p[i] = 0L;
}
+
+/**
+ * abort_with_msg() - Print error message and abort
+ * @fmt: Format string
+ * @...: Format parameters
+ */
+void abort_with_msg(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vlogmsg(true, false, LOG_CRIT, fmt, ap);
+ va_end(ap);
+
+ /* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp,
+ * but that will still get the job done.
+ */
+ abort();
+}
+
+/**
+ * fsync_pcap_and_log() - Flush pcap and log files as needed
+ *
+ * #syscalls fsync
+ */
+void fsync_pcap_and_log(void)
+{
+ if (pcap_fd != -1 && fsync(pcap_fd))
+ warn_perror("Failed to flush pcap file, it might be truncated");
+
+ if (log_file != -1)
+ (void)fsync(log_file);
+}
diff --git a/util.h b/util.h
index 0f70f4d..2a8c38f 100644
--- a/util.h
+++ b/util.h
@@ -31,9 +31,6 @@
#ifndef SECCOMP_RET_KILL_PROCESS
#define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL
#endif
-#ifndef ETH_MAX_MTU
-#define ETH_MAX_MTU USHRT_MAX
-#endif
#ifndef IP_MAX_MTU
#define IP_MAX_MTU USHRT_MAX
#endif
@@ -64,27 +61,22 @@
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
-#ifdef CPPCHECK_6936
+void abort_with_msg(const char *fmt, ...)
+ __attribute__((format(printf, 1, 2), noreturn));
+
/* Some cppcheck versions get confused by aborts inside a loop, causing
* it to give false positive uninitialised variable warnings later in
* the function, because it doesn't realise the non-initialising path
* already exited. See https://trac.cppcheck.net/ticket/13227
+ *
+ * Therefore, don't use the usual do { ... } while (0) wrapper that would
+ * force the macro to act like a single statement requiring a ';'.
*/
-#define ASSERT(expr) \
- ((expr) ? (void)0 : abort())
-#else
+#define ASSERT_WITH_MSG(expr, ...) \
+ ((expr) ? (void)0 : abort_with_msg(__VA_ARGS__))
#define ASSERT(expr) \
- do { \
- if (!(expr)) { \
- err("ASSERTION FAILED in %s (%s:%d): %s", \
- __func__, __FILE__, __LINE__, STRINGIFY(expr)); \
- /* This may actually SIGSYS, due to seccomp, \
- * but that will still get the job done \
- */ \
- abort(); \
- } \
- } while (0)
-#endif
+ ASSERT_WITH_MSG((expr), "ASSERTION FAILED in %s (%s:%d): %s", \
+ __func__, __FILE__, __LINE__, STRINGIFY(expr))
#ifdef P_tmpdir
#define TMPDIR P_tmpdir
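
With abort_with_msg() and the macros above, a call site can attach its own formatted message to an assertion instead of only getting the stringified expression. A usage sketch (hypothetical function, relying only on the definitions above plus BUFSIZ from <stdio.h>):

#include <stdio.h>

static void set_buf_size(size_t size)
{
	ASSERT(size != 0);
	ASSERT_WITH_MSG(size <= (size_t)BUFSIZ,
			"buffer size %zu exceeds limit %d", size, BUFSIZ);

	/* ... */
}
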
@@ -160,7 +152,7 @@
* ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address
* @p: Pointer to the BE value in memory
*
- * Returns: Host-order value of 32-bit BE quantity at @p
+ * Return: host-order value of 32-bit BE quantity at @p
*/
static inline uint32_t ntohl_unaligned(const void *p)
{
@@ -234,12 +226,13 @@ int read_all_buf(int fd, void *buf, size_t len);
int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip);
void close_open_files(int argc, char **argv);
bool snprintf_check(char *str, size_t size, const char *format, ...);
+void fsync_pcap_and_log(void);
/**
* af_name() - Return name of an address family
* @af: Address/protocol family (AF_INET or AF_INET6)
*
- * Returns: Name of the protocol family as a string
+ * Return: name of the protocol family as a string
*/
static inline const char *af_name(sa_family_t af)
{
@@ -379,6 +372,16 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr,
#define accept4(s, addr, addrlen, flags) \
wrap_accept4((s), (addr), (addrlen), (flags))
+static inline int wrap_getsockname(int sockfd, struct sockaddr *addr,
+/* cppcheck-suppress constParameterPointer */
+ socklen_t *addrlen)
+{
+ sa_init(addr, addrlen);
+ return getsockname(sockfd, addr, addrlen);
+}
+#define getsockname(s, addr, addrlen) \
+ wrap_getsockname((s), (addr), (addrlen))
+
#define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */
void encode_domain_name(char *buf, const char *domain_name);
diff --git a/vhost_user.c b/vhost_user.c
index 105f77a..fa343a8 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -137,8 +137,8 @@ static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr)
unsigned int i;
/* Find matching memory region. */
- for (i = 0; i < dev->nregions; i++) {
- const struct vu_dev_region *r = &dev->regions[i];
+ for (i = 0; i < dev->memory.nregions; i++) {
+ const struct vu_dev_region *r = &dev->memory.regions[i];
if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
@@ -183,7 +183,7 @@ static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val)
* @conn_fd: vhost-user command socket
* @vmsg: vhost-user message
*
- * Return: 0 if recvmsg() has been interrupted or if there's no data to read,
+ * Return: 0 if recvmsg() has been interrupted or if there's no data to read,
* 1 if a message has been received
*/
static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg)
@@ -302,13 +302,13 @@ static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg)
* @conn_fd: vhost-user command socket
* @vmsg: vhost-user message
*/
-static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
+static void vu_send_reply(int conn_fd, struct vhost_user_msg *vmsg)
{
- msg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
- msg->hdr.flags |= VHOST_USER_VERSION;
- msg->hdr.flags |= VHOST_USER_REPLY_MASK;
+ vmsg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
+ vmsg->hdr.flags |= VHOST_USER_VERSION;
+ vmsg->hdr.flags |= VHOST_USER_REPLY_MASK;
- vu_message_write(conn_fd, msg);
+ vu_message_write(conn_fd, vmsg);
}
/**
@@ -316,10 +316,10 @@ static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as a reply is requested
+ * Return: true as a reply is requested
*/
static bool vu_get_features_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
uint64_t features =
1ULL << VIRTIO_F_VERSION_1 |
@@ -329,9 +329,9 @@ static bool vu_get_features_exec(struct vu_dev *vdev,
(void)vdev;
- vmsg_set_reply_u64(msg, features);
+ vmsg_set_reply_u64(vmsg, features);
- debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("Sending back to guest u64: 0x%016"PRIx64, vmsg->payload.u64);
return true;
}
@@ -345,7 +345,7 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable)
{
uint16_t i;
- for (i = 0; i < VHOST_USER_MAX_QUEUES; i++)
+ for (i = 0; i < VHOST_USER_MAX_VQS; i++)
vdev->vq[i].enable = enable;
}
@@ -354,14 +354,14 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_features_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- debug("u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
- vdev->features = msg->payload.u64;
+ vdev->features = vmsg->payload.u64;
/* We only support devices conforming to VIRTIO 1.0 or
* later
*/
@@ -379,13 +379,13 @@ static bool vu_set_features_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_owner_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
(void)vdev;
- (void)msg;
+ (void)vmsg;
return false;
}
@@ -396,7 +396,7 @@ static bool vu_set_owner_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vq: Virtqueue
*
- * Return: True if ring cannot be mapped to our address space
+ * Return: true if ring cannot be mapped to our address space
*/
static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
{
@@ -418,18 +418,18 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*
* #syscalls:vu mmap|mmap2 munmap
*/
static bool vu_set_mem_table_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- struct vhost_user_memory m = msg->payload.memory, *memory = &m;
+ struct vhost_user_memory m = vmsg->payload.memory, *memory = &m;
unsigned int i;
- for (i = 0; i < vdev->nregions; i++) {
- const struct vu_dev_region *r = &vdev->regions[i];
+ for (i = 0; i < vdev->memory.nregions; i++) {
+ const struct vu_dev_region *r = &vdev->memory.regions[i];
if (r->mmap_addr) {
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
@@ -437,12 +437,12 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
r->size + r->mmap_offset);
}
}
- vdev->nregions = memory->nregions;
+ vdev->memory.nregions = memory->nregions;
debug("vhost-user nregions: %u", memory->nregions);
- for (i = 0; i < vdev->nregions; i++) {
+ for (i = 0; i < vdev->memory.nregions; i++) {
struct vhost_user_memory_region *msg_region = &memory->regions[i];
- struct vu_dev_region *dev_region = &vdev->regions[i];
+ struct vu_dev_region *dev_region = &vdev->memory.regions[i];
void *mmap_addr;
debug("vhost-user region %d", i);
@@ -465,7 +465,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
*/
mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
PROT_READ | PROT_WRITE, MAP_SHARED |
- MAP_NORESERVE, msg->fds[i], 0);
+ MAP_NORESERVE, vmsg->fds[i], 0);
if (mmap_addr == MAP_FAILED)
die_perror("vhost-user region mmap error");
@@ -474,23 +474,17 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
debug(" mmap_addr: 0x%016"PRIx64,
dev_region->mmap_addr);
- close(msg->fds[i]);
+ close(vmsg->fds[i]);
}
- for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
+ for (i = 0; i < VHOST_USER_MAX_VQS; i++) {
if (vdev->vq[i].vring.desc) {
if (map_ring(vdev, &vdev->vq[i]))
die("remapping queue %d during setmemtable", i);
}
}
- /* As vu_packet_check_range() has no access to the number of
- * memory regions, mark the end of the array with mmap_addr = 0
- */
- ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1);
- vdev->regions[vdev->nregions].mmap_addr = 0;
-
- tap_sock_update_pool(vdev->regions, 0);
+ ASSERT(vdev->memory.nregions < VHOST_USER_MAX_RAM_SLOTS);
return false;
}
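For context, a condensed sketch (not from the patch, error handling reduced to returning NULL) of the per-region mapping step vu_set_mem_table_exec() performs above: the whole fd is mapped from file offset 0 and mmap_offset is remembered separately, then the fd can be closed because the mapping keeps the guest memory referenced.

#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

/* Map one memory region fd received over the vhost-user socket */
static void *map_region_fd(int fd, uint64_t size, uint64_t mmap_offset)
{
	void *addr = mmap(NULL, size + mmap_offset, PROT_READ | PROT_WRITE,
			  MAP_SHARED | MAP_NORESERVE, fd, 0);

	close(fd);	/* the mapping keeps the guest memory alive */

	return addr == MAP_FAILED ? NULL : addr;
}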
@@ -541,7 +535,7 @@ static void vu_log_page(uint8_t *log_table, uint64_t page)
/**
* vu_log_write() - Log memory write
- * @dev: vhost-user device
+ * @vdev: vhost-user device
* @address: Memory address
* @length: Memory size
*/
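As background, a rough editorial sketch of the dirty-page logging that vu_log_page()/vu_log_write() implement: every page touched while logging is enabled gets its bit set in the shared bitmap that vu_set_log_base_exec() maps below. The 4 KiB log page size and the non-atomic update here are simplifications; the real code must update the bitmap atomically since QEMU reads it concurrently.

#include <stdint.h>

#define LOG_PAGE_SIZE	4096	/* assuming 4 KiB log pages for this sketch */

static void log_write_sketch(uint8_t *log_table, uint64_t address,
			     uint64_t length)
{
	uint64_t page, last;

	if (!length)
		return;

	page = address / LOG_PAGE_SIZE;
	last = (address + length - 1) / LOG_PAGE_SIZE;

	for (; page <= last; page++)		/* one bit per guest page */
		log_table[page / 8] |= (uint8_t)(1 << (page % 8));
}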
@@ -566,23 +560,23 @@ void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: true as a reply is requested
*
* #syscalls:vu mmap|mmap2 munmap
*/
static bool vu_set_log_base_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
uint64_t log_mmap_size, log_mmap_offset;
void *base;
int fd;
- if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log))
+ if (vmsg->fd_num != 1 || vmsg->hdr.size != sizeof(vmsg->payload.log))
die("vhost-user: Invalid log_base message");
- fd = msg->fds[0];
- log_mmap_offset = msg->payload.log.mmap_offset;
- log_mmap_size = msg->payload.log.mmap_size;
+ fd = vmsg->fds[0];
+ log_mmap_offset = vmsg->payload.log.mmap_offset;
+ log_mmap_size = vmsg->payload.log.mmap_size;
debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset);
debug("vhost-user log mmap_size: %"PRId64, log_mmap_size);
@@ -599,8 +593,8 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
vdev->log_table = base;
vdev->log_size = log_mmap_size;
- msg->hdr.size = sizeof(msg->payload.u64);
- msg->fd_num = 0;
+ vmsg->hdr.size = sizeof(vmsg->payload.u64);
+ vmsg->fd_num = 0;
return true;
}
@@ -610,18 +604,18 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_log_fd_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- if (msg->fd_num != 1)
+ if (vmsg->fd_num != 1)
die("Invalid log_fd message");
if (vdev->log_call_fd != -1)
close(vdev->log_call_fd);
- vdev->log_call_fd = msg->fds[0];
+ vdev->log_call_fd = vmsg->fds[0];
debug("Got log_call_fd: %d", vdev->log_call_fd);
@@ -633,13 +627,13 @@ static bool vu_set_log_fd_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_num_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int idx = msg->payload.state.index;
- unsigned int num = msg->payload.state.num;
+ unsigned int idx = vmsg->payload.state.index;
+ unsigned int num = vmsg->payload.state.num;
trace("State.index: %u", idx);
trace("State.num: %u", num);
@@ -653,16 +647,16 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
/* We need to copy the payload to vhost_vring_addr structure
- * to access index because address of msg->payload.addr
+ * to access index because address of vmsg->payload.addr
* can be unaligned as it is packed.
*/
- struct vhost_vring_addr addr = msg->payload.addr;
+ struct vhost_vring_addr addr = vmsg->payload.addr;
struct vu_virtq *vq = &vdev->vq[addr.index];
debug("vhost_vring_addr:");
@@ -677,7 +671,7 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
debug(" log_guest_addr: 0x%016" PRIx64,
(uint64_t)addr.log_guest_addr);
- vq->vra = msg->payload.addr;
+ vq->vra = vmsg->payload.addr;
vq->vring.flags = addr.flags;
vq->vring.log_guest_addr = addr.log_guest_addr;
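An illustration of the alignment point made in the comment above (the structure names here are hypothetical, not passt types): assigning the packed member to a local of the naturally aligned type produces a byte-wise copy the compiler handles safely, after which the fields can be accessed through properly aligned storage.

#include <stdint.h>

struct vring_addr_sketch {		/* naturally aligned */
	unsigned int index;
	uint64_t desc_user_addr;
};

struct msg_payload_sketch {
	uint8_t request;
	struct vring_addr_sketch addr;	/* misaligned inside the packed parent */
} __attribute__((packed));

static unsigned int payload_index(const struct msg_payload_sketch *p)
{
	/* Struct assignment copies regardless of alignment, so members can
	 * then be read through an aligned local instead of via &p->addr */
	struct vring_addr_sketch addr = p->addr;

	return addr.index;
}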
@@ -699,13 +693,13 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_base_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int idx = msg->payload.state.index;
- unsigned int num = msg->payload.state.num;
+ unsigned int idx = vmsg->payload.state.index;
+ unsigned int num = vmsg->payload.state.num;
debug("State.index: %u", idx);
debug("State.num: %u", num);
@@ -720,16 +714,16 @@ static bool vu_set_vring_base_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as a reply is requested
+ * Return: true as a reply is requested
*/
static bool vu_get_vring_base_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int idx = msg->payload.state.index;
+ unsigned int idx = vmsg->payload.state.index;
debug("State.index: %u", idx);
- msg->payload.state.num = vdev->vq[idx].last_avail_idx;
- msg->hdr.size = sizeof(msg->payload.state);
+ vmsg->payload.state.num = vdev->vq[idx].last_avail_idx;
+ vmsg->hdr.size = sizeof(vmsg->payload.state);
vdev->vq[idx].started = false;
vdev->vq[idx].vring.avail = 0;
@@ -771,21 +765,21 @@ static void vu_set_watch(const struct vu_dev *vdev, int idx)
* close fds if NOFD bit is set
* @vmsg: vhost-user message
*/
-static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
+static void vu_check_queue_msg_file(struct vhost_user_msg *vmsg)
{
- bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
- int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+ int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- if (idx >= VHOST_USER_MAX_QUEUES)
+ if (idx >= VHOST_USER_MAX_VQS)
die("Invalid vhost-user queue index: %u", idx);
if (nofd) {
- vmsg_close_fds(msg);
+ vmsg_close_fds(vmsg);
return;
}
- if (msg->fd_num != 1)
- die("Invalid fds in vhost-user request: %d", msg->hdr.request);
+ if (vmsg->fd_num != 1)
+ die("Invalid fds in vhost-user request: %d", vmsg->hdr.request);
}
/**
@@ -794,17 +788,17 @@ static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
- int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+ int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- debug("u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
- vu_check_queue_msg_file(msg);
+ vu_check_queue_msg_file(vmsg);
if (vdev->vq[idx].kick_fd != -1) {
epoll_del(vdev->context, vdev->vq[idx].kick_fd);
@@ -813,7 +807,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
}
if (!nofd)
- vdev->vq[idx].kick_fd = msg->fds[0];
+ vdev->vq[idx].kick_fd = vmsg->fds[0];
debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx);
@@ -834,17 +828,17 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_call_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
- int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+ int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- debug("u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
- vu_check_queue_msg_file(msg);
+ vu_check_queue_msg_file(vmsg);
if (vdev->vq[idx].call_fd != -1) {
close(vdev->vq[idx].call_fd);
@@ -852,11 +846,11 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
}
if (!nofd)
- vdev->vq[idx].call_fd = msg->fds[0];
+ vdev->vq[idx].call_fd = vmsg->fds[0];
/* in case of I/O hang after reconnecting */
if (vdev->vq[idx].call_fd != -1)
- eventfd_write(msg->fds[0], 1);
+ eventfd_write(vmsg->fds[0], 1);
debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx);
@@ -869,17 +863,17 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_err_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
- int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+ int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- debug("u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
- vu_check_queue_msg_file(msg);
+ vu_check_queue_msg_file(vmsg);
if (vdev->vq[idx].err_fd != -1) {
close(vdev->vq[idx].err_fd);
@@ -887,7 +881,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
}
if (!nofd)
- vdev->vq[idx].err_fd = msg->fds[0];
+ vdev->vq[idx].err_fd = vmsg->fds[0];
return false;
}
@@ -898,10 +892,10 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as a reply is requested
+ * Return: true as a reply is requested
*/
static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
@@ -909,7 +903,7 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
1ULL << VHOST_USER_PROTOCOL_F_RARP;
(void)vdev;
- vmsg_set_reply_u64(msg, features);
+ vmsg_set_reply_u64(vmsg, features);
return true;
}
@@ -919,16 +913,16 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- uint64_t features = msg->payload.u64;
+ uint64_t features = vmsg->payload.u64;
debug("u64: 0x%016"PRIx64, features);
- vdev->protocol_features = msg->payload.u64;
+ vdev->protocol_features = vmsg->payload.u64;
return false;
}
@@ -938,14 +932,16 @@ static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as a reply is requested
+ * Return: true as a reply is requested
*/
static bool vu_get_queue_num_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
(void)vdev;
- vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES);
+ vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_VQS / 2);
+
+ debug("VHOST_USER_MAX_VQS %u", VHOST_USER_MAX_VQS / 2);
return true;
}
@@ -955,18 +951,18 @@ static bool vu_get_queue_num_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int enable = msg->payload.state.num;
- unsigned int idx = msg->payload.state.index;
+ unsigned int enable = vmsg->payload.state.num;
+ unsigned int idx = vmsg->payload.state.index;
debug("State.index: %u", idx);
debug("State.enable: %u", enable);
- if (idx >= VHOST_USER_MAX_QUEUES)
+ if (idx >= VHOST_USER_MAX_VQS)
die("Invalid vring_enable index: %u", idx);
vdev->vq[idx].enable = enable;
@@ -974,17 +970,17 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
}
/**
- * vu_set_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
- * RARP to notify the migration is terminated",
- * but passt doesn't need to update any ARP table,
- * so do nothing to silence QEMU bogus error message
+ * vu_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
+ * RARP to notify the migration is terminated",
+ * but passt doesn't need to update any ARP table,
+ * so do nothing to silence QEMU bogus error message
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_send_rarp_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
char macstr[ETH_ADDRSTRLEN];
@@ -993,7 +989,7 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
/* ignore the command */
debug("Ignore command VHOST_USER_SEND_RARP for %s",
- eth_ntop((unsigned char *)&msg->payload.u64, macstr,
+ eth_ntop((unsigned char *)&vmsg->payload.u64, macstr,
sizeof(macstr)));
return false;
@@ -1004,16 +1000,16 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as the reply contains 0 to indicate success
+ * Return: true as the reply contains 0 to indicate success
 *         and sets bit 8 as we don't provide our own fd.
*/
static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int direction = msg->payload.transfer_state.direction;
- unsigned int phase = msg->payload.transfer_state.phase;
+ unsigned int direction = vmsg->payload.transfer_state.direction;
+ unsigned int phase = vmsg->payload.transfer_state.phase;
- if (msg->fd_num != 1)
+ if (vmsg->fd_num != 1)
die("Invalid device_state_fd message");
if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED)
@@ -1021,13 +1017,13 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE &&
direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
- die("Invalide device_state_fd direction: %d", direction);
+ die("Invalid device_state_fd direction: %d", direction);
- migrate_request(vdev->context, msg->fds[0],
+ migrate_request(vdev->context, vmsg->fds[0],
direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
/* We don't provide a new fd for the data transfer */
- vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
+ vmsg_set_reply_u64(vmsg, VHOST_USER_VRING_NOFD_MASK);
return true;
}
@@ -1037,13 +1033,13 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as the reply contains the migration result
+ * Return: true as the reply contains the migration result
*/
/* cppcheck-suppress constParameterCallback */
static bool vu_check_device_state_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- vmsg_set_reply_u64(msg, vdev->context->device_state_result);
+ vmsg_set_reply_u64(vmsg, vdev->context->device_state_result);
return true;
}
@@ -1051,7 +1047,6 @@ static bool vu_check_device_state_exec(struct vu_dev *vdev,
/**
* vu_init() - Initialize vhost-user device structure
* @c: execution context
- * @vdev: vhost-user device
*/
void vu_init(struct ctx *c)
{
@@ -1059,7 +1054,7 @@ void vu_init(struct ctx *c)
c->vdev = &vdev_storage;
c->vdev->context = c;
- for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
+ for (i = 0; i < VHOST_USER_MAX_VQS; i++) {
c->vdev->vq[i] = (struct vu_virtq){
.call_fd = -1,
.kick_fd = -1,
@@ -1082,7 +1077,7 @@ void vu_cleanup(struct vu_dev *vdev)
{
unsigned int i;
- for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
+ for (i = 0; i < VHOST_USER_MAX_VQS; i++) {
struct vu_virtq *vq = &vdev->vq[i];
vq->started = false;
@@ -1107,8 +1102,8 @@ void vu_cleanup(struct vu_dev *vdev)
vq->vring.avail = 0;
}
- for (i = 0; i < vdev->nregions; i++) {
- const struct vu_dev_region *r = &vdev->regions[i];
+ for (i = 0; i < vdev->memory.nregions; i++) {
+ const struct vu_dev_region *r = &vdev->memory.regions[i];
if (r->mmap_addr) {
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
@@ -1116,7 +1111,7 @@ void vu_cleanup(struct vu_dev *vdev)
r->size + r->mmap_offset);
}
}
- vdev->nregions = 0;
+ vdev->memory.nregions = 0;
vu_close_log(vdev);
@@ -1134,7 +1129,7 @@ static void vu_sock_reset(struct vu_dev *vdev)
}
static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
- struct vhost_user_msg *msg) = {
+ struct vhost_user_msg *vmsg) = {
[VHOST_USER_GET_FEATURES] = vu_get_features_exec,
[VHOST_USER_SET_FEATURES] = vu_set_features_exec,
[VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec,
@@ -1165,7 +1160,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
*/
void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
{
- struct vhost_user_msg msg = { 0 };
+ struct vhost_user_msg vmsg = { 0 };
bool need_reply, reply_requested;
int ret;
@@ -1174,41 +1169,46 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
return;
}
- ret = vu_message_read_default(fd, &msg);
+ ret = vu_message_read_default(fd, &vmsg);
if (ret == 0) {
vu_sock_reset(vdev);
return;
}
debug("================ Vhost user message ================");
- debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
- msg.hdr.request);
- debug("Flags: 0x%x", msg.hdr.flags);
- debug("Size: %u", msg.hdr.size);
+ debug("Request: %s (%d)", vu_request_to_string(vmsg.hdr.request),
+ vmsg.hdr.request);
+ debug("Flags: 0x%x", vmsg.hdr.flags);
+ debug("Size: %u", vmsg.hdr.size);
- need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
+ need_reply = vmsg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
- if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX &&
- vu_handle[msg.hdr.request])
- reply_requested = vu_handle[msg.hdr.request](vdev, &msg);
+ if (vmsg.hdr.request >= 0 && vmsg.hdr.request < VHOST_USER_MAX &&
+ vu_handle[vmsg.hdr.request])
+ reply_requested = vu_handle[vmsg.hdr.request](vdev, &vmsg);
else
- die("Unhandled request: %d", msg.hdr.request);
+ die("Unhandled request: %d", vmsg.hdr.request);
/* cppcheck-suppress legacyUninitvar */
if (!reply_requested && need_reply) {
- msg.payload.u64 = 0;
- msg.hdr.flags = 0;
- msg.hdr.size = sizeof(msg.payload.u64);
- msg.fd_num = 0;
+ vmsg.payload.u64 = 0;
+ vmsg.hdr.flags = 0;
+ vmsg.hdr.size = sizeof(vmsg.payload.u64);
+ vmsg.fd_num = 0;
reply_requested = true;
}
if (reply_requested)
- vu_send_reply(fd, &msg);
+ vu_send_reply(fd, &vmsg);
- if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
+ if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
vdev->context->device_state_result == 0 &&
!vdev->context->migrate_target) {
- info("Migration complete, exiting");
- _exit(EXIT_SUCCESS);
+ if (vdev->context->migrate_exit) {
+ info("Migration complete, exiting");
+ _exit(EXIT_SUCCESS);
+ }
+
+ info("Migration complete");
+ vdev->context->one_off = false;
}
}
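For readers following the control path: vu_control_handler() above reads one message, dispatches it through the vu_handle[] function-pointer table, and sends an acknowledgement when the handler produced no reply but the front-end set the NEED_REPLY flag. A trimmed-down sketch of that flow follows; the message struct and request codes here are illustrative only, not the real vhost_user_msg layout.

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

struct msg_sketch {
	unsigned int request;
	uint64_t u64;
	bool need_reply;
};

enum { REQ_GET_FEATURES, REQ_SET_OWNER, REQ_MAX };

static bool get_features(struct msg_sketch *m) { m->u64 = 1; return true; }
static bool set_owner(struct msg_sketch *m)    { (void)m;   return false; }

/* Function-pointer table indexed by request code, like vu_handle[] above */
static bool (*handle[REQ_MAX])(struct msg_sketch *m) = {
	[REQ_GET_FEATURES]	= get_features,
	[REQ_SET_OWNER]		= set_owner,
};

static void dispatch(struct msg_sketch *m)
{
	bool reply = false;

	if (m->request < REQ_MAX && handle[m->request])
		reply = handle[m->request](m);

	/* REPLY_ACK semantics: if the handler produced no reply but the
	 * front-end asked for one, acknowledge with a zero u64 */
	if (!reply && m->need_reply) {
		m->u64 = 0;
		reply = true;
	}

	if (reply)
		printf("reply u64: 0x%016" PRIx64 "\n", m->u64);
}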
diff --git a/vhost_user.h b/vhost_user.h
index 1daacd1..e806a9e 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -184,7 +184,7 @@ union vhost_user_payload {
};
/**
- * struct vhost_user_msg - vhost-use message
+ * struct vhost_user_msg - vhost-user message
* @hdr: Message header
* @payload: Message payload
* @fds: File descriptors associated with the message
@@ -217,7 +217,7 @@ struct vhost_user_msg {
})
/**
- * vu_queue_enabled - Return state of a virtqueue
+ * vu_queue_enabled() - Return state of a virtqueue
* @vq: virtqueue to check
*
 * Return: true if the virtqueue is enabled, false otherwise
@@ -228,7 +228,7 @@ static inline bool vu_queue_enabled(const struct vu_virtq *vq)
}
/**
- * vu_queue_started - Return state of a virtqueue
+ * vu_queue_started() - Return state of a virtqueue
* @vq: virtqueue to check
*
 * Return: true if the virtqueue is started, false otherwise
diff --git a/virtio.c b/virtio.c
index bc2b89a..bd388c2 100644
--- a/virtio.c
+++ b/virtio.c
@@ -102,8 +102,8 @@ static void *vu_gpa_to_va(const struct vu_dev *dev, uint64_t *plen,
return NULL;
/* Find matching memory region. */
- for (i = 0; i < dev->nregions; i++) {
- const struct vu_dev_region *r = &dev->regions[i];
+ for (i = 0; i < dev->memory.nregions; i++) {
+ const struct vu_dev_region *r = &dev->memory.regions[i];
if ((guest_addr >= r->gpa) &&
(guest_addr < (r->gpa + r->size))) {
@@ -156,9 +156,9 @@ static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i)
}
/**
- * virtq_used_event - Get location of used event indices
+ * virtq_used_event() - Get location of used event indices
* (only with VIRTIO_F_EVENT_IDX)
- * @vq Virtqueue
+ * @vq: Virtqueue
*
* Return: return the location of the used event index
*/
@@ -170,7 +170,7 @@ static inline uint16_t *virtq_used_event(const struct vu_virtq *vq)
/**
* vring_get_used_event() - Get the used event from the available ring
- * @vq Virtqueue
+ * @vq: Virtqueue
*
* Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set)
* used_event is a performant alternative where the driver
@@ -235,6 +235,7 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
memcpy(desc, orig_desc, read_len);
len -= read_len;
addr += read_len;
+ /* NOLINTNEXTLINE(bugprone-sizeof-expression,cert-arr39-c) */
desc += read_len / sizeof(struct vring_desc);
}
@@ -243,9 +244,9 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
/**
* enum virtqueue_read_desc_state - State in the descriptor chain
- * @VIRTQUEUE_READ_DESC_ERROR Found an invalid descriptor
- * @VIRTQUEUE_READ_DESC_DONE No more descriptors in the chain
- * @VIRTQUEUE_READ_DESC_MORE there are more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_ERROR: Found an invalid descriptor
+ * @VIRTQUEUE_READ_DESC_DONE: No more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_MORE: there are more descriptors in the chain
*/
enum virtqueue_read_desc_state {
VIRTQUEUE_READ_DESC_ERROR = -1,
@@ -346,8 +347,9 @@ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
die_perror("Error writing vhost-user queue eventfd");
}
-/* virtq_avail_event() - Get location of available event indices
- * (only with VIRTIO_F_EVENT_IDX)
+/**
+ * virtq_avail_event() - Get location of available event indices
+ * (only with VIRTIO_F_EVENT_IDX)
* @vq: Virtqueue
*
* Return: return the location of the available event index
@@ -420,8 +422,8 @@ static bool virtqueue_map_desc(const struct vu_dev *dev,
}
/**
- * vu_queue_map_desc - Map the virtqueue descriptor ring into our virtual
- * address space
+ * vu_queue_map_desc() - Map the virtqueue descriptor ring into our virtual
+ * address space
* @dev: Vhost-user device
* @vq: Virtqueue
* @idx: First descriptor ring entry to map
@@ -504,7 +506,7 @@ static int vu_queue_map_desc(const struct vu_dev *dev,
* vu_queue_pop() - Pop an entry from the virtqueue
* @dev: Vhost-user device
* @vq: Virtqueue
- * @elem: Virtqueue element to file with the entry information
+ * @elem: Virtqueue element to fill with the entry information
*
* Return: -1 if there is an error, 0 otherwise
*/
@@ -544,7 +546,7 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
}
/**
- * vu_queue_detach_element() - Detach an element from the virqueue
+ * vu_queue_detach_element() - Detach an element from the virtqueue
* @vq: Virtqueue
*/
void vu_queue_detach_element(struct vu_virtq *vq)
@@ -554,7 +556,7 @@ void vu_queue_detach_element(struct vu_virtq *vq)
}
/**
- * vu_queue_unpop() - Push back the previously popped element from the virqueue
+ * vu_queue_unpop() - Push back the previously popped element from the virtqueue
* @vq: Virtqueue
*/
/* cppcheck-suppress unusedFunction */
@@ -568,6 +570,8 @@ void vu_queue_unpop(struct vu_virtq *vq)
* vu_queue_rewind() - Push back a given number of popped elements
* @vq: Virtqueue
 * @num: Number of elements to unpop
+ *
+ * Return: true on success, false if not
*/
bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
{
diff --git a/virtio.h b/virtio.h
index 7a370bd..12caaa0 100644
--- a/virtio.h
+++ b/virtio.h
@@ -88,7 +88,7 @@ struct vu_dev_region {
uint64_t mmap_addr;
};
-#define VHOST_USER_MAX_QUEUES 2
+#define VHOST_USER_MAX_VQS 2
/*
* Set a reasonable maximum number of ram slots, which will be supported by
@@ -97,10 +97,21 @@ struct vu_dev_region {
#define VHOST_USER_MAX_RAM_SLOTS 32
/**
+ * struct vdev_memory - Describes the shared memory regions for a vhost-user
+ * device
+ * @nregions: Number of shared memory regions
+ * @regions: Guest shared memory regions
+ */
+struct vdev_memory {
+ uint32_t nregions;
+ struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS];
+};
+
+/**
* struct vu_dev - vhost-user device information
* @context: Execution context
- * @nregions: Number of shared memory regions
- * @regions: Guest shared memory regions
+ * @memory: Shared memory regions
+ * @vq: Virtqueues of the device
* @features: Vhost-user features
* @protocol_features: Vhost-user protocol features
* @log_call_fd: Eventfd to report logging update
@@ -109,9 +120,8 @@ struct vu_dev_region {
*/
struct vu_dev {
struct ctx *context;
- uint32_t nregions;
- struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS];
- struct vu_virtq vq[VHOST_USER_MAX_QUEUES];
+ struct vdev_memory memory;
+ struct vu_virtq vq[VHOST_USER_MAX_VQS];
uint64_t features;
uint64_t protocol_features;
int log_call_fd;
@@ -140,7 +150,7 @@ struct vu_virtq_element {
* @features: Features set
 * @fbit: Feature bit to check
*
- * Return: True if the feature bit is set
+ * Return: true if the feature bit is set
*/
static inline bool has_feature(uint64_t features, unsigned int fbit)
{
@@ -150,9 +160,9 @@ static inline bool has_feature(uint64_t features, unsigned int fbit)
/**
* vu_has_feature() - Check if a virtio-net feature is available
* @vdev: Vhost-user device
- * @bit: Feature to check
+ * @fbit: Feature to check
*
- * Return: True if the feature is available
+ * Return: true if the feature is available
*/
static inline bool vu_has_feature(const struct vu_dev *vdev,
unsigned int fbit)
@@ -163,9 +173,9 @@ static inline bool vu_has_feature(const struct vu_dev *vdev,
/**
* vu_has_protocol_feature() - Check if a vhost-user feature is available
* @vdev: Vhost-user device
- * @bit: Feature to check
+ * @fbit: Feature to check
*
- * Return: True if the feature is available
+ * Return: true if the feature is available
*/
/* cppcheck-suppress unusedFunction */
static inline bool vu_has_protocol_feature(const struct vu_dev *vdev,
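To show what the new struct vdev_memory buys in practice, here is a simplified, stand-alone sketch of the guest-address translation the regions array supports, following the same loop structure as qva_to_va() and vu_gpa_to_va() in this patch; the field set is reduced to what the sketch needs.

#include <stddef.h>
#include <stdint.h>

#define VHOST_USER_MAX_RAM_SLOTS 32

struct vu_dev_region_sketch {
	uint64_t gpa, size, qva, mmap_offset, mmap_addr;
};

struct vdev_memory_sketch {
	uint32_t nregions;
	struct vu_dev_region_sketch regions[VHOST_USER_MAX_RAM_SLOTS];
};

/* Translate a guest physical address into our virtual address space by
 * scanning the mapped regions */
static void *gpa_to_va(const struct vdev_memory_sketch *memory, uint64_t gpa)
{
	unsigned int i;

	for (i = 0; i < memory->nregions; i++) {
		const struct vu_dev_region_sketch *r = &memory->regions[i];

		if (gpa >= r->gpa && gpa < r->gpa + r->size)
			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
			return (void *)(uintptr_t)(gpa - r->gpa +
						   r->mmap_addr +
						   r->mmap_offset);
	}

	return NULL;
}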
diff --git a/vu_common.c b/vu_common.c
index 686a09b..b716070 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -25,22 +25,28 @@
/**
* vu_packet_check_range() - Check if a given memory zone is contained in
* a mapped guest memory region
- * @buf: Array of the available memory regions
+ * @memory: Array of the available memory regions
* @ptr: Start of desired data range
- * @size: Length of desired data range
+ * @len: Length of desired data range
*
* Return: 0 if the zone is in a mapped memory region, -1 otherwise
*/
-int vu_packet_check_range(void *buf, const char *ptr, size_t len)
+int vu_packet_check_range(struct vdev_memory *memory,
+ const char *ptr, size_t len)
{
- struct vu_dev_region *dev_region;
+ struct vu_dev_region *dev_region = memory->regions;
+ unsigned int i;
- for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
+ for (i = 0; i < memory->nregions; i++) {
+ uintptr_t base_addr = dev_region[i].mmap_addr +
+ dev_region[i].mmap_offset;
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
- char *m = (char *)(uintptr_t)dev_region->mmap_addr;
+ const char *base = (const char *)base_addr;
- if (m <= ptr &&
- ptr + len <= m + dev_region->mmap_offset + dev_region->size)
+ ASSERT(base_addr >= dev_region[i].mmap_addr);
+
+ if (len <= dev_region[i].size && base <= ptr &&
+ (size_t)(ptr - base) <= dev_region[i].size - len)
return 0;
}
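The rewritten check above deliberately avoids computing ptr + len, which could wrap around for hostile values; instead it compares the length against the region size and the offset of ptr within the region. A minimal sketch of that overflow-safe containment test, extracted for clarity:

#include <stdbool.h>
#include <stddef.h>

/* Is [ptr, ptr + len) fully inside [base, base + size)?  Comparing lengths
 * and offsets avoids forming ptr + len, which could overflow */
static bool range_in_region(const char *base, size_t size,
			    const char *ptr, size_t len)
{
	return len <= size && base <= ptr &&
	       (size_t)(ptr - base) <= size - len;
}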
@@ -159,7 +165,6 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
struct vu_virtq *vq = &vdev->vq[index];
- int hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
int out_sg_count;
int count;
@@ -172,6 +177,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
while (count < VIRTQUEUE_MAX_SIZE &&
out_sg_count + VU_MAX_TX_BUFFER_NB <= VIRTQUEUE_MAX_SIZE) {
int ret;
+ struct iov_tail data;
elem[count].out_num = VU_MAX_TX_BUFFER_NB;
elem[count].out_sg = &out_sg[out_sg_count];
@@ -187,25 +193,10 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
warn("virtio-net transmit queue contains no out buffers");
break;
}
- if (elem[count].out_num == 1) {
- tap_add_packet(vdev->context,
- elem[count].out_sg[0].iov_len - hdrlen,
- (char *)elem[count].out_sg[0].iov_base +
- hdrlen);
- } else {
- /* vnet header can be in a separate iovec */
- if (elem[count].out_num != 2) {
- debug("virtio-net transmit queue contains more than one buffer ([%d]: %u)",
- count, elem[count].out_num);
- } else if (elem[count].out_sg[0].iov_len != (size_t)hdrlen) {
- debug("virtio-net transmit queue entry not aligned on hdrlen ([%d]: %d != %zu)",
- count, hdrlen, elem[count].out_sg[0].iov_len);
- } else {
- tap_add_packet(vdev->context,
- elem[count].out_sg[1].iov_len,
- (char *)elem[count].out_sg[1].iov_base);
- }
- }
+
+ data = IOV_TAIL(elem[count].out_sg, elem[count].out_num, 0);
+ if (IOV_DROP_HEADER(&data, struct virtio_net_hdr_mrg_rxbuf))
+ tap_add_packet(vdev->context, &data, now);
count++;
}
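The replacement code above hands the whole scatter-gather list to tap_add_packet() after dropping the virtio-net header through an iov_tail, instead of special-casing one or two buffers. The real IOV_TAIL()/IOV_DROP_HEADER() macros are defined in iov.h elsewhere in this series and may differ in detail; the following is only a hypothetical sketch of the underlying idea, a byte-offset view over an iovec array that can skip a header even when it spans several entries.

#include <stdbool.h>
#include <stddef.h>
#include <sys/uio.h>

struct iov_tail_sketch {
	const struct iovec *iov;
	size_t cnt;
	size_t off;		/* bytes to skip from the start of iov[] */
};

/* Skip 'hdrlen' bytes (e.g. the virtio-net header); return false if the
 * buffers don't even contain that many bytes */
static bool drop_header(struct iov_tail_sketch *t, size_t hdrlen)
{
	size_t total = 0, i;

	for (i = 0; i < t->cnt; i++)
		total += t->iov[i].iov_len;

	if (total < t->off + hdrlen)
		return false;

	t->off += hdrlen;
	return true;
}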