-rw-r--r--  CONTRIBUTING.md | 156
-rw-r--r--  Makefile | 25
-rw-r--r--  README.md | 6
-rw-r--r--  arp.c | 178
-rw-r--r--  arp.h | 5
-rw-r--r--  checksum.c | 8
-rw-r--r--  conf.c | 484
-rw-r--r--  conf.h | 1
-rw-r--r--  contrib/fedora/passt.spec | 33
-rw-r--r--  contrib/selinux/passt-repair.te | 16
-rw-r--r--  contrib/selinux/passt.te | 8
-rw-r--r--  contrib/selinux/pasta.fc | 16
-rw-r--r--  contrib/selinux/pasta.te | 53
-rw-r--r--  dhcp.c | 48
-rw-r--r--  dhcp.h | 2
-rw-r--r--  dhcpv6.c | 235
-rw-r--r--  dhcpv6.h | 2
-rw-r--r--  doc/platform-requirements/.gitignore | 1
-rw-r--r--  doc/platform-requirements/Makefile | 4
-rw-r--r--  doc/platform-requirements/common.h | 1
-rw-r--r--  doc/platform-requirements/listen-vs-repair.c | 128
-rw-r--r--  doc/platform-requirements/reuseaddr-priority.c | 6
-rw-r--r--  epoll_ctl.c | 45
-rw-r--r--  epoll_ctl.h | 51
-rw-r--r--  epoll_type.h | 6
-rw-r--r--  flow.c | 278
-rw-r--r--  flow.h | 22
-rw-r--r--  flow_table.h | 4
-rw-r--r--  fwd.c | 410
-rw-r--r--  fwd.h | 19
-rw-r--r--  icmp.c | 74
-rw-r--r--  icmp.h | 4
-rw-r--r--  inany.c | 5
-rw-r--r--  inany.h | 44
-rw-r--r--  iov.c | 183
-rw-r--r--  iov.h | 58
-rw-r--r--  ip.c | 33
-rw-r--r--  ip.h | 5
-rw-r--r--  isolation.c | 13
-rw-r--r--  lineread.c | 2
-rw-r--r--  linux_dep.h | 8
-rw-r--r--  log.c | 8
-rw-r--r--  log.h | 10
-rw-r--r--  migrate.c | 10
-rw-r--r--  ndp.c | 57
-rw-r--r--  ndp.h | 6
-rw-r--r--  netlink.c | 220
-rw-r--r--  netlink.h | 4
-rw-r--r--  packet.c | 162
-rw-r--r--  packet.h | 47
-rw-r--r--  passt-repair.1 | 6
-rw-r--r--  passt-repair.c | 108
-rw-r--r--  passt.1 | 57
-rw-r--r--  passt.c | 275
-rw-r--r--  passt.h | 54
-rw-r--r--  pasta.c | 50
-rw-r--r--  pcap.c | 56
-rw-r--r--  pcap.h | 2
-rw-r--r--  pif.c | 55
-rw-r--r--  pif.h | 2
-rw-r--r--  repair.c | 76
-rw-r--r--  repair.h | 3
-rwxr-xr-x  seccomp.sh | 3
-rw-r--r--  siphash.h | 2
-rw-r--r--  tap.c | 320
-rw-r--r--  tap.h | 43
-rw-r--r--  tcp.c | 1246
-rw-r--r--  tcp.h | 17
-rw-r--r--  tcp_buf.c | 112
-rw-r--r--  tcp_conn.h | 35
-rw-r--r--  tcp_internal.h | 29
-rw-r--r--  tcp_splice.c | 82
-rw-r--r--  tcp_vu.c | 64
-rw-r--r--  test/.gitignore | 2
-rw-r--r--  test/Makefile | 93
-rw-r--r--  test/README.md | 35
-rw-r--r--  test/build/all | 61
-rwxr-xr-x  test/build/build.py | 110
-rw-r--r--  test/build/clang_tidy | 17
-rw-r--r--  test/build/cppcheck | 17
-rwxr-xr-x  test/build/static_checkers.sh | 42
-rw-r--r--  test/demo/podman | 4
-rw-r--r--  test/lib/exeter | 66
-rwxr-xr-x  test/lib/setup | 4
-rwxr-xr-x  test/lib/term | 33
-rwxr-xr-x  test/lib/test | 7
-rw-r--r--  test/memory/passt | 2
-rw-r--r--  test/migrate/basic | 4
-rw-r--r--  test/migrate/basic_fin | 4
-rw-r--r--  test/migrate/iperf3_bidir | 64
-rw-r--r--  test/migrate/iperf3_in | 64
-rw-r--r--  test/migrate/iperf3_many_out | 64
-rw-r--r--  test/migrate/iperf3_out | 64
-rw-r--r--  test/migrate/rampstream_in | 4
-rw-r--r--  test/migrate/rampstream_out | 4
-rwxr-xr-x  test/passt.mbuto | 11
-rwxr-xr-x  test/passt.mem.mbuto | 8
-rw-r--r--  test/passt/dhcp | 4
-rw-r--r--  test/passt/ndp | 6
-rw-r--r--  test/passt_in_ns/dhcp | 4
-rw-r--r--  test/pasta/dhcp | 4
-rw-r--r--  test/pasta/ndp | 6
-rw-r--r--  test/pasta_options/log_to_file | 10
-rw-r--r--  test/perf/passt_tcp | 6
-rw-r--r--  test/perf/pasta_tcp | 2
-rw-r--r--  test/perf/pasta_udp | 10
-rwxr-xr-x  test/prepare-distro-img.sh | 4
-rwxr-xr-x  test/run | 24
-rwxr-xr-x  test/smoke/smoke.sh | 33
-rw-r--r--  test/two_guests/basic | 6
-rw-r--r--  udp.c | 920
-rw-r--r--  udp.h | 18
-rw-r--r--  udp_flow.c | 247
-rw-r--r--  udp_flow.h | 18
-rw-r--r--  udp_internal.h | 6
-rw-r--r--  udp_vu.c | 153
-rw-r--r--  udp_vu.h | 8
-rw-r--r--  util.c | 385
-rw-r--r--  util.h | 68
-rw-r--r--  vhost_user.c | 333
-rw-r--r--  vhost_user.h | 6
-rw-r--r--  virtio.c | 34
-rw-r--r--  virtio.h | 32
-rw-r--r--  vu_common.c | 71
-rw-r--r--  vu_common.h | 3
125 files changed, 6086 insertions, 2780 deletions
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..64c191f
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,156 @@
+<!---
+SPDX-License-Identifier: GPL-2.0-or-later
+Copyright Red Hat
+Author: Yumei Huang <yuhuang@redhat.com>
+-->
+
+# Contributing to passt
+
+Thank you for your interest in contributing! This document explains how
+to prepare patches and participate in the email-based review process.
+
+## Workflow
+
+### 1. Clone the project
+
+ git clone git://passt.top/passt
+
+### 2. Make Changes and Commit
+
+* You can work on the master branch, or create a separate branch as below:
+
+ cd passt
+ git checkout -b my-feature-branch
+
+* Edit the source code or documentation following the
+ [Linux kernel coding style](https://www.kernel.org/doc/html/latest/process/coding-style.html).
+ When adding or editing local variable declarations, please use the reverse
+ Christmas tree as described
+ [here](https://www.kernel.org/doc/Documentation/process/maintainer-kvm-x86.rst)
+ and [here](https://hisham.hm/2018/06/16/when-listing-repeated-things-make-pyramids/).
+
+* Stage your changes:
+
+ git add <file1> <file2> ...
+
+* Commit with a message:
+
+ git commit
+
+ The message should describe your changes. See
+ [this link](https://docs.kernel.org/process/submitting-patches.html#describe-your-changes)
+  for details. Here is an example of the commit message format:
+
+ Subsystem: Brief summary
+
+ More detailed explanation if needed, wrapped at 72 chars.
+
+  `Subsystem` indicates which part of the code your change touches: for
+  example, "tcp", "test", or "doc".
+
+  If there are relevant references, add them with a "Links:" tag.
+
+  In addition, passt uses the Linux kernel's "Signed-off-by" process. If you can
+ certify the below:
+
+ Developer's Certificate of Origin 1.1
+
+ By making a contribution to this project, I certify that:
+
+ (a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+ (b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+ (c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+ (d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
+
+ Add this line:
+
+ Signed-off-by: Random J Developer <random@developer.example.org>
+
+ using your name. This will be done for you automatically if you use
+ `git commit -s`. Reverts should also include "Signed-off-by". `git
+ revert -s` does that for you.
+
+ Any further SoBs (Signed-off-by:'s) following the author's SoB are
+ from people handling and transporting the patch, but were not involved
+ in its development. SoB chains should reflect the **real** route a
+ patch took as it was propagated to the maintainers, with the first SoB
+ entry signalling primary authorship of a single author.
+
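+  Putting this together, a complete commit message could look like the
+  following (hypothetical example, placeholder values only):
+
+      tcp: Brief summary of the change
+
+      Explain what the problem is, why the change is needed and how it
+      is addressed, wrapped at 72 characters.
+
+      Links: <URL of the relevant report or discussion>
+      Signed-off-by: Random J Developer <random@developer.example.org>
+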
+### 3. Generate Patches
+
+Use `git format-patch` to generate patch(es):
+
+ git format-patch -o outgoing/ origin/master
+
+This generates numbered patch files such as 0001-...patch and 0002-...patch
+in the `outgoing` directory.
+
+Alternatively, pass a commit count with `git format-patch -<n>`. For example,
+to format just the three most recent commits:
+
+ git format-patch -3 -o outgoing/
+
+If you send a series of patches, use the `--cover-letter` option with
+`git format-patch`:
+
+    git format-patch -o outgoing/ origin/master --cover-letter
+
+This generates a cover letter alongside your patches. Edit the cover
+letter before sending it.
+
+### 4. Send Patches
+
+Use `git send-email` to send patches directly to the mailing list:
+
+ git send-email --to=passt-dev@passt.top outgoing/000*.patch
+
+If there are CCs (e.g. maintainers, reviewers), you can add them with `--cc`:
+
+ git send-email --to=passt-dev@passt.top --cc=maintainer@example.com
+ outgoing/000*.patch
+
+### 5. Responding to Review Feedback
+
+* Be open to feedback on both code and documentation.
+
+* Update your patch as needed, and regenerate patches:
+
+ git add <file1> <file2> ...
+ git commit --amend
+ git format-patch -v2 -o outgoing/ origin/master
+
+* Send the revised patches:
+
+ git send-email --to=passt-dev@passt.top outgoing/v2-000*.patch
+
+### 6. Tips and Best Practices
+
+* Keep changes focused and easy to review. See
+  [split-changes](https://docs.kernel.org/process/submitting-patches.html#split-changes)
+  on how to separate each logical change into its own patch.
+
+* Test your changes thoroughly. Refer to the
+  [test/README.md](/passt/tree/test/README.md) file for details.
+  At a minimum, run `make cppcheck` and `make clang-tidy` in addition to a
+  manual test of the specific functionality or issue at hand.
+
+* Include documentation updates if your change affects usage.
+
+Thank you for helping improve passt! Your contributions make a big difference.
diff --git a/Makefile b/Makefile
index f2ac8e5..91e037b 100644
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,7 @@ $(if $(TARGET),,$(error Failed to get target architecture))
# Get 'uname -m'-like architecture description for target
TARGET_ARCH := $(firstword $(subst -, ,$(TARGET)))
TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH))
+TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH))
TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH))
# On some systems enabling optimization also enables source fortification,
@@ -29,30 +30,30 @@ ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /
FORTIFY_FLAG := -D_FORTIFY_SOURCE=2
endif
-FLAGS := -Wall -Wextra -Wno-format-zero-length
+FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security
FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE
FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE)
FLAGS += -DVERSION=\"$(VERSION)\"
FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
-PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
- icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
- ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \
- repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \
- udp_vu.c util.c vhost_user.c virtio.c vu_common.c
+PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c epoll_ctl.c \
+ flow.c fwd.c icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c \
+ log.c mld.c ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c \
+ pif.c repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c \
+ udp_flow.c udp_vu.c util.c vhost_user.c virtio.c vu_common.c
QRAP_SRCS = qrap.c
PASST_REPAIR_SRCS = passt-repair.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
-PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
- flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
- lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
- pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \
- tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \
- udp_vu.h util.h vhost_user.h virtio.h vu_common.h
+PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h epoll_ctl.h \
+ flow.h fwd.h flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h \
+ isolation.h lineread.h log.h migrate.h ndp.h netlink.h packet.h \
+ passt.h pasta.h pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h \
+ tcp_conn.h tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h \
+ udp_internal.h udp_vu.h util.h vhost_user.h virtio.h vu_common.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
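
Among the Makefile changes above, `-Wformat-security` is added to FLAGS: it makes the compiler warn when a printf- or scanf-style function is called with a non-literal format string and no format arguments, the usual shape of format-string injection bugs. A minimal illustration, not taken from the passt sources:

    #include <stdio.h>

    /* Illustration only, not passt code: built with -Wall -Wformat-security,
     * the first call is diagnosed ("format not a string literal and no format
     * arguments") because caller-controlled data becomes the format string;
     * the equivalent second call is accepted.
     */
    void log_name(const char *name)
    {
            printf(name);           /* warning */
            printf("%s", name);     /* fine */
    }
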
diff --git a/README.md b/README.md
index 54fed07..8fdc0a3 100644
--- a/README.md
+++ b/README.md
@@ -291,7 +291,7 @@ speeding up local connections, and usually requiring NAT. _pasta_:
* ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted)
* ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached
* ✅ no external dependencies (other than a standard C library)
-* ✅ restrictive seccomp profiles (30 syscalls allowed for _passt_, 41 for
+* ✅ restrictive seccomp profiles (34 syscalls allowed for _passt_, 43 for
_pasta_ on x86_64)
* ✅ examples of [AppArmor](/passt/tree/contrib/apparmor) and
[SELinux](/passt/tree/contrib/selinux) profiles available
@@ -625,7 +625,9 @@ See also the [test logs](/builds/latest/test/).
### [Mailing Lists](/passt/lists)
* Submit, review patches, and discuss development ideas on
- [`passt-dev`](https://lists.passt.top/postorius/lists/passt-dev.passt.top/)
+ [`passt-dev`](https://lists.passt.top/postorius/lists/passt-dev.passt.top/).
+ Please refer to the [CONTRIBUTING.md](/passt/tree/CONTRIBUTING.md) file for
+ details.
* Ask your questions and discuss usage needs on
[`passt-user`](https://lists.passt.top/postorius/lists/passt-user.passt.top/)
diff --git a/arp.c b/arp.c
index fc482bb..bb042e9 100644
--- a/arp.c
+++ b/arp.c
@@ -31,56 +31,176 @@
#include "tap.h"
/**
- * arp() - Check if this is a supported ARP message, reply as needed
+ * ignore_arp() - Check if we should ignore this ARP message
* @c: Execution context
- * @p: Packet pool, single packet with Ethernet buffer
+ * @ah: ARP header
+ * @am: ARP message
*
- * Return: 1 if handled, -1 on failure
+ * Return: true if the ARP message should be ignored, false otherwise
*/
-int arp(const struct ctx *c, const struct pool *p)
+static bool ignore_arp(const struct ctx *c,
+ const struct arphdr *ah, const struct arpmsg *am)
{
- unsigned char swap[4];
- struct ethhdr *eh;
- struct arphdr *ah;
- struct arpmsg *am;
- size_t l2len;
-
- eh = packet_get(p, 0, 0, sizeof(*eh), NULL);
- ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL);
- am = packet_get(p, 0, sizeof(*eh) + sizeof(*ah), sizeof(*am), NULL);
-
- if (!eh || !ah || !am)
- return -1;
-
if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
ah->ar_pro != htons(ETH_P_IP) ||
ah->ar_hln != ETH_ALEN ||
ah->ar_pln != 4 ||
ah->ar_op != htons(ARPOP_REQUEST))
- return 1;
+ return true;
/* Discard announcements, but not 0.0.0.0 "probes" */
if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) &&
!memcmp(am->sip, am->tip, sizeof(am->sip)))
- return 1;
+ return true;
/* Don't resolve the guest's assigned address, either. */
if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
+ return true;
+
+ return false;
+}
+
+/**
+ * arp() - Check if this is a supported ARP message, reply as needed
+ * @c: Execution context
+ * @data: Single packet with Ethernet buffer
+ *
+ * Return: 1 if handled, -1 on failure
+ */
+int arp(const struct ctx *c, struct iov_tail *data)
+{
+ union inany_addr tgt;
+ struct {
+ struct ethhdr eh;
+ struct arphdr ah;
+ struct arpmsg am;
+ } __attribute__((__packed__)) resp;
+ struct arphdr ah_storage;
+ struct ethhdr eh_storage;
+ struct arpmsg am_storage;
+ const struct ethhdr *eh;
+ const struct arphdr *ah;
+ const struct arpmsg *am;
+
+ eh = IOV_REMOVE_HEADER(data, eh_storage);
+ ah = IOV_REMOVE_HEADER(data, ah_storage);
+ am = IOV_REMOVE_HEADER(data, am_storage);
+ if (!eh || !ah || !am)
+ return -1;
+
+ if (ignore_arp(c, ah, am))
return 1;
- ah->ar_op = htons(ARPOP_REPLY);
- memcpy(am->tha, am->sha, sizeof(am->tha));
- memcpy(am->sha, c->our_tap_mac, sizeof(am->sha));
+ /* Ethernet header */
+ resp.eh.h_proto = htons(ETH_P_ARP);
+ memcpy(resp.eh.h_dest, eh->h_source, sizeof(resp.eh.h_dest));
+ memcpy(resp.eh.h_source, c->our_tap_mac, sizeof(resp.eh.h_source));
- memcpy(swap, am->tip, sizeof(am->tip));
- memcpy(am->tip, am->sip, sizeof(am->tip));
- memcpy(am->sip, swap, sizeof(am->sip));
+ /* ARP header */
+ resp.ah.ar_op = htons(ARPOP_REPLY);
+ resp.ah.ar_hrd = ah->ar_hrd;
+ resp.ah.ar_pro = ah->ar_pro;
+ resp.ah.ar_hln = ah->ar_hln;
+ resp.ah.ar_pln = ah->ar_pln;
- l2len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
- memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest));
- memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
+ /* MAC address to return in ARP message */
+ inany_from_af(&tgt, AF_INET, am->tip);
+ fwd_neigh_mac_get(c, &tgt, resp.am.sha);
- tap_send_single(c, eh, l2len);
+ /* Rest of ARP message */
+ memcpy(resp.am.sip, am->tip, sizeof(resp.am.sip));
+ memcpy(resp.am.tha, am->sha, sizeof(resp.am.tha));
+ memcpy(resp.am.tip, am->sip, sizeof(resp.am.tip));
+
+ tap_send_single(c, &resp, sizeof(resp));
return 1;
}
+
+/**
+ * arp_send_init_req() - Send initial ARP request to retrieve guest MAC address
+ * @c: Execution context
+ */
+void arp_send_init_req(const struct ctx *c)
+{
+ struct {
+ struct ethhdr eh;
+ struct arphdr ah;
+ struct arpmsg am;
+ } __attribute__((__packed__)) req;
+
+ /* Ethernet header */
+ req.eh.h_proto = htons(ETH_P_ARP);
+ memcpy(req.eh.h_dest, MAC_BROADCAST, sizeof(req.eh.h_dest));
+ memcpy(req.eh.h_source, c->our_tap_mac, sizeof(req.eh.h_source));
+
+ /* ARP header */
+ req.ah.ar_op = htons(ARPOP_REQUEST);
+ req.ah.ar_hrd = htons(ARPHRD_ETHER);
+ req.ah.ar_pro = htons(ETH_P_IP);
+ req.ah.ar_hln = ETH_ALEN;
+ req.ah.ar_pln = 4;
+
+ /* ARP message */
+ memcpy(req.am.sha, c->our_tap_mac, sizeof(req.am.sha));
+ memcpy(req.am.sip, &c->ip4.our_tap_addr, sizeof(req.am.sip));
+ memcpy(req.am.tha, MAC_BROADCAST, sizeof(req.am.tha));
+ memcpy(req.am.tip, &c->ip4.addr, sizeof(req.am.tip));
+
+ debug("Sending initial ARP request for guest MAC address");
+ tap_send_single(c, &req, sizeof(req));
+}
+
+/**
+ * arp_announce() - Send an ARP announcement for an IPv4 host
+ * @c: Execution context
+ * @ip: IPv4 address we announce as owned by @mac
+ * @mac: MAC address to advertise for @ip
+ */
+void arp_announce(const struct ctx *c, struct in_addr *ip,
+ const unsigned char *mac)
+{
+ char ip_str[INET_ADDRSTRLEN];
+ char mac_str[ETH_ADDRSTRLEN];
+ struct {
+ struct ethhdr eh;
+ struct arphdr ah;
+ struct arpmsg am;
+ } __attribute__((__packed__)) msg;
+
+ if (!tap_is_ready(c))
+ return;
+
+ /* Ethernet header */
+ msg.eh.h_proto = htons(ETH_P_ARP);
+ memcpy(msg.eh.h_dest, MAC_BROADCAST, sizeof(msg.eh.h_dest));
+ memcpy(msg.eh.h_source, mac, sizeof(msg.eh.h_source));
+
+ /* ARP header */
+ msg.ah.ar_op = htons(ARPOP_REQUEST);
+ msg.ah.ar_hrd = htons(ARPHRD_ETHER);
+ msg.ah.ar_pro = htons(ETH_P_IP);
+ msg.ah.ar_hln = ETH_ALEN;
+ msg.ah.ar_pln = 4;
+
+ /* RFC5227, section 2.1.1, about Probe messages: "The client MUST fill
+ * in the 'sender hardware address' field of the ARP Request with the
+ * hardware address of the interface through which it is sending the
+ * packet. [...] The 'target hardware address' field is ignored and
+ * SHOULD be set to all zeroes."
+ *
+ * RFC5227, section 2.3: "An ARP Announcement is identical to the ARP
+ * Probe described above, except that now the sender and target IP
+ * addresses are both set to the host's newly selected IPv4 address."
+ */
+ memcpy(msg.am.sha, mac, sizeof(msg.am.sha));
+ memcpy(msg.am.sip, ip, sizeof(msg.am.sip));
+ memcpy(msg.am.tha, MAC_ZERO, sizeof(msg.am.tha));
+ memcpy(msg.am.tip, ip, sizeof(msg.am.tip));
+
+ inet_ntop(AF_INET, ip, ip_str, sizeof(ip_str));
+ eth_ntop(mac, mac_str, sizeof(mac_str));
+ debug("ARP announcement for %s / %s", ip_str, mac_str);
+
+ tap_send_single(c, &msg, sizeof(msg));
+}
diff --git a/arp.h b/arp.h
index ac5cd16..4862e90 100644
--- a/arp.h
+++ b/arp.h
@@ -20,6 +20,9 @@ struct arpmsg {
unsigned char tip[4];
} __attribute__((__packed__));
-int arp(const struct ctx *c, const struct pool *p);
+int arp(const struct ctx *c, struct iov_tail *data);
+void arp_send_init_req(const struct ctx *c);
+void arp_announce(const struct ctx *c, struct in_addr *ip,
+ const unsigned char *mac);
#endif /* ARP_H */
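
The arp() conversion above, like the dhcp() and dhcpv6 changes further below, replaces packet_get() offsets with the iov_tail accessors introduced by this series. A condensed sketch of that pattern, assuming IOV_REMOVE_HEADER(), IOV_PEEK_HEADER(), iov_drop_header() and iov_tail_size() behave as used in these hunks (consume a header, peek without consuming, skip bytes, report what is left, each failing on truncated input):

    /* Sketch only: relies on the iov_tail helpers as used in this series */
    static int parse_eth_ip4(struct iov_tail *data)
    {
            struct ethhdr eh_storage;
            struct iphdr iph_storage;
            const struct ethhdr *eh;
            const struct iphdr *iph;

            eh = IOV_REMOVE_HEADER(data, eh_storage);  /* consume L2 header */
            iph = IOV_PEEK_HEADER(data, iph_storage);  /* peek at L3 header */
            if (!eh || !iph)
                    return -1;                         /* truncated frame */

            /* skip the full IPv4 header, including any options */
            if (!iov_drop_header(data, iph->ihl * 4UL))
                    return -1;

            /* iov_tail_size(data) bytes of L4 payload are left in @data */
            return 0;
    }
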
diff --git a/checksum.c b/checksum.c
index 0894eca..0c3837c 100644
--- a/checksum.c
+++ b/checksum.c
@@ -145,7 +145,7 @@ uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol,
* @proto: Protocol number
* @saddr: Source address
* @daddr: Destination address
- * Returns: Partial checksum of the IPv4 header
+ * Return: partial checksum of the IPv4 header
*/
uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol,
struct in_addr saddr, struct in_addr daddr)
@@ -225,7 +225,7 @@ void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen)
* @proto: Protocol number
* @saddr: Source address
* @daddr: Destination address
- * Returns: Partial checksum of the IPv6 header
+ * Return: partial checksum of the IPv6 header
*/
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol,
const struct in6_addr *saddr,
@@ -452,7 +452,7 @@ less_than_128_bytes:
}
/**
- * csum_unfolded - Calculate the unfolded checksum of a data buffer.
+ * csum_unfolded() - Calculate the unfolded checksum of a data buffer.
*
* @buf: Input buffer
* @len: Input length
@@ -481,7 +481,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
}
#else /* __AVX2__ */
/**
- * csum_unfolded - Calculate the unfolded checksum of a data buffer.
+ * csum_unfolded() - Calculate the unfolded checksum of a data buffer.
*
* @buf: Input buffer
* @len: Input length
diff --git a/conf.c b/conf.c
index 065e720..2942c8c 100644
--- a/conf.c
+++ b/conf.c
@@ -16,6 +16,7 @@
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
+#include <libgen.h>
#include <string.h>
#include <sched.h>
#include <sys/types.h>
@@ -64,11 +65,11 @@
const char *pasta_default_ifn = "tap0";
/**
- * next_chunk - Return the next piece of a string delimited by a character
+ * next_chunk() - Return the next piece of a string delimited by a character
* @s: String to search
* @c: Delimiter character
*
- * Return: If another @c is found in @s, returns a pointer to the
+ * Return: if another @c is found in @s, returns a pointer to the
* character *after* the delimiter, if no further @c is in @s,
* return NULL
*/
@@ -79,7 +80,7 @@ static char *next_chunk(const char *s, char c)
}
/**
- * port_range - Represents a non-empty range of ports
+ * port_range() - Represents a non-empty range of ports
* @first: First port number in the range
* @last: Last port number in the range (inclusive)
*
@@ -124,6 +125,81 @@ static int parse_port_range(const char *s, char **endptr,
}
/**
+ * conf_ports_range_except() - Set up forwarding for a range of ports minus a
+ * bitmap of exclusions
+ * @c: Execution context
+ * @optname: Short option name, t, T, u, or U
+ * @optarg: Option argument (port specification)
+ * @fwd: Pointer to @fwd_ports to be updated
+ * @addr: Listening address
+ * @ifname: Listening interface
+ * @first: First port to forward
+ * @last: Last port to forward
+ * @exclude: Bitmap of ports to exclude
+ * @to: Port to translate @first to when forwarding
+ * @weak: Ignore errors, as long as at least one port is mapped
+ */
+static void conf_ports_range_except(const struct ctx *c, char optname,
+ const char *optarg, struct fwd_ports *fwd,
+ const union inany_addr *addr,
+ const char *ifname,
+ uint16_t first, uint16_t last,
+ const uint8_t *exclude, uint16_t to,
+ bool weak)
+{
+ bool bound_one = false;
+ unsigned i;
+ int ret;
+
+ if (first == 0) {
+ die("Can't forward port 0 for option '-%c %s'",
+ optname, optarg);
+ }
+
+ if (ifname && c->no_bindtodevice) {
+ die(
+"Device binding for '-%c %s' unsupported (requires kernel 5.7+)",
+ optname, optarg);
+ }
+
+ for (i = first; i <= last; i++) {
+ if (bitmap_isset(exclude, i))
+ continue;
+
+ if (bitmap_isset(fwd->map, i)) {
+ warn(
+"Altering mapping of already mapped port number: %s", optarg);
+ }
+
+ bitmap_set(fwd->map, i);
+ fwd->delta[i] = to - first;
+
+ if (optname == 't')
+ ret = tcp_sock_init(c, PIF_HOST, addr, ifname, i);
+ else if (optname == 'u')
+ ret = udp_sock_init(c, PIF_HOST, addr, ifname, i);
+ else
+ /* No way to check in advance for -T and -U */
+ ret = 0;
+
+ if (ret == -ENFILE || ret == -EMFILE) {
+ die("Can't open enough sockets for port specifier: %s",
+ optarg);
+ }
+
+ if (!ret) {
+ bound_one = true;
+ } else if (!weak) {
+ die("Failed to bind port %u (%s) for option '-%c %s'",
+ i, strerror_(-ret), optname, optarg);
+ }
+ }
+
+ if (!bound_one)
+ die("Failed to bind any port for '-%c %s'", optname, optarg);
+}
+
+/**
* conf_ports() - Parse port configuration options, initialise UDP/TCP sockets
* @c: Execution context
* @optname: Short option name, t, T, u, or U
@@ -135,10 +211,9 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
{
union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
char buf[BUFSIZ], *spec, *ifname = NULL, *p;
- bool exclude_only = true, bound_one = false;
uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
+ bool exclude_only = true;
unsigned i;
- int ret;
if (!strcmp(optarg, "none")) {
if (fwd->mode)
@@ -160,6 +235,12 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
if (c->mode != MODE_PASTA)
die("'auto' port forwarding is only allowed for pasta");
+ if ((optname == 'T' || optname == 'U') && c->no_bindtodevice) {
+ warn(
+"'-%c auto' enabled without unprivileged SO_BINDTODEVICE", optname);
+ warn(
+"Forwarding from addresses other than 127.0.0.1 will not work");
+ }
fwd->mode = FWD_AUTO;
return;
}
@@ -173,32 +254,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
fwd->mode = FWD_ALL;
- /* Skip port 0. It has special meaning for many socket APIs, so
- * trying to bind it is not really safe.
- */
- for (i = 1; i < NUM_PORTS; i++) {
+ /* Exclude ephemeral ports */
+ for (i = 0; i < NUM_PORTS; i++)
if (fwd_port_is_ephemeral(i))
- continue;
-
- bitmap_set(fwd->map, i);
- if (optname == 't') {
- ret = tcp_sock_init(c, NULL, NULL, i);
- if (ret == -ENFILE || ret == -EMFILE)
- goto enfile;
- if (!ret)
- bound_one = true;
- } else if (optname == 'u') {
- ret = udp_sock_init(c, 0, NULL, NULL, i);
- if (ret == -ENFILE || ret == -EMFILE)
- goto enfile;
- if (!ret)
- bound_one = true;
- }
- }
-
- if (!bound_one)
- goto bind_all_fail;
+ bitmap_set(exclude, i);
+ conf_ports_range_except(c, optname, optarg, fwd,
+ NULL, NULL,
+ 1, NUM_PORTS - 1, exclude,
+ 1, true);
return;
}
@@ -275,37 +339,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
} while ((p = next_chunk(p, ',')));
if (exclude_only) {
- /* Skip port 0. It has special meaning for many socket APIs, so
- * trying to bind it is not really safe.
- */
- for (i = 1; i < NUM_PORTS; i++) {
- if (fwd_port_is_ephemeral(i) ||
- bitmap_isset(exclude, i))
- continue;
-
- bitmap_set(fwd->map, i);
-
- if (optname == 't') {
- ret = tcp_sock_init(c, addr, ifname, i);
- if (ret == -ENFILE || ret == -EMFILE)
- goto enfile;
- if (!ret)
- bound_one = true;
- } else if (optname == 'u') {
- ret = udp_sock_init(c, 0, addr, ifname, i);
- if (ret == -ENFILE || ret == -EMFILE)
- goto enfile;
- if (!ret)
- bound_one = true;
- } else {
- /* No way to check in advance for -T and -U */
- bound_one = true;
- }
- }
-
- if (!bound_one)
- goto bind_all_fail;
+ /* Exclude ephemeral ports */
+ for (i = 0; i < NUM_PORTS; i++)
+ if (fwd_port_is_ephemeral(i))
+ bitmap_set(exclude, i);
+ conf_ports_range_except(c, optname, optarg, fwd,
+ addr, ifname,
+ 1, NUM_PORTS - 1, exclude,
+ 1, true);
return;
}
@@ -334,40 +376,18 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
if ((*p != '\0') && (*p != ',')) /* Garbage after the ranges */
goto bad;
- for (i = orig_range.first; i <= orig_range.last; i++) {
- if (bitmap_isset(fwd->map, i))
- warn(
-"Altering mapping of already mapped port number: %s", optarg);
-
- if (bitmap_isset(exclude, i))
- continue;
-
- bitmap_set(fwd->map, i);
-
- fwd->delta[i] = mapped_range.first - orig_range.first;
-
- ret = 0;
- if (optname == 't')
- ret = tcp_sock_init(c, addr, ifname, i);
- else if (optname == 'u')
- ret = udp_sock_init(c, 0, addr, ifname, i);
- if (ret)
- goto bind_fail;
- }
+ conf_ports_range_except(c, optname, optarg, fwd,
+ addr, ifname,
+ orig_range.first, orig_range.last,
+ exclude,
+ mapped_range.first, false);
} while ((p = next_chunk(p, ',')));
return;
-enfile:
- die("Can't open enough sockets for port specifier: %s", optarg);
bad:
die("Invalid port specifier %s", optarg);
mode_conflict:
die("Port forwarding mode '%s' conflicts with previous mode", optarg);
-bind_fail:
- die("Failed to bind port %u (%s) for option '-%c %s', exiting",
- i, strerror_(-ret), optname, optarg);
-bind_all_fail:
- die("Failed to bind any port for '-%c %s', exiting", optname, optarg);
}
/**
@@ -376,7 +396,7 @@ bind_all_fail:
* @addr: Guest nameserver IPv4 address
* @idx: Index of free entry in array of IPv4 resolvers
*
- * Return: Number of entries added (0 or 1)
+ * Return: number of entries added (0 or 1)
*/
static unsigned add_dns4(struct ctx *c, const struct in_addr *addr,
unsigned idx)
@@ -394,7 +414,7 @@ static unsigned add_dns4(struct ctx *c, const struct in_addr *addr,
* @addr: Guest nameserver IPv6 address
* @idx: Index of free entry in array of IPv6 resolvers
*
- * Return: Number of entries added (0 or 1)
+ * Return: number of entries added (0 or 1)
*/
static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr,
unsigned idx)
@@ -407,6 +427,76 @@ static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr,
}
/**
+ * add_dns_resolv4() - Possibly add one IPv4 nameserver from host's resolv.conf
+ * @c: Execution context
+ * @ns: Nameserver address
+ * @idx: Pointer to index of current IPv4 resolver entry, set on return
+ */
+static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx)
+{
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
+ c->ip4.dns_host = *ns;
+
+ /* Special handling if guest or container can only access local
+ * addresses via redirect, or if the host gateway is also a resolver and
+ * we shadow its address
+ */
+ if (IN4_IS_ADDR_LOOPBACK(ns) ||
+ IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) {
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) {
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
+ return; /* Address unreachable */
+
+ *ns = c->ip4.map_host_loopback;
+ c->ip4.dns_match = c->ip4.map_host_loopback;
+ } else {
+ /* No general host mapping, but requested for DNS
+ * (--dns-forward and --no-map-gw): advertise resolver
+ * address from --dns-forward, and map that to loopback
+ */
+ *ns = c->ip4.dns_match;
+ }
+ }
+
+ *idx += add_dns4(c, ns, *idx);
+}
+
+/**
+ * add_dns_resolv6() - Possibly add one IPv6 nameserver from host's resolv.conf
+ * @c: Execution context
+ * @ns: Nameserver address
+ * @idx: Pointer to index of current IPv6 resolver entry, set on return
+ */
+static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx)
+{
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
+ c->ip6.dns_host = *ns;
+
+ /* Special handling if guest or container can only access local
+ * addresses via redirect, or if the host gateway is also a resolver and
+ * we shadow its address
+ */
+ if (IN6_IS_ADDR_LOOPBACK(ns) ||
+ IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
+ return; /* Address unreachable */
+
+ *ns = c->ip6.map_host_loopback;
+ c->ip6.dns_match = c->ip6.map_host_loopback;
+ } else {
+ /* No general host mapping, but requested for DNS
+ * (--dns-forward and --no-map-gw): advertise resolver
+ * address from --dns-forward, and map that to loopback
+ */
+ *ns = c->ip6.dns_match;
+ }
+ }
+
+ *idx += add_dns6(c, ns, *idx);
+}
+
+/**
* add_dns_resolv() - Possibly add ns from host resolv.conf to configuration
* @c: Execution context
* @nameserver: Nameserver address string from /etc/resolv.conf
@@ -422,48 +512,11 @@ static void add_dns_resolv(struct ctx *c, const char *nameserver,
struct in6_addr ns6;
struct in_addr ns4;
- if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) {
- if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host))
- c->ip4.dns_host = ns4;
-
- /* Special handling if guest or container can only access local
- * addresses via redirect, or if the host gateway is also a
- * resolver and we shadow its address
- */
- if (IN4_IS_ADDR_LOOPBACK(&ns4) ||
- IN4_ARE_ADDR_EQUAL(&ns4, &c->ip4.map_host_loopback)) {
- if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback))
- return;
-
- ns4 = c->ip4.map_host_loopback;
- if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match))
- c->ip4.dns_match = c->ip4.map_host_loopback;
- }
-
- *idx4 += add_dns4(c, &ns4, *idx4);
- }
-
- if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) {
- if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host))
- c->ip6.dns_host = ns6;
-
- /* Special handling if guest or container can only access local
- * addresses via redirect, or if the host gateway is also a
- * resolver and we shadow its address
- */
- if (IN6_IS_ADDR_LOOPBACK(&ns6) ||
- IN6_ARE_ADDR_EQUAL(&ns6, &c->ip6.map_host_loopback)) {
- if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback))
- return;
+ if (idx4 && inet_pton(AF_INET, nameserver, &ns4))
+ add_dns_resolv4(c, &ns4, idx4);
- ns6 = c->ip6.map_host_loopback;
-
- if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match))
- c->ip6.dns_match = c->ip6.map_host_loopback;
- }
-
- *idx6 += add_dns6(c, &ns6, *idx6);
- }
+ if (idx6 && inet_pton(AF_INET6, nameserver, &ns6))
+ add_dns_resolv6(c, &ns6, idx6);
}
/**
@@ -615,7 +668,7 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns,
/** conf_ip4_prefix() - Parse an IPv4 prefix length or netmask
* @arg: Netmask in dotted decimal or prefix length
*
- * Return: Validated prefix length on success, -1 on failure
+ * Return: validated prefix length on success, -1 on failure
*/
static int conf_ip4_prefix(const char *arg)
{
@@ -642,7 +695,7 @@ static int conf_ip4_prefix(const char *arg)
* @ifi: Host interface to attempt (0 to determine one)
* @ip4: IPv4 context (will be written)
*
- * Return: Interface index for IPv4, or 0 on failure.
+ * Return: interface index for IPv4, or 0 on failure.
*/
static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
{
@@ -714,7 +767,7 @@ static void conf_ip4_local(struct ip4_ctx *ip4)
* @ifi: Host interface to attempt (0 to determine one)
* @ip6: IPv6 context (will be written)
*
- * Return: Interface index for IPv6, or 0 on failure.
+ * Return: interface index for IPv6, or 0 on failure.
*/
static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
{
@@ -773,7 +826,7 @@ static void conf_ip6_local(struct ip6_ctx *ip6)
* usage() - Print usage, exit with given status code
* @name: Executable name
* @f: Stream to print usage info to
- * @status: Status code for _exit()
+ * @status: Status code for exit(2)
*/
static void usage(const char *name, FILE *f, int status)
{
@@ -794,6 +847,8 @@ static void usage(const char *name, FILE *f, int status)
"\n"
" -d, --debug Be verbose\n"
" --trace Be extra verbose, implies --debug\n"
+ " --stats DELAY Display events statistics\n"
+ " minimum DELAY seconds between updates\n"
" -q, --quiet Don't print informational messages\n"
" -f, --foreground Don't run in background\n"
" default: run in background\n"
@@ -823,6 +878,14 @@ static void usage(const char *name, FILE *f, int status)
FPRINTF(f,
" --repair-path PATH path for passt-repair(1)\n"
" default: append '.repair' to UNIX domain path\n");
+ FPRINTF(f,
+ " --migrate-exit DEPRECATED:\n"
+ " source quits after migration\n"
+ " default: source keeps running after migration\n");
+ FPRINTF(f,
+ " --migrate-no-linger DEPRECATED:\n"
+ " close sockets on migration\n"
+ " default: keep sockets open, ignore events\n");
}
FPRINTF(f,
@@ -934,7 +997,7 @@ static void usage(const char *name, FILE *f, int status)
" SPEC is as described for TCP above\n"
" default: none\n");
- _exit(status);
+ passt_exit(status);
pasta_opts:
@@ -988,7 +1051,46 @@ pasta_opts:
" --ns-mac-addr ADDR Set MAC address on tap interface\n"
" --no-splice Disable inbound socket splicing\n");
- _exit(status);
+ passt_exit(status);
+}
+
+/**
+ * conf_mode() - Determine passt/pasta's operating mode from command line
+ * @argc: Argument count
+ * @argv: Command line arguments
+ *
+ * Return: mode to operate in, PASTA or PASST
+ */
+enum passt_modes conf_mode(int argc, char *argv[])
+{
+ int vhost_user = 0;
+ const struct option optvu[] = {
+ {"vhost-user", no_argument, &vhost_user, 1 },
+ { 0 },
+ };
+ char argv0[PATH_MAX], *basearg0;
+ int name;
+
+ optind = 0;
+ do {
+ name = getopt_long(argc, argv, "-:", optvu, NULL);
+ } while (name != -1);
+
+ if (vhost_user)
+ return MODE_VU;
+
+ if (argc < 1)
+ die("Cannot determine argv[0]");
+
+ strncpy(argv0, argv[0], PATH_MAX - 1);
+ basearg0 = basename(argv0);
+ if (strstr(basearg0, "pasta"))
+ return MODE_PASTA;
+
+ if (strstr(basearg0, "passt"))
+ return MODE_PASST;
+
+ die("Cannot determine mode, invoke as \"passt\" or \"pasta\"");
}
/**
@@ -1005,7 +1107,7 @@ static void conf_print(const struct ctx *c)
info("Template interface: %s%s%s%s%s",
c->ifi4 > 0 ? if_indextoname(c->ifi4, ifn) : "",
c->ifi4 > 0 ? " (IPv4)" : "",
- (c->ifi4 && c->ifi6) ? ", " : "",
+ (c->ifi4 > 0 && c->ifi6 > 0) ? ", " : "",
c->ifi6 > 0 ? if_indextoname(c->ifi6, ifn) : "",
c->ifi6 > 0 ? " (IPv6)" : "");
}
@@ -1225,6 +1327,8 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
*addr6 = in6addr_any;
if (no_map_gw)
*no_map_gw = 1;
+
+ return;
}
if (inet_pton(AF_INET6, arg, addr6) &&
@@ -1276,7 +1380,7 @@ static void conf_open_files(struct ctx *c)
}
/**
- * parse_mac - Parse a MAC address from a string
+ * parse_mac() - Parse a MAC address from a string
* @mac: Binary MAC address, initialised on success
* @str: String to parse
*
@@ -1386,18 +1490,22 @@ void conf(struct ctx *c, int argc, char **argv)
{"socket-path", required_argument, NULL, 's' },
{"fqdn", required_argument, NULL, 27 },
{"repair-path", required_argument, NULL, 28 },
+ {"migrate-exit", no_argument, NULL, 29 },
+ {"migrate-no-linger", no_argument, NULL, 30 },
+ {"stats", required_argument, NULL, 31 },
{ 0 },
};
+ const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 };
bool copy_addrs_opt = false, copy_routes_opt = false;
enum fwd_ports_mode fwd_default = FWD_NONE;
bool v4_only = false, v6_only = false;
unsigned dns4_idx = 0, dns6_idx = 0;
+ unsigned long max_mtu = IP_MAX_MTU;
struct fqdn *dnss = c->dns_search;
unsigned int ifi4 = 0, ifi6 = 0;
const char *logfile = NULL;
- const char *optstring;
size_t logsize = 0;
char *runas = NULL;
long fd_tap_opt;
@@ -1408,12 +1516,11 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->mode == MODE_PASTA) {
c->no_dhcp_dns = c->no_dhcp_dns_search = 1;
fwd_default = FWD_AUTO;
- optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:";
- } else {
- optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:";
}
- c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t));
+ if (tap_l2_max_len(c) - ETH_HLEN < max_mtu)
+ max_mtu = tap_l2_max_len(c) - ETH_HLEN;
+ c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t));
c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET;
c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET;
memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN);
@@ -1512,7 +1619,7 @@ void conf(struct ctx *c, int argc, char **argv)
FPRINTF(stdout,
c->mode == MODE_PASTA ? "pasta " : "passt ");
FPRINTF(stdout, VERSION_BLOB);
- _exit(EXIT_SUCCESS);
+ passt_exit(EXIT_SUCCESS);
case 15:
ret = snprintf(c->ip4.ifname_out,
sizeof(c->ip4.ifname_out), "%s", optarg);
@@ -1581,9 +1688,8 @@ void conf(struct ctx *c, int argc, char **argv)
die("Invalid host nameserver address: %s", optarg);
case 25:
- if (c->mode == MODE_PASTA)
- die("--vhost-user is for passt mode only");
- c->mode = MODE_VU;
+ /* Already handled in conf_mode() */
+ ASSERT(c->mode == MODE_VU);
break;
case 26:
vu_print_capabilities();
@@ -1594,7 +1700,31 @@ void conf(struct ctx *c, int argc, char **argv)
die("Invalid FQDN: %s", optarg);
break;
case 28:
- /* Handle this once we checked --vhost-user */
+ if (c->mode != MODE_VU && strcmp(optarg, "none"))
+ die("--repair-path is for vhost-user mode only");
+
+ if (snprintf_check(c->repair_path,
+ sizeof(c->repair_path), "%s",
+ optarg))
+ die("Invalid passt-repair path: %s", optarg);
+
+ break;
+ case 29:
+ if (c->mode != MODE_VU)
+ die("--migrate-exit is for vhost-user mode only");
+ c->migrate_exit = true;
+
+ break;
+ case 30:
+ if (c->mode != MODE_VU)
+ die("--migrate-no-linger is for vhost-user mode only");
+ c->migrate_no_linger = true;
+
+ break;
+ case 31:
+ if (!c->foreground)
+ die("Can't display statistics if not running in foreground");
+ c->stats = strtol(optarg, NULL, 0);
break;
case 'd':
c->debug = 1;
@@ -1614,6 +1744,9 @@ void conf(struct ctx *c, int argc, char **argv)
c->foreground = 1;
break;
case 's':
+ if (c->mode == MODE_PASTA)
+ die("-s is for passt / vhost-user mode only");
+
ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s",
optarg);
if (ret <= 0 || ret >= (int)sizeof(c->sock_path))
@@ -1626,7 +1759,8 @@ void conf(struct ctx *c, int argc, char **argv)
fd_tap_opt = strtol(optarg, NULL, 0);
if (errno ||
- fd_tap_opt <= STDERR_FILENO || fd_tap_opt > INT_MAX)
+ (fd_tap_opt != STDIN_FILENO && fd_tap_opt <= STDERR_FILENO) ||
+ fd_tap_opt > INT_MAX)
die("Invalid --fd: %s", optarg);
c->fd_tap = fd_tap_opt;
@@ -1634,6 +1768,9 @@ void conf(struct ctx *c, int argc, char **argv)
*c->sock_path = 0;
break;
case 'I':
+ if (c->mode != MODE_PASTA)
+ die("-I is for pasta mode only");
+
ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s",
optarg);
if (ret <= 0 || ret >= IFNAMSIZ)
@@ -1663,9 +1800,9 @@ void conf(struct ctx *c, int argc, char **argv)
if (errno || *e)
die("Invalid MTU: %s", optarg);
- if (mtu > ETH_MAX_MTU) {
- die("MTU %lu too large (max %u)",
- mtu, ETH_MAX_MTU);
+ if (mtu > max_mtu) {
+ die("MTU %lu too large (max %lu)",
+ mtu, max_mtu);
}
c->mtu = mtu;
@@ -1790,11 +1927,16 @@ void conf(struct ctx *c, int argc, char **argv)
break;
case 't':
case 'u':
- case 'T':
- case 'U':
case 'D':
/* Handle these later, once addresses are configured */
break;
+ case 'T':
+ case 'U':
+ if (c->mode != MODE_PASTA)
+ die("-%c is for pasta mode only", name);
+
+ /* Handle properly later, once addresses are configured */
+ break;
case 'h':
usage(argv[0], stdout, EXIT_SUCCESS);
break;
@@ -1856,20 +1998,23 @@ void conf(struct ctx *c, int argc, char **argv)
(*c->ip6.ifname_out && !c->ifi6))
die("External interface not usable");
+ if (!c->ifi4 && !c->ifi6 && !*c->pasta_ifn) {
+ strncpy(c->pasta_ifn, pasta_default_ifn,
+ sizeof(c->pasta_ifn) - 1);
+ }
- if (!c->ifi4 && !c->ifi6) {
- info("No external interface as template, switch to local mode");
+ if (!c->ifi4 && !v6_only) {
+ info("IPv4: no external interface as template, use local mode");
conf_ip4_local(&c->ip4);
c->ifi4 = -1;
+ }
+
+ if (!c->ifi6 && !v4_only) {
+ info("IPv6: no external interface as template, use local mode");
conf_ip6_local(&c->ip6);
c->ifi6 = -1;
-
- if (!*c->pasta_ifn) {
- strncpy(c->pasta_ifn, pasta_default_ifn,
- sizeof(c->pasta_ifn) - 1);
- }
}
if (c->ifi4 && !no_map_gw &&
@@ -1883,8 +2028,8 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
c->no_dhcp = 1;
- /* Inbound port options, DNS, and --repair-path can be parsed now, after
- * IPv4/IPv6 settings and --vhost-user.
+ /* Inbound port options and DNS can be parsed now, after IPv4/IPv6
+ * settings
*/
fwd_probe_ephemeral();
udp_portmap_clear();
@@ -1930,16 +2075,6 @@ void conf(struct ctx *c, int argc, char **argv)
}
die("Cannot use DNS address %s", optarg);
- } else if (name == 28) {
- if (c->mode != MODE_VU && strcmp(optarg, "none"))
- die("--repair-path is for vhost-user mode only");
-
- if (snprintf_check(c->repair_path,
- sizeof(c->repair_path), "%s",
- optarg))
- die("Invalid passt-repair path: %s", optarg);
-
- break;
}
} while (name != -1);
@@ -1952,6 +2087,9 @@ void conf(struct ctx *c, int argc, char **argv)
isolate_user(uid, gid, !netns_only, userns, c->mode);
+ if (c->no_icmp)
+ c->no_ndp = 1;
+
if (c->pasta_conf_ns)
c->no_ra = 1;
@@ -2006,8 +2144,6 @@ void conf(struct ctx *c, int argc, char **argv)
if (!c->udp.fwd_out.mode)
c->udp.fwd_out.mode = fwd_default;
- fwd_scan_ports_init(c);
-
if (!c->quiet)
conf_print(c);
}
diff --git a/conf.h b/conf.h
index 9d2143d..b45ad74 100644
--- a/conf.h
+++ b/conf.h
@@ -6,6 +6,7 @@
#ifndef CONF_H
#define CONF_H
+enum passt_modes conf_mode(int argc, char *argv[]);
void conf(struct ctx *c, int argc, char **argv);
#endif /* CONF_H */
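
conf.h now also declares conf_mode(), which pre-scans the command line for --vhost-user and otherwise derives the mode from the invocation name (pasta vs. passt). Presumably it runs before conf(), so that mode-dependent handling such as the ASSERT(c->mode == MODE_VU) for --vhost-user in the conf.c hunk above already sees the final mode; a hypothetical call site, assuming a struct ctx c as used elsewhere in the code:

    /* Hypothetical call order, based only on the declarations above */
    c.mode = conf_mode(argc, argv); /* MODE_PASST, MODE_PASTA or MODE_VU */
    conf(&c, argc, argv);           /* option parsing can rely on c->mode */
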
diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec
index 745cf01..38b06b0 100644
--- a/contrib/fedora/passt.spec
+++ b/contrib/fedora/passt.spec
@@ -9,6 +9,7 @@
%global git_hash {{{ git_head }}}
%global selinuxtype targeted
+%global selinux_policy_version 41.41
Name: passt
Version: {{{ git_version }}}
@@ -33,15 +34,21 @@ for network namespaces: traffic is forwarded using a tap interface inside the
namespace, without the need to create further interfaces on the host, hence not
requiring any capabilities or privileges.
-%package selinux
-BuildArch: noarch
-Summary: SELinux support for passt and pasta
-Requires: %{name} = %{version}-%{release}
-Requires: selinux-policy
-Requires(post): %{name}
-Requires(post): policycoreutils
-Requires(preun): %{name}
-Requires(preun): policycoreutils
+%package selinux
+BuildArch: noarch
+Summary: SELinux support for passt and pasta
+%if 0%{?fedora} > 43
+BuildRequires: selinux-policy-devel
+%selinux_requires_min
+%else
+BuildRequires: pkgconfig(systemd)
+Requires(post): libselinux-utils
+Requires(post): policycoreutils
+%endif
+Requires: container-selinux
+Requires: selinux-policy-%{selinuxtype}
+Requires(post): container-selinux
+Requires(post): selinux-policy-%{selinuxtype}
%description selinux
This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
@@ -89,15 +96,11 @@ popd
%selinux_relabel_pre -s %{selinuxtype}
%post selinux
-%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
-%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
-%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
+%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
%postun selinux
if [ $1 -eq 0 ]; then
- %selinux_modules_uninstall -s %{selinuxtype} passt
- %selinux_modules_uninstall -s %{selinuxtype} pasta
- %selinux_modules_uninstall -s %{selinuxtype} passt-repair
+ %selinux_modules_uninstall -s %{selinuxtype} passt pasta passt-repair
fi
%posttrans selinux
diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te
index f171be6..7157dfb 100644
--- a/contrib/selinux/passt-repair.te
+++ b/contrib/selinux/passt-repair.te
@@ -61,11 +61,11 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write };
allow passt_repair_t passt_t:unix_stream_socket { connectto read write };
allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write };
-allow passt_repair_t user_tmp_t:dir search;
+allow passt_repair_t user_tmp_t:dir { getattr read search watch };
-allow passt_repair_t unconfined_t:sock_file { read write };
-allow passt_repair_t passt_t:sock_file { read write };
-allow passt_repair_t user_tmp_t:sock_file { read write };
+allow passt_repair_t unconfined_t:sock_file { getattr read write };
+allow passt_repair_t passt_t:sock_file { getattr read write };
+allow passt_repair_t user_tmp_t:sock_file { getattr read write };
allow passt_repair_t unconfined_t:tcp_socket { read setopt write };
allow passt_repair_t passt_t:tcp_socket { read setopt write };
@@ -80,8 +80,8 @@ allow passt_repair_t passt_t:tcp_socket { read setopt write };
allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write };
allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write };
-allow passt_repair_t qemu_var_run_t:dir search;
-allow passt_repair_t virt_var_run_t:dir search;
+allow passt_repair_t qemu_var_run_t:dir { getattr read search watch };
+allow passt_repair_t virt_var_run_t:dir { getattr read search watch };
-allow passt_repair_t qemu_var_run_t:sock_file { read write };
-allow passt_repair_t virt_var_run_t:sock_file { read write };
+allow passt_repair_t qemu_var_run_t:sock_file { getattr read write };
+allow passt_repair_t virt_var_run_t:sock_file { getattr read write };
diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index f8ea672..6995df8 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -49,7 +49,7 @@ require {
type proc_net_t;
type node_t;
class tcp_socket { create accept listen name_bind name_connect getattr ioctl };
- class udp_socket { create accept listen };
+ class udp_socket { create accept listen getattr };
class icmp_socket { bind create name_bind node_bind setopt read write };
class sock_file { create unlink write };
@@ -110,8 +110,6 @@ allow passt_t self:user_namespace create;
auth_read_passwd(passt_t)
allow passt_t proc_net_t:file read;
-allow passt_t net_conf_t:file { open read };
-allow passt_t net_conf_t:lnk_file read;
allow passt_t tmp_t:sock_file { create unlink write };
allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt };
kernel_search_network_sysctl(passt_t)
@@ -129,11 +127,13 @@ corenet_tcp_connect_all_ports(passt_t)
corenet_tcp_sendrecv_all_ports(passt_t)
corenet_udp_sendrecv_all_ports(passt_t)
+sysnet_read_config(passt_t)
+
allow passt_t node_t:icmp_socket { name_bind node_bind };
allow passt_t port_t:icmp_socket name_bind;
allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl };
-allow passt_t self:udp_socket { create getopt setopt connect bind read write };
+allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr };
allow passt_t self:icmp_socket { bind create setopt read write };
allow passt_t user_tmp_t:dir { add_name write };
diff --git a/contrib/selinux/pasta.fc b/contrib/selinux/pasta.fc
index 41ee46d..82dbcbe 100644
--- a/contrib/selinux/pasta.fc
+++ b/contrib/selinux/pasta.fc
@@ -8,7 +8,15 @@
# Copyright (c) 2022 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
-/usr/bin/pasta system_u:object_r:pasta_exec_t:s0
-/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0
-/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0
-/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0
+/usr/bin/pasta system_u:object_r:pasta_exec_t:s0
+/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0
+/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0
+/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0
+/run/user/[0-9]+/netns system_u:object_r:ifconfig_var_run_t:s0
+/run/user/[0-9]+/containers/networks/rootless-netns system_u:object_r:ifconfig_var_run_t:s0
+# In case XDG_RUNTIME_DIR is not set (i.e. no systemd user session) Podman falls
+# back to a location under /tmp
+/tmp/storage-run-[0-9]+/netns system_u:object_r:ifconfig_var_run_t:s0
+/tmp/storage-run-[0-9]+/containers/networks/rootless-netns system_u:object_r:ifconfig_var_run_t:s0
+/tmp/containers-user-[0-9]+/netns system_u:object_r:ifconfig_var_run_t:s0
+/tmp/containers-user-[0-9]+/containers/networks/rootless-netns system_u:object_r:ifconfig_var_run_t:s0
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 89c8043..95fe42a 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -89,6 +89,16 @@ require {
class capability { sys_tty_config setuid setgid };
class cap_userns { setpcap sys_admin sys_ptrace net_bind_service net_admin };
class user_namespace create;
+
+ # Container requires
+ attribute_role usernetctl_roles;
+ role container_user_r;
+ role staff_r;
+ role user_r;
+ type container_runtime_t;
+ type container_var_run_t;
+ type container_t;
+ type systemd_user_runtimedir_t;
}
type pasta_t;
@@ -113,6 +123,9 @@ init_daemon_domain(pasta_t, pasta_exec_t)
allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read_search net_admin sys_resource setuid setgid };
allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service };
+# pasta only calls setuid and setgid with the current UID and GID, so this
+# denial is harmless. See https://bugzilla.redhat.com/show_bug.cgi?id=2330512#c10
+dontaudit pasta_t self:cap_userns { setgid setuid };
allow pasta_t self:user_namespace create;
auth_read_passwd(pasta_t)
@@ -130,7 +143,7 @@ allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_tr
allow pasta_t user_home_dir_t:dir { search getattr open add_name read write };
allow pasta_t user_home_dir_t:file { create open read write };
allow pasta_t tmp_t:dir { add_name mounton remove_name write };
-allow pasta_t tmpfs_t:filesystem mount;
+allow pasta_t tmpfs_t:filesystem { getattr mount };
allow pasta_t fs_t:filesystem unmount;
allow pasta_t root_t:dir mounton;
manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
@@ -147,15 +160,21 @@ logging_send_syslog_msg(pasta_t)
allow syslogd_t self:cap_userns sys_ptrace;
allow pasta_t proc_net_t:file { open read };
-allow pasta_t net_conf_t:file { open read };
allow pasta_t self:netlink_route_socket { bind create nlmsg_read nlmsg_write setopt read write };
kernel_search_network_sysctl(pasta_t)
+sysnet_read_config(pasta_t)
+
allow pasta_t tmp_t:sock_file { create unlink write };
allow pasta_t self:tcp_socket create_stream_socket_perms;
corenet_tcp_sendrecv_generic_node(pasta_t)
corenet_tcp_bind_generic_node(pasta_t)
+allow pasta_t container_runtime_t:dir { open read search };
+allow pasta_t container_runtime_t:fifo_file { getattr write };
+allow pasta_t container_runtime_t:file read;
+allow pasta_t container_runtime_t:lnk_file read;
+allow pasta_t container_t:lnk_file read;
allow pasta_t pasta_port_t:tcp_socket { name_bind name_connect };
allow pasta_t pasta_port_t:udp_socket { name_bind };
allow pasta_t http_port_t:tcp_socket { name_bind name_connect };
@@ -204,7 +223,6 @@ allow pasta_t kernel_t:system module_request;
allow pasta_t proc_t:dir mounton;
allow pasta_t proc_t:filesystem mount;
-allow pasta_t net_conf_t:lnk_file read;
allow pasta_t proc_net_t:lnk_file read;
allow pasta_t unconfined_t:process { noatsecure rlimitinh siginh };
@@ -213,3 +231,32 @@ allow pasta_t netutils_t:process { noatsecure rlimitinh siginh };
allow pasta_t ping_t:process { noatsecure rlimitinh siginh };
allow pasta_t user_tty_device_t:chr_file { append read write };
allow pasta_t user_devpts_t:chr_file { append read write };
+
+# Allow network administration commands for non-privileged users
+roleattribute container_user_r usernetctl_roles;
+roleattribute staff_r usernetctl_roles;
+roleattribute user_r usernetctl_roles;
+role usernetctl_roles types pasta_t;
+
+# Make pasta in a container run under the pasta_t context
+type_transition container_runtime_t pasta_exec_t : process pasta_t;
+allow container_runtime_t pasta_t:process transition;
+
+# Label the user network namespace files
+# Note: Podman files used to be user_tmp_t but are now container_var_run_t since
+# https://github.com/containers/container-selinux/pull/405
+type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns";
+type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "netns";
+type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns";
+type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "rootless-netns";
+allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write };
+allow pasta_t ifconfig_var_run_t:file { create open write };
+allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir;
+
+# Allow pasta to bind to any port
+bool pasta_bind_all_ports true;
+if (pasta_bind_all_ports) {
+ allow pasta_t port_type:icmp_socket { accept getopt name_bind };
+ allow pasta_t port_type:tcp_socket { accept getopt name_bind name_connect };
+ allow pasta_t port_type:udp_socket { accept getopt name_bind };
+}
diff --git a/dhcp.c b/dhcp.c
index b0de04b..6b9c2e3 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -296,33 +296,35 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
/**
* dhcp() - Check if this is a DHCP message, reply as needed
* @c: Execution context
- * @p: Packet pool, single packet with Ethernet buffer
+ * @data: Single packet with Ethernet buffer
*
* Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure
*/
-int dhcp(const struct ctx *c, const struct pool *p)
+int dhcp(const struct ctx *c, struct iov_tail *data)
{
- size_t mlen, dlen, offset = 0, opt_len, opt_off = 0;
char macstr[ETH_ADDRSTRLEN];
+ size_t mlen, dlen, opt_len;
struct in_addr mask, dst;
+ struct ethhdr eh_storage;
+ struct iphdr iph_storage;
+ struct udphdr uh_storage;
const struct ethhdr *eh;
const struct iphdr *iph;
const struct udphdr *uh;
+ struct msg m_storage;
struct msg const *m;
struct msg reply;
unsigned int i;
- eh = packet_get(p, 0, offset, sizeof(*eh), NULL);
- offset += sizeof(*eh);
-
- iph = packet_get(p, 0, offset, sizeof(*iph), NULL);
+ eh = IOV_REMOVE_HEADER(data, eh_storage);
+ iph = IOV_PEEK_HEADER(data, iph_storage);
if (!eh || !iph)
return -1;
- offset += iph->ihl * 4UL;
- uh = packet_get(p, 0, offset, sizeof(*uh), &mlen);
- offset += sizeof(*uh);
+ if (!iov_drop_header(data, iph->ihl * 4UL))
+ return -1;
+ uh = IOV_REMOVE_HEADER(data, uh_storage);
if (!uh)
return -1;
@@ -332,7 +334,10 @@ int dhcp(const struct ctx *c, const struct pool *p)
if (c->no_dhcp)
return 1;
- m = packet_get(p, 0, offset, offsetof(struct msg, o), &opt_len);
+ mlen = iov_tail_size(data);
+ m = (struct msg const *)iov_remove_header_(data, &m_storage,
+ offsetof(struct msg, o),
+ __alignof__(struct msg));
if (!m ||
mlen != ntohs(uh->len) - sizeof(*uh) ||
mlen < offsetof(struct msg, o) ||
@@ -355,27 +360,28 @@ int dhcp(const struct ctx *c, const struct pool *p)
memset(&reply.file, 0, sizeof(reply.file));
reply.magic = m->magic;
- offset += offsetof(struct msg, o);
-
for (i = 0; i < ARRAY_SIZE(opts); i++)
opts[i].clen = -1;
- while (opt_off + 2 < opt_len) {
- const uint8_t *olen, *val;
+ opt_len = iov_tail_size(data);
+ while (opt_len >= 2) {
+ uint8_t olen_storage, type_storage;
+ const uint8_t *olen;
uint8_t *type;
- type = packet_get(p, 0, offset + opt_off, 1, NULL);
- olen = packet_get(p, 0, offset + opt_off + 1, 1, NULL);
+ type = IOV_REMOVE_HEADER(data, type_storage);
+ olen = IOV_REMOVE_HEADER(data, olen_storage);
if (!type || !olen)
return -1;
- val = packet_get(p, 0, offset + opt_off + 2, *olen, NULL);
- if (!val)
+ opt_len = iov_tail_size(data);
+ if (opt_len < *olen)
return -1;
- memcpy(&opts[*type].c, val, *olen);
+ iov_to_buf(&data->iov[0], data->cnt, data->off, &opts[*type].c, *olen);
opts[*type].clen = *olen;
- opt_off += *olen + 2;
+ iov_drop_header(data, *olen);
+ opt_len -= *olen;
}
opts[80].slen = -1;
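The dhcp() rework above is representative of how this series replaces offset-based packet_get() calls with iov_tail accessors: IOV_REMOVE_HEADER() consumes a header, IOV_PEEK_HEADER() reads one without advancing, iov_drop_header() skips variable-length data, and iov_tail_size() reports what is left. A minimal sketch of the same pattern, assuming the iov.h helpers from this series and the usual <net/ethernet.h>, <netinet/ip.h> and <netinet/udp.h> definitions; the function itself is hypothetical, not part of the patch:

static ssize_t udp4_payload_len(struct iov_tail *data, in_port_t *dport)
{
	struct ethhdr eh_storage;
	struct iphdr iph_storage;
	struct udphdr uh_storage;
	const struct ethhdr *eh;
	const struct iphdr *iph;
	const struct udphdr *uh;

	eh = IOV_REMOVE_HEADER(data, eh_storage);	/* consume L2 header */
	iph = IOV_PEEK_HEADER(data, iph_storage);	/* inspect, don't consume */
	if (!eh || !iph)
		return -1;

	if (!iov_drop_header(data, iph->ihl * 4UL))	/* skip IPv4 header + options */
		return -1;

	uh = IOV_REMOVE_HEADER(data, uh_storage);	/* consume UDP header */
	if (!uh)
		return -1;

	*dport = ntohs(uh->dest);
	return iov_tail_size(data);		/* what's left is the payload */
}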
diff --git a/dhcp.h b/dhcp.h
index 87aeecd..cd50c99 100644
--- a/dhcp.h
+++ b/dhcp.h
@@ -6,7 +6,7 @@
#ifndef DHCP_H
#define DHCP_H
-int dhcp(const struct ctx *c, const struct pool *p);
+int dhcp(const struct ctx *c, struct iov_tail *data);
void dhcp_init(void);
#endif /* DHCP_H */
diff --git a/dhcpv6.c b/dhcpv6.c
index 373a988..e4df0db 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -54,14 +54,14 @@ struct opt_hdr {
uint16_t l;
} __attribute__((packed));
+#define UDP_MSG_HDR_SIZE (sizeof(struct udphdr) + sizeof(struct msg_hdr))
# define OPT_SIZE_CONV(x) (htons_constant(x))
#define OPT_SIZE(x) OPT_SIZE_CONV(sizeof(struct opt_##x) - \
sizeof(struct opt_hdr))
#define OPT_VSIZE(x) (sizeof(struct opt_##x) - \
sizeof(struct opt_hdr))
#define OPT_MAX_SIZE IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \
- sizeof(struct udphdr) + \
- sizeof(struct msg_hdr))
+ UDP_MSG_HDR_SIZE)
/**
* struct opt_client_id - DHCPv6 Client Identifier option
@@ -144,7 +144,9 @@ struct opt_ia_addr {
struct opt_status_code {
struct opt_hdr hdr;
uint16_t code;
- char status_msg[sizeof(STR_NOTONLINK) - 1];
+ /* "nonstring" is only supported since clang 23 */
+ /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
+ __attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1];
} __attribute__((packed));
/**
@@ -278,82 +280,132 @@ static struct resp_not_on_link_t {
/**
* dhcpv6_opt() - Get option from DHCPv6 message
- * @p: Packet pool, single packet with UDP header
- * @offset: Offset to look at, 0: end of header, set to option start
+ * @data: Buffer with options, set to matching option on return
* @type: Option type to look up, network order
*
- * Return: pointer to option header, or NULL on malformed or missing option
+ * Return: true if found, with @data pointing to the option header;
+ * false on a malformed or missing option, with @data left unmodified.
*/
-static struct opt_hdr *dhcpv6_opt(const struct pool *p, size_t *offset,
- uint16_t type)
+static bool dhcpv6_opt(struct iov_tail *data, uint16_t type)
{
- struct opt_hdr *o;
- size_t left;
+ struct iov_tail head = *data;
+ struct opt_hdr o_storage;
+ const struct opt_hdr *o;
- if (!*offset)
- *offset = sizeof(struct udphdr) + sizeof(struct msg_hdr);
-
- while ((o = packet_get_try(p, 0, *offset, sizeof(*o), &left))) {
+ while ((o = IOV_PEEK_HEADER(data, o_storage))) {
unsigned int opt_len = ntohs(o->l) + sizeof(*o);
- if (ntohs(o->l) > left)
- return NULL;
+ if (opt_len > iov_tail_size(data))
+ break;
if (o->t == type)
- return o;
+ return true;
- *offset += opt_len;
+ iov_drop_header(data, opt_len);
}
- return NULL;
+ *data = head;
+ return false;
}
/**
* dhcpv6_ia_notonlink() - Check if any IA contains non-appropriate addresses
- * @p: Packet pool, single packet starting from UDP header
+ * @data: Data to look at, packet starting from UDP header (input/output)
* @la: Address we want to lease to the client
*
- * Return: pointer to non-appropriate IA_NA or IA_TA, if any, NULL otherwise
+ * Return: true, with @data pointing to the non-appropriate IA_NA or IA_TA,
+ * if one is found; false otherwise, with @data left unmodified
*/
-static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p,
- struct in6_addr *la)
+static bool dhcpv6_ia_notonlink(struct iov_tail *data,
+ struct in6_addr *la)
{
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type;
+ struct opt_ia_addr opt_addr_storage;
const struct opt_ia_addr *opt_addr;
+ struct iov_tail current, ia_base;
+ struct opt_ia_na ia_storage;
char buf[INET6_ADDRSTRLEN];
+ const struct opt_ia_na *ia;
struct in6_addr req_addr;
+ struct opt_hdr h_storage;
const struct opt_hdr *h;
- struct opt_hdr *ia;
- size_t offset;
foreach(ia_type, ia_types) {
- offset = 0;
- while ((ia = dhcpv6_opt(p, &offset, *ia_type))) {
- if (ntohs(ia->l) < OPT_VSIZE(ia_na))
- return NULL;
-
- offset += sizeof(struct opt_ia_na);
+ current = *data;
+ while (dhcpv6_opt(&current, *ia_type)) {
+ ia_base = current;
+ ia = IOV_REMOVE_HEADER(&current, ia_storage);
+ if (!ia || ntohs(ia->hdr.l) < OPT_VSIZE(ia_na))
+ goto notfound;
+
+ while (dhcpv6_opt(&current, OPT_IAAADR)) {
+ h = IOV_PEEK_HEADER(&current, h_storage);
+ if (!h || ntohs(h->l) != OPT_VSIZE(ia_addr))
+ goto notfound;
+
+ opt_addr = IOV_REMOVE_HEADER(&current,
+ opt_addr_storage);
+ if (!opt_addr)
+ goto notfound;
- while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
- if (ntohs(h->l) != OPT_VSIZE(ia_addr))
- return NULL;
-
- opt_addr = (const struct opt_ia_addr *)h;
req_addr = opt_addr->addr;
if (!IN6_ARE_ADDR_EQUAL(la, &req_addr))
- goto err;
-
- offset += sizeof(struct opt_ia_addr);
+ goto notonlink;
}
}
}
- return NULL;
+notfound:
+ return false;
-err:
+notonlink:
info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf)));
- return ia;
+ *data = ia_base;
+ return true;
+}
+
+/**
+ * dhcpv6_send_ia_notonlink() - Send NotOnLink status
+ * @c: Execution context
+ * @ia_base: Non-appropriate IA_NA or IA_TA base
+ * @client_id_base: Client ID message option base
+ * @len: Client ID length
+ * @xid: Transaction ID for message exchange
+ */
+static void dhcpv6_send_ia_notonlink(struct ctx *c,
+ const struct iov_tail *ia_base,
+ const struct iov_tail *client_id_base,
+ int len, uint32_t xid)
+{
+ const struct in6_addr *src = &c->ip6.our_tap_ll;
+ struct opt_hdr *ia = (struct opt_hdr *)resp_not_on_link.var;
+ size_t n;
+
+ info("DHCPv6: received CONFIRM with inappropriate IA,"
+ " sending NotOnLink status in REPLY");
+
+ n = sizeof(struct opt_ia_na);
+ iov_to_buf(&ia_base->iov[0], ia_base->cnt, ia_base->off,
+ resp_not_on_link.var, n);
+ ia->l = htons(OPT_VSIZE(ia_na) + sizeof(sc_not_on_link));
+ memcpy(resp_not_on_link.var + n, &sc_not_on_link,
+ sizeof(sc_not_on_link));
+
+ n += sizeof(sc_not_on_link);
+ iov_to_buf(&client_id_base->iov[0], client_id_base->cnt,
+ client_id_base->off, resp_not_on_link.var + n,
+ sizeof(struct opt_hdr) + len);
+
+ n += sizeof(struct opt_hdr) + len;
+
+ n = offsetof(struct resp_not_on_link_t, var) + n;
+
+ resp_not_on_link.hdr.xid = xid;
+
+ tap_udp6_send(c, src, 547, tap_ip6_daddr(c, src), 546,
+ xid, &resp_not_on_link, n);
}
/**
@@ -435,17 +487,19 @@ search:
/**
* dhcpv6_client_fqdn_fill() - Fill in client FQDN option
+ * @data: Data to look at
* @c: Execution context
* @buf: Response message buffer where options will be appended
* @offset: Offset in message buffer for new options
*
* Return: updated length of response message buffer.
*/
-static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
+static size_t dhcpv6_client_fqdn_fill(const struct iov_tail *data,
+ const struct ctx *c,
char *buf, int offset)
{
- struct opt_client_fqdn const *req_opt;
+ struct iov_tail current = *data;
struct opt_client_fqdn *o;
size_t opt_len;
@@ -463,13 +517,16 @@ static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
}
o = (struct opt_client_fqdn *)(buf + offset);
+ o->flags = 0x00;
encode_domain_name(o->domain_name, c->fqdn);
- req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 },
- OPT_CLIENT_FQDN);
- if (req_opt && req_opt->flags & 0x01 /* S flag */)
- o->flags = 0x02 /* O flag */;
- else
- o->flags = 0x00;
+ if (dhcpv6_opt(&current, OPT_CLIENT_FQDN)) {
+ struct opt_client_fqdn req_opt_storage;
+ struct opt_client_fqdn const *req_opt;
+
+ req_opt = IOV_PEEK_HEADER(&current, req_opt_storage);
+ if (req_opt && req_opt->flags & 0x01 /* S flag */)
+ o->flags = 0x02 /* O flag */;
+ }
opt_len++;
@@ -482,23 +539,38 @@ static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c,
/**
* dhcpv6() - Check if this is a DHCPv6 message, reply as needed
* @c: Execution context
- * @p: Packet pool, single packet starting from UDP header
+ * @data: Single packet starting from UDP header
* @saddr: Source IPv6 address of original message
* @daddr: Destination IPv6 address of original message
*
* Return: 0 if it's not a DHCPv6 message, 1 if handled, -1 on failure
*/
-int dhcpv6(struct ctx *c, const struct pool *p,
+int dhcpv6(struct ctx *c, struct iov_tail *data,
const struct in6_addr *saddr, const struct in6_addr *daddr)
{
- const struct opt_hdr *client_id, *server_id, *ia;
+ const struct opt_server_id *server_id = NULL;
+ const struct opt_hdr *client_id = NULL;
+ /* The _storage variables can't be local to the blocks they're used in,
+ * because IOV_*_HEADER() may return pointers to them which are
+ * dereferenced afterwards. Since we don't have Rust-like lifetime
+ * tracking, cppcheck can't reasonably determine that, so we must
+ * suppress its warnings. */
+ /* cppcheck-suppress [variableScope,unmatchedSuppression] */
+ struct opt_server_id server_id_storage;
+ struct iov_tail opt, client_id_base;
+ const struct opt_ia_na *ia = NULL;
+ /* cppcheck-suppress [variableScope,unmatchedSuppression] */
+ struct opt_hdr client_id_storage;
+ /* cppcheck-suppress [variableScope,unmatchedSuppression] */
+ struct opt_ia_na ia_storage;
const struct in6_addr *src;
+ struct msg_hdr mh_storage;
const struct msg_hdr *mh;
+ struct udphdr uh_storage;
const struct udphdr *uh;
- struct opt_hdr *bad_ia;
size_t mlen, n;
- uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
+ uh = IOV_REMOVE_HEADER(data, uh_storage);
if (!uh)
return -1;
@@ -511,6 +583,7 @@ int dhcpv6(struct ctx *c, const struct pool *p,
if (!IN6_IS_ADDR_MULTICAST(daddr))
return -1;
+ mlen = iov_tail_size(data);
if (mlen + sizeof(*uh) != ntohs(uh->len) || mlen < sizeof(*mh))
return -1;
@@ -518,20 +591,26 @@ int dhcpv6(struct ctx *c, const struct pool *p,
src = &c->ip6.our_tap_ll;
- mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL);
+ mh = IOV_REMOVE_HEADER(data, mh_storage);
if (!mh)
return -1;
- client_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_CLIENTID);
+ client_id_base = *data;
+ if (dhcpv6_opt(&client_id_base, OPT_CLIENTID))
+ client_id = IOV_PEEK_HEADER(&client_id_base, client_id_storage);
if (!client_id || ntohs(client_id->l) > OPT_VSIZE(client_id))
return -1;
- server_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_SERVERID);
- if (server_id && ntohs(server_id->l) != OPT_VSIZE(server_id))
+ opt = *data;
+ if (dhcpv6_opt(&opt, OPT_SERVERID))
+ server_id = IOV_PEEK_HEADER(&opt, server_id_storage);
+ if (server_id && ntohs(server_id->hdr.l) != OPT_VSIZE(server_id))
return -1;
- ia = dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_NA);
- if (ia && ntohs(ia->l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta)))
+ opt = *data;
+ if (dhcpv6_opt(&opt, OPT_IA_NA))
+ ia = IOV_PEEK_HEADER(&opt, ia_storage);
+ if (ia && ntohs(ia->hdr.l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta)))
return -1;
resp.hdr.type = TYPE_REPLY;
@@ -546,29 +625,10 @@ int dhcpv6(struct ctx *c, const struct pool *p,
if (mh->type == TYPE_CONFIRM && server_id)
return -1;
- if ((bad_ia = dhcpv6_ia_notonlink(p, &c->ip6.addr))) {
- info("DHCPv6: received CONFIRM with inappropriate IA,"
- " sending NotOnLink status in REPLY");
-
- bad_ia->l = htons(OPT_VSIZE(ia_na) +
- sizeof(sc_not_on_link));
- n = sizeof(struct opt_ia_na);
- memcpy(resp_not_on_link.var, bad_ia, n);
-
- memcpy(resp_not_on_link.var + n,
- &sc_not_on_link, sizeof(sc_not_on_link));
- n += sizeof(sc_not_on_link);
-
- memcpy(resp_not_on_link.var + n, client_id,
- sizeof(struct opt_hdr) + ntohs(client_id->l));
- n += sizeof(struct opt_hdr) + ntohs(client_id->l);
-
- n = offsetof(struct resp_not_on_link_t, var) + n;
-
- resp_not_on_link.hdr.xid = mh->xid;
+ if (dhcpv6_ia_notonlink(data, &c->ip6.addr)) {
- tap_udp6_send(c, src, 547, tap_ip6_daddr(c, src), 546,
- mh->xid, &resp_not_on_link, n);
+ dhcpv6_send_ia_notonlink(c, data, &client_id_base,
+ ntohs(client_id->l), mh->xid);
return 1;
}
@@ -580,7 +640,7 @@ int dhcpv6(struct ctx *c, const struct pool *p,
memcmp(&resp.server_id, server_id, sizeof(resp.server_id)))
return -1;
- if (ia || dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_TA))
+ if (ia || dhcpv6_opt(data, OPT_IA_TA))
return -1;
info("DHCPv6: received INFORMATION_REQUEST, sending REPLY");
@@ -606,13 +666,14 @@ int dhcpv6(struct ctx *c, const struct pool *p,
if (ia)
resp.ia_na.iaid = ((struct opt_ia_na *)ia)->iaid;
- memcpy(&resp.client_id, client_id,
- ntohs(client_id->l) + sizeof(struct opt_hdr));
+ iov_to_buf(&client_id_base.iov[0], client_id_base.cnt,
+ client_id_base.off, &resp.client_id,
+ ntohs(client_id->l) + sizeof(struct opt_hdr));
n = offsetof(struct resp_t, client_id) +
sizeof(struct opt_hdr) + ntohs(client_id->l);
n = dhcpv6_dns_fill(c, (char *)&resp, n);
- n = dhcpv6_client_fqdn_fill(p, c, (char *)&resp, n);
+ n = dhcpv6_client_fqdn_fill(data, c, (char *)&resp, n);
resp.hdr.xid = mh->xid;
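dhcpv6_opt() now advances a caller-owned iov_tail to the matching option and leaves it untouched on failure, so callers scan from a copy of the tail rather than passing byte offsets, as dhcpv6() does above for OPT_SERVERID and OPT_IA_NA. A minimal sketch of that pattern, iterating over every instance of one option type; the helper name is hypothetical and it assumes the iov.h and dhcpv6.c definitions from this series:

static int dhcpv6_count_opts(const struct iov_tail *data, uint16_t type)
{
	struct iov_tail current = *data;	/* scan a copy, keep @data intact */
	struct opt_hdr o_storage;
	const struct opt_hdr *o;
	int n = 0;

	while (dhcpv6_opt(&current, type)) {	/* leaves @current at the match */
		o = IOV_REMOVE_HEADER(&current, o_storage);
		if (!o)
			break;

		/* skip the option payload so the next call finds the next match */
		if (!iov_drop_header(&current, ntohs(o->l)))
			break;

		n++;
	}

	return n;
}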
diff --git a/dhcpv6.h b/dhcpv6.h
index 5809988..c706dfd 100644
--- a/dhcpv6.h
+++ b/dhcpv6.h
@@ -6,7 +6,7 @@
#ifndef DHCPV6_H
#define DHCPV6_H
-int dhcpv6(struct ctx *c, const struct pool *p,
+int dhcpv6(struct ctx *c, struct iov_tail *data,
struct in6_addr *saddr, struct in6_addr *daddr);
void dhcpv6_init(const struct ctx *c);
diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore
index 3b5a10a..f6272cf 100644
--- a/doc/platform-requirements/.gitignore
+++ b/doc/platform-requirements/.gitignore
@@ -1,3 +1,4 @@
+/listen-vs-repair
/reuseaddr-priority
/recv-zero
/udp-close-dup
diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile
index 6a7d374..83930ef 100644
--- a/doc/platform-requirements/Makefile
+++ b/doc/platform-requirements/Makefile
@@ -3,8 +3,8 @@
# Copyright Red Hat
# Author: David Gibson <david@gibson.dropbear.id.au>
-TARGETS = reuseaddr-priority recv-zero udp-close-dup
-SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c
+TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair
+SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c
CFLAGS = -Wall
all: cppcheck clang-tidy $(TARGETS:%=check-%)
diff --git a/doc/platform-requirements/common.h b/doc/platform-requirements/common.h
index 8844b1e..e85fc2b 100644
--- a/doc/platform-requirements/common.h
+++ b/doc/platform-requirements/common.h
@@ -15,6 +15,7 @@
#include <stdio.h>
#include <stdlib.h>
+__attribute__((format(printf, 1, 2), noreturn))
static inline void die(const char *fmt, ...)
{
va_list ap;
diff --git a/doc/platform-requirements/listen-vs-repair.c b/doc/platform-requirements/listen-vs-repair.c
new file mode 100644
index 0000000..d31fe3f
--- /dev/null
+++ b/doc/platform-requirements/listen-vs-repair.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* listen-vs-repair.c
+ *
+ * Do listening sockets have address conflicts with sockets under repair
+ * ====================================================================
+ *
+ * When we accept() an incoming connection the accept()ed socket will have the
+ * same local address as the listening socket. This can be a complication on
+ * migration. On the migration target we've already set up listening sockets
+ * according to the command line. However, to restore connections that we're
+ * migrating in, we need to bind the new sockets to the same address, which would
+ * be an address conflict on the face of it. This test program verifies that
+ * enabling repair mode before bind() correctly suppresses that conflict.
+ *
+ * Copyright Red Hat
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ */
+
+/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "common.h"
+
+#define PORT 13256U
+#define CPORT 13257U
+
+/* 127.0.0.1:PORT */
+static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT);
+
+/* 127.0.0.1:CPORT */
+static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT);
+
+/* Put ourselves into a network sandbox */
+static void net_sandbox(void)
+{
+ /* NOLINTNEXTLINE(altera-struct-pack-align) */
+ const struct req_t {
+ struct nlmsghdr nlh;
+ struct ifinfomsg ifm;
+ } __attribute__((packed)) req = {
+ .nlh.nlmsg_type = RTM_NEWLINK,
+ .nlh.nlmsg_flags = NLM_F_REQUEST,
+ .nlh.nlmsg_len = sizeof(req),
+ .nlh.nlmsg_seq = 1,
+ .ifm.ifi_family = AF_UNSPEC,
+ .ifm.ifi_index = 1,
+ .ifm.ifi_flags = IFF_UP,
+ .ifm.ifi_change = IFF_UP,
+ };
+ int nl;
+
+ if (unshare(CLONE_NEWUSER | CLONE_NEWNET))
+ die("unshare(): %s\n", strerror(errno));
+
+ /* Bring up lo in the new netns */
+ nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+ if (nl < 0)
+ die("Can't create netlink socket: %s\n", strerror(errno));
+
+ if (send(nl, &req, sizeof(req), 0) < 0)
+ die("Netlink send(): %s\n", strerror(errno));
+ close(nl);
+}
+
+static void check(void)
+{
+ int s1, s2, op;
+
+ s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+ if (s1 < 0)
+ die("socket() 1: %s\n", strerror(errno));
+
+ if (bind(s1, (struct sockaddr *)&addr, sizeof(addr)))
+ die("bind() 1: %s\n", strerror(errno));
+
+ if (listen(s1, 0))
+ die("listen(): %s\n", strerror(errno));
+
+ s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+ if (s2 < 0)
+ die("socket() 2: %s\n", strerror(errno));
+
+ op = TCP_REPAIR_ON;
+ if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
+ die("TCP_REPAIR: %s\n", strerror(errno));
+
+ if (bind(s2, (struct sockaddr *)&addr, sizeof(addr)))
+ die("bind() 2: %s\n", strerror(errno));
+
+ if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr)))
+ die("connect(): %s\n", strerror(errno));
+
+ op = TCP_REPAIR_OFF_NO_WP;
+ if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op)))
+ die("TCP_REPAIR: %s\n", strerror(errno));
+
+ close(s1);
+ close(s2);
+}
+
+int main(int argc, char *argv[])
+{
+ (void)argc;
+ (void)argv;
+
+ net_sandbox();
+
+ check();
+
+ printf("Repair mode appears to properly suppress conflicts with listening sockets\n");
+
+ exit(0);
+}
diff --git a/doc/platform-requirements/reuseaddr-priority.c b/doc/platform-requirements/reuseaddr-priority.c
index 701b6ff..af39a39 100644
--- a/doc/platform-requirements/reuseaddr-priority.c
+++ b/doc/platform-requirements/reuseaddr-priority.c
@@ -46,13 +46,13 @@
/* Different cases for receiving socket configuration */
enum sock_type {
/* Socket is bound to 0.0.0.0:DSTPORT and not connected */
- SOCK_BOUND_ANY = 0,
+ SOCK_BOUND_ANY,
/* Socket is bound to 127.0.0.1:DSTPORT and not connected */
- SOCK_BOUND_LO = 1,
+ SOCK_BOUND_LO,
/* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */
- SOCK_CONNECTED = 2,
+ SOCK_CONNECTED,
NUM_SOCK_TYPES,
};
diff --git a/epoll_ctl.c b/epoll_ctl.c
new file mode 100644
index 0000000..728a2af
--- /dev/null
+++ b/epoll_ctl.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* epoll_ctl.c - epoll manipulation helpers
+ *
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ */
+
+#include <errno.h>
+
+#include "epoll_ctl.h"
+
+/**
+ * epoll_add() - Add a file descriptor to an epollfd
+ * @epollfd: epoll file descriptor to add to
+ * @events: epoll events
+ * @ref: epoll reference for the file descriptor (includes fd and metadata)
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+int epoll_add(int epollfd, uint32_t events, union epoll_ref ref)
+{
+ struct epoll_event ev;
+ int ret;
+
+ ev.events = events;
+ ev.data.u64 = ref.u64;
+
+ ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+ if (ret == -1) {
+ ret = -errno;
+ err("Failed to add fd to epoll: %s", strerror_(-ret));
+ }
+
+ return ret;
+}
+
+/**
+ * epoll_del() - Remove a file descriptor from an epollfd
+ * @epollfd: epoll file descriptor to remove from
+ * @fd: File descriptor to remove
+ */
+void epoll_del(int epollfd, int fd)
+{
+ epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, NULL);
+}
diff --git a/epoll_ctl.h b/epoll_ctl.h
new file mode 100644
index 0000000..2d7e712
--- /dev/null
+++ b/epoll_ctl.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ */
+
+#ifndef EPOLL_CTL_H
+#define EPOLL_CTL_H
+
+#include <sys/epoll.h>
+
+#include "util.h"
+#include "passt.h"
+#include "epoll_type.h"
+#include "flow.h"
+#include "tcp.h"
+#include "udp.h"
+
+/**
+ * union epoll_ref - Breakdown of reference for epoll fd bookkeeping
+ * @type: Type of fd (tells us what to do with events)
+ * @fd: File descriptor number (implies < 2^24 total descriptors)
+ * @flow: Index of the flow this fd is linked to
+ * @tcp_listen: TCP-specific reference part for listening sockets
+ * @udp: UDP-specific reference part
+ * @data: Data handled by protocol handlers
+ * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone
+ * @queue: vhost-user queue index for this fd
+ * @u64: Opaque reference for epoll_ctl() and epoll_wait()
+ */
+union epoll_ref {
+ struct {
+ enum epoll_type type:8;
+ int32_t fd:FD_REF_BITS;
+ union {
+ uint32_t flow;
+ flow_sidx_t flowside;
+ union tcp_listen_epoll_ref tcp_listen;
+ union udp_listen_epoll_ref udp;
+ uint32_t data;
+ int nsdir_fd;
+ int queue;
+ };
+ };
+ uint64_t u64;
+};
+static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
+ "epoll_ref must have same size as epoll_data");
+
+int epoll_add(int epollfd, uint32_t events, union epoll_ref ref);
+void epoll_del(int epollfd, int fd);
+#endif /* EPOLL_CTL_H */
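epoll_add() and epoll_del() above wrap epoll_ctl(), with the packed union epoll_ref travelling in epoll_event.data.u64 so the main loop can recover the descriptor type on wakeup. A minimal usage sketch, assuming epollfd is an existing epoll instance and s an already-bound ping socket; the surrounding variables are illustrative:

	union epoll_ref ref = { .type = EPOLL_TYPE_PING, .fd = s };
	int rc;

	/* register the socket: ref.u64 comes back in ev.data.u64 on events */
	rc = epoll_add(epollfd, EPOLLIN, ref);
	if (rc < 0)
		return rc;	/* negative errno, already logged by epoll_add() */

	/* ...on close, deregister before closing the descriptor... */
	epoll_del(epollfd, s);
	close(s);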
diff --git a/epoll_type.h b/epoll_type.h
index 7f2a121..a90ffb6 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -22,8 +22,8 @@ enum epoll_type {
EPOLL_TYPE_TCP_TIMER,
/* UDP "listening" sockets */
EPOLL_TYPE_UDP_LISTEN,
- /* UDP socket for replies on a specific flow */
- EPOLL_TYPE_UDP_REPLY,
+ /* UDP socket for a specific flow */
+ EPOLL_TYPE_UDP,
/* ICMP/ICMPv6 ping sockets */
EPOLL_TYPE_PING,
/* inotify fd watching for end of netns (pasta) */
@@ -44,6 +44,8 @@ enum epoll_type {
EPOLL_TYPE_REPAIR_LISTEN,
/* TCP_REPAIR helper socket */
EPOLL_TYPE_REPAIR,
+ /* Netlink neighbour subscription socket */
+ EPOLL_TYPE_NL_NEIGH,
EPOLL_NUM_TYPES,
};
diff --git a/flow.c b/flow.c
index 749c498..4f53486 100644
--- a/flow.c
+++ b/flow.c
@@ -81,7 +81,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
*
* Free cluster list
* flow_first_free gives the index of the first (lowest index) free cluster.
- * Each free cluster has the index of the next free cluster, or MAX_FLOW if
+ * Each free cluster has the index of the next free cluster, or FLOW_MAX if
* it is the last free cluster. Together these form a linked list of free
* clusters, in strictly increasing order of index.
*
@@ -116,6 +116,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
unsigned flow_first_free;
union flow flowtab[FLOW_MAX];
static const union flow *flow_new_entry; /* = NULL */
+static int epoll_id_to_fd[EPOLLFD_ID_MAX];
/* Hash table to index it */
#define FLOW_HASH_LOAD 70 /* % */
@@ -162,18 +163,13 @@ static void flowside_from_af(struct flowside *side, sa_family_t af,
* @err: Filled in with errno if something failed
* @type: Socket epoll type
* @sa: Socket address
- * @sl: Length of @sa
- * @data: epoll reference data
*/
struct flowside_sock_args {
const struct ctx *c;
int fd;
int err;
enum epoll_type type;
- const struct sockaddr *sa;
- socklen_t sl;
- const char *path;
- uint32_t data;
+ const union sockaddr_inany *sa;
};
/** flowside_sock_splice() - Create and bind socket for PIF_SPLICE based on flowside
@@ -187,8 +183,7 @@ static int flowside_sock_splice(void *arg)
ns_enter(a->c);
- a->fd = sock_l4_sa(a->c, a->type, a->sa, a->sl, NULL,
- a->sa->sa_family == AF_INET6, a->data);
+ a->fd = sock_l4(a->c, a->type, a->sa, NULL);
a->err = errno;
return 0;
@@ -205,15 +200,14 @@ static int flowside_sock_splice(void *arg)
* (if specified).
*/
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
- const struct flowside *tgt, uint32_t data)
+ const struct flowside *tgt)
{
const char *ifname = NULL;
union sockaddr_inany sa;
- socklen_t sl;
ASSERT(pif_is_socket(pif));
- pif_sockaddr(c, &sa, &sl, pif, &tgt->oaddr, tgt->oport);
+ pif_sockaddr(c, &sa, pif, &tgt->oaddr, tgt->oport);
switch (pif) {
case PIF_HOST:
@@ -224,13 +218,11 @@ int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
else if (sa.sa_family == AF_INET6)
ifname = c->ip6.ifname_out;
- return sock_l4_sa(c, type, &sa, sl, ifname,
- sa.sa_family == AF_INET6, data);
+ return sock_l4(c, type, &sa, ifname);
case PIF_SPLICE: {
struct flowside_sock_args args = {
- .c = c, .type = type,
- .sa = &sa.sa, .sl = sl, .data = data,
+ .c = c, .type = type, .sa = &sa,
};
NS_CALL(flowside_sock_splice, &args);
errno = args.err;
@@ -259,10 +251,9 @@ int flowside_connect(const struct ctx *c, int s,
uint8_t pif, const struct flowside *tgt)
{
union sockaddr_inany sa;
- socklen_t sl;
- pif_sockaddr(c, &sa, &sl, pif, &tgt->eaddr, tgt->eport);
- return connect(s, &sa.sa, sl);
+ pif_sockaddr(c, &sa, pif, &tgt->eaddr, tgt->eport);
+ return connect(s, &sa.sa, socklen_inany(&sa));
}
/** flow_log_ - Log flow-related message
@@ -350,6 +341,68 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
}
/**
+ * flow_in_epoll() - Check if flow is registered with an epoll instance
+ * @f: Flow to check
+ *
+ * Return: true if flow is registered with epoll, false otherwise
+ */
+bool flow_in_epoll(const struct flow_common *f)
+{
+ return f->epollid != EPOLLFD_ID_INVALID;
+}
+
+/**
+ * flow_epollfd() - Get the epoll file descriptor for a flow
+ * @f: Flow to query
+ *
+ * Return: epoll file descriptor associated with the flow's thread
+ */
+int flow_epollfd(const struct flow_common *f)
+{
+ if (f->epollid >= EPOLLFD_ID_MAX) {
+ flow_log_(f, true, LOG_WARNING,
+ "Invalid epollid %i for flow, assuming default",
+ f->epollid);
+ return epoll_id_to_fd[EPOLLFD_ID_DEFAULT];
+ }
+
+ return epoll_id_to_fd[f->epollid];
+}
+
+/**
+ * flow_epollid_set() - Associate a flow with an epoll id
+ * @f: Flow to update
+ * @epollid: epoll id to associate with this flow
+ */
+void flow_epollid_set(struct flow_common *f, int epollid)
+{
+ ASSERT(epollid < EPOLLFD_ID_MAX);
+
+ f->epollid = epollid;
+}
+
+/**
+ * flow_epollid_clear() - Clear the flow epoll id
+ * @f: Flow to update
+ */
+void flow_epollid_clear(struct flow_common *f)
+{
+ f->epollid = EPOLLFD_ID_INVALID;
+}
+
+/**
+ * flow_epollid_register() - Initialize the epoll id -> fd mapping
+ * @epollid: epoll id to associate to
+ * @epollfd: epoll file descriptor for this epoll id
+ */
+void flow_epollid_register(int epollid, int epollfd)
+{
+ ASSERT(epollid < EPOLLFD_ID_MAX);
+
+ epoll_id_to_fd[epollid] = epollfd;
+}
+
+/**
* flow_initiate_() - Move flow to INI, setting pif[INISIDE]
* @flow: Flow to change state
* @pif: pif of the initiating side
@@ -396,18 +449,27 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
* @flow: Flow to change state
* @pif: pif of the initiating side
* @ssa: Source socket address
+ * @daddr: Destination address (may be NULL)
* @dport: Destination port
*
* Return: pointer to the initiating flowside information
*/
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
const union sockaddr_inany *ssa,
+ const union inany_addr *daddr,
in_port_t dport)
{
struct flowside *ini = &flow->f.side[INISIDE];
- inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
- if (inany_v4(&ini->eaddr))
+ if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) {
+ char str[SOCKADDR_STRLEN];
+
+ ASSERT_WITH_MSG(0, "Bad socket address %s",
+ sockaddr_ntop(ssa, str, sizeof(str)));
+ }
+ if (daddr)
+ ini->oaddr = *daddr;
+ else if (inany_v4(&ini->eaddr))
ini->oaddr = inany_any4;
else
ini->oaddr = inany_any6;
@@ -440,6 +502,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
switch (f->pif[INISIDE]) {
case PIF_TAP:
+ memcpy(f->tap_omac, MAC_UNDEF, ETH_ALEN);
tgtpif = fwd_nat_from_tap(c, proto, ini, tgt);
break;
@@ -449,6 +512,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
case PIF_HOST:
tgtpif = fwd_nat_from_host(c, proto, ini, tgt);
+ fwd_neigh_mac_get(c, &tgt->oaddr, f->tap_omac);
break;
default:
@@ -471,7 +535,9 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
/**
* flow_set_type() - Set type and move to TYPED
* @flow: Flow to change state
- * @pif: pif of the initiating side
+ * @type: New flow type to assign
+ *
+ * Return: pointer to the modified flow structure.
*/
union flow *flow_set_type(union flow *flow, enum flow_type type)
{
@@ -539,6 +605,7 @@ union flow *flow_alloc(void)
flow_new_entry = flow;
memset(flow, 0, sizeof(*flow));
+ flow_epollid_clear(&flow->f);
flow_set_state(&flow->f, FLOW_STATE_NEW);
return flow;
@@ -616,7 +683,7 @@ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
* @hash: Raw hash value for flow & side
* @sidx: Flow and side to find bucket for
*
- * Return: If @sidx is in the hash table, its current bucket, otherwise a
+ * Return: if @sidx is in the hash table, its current bucket, otherwise a
* suitable free bucket for it.
*/
static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
@@ -636,7 +703,7 @@ static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
* @c: Execution context
* @sidx: Flow and side to find bucket for
*
- * Return: If @sidx is in the hash table, its current bucket, otherwise a
+ * Return: if @sidx is in the hash table, its current bucket, otherwise a
* suitable free bucket for it.
*/
static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
@@ -751,19 +818,30 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
* @proto: Protocol of the flow (IP L4 protocol number)
* @pif: Interface of the flow
* @esa: Socket address of the endpoint
+ * @oaddr: Our address (may be NULL)
* @oport: Our port number
*
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
*/
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
- const void *esa, in_port_t oport)
+ const void *esa,
+ const union inany_addr *oaddr, in_port_t oport)
{
struct flowside side = {
.oport = oport,
};
- inany_from_sockaddr(&side.eaddr, &side.eport, esa);
- if (inany_v4(&side.eaddr))
+ if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) {
+ char str[SOCKADDR_STRLEN];
+
+ warn("Flow lookup on bad socket address %s",
+ sockaddr_ntop(esa, str, sizeof(str)));
+ return FLOW_SIDX_NONE;
+ }
+
+ if (oaddr)
+ side.oaddr = *oaddr;
+ else if (inany_v4(&side.eaddr))
side.oaddr = inany_any4;
else
side.oaddr = inany_any6;
@@ -780,6 +858,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
{
struct flow_free_cluster *free_head = NULL;
unsigned *last_next = &flow_first_free;
+ bool to_free[FLOW_MAX] = { 0 };
bool timer = false;
union flow *flow;
@@ -790,9 +869,44 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
- flow_foreach_slot(flow) {
+ /* Check which flows we might need to close first, but don't free them
+ * yet as it's not safe to do that in the middle of flow_foreach().
+ */
+ flow_foreach(flow) {
bool closed = false;
+ switch (flow->f.type) {
+ case FLOW_TYPE_NONE:
+ ASSERT(false);
+ break;
+ case FLOW_TCP:
+ closed = tcp_flow_defer(&flow->tcp);
+ break;
+ case FLOW_TCP_SPLICE:
+ closed = tcp_splice_flow_defer(&flow->tcp_splice);
+ if (!closed && timer)
+ tcp_splice_timer(&flow->tcp_splice);
+ break;
+ case FLOW_PING4:
+ case FLOW_PING6:
+ if (timer)
+ closed = icmp_ping_timer(c, &flow->ping, now);
+ break;
+ case FLOW_UDP:
+ closed = udp_flow_defer(c, &flow->udp, now);
+ if (!closed && timer)
+ closed = udp_flow_timer(c, &flow->udp, now);
+ break;
+ default:
+ /* Assume other flow types don't need any handling */
+ ;
+ }
+
+ to_free[FLOW_IDX(flow)] = closed;
+ }
+
+ /* Second step: actually free the flows */
+ flow_foreach_slot(flow) {
switch (flow->f.state) {
case FLOW_STATE_FREE: {
unsigned skip = flow->free.n;
@@ -825,60 +939,31 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
break;
case FLOW_STATE_ACTIVE:
- /* Nothing to do */
+ if (to_free[FLOW_IDX(flow)]) {
+ flow_set_state(&flow->f, FLOW_STATE_FREE);
+ memset(flow, 0, sizeof(*flow));
+
+ if (free_head) {
+ /* Add slot to current free cluster */
+ ASSERT(FLOW_IDX(flow) ==
+ FLOW_IDX(free_head) + free_head->n);
+ free_head->n++;
+ flow->free.n = flow->free.next = 0;
+ } else {
+ /* Create new free cluster */
+ free_head = &flow->free;
+ free_head->n = 1;
+ *last_next = FLOW_IDX(flow);
+ last_next = &free_head->next;
+ }
+ } else {
+ free_head = NULL;
+ }
break;
default:
ASSERT(false);
}
-
- switch (flow->f.type) {
- case FLOW_TYPE_NONE:
- ASSERT(false);
- break;
- case FLOW_TCP:
- closed = tcp_flow_defer(&flow->tcp);
- break;
- case FLOW_TCP_SPLICE:
- closed = tcp_splice_flow_defer(&flow->tcp_splice);
- if (!closed && timer)
- tcp_splice_timer(c, &flow->tcp_splice);
- break;
- case FLOW_PING4:
- case FLOW_PING6:
- if (timer)
- closed = icmp_ping_timer(c, &flow->ping, now);
- break;
- case FLOW_UDP:
- closed = udp_flow_defer(&flow->udp);
- if (!closed && timer)
- closed = udp_flow_timer(c, &flow->udp, now);
- break;
- default:
- /* Assume other flow types don't need any handling */
- ;
- }
-
- if (closed) {
- flow_set_state(&flow->f, FLOW_STATE_FREE);
- memset(flow, 0, sizeof(*flow));
-
- if (free_head) {
- /* Add slot to current free cluster */
- ASSERT(FLOW_IDX(flow) ==
- FLOW_IDX(free_head) + free_head->n);
- free_head->n++;
- flow->free.n = flow->free.next = 0;
- } else {
- /* Create new free cluster */
- free_head = &flow->free;
- free_head->n = 1;
- *last_next = FLOW_IDX(flow);
- last_next = &free_head->next;
- }
- } else {
- free_head = NULL;
- }
}
*last_next = FLOW_MAX;
@@ -912,6 +997,21 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
}
/**
+ * flow_migrate_need_repair() - Do we need to set repair mode for any flow?
+ *
+ * Return: true if repair mode is needed, false otherwise
+ */
+static bool flow_migrate_need_repair(void)
+{
+ union flow *flow;
+
+ foreach_established_tcp_flow(flow)
+ return true;
+
+ return false;
+}
+
+/**
* flow_migrate_repair_all() - Turn repair mode on or off for all flows
* @c: Execution context
* @enable: Switch repair mode on if set, off otherwise
@@ -966,6 +1066,9 @@ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
(void)stage;
(void)fd;
+ if (flow_migrate_need_repair())
+ repair_wait(c);
+
if ((rc = flow_migrate_repair_all(c, true)))
return -rc;
@@ -1019,8 +1122,8 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
foreach_established_tcp_flow(flow) {
rc = tcp_flow_migrate_source(fd, &flow->tcp);
if (rc) {
- err("Can't send data, flow %u: %s", FLOW_IDX(flow),
- strerror_(-rc));
+ flow_err(flow, "Can't send data: %s",
+ strerror_(-rc));
if (!first)
die("Inconsistent migration state, exiting");
@@ -1044,10 +1147,10 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
* as EIO).
*/
foreach_established_tcp_flow(flow) {
- rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
+ rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp);
if (rc) {
- err("Extended data for flow %u: %s", FLOW_IDX(flow),
- strerror_(-rc));
+ flow_err(flow, "Can't send extended data: %s",
+ strerror_(-rc));
if (rc == -EIO)
die("Inconsistent migration state, exiting");
@@ -1083,6 +1186,9 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
if (!count)
return 0;
+ if ((rc = repair_wait(c)))
+ return -rc;
+
if ((rc = flow_migrate_repair_all(c, true)))
return -rc;
@@ -1092,8 +1198,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
for (i = 0; i < count; i++) {
rc = tcp_flow_migrate_target(c, fd);
if (rc) {
- debug("Migration data failure at flow %u: %s, abort",
- i, strerror_(-rc));
+ flow_dbg(FLOW(i), "Migration data failure, abort: %s",
+ strerror_(-rc));
return -rc;
}
}
@@ -1103,8 +1209,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
for (i = 0; i < count; i++) {
rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
if (rc) {
- debug("Migration data failure at flow %u: %s, abort",
- i, strerror_(-rc));
+ flow_dbg(FLOW(i), "Migration data failure, abort: %s",
+ strerror_(-rc));
return -rc;
}
}
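flow_lookup_sa() and flow_initiate_sa() now take the local (our-side) address explicitly, so callers can disambiguate flows that share an endpoint but arrived on different local addresses. A minimal lookup sketch, assuming sa was filled by recvmsg() and dst recovered from ancillary data such as IP_PKTINFO; the variables are illustrative:

	union sockaddr_inany sa;	/* filled by recvmsg()/accept() */
	union inany_addr dst;		/* local address, e.g. from IP_PKTINFO */
	flow_sidx_t sidx;

	/* pass NULL as @oaddr to keep the old behaviour of matching on the
	 * unspecified address; FLOW_SIDX_NONE is returned if nothing matches */
	sidx = flow_lookup_sa(c, IPPROTO_UDP, PIF_HOST, &sa, &dst, dport);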
diff --git a/flow.h b/flow.h
index dcf7645..b43b0b1 100644
--- a/flow.h
+++ b/flow.h
@@ -167,7 +167,7 @@ static inline bool flowside_eq(const struct flowside *left,
}
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
- const struct flowside *tgt, uint32_t data);
+ const struct flowside *tgt);
int flowside_connect(const struct ctx *c, int s,
uint8_t pif, const struct flowside *tgt);
@@ -177,6 +177,8 @@ int flowside_connect(const struct ctx *c, int s,
* @type: Type of packet flow
* @pif[]: Interface for each side of the flow
* @side[]: Information for each side of the flow
+ * @tap_omac: MAC address of remote endpoint as seen from the guest
+ * @epollid: epollfd identifier, or EPOLLFD_ID_INVALID
*/
struct flow_common {
#ifdef __GNUC__
@@ -192,8 +194,18 @@ struct flow_common {
#endif
uint8_t pif[SIDES];
struct flowside side[SIDES];
+
+ uint8_t tap_omac[6];
+
+#define EPOLLFD_ID_BITS 8
+ unsigned int epollid:EPOLLFD_ID_BITS;
};
+#define EPOLLFD_ID_DEFAULT 0
+#define EPOLLFD_ID_SIZE (1 << EPOLLFD_ID_BITS)
+#define EPOLLFD_ID_MAX (EPOLLFD_ID_SIZE - 1)
+#define EPOLLFD_ID_INVALID EPOLLFD_ID_MAX
+
#define FLOW_INDEX_BITS 17 /* 128k - 1 */
#define FLOW_MAX MAX_FROM_BITS(FLOW_INDEX_BITS)
@@ -243,11 +255,17 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
const void *eaddr, const void *oaddr,
in_port_t eport, in_port_t oport);
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
- const void *esa, in_port_t oport);
+ const void *esa,
+ const union inany_addr *oaddr, in_port_t oport);
union flow;
void flow_init(void);
+bool flow_in_epoll(const struct flow_common *f);
+int flow_epollfd(const struct flow_common *f);
+void flow_epollid_set(struct flow_common *f, int epollid);
+void flow_epollid_clear(struct flow_common *f);
+void flow_epollid_register(int epollid, int epollfd);
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
int fd);
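flow_common now stores an 8-bit epollid rather than an epoll fd; flow_epollid_register() fills the id-to-fd table at startup and flow_epollfd() translates it back when (de)registering flow sockets. A minimal sketch of the life cycle for the default instance, assuming c->epollfd is the main epoll descriptor; flow, ref and fd are illustrative:

	/* startup: map the default id to the main epoll instance */
	flow_epollid_register(EPOLLFD_ID_DEFAULT, c->epollfd);

	/* when a flow gains a socket: record which instance owns it */
	flow_epollid_set(&flow->f, EPOLLFD_ID_DEFAULT);
	epoll_add(flow_epollfd(&flow->f), EPOLLIN | EPOLLRDHUP, ref);

	/* teardown: deregister only if the flow was ever added */
	if (flow_in_epoll(&flow->f))
		epoll_del(flow_epollfd(&flow->f), fd);
	flow_epollid_clear(&flow->f);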
diff --git a/flow_table.h b/flow_table.h
index fd2c57b..5ee13ac 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -93,6 +93,7 @@ extern union flow flowtab[];
*/
static inline unsigned flow_idx(const struct flow_common *f)
{
+ /* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
return (union flow *)f - flowtab;
}
@@ -139,7 +140,7 @@ static inline uint8_t pif_at_sidx(flow_sidx_t sidx)
/** flowside_at_sidx() - Retrieve a specific flowside
* @sidx: Flow & side index
*
- * Return: Flowside for the flow & side given by @sidx
+ * Return: flowside for the flow & side given by @sidx
*/
static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx)
{
@@ -199,6 +200,7 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
const void *daddr, in_port_t dport);
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
const union sockaddr_inany *ssa,
+ const union inany_addr *daddr,
in_port_t dport);
const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
sa_family_t af,
diff --git a/fwd.c b/fwd.c
index 2829cd2..44a0e10 100644
--- a/fwd.c
+++ b/fwd.c
@@ -26,13 +26,225 @@
#include "passt.h"
#include "lineread.h"
#include "flow_table.h"
+#include "netlink.h"
+#include "arp.h"
+#include "ndp.h"
-/* Empheral port range: values from RFC 6335 */
+/* Ephemeral port range: values from RFC 6335 */
static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
+#define NEIGH_TABLE_SLOTS 1024
+#define NEIGH_TABLE_SIZE (NEIGH_TABLE_SLOTS / 2)
+static_assert((NEIGH_TABLE_SLOTS & (NEIGH_TABLE_SLOTS - 1)) == 0,
+ "NEIGH_TABLE_SLOTS must be a power of two");
+
+/**
+ * struct neigh_table_entry - Entry in the ARP/NDP table
+ * @next: Next entry in slot or free list
+ * @addr: IP address of represented host
+ * @mac: MAC address of represented host
+ * @permanent: Entry cannot be altered or freed by notification
+ */
+struct neigh_table_entry {
+ struct neigh_table_entry *next;
+ union inany_addr addr;
+ uint8_t mac[ETH_ALEN];
+ bool permanent;
+};
+
+/**
+ * struct neigh_table - Cache of ARP/NDP table contents
+ * @entries: Entries to be plugged into the hash slots when allocated
+ * @slots: Hash table slots
+ * @free: Linked list of unused entries
+ */
+struct neigh_table {
+ struct neigh_table_entry entries[NEIGH_TABLE_SIZE];
+ struct neigh_table_entry *slots[NEIGH_TABLE_SLOTS];
+ struct neigh_table_entry *free;
+};
+
+static struct neigh_table neigh_table;
+
+/**
+ * neigh_table_slot() - Hash key to a number within the table range
+ * @c: Execution context
+ * @key: The key to be used for the hash
+ *
+ * Return: the resulting hash value
+ */
+static size_t neigh_table_slot(const struct ctx *c,
+ const union inany_addr *key)
+{
+ struct siphash_state st = SIPHASH_INIT(c->hash_secret);
+ uint32_t i;
+
+ inany_siphash_feed(&st, key);
+ i = siphash_final(&st, sizeof(*key), 0);
+
+ return ((size_t)i) & (NEIGH_TABLE_SIZE - 1);
+}
+
+/**
+ * fwd_neigh_table_find() - Find a MAC table entry
+ * @c: Execution context
+ * @addr: Neighbour address to be used as key for the lookup
+ *
+ * Return: the matching entry, if found. Otherwise NULL
+ */
+static struct neigh_table_entry *fwd_neigh_table_find(const struct ctx *c,
+ const union inany_addr *addr)
+{
+ size_t slot = neigh_table_slot(c, addr);
+ struct neigh_table_entry *e = neigh_table.slots[slot];
+
+ while (e && !inany_equals(&e->addr, addr))
+ e = e->next;
+
+ return e;
+}
+
+/**
+ * fwd_neigh_table_update() - Allocate or update neighbour table entry
+ * @c: Execution context
+ * @addr: IP address used to determine insertion slot and store in entry
+ * @mac: The MAC address associated with the neighbour address
+ * @permanent: Created entry cannot be altered or freed
+ */
+void fwd_neigh_table_update(const struct ctx *c, const union inany_addr *addr,
+ const uint8_t *mac, bool permanent)
+{
+ struct neigh_table *t = &neigh_table;
+ struct neigh_table_entry *e;
+ ssize_t slot;
+
+ /* MAC address might change sometimes */
+ e = fwd_neigh_table_find(c, addr);
+ if (e) {
+ if (!e->permanent)
+ memcpy(e->mac, mac, ETH_ALEN);
+ return;
+ }
+
+ e = t->free;
+ if (!e) {
+ debug("Failed to allocate neighbour table entry");
+ return;
+ }
+ t->free = e->next;
+ slot = neigh_table_slot(c, addr);
+ e->next = t->slots[slot];
+ t->slots[slot] = e;
+
+ memcpy(&e->addr, addr, sizeof(*addr));
+ memcpy(e->mac, mac, ETH_ALEN);
+ e->permanent = permanent;
+
+ if (!memcmp(mac, c->our_tap_mac, ETH_ALEN))
+ return;
+
+ if (inany_v4(addr))
+ arp_announce(c, inany_v4(addr), e->mac);
+ else
+ ndp_unsolicited_na(c, &addr->a6);
+}
+
+/**
+ * fwd_neigh_table_free() - Remove an entry from a slot and add it to free list
+ * @c: Execution context
+ * @addr: IP address used to find the slot for the entry
+ */
+void fwd_neigh_table_free(const struct ctx *c, const union inany_addr *addr)
+{
+ ssize_t slot = neigh_table_slot(c, addr);
+ struct neigh_table *t = &neigh_table;
+ struct neigh_table_entry *e, **prev;
+
+ prev = &t->slots[slot];
+ e = t->slots[slot];
+ while (e && !inany_equals(&e->addr, addr)) {
+ prev = &e->next;
+ e = e->next;
+ }
+
+ if (!e || e->permanent)
+ return;
+
+ *prev = e->next;
+ e->next = t->free;
+ t->free = e;
+ memset(&e->addr, 0, sizeof(*addr));
+ memset(e->mac, 0, ETH_ALEN);
+}
+
+/**
+ * fwd_neigh_mac_get() - Look up MAC address in the ARP/NDP table
+ * @c: Execution context
+ * @addr: Neighbour IP address used as lookup key
+ * @mac: Buffer for returned MAC address
+ */
+void fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+ uint8_t *mac)
+{
+ const struct neigh_table_entry *e = fwd_neigh_table_find(c, addr);
+
+ if (!e) {
+ union inany_addr ggw;
+
+ if (inany_v4(addr))
+ ggw = inany_from_v4(c->ip4.guest_gw);
+ else
+ ggw.a6 = c->ip6.guest_gw;
+
+ e = fwd_neigh_table_find(c, &ggw);
+ }
+
+ if (e)
+ memcpy(mac, e->mac, ETH_ALEN);
+ else
+ memcpy(mac, c->our_tap_mac, ETH_ALEN);
+}
+
+/**
+ * fwd_neigh_table_init() - Initialize the neighbour table
+ * @c: Execution context
+ */
+void fwd_neigh_table_init(const struct ctx *c)
+{
+ union inany_addr mhl = inany_from_v4(c->ip4.map_host_loopback);
+ union inany_addr mga = inany_from_v4(c->ip4.map_guest_addr);
+ struct neigh_table *t = &neigh_table;
+ struct neigh_table_entry *e;
+ int i;
+
+ memset(t, 0, sizeof(*t));
+
+ for (i = 0; i < NEIGH_TABLE_SIZE; i++) {
+ e = &t->entries[i];
+ e->next = t->free;
+ t->free = e;
+ }
+
+ /* Blocker entries to stop events from hosts using these addresses */
+ if (!inany_is_unspecified4(&mhl))
+ fwd_neigh_table_update(c, &mhl, c->our_tap_mac, true);
+
+ if (!inany_is_unspecified4(&mga))
+ fwd_neigh_table_update(c, &mga, c->our_tap_mac, true);
+
+ mhl = *(union inany_addr *)&c->ip6.map_host_loopback;
+ mga = *(union inany_addr *)&c->ip6.map_guest_addr;
+
+ if (!inany_is_unspecified6(&mhl))
+ fwd_neigh_table_update(c, &mhl, c->our_tap_mac, true);
+
+ if (!inany_is_unspecified6(&mga))
+ fwd_neigh_table_update(c, &mga, c->our_tap_mac, true);
+}
+
/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
*
 * Work out what ports the host thinks are ephemeral and record it for later
@@ -110,13 +322,11 @@ bool fwd_port_is_ephemeral(in_port_t port)
* @fd: fd for relevant /proc/net file
* @lstate: Code for listening state to scan for
* @map: Bitmap where numbers of ports in listening state will be set
- * @exclude: Bitmap of ports to exclude from setting (and clear)
*
* #syscalls:pasta lseek
* #syscalls:pasta ppc64le:_llseek ppc64:_llseek arm:_llseek
*/
-static void procfs_scan_listen(int fd, unsigned int lstate,
- uint8_t *map, const uint8_t *exclude)
+static void procfs_scan_listen(int fd, unsigned int lstate, uint8_t *map)
{
struct lineread lr;
unsigned long port;
@@ -141,56 +351,72 @@ static void procfs_scan_listen(int fd, unsigned int lstate,
if (state != lstate)
continue;
- if (bitmap_isset(exclude, port))
- bitmap_clear(map, port);
- else
- bitmap_set(map, port);
+ bitmap_set(map, port);
}
}
/**
* fwd_scan_ports_tcp() - Scan /proc to update TCP forwarding map
* @fwd: Forwarding information to update
- * @rev: Forwarding information for the reverse direction
+ * @exclude: Ports to _not_ forward
*/
-void fwd_scan_ports_tcp(struct fwd_ports *fwd, const struct fwd_ports *rev)
+static void fwd_scan_ports_tcp(struct fwd_ports *fwd, const uint8_t *exclude)
{
+ if (fwd->mode != FWD_AUTO)
+ return;
+
memset(fwd->map, 0, PORT_BITMAP_SIZE);
- procfs_scan_listen(fwd->scan4, TCP_LISTEN, fwd->map, rev->map);
- procfs_scan_listen(fwd->scan6, TCP_LISTEN, fwd->map, rev->map);
+ procfs_scan_listen(fwd->scan4, TCP_LISTEN, fwd->map);
+ procfs_scan_listen(fwd->scan6, TCP_LISTEN, fwd->map);
+ bitmap_and_not(fwd->map, PORT_BITMAP_SIZE, fwd->map, exclude);
}
/**
* fwd_scan_ports_udp() - Scan /proc to update UDP forwarding map
* @fwd: Forwarding information to update
- * @rev: Forwarding information for the reverse direction
* @tcp_fwd: Corresponding TCP forwarding information
- * @tcp_rev: TCP forwarding information for the reverse direction
+ * @exclude: Ports to _not_ forward
*/
-void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
- const struct fwd_ports *tcp_fwd,
- const struct fwd_ports *tcp_rev)
+static void fwd_scan_ports_udp(struct fwd_ports *fwd,
+ const struct fwd_ports *tcp_fwd,
+ const uint8_t *exclude)
{
- uint8_t exclude[PORT_BITMAP_SIZE];
-
- bitmap_or(exclude, PORT_BITMAP_SIZE, rev->map, tcp_rev->map);
+ if (fwd->mode != FWD_AUTO)
+ return;
memset(fwd->map, 0, PORT_BITMAP_SIZE);
- procfs_scan_listen(fwd->scan4, UDP_LISTEN, fwd->map, exclude);
- procfs_scan_listen(fwd->scan6, UDP_LISTEN, fwd->map, exclude);
+ procfs_scan_listen(fwd->scan4, UDP_LISTEN, fwd->map);
+ procfs_scan_listen(fwd->scan6, UDP_LISTEN, fwd->map);
/* Also forward UDP ports with the same numbers as bound TCP ports.
* This is useful for a handful of protocols (e.g. iperf3) where a TCP
* control port is used to set up transfers on a corresponding UDP
* port.
- *
- * This means we need to skip numbers of TCP ports bound on the other
- * side, too. Otherwise, we would detect corresponding UDP ports as
- * bound and try to forward them from the opposite side, but it's
- * already us handling them.
*/
- procfs_scan_listen(tcp_fwd->scan4, TCP_LISTEN, fwd->map, exclude);
- procfs_scan_listen(tcp_fwd->scan6, TCP_LISTEN, fwd->map, exclude);
+ procfs_scan_listen(tcp_fwd->scan4, TCP_LISTEN, fwd->map);
+ procfs_scan_listen(tcp_fwd->scan6, TCP_LISTEN, fwd->map);
+
+ bitmap_and_not(fwd->map, PORT_BITMAP_SIZE, fwd->map, exclude);
+}
+
+/**
+ * fwd_scan_ports() - Scan automatic port forwarding information
+ * @c: Execution context
+ */
+static void fwd_scan_ports(struct ctx *c)
+{
+ uint8_t excl_tcp_out[PORT_BITMAP_SIZE], excl_udp_out[PORT_BITMAP_SIZE];
+ uint8_t excl_tcp_in[PORT_BITMAP_SIZE], excl_udp_in[PORT_BITMAP_SIZE];
+
+ memcpy(excl_tcp_out, c->tcp.fwd_in.map, sizeof(excl_tcp_out));
+ memcpy(excl_tcp_in, c->tcp.fwd_out.map, sizeof(excl_tcp_in));
+ memcpy(excl_udp_out, c->udp.fwd_in.map, sizeof(excl_udp_out));
+ memcpy(excl_udp_in, c->udp.fwd_out.map, sizeof(excl_udp_in));
+
+ fwd_scan_ports_tcp(&c->tcp.fwd_out, excl_tcp_out);
+ fwd_scan_ports_tcp(&c->tcp.fwd_in, excl_tcp_in);
+ fwd_scan_ports_udp(&c->udp.fwd_out, &c->tcp.fwd_out, excl_udp_out);
+ fwd_scan_ports_udp(&c->udp.fwd_in, &c->tcp.fwd_in, excl_udp_in);
}
/**
@@ -209,25 +435,46 @@ void fwd_scan_ports_init(struct ctx *c)
if (c->tcp.fwd_in.mode == FWD_AUTO) {
c->tcp.fwd_in.scan4 = open_in_ns(c, "/proc/net/tcp", flags);
c->tcp.fwd_in.scan6 = open_in_ns(c, "/proc/net/tcp6", flags);
- fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
}
if (c->udp.fwd_in.mode == FWD_AUTO) {
c->udp.fwd_in.scan4 = open_in_ns(c, "/proc/net/udp", flags);
c->udp.fwd_in.scan6 = open_in_ns(c, "/proc/net/udp6", flags);
- fwd_scan_ports_udp(&c->udp.fwd_in, &c->udp.fwd_out,
- &c->tcp.fwd_in, &c->tcp.fwd_out);
}
if (c->tcp.fwd_out.mode == FWD_AUTO) {
c->tcp.fwd_out.scan4 = open("/proc/net/tcp", flags);
c->tcp.fwd_out.scan6 = open("/proc/net/tcp6", flags);
- fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
}
if (c->udp.fwd_out.mode == FWD_AUTO) {
c->udp.fwd_out.scan4 = open("/proc/net/udp", flags);
c->udp.fwd_out.scan6 = open("/proc/net/udp6", flags);
- fwd_scan_ports_udp(&c->udp.fwd_out, &c->udp.fwd_in,
- &c->tcp.fwd_out, &c->tcp.fwd_in);
}
+ fwd_scan_ports(c);
+}
+
+/* Last time we scanned for open ports */
+static struct timespec scan_ports_run;
+
+/**
+ * fwd_scan_ports_timer() - Rescan open port information when necessary
+ * @c: Execution context
+ * @now: Current (monotonic) time
+ */
+void fwd_scan_ports_timer(struct ctx *c, const struct timespec *now)
+{
+ if (c->mode != MODE_PASTA)
+ return;
+
+ if (timespec_diff_ms(now, &scan_ports_run) < FWD_PORT_SCAN_INTERVAL)
+ return;
+
+ scan_ports_run = *now;
+
+ fwd_scan_ports(c);
+
+ if (!c->no_tcp)
+ tcp_port_rebind_all(c);
+ if (!c->no_udp)
+ udp_port_rebind_all(c);
}
/**
@@ -324,6 +571,30 @@ static bool fwd_guest_accessible(const struct ctx *c,
}
/**
+ * nat_outbound() - Apply address translation for outbound (TAP to HOST)
+ * @c: Execution context
+ * @addr: Input address (as seen on TAP interface)
+ * @translated: Output address (as seen on HOST interface)
+ *
+ * Only handles translations that depend *only* on the address. Anything
+ * related to specific ports or flows is handled elsewhere.
+ */
+static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated)
+{
+ if (inany_equals4(addr, &c->ip4.map_host_loopback))
+ *translated = inany_loopback4;
+ else if (inany_equals6(addr, &c->ip6.map_host_loopback))
+ *translated = inany_loopback6;
+ else if (inany_equals4(addr, &c->ip4.map_guest_addr))
+ *translated = inany_from_v4(c->ip4.addr);
+ else if (inany_equals6(addr, &c->ip6.map_guest_addr))
+ translated->a6 = c->ip6.addr;
+ else
+ *translated = *addr;
+}
+
+/**
* fwd_nat_from_tap() - Determine to forward a flow from the tap interface
* @c: Execution context
* @proto: Protocol (IP L4 protocol number)
@@ -342,16 +613,8 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
else if (is_dns_flow(proto, ini) &&
inany_equals6(&ini->oaddr, &c->ip6.dns_match))
tgt->eaddr.a6 = c->ip6.dns_host;
- else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
- tgt->eaddr = inany_loopback4;
- else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
- tgt->eaddr = inany_loopback6;
- else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
- tgt->eaddr = inany_from_v4(c->ip4.addr);
- else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
- tgt->eaddr.a6 = c->ip6.addr;
else
- tgt->eaddr = ini->oaddr;
+ nat_outbound(c, &ini->oaddr, &tgt->eaddr);
tgt->eport = ini->oport;
@@ -397,12 +660,14 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
return PIF_NONE;
}
- if (inany_v4(&ini->eaddr))
+ if (!inany_is_unspecified(&ini->oaddr))
+ tgt->eaddr = ini->oaddr;
+ else if (inany_v4(&ini->oaddr))
tgt->eaddr = inany_loopback4;
else
tgt->eaddr = inany_loopback6;
- /* Preserve the specific loopback adddress used, but let the kernel pick
+ /* Preserve the specific loopback address used, but let the kernel pick
* a source port on the target side
*/
tgt->oaddr = ini->eaddr;
@@ -424,6 +689,42 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
}
/**
+ * nat_inbound() - Apply address translation for inbound (HOST to TAP)
+ * @c: Execution context
+ * @addr: Input address (as seen on HOST interface)
+ * @translated: Output address (as seen on TAP interface)
+ *
+ * Return: true on success, false if it couldn't translate the address
+ *
+ * Only handles translations that depend *only* on the address. Anything
+ * related to specific ports or flows is handled elsewhere.
+ */
+bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated)
+{
+ if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
+ inany_equals4(addr, &in4addr_loopback)) {
+ /* Specifically 127.0.0.1, not 127.0.0.0/8 */
+ *translated = inany_from_v4(c->ip4.map_host_loopback);
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
+ inany_equals6(addr, &in6addr_loopback)) {
+ translated->a6 = c->ip6.map_host_loopback;
+ } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
+ inany_equals4(addr, &c->ip4.addr)) {
+ *translated = inany_from_v4(c->ip4.map_guest_addr);
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
+ inany_equals6(addr, &c->ip6.addr)) {
+ translated->a6 = c->ip6.map_guest_addr;
+ } else if (fwd_guest_accessible(c, addr)) {
+ *translated = *addr;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/**
* fwd_nat_from_host() - Determine to forward a flow from the host interface
* @c: Execution context
* @proto: Protocol (IP L4 protocol number)
@@ -479,20 +780,7 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
return PIF_SPLICE;
}
- if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
- inany_equals4(&ini->eaddr, &in4addr_loopback)) {
- /* Specifically 127.0.0.1, not 127.0.0.0/8 */
- tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
- } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
- inany_equals6(&ini->eaddr, &in6addr_loopback)) {
- tgt->oaddr.a6 = c->ip6.map_host_loopback;
- } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
- inany_equals4(&ini->eaddr, &c->ip4.addr)) {
- tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
- } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
- inany_equals6(&ini->eaddr, &c->ip6.addr)) {
- tgt->oaddr.a6 = c->ip6.map_guest_addr;
- } else if (!fwd_guest_accessible(c, &ini->eaddr)) {
+ if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) {
if (inany_v4(&ini->eaddr)) {
if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
/* No source address we can use */
@@ -501,8 +789,6 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
} else {
tgt->oaddr.a6 = c->ip6.our_tap_ll;
}
- } else {
- tgt->oaddr = ini->eaddr;
}
tgt->oport = ini->eport;
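
A minimal usage sketch of the factored-out helper (illustrative only, not part of this patch): nat_inbound() maps a host-visible address to its guest-visible equivalent, and the caller falls back to one of our own addresses when it returns false, exactly as fwd_nat_from_host() does above. The example_* name is made up; nat_outbound() is the reverse helper used by fwd_nat_from_tap().

/* Illustrative sketch, assuming the usual passt headers */
#include "passt.h"
#include "inany.h"
#include "fwd.h"

static void example_reply_source(const struct ctx *c,
                                 const union inany_addr *host_addr,
                                 union inany_addr *guest_addr)
{
        if (nat_inbound(c, host_addr, guest_addr))
                return;

        /* Not mapped and not guest-accessible: use one of our addresses */
        if (inany_v4(host_addr))
                *guest_addr = inany_from_v4(c->ip4.our_tap_addr);
        else
                guest_addr->a6 = c->ip6.our_tap_ll;
}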
diff --git a/fwd.h b/fwd.h
index 3562f3c..7792582 100644
--- a/fwd.h
+++ b/fwd.h
@@ -7,6 +7,7 @@
#ifndef FWD_H
#define FWD_H
+union inany_addr;
struct flowside;
/* Number of ports for both TCP and UDP */
@@ -26,7 +27,7 @@ enum fwd_ports_mode {
#define PORT_BITMAP_SIZE DIV_ROUND_UP(NUM_PORTS, 8)
/**
- * fwd_ports - Describes port forwarding for one protocol and direction
+ * fwd_ports() - Describes port forwarding for one protocol and direction
* @mode: Overall forwarding mode (all, none, auto, specific ports)
* @scan4: /proc/net fd to scan for IPv4 ports when in AUTO mode
* @scan6: /proc/net fd to scan for IPv6 ports when in AUTO mode
@@ -41,17 +42,25 @@ struct fwd_ports {
in_port_t delta[NUM_PORTS];
};
-void fwd_scan_ports_tcp(struct fwd_ports *fwd, const struct fwd_ports *rev);
-void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
- const struct fwd_ports *tcp_fwd,
- const struct fwd_ports *tcp_rev);
+#define FWD_PORT_SCAN_INTERVAL 1000 /* ms */
+
void fwd_scan_ports_init(struct ctx *c);
+void fwd_scan_ports_timer(struct ctx *c, const struct timespec *now);
+bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated);
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
+void fwd_neigh_table_update(const struct ctx *c, const union inany_addr *addr,
+ const uint8_t *mac, bool permanent);
+void fwd_neigh_table_free(const struct ctx *c,
+ const union inany_addr *addr);
+void fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+ uint8_t *mac);
+void fwd_neigh_table_init(const struct ctx *c);
#endif /* FWD_H */
diff --git a/icmp.c b/icmp.c
index 7e2b342..9564c49 100644
--- a/icmp.c
+++ b/icmp.c
@@ -15,7 +15,6 @@
#include <errno.h>
#include <net/ethernet.h>
#include <net/if.h>
-#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <stdio.h>
@@ -23,10 +22,8 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
-#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>
-#include <unistd.h>
#include <time.h>
#include <linux/icmpv6.h>
@@ -41,9 +38,11 @@
#include "inany.h"
#include "icmp.h"
#include "flow_table.h"
+#include "epoll_ctl.h"
#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */
#define ICMP_NUM_IDS (1U << 16)
+#define MAX_IOV_ICMP 16 /* Arbitrary, should be enough */
/**
* ping_at_sidx() - Get ping specific flow at given sidx
@@ -124,17 +123,21 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
flow_dbg(pingf, "echo reply to tap, ID: %"PRIu16", seq: %"PRIu16,
ini->eport, seq);
+ /* Check if neighbour table has a recorded MAC address */
+ if (MAC_IS_UNDEF(pingf->f.tap_omac))
+ fwd_neigh_mac_get(c, &ini->oaddr, pingf->f.tap_omac);
+
if (pingf->f.type == FLOW_PING4) {
const struct in_addr *saddr = inany_v4(&ini->oaddr);
const struct in_addr *daddr = inany_v4(&ini->eaddr);
ASSERT(saddr && daddr); /* Must have IPv4 addresses */
- tap_icmp4_send(c, *saddr, *daddr, buf, n);
+ tap_icmp4_send(c, *saddr, *daddr, buf, pingf->f.tap_omac, n);
} else if (pingf->f.type == FLOW_PING6) {
const struct in6_addr *saddr = &ini->oaddr.a6;
const struct in6_addr *daddr = &ini->eaddr.a6;
- tap_icmp6_send(c, saddr, daddr, buf, n);
+ tap_icmp6_send(c, saddr, daddr, buf, pingf->f.tap_omac, n);
}
return;
@@ -150,7 +153,7 @@ unexpected:
static void icmp_ping_close(const struct ctx *c,
const struct icmp_ping_flow *pingf)
{
- epoll_del(c, pingf->sock);
+ epoll_del(flow_epollfd(&pingf->f), pingf->sock);
close(pingf->sock);
flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
}
@@ -163,7 +166,7 @@ static void icmp_ping_close(const struct ctx *c,
* @saddr: Source address
* @daddr: Destination address
*
- * Return: Newly opened ping flow, or NULL on failure
+ * Return: newly opened ping flow, or NULL on failure
*/
static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
sa_family_t af, uint16_t id,
@@ -171,10 +174,10 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
{
uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6;
uint8_t flowtype = af == AF_INET ? FLOW_PING4 : FLOW_PING6;
- union epoll_ref ref = { .type = EPOLL_TYPE_PING };
union flow *flow = flow_alloc();
struct icmp_ping_flow *pingf;
const struct flowside *tgt;
+ union epoll_ref ref;
if (!flow)
return NULL;
@@ -195,9 +198,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
pingf->seq = -1;
- ref.flowside = FLOW_SIDX(flow, TGTSIDE);
- pingf->sock = flowside_sock_l4(c, EPOLL_TYPE_PING, PIF_HOST,
- tgt, ref.data);
+ pingf->sock = flowside_sock_l4(c, EPOLL_TYPE_PING, PIF_HOST, tgt);
if (pingf->sock < 0) {
warn("Cannot open \"ping\" socket. You might need to:");
@@ -209,6 +210,17 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
if (pingf->sock > FD_REF_MAX)
goto cancel;
+ flow_epollid_set(&pingf->f, EPOLLFD_ID_DEFAULT);
+
+ ref.type = EPOLL_TYPE_PING;
+ ref.flowside = FLOW_SIDX(flow, TGTSIDE);
+ ref.fd = pingf->sock;
+
+ if (epoll_add(flow_epollfd(&pingf->f), EPOLLIN, ref) < 0) {
+ close(pingf->sock);
+ goto cancel;
+ }
+
flow_dbg(pingf, "new socket %i for echo ID %"PRIu16, pingf->sock, id);
flow_hash_insert(c, FLOW_SIDX(pingf, INISIDE));
@@ -229,37 +241,36 @@ cancel:
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
- * @p: Packet pool, single packet with ICMP/ICMPv6 header
+ * @data: Single packet with ICMP/ICMPv6 header
* @now: Current timestamp
*
* Return: count of consumed packets (always 1, even if malformed)
*/
int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
- const struct pool *p, const struct timespec *now)
+ struct iov_tail *data, const struct timespec *now)
{
+ struct iovec iov[MAX_IOV_ICMP];
struct icmp_ping_flow *pingf;
const struct flowside *tgt;
union sockaddr_inany sa;
- size_t dlen, l4len;
+ struct msghdr msh;
uint16_t id, seq;
union flow *flow;
uint8_t proto;
- socklen_t sl;
- void *pkt;
+ int cnt;
(void)saddr;
ASSERT(pif == PIF_TAP);
if (af == AF_INET) {
+ struct icmphdr ih_storage;
const struct icmphdr *ih;
- if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen)))
+ ih = IOV_PEEK_HEADER(data, ih_storage);
+ if (!ih)
return 1;
- ih = (struct icmphdr *)pkt;
- l4len = dlen + sizeof(*ih);
-
if (ih->type != ICMP_ECHO)
return 1;
@@ -267,14 +278,13 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
id = ntohs(ih->un.echo.id);
seq = ntohs(ih->un.echo.sequence);
} else if (af == AF_INET6) {
+ struct icmp6hdr ih_storage;
const struct icmp6hdr *ih;
- if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen)))
+ ih = IOV_PEEK_HEADER(data, ih_storage);
+ if (!ih)
return 1;
- ih = (struct icmp6hdr *)pkt;
- l4len = dlen + sizeof(*ih);
-
if (ih->icmp6_type != ICMPV6_ECHO_REQUEST)
return 1;
@@ -285,6 +295,10 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
ASSERT(0);
}
+ cnt = iov_tail_clone(&iov[0], MAX_IOV_ICMP, data);
+ if (cnt < 0)
+ return 1;
+
flow = flow_at_sidx(flow_lookup_af(c, proto, PIF_TAP,
af, saddr, daddr, id, id));
@@ -298,8 +312,16 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
ASSERT(flow_proto[pingf->f.type] == proto);
pingf->ts = now->tv_sec;
- pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0);
- if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
+ pif_sockaddr(c, &sa, PIF_HOST, &tgt->eaddr, 0);
+ msh.msg_name = &sa;
+ msh.msg_namelen = socklen_inany(&sa);
+ msh.msg_iov = iov;
+ msh.msg_iovlen = cnt;
+ msh.msg_control = NULL;
+ msh.msg_controllen = 0;
+ msh.msg_flags = 0;
+
+ if (sendmsg(pingf->sock, &msh, MSG_NOSIGNAL) < 0) {
flow_dbg_perror(pingf, "failed to relay request to socket");
} else {
flow_dbg(pingf,
diff --git a/icmp.h b/icmp.h
index 5ce22b5..1a0e620 100644
--- a/icmp.h
+++ b/icmp.h
@@ -6,15 +6,13 @@
#ifndef ICMP_H
#define ICMP_H
-#define ICMP_TIMER_INTERVAL 10000 /* ms */
-
struct ctx;
struct icmp_ping_flow;
void icmp_sock_handler(const struct ctx *c, union epoll_ref ref);
int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
- const struct pool *p, const struct timespec *now);
+ struct iov_tail *data, const struct timespec *now);
void icmp_init(void);
/**
diff --git a/inany.c b/inany.c
index f5483bf..7680439 100644
--- a/inany.c
+++ b/inany.c
@@ -16,6 +16,7 @@
#include "ip.h"
#include "siphash.h"
#include "inany.h"
+#include "fwd.h"
const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT);
const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
@@ -25,7 +26,7 @@ const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
* @dst: output buffer, minimum INANY_ADDRSTRLEN bytes
* @size: size of buffer at @dst
*
- * Return: On success, a non-null pointer to @dst, NULL on failure
+ * Return: on success, a non-null pointer to @dst, NULL on failure
*/
const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
{
@@ -41,7 +42,7 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size)
* @src: IPv[46] address
* @dst: output buffer, filled with parsed address
*
- * Return: On success, 1, if no parseable address is found, 0
+ * Return: on success, 1, if no parseable address is found, 0
*/
int inany_pton(const char *src, union inany_addr *dst)
{
diff --git a/inany.h b/inany.h
index 6a12c29..61b36fb 100644
--- a/inany.h
+++ b/inany.h
@@ -67,6 +67,23 @@ union sockaddr_inany {
struct sockaddr_in6 sa6;
};
+/** socklen_inany() - Get relevant address length for sockaddr_inany address
+ * @sa: sockaddr_inany socket address
+ *
+ * Return: socket address length for bind() or connect(), from IP family in @sa
+ */
+static inline socklen_t socklen_inany(const union sockaddr_inany *sa)
+{
+ switch (sa->sa_family) {
+ case AF_INET:
+ return sizeof(sa->sa4);
+ case AF_INET6:
+ return sizeof(sa->sa6);
+ default:
+ ASSERT(0);
+ }
+}
+
/** inany_v4 - Extract IPv4 address, if present, from IPv[46] address
* @addr: IPv4 or IPv6 address
*
@@ -237,23 +254,30 @@ static inline void inany_from_af(union inany_addr *aa,
}
/** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr
- * @aa: Pointer to store IPv[46] address
+ * @dst: Pointer to store IPv[46] address (output)
* @port: Pointer to store port number, host order
- * @addr: AF_INET or AF_INET6 socket address
+ * @addr: Socket address
+ *
+ * Return: 0 on success, -1 on error (bad address family)
*/
-static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
- const union sockaddr_inany *sa)
+static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port,
+ const void *addr)
{
+ const union sockaddr_inany *sa = (const union sockaddr_inany *)addr;
+
if (sa->sa_family == AF_INET6) {
- inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr);
+ inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr);
*port = ntohs(sa->sa6.sin6_port);
- } else if (sa->sa_family == AF_INET) {
- inany_from_af(aa, AF_INET, &sa->sa4.sin_addr);
+ return 0;
+ }
+
+ if (sa->sa_family == AF_INET) {
+ inany_from_af(dst, AF_INET, &sa->sa4.sin_addr);
*port = ntohs(sa->sa4.sin_port);
- } else {
- /* Not valid to call with other address families */
- ASSERT(0);
+ return 0;
}
+
+ return -1;
}
/** inany_siphash_feed- Fold IPv[46] address into an in-progress siphash
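
A short sketch of the new inany helpers (illustrative only, not part of this patch): socklen_inany() picks the right sockaddr length from the address family, and inany_from_sockaddr() now reports unexpected families instead of asserting. The example_* names are made up.

/* Illustrative sketch, assuming glibc and the inany.h above */
#include <sys/socket.h>
#include <netinet/in.h>

#include "inany.h"

static int example_connect(int s, const union sockaddr_inany *sa)
{
        /* sizeof(sa->sa4) or sizeof(sa->sa6), depending on sa_family */
        return connect(s, &sa->sa, socklen_inany(sa));
}

static int example_peer(int s, union inany_addr *addr, in_port_t *port)
{
        union sockaddr_inany sa;
        socklen_t sl = sizeof(sa);

        if (getpeername(s, &sa.sa, &sl) < 0)
                return -1;

        return inany_from_sockaddr(addr, port, &sa); /* -1 if not IP */
}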
diff --git a/iov.c b/iov.c
index 3b12272..ad726da 100644
--- a/iov.c
+++ b/iov.c
@@ -20,23 +20,21 @@
* Contributions after 2012-01-13 are licensed under the terms of the
* GNU GPL, version 2 or (at your option) any later version.
*/
+
#include <sys/socket.h>
#include "util.h"
#include "iov.h"
-
-/* iov_skip_bytes() - Skip leading bytes of an IO vector
- * @iov: IO vector
+/**
+ * iov_skip_bytes() - Find index and offset in iovec array given byte offset
+ * @iov: iovec array
* @n: Number of entries in @iov
- * @skip: Number of leading bytes of @iov to skip
- * @offset: Offset of first unskipped byte in its @iov entry
- *
- * Return: index I of individual struct iovec which contains the byte at @skip
- * bytes into the vector (as though all its buffers were contiguous).
- * If @offset is non-NULL, update it to the offset of that byte within
- * @iov[I] (guaranteed to be less than @iov[I].iov_len) If the whole
- * vector has <= @skip bytes, return @n.
+ * @skip: Number of leading bytes of @iov to skip
+ * @offset: Offset within the matching @iov entry, set on return, can be NULL
+ *
+ * Return: index of the iovec entry holding the byte at offset @skip, counted
+ * as if buffers were contiguous; @n if the whole iovec has @skip bytes or fewer
*/
size_t iov_skip_bytes(const struct iovec *iov, size_t n,
size_t skip, size_t *offset)
@@ -56,17 +54,14 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
}
/**
- * iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec)
- * efficiently.
+ * iov_from_buf() - Copy from flat buffer to iovec array
+ * @iov: Destination iovec array
+ * @iov_cnt: Number of elements in the iovec array
+ * @offset: Destination offset in @iov counted as if buffers were contiguous
+ * @buf: Source buffer
+ * @bytes: Bytes to copy
*
- * @iov: Pointer to the array of struct iovec describing the
- * scatter/gather I/O vector.
- * @iov_cnt: Number of elements in the iov array.
- * @offset: Byte offset in the iov array where copying should start.
- * @buf: Pointer to the source buffer containing the data to copy.
- * @bytes: Total number of bytes to copy from buf to iov.
- *
- * Returns: The number of bytes successfully copied.
+ * Return: number of bytes copied
*/
size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
size_t offset, const void *buf, size_t bytes)
@@ -77,12 +72,12 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
if (__builtin_constant_p(bytes) && iov_cnt &&
offset <= iov[0].iov_len && bytes <= iov[0].iov_len - offset) {
memcpy((char *)iov[0].iov_base + offset, buf, bytes);
+
return bytes;
}
i = iov_skip_bytes(iov, iov_cnt, offset, &offset);
- /* copying data */
for (copied = 0; copied < bytes && i < iov_cnt; i++) {
size_t len = MIN(iov[i].iov_len - offset, bytes - copied);
@@ -96,19 +91,15 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
}
/**
- * iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to
- * a buffer efficiently.
- *
- * @iov: Pointer to the array of struct iovec describing the scatter/gather
- * I/O vector.
- * @iov_cnt: Number of elements in the iov array.
- * @offset: Offset within the first element of iov from where copying should start.
- * @buf: Pointer to the destination buffer where data will be copied.
- * @bytes: Total number of bytes to copy from iov to buf.
+ * iov_to_buf() - Copy from iovec to flat buffer
+ * @iov: Source iovec array
+ * @iov_cnt: Number of elements in iovec array
+ * @offset: Source offset in @iov counted as if buffers were contiguous
+ * @buf: Destination buffer
+ * @bytes: Bytes to copy
*
- * Returns: The number of bytes successfully copied.
+ * Return: number of bytes copied
*/
-/* cppcheck-suppress unusedFunction */
size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
size_t offset, void *buf, size_t bytes)
{
@@ -118,14 +109,17 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
if (__builtin_constant_p(bytes) && iov_cnt &&
offset <= iov[0].iov_len && bytes <= iov[0].iov_len - offset) {
memcpy(buf, (char *)iov[0].iov_base + offset, bytes);
+
return bytes;
}
i = iov_skip_bytes(iov, iov_cnt, offset, &offset);
- /* copying data */
for (copied = 0; copied < bytes && i < iov_cnt; i++) {
size_t len = MIN(iov[i].iov_len - offset, bytes - copied);
+
+ ASSERT(iov[i].iov_base);
+
memcpy((char *)buf + copied, (char *)iov[i].iov_base + offset,
len);
copied += len;
@@ -136,14 +130,11 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt,
}
/**
- * iov_size - Calculate the total size of a scatter/gather I/O vector
- * (struct iovec).
+ * iov_size() - Calculate total data size of iovec
+ * @iov: Source iovec array
+ * @iov_cnt: Number of elements in iovec array
*
- * @iov: Pointer to the array of struct iovec describing the
- * scatter/gather I/O vector.
- * @iov_cnt: Number of elements in the iov array.
- *
- * Returns: The total size in bytes.
+ * Return: total size in bytes
*/
size_t iov_size(const struct iovec *iov, size_t iov_cnt)
{
@@ -166,7 +157,7 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt)
* includes buffers that are actually needed. This will avoid stepping through
* unnecessary elements of the underlying IO vector on future operations.
*
- * Return: true if the tail still contains any bytes, otherwise false
+ * Return: true if the tail still contains any bytes, otherwise false
*/
bool iov_tail_prune(struct iov_tail *tail)
{
@@ -180,10 +171,10 @@ bool iov_tail_prune(struct iov_tail *tail)
}
/**
- * iov_tail_size - Calculate the total size of an IO vector tail
+ * iov_tail_size() - Calculate the total size of an IO vector tail
* @tail: IO vector tail
*
- * Returns: The total size in bytes.
+ * Return: the total size in bytes.
*/
size_t iov_tail_size(struct iov_tail *tail)
{
@@ -192,18 +183,32 @@ size_t iov_tail_size(struct iov_tail *tail)
}
/**
- * iov_peek_header_() - Get pointer to a header from an IOV tail
+ * iov_drop_header() - Discard a header from an IOV tail
+ * @tail: IO vector tail
+ * @len: Number of bytes to discard from the head of the tail
+ *
+ * Return: true if the tail still contains any bytes, otherwise false
+ */
+bool iov_drop_header(struct iov_tail *tail, size_t len)
+{
+ tail->off = tail->off + len;
+
+ return iov_tail_prune(tail);
+}
+
+/**
+ * iov_check_header() - Check if a header can be accessed
* @tail: IOV tail to get header from
* @len: Length of header to get, in bytes
* @align: Required alignment of header, in bytes
*
* @tail may be pruned, but will represent the same bytes as before.
*
- * Returns: Pointer to the first @len logical bytes of the tail, NULL if that
- * overruns the IO vector, is not contiguous or doesn't have the
- * requested alignment.
+ * Return: pointer to the first @len logical bytes of the tail, NULL if that
+ * overruns the IO vector, is not contiguous or doesn't have the
+ * requested alignment.
*/
-void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
+static void *iov_check_header(struct iov_tail *tail, size_t len, size_t align)
{
char *p;
@@ -224,25 +229,95 @@ void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align)
}
/**
+ * iov_peek_header_() - Get pointer to a header from an IOV tail
+ * @tail: IOV tail to get header from
+ * @v: Temporary memory to use if the memory in @tail
+ * is discontinuous
+ * @len: Length of header to get, in bytes
+ * @align: Required alignment of header, in bytes
+ *
+ * @tail may be pruned, but will represent the same bytes as before.
+ *
+ * Return: pointer to the first @len logical bytes of the tail, or to a
+ * copy in @v if they are not contiguous or don't have the
+ * requested alignment. NULL if @len bytes overrun the
+ * IO vector.
+ */
+/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
+void *iov_peek_header_(struct iov_tail *tail, void *v, size_t len, size_t align)
+{
+ char *p = iov_check_header(tail, len, align);
+ size_t l;
+
+ if (p)
+ return p;
+
+ l = iov_to_buf(tail->iov, tail->cnt, tail->off, v, len);
+ if (l != len)
+ return NULL;
+
+ return v;
+}
+
+/**
* iov_remove_header_() - Remove a header from an IOV tail
* @tail: IOV tail to remove header from (modified)
+ * @v: Temporary memory to use if the memory in @tail
+ * is discontinuous
* @len: Length of header to remove, in bytes
* @align: Required alignment of header, in bytes
*
* On success, @tail is updated so that it longer includes the bytes of the
* returned header.
*
- * Returns: Pointer to the first @len logical bytes of the tail, NULL if that
- * overruns the IO vector, is not contiguous or doesn't have the
- * requested alignment.
+ * Return: pointer to the first @len logical bytes of the tail, or to a
+ * copy in @v if they are not contiguous or don't have the
+ * requested alignment. NULL if @len bytes overrun the
+ * IO vector.
*/
-void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align)
+void *iov_remove_header_(struct iov_tail *tail, void *v, size_t len, size_t align)
{
- char *p = iov_peek_header_(tail, len, align);
+ char *p = iov_peek_header_(tail, v, len, align);
if (!p)
return NULL;
tail->off = tail->off + len;
+
return p;
}
+
+/**
+ * iov_tail_clone() - Clone an iov tail into a new iovec array
+ *
+ * @dst_iov: Pointer to the destination array of struct iovec describing
+ * the scatter/gather I/O vector to shallow copy to.
+ * @dst_iov_cnt: Maximum number of elements in the destination iov array.
+ * @tail: Pointer to the source iov_tail
+ *
+ * Return: the number of elements filled in the destination iov array, or a
+ * negative value if there is not enough room in the destination
+ * iov array
+ */
+ssize_t iov_tail_clone(struct iovec *dst_iov, size_t dst_iov_cnt,
+ struct iov_tail *tail)
+{
+ const struct iovec *iov = &tail->iov[0];
+ size_t iov_cnt = tail->cnt;
+ size_t offset = tail->off;
+ unsigned int i, j;
+
+ i = iov_skip_bytes(iov, iov_cnt, offset, &offset);
+
+ /* Fill destination entries referencing a subset of the source vector */
+ for (j = 0; i < iov_cnt && j < dst_iov_cnt; i++, j++) {
+ dst_iov[j].iov_base = (char *)iov[i].iov_base + offset;
+ dst_iov[j].iov_len = iov[i].iov_len - offset;
+ offset = 0;
+ }
+
+ if (j == dst_iov_cnt && i != iov_cnt)
+ return -1;
+
+ return j;
+}
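
A short sketch of iov_tail_clone() in use (illustrative only, not part of this patch), following the pattern icmp_tap_handler() adopts in this series: build a shallow iovec copy of the tail and hand it to sendmsg() without copying the payload. EXAMPLE_IOV_MAX and example_send_tail() are made-up names.

/* Illustrative sketch */
#include <sys/socket.h>

#include "iov.h"

#define EXAMPLE_IOV_MAX 16 /* Arbitrary bound for this example */

static ssize_t example_send_tail(int s, struct iov_tail *tail)
{
        struct iovec iov[EXAMPLE_IOV_MAX];
        struct msghdr mh = { 0 };
        ssize_t cnt;

        cnt = iov_tail_clone(iov, EXAMPLE_IOV_MAX, tail);
        if (cnt < 0)    /* Tail needs more than EXAMPLE_IOV_MAX entries */
                return -1;

        mh.msg_iov = iov;
        mh.msg_iovlen = cnt;

        return sendmsg(s, &mh, MSG_NOSIGNAL);
}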
diff --git a/iov.h b/iov.h
index 9855bf0..ba1fda5 100644
--- a/iov.h
+++ b/iov.h
@@ -70,38 +70,68 @@ struct iov_tail {
#define IOV_TAIL(iov_, cnt_, off_) \
(struct iov_tail){ .iov = (iov_), .cnt = (cnt_), .off = (off_) }
+/**
+ * IOV_TAIL_FROM_BUF() - Create a new IOV tail from a buffer
+ * @buf_: Buffer address to use in the iovec
+ * @len_: Buffer size
+ * @off_: Byte offset in the buffer where the tail begins
+ */
+#define IOV_TAIL_FROM_BUF(buf_, len_, off_) \
+ IOV_TAIL((&(const struct iovec){ .iov_base = (buf_), \
+ .iov_len = (len_) }), \
+ 1, \
+ (off_))
+
bool iov_tail_prune(struct iov_tail *tail);
size_t iov_tail_size(struct iov_tail *tail);
-void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align);
-void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align);
+bool iov_drop_header(struct iov_tail *tail, size_t len);
+void *iov_peek_header_(struct iov_tail *tail, void *v, size_t len, size_t align);
+void *iov_remove_header_(struct iov_tail *tail, void *v, size_t len, size_t align);
+ssize_t iov_tail_clone(struct iovec *dst_iov, size_t dst_iov_cnt,
+ struct iov_tail *tail);
/**
* IOV_PEEK_HEADER() - Get typed pointer to a header from an IOV tail
* @tail_: IOV tail to get header from
- * @type_: Data type of the header
+ * @var_: Temporary buffer of the type of the header to use if
+ * the memory in the iovec array is not contiguous.
*
* @tail_ may be pruned, but will represent the same bytes as before.
*
- * Returns: Pointer of type (@type_ *) located at the start of @tail_, NULL if
- * we can't get a contiguous and aligned pointer.
+ * Return: pointer of type (__typeof__(@var_) *) located at the start of
+ * @tail_, or to @var_ if the iovec memory is not contiguous, NULL
+ * if that overruns the iovec.
*/
-#define IOV_PEEK_HEADER(tail_, type_) \
- ((type_ *)(iov_peek_header_((tail_), \
- sizeof(type_), __alignof__(type_))))
+
+#define IOV_PEEK_HEADER(tail_, var_) \
+ ((__typeof__(var_) *)(iov_peek_header_((tail_), &(var_), \
+ sizeof(var_), \
+ __alignof__(var_))))
/**
* IOV_REMOVE_HEADER() - Remove and return typed header from an IOV tail
* @tail_: IOV tail to remove header from (modified)
- * @type_: Data type of the header to remove
+ * @var_: Temporary buffer of the type of the header to use if
+ * the memory in the iovec array is not contiguous.
*
* On success, @tail_ is updated so that it longer includes the bytes of the
* returned header.
*
- * Returns: Pointer of type (@type_ *) located at the old start of @tail_, NULL
- * if we can't get a contiguous and aligned pointer.
+ * Return: pointer of type (__typeof__(@var_) *) located at the old start of
+ * @tail_, or to @var_ if the iovec memory is not contiguous, NULL
+ * if that overruns the iovec.
+ */
+
+#define IOV_REMOVE_HEADER(tail_, var_) \
+ ((__typeof__(var_) *)(iov_remove_header_((tail_), &(var_), \
+ sizeof(var_), __alignof__(var_))))
+
+/** IOV_DROP_HEADER() - Remove a typed header from an IOV tail
+ * @tail_: IOV tail to remove header from (modified)
+ * @type_: Data type of the header to remove
+ *
+ * Return: true if the tail still contains any bytes, otherwise false
*/
-#define IOV_REMOVE_HEADER(tail_, type_) \
- ((type_ *)(iov_remove_header_((tail_), \
- sizeof(type_), __alignof__(type_))))
+#define IOV_DROP_HEADER(tail_, type_) iov_drop_header((tail_), sizeof(type_))
#endif /* IOVEC_H */
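
A short sketch of the new macro calling convention (illustrative only, not part of this patch): IOV_PEEK_HEADER() and IOV_REMOVE_HEADER() now take a caller-provided storage variable, used when the header spans discontiguous buffers, as the icmp.c and ndp.c hunks show. struct udphdr and the example_* name are just for illustration.

/* Illustrative sketch, assuming Linux struct udphdr field names */
#include <stdbool.h>
#include <arpa/inet.h>
#include <netinet/udp.h>

#include "iov.h"

static bool example_udp_dport(struct iov_tail *data, in_port_t *dport)
{
        struct udphdr uh_storage;
        const struct udphdr *uh;

        uh = IOV_REMOVE_HEADER(data, uh_storage);
        if (!uh)                /* Fewer than sizeof(*uh) bytes left */
                return false;

        *dport = ntohs(uh->dest); /* @data now starts at the UDP payload */
        return true;
}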
diff --git a/ip.c b/ip.c
index 2cc7f65..9a7f4c5 100644
--- a/ip.c
+++ b/ip.c
@@ -23,50 +23,47 @@
/**
* ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol
- * @p: Packet pool, packet number @idx has IPv6 header at @offset
- * @idx: Index of packet in pool
- * @offset: Pre-calculated IPv6 header offset
+ * @data: IPv6 packet
* @proto: Filled with L4 protocol number
* @dlen: Data length (payload excluding header extensions), set on return
*
- * Return: pointer to L4 header, NULL if not found
+ * Return: true if the L4 header is found and @data, @proto, @dlen are set,
+ * false on error. Outputs are indeterminate on failure.
*/
-char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
- size_t *dlen)
+bool ipv6_l4hdr(struct iov_tail *data, uint8_t *proto, size_t *dlen)
{
+ struct ipv6_opt_hdr o_storage;
const struct ipv6_opt_hdr *o;
+ struct ipv6hdr ip6h_storage;
const struct ipv6hdr *ip6h;
- char *base;
int hdrlen;
uint8_t nh;
- base = packet_get(p, idx, 0, 0, NULL);
- ip6h = packet_get(p, idx, offset, sizeof(*ip6h), dlen);
+ ip6h = IOV_REMOVE_HEADER(data, ip6h_storage);
if (!ip6h)
- return NULL;
-
- offset += sizeof(*ip6h);
+ return false;
nh = ip6h->nexthdr;
if (!IPV6_NH_OPT(nh))
goto found;
- while ((o = packet_get_try(p, idx, offset, sizeof(*o), dlen))) {
+ while ((o = IOV_PEEK_HEADER(data, o_storage))) {
nh = o->nexthdr;
hdrlen = (o->hdrlen + 1) * 8;
if (IPV6_NH_OPT(nh))
- offset += hdrlen;
+ iov_drop_header(data, hdrlen);
else
goto found;
}
- return NULL;
+ return false;
found:
- if (nh == 59)
- return NULL;
+ if (nh == IPPROTO_NONE)
+ return false;
+ *dlen = iov_tail_size(data);
*proto = nh;
- return base + offset;
+ return true;
}
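
A short sketch of the reworked ipv6_l4hdr() (illustrative only, not part of this patch): the caller passes an IO vector tail positioned at the IPv6 header and receives the L4 protocol number and payload length on success. example_is_udp6() is a made-up name.

/* Illustrative sketch */
#include <stdbool.h>
#include <stdint.h>
#include <netinet/in.h>

#include "iov.h"
#include "ip.h"

static bool example_is_udp6(struct iov_tail *data, size_t *dlen)
{
        uint8_t proto;

        if (!ipv6_l4hdr(data, &proto, dlen))
                return false;   /* Truncated packet or no next header */

        return proto == IPPROTO_UDP;
}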
diff --git a/ip.h b/ip.h
index 471c57e..5830b92 100644
--- a/ip.h
+++ b/ip.h
@@ -115,10 +115,9 @@ static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h)
ip6h->flow_lbl[2];
}
-char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto,
- size_t *dlen);
+bool ipv6_l4hdr(struct iov_tail *data, uint8_t *proto, size_t *dlen);
-/* IPv6 link-local all-nodes multicast adddress, ff02::1 */
+/* IPv6 link-local all-nodes multicast address, ff02::1 */
static const struct in6_addr in6addr_ll_all_nodes = {
.s6_addr = {
0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
diff --git a/isolation.c b/isolation.c
index c944fb3..b25f349 100644
--- a/isolation.c
+++ b/isolation.c
@@ -129,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep)
* additional layer of protection. Executing this requires
* CAP_SETPCAP, which we will have within our userns.
*
- * Note that dropping capabilites from the bounding set limits
+ * Note that dropping capabilities from the bounding set limits
* exec()ed processes, but does not remove them from the effective or
* permitted sets, so it doesn't reduce our own capabilities.
*/
@@ -174,8 +174,8 @@ static void clamp_caps(void)
* Should:
* - drop unneeded capabilities
* - close all open files except for standard streams and the one from --fd
- * Musn't:
- * - remove filesytem access (we need to access files during setup)
+ * Mustn't:
+ * - remove filesystem access (we need to access files during setup)
*/
void isolate_initial(int argc, char **argv)
{
@@ -188,17 +188,20 @@ void isolate_initial(int argc, char **argv)
* We have to keep CAP_SETUID and CAP_SETGID at this stage, so
* that we can switch user away from root.
*
+ * CAP_DAC_OVERRIDE may be required for socket setup when combined
+ * with --runas.
+ *
* We have to keep some capabilities for the --netns-only case:
* - CAP_SYS_ADMIN, so that we can setns() to the netns.
* - Keep CAP_NET_ADMIN, so that we can configure interfaces
*
* It's debatable whether it's useful to drop caps when we
* retain SETUID and SYS_ADMIN, but we might as well. We drop
- * further capabilites in isolate_user() and
+ * further capabilities in isolate_user() and
* isolate_prefork().
*/
keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) |
- BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN);
+ BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN) | BIT(CAP_DAC_OVERRIDE);
/* Since Linux 5.12, if we want to update /proc/self/uid_map to create
* a mapping from UID 0, which only happens with pasta spawning a child
diff --git a/lineread.c b/lineread.c
index 0387f4a..4225de6 100644
--- a/lineread.c
+++ b/lineread.c
@@ -70,7 +70,7 @@ static ssize_t peek_line(struct lineread *lr, bool eof)
* @lr: Line reader state structure
* @line: Place a pointer to the next line in this variable
*
- * Return: Length of line read on success, 0 on EOF, negative on error
+ * Return: length of line read on success, 0 on EOF, negative on error
*/
ssize_t lineread_get(struct lineread *lr, char **line)
{
diff --git a/linux_dep.h b/linux_dep.h
index 240f50a..89e590c 100644
--- a/linux_dep.h
+++ b/linux_dep.h
@@ -135,8 +135,14 @@ struct tcp_info_linux {
#define CLOSE_RANGE_UNSHARE (1U << 1)
#endif
+#ifndef TCP_REPAIR_ON
+#define TCP_REPAIR_ON 1
+#define TCP_REPAIR_OFF 0
+#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */
+#endif
+
__attribute__ ((weak))
-/* cppcheck-suppress funcArgNamesDifferent */
+/* cppcheck-suppress [funcArgNamesDifferent,unmatchedSuppression] */
int close_range(unsigned int first, unsigned int last, int flags) {
return syscall(SYS_close_range, first, last, flags);
}
diff --git a/log.c b/log.c
index 6eda4c4..21e3673 100644
--- a/log.c
+++ b/log.c
@@ -35,7 +35,7 @@ static int log_sock = -1; /* Optional socket to system logger */
static char log_ident[BUFSIZ]; /* Identifier string for openlog() */
static int log_mask; /* Current log priority mask */
-static int log_file = -1; /* Optional log file descriptor */
+int log_file = -1; /* Optional log file descriptor */
static size_t log_size; /* Maximum log file size in bytes */
static size_t log_written; /* Currently used bytes in log file */
static size_t log_cut_size; /* Bytes to cut at start on rotation */
@@ -54,7 +54,8 @@ bool log_stderr = true; /* Not daemonised, no shell spawned */
* logtime() - Get the current time for logging purposes
* @ts: Buffer into which to store the timestamp
*
- * Return: pointer to @now, or NULL if there was an error retrieving the time
+ * Return: pointer to @ts on success, or NULL if there was
+ * an error retrieving the time
*/
static const struct timespec *logtime(struct timespec *ts)
{
@@ -281,6 +282,7 @@ static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap)
* @format: Message
* @ap: Variable argument list
*/
+/* cppcheck-suppress [staticFunction,unmatchedSuppression] */
void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap)
{
bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1;
@@ -401,7 +403,7 @@ void __setlogmask(int mask)
* logfile_init() - Open log file and write header with PID, version, path
* @name: Identifier for header: passt or pasta
* @path: Path to log file
- * @size: Maximum size of log file: log_cut_size is calculatd here
+ * @size: Maximum size of log file: log_cut_size is calculated here
*/
void logfile_init(const char *name, const char *path, size_t size)
{
diff --git a/log.h b/log.h
index 08aa88c..b7b2067 100644
--- a/log.h
+++ b/log.h
@@ -9,6 +9,11 @@
#include <stdbool.h>
#include <syslog.h>
+/* This would make more sense in util.h, but because we use it in die(), that
+ * would cause awkward circular reference problems.
+ */
+void passt_exit(int status) __attribute__((noreturn));
+
#define LOGFILE_SIZE_DEFAULT (1024 * 1024UL)
#define LOGFILE_CUT_RATIO 30 /* When full, cut ~30% size */
#define LOGFILE_SIZE_MIN (5UL * MAX(BUFSIZ, PAGE_SIZE))
@@ -32,15 +37,16 @@ void logmsg_perror(int pri, const char *format, ...)
#define die(...) \
do { \
err(__VA_ARGS__); \
- _exit(EXIT_FAILURE); \
+ passt_exit(EXIT_FAILURE); \
} while (0)
#define die_perror(...) \
do { \
err_perror(__VA_ARGS__); \
- _exit(EXIT_FAILURE); \
+ passt_exit(EXIT_FAILURE); \
} while (0)
+extern int log_file;
extern int log_trace;
extern bool log_conf_parsed;
extern bool log_stderr;
diff --git a/migrate.c b/migrate.c
index 0fca77b..48d63a0 100644
--- a/migrate.c
+++ b/migrate.c
@@ -96,8 +96,8 @@ static int seen_addrs_target_v1(struct ctx *c,
return 0;
}
-/* Stages for version 1 */
-static const struct migrate_stage stages_v1[] = {
+/* Stages for version 2 */
+static const struct migrate_stage stages_v2[] = {
{
.name = "observed addresses",
.source = seen_addrs_source_v1,
@@ -118,7 +118,11 @@ static const struct migrate_stage stages_v1[] = {
/* Supported encoding versions, from latest (most preferred) to oldest */
static const struct migrate_version versions[] = {
- { 1, stages_v1, },
+ { 2, stages_v2, },
+ /* v1 was released, but not widely used. It had bad endianness for the
+ * MSS and omitted timestamps, which meant it usually wouldn't work.
+ * Therefore we don't attempt to support compatibility with it.
+ */
{ 0 },
};
diff --git a/ndp.c b/ndp.c
index ded2081..eb9e313 100644
--- a/ndp.c
+++ b/ndp.c
@@ -184,7 +184,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
{
const struct in6_addr *src = &c->ip6.our_tap_ll;
- tap_icmp6_send(c, src, dst, buf, l4len);
+ tap_icmp6_send(c, src, dst, buf, c->our_tap_mac, l4len);
}
/**
@@ -196,6 +196,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
const struct in6_addr *addr)
{
+ union inany_addr tgt;
struct ndp_na na = {
.ih = {
.icmp6_type = NA,
@@ -213,12 +214,24 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
}
};
- memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
+ inany_from_af(&tgt, AF_INET6, addr);
+ fwd_neigh_mac_get(c, &tgt, na.target_l2_addr.mac);
ndp_send(c, dst, &na, sizeof(na));
}
/**
+ * ndp_unsolicited_na() - Send unsolicited NA
+ * @c: Execution context
+ * @addr: IPv6 address to advertise
+ */
+void ndp_unsolicited_na(const struct ctx *c, const struct in6_addr *addr)
+{
+ if (tap_is_ready(c))
+ ndp_na(c, &in6addr_ll_all_nodes, addr);
+}
+
+/**
* ndp_ra() - Send an NDP Router Advertisement (RA) message
* @c: Execution context
* @dst: IPv6 address to send the RA to
@@ -328,21 +341,28 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN);
+ /* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */
ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra);
}
/**
* ndp() - Check for NDP solicitations, reply as needed
* @c: Execution context
- * @ih: ICMPv6 header
* @saddr: Source IPv6 address
- * @p: Packet pool
+ * @data: Single packet with ICMPv6 header
*
* Return: 0 if not handled here, 1 if handled, -1 on failure
*/
-int ndp(const struct ctx *c, const struct icmp6hdr *ih,
- const struct in6_addr *saddr, const struct pool *p)
+int ndp(const struct ctx *c, const struct in6_addr *saddr,
+ struct iov_tail *data)
{
+ struct icmp6hdr ih_storage;
+ const struct icmp6hdr *ih;
+
+ ih = IOV_PEEK_HEADER(data, ih_storage);
+ if (!ih)
+ return -1;
+
if (ih->icmp6_type < RS || ih->icmp6_type > NA)
return 0;
@@ -350,9 +370,10 @@ int ndp(const struct ctx *c, const struct icmp6hdr *ih,
return 1;
if (ih->icmp6_type == NS) {
+ struct ndp_ns ns_storage;
const struct ndp_ns *ns;
- ns = packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL);
+ ns = IOV_REMOVE_HEADER(data, ns_storage);
if (!ns)
return -1;
@@ -391,7 +412,7 @@ void ndp_timer(const struct ctx *c, const struct timespec *now)
time_t max_rtr_adv_interval = DEFAULT_MAX_RTR_ADV_INTERVAL;
time_t min_rtr_adv_interval, interval;
- if (c->fd_tap < 0 || c->no_ra || now->tv_sec < next_ra)
+ if (!tap_is_ready(c) || c->no_ra || now->tv_sec < next_ra)
return;
/* We must advertise before the route's lifetime expires */
@@ -430,3 +451,23 @@ void ndp_timer(const struct ctx *c, const struct timespec *now)
first:
next_ra = now->tv_sec + interval;
}
+
+/**
+ * ndp_send_init_req() - Send initial NDP NS to retrieve guest MAC address
+ * @c: Execution context
+ */
+void ndp_send_init_req(const struct ctx *c)
+{
+ struct ndp_ns ns = {
+ .ih = {
+ .icmp6_type = NS,
+ .icmp6_code = 0,
+ .icmp6_router = 0, /* Reserved */
+ .icmp6_solicited = 0, /* Reserved */
+ .icmp6_override = 0, /* Reserved */
+ },
+ .target_addr = c->ip6.addr
+ };
+ debug("Sending initial NDP NS request for guest MAC address");
+ ndp_send(c, &c->ip6.addr, &ns, sizeof(ns));
+}
diff --git a/ndp.h b/ndp.h
index 41c2000..56b756d 100644
--- a/ndp.h
+++ b/ndp.h
@@ -8,8 +8,10 @@
struct icmp6hdr;
-int ndp(const struct ctx *c, const struct icmp6hdr *ih,
- const struct in6_addr *saddr, const struct pool *p);
+int ndp(const struct ctx *c, const struct in6_addr *saddr,
+ struct iov_tail *data);
void ndp_timer(const struct ctx *c, const struct timespec *now);
+void ndp_send_init_req(const struct ctx *c);
+void ndp_unsolicited_na(const struct ctx *c, const struct in6_addr *addr);
#endif /* NDP_H */
diff --git a/netlink.c b/netlink.c
index a052504..82a2f0c 100644
--- a/netlink.c
+++ b/netlink.c
@@ -26,6 +26,7 @@
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
+#include <net/if_arp.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
@@ -35,11 +36,16 @@
#include "log.h"
#include "ip.h"
#include "netlink.h"
+#include "epoll_ctl.h"
/* Same as RTA_NEXT() but for nexthops: RTNH_NEXT() doesn't take 'attrlen' */
#define RTNH_NEXT_AND_DEC(rtnh, attrlen) \
((attrlen) -= RTNH_ALIGN((rtnh)->rtnh_len), RTNH_NEXT(rtnh))
+/* Convenience macro borrowed from kernel */
+#define NUD_VALID \
+ (NUD_PERMANENT | NUD_NOARP | NUD_REACHABLE | NUD_PROBE | NUD_STALE)
+
/* Netlink expects a buffer of at least 8kiB or the system page size,
* whichever is larger. 32kiB is recommended for more efficient.
* Since the largest page size on any remotely common Linux setup is
@@ -50,9 +56,10 @@
#define NLBUFSIZ 65536
/* Socket in init, in target namespace, sequence (just needs to be monotonic) */
-int nl_sock = -1;
-int nl_sock_ns = -1;
-static int nl_seq = 1;
+int nl_sock = -1;
+int nl_sock_ns = -1;
+static int nl_sock_neigh = -1;
+static int nl_seq = 1;
/**
* nl_sock_init_do() - Set up netlink sockets in init or target namespace
@@ -199,7 +206,7 @@ static struct nlmsghdr *nl_next(int s, char *buf, struct nlmsghdr *nh, ssize_t *
}
/**
- * nl_foreach - 'for' type macro to step through netlink response messages
+ * nl_foreach() - 'for' type macro to step through netlink response messages
* nl_foreach_oftype - as above, but only messages of expected type
* @nh: Steps through each response header (struct nlmsghdr *)
* @status: When loop exits indicates if there was an error (ssize_t)
@@ -565,6 +572,11 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
if (nh->nlmsg_type != RTM_NEWROUTE)
continue;
+ /* nexthop state flags don't apply to freshly created routes,
+ * and the kernel will refuse our route if they are set.
+ */
+ rtm->rtm_flags &= ~RTNH_COMPARE_MASK;
+
for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na);
rta = RTA_NEXT(rta, na)) {
/* RTA_OIF and RTA_MULTIPATH attributes carry the
@@ -781,7 +793,7 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
prefix_max = *prefix_len = ifa->ifa_prefixlen;
} else if (af == AF_INET6 && addr &&
- ifa->ifa_scope == RT_SCOPE_UNIVERSE &&
+ ifa->ifa_scope < RT_SCOPE_LINK &&
ifa->ifa_prefixlen > prefix_max) {
memcpy(addr, RTA_DATA(rta), RTA_PAYLOAD(rta));
@@ -1024,7 +1036,6 @@ int nl_link_get_mac(int s, unsigned int ifi, void *mac)
/**
* nl_link_set_mac() - Set link MAC address
* @s: Netlink socket
- * @ns: Use netlink socket in namespace
* @ifi: Interface index
* @mac: MAC address to set
*
@@ -1099,3 +1110,200 @@ int nl_link_set_flags(int s, unsigned int ifi,
return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
}
+
+/**
+ * nl_neigh_msg_read() - Interpret a neighbour state message from netlink
+ * @c: Execution context
+ * @nh: Message to be read
+ */
+static void nl_neigh_msg_read(const struct ctx *c, struct nlmsghdr *nh)
+{
+ struct ndmsg *ndm = NLMSG_DATA(nh);
+ struct rtattr *rta = (struct rtattr *)(ndm + 1);
+ size_t na = NLMSG_PAYLOAD(nh, sizeof(*ndm));
+ char ip_str[INET6_ADDRSTRLEN];
+ char mac_str[ETH_ADDRSTRLEN];
+ const uint8_t *lladdr = NULL;
+ union inany_addr addr, daddr;
+ const void *dst = NULL;
+ size_t lladdr_len = 0;
+ size_t dstlen = 0;
+
+ if (nh->nlmsg_type == NLMSG_DONE)
+ return;
+
+ if (nh->nlmsg_type == NLMSG_ERROR) {
+ const struct nlmsgerr *errmsg = (struct nlmsgerr *)ndm;
+
+ warn("netlink error message on neighbour notifier: %s",
+ strerror_(-errmsg->error));
+ return;
+ }
+
+ if (nh->nlmsg_type != RTM_NEWNEIGH && nh->nlmsg_type != RTM_DELNEIGH)
+ return;
+
+ for (; RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) {
+ if (rta->rta_type == NDA_DST) {
+ dst = RTA_DATA(rta);
+ dstlen = RTA_PAYLOAD(rta);
+ } else if (rta->rta_type == NDA_LLADDR) {
+ lladdr = RTA_DATA(rta);
+ lladdr_len = RTA_PAYLOAD(rta);
+ }
+ }
+
+ if (!dst)
+ return;
+
+ if (ndm->ndm_family == AF_INET && ndm->ndm_ifindex != c->ifi4)
+ return;
+
+ if (ndm->ndm_family == AF_INET6 && ndm->ndm_ifindex != c->ifi6)
+ return;
+
+ if (ndm->ndm_family != AF_INET && ndm->ndm_family != AF_INET6)
+ return;
+
+ if (ndm->ndm_family == AF_INET && dstlen != sizeof(struct in_addr)) {
+ warn("netlink: wrong address length in AF_INET notification");
+ return;
+ }
+ if (ndm->ndm_family == AF_INET6 && dstlen != sizeof(struct in6_addr)) {
+ warn("netlink: wrong address length in AF_INET6 notification");
+ return;
+ }
+
+ /* We only handle guest-side visible addresses */
+ inany_from_af(&addr, ndm->ndm_family, dst);
+ if (!nat_inbound(c, &addr, &daddr))
+ return;
+
+ inany_ntop(&daddr, ip_str, sizeof(ip_str));
+
+ if (nh->nlmsg_type == RTM_DELNEIGH) {
+ trace("neighbour notifier delete: %s", ip_str);
+ fwd_neigh_table_free(c, &daddr);
+ return;
+ }
+ if (!(ndm->ndm_state & NUD_VALID)) {
+ trace("neighbour notifier: %s unreachable, state: 0x%04x",
+ ip_str, ndm->ndm_state);
+ fwd_neigh_table_free(c, &daddr);
+ return;
+ }
+ if (!lladdr) {
+ warn("RTM_NEWNEIGH %s: missing link layer address", ip_str);
+ return;
+ }
+ if (lladdr_len != ETH_ALEN || ndm->ndm_type != ARPHRD_ETHER)
+ return;
+
+ eth_ntop(lladdr, mac_str, sizeof(mac_str));
+ trace("neighbour notifier update: %s / %s", ip_str, mac_str);
+ fwd_neigh_table_update(c, &daddr, lladdr, false);
+}
+
+/**
+ * nl_neigh_sync() - Read current contents of ARP/NDP tables
+ * @c: Execution context
+ * @proto: Protocol, AF_INET or AF_INET6
+ * @ifi: Interface index
+ */
+static void nl_neigh_sync(const struct ctx *c, int proto, int ifi)
+{
+ struct {
+ struct nlmsghdr nlh;
+ struct ndmsg ndm;
+ } req = {
+ .ndm.ndm_family = proto,
+ .ndm.ndm_ifindex = ifi,
+ };
+ struct nlmsghdr *nh;
+ char buf[NLBUFSIZ];
+ ssize_t status;
+ uint32_t seq;
+
+ seq = nl_send(nl_sock_neigh, &req, RTM_GETNEIGH,
+ NLM_F_DUMP, sizeof(req));
+ nl_foreach_oftype(nh, status, nl_sock_neigh, buf, seq, RTM_NEWNEIGH)
+ nl_neigh_msg_read(c, nh);
+ if (status < 0)
+ warn("netlink: RTM_GETNEIGH failed: %s", strerror_(-status));
+}
+
+/**
+ * nl_neigh_notify_handler() - Non-blocking drain of pending neighbour updates
+ * @c: Execution context
+ */
+void nl_neigh_notify_handler(const struct ctx *c)
+{
+ char buf[NLBUFSIZ];
+
+ for (;;) {
+ ssize_t n = recv(nl_sock_neigh, buf, sizeof(buf), MSG_DONTWAIT);
+ struct nlmsghdr *nh = (struct nlmsghdr *)buf;
+
+ if (n < 0) {
+ if (errno == EINTR)
+ continue;
+ if (errno != EAGAIN)
+ warn_perror("netlink notifier read error");
+ return;
+ }
+ for (; NLMSG_OK(nh, n); nh = NLMSG_NEXT(nh, n))
+ nl_neigh_msg_read(c, nh);
+ }
+}
+
+/**
+ * nl_neigh_notify_init() - Subscribe to neighbour events
+ * @c: Execution context
+ *
+ * Return: 0 on success, -1 on failure
+ */
+int nl_neigh_notify_init(const struct ctx *c)
+{
+ union epoll_ref ref = {
+ .type = EPOLL_TYPE_NL_NEIGH
+ };
+ struct epoll_event ev = {
+ .events = EPOLLIN
+ };
+ struct sockaddr_nl addr = {
+ .nl_family = AF_NETLINK,
+ .nl_groups = RTMGRP_NEIGH,
+ };
+
+ if (nl_sock_neigh >= 0) {
+ warn("netlink: neighbour notifier socket already exists");
+ return 0;
+ }
+
+ nl_sock_neigh = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
+ NETLINK_ROUTE);
+ if (nl_sock_neigh < 0) {
+ warn_perror("Failed to create neighbour notifier socket");
+ return -1;
+ }
+
+ if (bind(nl_sock_neigh, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+ warn_perror("Failed to bind neighbour notifier socket");
+ close(nl_sock_neigh);
+ nl_sock_neigh = -1;
+ return -1;
+ }
+
+ ev.data.u64 = ref.u64;
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, nl_sock_neigh, &ev) == -1) {
+ warn_perror("epoll_ctl() on neighbour notifier socket failed");
+ close(nl_sock_neigh);
+ nl_sock_neigh = -1;
+ return -1;
+ }
+
+ nl_neigh_sync(c, AF_INET, c->ifi4);
+ nl_neigh_sync(c, AF_INET6, c->ifi6);
+
+ return 0;
+}
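
A short sketch of how the new notifier is expected to be wired into the event loop (illustrative only, not part of this patch; the actual dispatch lives in passt.c, outside this hunk): epoll events of type EPOLL_TYPE_NL_NEIGH simply drain the socket via nl_neigh_notify_handler().

/* Illustrative sketch, assuming the epoll_ref plumbing added elsewhere */
static void example_handle_event(const struct ctx *c, union epoll_ref ref)
{
        switch (ref.type) {
        case EPOLL_TYPE_NL_NEIGH:
                /* Non-blocking drain of RTM_NEWNEIGH/RTM_DELNEIGH updates */
                nl_neigh_notify_handler(c);
                break;
        default:
                break;
        }
}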
diff --git a/netlink.h b/netlink.h
index b51e99c..8f1e9b9 100644
--- a/netlink.h
+++ b/netlink.h
@@ -17,6 +17,8 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
int s_dst, unsigned int ifi_dst, sa_family_t af);
int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
void *addr, int *prefix_len, void *addr_l);
+bool nl_neigh_mac_get(int s, const union inany_addr *addr, int ifi,
+ unsigned char *mac);
int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
const void *addr, int prefix_len);
int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
@@ -28,5 +30,7 @@ int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
int nl_link_set_mtu(int s, unsigned int ifi, int mtu);
int nl_link_set_flags(int s, unsigned int ifi,
unsigned int set, unsigned int change);
+int nl_neigh_notify_init(const struct ctx *c);
+void nl_neigh_notify_handler(const struct ctx *c);
#endif /* NETLINK_H */
diff --git a/packet.c b/packet.c
index 0330b54..890561b 100644
--- a/packet.c
+++ b/packet.c
@@ -23,6 +23,20 @@
#include "log.h"
/**
+ * get_vdev_memory() - Return a pointer to the memory regions of the pool
+ * @p: Packet pool
+ *
+ * Return: NULL for a plain buffer pool, otherwise a pointer to the pool's
+ * vdev_memory structure
+ */
+static struct vdev_memory *get_vdev_memory(const struct pool *p)
+{
+ if (p->buf_size)
+ return NULL;
+
+ return (struct vdev_memory *)p->buf;
+}
+
+/**
* packet_check_range() - Check if a memory range is valid for a pool
* @p: Packet pool
* @ptr: Start of desired data range
@@ -35,26 +49,41 @@
static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
const char *func, int line)
{
- if (p->buf_size == 0) {
+ struct vdev_memory *memory;
+
+ if (len > PACKET_MAX_LEN) {
+ debug("packet range length %zu (max %zu), %s:%i",
+ len, PACKET_MAX_LEN, func, line);
+ return -1;
+ }
+
+ memory = get_vdev_memory(p);
+ if (memory) {
int ret;
- ret = vu_packet_check_range((void *)p->buf, ptr, len);
+ ret = vu_packet_check_range(memory, ptr, len);
if (ret == -1)
- trace("cannot find region, %s:%i", func, line);
+ debug("cannot find region, %s:%i", func, line);
return ret;
}
if (ptr < p->buf) {
- trace("packet range start %p before buffer start %p, %s:%i",
+ debug("packet range start %p before buffer start %p, %s:%i",
(void *)ptr, (void *)p->buf, func, line);
return -1;
}
- if (ptr + len > p->buf + p->buf_size) {
- trace("packet range end %p after buffer end %p, %s:%i",
- (void *)(ptr + len), (void *)(p->buf + p->buf_size),
+ if (len > p->buf_size) {
+ debug("packet range length %zu larger than buffer %zu, %s:%i",
+ len, p->buf_size, func, line);
+ return -1;
+ }
+
+ if ((size_t)(ptr - p->buf) > p->buf_size - len) {
+ debug("packet range %p, len %zu after buffer end %p, %s:%i",
+ (void *)ptr, len, (void *)(p->buf + p->buf_size),
func, line);
return -1;
}
@@ -62,89 +91,110 @@ static int packet_check_range(const struct pool *p, const char *ptr, size_t len,
return 0;
}
/**
+ * pool_can_fit() - Can a new packet fit in the pool?
+ * @p: Pointer to packet pool
+ * @data: check data can fit in the pool
+ *
+ * Return: true if @data can be added, false otherwise
+ */
+bool pool_can_fit(const struct pool *p, struct iov_tail *data)
+{
+ iov_tail_prune(data);
+
+ return p->count + data->cnt + (data->cnt > 1) <= p->size;
+}
+
+/**
* packet_add_do() - Add data as packet descriptor to given pool
* @p: Existing pool
- * @len: Length of new descriptor
- * @start: Start of data
- * @func: For tracing: name of calling function, NULL means no trace()
+ * @data: Data to add
+ * @func: For tracing: name of calling function
* @line: For tracing: caller line of function call
*/
-void packet_add_do(struct pool *p, size_t len, const char *start,
+void packet_add_do(struct pool *p, struct iov_tail *data,
const char *func, int line)
{
- size_t idx = p->count;
+ size_t idx = p->count, i, offset;
- if (idx >= p->size) {
- trace("add packet index %zu to pool with size %zu, %s:%i",
+ if (!pool_can_fit(p, data)) {
+ debug("add packet index %zu to pool with size %zu, %s:%i",
idx, p->size, func, line);
return;
}
- if (packet_check_range(p, start, len, func, line))
+ if (!iov_tail_prune(data))
return;
- if (len > UINT16_MAX) {
- trace("add packet length %zu, %s:%i", len, func, line);
- return;
+ if (data->cnt > 1) {
+ p->pkt[idx].iov_base = NULL;
+ p->pkt[idx].iov_len = data->cnt;
+ idx++;
}
- p->pkt[idx].iov_base = (void *)start;
- p->pkt[idx].iov_len = len;
+ offset = data->off;
+ for (i = 0; i < data->cnt; i++) {
+ const char *start;
+ size_t len;
+
+ len = data->iov[i].iov_len - offset;
+ start = (char *)data->iov[i].iov_base + offset;
+ offset = 0;
+
+ if (packet_check_range(p, start, len, func, line))
+ return;
+
+ p->pkt[idx].iov_base = (void *)start;
+ p->pkt[idx].iov_len = len;
+ idx++;
+ }
- p->count++;
+ p->count = idx;
}
/**
* packet_get_do() - Get data range from packet descriptor from given pool
* @p: Packet pool
* @idx: Index of packet descriptor in pool
- * @offset: Offset of data range in packet descriptor
- * @len: Length of desired data range
- * @left: Length of available data after range, set on return, can be NULL
+ * @data: IOV tail to store the address of the data (output)
* @func: For tracing: name of calling function, NULL means no trace()
* @line: For tracing: caller line of function call
*
- * Return: pointer to start of data range, NULL on invalid range or descriptor
+ * Return: false if the packet index is invalid, true otherwise.
+ * If the stored descriptors are corrupted, assert and don't return at all.
*/
-void *packet_get_do(const struct pool *p, size_t idx, size_t offset,
- size_t len, size_t *left, const char *func, int line)
+bool packet_get_do(const struct pool *p, size_t idx,
+ struct iov_tail *data,
+ const char *func, int line)
{
- char *ptr;
+ size_t i;
- if (idx >= p->size || idx >= p->count) {
- if (func) {
- trace("packet %zu from pool size: %zu, count: %zu, "
- "%s:%i", idx, p->size, p->count, func, line);
- }
- return NULL;
- }
+ ASSERT_WITH_MSG(p->count <= p->size,
+ "Corrupted pool count: %zu, size: %zu, %s:%i",
+ p->count, p->size, func, line);
- if (len > UINT16_MAX) {
- if (func) {
- trace("packet data length %zu, %s:%i",
- len, func, line);
- }
- return NULL;
+ if (idx >= p->count) {
+ debug("packet %zu from pool size: %zu, count: %zu, "
+ "%s:%i", idx, p->size, p->count, func, line);
+ return false;
}
- if (len + offset > p->pkt[idx].iov_len) {
- if (func) {
- trace("data length %zu, offset %zu from length %zu, "
- "%s:%i", len, offset, p->pkt[idx].iov_len,
- func, line);
- }
- return NULL;
+ if (p->pkt[idx].iov_base) {
+ data->cnt = 1;
+ data->iov = &p->pkt[idx];
+ } else {
+ data->cnt = p->pkt[idx].iov_len;
+ data->iov = &p->pkt[idx + 1];
}
+ data->off = 0;
- ptr = (char *)p->pkt[idx].iov_base + offset;
-
- if (packet_check_range(p, ptr, len, func, line))
- return NULL;
-
- if (left)
- *left = p->pkt[idx].iov_len - offset - len;
+ for (i = 0; i < data->cnt; i++) {
+ ASSERT_WITH_MSG(!packet_check_range(p, data->iov[i].iov_base,
+ data->iov[i].iov_len,
+ func, line),
+ "Corrupt packet pool, %s:%i", func, line);
+ }
- return ptr;
+ return true;
}
/**
diff --git a/packet.h b/packet.h
index bdc07fe..ba8d5c2 100644
--- a/packet.h
+++ b/packet.h
@@ -6,10 +6,17 @@
#ifndef PACKET_H
#define PACKET_H
+#include <stdbool.h>
+#include "iov.h"
+#include "virtio.h"
+
+/* Maximum size of a single packet stored in pool, including headers */
+#define PACKET_MAX_LEN ((size_t)UINT16_MAX)
+
/**
* struct pool - Generic pool of packets stored in a buffer
* @buf: Buffer storing packet descriptors,
- * a struct vu_dev_region array for passt vhost-user mode
+ * a struct vdev_region for passt vhost-user mode
* @buf_size: Total size of buffer,
* 0 for passt vhost-user mode
* @size: Number of usable descriptors for the pool
@@ -24,24 +31,21 @@ struct pool {
struct iovec pkt[];
};
-int vu_packet_check_range(void *buf, const char *ptr, size_t len);
-void packet_add_do(struct pool *p, size_t len, const char *start,
+int vu_packet_check_range(struct vdev_memory *memory,
+ const char *ptr, size_t len);
+void packet_add_do(struct pool *p, struct iov_tail *data,
const char *func, int line);
-void *packet_get_do(const struct pool *p, const size_t idx,
- size_t offset, size_t len, size_t *left,
- const char *func, int line);
+bool packet_get_do(const struct pool *p, const size_t idx,
+ struct iov_tail *data, const char *func, int line);
+bool pool_can_fit(const struct pool *p, struct iov_tail *data);
void pool_flush(struct pool *p);
-#define packet_add(p, len, start) \
- packet_add_do(p, len, start, __func__, __LINE__)
-
-#define packet_get(p, idx, offset, len, left) \
- packet_get_do(p, idx, offset, len, left, __func__, __LINE__)
-
-#define packet_get_try(p, idx, offset, len, left) \
- packet_get_do(p, idx, offset, len, left, NULL, 0)
+#define packet_add(p, data) \
+ packet_add_do(p, data, __func__, __LINE__)
+#define packet_get(p, idx, data) \
+ packet_get_do(p, idx, data, __func__, __LINE__)
-#define PACKET_POOL_DECL(_name, _size, _buf) \
+#define PACKET_POOL_DECL(_name, _size) \
struct _name ## _t { \
char *buf; \
size_t buf_size; \
@@ -57,19 +61,10 @@ struct _name ## _t { \
.size = _size, \
}
-#define PACKET_POOL(name, size, buf, buf_size) \
- PACKET_POOL_DECL(name, size, buf) name = \
- PACKET_POOL_INIT_NOCAST(size, buf, buf_size)
-
#define PACKET_INIT(name, size, buf, buf_size) \
(struct name ## _t) PACKET_POOL_INIT_NOCAST(size, buf, buf_size)
-#define PACKET_POOL_NOINIT(name, size, buf) \
- PACKET_POOL_DECL(name, size, buf) name ## _storage; \
+#define PACKET_POOL_NOINIT(name, size) \
+ PACKET_POOL_DECL(name, size) name ## _storage; \
static struct pool *name = (struct pool *)&name ## _storage
-
-#define PACKET_POOL_P(name, size, buf, buf_size) \
- PACKET_POOL(name ## _storage, size, buf, buf_size); \
- struct pool *name = (struct pool *)&name ## _storage
-
#endif /* PACKET_H */
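
A short sketch of the reworked pool API (illustrative only, not part of this patch): packets are now added and retrieved as struct iov_tail descriptors, with IOV_TAIL_FROM_BUF() from the iov.h changes above wrapping a flat buffer. The example_* names are made up.

/* Illustrative sketch */
#include "iov.h"
#include "packet.h"

static void example_queue_frame(struct pool *pool, char *frame, size_t len)
{
        struct iov_tail data = IOV_TAIL_FROM_BUF(frame, len, 0);

        if (!pool_can_fit(pool, &data))
                return;         /* Not enough descriptors left */

        packet_add(pool, &data);
}

static size_t example_total_bytes(const struct pool *pool)
{
        struct iov_tail data;
        size_t i, total = 0;

        for (i = 0; packet_get(pool, i, &data); i++)
                total += iov_tail_size(&data);

        return total;
}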
diff --git a/passt-repair.1 b/passt-repair.1
index 7c1b140..e65aadd 100644
--- a/passt-repair.1
+++ b/passt-repair.1
@@ -16,13 +16,17 @@
.B passt-repair
is a privileged helper setting and clearing repair mode on TCP sockets on behalf
of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain
-socket, specified by \fIPATH\fR.
+socket.
It can be used to migrate TCP connections between guests without granting
additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections,
\fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR
capability (see \fBcapabilities\fR(7)) to be set or cleared.
+If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to
+connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file
+ending with \fI.repair\fR appears in it, and then attempts to connect to it.
+
.SH PROTOCOL
\fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via
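Condensed into C, the PATH handling documented above is a two-way choice: connect directly if PATH is already a socket, or watch the directory until a ".repair" socket shows up. The snippet below is a rough illustration of that decision only; the complete logic, including the inotify watch and the connect() retries, is in the passt-repair.c changes further down:

#include <stdbool.h>
#include <sys/stat.h>

/* Illustration only: does PATH require waiting for a "*.repair" socket? */
static bool repair_path_needs_wait(const char *path)
{
	struct stat sb;

	if (stat(path, &sb))
		return false;	/* error handling omitted in this sketch */

	/* A directory means: wait for a file ending in ".repair" to appear,
	 * then connect to it. Anything else must already be a UNIX socket.
	 */
	return (sb.st_mode & S_IFMT) == S_IFDIR;
}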
diff --git a/passt-repair.c b/passt-repair.c
index e0c366e..c3c140f 100644
--- a/passt-repair.c
+++ b/passt-repair.c
@@ -16,11 +16,14 @@
* off. Reply by echoing the command. Exit on EOF.
*/
+#include <sys/inotify.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/socket.h>
+#include <sys/stat.h>
#include <sys/un.h>
#include <errno.h>
+#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
@@ -37,8 +40,11 @@
#include <linux/seccomp.h>
#include "seccomp_repair.h"
+#include "linux_dep.h"
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+#define REPAIR_EXT ".repair"
+#define REPAIR_EXT_LEN strlen(REPAIR_EXT)
/**
* main() - Entry point and whole program with loop
@@ -51,6 +57,9 @@
* #syscalls:repair socket s390x:socketcall i686:socketcall
* #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv
* #syscalls:repair sendto sendmsg arm:send ppc64le:send
+ * #syscalls:repair stat|statx stat64|statx statx
+ * #syscalls:repair fstat|fstat64 newfstatat|fstatat64
+ * #syscalls:repair inotify_init1 inotify_add_watch
*/
int main(int argc, char **argv)
{
@@ -58,12 +67,14 @@ int main(int argc, char **argv)
__attribute__ ((aligned(__alignof__(struct cmsghdr))));
struct sockaddr_un a = { AF_UNIX, "" };
int fds[SCM_MAX_FD], s, ret, i, n = 0;
+ bool inotify_dir = false;
struct sock_fprog prog;
int8_t cmd = INT8_MAX;
struct cmsghdr *cmsg;
struct msghdr msg;
struct iovec iov;
size_t cmsg_len;
+ struct stat sb;
int op;
prctl(PR_SET_DUMPABLE, 0);
@@ -73,7 +84,7 @@ int main(int argc, char **argv)
prog.filter = filter_repair;
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
- fprintf(stderr, "Failed to apply seccomp filter");
+ fprintf(stderr, "Failed to apply seccomp filter\n");
_exit(1);
}
@@ -90,19 +101,96 @@ int main(int argc, char **argv)
_exit(2);
}
- ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
+ if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+ fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
+ _exit(1);
+ }
+
+ if ((stat(argv[1], &sb))) {
+ fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno);
+ _exit(1);
+ }
+
+ if ((sb.st_mode & S_IFMT) == S_IFDIR) {
+ char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
+ __attribute__ ((aligned(__alignof__(struct inotify_event))));
+ const struct inotify_event *ev = NULL;
+ char path[PATH_MAX + 1];
+ bool found = false;
+ ssize_t n;
+ int fd;
+
+ if ((fd = inotify_init1(IN_CLOEXEC)) < 0) {
+ fprintf(stderr, "inotify_init1: %i\n", errno);
+ _exit(1);
+ }
+
+ if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) {
+ fprintf(stderr, "inotify_add_watch: %i\n", errno);
+ _exit(1);
+ }
+
+ do {
+ char *p;
+
+ n = read(fd, buf, sizeof(buf));
+ if (n < 0) {
+ fprintf(stderr, "inotify read: %i\n", errno);
+ _exit(1);
+ }
+ buf[n - 1] = '\0';
+
+ if (n < (ssize_t)sizeof(*ev)) {
+ fprintf(stderr, "Short inotify read: %zi\n", n);
+ continue;
+ }
+
+ for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
+ ev = (const struct inotify_event *)p;
+
+ if (ev->len >= REPAIR_EXT_LEN &&
+ !memcmp(ev->name +
+ strnlen(ev->name, ev->len) -
+ REPAIR_EXT_LEN,
+ REPAIR_EXT, REPAIR_EXT_LEN)) {
+ found = true;
+ break;
+ }
+ }
+ } while (!found);
+
+ if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') {
+ fprintf(stderr, "Invalid filename from inotify\n");
+ _exit(1);
+ }
+
+ snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name);
+ if ((stat(path, &sb))) {
+ fprintf(stderr, "Can't stat() %s: %i\n", path, errno);
+ _exit(1);
+ }
+
+ ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path);
+ inotify_dir = true;
+ } else {
+ ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
+ }
+
if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
- fprintf(stderr, "Invalid socket path: %s\n", argv[1]);
+ fprintf(stderr, "Invalid socket path\n");
_exit(2);
}
- if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
- fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno);
- _exit(1);
+ if ((sb.st_mode & S_IFMT) != S_IFSOCK) {
+ fprintf(stderr, "%s is not a socket\n", a.sun_path);
+ _exit(2);
}
- if (connect(s, (struct sockaddr *)&a, sizeof(a))) {
- fprintf(stderr, "Failed to connect to %s: %s\n", argv[1],
+ while (connect(s, (struct sockaddr *)&a, sizeof(a))) {
+ if (inotify_dir && errno == ECONNREFUSED)
+ continue;
+
+ fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path,
strerror(errno));
_exit(1);
}
@@ -158,8 +246,8 @@ loop:
for (i = 0; i < n; i++) {
if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &op, sizeof(op))) {
fprintf(stderr,
- "Setting TCP_REPAIR to %i on socket %i: %s", op,
- fds[i], strerror(errno));
+ "Setting TCP_REPAIR to %i on socket %i: %s\n",
+ op, fds[i], strerror(errno));
_exit(1);
}
diff --git a/passt.1 b/passt.1
index 60066c2..db0d662 100644
--- a/passt.1
+++ b/passt.1
@@ -85,6 +85,11 @@ Be verbose, don't log to the system logger.
Be extra verbose, show single packets. Implies \fB--debug\fR.
.TP
+.BR \-\-stats " " \fIDELAY\fR
+Display event statistics, with at least \fIDELAY\fR seconds between updates.
+If no events occur, no statistics are displayed.
+
+.TP
.BR \-q ", " \-\-quiet
Don't print informational messages.
@@ -161,8 +166,8 @@ By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces
with the first default route, if any, for the corresponding IP version. If no
default routes are available and there is any interface with any route for a
given IP version, the first of these interfaces will be chosen instead. If no
-such interface exists, the link-local address 169.254.2.1 is assigned for IPv4,
-and no additional address will be assigned for IPv6.
+such interface exists for a given IP version, the link-local address 169.254.2.1
+is assigned for IPv4, and no additional address will be assigned for IPv6.
.TP
.BR \-n ", " \-\-netmask " " \fImask
@@ -189,9 +194,9 @@ first default route, if any, for the corresponding IP version. If the default
route is a multipath one, the gateway is the first nexthop router returned by
the kernel which has the highest weight in the set of paths. If no default
routes are available and there is just one interface with any route, that
-interface will be chosen instead. If no such interface exists, the link-local
-address 169.254.2.2 is used for IPv4, and the link-local address fe80::1 is used
-for IPv6.
+interface will be chosen instead. If no such interface exists for a given IP
+version, the link-local address 169.254.2.2 is used for IPv4, and the link-local
+address fe80::1 is used for IPv6.
Note: these addresses are also used as source address for packets directed to
the guest or to the target namespace having a loopback or local source address,
@@ -319,8 +324,8 @@ silently dropped.
.TP
.BR \-\-no-icmp
-Disable the ICMP/ICMPv6 echo handler. ICMP and ICMPv6 echo requests coming from
-guest or target namespace will be silently dropped.
+Disable the ICMP/ICMPv6 protocol handler. ICMP and ICMPv6 requests coming from
+guest or target namespace will be silently dropped. Implies \fB--no-ndp\fR.
.TP
.BR \-\-no-dhcp
@@ -330,8 +335,8 @@ selected IPv4 default route.
.TP
.BR \-\-no-ndp
-Disable NDP responses. NDP messages coming from guest or target namespace will
-be ignored.
+Disable Neighbor Discovery. NDP messages coming from guest or target
+namespace will be ignored. No initial NDP message will be sent.
.TP
.BR \-\-no-dhcpv6
@@ -440,6 +445,30 @@ chosen for the hypervisor UNIX domain socket. No socket is created if not in
\-\-vhost-user mode.
.TP
+.BR \-\-migrate-exit (DEPRECATED)
+Exit after a completed migration as source. By default, \fBpasst\fR keeps
+running and the migrated guest can continue using its connection, or a new guest
+can connect.
+
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
+future version. It is not expected to be of any use, and it simply reflects a
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
+below.
+
+.TP
+.BR \-\-migrate-no-linger (DEPRECATED)
+Close TCP sockets on the source instance once migration completes.
+
+By default, sockets are kept open and events on data sockets are ignored, so
+that any message reaching them after the source has migrated is silently
+discarded, avoiding connection resets in case data is received after migration.
+
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
+future version. It is not expected to be of any use, and it simply reflects a
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
+below.
+
+.TP
.BR \-F ", " \-\-fd " " \fIFD
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
in the parent process and \fBpasst\fR inherits it when run as a child. This
@@ -454,6 +483,11 @@ is closed.
Quit after handling a single client connection, that is, once the client closes
the socket, or once we get a socket error.
+\fBNote\fR: this option has no effect after \fBpasst\fR completes a migration as
+source, because, in that case, exiting would close sockets for active
+connections, which would in turn cause connection resets if any further data is
+received. See also the description of \fI\-\-migrate-no-linger\fR.
+
.TP
.BR \-t ", " \-\-tcp-ports " " \fIspec
Configure TCP port forwarding to guest. \fIspec\fR can be one of:
@@ -1083,8 +1117,9 @@ throughput of TCP connections.
.SS Local mode for disconnected setups
If \fBpasst\fR and \fBpasta\fR fail to find a host interface with a configured
-address, other than loopback addresses, they will, obviously, not attempt to
-source addresses or routes from the host.
+address for a given IP version, other than loopback addresses, they will,
+obviously, not attempt to source addresses or routes from the host, for that
+IP version.
In this case, unless configured otherwise, they will assign the IPv4 link-local
address 169.254.2.1 to the guest or target namespace, and no IPv6 address. The
diff --git a/passt.c b/passt.c
index 868842b..7488a84 100644
--- a/passt.c
+++ b/passt.c
@@ -19,7 +19,6 @@
* created in a separate network namespace).
*/
-#include <sys/epoll.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/resource.h>
@@ -53,11 +52,12 @@
#include "vu_common.h"
#include "migrate.h"
#include "repair.h"
+#include "netlink.h"
+#include "epoll_ctl.h"
-#define EPOLL_EVENTS 8
+#define NUM_EPOLL_EVENTS 8
-#define TIMER_INTERVAL__ MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL)
-#define TIMER_INTERVAL_ MIN(TIMER_INTERVAL__, ICMP_TIMER_INTERVAL)
+#define TIMER_INTERVAL_ MIN(TCP_TIMER_INTERVAL, FWD_PORT_SCAN_INTERVAL)
#define TIMER_INTERVAL MIN(TIMER_INTERVAL_, FLOW_TIMER_INTERVAL)
char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE)));
@@ -68,7 +68,7 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
- [EPOLL_TYPE_UDP_REPLY] = "UDP reply socket",
+ [EPOLL_TYPE_UDP] = "UDP flow socket",
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
@@ -79,11 +79,20 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
[EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
[EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
+ [EPOLL_TYPE_NL_NEIGH] = "netlink neighbour notifier socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
/**
+ * struct passt_stats - Statistics
+ * @events: Event counters for epoll type events
+ */
+struct passt_stats {
+ unsigned long events[EPOLL_NUM_TYPES];
+};
+
+/**
* post_handler() - Run periodic and deferred tasks for L4 protocol handlers
* @c: Execution context
* @now: Current timestamp
@@ -110,11 +119,10 @@ static void post_handler(struct ctx *c, const struct timespec *now)
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
CALL_PROTO_HANDLER(tcp, TCP);
- /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
- CALL_PROTO_HANDLER(udp, UDP);
+#undef CALL_PROTO_HANDLER
flow_defer_handler(c, now);
-#undef CALL_PROTO_HANDLER
+ fwd_scan_ports_timer(c, now);
if (!c->no_ndp)
ndp_timer(c, now);
@@ -149,12 +157,11 @@ static void timer_init(struct ctx *c, const struct timespec *now)
/**
* proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void proto_update_l2_buf(const unsigned char *eth_d)
{
- tcp_update_l2_buf(eth_d, eth_s);
- udp_update_l2_buf(eth_d, eth_s);
+ tcp_update_l2_buf(eth_d);
+ udp_update_l2_buf(eth_d);
}
/**
@@ -170,7 +177,143 @@ static void exit_handler(int signal)
{
(void)signal;
- _exit(EXIT_SUCCESS);
+ passt_exit(EXIT_SUCCESS);
+}
+
+/**
+ * print_stats() - Print event statistics table to stderr
+ * @c: Execution context
+ * @stats: Event counters
+ * @now: Current timestamp
+ */
+static void print_stats(const struct ctx *c, const struct passt_stats *stats,
+ const struct timespec *now)
+{
+ static struct timespec before;
+ static int lines_printed;
+ long long elapsed_ns;
+ int i;
+
+ if (!c->stats)
+ return;
+
+ elapsed_ns = (now->tv_sec - before.tv_sec) * 1000000000LL +
+ (now->tv_nsec - before.tv_nsec);
+
+ if (elapsed_ns < c->stats * 1000000000LL)
+ return;
+
+ before = *now;
+
+ if (!(lines_printed % 20)) {
+ /* Table header */
+ for (i = 1; i < EPOLL_NUM_TYPES; i++) {
+ int j;
+
+ for (j = 0; j < i * (6 + 1); j++) {
+ if (j && !(j % (6 + 1)))
+ FPRINTF(stderr, "|");
+ else
+ FPRINTF(stderr, " ");
+ }
+ FPRINTF(stderr, "%s\n", epoll_type_str[i]);
+ }
+ }
+
+ FPRINTF(stderr, " ");
+ for (i = 1; i < EPOLL_NUM_TYPES; i++)
+ FPRINTF(stderr, " %6lu", stats->events[i]);
+ FPRINTF(stderr, "\n");
+ lines_printed++;
+}
+
+/**
+ * passt_worker() - Process epoll events and handle protocol operations
+ * @opaque: Pointer to execution context (struct ctx)
+ * @nfds: Number of file descriptors ready (epoll_wait return value)
+ * @events: epoll_event array of ready file descriptors
+ */
+static void passt_worker(void *opaque, int nfds, struct epoll_event *events)
+{
+ static struct passt_stats stats = { 0 };
+ struct ctx *c = opaque;
+ struct timespec now;
+ int i;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &now))
+ err_perror("Failed to get CLOCK_MONOTONIC time");
+
+ for (i = 0; i < nfds; i++) {
+ union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
+ uint32_t eventmask = events[i].events;
+
+ trace("%s: epoll event on %s %i (events: 0x%08x)",
+ c->mode == MODE_PASTA ? "pasta" : "passt",
+ EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
+
+ switch (ref.type) {
+ case EPOLL_TYPE_TAP_PASTA:
+ tap_handler_pasta(c, eventmask, &now);
+ break;
+ case EPOLL_TYPE_TAP_PASST:
+ tap_handler_passt(c, eventmask, &now);
+ break;
+ case EPOLL_TYPE_TAP_LISTEN:
+ tap_listen_handler(c, eventmask);
+ break;
+ case EPOLL_TYPE_NSQUIT_INOTIFY:
+ pasta_netns_quit_inotify_handler(c, ref.fd);
+ break;
+ case EPOLL_TYPE_NSQUIT_TIMER:
+ pasta_netns_quit_timer_handler(c, ref);
+ break;
+ case EPOLL_TYPE_TCP:
+ tcp_sock_handler(c, ref, eventmask);
+ break;
+ case EPOLL_TYPE_TCP_SPLICE:
+ tcp_splice_sock_handler(c, ref, eventmask);
+ break;
+ case EPOLL_TYPE_TCP_LISTEN:
+ tcp_listen_handler(c, ref, &now);
+ break;
+ case EPOLL_TYPE_TCP_TIMER:
+ tcp_timer_handler(c, ref);
+ break;
+ case EPOLL_TYPE_UDP_LISTEN:
+ udp_listen_sock_handler(c, ref, eventmask, &now);
+ break;
+ case EPOLL_TYPE_UDP:
+ udp_sock_handler(c, ref, eventmask, &now);
+ break;
+ case EPOLL_TYPE_PING:
+ icmp_sock_handler(c, ref);
+ break;
+ case EPOLL_TYPE_VHOST_CMD:
+ vu_control_handler(c->vdev, c->fd_tap, eventmask);
+ break;
+ case EPOLL_TYPE_VHOST_KICK:
+ vu_kick_cb(c->vdev, ref, &now);
+ break;
+ case EPOLL_TYPE_REPAIR_LISTEN:
+ repair_listen_handler(c, eventmask);
+ break;
+ case EPOLL_TYPE_REPAIR:
+ repair_handler(c, eventmask);
+ break;
+ case EPOLL_TYPE_NL_NEIGH:
+ nl_neigh_notify_handler(c);
+ break;
+ default:
+ /* Can't happen */
+ ASSERT(0);
+ }
+ stats.events[ref.type]++;
+ print_stats(c, &stats, &now);
+ }
+
+ post_handler(c, &now);
+
+ migrate_handler(c);
}
/**
@@ -185,13 +328,13 @@ static void exit_handler(int signal)
* #syscalls bind connect recvfrom sendto shutdown
* #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
* #syscalls accept4 accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
- * #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64
+ * #syscalls clock_gettime|clock_gettime64
+ * #syscalls arm:clock_gettime64 i686:clock_gettime64
*/
int main(int argc, char **argv)
{
- struct epoll_event events[EPOLL_EVENTS];
- int nfds, i, devnull_fd = -1;
- char argv0[PATH_MAX], *name;
+ struct epoll_event events[NUM_EPOLL_EVENTS];
+ int nfds, devnull_fd = -1;
struct ctx c = { 0 };
struct rlimit limit;
struct timespec now;
@@ -213,31 +356,23 @@ int main(int argc, char **argv)
sigaction(SIGTERM, &sa, NULL);
sigaction(SIGQUIT, &sa, NULL);
- if (argc < 1)
- _exit(EXIT_FAILURE);
+ c.mode = conf_mode(argc, argv);
- strncpy(argv0, argv[0], PATH_MAX - 1);
- name = basename(argv0);
- if (strstr(name, "pasta")) {
+ if (c.mode == MODE_PASTA) {
sa.sa_handler = pasta_child_handler;
if (sigaction(SIGCHLD, &sa, NULL))
die_perror("Couldn't install signal handlers");
-
- c.mode = MODE_PASTA;
- } else if (strstr(name, "passt")) {
- c.mode = MODE_PASST;
- } else {
- _exit(EXIT_FAILURE);
}
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
die_perror("Couldn't set disposition for SIGPIPE");
- madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
+ madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
if (c.epollfd == -1)
die_perror("Failed to create epoll file descriptor");
+ flow_epollid_register(EPOLLFD_ID_DEFAULT, c.epollfd);
if (getrlimit(RLIMIT_NOFILE, &limit))
die_perror("Failed to get maximum value of open files limit");
@@ -246,7 +381,7 @@ int main(int argc, char **argv)
if (setrlimit(RLIMIT_NOFILE, &limit))
die_perror("Failed to set current limit for open files");
- sock_probe_mem(&c);
+ sock_probe_features(&c);
conf(&c, argc, argv);
trace_init(c.trace);
@@ -261,11 +396,12 @@ int main(int argc, char **argv)
die_perror("Failed to get CLOCK_MONOTONIC time");
flow_init();
+ fwd_scan_ports_init(&c);
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
- _exit(EXIT_FAILURE);
+ passt_exit(EXIT_FAILURE);
- proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
+ proto_update_l2_buf(c.guest_mac);
if (c.ifi4 && !c.no_dhcp)
dhcp_init();
@@ -275,6 +411,9 @@ int main(int argc, char **argv)
pcap_init(&c);
+ fwd_neigh_table_init(&c);
+ nl_neigh_notify_init(&c);
+
if (!c.foreground) {
if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0)
die_perror("Failed to open /dev/null");
@@ -302,80 +441,12 @@ int main(int argc, char **argv)
loop:
/* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
- nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
+ nfds = epoll_wait(c.epollfd, events, NUM_EPOLL_EVENTS, TIMER_INTERVAL);
/* NOLINTEND(bugprone-branch-clone) */
if (nfds == -1 && errno != EINTR)
die_perror("epoll_wait() failed in main loop");
- if (clock_gettime(CLOCK_MONOTONIC, &now))
- err_perror("Failed to get CLOCK_MONOTONIC time");
-
- for (i = 0; i < nfds; i++) {
- union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
- uint32_t eventmask = events[i].events;
-
- trace("%s: epoll event on %s %i (events: 0x%08x)",
- c.mode == MODE_PASTA ? "pasta" : "passt",
- EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
-
- switch (ref.type) {
- case EPOLL_TYPE_TAP_PASTA:
- tap_handler_pasta(&c, eventmask, &now);
- break;
- case EPOLL_TYPE_TAP_PASST:
- tap_handler_passt(&c, eventmask, &now);
- break;
- case EPOLL_TYPE_TAP_LISTEN:
- tap_listen_handler(&c, eventmask);
- break;
- case EPOLL_TYPE_NSQUIT_INOTIFY:
- pasta_netns_quit_inotify_handler(&c, ref.fd);
- break;
- case EPOLL_TYPE_NSQUIT_TIMER:
- pasta_netns_quit_timer_handler(&c, ref);
- break;
- case EPOLL_TYPE_TCP:
- tcp_sock_handler(&c, ref, eventmask);
- break;
- case EPOLL_TYPE_TCP_SPLICE:
- tcp_splice_sock_handler(&c, ref, eventmask);
- break;
- case EPOLL_TYPE_TCP_LISTEN:
- tcp_listen_handler(&c, ref, &now);
- break;
- case EPOLL_TYPE_TCP_TIMER:
- tcp_timer_handler(&c, ref);
- break;
- case EPOLL_TYPE_UDP_LISTEN:
- udp_listen_sock_handler(&c, ref, eventmask, &now);
- break;
- case EPOLL_TYPE_UDP_REPLY:
- udp_reply_sock_handler(&c, ref, eventmask, &now);
- break;
- case EPOLL_TYPE_PING:
- icmp_sock_handler(&c, ref);
- break;
- case EPOLL_TYPE_VHOST_CMD:
- vu_control_handler(c.vdev, c.fd_tap, eventmask);
- break;
- case EPOLL_TYPE_VHOST_KICK:
- vu_kick_cb(c.vdev, ref, &now);
- break;
- case EPOLL_TYPE_REPAIR_LISTEN:
- repair_listen_handler(&c, eventmask);
- break;
- case EPOLL_TYPE_REPAIR:
- repair_handler(&c, eventmask);
- break;
- default:
- /* Can't happen */
- ASSERT(0);
- }
- }
-
- post_handler(&c, &now);
-
- migrate_handler(&c);
+ passt_worker(&c, nfds, events);
goto loop;
}
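The open-coded epoll_ctl() calls scattered through these files are replaced by epoll_add() from the new epoll_ctl.[ch], which is not shown in this excerpt. Inferred from the call sites (callers treat a non-zero return as failure and a negative value as an error code), it presumably amounts to something like the sketch below; the real helper may differ:

#include <errno.h>
#include <stdint.h>
#include <sys/epoll.h>

#include "epoll_ctl.h"	/* union epoll_ref, introduced by this patch */

/* Sketch of the assumed wrapper: pack the reference and register the fd */
static int epoll_add_sketch(int epollfd, uint32_t events, union epoll_ref ref)
{
	struct epoll_event ev = { .events = events };

	ev.data.u64 = ref.u64;

	if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ref.fd, &ev))
		return -errno;

	return 0;
}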
diff --git a/passt.h b/passt.h
index 28d1389..79d01dd 100644
--- a/passt.h
+++ b/passt.h
@@ -35,46 +35,9 @@ union epoll_ref;
#define MAC_OUR_LAA \
((uint8_t [ETH_ALEN]){0x9a, 0x55, 0x9a, 0x55, 0x9a, 0x55})
-/**
- * union epoll_ref - Breakdown of reference for epoll fd bookkeeping
- * @type: Type of fd (tells us what to do with events)
- * @fd: File descriptor number (implies < 2^24 total descriptors)
- * @flow: Index of the flow this fd is linked to
- * @tcp_listen: TCP-specific reference part for listening sockets
- * @udp: UDP-specific reference part
- * @icmp: ICMP-specific reference part
- * @data: Data handled by protocol handlers
- * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone
- * @queue: vhost-user queue index for this fd
- * @u64: Opaque reference for epoll_ctl() and epoll_wait()
- */
-union epoll_ref {
- struct {
- enum epoll_type type:8;
-#define FD_REF_BITS 24
-#define FD_REF_MAX ((int)MAX_FROM_BITS(FD_REF_BITS))
- int32_t fd:FD_REF_BITS;
- union {
- uint32_t flow;
- flow_sidx_t flowside;
- union tcp_listen_epoll_ref tcp_listen;
- union udp_listen_epoll_ref udp;
- uint32_t data;
- int nsdir_fd;
- int queue;
- };
- };
- uint64_t u64;
-};
-static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data),
- "epoll_ref must have same size as epoll_data");
-
-#define TAP_BUF_BYTES \
- ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE)
-#define TAP_MSGS \
- DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t))
+/* Large enough for ~128 maximum size frames */
+#define PKT_BUF_BYTES (8UL << 20)
-#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0)
extern char pkt_buf [PKT_BUF_BYTES];
extern char *epoll_type_str[];
@@ -190,6 +153,7 @@ struct ip6_ctx {
* @mode: Operation mode, qemu/UNIX domain socket or namespace/tap
* @debug: Enable debug mode
* @trace: Enable tracing (extra debug) mode
+ * @stats: Events statistics delay (0 means disabled)
* @quiet: Don't print informational messages
* @foreground: Run in foreground, don't log to stderr by default
* @nofile: Maximum number of open files (ulimit -n)
@@ -211,7 +175,7 @@ struct ip6_ctx {
* @guest_mac: MAC address of guest or namespace, seen or configured
* @hash_secret: 128-bit secret for siphash functions
* @ifi4: Template interface for IPv4, -1: none, 0: IPv4 disabled
- * @ip: IPv4 configuration
+ * @ip4: IPv4 configuration
* @dns_search: DNS search list
* @hostname: Guest hostname
* @fqdn: Guest FQDN
@@ -240,15 +204,19 @@ struct ip6_ctx {
* @freebind: Allow binding of non-local addresses for forwarding
* @low_wmem: Low probed net.core.wmem_max
* @low_rmem: Low probed net.core.rmem_max
+ * @no_bindtodevice: Unprivileged SO_BINDTODEVICE not available
* @vdev: vhost-user device
* @device_state_fd: Device state migration channel
* @device_state_result: Device state migration result
* @migrate_target: Are we the target, on the next migration request?
+ * @migrate_no_linger: Close sockets as we migrate them
+ * @migrate_exit: Exit (on source) once migration is complete
*/
struct ctx {
enum passt_modes mode;
int debug;
int trace;
+ int stats;
int quiet;
int foreground;
int nofile;
@@ -314,6 +282,7 @@ struct ctx {
int low_wmem;
int low_rmem;
+ int no_bindtodevice;
struct vu_dev *vdev;
@@ -321,9 +290,10 @@ struct ctx {
int device_state_fd;
int device_state_result;
bool migrate_target;
+ bool migrate_no_linger;
+ bool migrate_exit;
};
-void proto_update_l2_buf(const unsigned char *eth_d,
- const unsigned char *eth_s);
+void proto_update_l2_buf(const unsigned char *eth_d);
#endif /* PASST_H */
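The "~128 maximum size frames" comment is easy to verify: the pool caps a single packet at PACKET_MAX_LEN (UINT16_MAX, see packet.h above), and 8 MiB divided by 64 KiB is just over 128. A throwaway compile-time check of that arithmetic, assuming the 64 KiB cap is the relevant per-frame maximum:

#include <stdint.h>

/* 8 MiB / 65535 = 128.0019..., so at least 128 maximum-size frames fit */
_Static_assert((8UL << 20) / UINT16_MAX >= 128,
	       "pkt_buf holds fewer than 128 maximum-size frames");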
diff --git a/pasta.c b/pasta.c
index fa3e7de..c307b8a 100644
--- a/pasta.c
+++ b/pasta.c
@@ -27,7 +27,6 @@
#include <stdint.h>
#include <unistd.h>
#include <syslog.h>
-#include <sys/epoll.h>
#include <sys/inotify.h>
#include <sys/mount.h>
#include <sys/timerfd.h>
@@ -41,6 +40,7 @@
#include <arpa/inet.h>
#include <netinet/in.h>
#include <net/ethernet.h>
+#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/magic.h>
@@ -49,6 +49,7 @@
#include "isolation.h"
#include "netlink.h"
#include "log.h"
+#include "epoll_ctl.h"
#define HOSTNAME_PREFIX "pasta-"
@@ -57,15 +58,13 @@ int pasta_child_pid;
/**
* pasta_child_handler() - Exit once shell exits (if we started it), reap clones
- * @signal: Unused, handler deals with SIGCHLD only
+ * @signal: Signal number; this handler deals with SIGCHLD only
*/
void pasta_child_handler(int signal)
{
int errno_save = errno;
siginfo_t infop;
- (void)signal;
-
if (signal != SIGCHLD)
return;
@@ -73,12 +72,12 @@ void pasta_child_handler(int signal)
!waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) {
if (infop.si_pid == pasta_child_pid) {
if (infop.si_code == CLD_EXITED)
- _exit(infop.si_status);
+ passt_exit(infop.si_status);
/* If killed by a signal, si_status is the number.
* Follow common shell convention of returning it + 128.
*/
- _exit(infop.si_status + 128);
+ passt_exit(infop.si_status + 128);
/* Nothing to do, detached PID namespace going away */
}
@@ -191,6 +190,10 @@ static int pasta_spawn_cmd(void *arg)
size_t conf_hostname_len;
sigset_t set;
+ /* If the parent dies with an error, so should we */
+ if (prctl(PR_SET_PDEATHSIG, SIGKILL))
+ die_perror("Couldn't set PR_SET_PDEATHSIG");
+
/* We run in a detached PID and mount namespace: mount /proc over */
if (mount("", "/proc", "proc", 0, NULL))
warn_perror("Couldn't mount /proc");
@@ -217,6 +220,12 @@ static int pasta_spawn_cmd(void *arg)
sigaddset(&set, SIGUSR1);
sigwaitinfo(&set, NULL);
+ /* Once exec()ed this process is more valuable, and easier to see and
+ * clean up. Let us outlive our parent now.
+ */
+ if (prctl(PR_SET_PDEATHSIG, 0))
+ die_perror("Couldn't clear PR_SET_PDEATHSIG");
+
execvp(a->exe, a->argv);
die_perror("Failed to start command or shell");
@@ -411,7 +420,7 @@ void pasta_ns_conf(struct ctx *c)
}
}
- proto_update_l2_buf(c->guest_mac, NULL);
+ proto_update_l2_buf(c->guest_mac);
}
/**
@@ -444,7 +453,6 @@ static int pasta_netns_quit_timer(void)
*/
void pasta_netns_quit_init(const struct ctx *c)
{
- struct epoll_event ev = { .events = EPOLLIN };
int flags = O_NONBLOCK | O_CLOEXEC;
struct statfs s = { 0 };
bool try_inotify = true;
@@ -487,8 +495,8 @@ void pasta_netns_quit_init(const struct ctx *c)
die("netns monitor file number %i too big, exiting", fd);
ref.fd = fd;
- ev.data.u64 = ref.u64;
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev);
+
+ epoll_add(c->epollfd, EPOLLIN, ref);
}
/**
@@ -498,17 +506,23 @@ void pasta_netns_quit_init(const struct ctx *c)
*/
void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
{
- char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
- const struct inotify_event *in_ev = (struct inotify_event *)buf;
+ char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
+ __attribute__ ((aligned(__alignof__(struct inotify_event))));
+ const struct inotify_event *ev;
+ ssize_t n;
+ char *p;
- if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev))
+ if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev))
return;
- if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base)))
- return;
+ for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
+ ev = (const struct inotify_event *)p;
- info("Namespace %s is gone, exiting", c->netns_base);
- _exit(EXIT_SUCCESS);
+ if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) {
+ info("Namespace %s is gone, exiting", c->netns_base);
+ passt_exit(EXIT_SUCCESS);
+ }
+ }
}
/**
@@ -534,7 +548,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref)
return;
info("Namespace %s is gone, exiting", c->netns_base);
- _exit(EXIT_SUCCESS);
+ passt_exit(EXIT_SUCCESS);
}
close(fd);
diff --git a/pcap.c b/pcap.c
index 3d623cf..54fba5c 100644
--- a/pcap.c
+++ b/pcap.c
@@ -33,32 +33,11 @@
#include "log.h"
#include "pcap.h"
#include "iov.h"
+#include "tap.h"
#define PCAP_VERSION_MINOR 4
-static int pcap_fd = -1;
-
-/* See pcap.h from libpcap, or pcap-savefile(5) */
-static const struct {
- uint32_t magic;
-#define PCAP_MAGIC 0xa1b2c3d4
-
- uint16_t major;
-#define PCAP_VERSION_MAJOR 2
-
- uint16_t minor;
-#define PCAP_VERSION_MINOR 4
-
- int32_t thiszone;
- uint32_t sigfigs;
- uint32_t snaplen;
-
- uint32_t linktype;
-#define PCAP_LINKTYPE_ETHERNET 1
-} pcap_hdr = {
- PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU,
- PCAP_LINKTYPE_ETHERNET
-};
+int pcap_fd = -1;
struct pcap_pkthdr {
uint32_t tv_sec;
@@ -73,8 +52,6 @@ struct pcap_pkthdr {
* @iovcnt: Number of buffers (@iov entries) in frame
* @offset: Byte offset of the L2 headers within @iov
* @now: Timestamp
- *
- * Returns: 0 on success, -errno on error writing to the file
*/
static void pcap_frame(const struct iovec *iov, size_t iovcnt,
size_t offset, const struct timespec *now)
@@ -97,6 +74,7 @@ static void pcap_frame(const struct iovec *iov, size_t iovcnt,
* @pkt: Pointer to data buffer, including L2 headers
* @l2len: L2 frame length
*/
+/* cppcheck-suppress unusedFunction */
void pcap(const char *pkt, size_t l2len)
{
struct iovec iov = { (char *)pkt, l2len };
@@ -134,10 +112,9 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
}
-/*
- * pcap_iov - Write packet data described by an I/O vector
+/**
+ * pcap_iov() - Write packet data described by an I/O vector
* to a pcap file descriptor.
- *
* @iov: Pointer to the array of struct iovec describing the I/O vector
* containing packet data to write, including L2 header
* @iovcnt: Number of buffers (@iov entries)
@@ -162,6 +139,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
*/
void pcap_init(struct ctx *c)
{
+ /* See pcap.h from libpcap, or pcap-savefile(5) */
+#define PCAP_MAGIC 0xa1b2c3d4
+#define PCAP_VERSION_MAJOR 2
+#define PCAP_VERSION_MINOR 4
+#define PCAP_LINKTYPE_ETHERNET 1
+ const struct {
+ uint32_t magic;
+ uint16_t major;
+ uint16_t minor;
+
+ int32_t thiszone;
+ uint32_t sigfigs;
+ uint32_t snaplen;
+
+ uint32_t linktype;
+ } pcap_hdr = {
+ .magic = PCAP_MAGIC,
+ .major = PCAP_VERSION_MAJOR,
+ .minor = PCAP_VERSION_MINOR,
+ .snaplen = tap_l2_max_len(c),
+ .linktype = PCAP_LINKTYPE_ETHERNET
+ };
+
if (pcap_fd != -1)
return;
diff --git a/pcap.h b/pcap.h
index 9795f2e..2aeb53e 100644
--- a/pcap.h
+++ b/pcap.h
@@ -6,6 +6,8 @@
#ifndef PCAP_H
#define PCAP_H
+extern int pcap_fd;
+
void pcap(const char *pkt, size_t l2len);
void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
size_t offset);
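For context on the header that pcap_init() now builds per mode: in the classic pcap format (pcap-savefile(5)) the global header is written once, and each captured frame is preceded by a small record header carrying its timestamp and lengths. A generic sketch of writing one record, independent of this patch:

#include <stdint.h>
#include <time.h>
#include <unistd.h>

/* Classic pcap per-record header: timestamp, captured and original length */
struct pcap_record {
	uint32_t tv_sec;
	uint32_t tv_usec;
	uint32_t caplen;
	uint32_t len;
};

/* Illustration only: append one frame to an already initialised capture */
static void pcap_write_one(int fd, const void *frame, uint32_t l2len,
			   const struct timespec *now)
{
	struct pcap_record h = {
		.tv_sec  = (uint32_t)now->tv_sec,
		.tv_usec = (uint32_t)(now->tv_nsec / 1000),
		.caplen  = l2len,
		.len     = l2len,
	};

	if (write(fd, &h, sizeof(h)) < 0 || write(fd, frame, l2len) < 0)
		return;		/* error handling omitted in this sketch */
}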
diff --git a/pif.c b/pif.c
index 592fafa..6ae970a 100644
--- a/pif.c
+++ b/pif.c
@@ -5,6 +5,7 @@
* Passt/pasta interface types and IDs
*/
+#include <errno.h>
#include <stdint.h>
#include <assert.h>
#include <netinet/in.h>
@@ -14,7 +15,7 @@
#include "siphash.h"
#include "ip.h"
#include "inany.h"
-#include "passt.h"
+#include "epoll_ctl.h"
const char *pif_type_str[] = {
[PIF_NONE] = "<none>",
@@ -29,12 +30,11 @@ static_assert(ARRAY_SIZE(pif_type_str) == PIF_NUM_TYPES,
/** pif_sockaddr() - Construct a socket address suitable for an interface
* @c: Execution context
* @sa: Pointer to sockaddr to fill in
- * @sl: Updated to relevant length of initialised @sa
* @pif: Interface to create the socket address
* @addr: IPv[46] address
* @port: Port (host byte order)
*/
-void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
+void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa,
uint8_t pif, const union inany_addr *addr, in_port_t port)
{
const struct in_addr *v4 = inany_v4(addr);
@@ -46,17 +46,19 @@ void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
sa->sa4.sin_addr = *v4;
sa->sa4.sin_port = htons(port);
memset(&sa->sa4.sin_zero, 0, sizeof(sa->sa4.sin_zero));
- *sl = sizeof(sa->sa4);
} else {
sa->sa_family = AF_INET6;
sa->sa6.sin6_addr = addr->a6;
sa->sa6.sin6_port = htons(port);
- if (pif == PIF_HOST && IN6_IS_ADDR_LINKLOCAL(&addr->a6))
- sa->sa6.sin6_scope_id = c->ifi6;
- else
+ if (IN6_IS_ADDR_LINKLOCAL(&addr->a6)) {
+ if (pif == PIF_HOST)
+ sa->sa6.sin6_scope_id = c->ifi6;
+ else if (pif == PIF_SPLICE)
+ sa->sa6.sin6_scope_id = c->pasta_ifi;
+ } else {
sa->sa6.sin6_scope_id = 0;
+ }
sa->sa6.sin6_flowinfo = 0;
- *sl = sizeof(sa->sa6);
}
}
@@ -78,26 +80,31 @@ int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
const union inany_addr *addr, const char *ifname,
in_port_t port, uint32_t data)
{
- union sockaddr_inany sa = {
- .sa6.sin6_family = AF_INET6,
- .sa6.sin6_addr = in6addr_any,
- .sa6.sin6_port = htons(port),
- };
- socklen_t sl;
+ union epoll_ref ref;
+ int ret;
ASSERT(pif_is_socket(pif));
- if (pif == PIF_SPLICE) {
- /* Sanity checks */
- ASSERT(!ifname);
- ASSERT(addr && inany_is_loopback(addr));
+ if (!addr) {
+ ref.fd = sock_l4_dualstack_any(c, type, port, ifname);
+ } else {
+ union sockaddr_inany sa;
+
+ pif_sockaddr(c, &sa, pif, addr, port);
+ ref.fd = sock_l4(c, type, &sa, ifname);
}
- if (!addr)
- return sock_l4_sa(c, type, &sa, sizeof(sa.sa6),
- ifname, false, data);
+ if (ref.fd < 0)
+ return ref.fd;
+
+ ref.type = type;
+ ref.data = data;
+
+ ret = epoll_add(c->epollfd, EPOLLIN, ref);
+ if (ret < 0) {
+ close(ref.fd);
+ return ret;
+ }
- pif_sockaddr(c, &sa, &sl, pif, addr, port);
- return sock_l4_sa(c, type, &sa, sl,
- ifname, sa.sa_family == AF_INET6, data);
+ return ref.fd;
}
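The sin6_scope_id handling above exists because a link-local IPv6 address is only meaningful relative to an interface: host-side sockets get c->ifi6, spliced ones c->pasta_ifi. Outside of passt the same requirement looks like the generic illustration below (not project code):

#include <string.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <netinet/in.h>

/* Generic illustration: a link-local destination needs an explicit scope */
static void sockaddr_ll_example(struct sockaddr_in6 *sa6,
				const struct in6_addr *dst, in_port_t port,
				const char *ifname)
{
	memset(sa6, 0, sizeof(*sa6));
	sa6->sin6_family = AF_INET6;
	sa6->sin6_addr = *dst;
	sa6->sin6_port = htons(port);

	/* Without this, connect()/sendto() to fe80::... can't pick a link */
	if (IN6_IS_ADDR_LINKLOCAL(dst))
		sa6->sin6_scope_id = if_nametoindex(ifname);
}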
diff --git a/pif.h b/pif.h
index f029282..0f7f667 100644
--- a/pif.h
+++ b/pif.h
@@ -57,7 +57,7 @@ static inline bool pif_is_socket(uint8_t pif)
return pif == PIF_HOST || pif == PIF_SPLICE;
}
-void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl,
+void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa,
uint8_t pif, const union inany_addr *addr, in_port_t port);
int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
const union inany_addr *addr, const char *ifname,
diff --git a/repair.c b/repair.c
index 3ee089f..69c5307 100644
--- a/repair.c
+++ b/repair.c
@@ -22,11 +22,16 @@
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
+#include "epoll_ctl.h"
#include "repair.h"
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */
+#define REPAIR_ACCEPT_TIMEOUT_MS 10
+#define REPAIR_ACCEPT_TIMEOUT_US (REPAIR_ACCEPT_TIMEOUT_MS * 1000)
+
/* Pending file descriptors for next repair_flush() call, or command change */
static int repair_fds[SCM_MAX_FD];
@@ -43,7 +48,6 @@ static int repair_nfds;
void repair_sock_init(const struct ctx *c)
{
union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN };
- struct epoll_event ev = { 0 };
if (c->fd_repair_listen == -1)
return;
@@ -54,28 +58,28 @@ void repair_sock_init(const struct ctx *c)
}
ref.fd = c->fd_repair_listen;
- ev.events = EPOLLIN | EPOLLHUP | EPOLLET;
- ev.data.u64 = ref.u64;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev))
- err_perror("repair helper socket epoll_ctl(), won't migrate");
+ if (epoll_add(c->epollfd, EPOLLIN | EPOLLHUP | EPOLLET, ref))
+ err("repair helper socket epoll_ctl(), won't migrate");
}
/**
* repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
* @c: Execution context
* @events: epoll events
+ *
+ * Return: 0 on valid event with new connected socket, error code on failure
*/
-void repair_listen_handler(struct ctx *c, uint32_t events)
+int repair_listen_handler(struct ctx *c, uint32_t events)
{
union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
- struct epoll_event ev = { 0 };
struct ucred ucred;
socklen_t len;
+ int rc;
if (events != EPOLLIN) {
debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
events);
- return;
+ return EINVAL;
}
len = sizeof(ucred);
@@ -86,31 +90,35 @@ void repair_listen_handler(struct ctx *c, uint32_t events)
SOCK_NONBLOCK);
if (discard == -1)
- return;
+ return errno;
if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
close(discard);
- return;
+ return EEXIST;
}
if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) {
+ rc = errno;
debug_perror("accept4() on TCP_REPAIR helper listening socket");
- return;
+ return rc;
}
if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
ref.fd = c->fd_repair;
- ev.events = EPOLLHUP | EPOLLET;
- ev.data.u64 = ref.u64;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) {
- debug_perror("epoll_ctl() on TCP_REPAIR helper socket");
+
+ rc = epoll_add(c->epollfd, EPOLLHUP | EPOLLET, ref);
+ if (rc < 0) {
+ debug("epoll_ctl() on TCP_REPAIR helper socket");
close(c->fd_repair);
c->fd_repair = -1;
+ return rc;
}
+
+ return 0;
}
/**
@@ -139,6 +147,44 @@ void repair_handler(struct ctx *c, uint32_t events)
}
/**
+ * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
+ * @c: Execution context
+ *
+ * Return: 0 on success or if already connected, error code on failure
+ */
+int repair_wait(struct ctx *c)
+{
+ struct timeval tv = { .tv_sec = 0,
+ .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) };
+ int rc;
+
+ static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000,
+ ".tv_usec is greater than 1000 * 1000");
+
+ if (c->fd_repair >= 0)
+ return 0;
+
+ if (c->fd_repair_listen == -1)
+ return ENOENT;
+
+ if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+ &tv, sizeof(tv))) {
+ rc = errno;
+ err_perror("Set timeout on TCP_REPAIR listening socket");
+ return rc;
+ }
+
+ rc = repair_listen_handler(c, EPOLLIN);
+
+ tv.tv_usec = 0;
+ if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+ &tv, sizeof(tv)))
+ err_perror("Clear timeout on TCP_REPAIR listening socket");
+
+ return rc;
+}
+
+/**
* repair_flush() - Flush current set of sockets to helper, with current command
* @c: Execution context
*
diff --git a/repair.h b/repair.h
index de279d6..ab27e67 100644
--- a/repair.h
+++ b/repair.h
@@ -7,9 +7,10 @@
#define REPAIR_H
void repair_sock_init(const struct ctx *c);
-void repair_listen_handler(struct ctx *c, uint32_t events);
+int repair_listen_handler(struct ctx *c, uint32_t events);
void repair_handler(struct ctx *c, uint32_t events);
void repair_close(struct ctx *c);
+int repair_wait(struct ctx *c);
int repair_flush(struct ctx *c);
int repair_set(struct ctx *c, int s, int cmd);
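Taken together with the helper changes earlier in this patch, the intended calling sequence on the migration source is: make sure passt-repair is connected, queue sockets with repair_set(), then let repair_flush() ship the batch (at most SCM_MAX_FD descriptors per message) and wait for the echoed command. The sketch below uses only the prototypes above; it assumes the command values are the TCP_REPAIR_* constants that the helper passes straight to setsockopt(), as in the passt-repair.c hunk, and it glosses over the real per-flow iteration:

#include <linux/tcp.h>	/* TCP_REPAIR_ON */

#include "passt.h"
#include "repair.h"

/* Sketch: put a batch of TCP sockets into repair mode before migration */
static int repair_on_example(struct ctx *c, const int *socks, int n)
{
	int i, rc;

	rc = repair_wait(c);		/* helper connected, or give up */
	if (rc)
		return rc;

	for (i = 0; i < n; i++) {
		rc = repair_set(c, socks[i], TCP_REPAIR_ON);
		if (rc)
			return rc;
	}

	return repair_flush(c);		/* send the batch, wait for the echo */
}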
diff --git a/seccomp.sh b/seccomp.sh
index a7bc417..60ebe84 100755
--- a/seccomp.sh
+++ b/seccomp.sh
@@ -21,8 +21,9 @@ IN="$@"
[ -z "${ARCH}" ] && ARCH="$(uname -m)"
[ -z "${CC}" ] && CC="cc"
+case "${ARCH}" in i[345]86) ARCH=i686 ;; esac
-AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z] \
+AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr '[a-z]' '[A-Z]' \
| sed 's/^ARM.*/ARM/' \
| sed 's/I[456]86/I386/' \
| sed 's/PPC64/PPC/' \
diff --git a/siphash.h b/siphash.h
index a2ca2a9..e760236 100644
--- a/siphash.h
+++ b/siphash.h
@@ -99,7 +99,7 @@ static inline void siphash_feed(struct siphash_state *state, uint64_t in)
}
/**
- * siphash_final - Finalize SipHash calculations
+ * siphash_final() - Finalize SipHash calculations
* @v: siphash state (4 x 64-bit integers)
* @len: Total length of input data
* @tail: Final data for the hash (<= 7 bytes)
diff --git a/tap.c b/tap.c
index 4541f51..9d1344b 100644
--- a/tap.c
+++ b/tap.c
@@ -26,7 +26,6 @@
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
-#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
@@ -61,15 +60,69 @@
#include "log.h"
#include "vhost_user.h"
#include "vu_common.h"
+#include "epoll_ctl.h"
+
+/* Maximum allowed frame lengths (including L2 header) */
+
+/* Verify that an L2 frame length limit is large enough to contain the header,
+ * but small enough to fit in the packet pool
+ */
+#define CHECK_FRAME_LEN(len) \
+ static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \
+ #len " has bad value")
+
+CHECK_FRAME_LEN(L2_MAX_LEN_PASTA);
+CHECK_FRAME_LEN(L2_MAX_LEN_PASST);
+CHECK_FRAME_LEN(L2_MAX_LEN_VU);
+
+/* We try to size the packet pools so that we can use a single batch for the entire
+ * packet buffer. This might be exceeded for vhost-user, though, which uses its
+ * own buffers rather than pkt_buf.
+ *
+ * This is just a tuning parameter, the code will work with slightly more
+ * overhead if it's incorrect. So, we estimate based on the minimum practical
+ * frame size - an empty UDP datagram - rather than the minimum theoretical
+ * frame size.
+ *
+ * FIXME: Profile to work out how big this actually needs to be to amortise
+ * per-batch syscall overheads
+ */
+#define TAP_MSGS_IP4 \
+ DIV_ROUND_UP(sizeof(pkt_buf), \
+ ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))
+#define TAP_MSGS_IP6 \
+ DIV_ROUND_UP(sizeof(pkt_buf), \
+ ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr))
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
-static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
-static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
+static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4);
+static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6);
#define TAP_SEQS 128 /* Different L4 tuples in one batch */
#define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */
/**
+ * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
+ * @c: Execution context
+ */
+unsigned long tap_l2_max_len(const struct ctx *c)
+{
+ /* NOLINTBEGIN(bugprone-branch-clone): values can be the same */
+ switch (c->mode) {
+ case MODE_PASST:
+ return L2_MAX_LEN_PASST;
+ case MODE_PASTA:
+ return L2_MAX_LEN_PASTA;
+ case MODE_VU:
+ return L2_MAX_LEN_VU;
+ }
+ /* NOLINTEND(bugprone-branch-clone) */
+ ASSERT(0);
+
+ return 0; /* Unreachable, for cppcheck's sake */
+}
+
+/**
* tap_send_single() - Send a single frame
* @c: Execution context
* @data: Packet buffer
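To put numbers on the TAP_MSGS_IP4/TAP_MSGS_IP6 sizing above: with an 8 MiB pkt_buf, the smallest practical IPv4 frame (14 bytes Ethernet, 20 bytes IPv4, 8 bytes UDP, 42 in total) gives roughly 200,000 pool slots, and the 62-byte IPv6 equivalent roughly 135,000. The same arithmetic as a standalone snippet, using the literal header sizes instead of the kernel structs:

#include <stdio.h>

int main(void)
{
	const unsigned long buf = 8UL << 20;	/* PKT_BUF_BYTES */
	const unsigned long min4 = 14 + 20 + 8;	/* Ethernet + IPv4 + UDP */
	const unsigned long min6 = 14 + 40 + 8;	/* Ethernet + IPv6 + UDP */

	/* DIV_ROUND_UP() equivalents */
	printf("TAP_MSGS_IP4 ~ %lu\n", (buf + min4 - 1) / min4);	/* 199729 */
	printf("TAP_MSGS_IP6 ~ %lu\n", (buf + min6 - 1) / min6);	/* 135301 */

	return 0;
}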
@@ -77,9 +130,18 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
*/
void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
{
- uint32_t vnet_len = htonl(l2len);
+ uint8_t padded[ETH_ZLEN] = { 0 };
struct iovec iov[2];
size_t iovcnt = 0;
+ uint32_t vnet_len;
+
+ if (l2len < ETH_ZLEN) {
+ memcpy(padded, data, l2len);
+ data = padded;
+ l2len = ETH_ZLEN;
+ }
+
+ vnet_len = htonl(l2len);
switch (c->mode) {
case MODE_PASST:
@@ -118,24 +180,26 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
* tap_push_l2h() - Build an L2 header for an inbound packet
* @c: Execution context
* @buf: Buffer address at which to generate header
+ * @src_mac: MAC address to be used as source for message
* @proto: Ethernet protocol number for L3
*
* Return: pointer at which to write the packet's payload
*/
-void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
+void *tap_push_l2h(const struct ctx *c, void *buf,
+ const void *src_mac, uint16_t proto)
{
struct ethhdr *eh = (struct ethhdr *)buf;
- /* TODO: ARP table lookup */
+ /* TODO: ARP lookup on tap side */
memcpy(eh->h_dest, c->guest_mac, ETH_ALEN);
- memcpy(eh->h_source, c->our_tap_mac, ETH_ALEN);
+ memcpy(eh->h_source, src_mac, ETH_ALEN);
eh->h_proto = ntohs(proto);
return eh + 1;
}
/**
* tap_push_ip4h() - Build IPv4 header for inbound packet, with checksum
- * @c: Execution context
+ * @ip4h: Buffer in which to build the IPv4 header
* @src: IPv4 source address
* @dst: IPv4 destination address
* @l4len: IPv4 payload length
@@ -164,7 +228,7 @@ void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
/**
* tap_push_uh4() - Build UDPv4 header with checksum
- * @c: Execution context
+ * @uh: Buffer in which to build the UDP header
* @src: IPv4 source address
* @sport: UDP source port
* @dst: IPv4 destination address
@@ -208,7 +272,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
{
size_t l4len = dlen + sizeof(struct udphdr);
char buf[USHRT_MAX];
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
@@ -222,13 +286,14 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
* @src: IPv4 source address
* @dst: IPv4 destination address
* @in: ICMP packet, including ICMP header
+ * @src_mac: MAC address to be used as source for message
* @l4len: ICMP packet length, including ICMP header
*/
void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
- const void *in, size_t l4len)
+ const void *in, const void *src_mac, size_t l4len)
{
char buf[USHRT_MAX];
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst,
l4len, IPPROTO_ICMP);
@@ -240,7 +305,7 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
/**
* tap_push_ip6h() - Build IPv6 header for inbound packet
- * @c: Execution context
+ * @ip6h: Buffer in which to build the IPv6 header
* @src: IPv6 source address
* @dst: IPv6 destination address
* @l4len: L4 payload length
@@ -266,7 +331,7 @@ void *tap_push_ip6h(struct ipv6hdr *ip6h,
/**
* tap_push_uh6() - Build UDPv6 header with checksum
- * @c: Execution context
+ * @uh: Buffer in which to build the UDP header
* @src: IPv6 source address
* @sport: UDP source port
* @dst: IPv6 destination address
@@ -314,7 +379,7 @@ void tap_udp6_send(const struct ctx *c,
{
size_t l4len = dlen + sizeof(struct udphdr);
char buf[USHRT_MAX];
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
l4len, IPPROTO_UDP, flow);
char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
@@ -329,14 +394,15 @@ void tap_udp6_send(const struct ctx *c,
* @src: IPv6 source address
* @dst: IPv6 destination address
* @in: ICMP packet, including ICMP header
+ * @src_mac: MAC address to be used as source for message
* @l4len: ICMP packet length, including ICMP header
*/
void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst,
- const void *in, size_t l4len)
+ const void *in, const void *src_mac, size_t l4len)
{
char buf[USHRT_MAX];
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, src_mac, ETH_P_IPV6);
struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, l4len,
IPPROTO_ICMPV6, 0);
@@ -454,13 +520,16 @@ static size_t tap_send_frames_passt(const struct ctx *c,
* @iov must have total length @bufs_per_frame * @nframes, with each set of
* @bufs_per_frame contiguous buffers representing a single frame.
*
- * Return: number of frames actually sent
+ * Return: number of frames actually sent, or accounted as sent
*/
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
size_t bufs_per_frame, size_t nframes)
{
size_t m;
+ if (c->fd_tap == -1)
+ return nframes;
+
if (!nframes)
return 0;
@@ -502,12 +571,13 @@ void eth_update_mac(struct ethhdr *eh,
memcpy(eh->h_source, eth_s, sizeof(eh->h_source));
}
-PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
+PACKET_POOL_DECL(pool_l4, UIO_MAXIOV);
/**
* struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
* @msgs: Count of messages in sequence
* @protocol: Protocol number
+ * @ttl: Time to live
* @source: Source port
* @dest: Destination port
* @saddr: Source address
@@ -516,6 +586,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
*/
static struct tap4_l4_t {
uint8_t protocol;
+ uint8_t ttl;
uint16_t source;
uint16_t dest;
@@ -535,6 +606,7 @@ static struct tap4_l4_t {
* @dest: Destination port
* @saddr: Source address
* @daddr: Destination address
+ * @hop_limit: Hop limit
* @msg: Array of messages that can be handled in a single call
*/
static struct tap6_l4_t {
@@ -547,6 +619,8 @@ static struct tap6_l4_t {
struct in6_addr saddr;
struct in6_addr daddr;
+ uint8_t hop_limit;
+
struct pool_l4_t p;
} tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
@@ -648,26 +722,31 @@ static int tap4_handler(struct ctx *c, const struct pool *in,
i = 0;
resume:
for (seq_count = 0, seq = NULL; i < in->count; i++) {
- size_t l2len, l3len, hlen, l4len;
+ size_t l3len, hlen, l4len;
+ struct ethhdr eh_storage;
+ struct iphdr iph_storage;
+ struct udphdr uh_storage;
const struct ethhdr *eh;
const struct udphdr *uh;
+ struct iov_tail data;
struct iphdr *iph;
- const char *l4h;
- packet_get(in, i, 0, 0, &l2len);
+ if (!packet_get(in, i, &data))
+ continue;
- eh = packet_get(in, i, 0, sizeof(*eh), &l3len);
+ eh = IOV_PEEK_HEADER(&data, eh_storage);
if (!eh)
continue;
if (ntohs(eh->h_proto) == ETH_P_ARP) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
-
- packet_add(pkt, l2len, (char *)eh);
- arp(c, pkt);
+ arp(c, &data);
continue;
}
- iph = packet_get(in, i, sizeof(*eh), sizeof(*iph), NULL);
+ if (!iov_drop_header(&data, sizeof(*eh)))
+ continue;
+ l3len = iov_tail_size(&data);
+
+ iph = IOV_PEEK_HEADER(&data, iph_storage);
if (!iph)
continue;
@@ -695,34 +774,32 @@ resume:
if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr)
c->ip4.addr_seen.s_addr = iph->saddr;
- l4h = packet_get(in, i, sizeof(*eh) + hlen, l4len, NULL);
- if (!l4h)
+ if (!iov_drop_header(&data, hlen))
+ continue;
+ if (iov_tail_size(&data) != l4len)
continue;
if (iph->protocol == IPPROTO_ICMP) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
-
if (c->no_icmp)
continue;
tap_packet_debug(iph, NULL, NULL, 0, NULL, 1);
- packet_add(pkt, l4len, l4h);
icmp_tap_handler(c, PIF_TAP, AF_INET,
&iph->saddr, &iph->daddr,
- pkt, now);
+ &data, now);
continue;
}
- uh = packet_get(in, i, sizeof(*eh) + hlen, sizeof(*uh), NULL);
+ uh = IOV_PEEK_HEADER(&data, uh_storage);
if (!uh)
continue;
if (iph->protocol == IPPROTO_UDP) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
+ struct iov_tail eh_data;
- packet_add(pkt, l2len, (char *)eh);
- if (dhcp(c, pkt))
+ packet_get(in, i, &eh_data);
+ if (dhcp(c, &eh_data))
continue;
}
@@ -735,7 +812,8 @@ resume:
#define L4_MATCH(iph, uh, seq) \
((seq)->protocol == (iph)->protocol && \
(seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \
- (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
+ (seq)->saddr.s_addr == (iph)->saddr && \
+ (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
#define L4_SET(iph, uh, seq) \
do { \
@@ -744,6 +822,7 @@ resume:
(seq)->dest = (uh)->dest; \
(seq)->saddr.s_addr = (iph)->saddr; \
(seq)->daddr.s_addr = (iph)->daddr; \
+ (seq)->ttl = (iph)->ttl; \
} while (0)
if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
@@ -770,7 +849,7 @@ resume:
#undef L4_SET
append:
- packet_add((struct pool *)&seq->p, l4len, l4h);
+ packet_add((struct pool *)&seq->p, &data);
}
for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) {
@@ -792,7 +871,7 @@ append:
for (k = 0; k < p->count; )
k += udp_tap_handler(c, PIF_TAP, AF_INET,
&seq->saddr, &seq->daddr,
- p, k, now);
+ seq->ttl, p, k, now);
}
}
@@ -824,20 +903,28 @@ resume:
for (seq_count = 0, seq = NULL; i < in->count; i++) {
size_t l4len, plen, check;
struct in6_addr *saddr, *daddr;
+ struct ipv6hdr ip6h_storage;
+ struct ethhdr eh_storage;
+ struct udphdr uh_storage;
const struct ethhdr *eh;
const struct udphdr *uh;
+ struct iov_tail data;
struct ipv6hdr *ip6h;
uint8_t proto;
- char *l4h;
- eh = packet_get(in, i, 0, sizeof(*eh), NULL);
+ if (!packet_get(in, i, &data))
+ return -1;
+
+ eh = IOV_REMOVE_HEADER(&data, eh_storage);
if (!eh)
continue;
- ip6h = packet_get(in, i, sizeof(*eh), sizeof(*ip6h), &check);
+ ip6h = IOV_PEEK_HEADER(&data, ip6h_storage);
if (!ip6h)
continue;
+ check = iov_tail_size(&data) - sizeof(*ip6h);
+
saddr = &ip6h->saddr;
daddr = &ip6h->daddr;
@@ -845,7 +932,7 @@ resume:
if (plen != check)
continue;
- if (!(l4h = ipv6_l4hdr(in, i, sizeof(*eh), &proto, &l4len)))
+ if (!ipv6_l4hdr(&data, &proto, &l4len))
continue;
if (IN6_IS_ADDR_LOOPBACK(saddr) || IN6_IS_ADDR_LOOPBACK(daddr)) {
@@ -871,7 +958,7 @@ resume:
}
if (proto == IPPROTO_ICMPV6) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
+ struct iov_tail ndp_data;
if (c->no_icmp)
continue;
@@ -879,28 +966,27 @@ resume:
if (l4len < sizeof(struct icmp6hdr))
continue;
- packet_add(pkt, l4len, l4h);
-
- if (ndp(c, (struct icmp6hdr *)l4h, saddr, pkt))
+ ndp_data = data;
+ if (ndp(c, saddr, &ndp_data))
continue;
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
icmp_tap_handler(c, PIF_TAP, AF_INET6,
- saddr, daddr, pkt, now);
+ saddr, daddr, &data, now);
continue;
}
if (l4len < sizeof(*uh))
continue;
- uh = (struct udphdr *)l4h;
+ uh = IOV_PEEK_HEADER(&data, uh_storage);
+ if (!uh)
+ continue;
if (proto == IPPROTO_UDP) {
- PACKET_POOL_P(pkt, 1, in->buf, in->buf_size);
+ struct iov_tail uh_data = data;
- packet_add(pkt, l4len, l4h);
-
- if (dhcpv6(c, pkt, saddr, daddr))
+ if (dhcpv6(c, &uh_data, saddr, daddr))
continue;
}
@@ -915,7 +1001,8 @@ resume:
(seq)->dest == (uh)->dest && \
(seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \
IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \
- IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
+ IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \
+ (seq)->hop_limit == (ip6h)->hop_limit)
#define L4_SET(ip6h, proto, uh, seq) \
do { \
@@ -925,6 +1012,7 @@ resume:
(seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \
(seq)->saddr = *saddr; \
(seq)->daddr = *daddr; \
+ (seq)->hop_limit = (ip6h)->hop_limit; \
} while (0)
if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
@@ -952,7 +1040,7 @@ resume:
#undef L4_SET
append:
- packet_add((struct pool *)&seq->p, l4len, l4h);
+ packet_add((struct pool *)&seq->p, &data);
}
for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) {
@@ -975,7 +1063,7 @@ append:
for (k = 0; k < p->count; )
k += udp_tap_handler(c, PIF_TAP, AF_INET6,
&seq->saddr, &seq->daddr,
- p, k, now);
+ seq->hop_limit, p, k, now);
}
}
@@ -1008,29 +1096,45 @@ void tap_handler(struct ctx *c, const struct timespec *now)
/**
* tap_add_packet() - Queue/capture packet, update notion of guest MAC address
* @c: Execution context
- * @l2len: Total L2 packet length
- * @p: Packet buffer
+ * @data: Packet to add to the pool
+ * @now: Current timestamp
*/
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
+void tap_add_packet(struct ctx *c, struct iov_tail *data,
+ const struct timespec *now)
{
+ struct ethhdr eh_storage;
const struct ethhdr *eh;
- pcap(p, l2len);
+ pcap_iov(data->iov, data->cnt, data->off);
- eh = (struct ethhdr *)p;
+ eh = IOV_PEEK_HEADER(data, eh_storage);
+ if (!eh)
+ return;
if (memcmp(c->guest_mac, eh->h_source, ETH_ALEN)) {
+ char bufmac[ETH_ADDRSTRLEN];
+
memcpy(c->guest_mac, eh->h_source, ETH_ALEN);
- proto_update_l2_buf(c->guest_mac, NULL);
+ debug("New guest MAC address observed: %s",
+ eth_ntop(c->guest_mac, bufmac, sizeof(bufmac)));
+ proto_update_l2_buf(c->guest_mac);
}
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
case ETH_P_IP:
- packet_add(pool_tap4, l2len, p);
+ if (!pool_can_fit(pool_tap4, data)) {
+ tap4_handler(c, pool_tap4, now);
+ pool_flush(pool_tap4);
+ }
+ packet_add(pool_tap4, data);
break;
case ETH_P_IPV6:
- packet_add(pool_tap6, l2len, p);
+ if (!pool_can_fit(pool_tap6, data)) {
+ tap6_handler(c, pool_tap6, now);
+ pool_flush(pool_tap6);
+ }
+ packet_add(pool_tap6, data);
break;
default:
break;
@@ -1046,10 +1150,10 @@ void tap_sock_reset(struct ctx *c)
info("Client connection closed%s", c->one_off ? ", exiting" : "");
if (c->one_off)
- _exit(EXIT_SUCCESS);
+ passt_exit(EXIT_SUCCESS);
/* Close the connected socket, wait for a new connection */
- epoll_del(c, c->fd_tap);
+ epoll_del(c->epollfd, c->fd_tap);
close(c->fd_tap);
c->fd_tap = -1;
if (c->mode == MODE_VU)
@@ -1080,7 +1184,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
do {
n = recv(c->fd_tap, pkt_buf + partial_len,
- TAP_BUF_BYTES - partial_len, MSG_DONTWAIT);
+ sizeof(pkt_buf) - partial_len, MSG_DONTWAIT);
} while ((n < 0) && errno == EINTR);
if (n < 0) {
@@ -1096,8 +1200,9 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
while (n >= (ssize_t)sizeof(uint32_t)) {
uint32_t l2len = ntohl_unaligned(p);
+ struct iov_tail data;
- if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) {
+ if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) {
err("Bad frame size from guest, resetting connection");
tap_sock_reset(c);
return;
@@ -1110,7 +1215,8 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now)
p += sizeof(uint32_t);
n -= sizeof(uint32_t);
- tap_add_packet(c, l2len, p);
+ data = IOV_TAIL_FROM_BUF(p, l2len, 0);
+ tap_add_packet(c, &data, now);
p += l2len;
n -= l2len;
@@ -1151,8 +1257,12 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
tap_flush_pools();
- for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) {
- len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU);
+ for (n = 0;
+ n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA);
+ n += len) {
+ struct iov_tail data;
+
+ len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);
if (len == 0) {
die("EOF on tap device, exiting");
@@ -1170,10 +1280,11 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)
/* Ignore frames of bad length */
if (len < (ssize_t)sizeof(struct ethhdr) ||
- len > (ssize_t)ETH_MAX_MTU)
+ len > (ssize_t)L2_MAX_LEN_PASTA)
continue;
- tap_add_packet(c, len, pkt_buf + n);
+ data = IOV_TAIL_FROM_BUF(pkt_buf + n, len, 0);
+ tap_add_packet(c, &data, now);
}
tap_handler(c, now);
@@ -1227,14 +1338,36 @@ static void tap_backend_show_hints(struct ctx *c)
static void tap_sock_unix_init(const struct ctx *c)
{
union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN };
- struct epoll_event ev = { 0 };
listen(c->fd_tap_listen, 0);
ref.fd = c->fd_tap_listen;
- ev.events = EPOLLIN | EPOLLET;
- ev.data.u64 = ref.u64;
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev);
+
+ epoll_add(c->epollfd, EPOLLIN | EPOLLET, ref);
+}
+
+/**
+ * tap_is_ready() - Check if tap interface is ready to send packets
+ * @c: Execution context
+ *
+ * For pasta mode, checks if the tap interface is up.
+ * For other modes, just checks if fd_tap is valid.
+ *
+ * Return: true if ready, false otherwise
+ */
+bool tap_is_ready(const struct ctx *c)
+{
+ if (c->fd_tap < 0)
+ return false;
+
+ if (c->mode == MODE_PASTA) {
+ /* If pasta_conf_ns is set, the interface was configured and
+ * brought up during initialization. If not, it's still down.
+ */
+ return c->pasta_conf_ns;
+ }
+
+ return true;
}
/**
@@ -1243,7 +1376,6 @@ static void tap_sock_unix_init(const struct ctx *c)
*/
static void tap_start_connection(const struct ctx *c)
{
- struct epoll_event ev = { 0 };
union epoll_ref ref = { 0 };
ref.fd = c->fd_tap;
@@ -1259,9 +1391,15 @@ static void tap_start_connection(const struct ctx *c)
break;
}
- ev.events = EPOLLIN | EPOLLRDHUP;
- ev.data.u64 = ref.u64;
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
+ epoll_add(c->epollfd, EPOLLIN | EPOLLRDHUP, ref);
+
+ if (!tap_is_ready(c))
+ return;
+
+ if (c->ifi4)
+ arp_send_init_req(c);
+ if (c->ifi6 && !c->no_ndp)
+ ndp_send_init_req(c);
}
/**
@@ -1367,12 +1505,12 @@ static void tap_sock_tun_init(struct ctx *c)
* @base: Buffer base
* @size Buffer size
*/
-void tap_sock_update_pool(void *base, size_t size)
+static void tap_sock_update_pool(void *base, size_t size)
{
int i;
- pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, base, size);
- pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, base, size);
+ pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size);
+ pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size);
for (i = 0; i < TAP_SEQS; i++) {
tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size);
@@ -1388,8 +1526,8 @@ void tap_sock_update_pool(void *base, size_t size)
void tap_backend_init(struct ctx *c)
{
if (c->mode == MODE_VU) {
- tap_sock_update_pool(NULL, 0);
vu_init(c);
+ tap_sock_update_pool(&c->vdev->memory, 0);
} else {
tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
}
@@ -1410,11 +1548,11 @@ void tap_backend_init(struct ctx *c)
case MODE_PASST:
tap_sock_unix_init(c);
- /* In passt mode, we don't know the guest's MAC address until it
- * sends us packets. Use the broadcast address so that our
- * first packets will reach it.
+ /* In passt mode, we don't know the guest's MAC address until
+ * it sends us packets. Until then, use the broadcast address
+ * so that our first packets will have a chance to reach it.
*/
- memset(&c->guest_mac, 0xff, sizeof(c->guest_mac));
+ memcpy(&c->guest_mac, MAC_BROADCAST, sizeof(c->guest_mac));
break;
}
diff --git a/tap.h b/tap.h
index a2c3b87..ee22a9d 100644
--- a/tap.h
+++ b/tap.h
@@ -6,6 +6,31 @@
#ifndef TAP_H
#define TAP_H
+/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header)
+ *
+ * The kernel tuntap device imposes a maximum frame size of 65535 including
+ * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).
+ */
+#define L2_MAX_LEN_PASTA USHRT_MAX
+
+/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header)
+ *
+ * The only structural limit the QEMU socket protocol imposes on frames is
+ * (2^32-1) bytes, but that would be ludicrously long in practice. For now,
+ * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate
+ * limit with more precision.
+ */
+#define L2_MAX_LEN_PASST USHRT_MAX
+
+/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header)
+ *
+ * vhost-user allows multiple buffers per frame, each of which can be quite
+ * large, so the inherent frame size limit is very large, much larger than is
+ * actually useful for IP. For now, limit it arbitrarily to 65535 bytes.
+ * FIXME: Work out an appropriate limit with more precision.
+ */
+#define L2_MAX_LEN_VU USHRT_MAX
+
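tap_l2_max_len(), declared further down, presumably maps the current mode to the matching limit; its body lives in tap.c and is not part of this header, so the following is only a sketch of the assumed selection:

	static unsigned long l2_max_len_sketch(const struct ctx *c)
	{
		switch (c->mode) {	/* assumption: mirrors the tap.c checks */
		case MODE_PASTA:
			return L2_MAX_LEN_PASTA;
		case MODE_PASST:
			return L2_MAX_LEN_PASST;
		case MODE_VU:
			return L2_MAX_LEN_VU;
		}
		return USHRT_MAX;
	}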
struct udphdr;
/**
@@ -21,8 +46,8 @@ struct tap_hdr {
* @c: Execution context
* @taph: Pointer to tap specific header buffer
*
- * Returns: A struct iovec covering the correct portion of @taph to use as the
- * tap specific header in the current configuration.
+ * Return: a struct iovec covering the correct portion of @taph to use as the
+ * tap specific header in the current configuration.
*/
static inline struct iovec tap_hdr_iov(const struct ctx *c,
struct tap_hdr *thdr)
@@ -44,7 +69,9 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
thdr->vnet_len = htonl(l2len);
}
-void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
+unsigned long tap_l2_max_len(const struct ctx *c);
+void *tap_push_l2h(const struct ctx *c, void *buf,
+ const void *src_mac, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto);
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
@@ -64,7 +91,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
struct in_addr dst, in_port_t dport,
const void *in, size_t dlen);
void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
- const void *in, size_t l4len);
+ const void *in, const void *src_mac, size_t l4len);
const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
const struct in6_addr *src);
void *tap_push_ip6h(struct ipv6hdr *ip6h,
@@ -76,12 +103,13 @@ void tap_udp6_send(const struct ctx *c,
uint32_t flow, void *in, size_t dlen);
void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst,
- const void *in, size_t l4len);
+ const void *in, const void *src_mac, size_t l4len);
void tap_send_single(const struct ctx *c, const void *data, size_t l2len);
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
size_t bufs_per_frame, size_t nframes);
void eth_update_mac(struct ethhdr *eh,
const unsigned char *eth_d, const unsigned char *eth_s);
+bool tap_is_ready(const struct ctx *c);
void tap_listen_handler(struct ctx *c, uint32_t events);
void tap_handler_pasta(struct ctx *c, uint32_t events,
const struct timespec *now);
@@ -89,10 +117,9 @@ void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now);
int tap_sock_unix_open(char *sock_path);
void tap_sock_reset(struct ctx *c);
-void tap_sock_update_pool(void *base, size_t size);
void tap_backend_init(struct ctx *c);
void tap_flush_pools(void);
void tap_handler(struct ctx *c, const struct timespec *now);
-void tap_add_packet(struct ctx *c, ssize_t l2len, char *p);
-
+void tap_add_packet(struct ctx *c, struct iov_tail *data,
+ const struct timespec *now);
#endif /* TAP_H */
diff --git a/tcp.c b/tcp.c
index 32a08bd..b179e39 100644
--- a/tcp.c
+++ b/tcp.c
@@ -179,14 +179,16 @@
*
* Timeouts are implemented by means of timerfd timers, set based on flags:
*
- * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag
- * ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the
- * connection
+ * - RTO_INIT: if no ACK segment was received from tap/guest, either during
+ * handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
+ * sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
+ * from the socket and reset sequence to what was acknowledged. This is the
+ * timeout for the first retry, in seconds. Retry TCP_MAX_RETRIES times for
+ * established connections, or (syn_retries + syn_linear_timeouts) times
+ * during the handshake, then reset the connection
*
- * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
- * data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
- * socket and reset sequence to what was acknowledged. If this persists for
- * more than TCP_MAX_RETRANS times in a row, reset the connection
+ * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and
+ * RTO is less than this, re-initialise RTO to this for data retransmissions
*
* - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
* with TAP_FIN_SENT event), and no ACK is received within this time, reset
@@ -200,9 +202,13 @@
* - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
* either side, the connection is reset
*
- * - ACK_INTERVAL elapsed after data segment received from tap without having
+ * - RTT / 2 elapsed after data segment received from tap without having
* sent an ACK segment, or zero-sized window advertised to tap/guest (flag
- * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
+ * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent.
+ *
+ * RTT, here, is an approximation of the RTT value reported by the kernel via
+ * TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to
+ * RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly.
*
*
* Summary of data flows (with ESTABLISHED event)
@@ -279,7 +285,6 @@
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
-#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/timerfd.h>
@@ -309,6 +314,17 @@
#include "tcp_internal.h"
#include "tcp_buf.h"
#include "tcp_vu.h"
+#include "epoll_ctl.h"
+
+/*
+ * The size of the TCP header (including options) is given by doff (Data
+ * Offset), a 4-bit value specifying the number of 32-bit words in the header.
+ * The maximum value of doff is 15 [(1 << 4) - 1].
+ * The maximum length in bytes of the options is therefore 15, minus the
+ * number of 32-bit words in the minimal TCP header (5), multiplied by the
+ * length of a 32-bit word (4).
+ */
+#define OPTLEN_MAX (((1UL << 4) - 1 - 5) * 4UL)
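With doff capped at 15 and the minimal header taking 5 words, this works out to (15 - 5) * 4 = 40 bytes. An illustrative compile-time check, not part of the patch:

	_Static_assert(OPTLEN_MAX == 40, "TCP options are at most 40 bytes");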
#ifndef __USE_MISC
/* From Linux UAPI, missing in netinet/tcp.h provided by musl */
@@ -329,15 +345,24 @@ enum {
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
-#define ACK_INTERVAL 10 /* ms */
-#define SYN_TIMEOUT 10 /* s */
-#define ACK_TIMEOUT 2
+#define RTO_INIT 1 /* s, RFC 6298 */
+#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */
#define FIN_TIMEOUT 60
#define ACT_TIMEOUT 7200
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
+/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */
+#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */
+/* ...examples: 5 MB sent * 500 us RTT, 250 kB * 10 ms, 8 kB * 300 ms */
+#define SNDBUF_BOOST_FACTOR 150 /* % */
+#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */
+/* 12 MB sent * 500 us RTT, 600 kB * 10 ms, 20 kB * 300 ms */
+
+/* Ratio of buffer to bandwidth * delay product implying interactive traffic */
+#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. < 5% of buffer) */
+
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define CONN_IS_CLOSING(conn) \
@@ -355,6 +380,15 @@ uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX];
#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */
+#define SYN_RETRIES "/proc/sys/net/ipv4/tcp_syn_retries"
+#define SYN_LINEAR_TIMEOUTS "/proc/sys/net/ipv4/tcp_syn_linear_timeouts"
+#define RTO_MAX_MS "/proc/sys/net/ipv4/tcp_rto_max_ms"
+
+#define SYN_RETRIES_DEFAULT 6
+#define SYN_LINEAR_TIMEOUTS_DEFAULT 4
+#define RTO_MAX_DEFAULT 120 /* s */
+#define MAX_SYNCNT 127 /* derived from kernel's limit */
+
/* "Extended" data (not stored in the flow table) for TCP flow migration */
static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX];
@@ -377,7 +411,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
static const char *tcp_flag_str[] __attribute((__unused__)) = {
"STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE",
- "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS",
+ "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED",
};
/* Listening sockets, used for automatic port forwarding in pasta mode only */
@@ -389,7 +423,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
-char tcp_buf_discard [MAX_WINDOW];
+char tcp_buf_discard [BUF_DISCARD_SIZE];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
@@ -402,11 +436,13 @@ socklen_t tcp_info_size;
sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
-#define snd_wnd_cap tcp_info_cap(snd_wnd)
+#define snd_wnd_cap tcp_info_cap(snd_wnd)
/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
-#define bytes_acked_cap tcp_info_cap(bytes_acked)
+#define bytes_acked_cap tcp_info_cap(bytes_acked)
/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
-#define min_rtt_cap tcp_info_cap(min_rtt)
+#define min_rtt_cap tcp_info_cap(min_rtt)
+/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */
+#define delivery_rate_cap tcp_info_cap(delivery_rate)
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@@ -434,19 +470,20 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx)
}
/**
- * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
- * @s: Socket to update
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported
+ * @conn: Pointer to the TCP connection structure
* @offset: Offset in bytes
*
- * Return: -1 when it fails, 0 otherwise.
+ * Return: -1 when it fails, 0 otherwise.
*/
-int tcp_set_peek_offset(int s, int offset)
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset)
{
if (!peek_offset_cap)
return 0;
- if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) {
- err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+ if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF,
+ &offset, sizeof(offset))) {
+ flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset);
return -1;
}
return 0;
@@ -455,7 +492,7 @@ int tcp_set_peek_offset(int s, int offset)
/**
* tcp_conn_epoll_events() - epoll events mask for given connection state
* @events: Current connection events
- * @conn_flags Connection flags
+ * @conn_flags: Connection flags
*
* Return: epoll events mask corresponding to implied connection state
*/
@@ -493,25 +530,27 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
*/
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
- int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
.flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
struct epoll_event ev = { .data.u64 = ref.u64 };
+ int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
+ : c->epollfd;
if (conn->events == CLOSED) {
- if (conn->in_epoll)
- epoll_del(c, conn->sock);
+ if (flow_in_epoll(&conn->f))
+ epoll_del(epollfd, conn->sock);
if (conn->timer != -1)
- epoll_del(c, conn->timer);
+ epoll_del(epollfd, conn->timer);
return 0;
}
ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
- if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
+ if (epoll_ctl(epollfd, m, conn->sock, &ev))
return -errno;
- conn->in_epoll = true;
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
if (conn->timer != -1) {
union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
@@ -520,7 +559,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
struct epoll_event ev_t = { .data.u64 = ref_t.u64,
.events = EPOLLIN | EPOLLET };
- if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
+ if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD,
+ conn->timer, &ev_t))
return -errno;
}
@@ -531,8 +571,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
* tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
* @c: Execution context
* @conn: Connection pointer
- *
- * #syscalls timerfd_create timerfd_settime
+ * #syscalls timerfd_create timerfd_settime|timerfd_settime32
*/
static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
@@ -547,6 +586,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
.flow = FLOW_IDX(conn) };
struct epoll_event ev = { .data.u64 = ref.u64,
.events = EPOLLIN | EPOLLET };
+ int epollfd = flow_epollfd(&conn->f);
int fd;
fd = timerfd_create(CLOCK_MONOTONIC, 0);
@@ -559,7 +599,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
}
conn->timer = fd;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
+ if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
flow_dbg_perror(conn, "failed to add timer");
close(conn->timer);
conn->timer = -1;
@@ -568,21 +608,34 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
}
if (conn->flags & ACK_TO_TAP_DUE) {
- it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
+ it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000);
+ it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) *
+ 1000;
} else if (conn->flags & ACK_FROM_TAP_DUE) {
+ int exp = conn->retries, timeout = RTO_INIT;
if (!(conn->events & ESTABLISHED))
- it.it_value.tv_sec = SYN_TIMEOUT;
- else
- it.it_value.tv_sec = ACK_TIMEOUT;
+ exp -= c->tcp.syn_linear_timeouts;
+ else if (conn->flags & SYN_RETRIED)
+ timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES);
+ timeout <<= MAX(exp, 0);
+ it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max);
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
it.it_value.tv_sec = FIN_TIMEOUT;
} else {
it.it_value.tv_sec = ACT_TIMEOUT;
}
- flow_dbg(conn, "timer expires in %llu.%03llus",
- (unsigned long long)it.it_value.tv_sec,
- (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+ if (conn->flags & ACK_TO_TAP_DUE) {
+ flow_trace(conn, "timer expires in %llu.%02llums",
+ (unsigned long long)it.it_value.tv_sec * 1000 +
+ it.it_value.tv_nsec / 1000 / 1000,
+ (unsigned long long)it.it_value.tv_nsec
+ / 1000 / 10 % 100);
+ } else {
+ flow_dbg(conn, "timer expires in %llu.%03llus",
+ (unsigned long long)it.it_value.tv_sec,
+ (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+ }
if (timerfd_settime(conn->timer, 0, &it, NULL))
flow_perror(conn, "failed to set timer");
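Assuming the kernel defaults read by tcp_get_rto_params() further down (syn_retries 6, syn_linear_timeouts 4, rto_max 120 s), the shifts above give roughly this handshake retransmission schedule, up to whatever retry limit applies:

	/* retries:  0  1  2  3  4  5  6  7  8  9
	 * timeout:  1  1  1  1  1  2  4  8 16 32	seconds, clamped at rto_max
	 *
	 * i.e. linear back-off for the first syn_linear_timeouts retries, then
	 * exponential, until the retry limit is reached and the connection is
	 * reset (see tcp_timer_handler()).
	 */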
@@ -680,12 +733,13 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
flow_dbg(conn, "%s",
num == -1 ? "CLOSED" : tcp_event_str[num]);
- if (event == CLOSED)
- flow_hash_remove(c, TAP_SIDX(conn));
- else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
+ if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) {
conn_flag(c, conn, ACTIVE_CLOSE);
- else
+ } else {
+ if (event == CLOSED)
+ flow_hash_remove(c, TAP_SIDX(conn));
tcp_epoll_ctl(c, conn);
+ }
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
tcp_timer_ctl(c, conn);
@@ -744,7 +798,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
}
/**
- * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
+ * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.75 usage)
* @conn: Connection pointer
*/
static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
@@ -759,11 +813,7 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
return;
}
- v = sndbuf;
- if (v >= SNDBUF_BIG)
- v /= 2;
- else if (v > SNDBUF_SMALL)
- v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
+ v = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL, SNDBUF_BIG, 75);
SNDBUF_SET(conn, MIN(INT_MAX, v));
}
@@ -908,8 +958,10 @@ static void tcp_fill_header(struct tcphdr *th,
/**
* tcp_fill_headers() - Fill 802.3, IP, TCP headers
+ * @c: Execution context
* @conn: Connection pointer
* @taph: tap backend specific header
+ * @eh: Pointer to Ethernet header
* @ip4h: Pointer to IPv4 header, or NULL
* @ip6h: Pointer to IPv6 header, or NULL
* @th: Pointer to TCP header
@@ -918,14 +970,15 @@ static void tcp_fill_header(struct tcphdr *th,
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*/
-void tcp_fill_headers(const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
+void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+ struct tap_hdr *taph, struct ethhdr *eh,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
{
const struct flowside *tapside = TAPFLOW(conn);
size_t l4len = iov_tail_size(payload) + sizeof(*th);
+ uint8_t *omac = conn->f.tap_omac;
size_t l3len = l4len;
uint32_t psum = 0;
@@ -951,6 +1004,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
*src4, *dst4);
}
+ eh->h_proto = htons_constant(ETH_P_IP);
}
if (ip6h) {
@@ -971,8 +1025,14 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
&ip6h->saddr,
&ip6h->daddr);
}
+ eh->h_proto = htons_constant(ETH_P_IPV6);
}
+	/* Check if the neighbour table has a recorded MAC address */
+ if (MAC_IS_UNDEF(omac))
+ fwd_neigh_mac_get(c, &tapside->oaddr, omac);
+ eth_update_mac(eh, NULL, omac);
+
tcp_fill_header(th, conn, seq);
if (no_tcp_csum)
@@ -980,7 +1040,36 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
else
tcp_update_csum(psum, th, payload);
- tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
+ tap_hdr_update(taph, MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN));
+}
+
+/**
+ * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning
+ * @conn: Connection pointer
+ * @tinfo: tcp_info from kernel, must be pre-fetched
+ *
+ * Return: increased sending buffer to use as a limit for advertised window
+ */
+static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn,
+ const struct tcp_info_linux *tinfo)
+{
+ unsigned long bytes_rtt_product;
+
+ if (!bytes_acked_cap)
+ return SNDBUF_GET(conn);
+
+ /* This is *not* a bandwidth-delay product, but it's somewhat related:
+ * as we send more data (usually at the beginning of a connection), we
+ * try to make the sending buffer progressively grow, with the RTT as a
+ * factor (longer delay, bigger buffer needed).
+ */
+ bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked *
+ tinfo->tcpi_rtt / 1000 / 1000;
+
+ return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product,
+ SNDBUF_BOOST_BYTES_RTT_LO,
+ SNDBUF_BOOST_BYTES_RTT_HI,
+ SNDBUF_BOOST_FACTOR);
}
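As a rough illustration of the thresholds, assuming clamped_scale() interpolates linearly between them (its implementation is not shown in this hunk):

	/* bytes_acked * RTT		boosted limit
	 * 250 kB * 10 ms = 2500 B*s	SNDBUF		(at/below _LO: no boost)
	 * 425 kB * 10 ms = 4250 B*s	~1.25 * SNDBUF	(assumed midpoint)
	 * 600 kB * 10 ms = 6000 B*s	1.5 * SNDBUF	(at/above _HI: full boost)
	 */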
/**
@@ -991,6 +1080,8 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
* @tinfo: tcp_info from kernel, can be NULL if not pre-fetched
*
* Return: 1 if sequence or window were updated, 0 otherwise
+ *
+ * #syscalls ioctl
*/
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
bool force_seq, struct tcp_info_linux *tinfo)
@@ -1001,32 +1092,75 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
socklen_t sl = sizeof(*tinfo);
struct tcp_info_linux tinfo_new;
uint32_t new_wnd_to_tap = prev_wnd_to_tap;
+ bool ack_everything = true;
int s = conn->sock;
- if (!bytes_acked_cap) {
- conn->seq_ack_to_tap = conn->seq_from_tap;
- if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
- conn->seq_ack_to_tap = prev_ack_to_tap;
- } else {
- if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL ||
- tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) ||
- (conn->flags & LOCAL) || force_seq) {
- conn->seq_ack_to_tap = conn->seq_from_tap;
- } else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
- if (!tinfo) {
- tinfo = &tinfo_new;
- if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
- return 0;
- }
+ /* At this point we could ack all the data we've accepted for forwarding
+ * (seq_from_tap). When possible, however, we want to only acknowledge
+ * what the peer has acknowledged. This makes it appear to the guest
+ * more like a direct connection to the peer, and may improve flow
+ * control behaviour.
+ *
+ * For it to be possible and worth it we need:
+ * - The TCP_INFO Linux extensions which give us the peer acked bytes
+ * and the delivery rate (outbound bandwidth at receiver)
+ * - Not to be told not to (force_seq)
+ * - Not half-closed in the peer->guest direction
+ * With no data coming from the peer, we might not get events which
+ * would prompt us to recheck bytes_acked. We could poll on a
+ * timer, but that's more trouble than it's worth.
+ * - Not a host local connection
+ * Data goes from socket to socket, with nothing meaningfully "in
+ * flight".
+ * - Not a pseudo-local connection (e.g. to a VM on the same host)
+ * If it is, there's not enough in flight to bother.
+ * - Sending buffer significantly larger than bandwidth * delay product
+ * Meaning we're not bandwidth-bound and this is likely to be
+ * interactive traffic where we want to preserve transparent
+ * connection behaviour and latency.
+ *
+ * Otherwise, we probably want to maximise throughput, which needs
+ * sending buffer auto-tuning, triggered in turn by filling up the
+ * outbound socket queue.
+ */
+ if (bytes_acked_cap && delivery_rate_cap && !force_seq &&
+ !CONN_IS_CLOSING(conn) &&
+ !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) {
+ if (!tinfo) {
+ tinfo = &tinfo_new;
+ if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
+ return 0;
+ }
- conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
- conn->seq_init_from_tap;
+ /* This trips a cppcheck bug in some versions, including
+ * cppcheck 2.18.3.
+ * https://trac.cppcheck.net/ticket/14191
+ */
+ /* cppcheck-suppress [uninitvar,unmatchedSuppression] */
+ if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt *
+ tinfo->tcpi_delivery_rate /
+ 1000 / 1000 *
+ SNDBUF_TO_BW_DELAY_INTERACTIVE)
+ ack_everything = false;
+ }
- if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
- conn->seq_ack_to_tap = prev_ack_to_tap;
- }
+ if (ack_everything) {
+ /* Fall back to acknowledging everything we got */
+ conn->seq_ack_to_tap = conn->seq_from_tap;
+ } else {
+ /* cppcheck bug 14191 again, see above */
+ /* cppcheck-suppress [uninitvar,unmatchedSuppression] */
+ conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
+ conn->seq_init_from_tap;
}
+ /* It's occasionally possible for us to go from using the fallback above
+ * to the tcpi_bytes_acked method. In that case, we must be careful not
+ * to let our ACKed sequence go backwards.
+ */
+ if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
+ conn->seq_ack_to_tap = prev_ack_to_tap;
+
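A worked example of the heuristic above, with invented numbers: at a 20 ms RTT and a delivery rate of 12.5 MB/s, the bandwidth-delay product is 250 kB, so a 6 MB send buffer exceeds 20 times that and the flow is treated as interactive:

	/* 20 000 us * 12 500 000 B/s / 1000 / 1000 = 250 000 B	(BDP)
	 * 250 000 B * SNDBUF_TO_BW_DELAY_INTERACTIVE = 5 000 000 B
	 * SNDBUF = 6 000 000 B > 5 000 000 B  ->  ack_everything = false,
	 * that is, only bytes acknowledged by the peer are ACKed to the guest
	 */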
if (!snd_wnd_cap) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
@@ -1048,9 +1182,53 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
} else {
+ uint32_t sendq;
+ int limit;
+
+ if (ioctl(s, SIOCOUTQ, &sendq)) {
+ debug_perror("SIOCOUTQ on socket %i, assuming 0", s);
+ sendq = 0;
+ }
tcp_get_sndbuf(conn);
- new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
- SNDBUF_GET(conn));
+
+ if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */
+ limit = 0;
+ else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn))
+ limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq;
+ else
+ limit = SNDBUF_GET(conn) - (int)sendq;
+
+ /* If the sender uses mechanisms to prevent Silly Window
+ * Syndrome (SWS, described in RFC 813 Section 3) it's critical
+ * that, should the window ever become less than the MSS, we
+ * advertise a new value once it increases again to be above it.
+ *
+ * The mechanism to avoid SWS in the kernel is, implicitly,
+ * implemented by Nagle's algorithm (which was proposed after
+ * RFC 813).
+ *
+ * To this end, for simplicity, approximate a window value below
+ * the MSS to zero, as we already have mechanisms in place to
+ * force updates after the window becomes zero. This matches the
+ * suggestion from RFC 813, Section 4.
+ *
+ * But don't do this if, either:
+ *
+ * - there's nothing in the outbound queue: the size of the
+ * sending buffer is limiting us, and it won't increase if we
+ * don't send data, so there's no point in waiting, or
+ *
+ * - we haven't sent data in a while (somewhat arbitrarily, ten
+ * times the RTT), as that might indicate that the receiver
+ * will only process data in batches that are large enough,
+ * but we won't send enough to fill one because we're stuck
+ * with pending data in the outbound queue
+ */
+ if (limit < MSS_GET(conn) && sendq &&
+ tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10)
+ limit = 0;
+
+ new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit);
}
new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
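To make the zero-window approximation above concrete, again with invented numbers: a 64 kB send buffer with 63 kB already queued leaves a 1 kB limit, below a typical 1460-byte MSS, so a zero window is advertised until the limit grows back above the MSS:

	/* SNDBUF 65 536 B, sendq 64 512 B  ->  limit = 1024 B
	 * limit (1024) < MSS (1460), sendq != 0, data sent within ~10 RTTs
	 *   ->  advertise a zero window rather than a sub-MSS one
	 */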
@@ -1070,6 +1248,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
conn_flag(c, conn, ACK_TO_TAP_DUE);
out:
+ /* Opportunistically store RTT approximation on valid TCP_INFO data */
+ if (tinfo)
+ RTT_SET(conn, tinfo->tcpi_rtt);
+
return new_wnd_to_tap != prev_wnd_to_tap ||
conn->seq_ack_to_tap != prev_ack_to_tap;
}
@@ -1078,7 +1260,7 @@ out:
* tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters
* @c: Execution context
* @conn: Connection pointer
- * @seq Current ACK sequence, host order
+ * @seq: Current ACK sequence, host order
*/
static void tcp_update_seqack_from_tap(const struct ctx *c,
struct tcp_tap_conn *conn, uint32_t seq)
@@ -1091,18 +1273,38 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
if (SEQ_LT(seq, conn->seq_to_tap))
conn_flag(c, conn, ACK_FROM_TAP_DUE);
- conn->retrans = 0;
+ conn->retries = 0;
conn->seq_ack_from_tap = seq;
}
}
/**
+ * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: 0 on success, -1 on failure, with connection reset
+ */
+static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ conn->seq_to_tap = conn->seq_ack_from_tap;
+ conn->events &= ~TAP_FIN_SENT;
+
+ if (tcp_set_peek_offset(conn, 0)) {
+ tcp_rst(c, conn);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
* tcp_prepare_flags() - Prepare header for flags-only segment (no payload)
* @c: Execution context
* @conn: Connection pointer
* @flags: TCP flags: if not set, send segment only if ACK is due
* @th: TCP header to update
- * @data: buffer to store TCP option
+ * @opts: TCP option buffer (output parameter)
* @optlen: size of the TCP option buffer (output parameter)
*
* Return: < 0 error code on connection reset,
@@ -1165,12 +1367,14 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
th->doff = (sizeof(*th) + *optlen) / 4;
th->ack = !!(flags & ACK);
+ th->psh = !!(flags & PSH);
th->rst = !!(flags & RST);
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
if (th->ack) {
- if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
+ if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
+ conn->wnd_to_tap)
conn_flag(c, conn, ~ACK_TO_TAP_DUE);
else
conn_flag(c, conn, ACK_TO_TAP_DUE);
@@ -1236,30 +1440,41 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
/**
* tcp_tap_window_update() - Process an updated window from tap side
+ * @c: Execution context
* @conn: Connection pointer
- * @window: Window value, host order, unscaled
+ * @wnd: Window value, host order, unscaled
+ *
+ * Return: false on zero window (not stored to wnd_from_tap), true otherwise
*/
-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
+static bool tcp_tap_window_update(const struct ctx *c,
+ struct tcp_tap_conn *conn, unsigned wnd)
{
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
/* Work-around for bug introduced in peer kernel code, commit
- * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
- * We don't update if window shrank to zero.
+ * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't
+ * update the window if it shrank to zero, so that we'll eventually
+ * retry to send data, but rewind the sequence as that obviously implies
+ * that no data beyond the updated window will be acknowledged.
*/
- if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
- return;
+ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
+ tcp_rewind_seq(c, conn);
+ return false;
+ }
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
/* FIXME: reflect the tap-side receiver's window back to the sock-side
* sender by adjusting SO_RCVBUF? */
+ return true;
}
/**
* tcp_init_seq() - Calculate initial sequence number according to RFC 6528
* @hash: Hash of connection details
* @now: Current timestamp
+ *
+ * Return: the calculated 32-bit initial sequence number.
*/
static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now)
{
@@ -1316,7 +1531,7 @@ static int tcp_conn_new_sock(sa_family_t af)
* tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
* @af: Address family (AF_INET or AF_INET6)
*
- * Return: Socket fd on success, -errno on failure
+ * Return: socket fd on success, -errno on failure
*/
int tcp_conn_sock(sa_family_t af)
{
@@ -1375,12 +1590,11 @@ static void tcp_bind_outbound(const struct ctx *c,
{
const struct flowside *tgt = &conn->f.side[TGTSIDE];
union sockaddr_inany bind_sa;
- socklen_t sl;
- pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->oaddr, tgt->oport);
+ pif_sockaddr(c, &bind_sa, PIF_HOST, &tgt->oaddr, tgt->oport);
if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) {
- if (bind(s, &bind_sa.sa, sl)) {
+ if (bind(s, &bind_sa.sa, socklen_inany(&bind_sa))) {
char sstr[INANY_ADDRSTRLEN];
flow_dbg_perror(conn,
@@ -1440,7 +1654,6 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
union flow *flow;
int s = -1, mss;
uint64_t hash;
- socklen_t sl;
if (!(flow = flow_alloc()))
return;
@@ -1473,7 +1686,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
if ((s = tcp_conn_sock(af)) < 0)
goto cancel;
- pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport);
+ pif_sockaddr(c, &sa, PIF_HOST, &tgt->eaddr, tgt->eport);
/* Use bind() to check if the target address is local (EADDRINUSE or
* similar) and already bound, and set the LOCAL flag in that case.
@@ -1485,7 +1698,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
*
* So, if bind() succeeds, close the socket, get a new one, and proceed.
*/
- if (bind(s, &sa.sa, sl)) {
+ if (bind(s, &sa.sa, socklen_inany(&sa))) {
if (errno != EADDRNOTAVAIL && errno != EACCES)
conn_flag(c, conn, LOCAL);
} else {
@@ -1525,7 +1738,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
tcp_bind_outbound(c, conn, s);
- if (connect(s, &sa.sa, sl)) {
+ if (connect(s, &sa.sa, socklen_inany(&sa))) {
if (errno != EINPROGRESS) {
tcp_rst(c, conn);
goto cancel;
@@ -1544,10 +1757,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
tcp_epoll_ctl(c, conn);
if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
- sl = sizeof(sa);
- if (!getsockname(s, &sa.sa, &sl))
- inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa);
- else
+ socklen_t sl = sizeof(sa);
+
+ if (getsockname(s, &sa.sa, &sl) ||
+ inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0)
err_perror("Can't get local address for socket %i", s);
}
@@ -1611,6 +1824,23 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
}
/**
+ * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet
+ * @th: Pointer to TCP header
+ * @l4len: TCP packet length, including TCP header
+ *
+ * Return: data length of TCP packet, -1 on invalid value of Data Offset field
+ */
+static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len)
+{
+ size_t off = th->doff * 4UL;
+
+ if (off < sizeof(*th) || off > l4len)
+ return -1;
+
+ return l4len - off;
+}
+
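A few sample values for the helper above:

	/* doff = 5, l4len = 20  ->   0	(bare segment, no options, no payload)
	 * doff = 8, l4len = 52  ->  20	(12 B of options, 20 B of payload)
	 * doff = 4, any l4len   ->  -1	(shorter than the fixed header)
	 */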
+/**
* tcp_data_from_tap() - tap/guest data for established connection
* @c: Execution context
* @conn: Connection pointer
@@ -1639,16 +1869,22 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
for (i = idx, iov_i = 0; i < (int)p->count; i++) {
uint32_t seq, seq_offset, ack_seq;
+ struct tcphdr th_storage;
const struct tcphdr *th;
- char *data;
- size_t off;
+ struct iov_tail data;
+ size_t off, size;
+ int count;
- th = packet_get(p, i, 0, sizeof(*th), &len);
+ if (!packet_get(p, i, &data))
+ return -1;
+
+ th = IOV_PEEK_HEADER(&data, th_storage);
if (!th)
return -1;
- len += sizeof(*th);
+ len = iov_tail_size(&data);
off = th->doff * 4UL;
+
if (off < sizeof(*th) || off > len)
return -1;
@@ -1658,9 +1894,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
}
len -= off;
- data = packet_get(p, i, off, len, NULL);
- if (!data)
- continue;
+ iov_drop_header(&data, off);
seq = ntohl(th->seq);
if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
@@ -1671,8 +1905,13 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_send_flag(c, conn, ACK);
tcp_timer_ctl(c, conn);
+ if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
+ &((int){ 1 }), sizeof(int)))
+ flow_trace(conn, "failed to set SO_KEEPALIVE");
+
if (p->count == 1) {
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn,
+ ntohs(th->window));
return 1;
}
@@ -1691,12 +1930,21 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
ack_seq == max_ack_seq &&
ntohs(th->window) == max_ack_seq_wnd;
+ /* See tcp_tap_window_update() for details. On
+ * top of that, we also need to check here if a
+ * zero-window update is contained in a batch of
+ * packets that includes a non-zero window as
+ * well.
+ */
+ if (!ntohs(th->window))
+ tcp_rewind_seq(c, conn);
+
max_ack_seq_wnd = ntohs(th->window);
max_ack_seq = ack_seq;
}
}
- if (th->fin)
+ if (th->fin && seq == seq_from_tap)
fin = 1;
if (!len)
@@ -1734,10 +1982,14 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
continue;
}
- tcp_iov[iov_i].iov_base = data + seq_offset;
- tcp_iov[iov_i].iov_len = len - seq_offset;
- seq_from_tap += tcp_iov[iov_i].iov_len;
- iov_i++;
+ iov_drop_header(&data, seq_offset);
+ size = len - seq_offset;
+ count = iov_tail_clone(&tcp_iov[iov_i], UIO_MAXIOV - iov_i,
+ &data);
+ if (count < 0)
+ break;
+ seq_from_tap += size;
+ iov_i += count;
if (keep == i)
keep = -1;
@@ -1750,17 +2002,16 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (ack && !tcp_sock_consume(conn, max_ack_seq))
tcp_update_seqack_from_tap(c, conn, max_ack_seq);
- tcp_tap_window_update(conn, max_ack_seq_wnd);
+ tcp_tap_window_update(c, conn, max_ack_seq_wnd);
if (retr) {
flow_trace(conn,
"fast re-transmit, ACK: %u, previous sequence: %u",
- max_ack_seq, conn->seq_to_tap);
- conn->seq_to_tap = max_ack_seq;
- if (tcp_set_peek_offset(conn->sock, 0)) {
- tcp_rst(c, conn);
+ conn->seq_ack_from_tap, conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
return -1;
- }
+
tcp_data_from_sock(c, conn);
}
@@ -1784,23 +2035,20 @@ eintr:
goto eintr;
if (errno == EAGAIN || errno == EWOULDBLOCK) {
- tcp_send_flag(c, conn, ACK_IF_NEEDED);
+ tcp_send_flag(c, conn, ACK | DUP_ACK);
return p->count - idx;
}
return -1;
}
- if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
+ if (n < (int)(seq_from_tap - conn->seq_from_tap))
partial_send = 1;
- conn->seq_from_tap += n;
- tcp_send_flag(c, conn, ACK_IF_NEEDED);
- } else {
- conn->seq_from_tap += n;
- }
+
+ conn->seq_from_tap += n;
out:
- if (keep != -1) {
+ if (keep != -1 || partial_send) {
/* We use an 8-bit approximation here: the associated risk is
* that we skip a duplicate ACK on 8-bit sequence number
* collision. Fast retransmit is a SHOULD in RFC 5681, 3.2.
@@ -1840,7 +2088,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_get_tap_ws(conn, opts, optlen);
/* First value is not scaled */
@@ -1854,7 +2102,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
conn->seq_ack_to_tap = conn->seq_from_tap;
conn_event(c, conn, ESTABLISHED);
- if (tcp_set_peek_offset(conn->sock, 0)) {
+ if (tcp_set_peek_offset(conn, 0)) {
tcp_rst(c, conn);
return;
}
@@ -1893,7 +2141,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
return;
if (af == AF_INET) {
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
+ ETH_P_IP);
const struct in_addr *rst_src = daddr;
const struct in_addr *rst_dst = saddr;
@@ -1903,7 +2152,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
*rst_src, *rst_dst);
} else {
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac,
+ ETH_P_IPV6);
const struct in6_addr *rst_src = daddr;
const struct in6_addr *rst_dst = saddr;
@@ -1955,8 +2205,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const struct pool *p, int idx, const struct timespec *now)
{
struct tcp_tap_conn *conn;
+ struct tcphdr th_storage;
const struct tcphdr *th;
- size_t optlen, len;
+ char optsc[OPTLEN_MAX];
+ struct iov_tail data;
+ size_t optlen, l4len;
const char *opts;
union flow *flow;
flow_sidx_t sidx;
@@ -1965,15 +2218,19 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
(void)pif;
- th = packet_get(p, idx, 0, sizeof(*th), &len);
+ if (!packet_get(p, idx, &data))
+ return 1;
+
+ l4len = iov_tail_size(&data);
+
+ th = IOV_REMOVE_HEADER(&data, th_storage);
if (!th)
return 1;
- len += sizeof(*th);
optlen = th->doff * 4UL - sizeof(*th);
/* Static checkers might fail to see this: */
- optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL);
- opts = packet_get(p, idx, sizeof(*th), optlen, NULL);
+ optlen = MIN(optlen, OPTLEN_MAX);
+ opts = (char *)iov_remove_header_(&data, &optsc[0], optlen, 1);
sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr,
ntohs(th->source), ntohs(th->dest));
@@ -1985,7 +2242,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
tcp_conn_from_tap(c, af, saddr, daddr, th,
opts, optlen, now);
else
- tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, len);
+ tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, l4len);
return 1;
}
@@ -1993,7 +2250,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
ASSERT(pif_at_sidx(sidx) == PIF_TAP);
conn = &flow->tcp;
- flow_trace(conn, "packet length %zu from tap", len);
+ flow_trace(conn, "packet length %zu from tap", l4len);
if (th->rst) {
conn_event(c, conn, CLOSED);
@@ -2022,7 +2279,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
goto reset;
conn_event(c, conn, ESTABLISHED);
- if (tcp_set_peek_offset(conn->sock, 0))
+ if (tcp_set_peek_offset(conn, 0))
goto reset;
if (th->fin) {
@@ -2038,9 +2295,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (!th->ack)
goto reset;
- tcp_tap_window_update(conn, ntohs(th->window));
-
- tcp_data_from_sock(c, conn);
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+ tcp_data_from_sock(c, conn);
if (p->count - idx == 1)
return 1;
@@ -2048,13 +2304,44 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
/* Established connections not accepting data from tap */
if (conn->events & TAP_FIN_RCVD) {
- tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
- tcp_tap_window_update(conn, ntohs(th->window));
- tcp_data_from_sock(c, conn);
+ size_t dlen;
+ bool retr;
- if (conn->events & SOCK_FIN_RCVD &&
- conn->seq_ack_from_tap == conn->seq_to_tap)
- conn_event(c, conn, CLOSED);
+ if ((dlen = tcp_packet_data_len(th, l4len))) {
+ flow_dbg(conn, "data segment in CLOSE-WAIT (%zu B)",
+ dlen);
+ }
+
+ retr = th->ack && !th->fin &&
+ ntohl(th->ack_seq) == conn->seq_ack_from_tap &&
+ ntohs(th->window) == conn->wnd_from_tap;
+
+ /* On socket flush failure, pretend there was no ACK, try again
+ * later
+ */
+ if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq)))
+ tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+
+ if (retr) {
+ flow_trace(conn,
+ "fast re-transmit, ACK: %u, previous sequence: %u",
+ ntohl(th->ack_seq), conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
+ return -1;
+ }
+
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr)
+ tcp_data_from_sock(c, conn);
+
+ if (conn->seq_ack_from_tap == conn->seq_to_tap) {
+ if (th->ack && conn->events & TAP_FIN_SENT)
+ conn_event(c, conn, TAP_FIN_ACKED);
+
+ if (conn->events & SOCK_FIN_RCVD &&
+ conn->events & TAP_FIN_ACKED)
+ conn_event(c, conn, CLOSED);
+ }
return 1;
}
@@ -2199,14 +2486,11 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
* mode only, below.
*/
ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
- ref.tcp_listen.port);
+ NULL, ref.tcp_listen.port);
- if (c->mode == MODE_VU) { /* Rebind to same address after migration */
- if (!getsockname(s, &sa.sa, &sl))
- inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa);
- else
- err_perror("Can't get local address for socket %i", s);
- }
+ if (getsockname(s, &sa.sa, &sl) ||
+ inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0)
+ err_perror("Can't get local address for socket %i", s);
if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
char sastr[SOCKADDR_STRLEN];
@@ -2247,7 +2531,9 @@ cancel:
* @c: Execution context
* @ref: epoll reference of timer (not connection)
*
- * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
+ * #syscalls timerfd_gettime|timerfd_gettime64
+ * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64
+ * #syscalls arm:timerfd_settime64 i686:timerfd_settime64
*/
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
{
@@ -2272,26 +2558,37 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
tcp_timer_ctl(c, conn);
} else if (conn->flags & ACK_FROM_TAP_DUE) {
if (!(conn->events & ESTABLISHED)) {
- flow_dbg(conn, "handshake timeout");
- tcp_rst(c, conn);
+ int max;
+ max = c->tcp.syn_retries + c->tcp.syn_linear_timeouts;
+ max = MIN(TCP_MAX_RETRIES, max);
+ if (conn->retries >= max) {
+ flow_dbg(conn, "handshake timeout");
+ tcp_rst(c, conn);
+ } else {
+ flow_trace(conn, "SYN timeout, retry");
+ tcp_send_flag(c, conn, SYN);
+ conn->retries++;
+ conn_flag(c, conn, SYN_RETRIED);
+ tcp_timer_ctl(c, conn);
+ }
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
flow_dbg(conn, "FIN timeout");
tcp_rst(c, conn);
- } else if (conn->retrans == TCP_MAX_RETRANS) {
+ } else if (conn->retries == TCP_MAX_RETRIES) {
flow_dbg(conn, "retransmissions count exceeded");
tcp_rst(c, conn);
} else {
flow_dbg(conn, "ACK timeout, retry");
- conn->retrans++;
- conn->seq_to_tap = conn->seq_ack_from_tap;
+
if (!conn->wnd_from_tap)
conn->wnd_from_tap = 1; /* Zero-window probe */
- if (tcp_set_peek_offset(conn->sock, 0)) {
- tcp_rst(c, conn);
- } else {
- tcp_data_from_sock(c, conn);
- tcp_timer_ctl(c, conn);
- }
+
+ conn->retries++;
+ if (tcp_rewind_seq(c, conn))
+ return;
+
+ tcp_data_from_sock(c, conn);
+ tcp_timer_ctl(c, conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -2335,7 +2632,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
return;
}
- if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
+ if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
conn_event(c, conn, CLOSED);
return;
}
@@ -2378,29 +2675,42 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
/**
* tcp_sock_init_one() - Initialise listening socket for address and port
* @c: Execution context
+ * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE)
* @addr: Pointer to address for binding, NULL for dual stack any
* @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order
*
* Return: fd for the new listening socket, negative error code on failure
+ *
+ * If pif == PIF_SPLICE, the caller must have already entered the guest ns.
*/
-static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
- const char *ifname, in_port_t port)
+static int tcp_sock_init_one(const struct ctx *c, uint8_t pif,
+ const union inany_addr *addr, const char *ifname,
+ in_port_t port)
{
union tcp_listen_epoll_ref tref = {
.port = port,
- .pif = PIF_HOST,
+ .pif = pif,
};
+ const struct fwd_ports *fwd;
int s;
- s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr,
- ifname, port, tref.u32);
+ if (pif == PIF_HOST)
+ fwd = &c->tcp.fwd_in;
+ else
+ fwd = &c->tcp.fwd_out;
+
+ s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname,
+ port, tref.u32);
+
+ if (fwd->mode == FWD_AUTO) {
+ int (*socks)[IP_VERSIONS] = pif == PIF_SPLICE ?
+ tcp_sock_ns : tcp_sock_init_ext;
- if (c->tcp.fwd_in.mode == FWD_AUTO) {
if (!addr || inany_v4(addr))
- tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
+ socks[port][V4] = s < 0 ? -1 : s;
if (!addr || !inany_v4(addr))
- tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
+ socks[port][V6] = s < 0 ? -1 : s;
}
if (s < 0)
@@ -2410,87 +2720,47 @@ static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
}
/**
- * tcp_sock_init() - Create listening sockets for a given host ("inbound") port
+ * tcp_sock_init() - Create listening socket for a given interface and port
* @c: Execution context
+ * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE)
* @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order
*
- * Return: 0 on (partial) success, negative error code on (complete) failure
+ * Return: 0 on success, negative error code on failure
*/
-int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
- const char *ifname, in_port_t port)
+int tcp_sock_init(const struct ctx *c, uint8_t pif,
+ const union inany_addr *addr, const char *ifname,
+ in_port_t port)
{
- int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
+ int s;
ASSERT(!c->no_tcp);
- if (!addr && c->ifi4 && c->ifi6)
- /* Attempt to get a dual stack socket */
- if (tcp_sock_init_one(c, NULL, ifname, port) >= 0)
+ if (!c->ifi4) {
+ if (!addr)
+ /* Restrict to v6 only */
+ addr = &inany_any6;
+ else if (inany_v4(addr))
+ /* Nothing to do */
return 0;
+ }
+ if (!c->ifi6) {
+ if (!addr)
+ /* Restrict to v4 only */
+ addr = &inany_any4;
+ else if (!inany_v4(addr))
+ /* Nothing to do */
+ return 0;
+ }
- /* Otherwise create a socket per IP version */
- if ((!addr || inany_v4(addr)) && c->ifi4)
- r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4,
- ifname, port);
-
- if ((!addr || !inany_v4(addr)) && c->ifi6)
- r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6,
- ifname, port);
-
- if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
- return 0;
-
- return r4 < 0 ? r4 : r6;
-}
-
-/**
- * tcp_ns_sock_init4() - Init socket to listen for outbound IPv4 connections
- * @c: Execution context
- * @port: Port, host order
- */
-static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
-{
- union tcp_listen_epoll_ref tref = {
- .port = port,
- .pif = PIF_SPLICE,
- };
- int s;
-
- ASSERT(c->mode == MODE_PASTA);
-
- s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4,
- NULL, port, tref.u32);
- if (s < 0)
- s = -1;
-
- if (c->tcp.fwd_out.mode == FWD_AUTO)
- tcp_sock_ns[port][V4] = s;
-}
-
-/**
- * tcp_ns_sock_init6() - Init socket to listen for outbound IPv6 connections
- * @c: Execution context
- * @port: Port, host order
- */
-static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
-{
- union tcp_listen_epoll_ref tref = {
- .port = port,
- .pif = PIF_SPLICE,
- };
- int s;
-
- ASSERT(c->mode == MODE_PASTA);
-
- s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6,
- NULL, port, tref.u32);
+ s = tcp_sock_init_one(c, pif, addr, ifname, port);
if (s < 0)
- s = -1;
+ return s;
+ if (s > FD_REF_MAX)
+ return -EIO;
- if (c->tcp.fwd_out.mode == FWD_AUTO)
- tcp_sock_ns[port][V6] = s;
+ return 0;
}
/**
@@ -2502,10 +2772,15 @@ static void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
{
ASSERT(!c->no_tcp);
+ if (!c->no_bindtodevice) {
+ tcp_sock_init(c, PIF_SPLICE, NULL, "lo", port);
+ return;
+ }
+
if (c->ifi4)
- tcp_ns_sock_init4(c, port);
+ tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback4, NULL, port);
if (c->ifi6)
- tcp_ns_sock_init6(c, port);
+ tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback6, NULL, port);
}
/**
@@ -2604,7 +2879,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
/**
* tcp_probe_tcp_info() - Check what data TCP_INFO reports
*
- * Return: Number of bytes returned by TCP_INFO getsockopt()
+ * Return: number of bytes returned by TCP_INFO getsockopt()
*/
static socklen_t tcp_probe_tcp_info(void)
{
@@ -2630,6 +2905,31 @@ static socklen_t tcp_probe_tcp_info(void)
}
/**
+ * tcp_get_rto_params() - Get host kernel RTO parameters
+ * @c: Execution context
+ */
+static void tcp_get_rto_params(struct ctx *c)
+{
+ intmax_t v;
+
+ v = read_file_integer(SYN_RETRIES, SYN_RETRIES_DEFAULT);
+ c->tcp.syn_retries = MIN(v, MAX_SYNCNT);
+
+ v = read_file_integer(SYN_LINEAR_TIMEOUTS, SYN_LINEAR_TIMEOUTS_DEFAULT);
+ c->tcp.syn_linear_timeouts = MIN(v, MAX_SYNCNT);
+
+ v = read_file_integer(RTO_MAX_MS, (intmax_t)(RTO_MAX_DEFAULT * 1000));
+ c->tcp.rto_max = MIN(DIV_ROUND_UP(v, 1000), INT_MAX);
+
+ debug("Using TCP RTO parameters, syn_retries: %"PRIu8
+ ", syn_linear_timeouts: %"PRIu8
+ ", rto_max: %d",
+ c->tcp.syn_retries,
+ c->tcp.syn_linear_timeouts,
+ c->tcp.rto_max);
+}
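Assuming read_file_integer() falls back to the supplied default when a sysctl file is missing (tcp_syn_linear_timeouts and tcp_rto_max_ms are not present on older kernels), the effective parameters on such hosts become:

	/* syn_retries: 6 (or the host's tcp_syn_retries value),
	 * syn_linear_timeouts: 4, rto_max: 120 s
	 */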
+
+/**
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
* @c: Execution context
*
@@ -2639,6 +2939,8 @@ int tcp_init(struct ctx *c)
{
ASSERT(!c->no_tcp);
+ tcp_get_rto_params(c);
+
tcp_sock_iov_init(c);
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
@@ -2661,7 +2963,7 @@ int tcp_init(struct ctx *c)
tcp_info_size = tcp_probe_tcp_info();
#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \
- STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+ STRINGIFY(f_), tcp_info_cap(f_) ? "" : " not")
dbg_tcpi(snd_wnd);
dbg_tcpi(bytes_acked);
dbg_tcpi(min_rtt);
@@ -2680,7 +2982,6 @@ int tcp_init(struct ctx *c)
static void tcp_port_rebind(struct ctx *c, bool outbound)
{
const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map;
- const uint8_t *rmap = outbound ? c->tcp.fwd_in.map : c->tcp.fwd_out.map;
int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext;
unsigned port;
@@ -2699,16 +3000,12 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
continue;
}
- /* Don't loop back our own ports */
- if (bitmap_isset(rmap, port))
- continue;
-
if ((c->ifi4 && socks[port][V4] == -1) ||
(c->ifi6 && socks[port][V6] == -1)) {
if (outbound)
tcp_ns_sock_init(c, port);
else
- tcp_sock_init(c, NULL, NULL, port);
+ tcp_sock_init(c, PIF_HOST, NULL, NULL, port);
}
}
}
@@ -2732,26 +3029,29 @@ static int tcp_port_rebind_outbound(void *arg)
}
/**
+ * tcp_port_rebind_all() - Rebind ports to match forward maps (in host & ns)
+ * @c: Execution context
+ */
+void tcp_port_rebind_all(struct ctx *c)
+{
+ ASSERT(c->mode == MODE_PASTA && !c->no_tcp);
+
+ if (c->tcp.fwd_out.mode == FWD_AUTO)
+ NS_CALL(tcp_port_rebind_outbound, c);
+
+ if (c->tcp.fwd_in.mode == FWD_AUTO)
+ tcp_port_rebind(c, false);
+}
+
+/**
* tcp_timer() - Periodic tasks: port detection, closed connections, pool refill
* @c: Execution context
* @now: Current timestamp
*/
-void tcp_timer(struct ctx *c, const struct timespec *now)
+void tcp_timer(const struct ctx *c, const struct timespec *now)
{
(void)now;
- if (c->mode == MODE_PASTA) {
- if (c->tcp.fwd_out.mode == FWD_AUTO) {
- fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
- NS_CALL(tcp_port_rebind_outbound, c);
- }
-
- if (c->tcp.fwd_in.mode == FWD_AUTO) {
- fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
- tcp_port_rebind(c, false);
- }
- }
-
tcp_sock_refill_init(c);
if (c->mode == MODE_PASTA)
tcp_splice_refill(c);
@@ -2810,20 +3110,21 @@ int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
/**
* tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
- * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
struct tcp_info tinfo;
socklen_t sl;
sl = sizeof(tinfo);
- if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+ if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
int rc = -errno;
- err_perror("Querying TCP_INFO, socket %i", s);
+ flow_perror(conn, "Querying TCP_INFO");
return rc;
}
@@ -2837,39 +3138,95 @@ static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
- * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
socklen_t sl = sizeof(t->mss);
+ int val;
- if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) {
+ if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) {
int rc = -errno;
- err_perror("Getting MSS, socket %i", s);
+ flow_perror(conn, "Getting MSS");
return rc;
}
+ t->mss = (uint32_t)val;
+
+ return 0;
+}
+
+/**
+ * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data (tcpi_options must be populated)
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
+{
+ int val = 0;
+
+ if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+ socklen_t sl = sizeof(val);
+
+ if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) {
+ int rc = -errno;
+ flow_perror(conn, "Getting RFC 7323 timestamp");
+ return rc;
+ }
+ }
+
+ t->timestamp = (uint32_t)val;
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn,
+ const struct tcp_tap_transfer_ext *t)
+{
+ int val = (int)t->timestamp;
+
+ if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+ if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP,
+ &val, sizeof(val))) {
+ int rc = -errno;
+ flow_perror(conn, "Setting RFC 7323 timestamp");
+ return rc;
+ }
+ }
+
return 0;
}
/**
* tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
- * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
struct tcp_repair_window wnd;
socklen_t sl = sizeof(wnd);
- if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
+ if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
int rc = -errno;
- err_perror("Getting window repair data, socket %i", s);
+ flow_perror(conn, "Getting window repair data");
return rc;
}
@@ -2893,12 +3250,13 @@ static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_repair_wnd() - Restore window parameters from extended data
- * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn,
+ const struct tcp_tap_transfer_ext *t)
{
struct tcp_repair_window wnd;
@@ -2908,9 +3266,10 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
wnd.rcv_wnd = t->rcv_wnd;
wnd.rcv_wup = t->rcv_wup;
- if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd))) {
+ if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW,
+ &wnd, sizeof(wnd))) {
int rc = -errno;
- err_perror("Setting window data, socket %i", s);
+ flow_perror(conn, "Setting window data");
return rc;
}
@@ -2919,16 +3278,17 @@ static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_select_queue() - Select queue (receive or send) for next operation
- * @s: Socket
+ * @conn: Connection to select queue for
* @queue: TCP_RECV_QUEUE or TCP_SEND_QUEUE
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_select_queue(int s, int queue)
+static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue)
{
- if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) {
+ if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE,
+ &queue, sizeof(queue))) {
int rc = -errno;
- err_perror("Selecting TCP_SEND_QUEUE, socket %i", s);
+ flow_perror(conn, "Selecting TCP_SEND_QUEUE");
return rc;
}
@@ -2937,26 +3297,28 @@ static int tcp_flow_select_queue(int s, int queue)
/**
* tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
- * @s: Socket
+ * @conn: Connection to dump queue for
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*
* #syscalls:vu ioctl
*/
-static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
+ int s = conn->sock;
ssize_t rc;
if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
rc = -errno;
- err_perror("Getting send queue size, socket %i", s);
+ flow_perror(conn, "Getting send queue size");
return rc;
}
if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
rc = -errno;
- err_perror("Getting not sent count, socket %i", s);
+ flow_perror(conn, "Getting not sent count");
return rc;
}
@@ -2975,14 +3337,16 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
}
if (t->notsent > t->sndq) {
- err("Invalid notsent count socket %i, send: %u, not sent: %u",
- s, t->sndq, t->notsent);
+ flow_err(conn,
+ "Invalid notsent count socket %i, send: %u, not sent: %u",
+ s, t->sndq, t->notsent);
return -EINVAL;
}
if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
- err("Send queue too large to migrate socket %i: %u bytes",
- s, t->sndq);
+ flow_err(conn,
+ "Send queue too large to migrate socket %i: %u bytes",
+ s, t->sndq);
return -ENOBUFS;
}
@@ -2993,13 +3357,13 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
rc = 0;
} else {
rc = -errno;
- err_perror("Can't read send queue, socket %i", s);
+ flow_perror(conn, "Can't read send queue");
return rc;
}
}
if ((uint32_t)rc < t->sndq) {
- err("Short read migrating send queue");
+ flow_err(conn, "Short read migrating send queue");
return -ENXIO;
}
@@ -3010,19 +3374,25 @@ static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
- * @s: Socket
+ * @conn: Connection to repair queue for
* @len: Length of data to be restored
* @buf: Buffer with content of pending data queue
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
+static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn,
+ size_t len, uint8_t *buf)
{
size_t chunk = len;
uint8_t *p = buf;
+ if (conn->sock < 0) {
+ flow_err(conn, "Invalid socket descriptor for repair queue");
+ return -EBADF;
+ }
+
while (len > 0) {
- ssize_t rc = send(s, p, MIN(len, chunk), 0);
+ ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0);
if (rc < 0) {
if ((errno == ENOBUFS || errno == ENOMEM) &&
@@ -3032,7 +3402,7 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
}
rc = -errno;
- err_perror("Can't write queue, socket %i", s);
+ flow_perror(conn, "Can't write queue");
return rc;
}
@@ -3045,18 +3415,18 @@ static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
/**
* tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
- * @s: Socket
+ * @conn: Pointer to the TCP connection structure
* @v: Sequence value, set on return
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_dump_seq(int s, uint32_t *v)
+static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v)
{
socklen_t sl = sizeof(*v);
- if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
+ if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
int rc = -errno;
- err_perror("Dumping sequence, socket %i", s);
+ flow_perror(conn, "Dumping sequence");
return rc;
}
@@ -3065,16 +3435,17 @@ static int tcp_flow_dump_seq(int s, uint32_t *v)
/**
* tcp_flow_repair_seq() - Restore sequence for pre-selected queue
- * @s: Socket
+ * @conn: Connection to repair sequences for
* @v: Sequence value to be set
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_repair_seq(int s, const uint32_t *v)
+static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn,
+ const uint32_t *v)
{
- if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+ if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
int rc = -errno;
- err_perror("Setting sequence, socket %i", s);
+ flow_perror(conn, "Setting sequence");
return rc;
}
@@ -3083,15 +3454,17 @@ static int tcp_flow_repair_seq(int s, const uint32_t *v)
/**
* tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
- * @s: Socket
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*
* #syscalls:vu ioctl
*/
-static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
+static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
{
+ int s = conn->sock;
ssize_t rc;
if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
@@ -3111,8 +3484,9 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
t->rcvq--;
if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
- err("Receive queue too large to migrate socket %i: %u bytes",
- s, t->rcvq);
+ flow_err(conn,
+ "Receive queue too large to migrate socket: %u bytes",
+ t->rcvq);
return -ENOBUFS;
}
@@ -3122,13 +3496,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
rc = 0;
} else {
rc = -errno;
- err_perror("Can't read receive queue for socket %i", s);
+ flow_perror(conn, "Can't read receive queue");
return rc;
}
}
if ((uint32_t)rc < t->rcvq) {
- err("Short read migrating receive queue");
+ flow_err(conn, "Short read migrating receive queue");
return -ENXIO;
}
@@ -3137,12 +3511,13 @@ static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t)
/**
* tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
- * @s: Socket
+ * @conn: Pointer to the TCP connection structure
* @t: Extended migration data
*
* Return: 0 on success, negative error code on failure
*/
-static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
+static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
+ const struct tcp_tap_transfer_ext *t)
{
const struct tcp_repair_opt opts[] = {
{ TCPOPT_WINDOW, t->snd_ws + (t->rcv_ws << 16) },
@@ -3156,9 +3531,9 @@ static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
!!(t->tcpi_options & TCPI_OPT_SACK) +
!!(t->tcpi_options & TCPI_OPT_TIMESTAMPS));
- if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+ if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
int rc = -errno;
- err_perror("Setting repair options, socket %i", s);
+ flow_perror(conn, "Setting repair options");
return rc;
}
@@ -3175,7 +3550,7 @@ static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
{
struct tcp_tap_transfer t = {
- .retrans = conn->retrans,
+ .retries = conn->retries,
.ws_from_tap = conn->ws_from_tap,
.ws_to_tap = conn->ws_to_tap,
.events = conn->events,
@@ -3214,12 +3589,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
/**
* tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @c: Execution context
* @fd: Descriptor for state migration
* @conn: Pointer to the TCP connection structure
*
* Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
*/
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
+int tcp_flow_migrate_source_ext(const struct ctx *c,
+ int fd, const struct tcp_tap_conn *conn)
{
uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
@@ -3229,39 +3606,45 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
/* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode
* weird.
*/
- if (tcp_set_peek_offset(s, -1)) {
+ if (tcp_set_peek_offset(conn, -1)) {
rc = -errno;
goto fail;
}
- if ((rc = tcp_flow_dump_tinfo(s, t)))
+ if ((rc = tcp_flow_dump_tinfo(conn, t)))
goto fail;
- if ((rc = tcp_flow_dump_mss(s, t)))
+ if ((rc = tcp_flow_dump_mss(conn, t)))
goto fail;
- if ((rc = tcp_flow_dump_wnd(s, t)))
+ if ((rc = tcp_flow_dump_timestamp(conn, t)))
goto fail;
- if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE)))
+ if ((rc = tcp_flow_dump_wnd(conn, t)))
goto fail;
- if ((rc = tcp_flow_dump_sndqueue(s, t)))
+ if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE)))
goto fail;
- if ((rc = tcp_flow_dump_seq(s, &t->seq_snd)))
+ if ((rc = tcp_flow_dump_sndqueue(conn, t)))
goto fail;
- if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE)))
+ if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd)))
goto fail;
- if ((rc = tcp_flow_dump_rcvqueue(s, t)))
+ if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE)))
goto fail;
- if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv)))
+ if ((rc = tcp_flow_dump_rcvqueue(conn, t)))
goto fail;
- close(s);
+ if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
+ goto fail;
+
+ if (c->migrate_no_linger)
+ close(s);
+ else
+ epoll_del(flow_epollfd(&conn->f), s);
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
* based on the end of the queues.
@@ -3269,14 +3652,14 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
t->seq_rcv -= t->rcvq;
t->seq_snd -= t->sndq;
- debug("Extended migration data, socket %i sequences send %u receive %u",
- s, t->seq_snd, t->seq_rcv);
- debug(" pending queues: send %u not sent %u receive %u",
- t->sndq, t->notsent, t->rcvq);
- debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
- t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
- debug(" SO_PEEK_OFF %s offset=%"PRIu32,
- peek_offset_cap ? "enabled" : "disabled", peek_offset);
+ flow_dbg(conn, "Extended migration data, socket %i sequences send %u receive %u",
+ s, t->seq_snd, t->seq_rcv);
+ flow_dbg(conn, " pending queues: send %u not sent %u receive %u",
+ t->sndq, t->notsent, t->rcvq);
+ flow_dbg(conn, " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+ t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+ flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32,
+ peek_offset_cap ? "enabled" : "disabled", peek_offset);
/* Endianness fix-ups */
t->seq_snd = htonl(t->seq_snd);
@@ -3284,6 +3667,8 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
t->sndq = htonl(t->sndq);
t->notsent = htonl(t->notsent);
t->rcvq = htonl(t->rcvq);
+ t->mss = htonl(t->mss);
+ t->timestamp = htonl(t->timestamp);
t->snd_wl1 = htonl(t->snd_wl1);
t->snd_wnd = htonl(t->snd_wnd);
@@ -3292,17 +3677,17 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
t->rcv_wup = htonl(t->rcv_wup);
if (write_all_buf(fd, t, sizeof(*t))) {
- err_perror("Failed to write extended data, socket %i", s);
+ flow_perror(conn, "Failed to write extended data");
return -EIO;
}
if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) {
- err_perror("Failed to write send queue data, socket %i", s);
+ flow_perror(conn, "Failed to write send queue data");
return -EIO;
}
if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) {
- err_perror("Failed to write receive queue data, socket %i", s);
+ flow_perror(conn, "Failed to write receive queue data");
return -EIO;
}
@@ -3317,7 +3702,7 @@ fail:
t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */
if (write_all_buf(fd, t, sizeof(*t))) {
- err_perror("Failed to write extended data, socket %i", s);
+ flow_perror(conn, "Failed to write extended data");
return -EIO;
}
@@ -3337,32 +3722,22 @@ fail:
static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
{
sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
- const struct flowside *sockside = HOSTFLOW(conn);
- union sockaddr_inany a;
- socklen_t sl;
int s, rc;
- pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
-
if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
IPPROTO_TCP)) < 0) {
rc = -errno;
- err_perror("Failed to create socket for migrated flow");
+ flow_perror(conn, "Failed to create socket for migrated flow");
return rc;
}
s = conn->sock;
if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
- debug_perror("Setting SO_REUSEADDR on socket %i", s);
+ flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i",
+ s);
tcp_sock_set_nodelay(s);
- if (bind(s, &a.sa, sizeof(a))) {
- rc = -errno;
- err_perror("Failed to bind socket for migrated flow");
- goto err;
- }
-
if ((rc = tcp_flow_repair_on(c, conn)))
goto err;
@@ -3375,6 +3750,29 @@ err:
}
/**
+ * tcp_flow_repair_bind() - Bind socket in repair mode
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ const struct flowside *sockside = HOSTFLOW(conn);
+ union sockaddr_inany a;
+
+ pif_sockaddr(c, &a, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+ if (bind(conn->sock, &a.sa, socklen_inany(&a))) {
+ int rc = -errno;
+ flow_perror(conn, "Failed to bind socket for migrated flow");
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
* tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
* @c: Execution context
* @conn: Pointer to the TCP connection structure
@@ -3390,11 +3788,11 @@ static int tcp_flow_repair_connect(const struct ctx *c,
rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
if (rc) {
rc = -errno;
- err_perror("Failed to connect migrated socket %i", conn->sock);
+ flow_perror(conn, "Failed to connect migrated socket");
return rc;
}
- conn->in_epoll = 0;
+ flow_epollid_clear(&conn->f);
conn->timer = -1;
conn->listening_sock = -1;
@@ -3421,8 +3819,8 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
}
if (read_all_buf(fd, &t, sizeof(t))) {
+ flow_perror(flow, "Failed to receive migration data");
flow_alloc_cancel(flow);
- err_perror("Failed to receive migration data");
return -errno;
}
@@ -3431,7 +3829,7 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
- conn->retrans = t.retrans;
+ conn->retries = t.retries;
conn->ws_from_tap = t.ws_from_tap;
conn->ws_to_tap = t.ws_to_tap;
conn->events = t.events;
@@ -3481,7 +3879,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
if (read_all_buf(fd, &t, sizeof(t))) {
rc = -errno;
- err_perror("Failed to read extended data for socket %i", s);
+ flow_perror(conn, "Failed to read extended data");
return rc;
}
@@ -3496,6 +3894,8 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
t.sndq = ntohl(t.sndq);
t.notsent = ntohl(t.notsent);
t.rcvq = ntohl(t.rcvq);
+ t.mss = ntohl(t.mss);
+ t.timestamp = ntohl(t.timestamp);
t.snd_wl1 = ntohl(t.snd_wl1);
t.snd_wnd = ntohl(t.snd_wnd);
@@ -3503,31 +3903,34 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
t.rcv_wnd = ntohl(t.rcv_wnd);
t.rcv_wup = ntohl(t.rcv_wup);
- debug("Extended migration data, socket %i sequences send %u receive %u",
- s, t.seq_snd, t.seq_rcv);
- debug(" pending queues: send %u not sent %u receive %u",
- t.sndq, t.notsent, t.rcvq);
- debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
- t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
- debug(" SO_PEEK_OFF %s offset=%"PRIu32,
- peek_offset_cap ? "enabled" : "disabled", peek_offset);
+ flow_dbg(conn,
+ "Extended migration data, socket %i sequences send %u receive %u",
+ s, t.seq_snd, t.seq_rcv);
+ flow_dbg(conn, " pending queues: send %u not sent %u receive %u",
+ t.sndq, t.notsent, t.rcvq);
+ flow_dbg(conn,
+ " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+ t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+ flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32,
+ peek_offset_cap ? "enabled" : "disabled", peek_offset);
if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
- err("Bad queues socket %i, send: %u, not sent: %u, receive: %u",
- s, t.sndq, t.notsent, t.rcvq);
+ flow_err(conn,
+ "Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+ s, t.sndq, t.notsent, t.rcvq);
return -EINVAL;
}
if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
rc = -errno;
- err_perror("Failed to read send queue data, socket %i", s);
+ flow_perror(conn, "Failed to read send queue data");
return rc;
}
if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
rc = -errno;
- err_perror("Failed to read receive queue data, socket %i", s);
+ flow_perror(conn, "Failed to read receive queue data");
return rc;
}
@@ -3535,32 +3938,38 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
/* We weren't able to create the socket, discard flow */
goto fail;
- if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+ if (tcp_flow_repair_bind(c, conn))
+ goto fail;
+
+ if (tcp_flow_repair_timestamp(conn, &t))
+ goto fail;
+
+ if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
goto fail;
- if (tcp_flow_repair_seq(s, &t.seq_snd))
+ if (tcp_flow_repair_seq(conn, &t.seq_snd))
goto fail;
- if (tcp_flow_select_queue(s, TCP_RECV_QUEUE))
+ if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE))
goto fail;
- if (tcp_flow_repair_seq(s, &t.seq_rcv))
+ if (tcp_flow_repair_seq(conn, &t.seq_rcv))
goto fail;
if (tcp_flow_repair_connect(c, conn))
goto fail;
- if (tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue))
+ if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue))
goto fail;
- if (tcp_flow_select_queue(s, TCP_SEND_QUEUE))
+ if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
goto fail;
- if (tcp_flow_repair_queue(s, t.sndq - t.notsent,
+ if (tcp_flow_repair_queue(conn, t.sndq - t.notsent,
tcp_migrate_snd_queue))
goto fail;
- if (tcp_flow_repair_opt(s, &t))
+ if (tcp_flow_repair_opt(conn, &t))
goto fail;
/* If we sent a FIN and it was acknowledged (TCP_FIN_WAIT2), don't
@@ -3575,19 +3984,19 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
v = TCP_SEND_QUEUE;
if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
- debug_perror("Selecting repair queue, socket %i", s);
+ flow_perror(conn, "Selecting repair queue");
else
shutdown(s, SHUT_WR);
}
- if (tcp_flow_repair_wnd(s, &t))
+ if (tcp_flow_repair_wnd(conn, &t))
goto fail;
tcp_flow_repair_off(c, conn);
repair_flush(c);
if (t.notsent) {
- if (tcp_flow_repair_queue(s, t.notsent,
+ if (tcp_flow_repair_queue(conn, t.notsent,
tcp_migrate_snd_queue +
(t.sndq - t.notsent))) {
/* This sometimes seems to fail for unclear reasons.
@@ -3607,15 +4016,16 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
if (t.tcpi_state == TCP_FIN_WAIT1)
shutdown(s, SHUT_WR);
- if (tcp_set_peek_offset(conn->sock, peek_offset))
+ if (tcp_set_peek_offset(conn, peek_offset))
goto fail;
tcp_send_flag(c, conn, ACK);
tcp_data_from_sock(c, conn);
if ((rc = tcp_epoll_ctl(c, conn))) {
- debug("Failed to subscribe to epoll for migrated socket %i: %s",
- conn->sock, strerror_(-rc));
+ flow_dbg(conn,
+ "Failed to subscribe to epoll for migrated socket: %s",
+ strerror_(-rc));
goto fail;
}
@@ -3632,3 +4042,67 @@ fail:
return 0;
}
+
+/**
+ * tcp_prepare_iov() - Prepare iovec array depending on SO_PEEK_OFF support
+ * @msg: Message header to update
+ * @iov: iovec to receive TCP payload and data to discard
+ * @already_sent: Bytes sent after the last acknowledged one
+ * @payload_iov_cnt: Number of TCP payload iovec entries
+ *
+ * Return: 0 on success, -1 if already_sent cannot be discarded fully
+ */
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+ uint32_t already_sent, int payload_iov_cnt)
+{
+ /*
+ * IOV layout
+ * |- tcp_buf_discard -|---------- TCP data slots ------------|
+ *
+ * with discarded data:
+ * |------ddddddddddddd|ttttttttttttt-------------------------|
+ * ^
+ * |
+ * msg_iov
+ *
+ * without discarded data:
+ * |-------------------|ttttttttttttt-------------------------|
+ * ^
+ * |
+ * msg_iov
+ * d: discard data
+ * t: TCP data
+ */
+ if (peek_offset_cap) {
+ msg->msg_iov = iov + DISCARD_IOV_NUM;
+ msg->msg_iovlen = payload_iov_cnt;
+ } else {
+ int discard_cnt, discard_iov_rem;
+ struct iovec *iov_start;
+ int i;
+
+ discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE);
+ if (discard_cnt > DISCARD_IOV_NUM) {
+ debug("Failed to discard %u already sent bytes",
+ already_sent);
+ return -1;
+ }
+
+ discard_iov_rem = already_sent % BUF_DISCARD_SIZE;
+
+ iov_start = iov + (DISCARD_IOV_NUM - discard_cnt);
+
+ /* Multiple iov entries pointing to the same buffer */
+ for (i = 0; i < discard_cnt; i++) {
+ iov_start[i].iov_base = tcp_buf_discard;
+ iov_start[i].iov_len = BUF_DISCARD_SIZE;
+ }
+ if (discard_iov_rem)
+ iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem;
+
+ msg->msg_iov = iov_start;
+ msg->msg_iovlen = discard_cnt + payload_iov_cnt;
+ }
+
+ return 0;
+}
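For reference: tcp_prepare_iov() above implements the fallback for kernels without SO_PEEK_OFF support. Bytes already sent to the tap but not yet acknowledged are peeked into a scratch buffer referenced by the leading iovec entries, so that only new data lands in the payload buffers. Below is a minimal standalone sketch of that idea, not passt code: peek_past(), SCRATCH_SIZE and SCRATCH_SLOTS are illustrative stand-ins for tcp_buf_discard, BUF_DISCARD_SIZE and DISCARD_IOV_NUM.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define SCRATCH_SIZE	(64 * 1024)	/* illustrative, not BUF_DISCARD_SIZE */
#define SCRATCH_SLOTS	4		/* covers up to SCRATCH_SLOTS * SCRATCH_SIZE */

static char scratch[SCRATCH_SIZE];

/* Peek new data past 'already_sent' bytes into 'payload'.
 * Return: bytes of new data, or -1 on error / unrepresentable offset
 */
static ssize_t peek_past(int s, size_t already_sent,
			 char *payload, size_t payload_len)
{
	struct iovec iov[SCRATCH_SLOTS + 1];
	struct msghdr mh = { 0 };
	size_t i, n_discard;
	ssize_t n;

	n_discard = (already_sent + SCRATCH_SIZE - 1) / SCRATCH_SIZE;
	if (n_discard > SCRATCH_SLOTS)
		return -1;	/* offset too large to represent */

	/* All discard slots point at the same scratch buffer */
	for (i = 0; i < n_discard; i++) {
		iov[i].iov_base = scratch;
		iov[i].iov_len = SCRATCH_SIZE;
	}
	if (already_sent % SCRATCH_SIZE)
		iov[n_discard - 1].iov_len = already_sent % SCRATCH_SIZE;

	iov[n_discard].iov_base = payload;
	iov[n_discard].iov_len = payload_len;

	mh.msg_iov = iov;
	mh.msg_iovlen = n_discard + 1;

	n = recvmsg(s, &mh, MSG_PEEK);
	if (n < 0)
		return -1;

	return n > (ssize_t)already_sent ? n - (ssize_t)already_sent : 0;
}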
diff --git a/tcp.h b/tcp.h
index 9142eca..3f21e75 100644
--- a/tcp.h
+++ b/tcp.h
@@ -18,14 +18,15 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr, uint32_t flow_lbl,
const struct pool *p, int idx, const struct timespec *now);
-int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
- const char *ifname, in_port_t port);
+int tcp_sock_init(const struct ctx *c, uint8_t pif,
+ const union inany_addr *addr, const char *ifname,
+ in_port_t port);
int tcp_init(struct ctx *c);
-void tcp_timer(struct ctx *c, const struct timespec *now);
+void tcp_port_rebind_all(struct ctx *c);
+void tcp_timer(const struct ctx *c, const struct timespec *now);
void tcp_defer_handler(struct ctx *c);
-void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
-int tcp_set_peek_offset(int s, int offset);
+void tcp_update_l2_buf(const unsigned char *eth_d);
extern bool peek_offset_cap;
@@ -60,12 +61,18 @@ union tcp_listen_epoll_ref {
* @fwd_out: Port forwarding configuration for outbound packets
* @timer_run: Timestamp of most recent timer run
* @pipe_size: Size of pipes for spliced connections
+ * @rto_max: Maximum retransmission timeout (RTO), in seconds
+ * @syn_retries: SYN retries using exponential backoff timeout
+ * @syn_linear_timeouts: SYN retries before using exponential backoff timeout
*/
struct tcp_ctx {
struct fwd_ports fwd_in;
struct fwd_ports fwd_out;
struct timespec timer_run;
size_t pipe_size;
+ int rto_max;
+ uint8_t syn_retries;
+ uint8_t syn_linear_timeouts;
};
#endif /* TCP_H */
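tcp_get_rto_params() in tcp.c fills the fields above using a read_file_integer()-style helper: parse one integer from a procfs file and fall back to a default when it is missing or unreadable. The sketch below shows that pattern on its own; it assumes the usual procfs location of net.ipv4.tcp_syn_retries, while the SYN_RETRIES, SYN_LINEAR_TIMEOUTS and RTO_MAX_MS macros used in the patch presumably point at similar sysctl paths.

#include <errno.h>
#include <inttypes.h>
#include <stdio.h>

static intmax_t file_integer_or(const char *path, intmax_t fallback)
{
	char buf[64];
	intmax_t v;
	FILE *f;

	f = fopen(path, "r");
	if (!f)
		return fallback;

	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return fallback;
	}
	fclose(f);

	errno = 0;
	v = strtoimax(buf, NULL, 10);
	return errno ? fallback : v;
}

int main(void)
{
	intmax_t syn_retries;

	/* Usual sysctl path for net.ipv4.tcp_syn_retries; 6 is the kernel
	 * default, used here as the fallback
	 */
	syn_retries = file_integer_or("/proc/sys/net/ipv4/tcp_syn_retries", 6);
	printf("tcp_syn_retries: %jd\n", syn_retries);
	return 0;
}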
diff --git a/tcp_buf.c b/tcp_buf.c
index 72d99c5..5d419d3 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -40,8 +40,7 @@
/* Static buffers */
/* Ethernet header for IPv4 and IPv6 frames */
-static struct ethhdr tcp4_eth_src;
-static struct ethhdr tcp6_eth_src;
+static struct ethhdr tcp_eth_hdr[TCP_FRAMES_MEM];
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
@@ -60,19 +59,20 @@ static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp_payload_used;
/* recvmsg()/sendmsg() data for tap */
-static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
+static struct iovec iov_sock [TCP_FRAMES_MEM + DISCARD_IOV_NUM];
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
/**
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void tcp_update_l2_buf(const unsigned char *eth_d)
{
- eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
- eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
+ int i;
+
+ for (i = 0; i < TCP_FRAMES_MEM; i++)
+ eth_update_mac(&tcp_eth_hdr[i], eth_d, NULL);
}
/**
@@ -85,9 +85,6 @@ void tcp_sock_iov_init(const struct ctx *c)
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
int i;
- tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
- tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
-
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
tcp6_payload_ip[i] = ip6;
tcp4_payload_ip[i] = iph;
@@ -99,12 +96,13 @@ void tcp_sock_iov_init(const struct ctx *c)
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
+ iov[TCP_IOV_ETH_PAD].iov_base = eth_pad;
}
}
/**
* tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
- * @ctx: Execution context
+ * @c: Execution context
* @conns: Array of connection pointers corresponding to queued frames
* @frames: Two-dimensional array containing queued frames with sub-iovs
* @num_frames: Number of entries in the two arrays to be compared
@@ -125,7 +123,7 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
conn->seq_to_tap = seq;
peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
- if (tcp_set_peek_offset(conn->sock, peek_offset))
+ if (tcp_set_peek_offset(conn, peek_offset))
tcp_rst(c, conn);
}
}
@@ -148,22 +146,41 @@ void tcp_payload_flush(const struct ctx *c)
}
/**
- * tcp_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
+ * tcp_l2_buf_pad() - Calculate padding to send from the zero-filled pad buffer
+ * @iov: Pointer to iovec of frame parts we're about to send
+ */
+static void tcp_l2_buf_pad(struct iovec *iov)
+{
+ size_t l2len = iov[TCP_IOV_ETH].iov_len +
+ iov[TCP_IOV_IP].iov_len +
+ iov[TCP_IOV_PAYLOAD].iov_len;
+
+ if (l2len < ETH_ZLEN)
+ iov[TCP_IOV_ETH_PAD].iov_len = ETH_ZLEN - l2len;
+ else
+ iov[TCP_IOV_ETH_PAD].iov_len = 0;
+}
+
+/**
+ * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
+ * @c: Execution context
* @conn: Connection pointer
* @iov: Pointer to an array of iovec of TCP pre-cooked buffers
* @check: Checksum, if already known
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*/
-static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
+static void tcp_l2_buf_fill_headers(const struct ctx *c,
+ struct tcp_tap_conn *conn,
struct iovec *iov, const uint16_t *check,
uint32_t seq, bool no_tcp_csum)
{
struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
- struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr);
+ struct tcphdr th_storage, *th = IOV_REMOVE_HEADER(&tail, th_storage);
struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
const struct flowside *tapside = TAPFLOW(conn);
const struct in_addr *a4 = inany_v4(&tapside->oaddr);
+ struct ethhdr *eh = iov[TCP_IOV_ETH].iov_base;
struct ipv6hdr *ip6h = NULL;
struct iphdr *ip4h = NULL;
@@ -172,7 +189,7 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
else
ip6h = iov[TCP_IOV_IP].iov_base;
- tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
+ tcp_fill_headers(c, conn, taph, eh, ip4h, ip6h, th, &tail,
check, seq, no_tcp_csum);
}
@@ -194,14 +211,12 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
int ret;
iov = tcp_l2_iov[tcp_payload_used];
- if (CONN_V4(conn)) {
+ if (CONN_V4(conn))
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
- } else {
+ else
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
- }
+ iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp_eth_hdr[tcp_payload_used]);
payload = iov[TCP_IOV_PAYLOAD].iov_base;
seq = conn->seq_to_tap;
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
@@ -209,13 +224,16 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (ret <= 0)
return ret;
- tcp_payload_used++;
+ tcp_frame_conns[tcp_payload_used++] = conn;
l4len = optlen + sizeof(struct tcphdr);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
- tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
+ tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false);
+
+ tcp_l2_buf_pad(iov);
if (flags & DUP_ACK) {
- struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
+ struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used];
+ tcp_frame_conns[tcp_payload_used++] = conn;
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_TAP].iov_len);
@@ -224,6 +242,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+ dup_iov[TCP_IOV_ETH_PAD].iov_len = iov[TCP_IOV_ETH_PAD].iov_len;
}
if (tcp_payload_used > TCP_FRAMES_MEM - 2)
@@ -259,11 +278,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
check = &iph->check;
}
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else if (CONN_V6(conn)) {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
+ iov[TCP_IOV_ETH].iov_base = &tcp_eth_hdr[tcp_payload_used];
payload = iov[TCP_IOV_PAYLOAD].iov_base;
payload->th.th_off = sizeof(struct tcphdr) / 4;
payload->th.th_x2 = 0;
@@ -271,7 +289,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
payload->th.ack = 1;
payload->th.psh = push;
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
- tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
+ tcp_l2_buf_fill_headers(c, conn, iov, check, seq, false);
+
+ tcp_l2_buf_pad(iov);
+
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
}
@@ -304,7 +325,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
conn->seq_ack_from_tap, conn->seq_to_tap);
conn->seq_to_tap = conn->seq_ack_from_tap;
already_sent = 0;
- if (tcp_set_peek_offset(s, 0)) {
+ if (tcp_set_peek_offset(conn, 0)) {
tcp_rst(c, conn);
return -1;
}
@@ -326,15 +347,9 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
iov_rem = (wnd_scaled - already_sent) % mss;
}
- /* Prepare iov according to kernel capability */
- if (!peek_offset_cap) {
- mh_sock.msg_iov = iov_sock;
- iov_sock[0].iov_base = tcp_buf_discard;
- iov_sock[0].iov_len = already_sent;
- mh_sock.msg_iovlen = fill_bufs + 1;
- } else {
- mh_sock.msg_iov = &iov_sock[1];
- mh_sock.msg_iovlen = fill_bufs;
+ if (tcp_prepare_iov(&mh_sock, iov_sock, already_sent, fill_bufs)) {
+ tcp_rst(c, conn);
+ return -1;
}
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
@@ -344,12 +359,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
tcp_payload_used = 0;
}
- for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
+ for (i = 0, iov = iov_sock + DISCARD_IOV_NUM; i < fill_bufs; i++, iov++) {
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
- iov_sock[fill_bufs].iov_len = iov_rem;
+ iov_sock[fill_bufs + DISCARD_IOV_NUM - 1].iov_len = iov_rem;
/* Receive into buffers, don't dequeue until acknowledged by guest. */
do
@@ -369,8 +384,23 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
}
if (!len) {
- if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
- int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
+ if (already_sent) {
+ conn_flag(c, conn, STALLED);
+ } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
+ SOCK_FIN_RCVD) {
+ int ret;
+
+ /* On TAP_FIN_SENT, we won't get further data events
+ * from the socket, and this might be the last ACK
+ * segment we send to the tap, so update its sequence to
+ * include everything we received until now.
+ *
+ * See also the special handling on CONN_IS_CLOSING() in
+ * tcp_update_seqack_wnd().
+ */
+ conn->seq_ack_to_tap = conn->seq_from_tap;
+
+ ret = tcp_buf_send_flag(c, conn, FIN | ACK);
if (ret) {
tcp_rst(c, conn);
return ret;
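tcp_l2_buf_pad() above pads short frames to the 60-byte Ethernet minimum (ETH_ZLEN) by giving the TCP_IOV_ETH_PAD entry a length into a zero-filled buffer. A standalone sketch of the same padding approach follows; pad_frame() and zero_pad are illustrative names, not the patch's eth_pad buffer.

#include <stddef.h>
#include <linux/if_ether.h>	/* ETH_ZLEN */
#include <sys/uio.h>

static char zero_pad[ETH_ZLEN];	/* static storage, all zeroes */

/* Append padding as iov[n] if the first n iovecs total less than ETH_ZLEN.
 * The caller must provide room for one extra entry at index n.
 * Return: final iovec count
 */
static size_t pad_frame(struct iovec *iov, size_t n)
{
	size_t i, l2len = 0;

	for (i = 0; i < n; i++)
		l2len += iov[i].iov_len;

	if (l2len >= ETH_ZLEN)
		return n;

	iov[n].iov_base = zero_pad;
	iov[n].iov_len = ETH_ZLEN - l2len;
	return n + 1;
}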
diff --git a/tcp_conn.h b/tcp_conn.h
index 9126a36..9c6ff9e 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -12,8 +12,7 @@
/**
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
* @f: Generic flow information
- * @in_epoll: Is the connection in the epoll set?
- * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
+ * @retries: Number of retries due to timeouts
* @ws_from_tap: Window scaling factor advertised from tap/guest
* @ws_to_tap: Window scaling factor advertised to tap/guest
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
@@ -36,11 +35,9 @@ struct tcp_tap_conn {
/* Must be first element */
struct flow_common f;
- bool in_epoll :1;
-
-#define TCP_RETRANS_BITS 3
- unsigned int retrans :TCP_RETRANS_BITS;
-#define TCP_MAX_RETRANS MAX_FROM_BITS(TCP_RETRANS_BITS)
+#define TCP_RETRIES_BITS 3
+ unsigned int retries :TCP_RETRIES_BITS;
+#define TCP_MAX_RETRIES MAX_FROM_BITS(TCP_RETRIES_BITS)
#define TCP_WS_BITS 4 /* RFC 7323 */
#define TCP_WS_MAX 14
@@ -52,6 +49,15 @@ struct tcp_tap_conn {
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
+#define RTT_EXP_BITS 4
+ unsigned int rtt_exp :RTT_EXP_BITS;
+#define RTT_EXP_MAX MAX_FROM_BITS(RTT_EXP_BITS)
+#define RTT_STORE_MIN 100 /* us, minimum representable */
+#define RTT_STORE_MAX ((long)(RTT_STORE_MIN << RTT_EXP_MAX))
+#define RTT_SET(conn, rtt) \
+ (conn->rtt_exp = MIN(RTT_EXP_MAX, ilog2(MAX(1, rtt / RTT_STORE_MIN))))
+#define RTT_GET(conn) (RTT_STORE_MIN << conn->rtt_exp)
+
int sock :FD_REF_BITS;
uint8_t events;
@@ -80,6 +86,7 @@ struct tcp_tap_conn {
#define ACK_TO_TAP_DUE BIT(3)
#define ACK_FROM_TAP_DUE BIT(4)
#define ACK_FROM_TAP_BLOCKS BIT(5)
+#define SYN_RETRIED BIT(6)
#define SNDBUF_BITS 24
unsigned int sndbuf :SNDBUF_BITS;
@@ -102,7 +109,7 @@ struct tcp_tap_conn {
* struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
* @pif: Interfaces for each side of the flow
* @side: Addresses and ports for each side of the flow
- * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
+ * @retries: Number of retries due to timeouts
* @ws_from_tap: Window scaling factor advertised from tap/guest
* @ws_to_tap: Window scaling factor advertised to tap/guest
* @events: Connection events, implying connection states
@@ -122,7 +129,7 @@ struct tcp_tap_transfer {
uint8_t pif[SIDES];
struct flowside side[SIDES];
- uint8_t retrans;
+ uint8_t retries;
uint8_t ws_from_tap;
uint8_t ws_to_tap;
uint8_t events;
@@ -152,6 +159,7 @@ struct tcp_tap_transfer {
* @notsent: Part of pending send queue that wasn't sent out yet
* @rcvq: Length of pending receive queue
* @mss: Socket-side MSS clamp
+ * @timestamp: RFC 7323 timestamp
* @snd_wl1: Next sequence used in window probe (next sequence - 1)
* @snd_wnd: Socket-side sending window
* @max_window: Window clamp
@@ -171,6 +179,7 @@ struct tcp_tap_transfer_ext {
uint32_t rcvq;
uint32_t mss;
+ uint32_t timestamp;
/* We can't just use struct tcp_repair_window: we need network order */
uint32_t snd_wl1;
@@ -194,7 +203,6 @@ struct tcp_tap_transfer_ext {
* @written: Bytes written (not fully written from one other side read)
* @events: Events observed/actions performed on connection
* @flags: Connection flags (attributes, not events)
- * @in_epoll: Is the connection in the epoll set?
*/
struct tcp_splice_conn {
/* Must be first element */
@@ -218,8 +226,6 @@ struct tcp_splice_conn {
#define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0))
#define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
#define CLOSING BIT(4)
-
- bool in_epoll :1;
};
/* Socket pools */
@@ -234,7 +240,8 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(const struct ctx *c, int fd,
+ const struct tcp_tap_conn *conn);
int tcp_flow_migrate_target(struct ctx *c, int fd);
int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
@@ -242,7 +249,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
-void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
+void tcp_splice_timer(struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
int tcp_conn_sock(sa_family_t af);
int tcp_sock_refill_pool(int pool[], sa_family_t af);
diff --git a/tcp_internal.h b/tcp_internal.h
index 6f5e054..5f8fb35 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -9,6 +9,9 @@
#define MAX_WS 8
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
+#define BUF_DISCARD_SIZE (1 << 20)
+#define DISCARD_IOV_NUM DIV_ROUND_UP(MAX_WINDOW, BUF_DISCARD_SIZE)
+
#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
sizeof(struct tcphdr) - \
sizeof(struct iphdr), \
@@ -18,14 +21,19 @@
sizeof(struct ipv6hdr), \
sizeof(uint32_t))
-#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
-#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
-#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
-#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
+#define SEQ_LE(a, b) \
+ ((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW)
+#define SEQ_LT(a, b) \
+ ((uint32_t)(b) - (uint32_t)(a) - 1 < MAX_WINDOW)
+#define SEQ_GE(a, b) \
+ ((uint32_t)(a) - (uint32_t)(b) < MAX_WINDOW)
+#define SEQ_GT(a, b) \
+ ((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW)
#define FIN (1 << 0)
#define SYN (1 << 1)
#define RST (1 << 2)
+#define PSH (1 << 3)
#define ACK (1 << 4)
/* Flags for internal usage */
@@ -49,12 +57,13 @@
#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr))
#define CONN_V6(conn) (!CONN_V4(conn))
-/*
+/**
* enum tcp_iov_parts - I/O vector parts for one TCP frame
* @TCP_IOV_TAP tap backend specific header
* @TCP_IOV_ETH Ethernet header
* @TCP_IOV_IP IP (v4/v6) header
* @TCP_IOV_PAYLOAD IP payload (TCP header + data)
+ * @TCP_IOV_ETH_PAD Ethernet (802.3) padding to 60 bytes
* @TCP_NUM_IOVS the number of entries in the iovec array
*/
enum tcp_iov_parts {
@@ -62,6 +71,7 @@ enum tcp_iov_parts {
TCP_IOV_ETH = 1,
TCP_IOV_IP = 2,
TCP_IOV_PAYLOAD = 3,
+ TCP_IOV_ETH_PAD = 4,
TCP_NUM_IOVS
};
@@ -138,7 +148,7 @@ struct tcp_syn_opts {
.ws = TCP_OPT_WS(ws_), \
})
-extern char tcp_buf_discard [MAX_WINDOW];
+extern char tcp_buf_discard [BUF_DISCARD_SIZE];
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag);
@@ -166,8 +176,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
struct tcp_info_linux;
-void tcp_fill_headers(const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
+void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+ struct tap_hdr *taph, struct ethhdr *eh,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum);
@@ -177,5 +187,8 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
size_t *optlen);
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+ uint32_t already_sent, int payload_iov_cnt);
#endif /* TCP_INTERNAL_H */
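The casts added to SEQ_LE()/SEQ_GT() above make the modular arithmetic explicit: with unsigned 32-bit subtraction, two sequence numbers compare correctly even across the 2^32 wrap, as long as they are less than MAX_WINDOW apart. A small standalone check of that property (the macros and MAX_WINDOW mirror the definitions above):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_WS		8
#define MAX_WINDOW	(1 << (16 + (MAX_WS)))

#define SEQ_LE(a, b)	((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW)
#define SEQ_GT(a, b)	((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW)

int main(void)
{
	uint32_t before_wrap = UINT32_MAX - 10;	/* 0xfffffff5 */
	uint32_t after_wrap = 5;		/* 16 bytes later, past the wrap */

	assert(SEQ_GT(after_wrap, before_wrap));
	assert(SEQ_LE(before_wrap, after_wrap));
	assert(!SEQ_GT(before_wrap, after_wrap));

	printf("sequence comparisons hold across the wrap\n");
	return 0;
}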
diff --git a/tcp_splice.c b/tcp_splice.c
index 0d10e3d..4405224 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -44,7 +44,6 @@
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
-#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>
@@ -56,6 +55,7 @@
#include "siphash.h"
#include "inany.h"
#include "flow.h"
+#include "epoll_ctl.h"
#include "flow_table.h"
@@ -95,7 +95,7 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
* conn_at_sidx() - Get spliced TCP connection specific flow at given sidx
* @sidx: Flow and side to retrieve
*
- * Return: Spliced TCP connection at @sidx, or NULL of @sidx is invalid.
+ * Return: spliced TCP connection at @sidx, or NULL if @sidx is invalid.
* Asserts if the flow at @sidx is not FLOW_TCP_SPLICE.
*/
static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
@@ -149,7 +149,9 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
static int tcp_splice_epoll_ctl(const struct ctx *c,
struct tcp_splice_conn *conn)
{
- int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
+ : c->epollfd;
+ int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
const union epoll_ref ref[SIDES] = {
{ .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0],
.flowside = FLOW_SIDX(conn, 0) },
@@ -161,25 +163,24 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
tcp_splice_conn_epoll_events(conn->events, ev);
- if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
- epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
+
+ if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) ||
+ epoll_ctl(epollfd, m, conn->s[1], &ev[1])) {
int ret = -errno;
flow_perror(conn, "ERROR on epoll_ctl()");
return ret;
}
-
- conn->in_epoll = true;
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
return 0;
}
/**
* conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag
- * @c: Execution context
* @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset
*/
-static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
+static void conn_flag_do(struct tcp_splice_conn *conn,
unsigned long flag)
{
if (flag & (flag - 1)) {
@@ -204,15 +205,15 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
}
if (flag == CLOSING) {
- epoll_del(c, conn->s[0]);
- epoll_del(c, conn->s[1]);
+ epoll_del(flow_epollfd(&conn->f), conn->s[0]);
+ epoll_del(flow_epollfd(&conn->f), conn->s[1]);
}
}
#define conn_flag(c, conn, flag) \
do { \
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
- conn_flag_do(c, conn, flag); \
+ conn_flag_do(conn, flag); \
} while (0)
/**
@@ -351,7 +352,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6;
uint8_t tgtpif = conn->f.pif[TGTSIDE];
union sockaddr_inany sa;
- socklen_t sl;
int one = 1;
if (tgtpif == PIF_HOST)
@@ -379,16 +379,16 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
conn->s[1]);
}
- pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
+ pif_sockaddr(c, &sa, tgtpif, &tgt->eaddr, tgt->eport);
+
+ conn_event(c, conn, SPLICE_CONNECT);
- if (connect(conn->s[1], &sa.sa, sl)) {
+ if (connect(conn->s[1], &sa.sa, socklen_inany(&sa))) {
if (errno != EINPROGRESS) {
flow_trace(conn, "Couldn't connect socket for splice: %s",
strerror_(errno));
return -errno;
}
-
- conn_event(c, conn, SPLICE_CONNECT);
} else {
conn_event(c, conn, SPLICE_ESTABLISHED);
return tcp_splice_connect_finish(c, conn);
@@ -402,7 +402,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
* @c: Execution context
* @af: Address family (AF_INET or AF_INET6)
*
- * Return: Socket fd in the namespace on success, -errno on failure
+ * Return: socket fd in the namespace on success, -errno on failure
*/
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af)
{
@@ -520,20 +520,21 @@ swap:
int more = 0;
retry:
- readlen = splice(conn->s[fromsidei], NULL,
- conn->pipe[fromsidei][1], NULL,
- c->tcp.pipe_size,
- SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+ do
+ readlen = splice(conn->s[fromsidei], NULL,
+ conn->pipe[fromsidei][1], NULL,
+ c->tcp.pipe_size,
+ SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+ while (readlen < 0 && errno == EINTR);
+
+ if (readlen < 0 && errno != EAGAIN)
+ goto close;
+
flow_trace(conn, "%zi from read-side call", readlen);
- if (readlen < 0) {
- if (errno == EINTR)
- goto retry;
- if (errno != EAGAIN)
- goto close;
- } else if (!readlen) {
+ if (!readlen) {
eof = 1;
- } else {
+ } else if (readlen > 0) {
never_read = 0;
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
@@ -543,10 +544,16 @@ retry:
conn_flag(c, conn, lowat_act_flag);
}
-eintr:
- written = splice(conn->pipe[fromsidei][0], NULL,
- conn->s[!fromsidei], NULL, c->tcp.pipe_size,
- SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+ do
+ written = splice(conn->pipe[fromsidei][0], NULL,
+ conn->s[!fromsidei], NULL,
+ c->tcp.pipe_size,
+ SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+ while (written < 0 && errno == EINTR);
+
+ if (written < 0 && errno != EAGAIN)
+ goto close;
+
flow_trace(conn, "%zi from write-side call (passed %zi)",
written, c->tcp.pipe_size);
@@ -578,12 +585,6 @@ eintr:
conn->written[fromsidei] += written > 0 ? written : 0;
if (written < 0) {
- if (errno == EINTR)
- goto eintr;
-
- if (errno != EAGAIN)
- goto close;
-
if (conn->read[fromsidei] == conn->written[fromsidei])
break;
@@ -750,10 +751,9 @@ void tcp_splice_init(struct ctx *c)
/**
* tcp_splice_timer() - Timer for spliced connections
- * @c: Execution context
* @conn: Connection to handle
*/
-void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
+void tcp_splice_timer(struct tcp_splice_conn *conn)
{
unsigned sidei;
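The hunks above replace goto-based EINTR handling with do/while retries around splice(). A standalone sketch of that retry pattern; splice_retry() is an illustrative name, not a passt helper:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static ssize_t splice_retry(int fd_in, int fd_out, size_t len)
{
	ssize_t n;

	do
		n = splice(fd_in, NULL, fd_out, NULL, len,
			   SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
	while (n < 0 && errno == EINTR);

	return n;	/* < 0 with errno == EAGAIN means "try again later" */
}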
diff --git a/tcp_vu.c b/tcp_vu.c
index 6891ed1..db9db78 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -35,7 +35,7 @@
#include "vu_common.h"
#include <time.h>
-static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
+static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM];
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
static int head[VIRTQUEUE_MAX_SIZE + 1];
@@ -43,7 +43,7 @@ static int head[VIRTQUEUE_MAX_SIZE + 1];
* tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP)
* @v6: Set for IPv6 packet
*
- * Return: Return the size of the header
+ * Return: size of the header
*/
static size_t tcp_vu_hdrlen(bool v6)
{
@@ -91,12 +91,12 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
vu_set_element(&flags_elem[0], NULL, &flags_iov[0]);
elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1,
- hdrlen + sizeof(struct tcp_syn_opts), NULL);
+ MAX(hdrlen + sizeof(*opts), ETH_ZLEN), NULL);
if (elem_cnt != 1)
return -1;
ASSERT(flags_elem[0].in_sg[0].iov_len >=
- hdrlen + sizeof(struct tcp_syn_opts));
+ MAX(hdrlen + sizeof(*opts), ETH_ZLEN));
vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1);
@@ -135,9 +135,11 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
- tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+ tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
NULL, seq, !*c->pcap);
+ vu_pad(&flags_elem[0].in_sg[0], hdrlen + optlen);
+
if (*c->pcap) {
pcap_iov(&flags_elem[0].in_sg[0], 1,
sizeof(struct virtio_net_hdr_mrg_rxbuf));
@@ -171,21 +173,23 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
/** tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers
* @c: Execution context
+ * @vq: virtqueue to use to receive data
* @conn: Connection pointer
* @v6: Set for IPv6 connections
* @already_sent: Number of bytes already sent
* @fillsize: Maximum bytes to fill in guest-side receiving window
* @iov_cnt: number of iov (output)
+ * @head_cnt: Pointer to store the count of head iov entries (output)
*
- * Return: Number of iov entries used to store the data or negative error code
+ * Return: number of bytes received from the socket, or a negative error code
+ * on failure.
*/
-static ssize_t tcp_vu_sock_recv(const struct ctx *c,
+static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
const struct tcp_tap_conn *conn, bool v6,
uint32_t already_sent, size_t fillsize,
int *iov_cnt, int *head_cnt)
{
- struct vu_dev *vdev = c->vdev;
- struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+ const struct vu_dev *vdev = c->vdev;
struct msghdr mh_sock = { 0 };
uint16_t mss = MSS_GET(conn);
int s = conn->sock;
@@ -198,7 +202,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
hdrlen = tcp_vu_hdrlen(v6);
- vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
+ vu_init_elem(elem, &iov_vu[DISCARD_IOV_NUM], VIRTQUEUE_MAX_SIZE);
elem_cnt = 0;
*head_cnt = 0;
@@ -209,7 +213,8 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
cnt = vu_collect(vdev, vq, &elem[elem_cnt],
VIRTQUEUE_MAX_SIZE - elem_cnt,
- MIN(mss, fillsize) + hdrlen, &frame_size);
+ MAX(MIN(mss, fillsize) + hdrlen, ETH_ZLEN),
+ &frame_size);
if (cnt == 0)
break;
@@ -226,16 +231,9 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
elem_cnt += cnt;
}
- if (peek_offset_cap) {
- mh_sock.msg_iov = iov_vu + 1;
- mh_sock.msg_iovlen = elem_cnt;
- } else {
- iov_vu[0].iov_base = tcp_buf_discard;
- iov_vu[0].iov_len = already_sent;
-
- mh_sock.msg_iov = iov_vu;
- mh_sock.msg_iovlen = elem_cnt + 1;
- }
+ if (tcp_prepare_iov(&mh_sock, iov_vu, already_sent, elem_cnt))
+ /* Expect caller to do a TCP reset */
+ return -1;
do
ret = recvmsg(s, &mh_sock, MSG_PEEK);
@@ -259,6 +257,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
len -= iov->iov_len;
}
+
/* adjust head count */
while (*head_cnt > 0 && head[*head_cnt - 1] >= i)
(*head_cnt)--;
@@ -306,14 +305,13 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
struct ethhdr *eh;
/* we guess the first iovec provided by the guest can embed
- * all the headers needed by L2 frame
+ * all the headers needed by the L2 frame, including any padding
*/
- ASSERT(iov[0].iov_len >= hdrlen);
+ ASSERT(iov[0].iov_len >= MAX(hdrlen, ETH_ZLEN));
eh = vu_eth(base);
memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
- memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
/* initialize header */
@@ -337,7 +335,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
th->ack = 1;
th->psh = push;
- tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+ tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
*check, conn->seq_to_tap, no_tcp_csum);
if (ip4h)
*check = &ip4h->check;
@@ -349,7 +347,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
* @c: Execution context
* @conn: Connection pointer
*
- * Return: Negative on connection reset, 0 otherwise
+ * Return: negative on connection reset, 0 otherwise
*/
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
@@ -376,7 +374,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
conn->seq_ack_from_tap, conn->seq_to_tap);
conn->seq_to_tap = conn->seq_ack_from_tap;
already_sent = 0;
- if (tcp_set_peek_offset(conn->sock, 0)) {
+ if (tcp_set_peek_offset(conn, 0)) {
tcp_rst(c, conn);
return -1;
}
@@ -396,7 +394,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
/* collect the buffers from vhost-user and fill them with the
* data from the socket
*/
- len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize,
+ len = tcp_vu_sock_recv(c, vq, conn, v6, already_sent, fillsize,
&iov_cnt, &head_cnt);
if (len < 0) {
if (len != -EAGAIN && len != -EWOULDBLOCK) {
@@ -415,7 +413,12 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
conn_flag(c, conn, STALLED);
} else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
SOCK_FIN_RCVD) {
- int ret = tcp_vu_send_flag(c, conn, FIN | ACK);
+ int ret;
+
+ /* See tcp_buf_data_from_sock() */
+ conn->seq_ack_to_tap = conn->seq_from_tap;
+
+ ret = tcp_vu_send_flag(c, conn, FIN | ACK);
if (ret) {
tcp_rst(c, conn);
return ret;
@@ -457,6 +460,9 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push);
+ /* Pad first/single buffer only, it's at least ETH_ZLEN long */
+ vu_pad(iov, dlen + hdrlen);
+
if (*c->pcap) {
pcap_iov(iov, buf_cnt,
sizeof(struct virtio_net_hdr_mrg_rxbuf));
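
The tcp_vu.c hunks above extend short frames to the minimum Ethernet frame length (ETH_ZLEN, 60 bytes excluding the FCS) by asking vu_collect() for at least ETH_ZLEN bytes and then calling vu_pad() on the first buffer; vu_pad() itself is not shown in this diff. The following is only a rough, self-contained sketch of the idea: the helper name and the assumption that the first buffer has room for ETH_ZLEN bytes (which the ASSERT() above guarantees) are illustrative, not passt's actual vu_pad().

    #include <string.h>
    #include <sys/uio.h>
    #include <linux/if_ether.h>	/* ETH_ZLEN */

    /* pad_frame() - Hypothetical zero-padding of a short frame up to ETH_ZLEN
     * @iov:  Buffer holding the frame, with room for at least ETH_ZLEN bytes
     * @flen: Current frame length (headers plus payload)
     */
    static void pad_frame(struct iovec *iov, size_t flen)
    {
        if (flen >= ETH_ZLEN)
            return;

        /* Zero out the tail of the buffer so the frame reaches ETH_ZLEN */
        memset((char *)iov->iov_base + flen, 0, ETH_ZLEN - flen);
        if (iov->iov_len < ETH_ZLEN)
            iov->iov_len = ETH_ZLEN;
    }
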
diff --git a/test/.gitignore b/test/.gitignore
index 3573444..9412f0d 100644
--- a/test/.gitignore
+++ b/test/.gitignore
@@ -11,3 +11,5 @@ nstool
rampstream
guest-key
guest-key.pub
+/exeter/
+*.bats
diff --git a/test/Makefile b/test/Makefile
index bf63db8..6ed233a 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -5,7 +5,11 @@
# Copyright Red Hat
# Author: David Gibson <david@gibson.dropbear.id.au>
+BATS = bats -j $(shell nproc)
+EXETOOL = exeter/exetool/exetool
WGET = wget -c
+FLAKE8 = flake8
+MYPY = mypy --strict
DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
debian-10-nocloud-amd64.qcow2 \
@@ -13,7 +17,7 @@ DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \
debian-10-generic-ppc64el-20220911-1135.qcow2 \
debian-11-nocloud-amd64.qcow2 \
debian-11-generic-arm64.qcow2 \
- debian-11-generic-ppc64el.qcow2 \
+ debian-11-generic-ppc64el-20250703-2162.qcow2 \
debian-sid-nocloud-amd64-daily.qcow2 \
debian-sid-nocloud-arm64-daily.qcow2 \
debian-sid-nocloud-ppc64el-daily.qcow2
@@ -50,18 +54,27 @@ UBUNTU_NEW_IMGS = xenial-server-cloudimg-powerpc-disk1.img \
jammy-server-cloudimg-s390x.img
UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS)
-DOWNLOAD_ASSETS = mbuto podman \
+DOWNLOAD_ASSETS = $(EXETOOL) mbuto podman \
$(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS)
TESTDATA_ASSETS = small.bin big.bin medium.bin \
rampstream
LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \
$(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \
$(UBUNTU_NEW_IMGS:%=prepared-%) \
- nstool guest-key guest-key.pub \
- $(TESTDATA_ASSETS)
+ nstool guest-key guest-key.pub $(TESTDATA_ASSETS)
ASSETS = $(DOWNLOAD_ASSETS) $(LOCAL_ASSETS)
+EXETER_PYPATH = exeter/py3
+EXETER_PYTHON = build/build.py
+EXETER_BATS = smoke/smoke.sh.bats \
+ $(EXETER_PYTHON:%=%.bats) build/static_checkers.sh.bats
+BATS_FILES = $(EXETER_BATS) \
+ podman/test/system/505-networking-pasta.bats
+
+# Python test code (for linters)
+PYPKGS = $(EXETER_PYTHON)
+
CFLAGS = -Wall -Werror -Wextra -pedantic -std=c99
assets: $(ASSETS)
@@ -70,6 +83,11 @@ assets: $(ASSETS)
pull-%: %
git -C $* pull
+exeter:
+ git clone https://gitlab.com/dgibson/exeter.git
+
+exeter/exetool/exetool: pull-exeter
+
mbuto:
git clone git://mbuto.sh/mbuto
@@ -115,6 +133,18 @@ medium.bin:
big.bin:
dd if=/dev/urandom bs=1M count=10 of=$@
+flake8: pull-exeter
+ PYTHONPATH=$(EXETER_PYPATH) $(FLAKE8) $(PYPKGS)
+
+mypy: pull-exeter
+ PYTHONPATH=$(EXETER_PYPATH) $(MYPY) $(PYPKGS)
+
+$(EXETER_BATS): %.bats: % $(EXETOOL)
+ PYTHONPATH=$(EXETER_PYPATH) $(EXETOOL) bats -- $< > $@
+
+bats: $(BATS_FILES) pull-podman
+ PYTHONPATH=$(EXETER_PYPATH) CONTAINERS_HELPER_BINARY_DIR=.. $(BATS) $(BATS_FILES)
+
check: assets
./run
@@ -123,7 +153,9 @@ debug: assets
clean:
rm -f perf.js *~
+ rm -rf .mypy_cache
rm -f $(LOCAL_ASSETS)
+ rm -f $(EXETER_BATS)
rm -rf test_logs
rm -f prepared-*.qcow2 prepared-*.img
@@ -132,79 +164,82 @@ realclean: clean
# Debian downloads
debian-8.11.0-openstack-%.qcow2:
- $(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
+ -$(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2
debian-10-nocloud-%.qcow2:
- $(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
+ -$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2
debian-10-generic-ppc64el-20220911-1135.qcow2:
- $(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/20220911-1135/debian-10-generic-ppc64el-20220911-1135.qcow2
+ -$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/20220911-1135/debian-10-generic-ppc64el-20220911-1135.qcow2
debian-10-generic-%.qcow2:
- $(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-generic-$*.qcow2
+ -$(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-generic-$*.qcow2
debian-11-nocloud-%.qcow2:
- $(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-nocloud-$*.qcow2
+ -$(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-nocloud-$*.qcow2
debian-11-generic-%.qcow2:
- $(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-generic-$*.qcow2
+ -$(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-generic-$*.qcow2
+
+debian-11-generic-ppc64el-20250703-2162.qcow2:
+ -$(WGET) -O $@ https://cloud.debian.org/images/cloud/bullseye/20250703-2162/debian-11-generic-ppc64el-20250703-2162.qcow2
debian-sid-nocloud-%-daily.qcow2:
- $(WGET) -O $@ https://cloud.debian.org/images/cloud/sid/daily/latest/debian-sid-nocloud-$*-daily.qcow2
+ -$(WGET) -O $@ https://cloud.debian.org/images/cloud/sid/daily/latest/debian-sid-nocloud-$*-daily.qcow2
# Fedora downloads
Fedora-Cloud-Base-26-1.5.%.qcow2:
- $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/26/CloudImages/$*/images/Fedora-Cloud-Base-26-1.5.$*.qcow2
+ -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/26/CloudImages/$*/images/Fedora-Cloud-Base-26-1.5.$*.qcow2
Fedora-Cloud-Base-27-1.6.%.qcow2:
- $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/27/CloudImages/$*/images/Fedora-Cloud-Base-27-1.6.$*.qcow2
+ -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/27/CloudImages/$*/images/Fedora-Cloud-Base-27-1.6.$*.qcow2
Fedora-Cloud-Base-28-1.1.%.qcow2:
- $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/28/Cloud/$*/images/Fedora-Cloud-Base-28-1.1.$*.qcow2
+ -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/28/Cloud/$*/images/Fedora-Cloud-Base-28-1.1.$*.qcow2
Fedora-Cloud-Base-29-1.2.%.qcow2:
- $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/29/Cloud/$*/images/Fedora-Cloud-Base-29-1.2.$*.qcow2
+ -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/29/Cloud/$*/images/Fedora-Cloud-Base-29-1.2.$*.qcow2
Fedora-Cloud-Base-30-1.2.%.qcow2:
- $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/30/Cloud/$*/images/Fedora-Cloud-Base-30-1.2.$*.qcow2
+ -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/30/Cloud/$*/images/Fedora-Cloud-Base-30-1.2.$*.qcow2
Fedora-Cloud-Base-31-1.9.%.qcow2:
- $(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/31/Cloud/$*/images/Fedora-Cloud-Base-31-1.9.$*.qcow2
+ -$(WGET) -O $@ http://archives.fedoraproject.org/pub/archive/fedora/linux/releases/31/Cloud/$*/images/Fedora-Cloud-Base-31-1.9.$*.qcow2
Fedora-Cloud-Base-32-1.6.%.qcow2:
- $(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/32/Cloud/$*/images/Fedora-Cloud-Base-32-1.6.$*.qcow2
+ -$(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/32/Cloud/$*/images/Fedora-Cloud-Base-32-1.6.$*.qcow2
Fedora-Cloud-Base-33-1.2.%.qcow2:
- $(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/33/Cloud/$*/images/Fedora-Cloud-Base-33-1.2.$*.qcow2
+ -$(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/33/Cloud/$*/images/Fedora-Cloud-Base-33-1.2.$*.qcow2
Fedora-Cloud-Base-34-1.2.%.qcow2:
- $(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/34/Cloud/$*/images/Fedora-Cloud-Base-34-1.2.$*.qcow2
+ -$(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/34/Cloud/$*/images/Fedora-Cloud-Base-34-1.2.$*.qcow2
Fedora-Cloud-Base-35-1.2.%.qcow2:
- $(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/35/Cloud/$*/images/Fedora-Cloud-Base-35-1.2.$*.qcow2
+ -$(WGET) -O $@ https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/35/Cloud/$*/images/Fedora-Cloud-Base-35-1.2.$*.qcow2
# OpenSuSE downloads
openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2:
- $(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.1/jeos/openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2
+ -$(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.1/jeos/openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2
openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2:
- $(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.2/appliances/openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2
+ -$(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.2/appliances/openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2
openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2:
- $(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.3/appliances/openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2
+ -$(WGET) -O $@ https://download.opensuse.org/distribution/leap/15.3/appliances/openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2
openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz:
- $(WGET) -O $@ http://download.opensuse.org/ports/aarch64/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz
+ -$(WGET) -O $@ http://download.opensuse.org/ports/aarch64/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz
openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz:
- $(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
+ -$(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz
# Ubuntu downloads
trusty-server-cloudimg-%-disk1.img:
- $(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img
+ -$(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img
xenial-server-cloudimg-powerpc-disk1.img:
- $(WGET) -O $@ https://cloud-images.ubuntu.com/xenial/current/xenial-server-cloudimg-powerpc-disk1.img
+ -$(WGET) -O $@ https://cloud-images.ubuntu.com/xenial/current/xenial-server-cloudimg-powerpc-disk1.img
jammy-server-cloudimg-s390x.img:
- $(WGET) -O $@ https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-s390x.img
+ -$(WGET) -O $@ https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-s390x.img
diff --git a/test/README.md b/test/README.md
index 91ca603..0df7533 100644
--- a/test/README.md
+++ b/test/README.md
@@ -32,7 +32,7 @@ Example for Debian, and possibly most Debian-based distributions:
git go iperf3 isc-dhcp-common jq libgpgme-dev libseccomp-dev linux-cpupower
lm-sensors lz4 netavark netcat-openbsd psmisc qemu-efi-aarch64
qemu-system-arm qemu-system-misc qemu-system-ppc qemu-system-x86
- qemu-system-x86 sipcalc socat strace tmux uidmap valgrind
+ sipcalc socat strace tmux uidmap valgrind
NOTE: the tests need a qemu version >= 7.2, or one that contains commit
13c6be96618c ("net: stream: add unix socket"): this change introduces support
@@ -81,7 +81,12 @@ The following additional packages are commonly needed:
## Regular test
-Just issue:
+Before running the tests, you need to prepare the required assets:
+
+ cd test
+ make assets
+
+Then issue:
./run
@@ -91,6 +96,32 @@ variable settings: DEBUG=1 enables debugging messages, TRACE=1 enables tracing
PCAP=1 TRACE=1 ./run
+**Note:**
+
+* Don't run the tests as root: the whole point of passt is not to run as root.
+
+* If you switch users before running the tests, you may hit a
+ "Permission denied" error. It's probably due to
+ [Bug 967509](https://bugzilla.redhat.com/show_bug.cgi?id=967509):
+ when switching users with `su` or `sudo`, the directory `/run/user/ID` may
+ not be created, and `XDG_RUNTIME_DIR` may still point to the /run/user
+ directory of the previous user rather than the target user.
+
+ **Workaround:** Log out and log back in as the intended user to ensure the
+ correct runtime directory is set up. Or use `machinectl shell --uid=$user`.
+
+* SELinux may prevent the tests from running correctly. To avoid this,
+ temporarily disable it by running:
+
+ setenforce 0
+
+* Some tests require a QEMU version >= 10.0.0, or a build that includes the
+ following commits:
+
+ 60f543ad917f ("virtio-net: vhost-user: Implement internal migration")
+ 3f65357313e0 ("vhost: Add stubs for the migration state transfer
+ interface")
+
## Running selected tests
Rudimentary support to run a list of selected tests, without support for
diff --git a/test/build/all b/test/build/all
deleted file mode 100644
index 1f79e0d..0000000
--- a/test/build/all
+++ /dev/null
@@ -1,61 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-# for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-# for network namespace/tap device mode
-#
-# test/build/all - Build targets, one by one, then all together, check output
-#
-# Copyright (c) 2021 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-htools make cc rm uname getconf mkdir cp rm man
-
-test Build passt
-host make clean
-check ! [ -e passt ]
-host CFLAGS="-Werror" make passt
-check [ -f passt ]
-
-test Build pasta
-host make clean
-check ! [ -e pasta ]
-host CFLAGS="-Werror" make pasta
-check [ -h pasta ]
-
-test Build qrap
-host make clean
-check ! [ -e qrap ]
-host CFLAGS="-Werror" make qrap
-check [ -f qrap ]
-
-test Build all
-host make clean
-check ! [ -e passt ]
-check ! [ -e pasta ]
-check ! [ -e qrap ]
-host CFLAGS="-Werror" make
-check [ -f passt ]
-check [ -h pasta ]
-check [ -f qrap ]
-
-test Install
-host mkdir __STATEDIR__/prefix
-host prefix=__STATEDIR__/prefix make install
-check [ -f __STATEDIR__/prefix/bin/passt ]
-check [ -h __STATEDIR__/prefix/bin/pasta ]
-check [ -f __STATEDIR__/prefix/bin/qrap ]
-check man -M __STATEDIR__/prefix/share/man -W passt
-check man -M __STATEDIR__/prefix/share/man -W pasta
-check man -M __STATEDIR__/prefix/share/man -W qrap
-
-test Uninstall
-host prefix=__STATEDIR__/prefix make uninstall
-check ! [ -f __STATEDIR__/prefix/bin/passt ]
-check ! [ -h __STATEDIR__/prefix/bin/pasta ]
-check ! [ -f __STATEDIR__/prefix/bin/qrap ]
-check ! man -M __STATEDIR__/prefix/share/man -W passt 2>/dev/null
-check ! man -M __STATEDIR__/prefix/share/man -W pasta 2>/dev/null
-check ! man -M __STATEDIR__/prefix/share/man -W qrap 2>/dev/null
diff --git a/test/build/build.py b/test/build/build.py
new file mode 100755
index 0000000..e3de830
--- /dev/null
+++ b/test/build/build.py
@@ -0,0 +1,110 @@
+#! /usr/bin/env python3
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/build/build.py - Test build and install targets
+#
+# Copyright Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
+
+import contextlib
+import os
+from pathlib import Path
+import subprocess
+import tempfile
+from typing import Iterator
+
+import exeter
+
+
+def sh(cmd: str) -> None:
+ """Run given command in a shell"""
+ subprocess.run(cmd, shell=True)
+
+
+@contextlib.contextmanager
+def clone_sources() -> Iterator[str]:
+ """Create a temporary copy of the passt sources.
+
+ When the context is entered, create a temporary directory and copy the
+ passt sources into it. Clean it up when the context exits.
+ """
+
+ os.chdir('..') # Move from test/ to repo base
+ with tempfile.TemporaryDirectory(ignore_cleanup_errors=False) as tmpdir:
+ sh(f"cp --parents -d $(git ls-files) {tmpdir}")
+ os.chdir(tmpdir)
+ yield tmpdir
+
+
+def test_make(target: str, expected_files: list[str]) -> None:
+ """Test `make {target}`
+
+ Arguments:
+ target -- make target to invoke
+ expected_files -- files make is expected to create
+
+ Verifies that
+ 1) `make target` completes successfully
+    2) expected_files are created by `make target`
+ 3) expected_files are removed by `make clean`
+ """
+
+ ex_paths = [Path(f) for f in expected_files]
+ with clone_sources():
+ for p in ex_paths:
+ assert not p.exists(), f"{p} existed before make"
+ sh(f'make {target} CFLAGS="-Werror"')
+ for p in ex_paths:
+ assert p.exists(), f"{p} wasn't made"
+ sh('make clean')
+ for p in ex_paths:
+ assert not p.exists(), f"{p} existed after make clean"
+
+
+exeter.register('make_passt', test_make, 'passt', ['passt'])
+exeter.register('make_pasta', test_make, 'pasta', ['pasta'])
+exeter.register('make_qrap', test_make, 'qrap', ['qrap'])
+exeter.register('make_all', test_make, 'all', ['passt', 'pasta', 'qrap'])
+
+
+@exeter.test
+def test_install_uninstall() -> None:
+ """Test `make install` and `make uninstall`
+
+ Tests that `make install` installs the expected files to the
+ install prefix, and that `make uninstall` removes them again.
+ """
+
+ with clone_sources():
+ with tempfile.TemporaryDirectory(ignore_cleanup_errors=False) \
+ as prefix:
+ bindir = Path(prefix) / 'bin'
+ mandir = Path(prefix) / 'share/man'
+ progs = ['passt', 'pasta', 'qrap']
+
+ # Install
+ sh(f'make install CFLAGS="-Werror" prefix={prefix}')
+
+ for prog in progs:
+ exe = bindir / prog
+ assert exe.is_file(), f"{exe} does not exist as a regular file"
+ sh(f'man -M {mandir} -W {prog}')
+
+ # Uninstall
+ sh(f'make uninstall prefix={prefix}')
+
+ for prog in progs:
+ exe = bindir / prog
+ assert not exe.exists(), f"{exe} exists after uninstall"
+ sh(f'! man -M {mandir} -W {prog}')
+
+
+if __name__ == '__main__':
+ exeter.main()
diff --git a/test/build/clang_tidy b/test/build/clang_tidy
deleted file mode 100644
index 40573bf..0000000
--- a/test/build/clang_tidy
+++ /dev/null
@@ -1,17 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-# for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-# for network namespace/tap device mode
-#
-# test/build/clang_tidy - Run source through clang-tidy(1) linter
-#
-# Copyright (c) 2021 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-htools clang-tidy
-
-test Run clang-tidy
-host make clang-tidy
diff --git a/test/build/cppcheck b/test/build/cppcheck
deleted file mode 100644
index 0e1dbce..0000000
--- a/test/build/cppcheck
+++ /dev/null
@@ -1,17 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-or-later
-#
-# PASST - Plug A Simple Socket Transport
-# for qemu/UNIX domain socket mode
-#
-# PASTA - Pack A Subtle Tap Abstraction
-# for network namespace/tap device mode
-#
-# test/build/cppcheck - Run source through cppcheck(1) linter
-#
-# Copyright (c) 2021 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
-
-htools cppcheck
-
-test Run cppcheck
-host make cppcheck
diff --git a/test/build/static_checkers.sh b/test/build/static_checkers.sh
new file mode 100755
index 0000000..96679fb
--- /dev/null
+++ b/test/build/static_checkers.sh
@@ -0,0 +1,42 @@
+#! /bin/sh
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/build/static_checkers.sh - Run static checkers
+#
+# Copyright Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
+
+. $(dirname ${0})/../exeter/sh/exeter.sh
+
+# do_check() - Run static checker as a test if the binary is available
+# $1: Static checker (uased as both executable name and make target)
+# $@: Any additional arguments required to make
+do_check() {
+ checker="${1}"
+ shift
+ if ! which "${checker}" >/dev/null 2>/dev/null; then
+ exeter_skip "${checker} not available"
+ fi
+ make "${@}" "${checker}"
+}
+
+exeter_register cppcheck do_check cppcheck -C ..
+exeter_set_description cppcheck "passt sources pass cppcheck"
+
+exeter_register clang_tidy do_check clang-tidy -C ..
+exeter_set_description clang_tidy "passt sources pass clang-tidy"
+
+exeter_register flake8 do_check flake8
+exeter_set_description flake8 "passt tests in Python pass flake8"
+
+exeter_register mypy do_check mypy
+exeter_set_description mypy "passt tests in Python pass mypy --strict"
+
+exeter_main "${@}"
diff --git a/test/demo/podman b/test/demo/podman
index edd403a..393691c 100644
--- a/test/demo/podman
+++ b/test/demo/podman
@@ -310,8 +310,8 @@ nl
say Everything is set now, let's start
sleep 2
hout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-hout ADDR4 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
-hout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local'
+hout ADDR4 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope != "host" and .scope != "link").local'
+hout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope != "host" and .scope != "link").local'
hout GW4 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
hout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/lib/exeter b/test/lib/exeter
new file mode 100644
index 0000000..ccdb19c
--- /dev/null
+++ b/test/lib/exeter
@@ -0,0 +1,66 @@
+#!/bin/sh
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/lib/exeter - Run exeter tests within the rest of passt's tests
+#
+# Copyright Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
+
+EXETOOL="${BASEPATH}/exeter/exetool/exetool"
+
+# is_exeter() - Determine if a test file is an exeter program
+# $@: Command line to invoke test program
+is_exeter() {
+ ${EXETOOL} probe -- "${@}"
+}
+
+# exeter() - Run each test in an exeter program, logging each test separately
+# $@: Command line to invoke exeter test program
+exeter() {
+ STATESETUP="${STATEBASE}/${1}"
+ mkdir -p "${STATESETUP}"
+
+ context_setup_host host
+ layout_host
+
+ cd test
+
+ __ntests=$(${EXETOOL} list -- "${@}" | wc -l)
+ if [ ${?} != 0 ]; then
+ info "Failed to get exeter manifest for ${@}"
+ pause_continue \
+ "Press any key to pause test session" \
+ "Resuming in " \
+ "Paused, press any key to continue" \
+ 5
+ return
+ fi
+
+ status_file_start "${*} (exeter)" ${__ntests}
+ [ ${CI} -eq 1 ] && video_link "${1}"
+
+ for __testid in $(${EXETOOL} list -- "${@}"); do
+ __desc="$(${EXETOOL} desc -- "${@}" -- "${__testid}")"
+ status_test_start "${__desc}"
+ status=0
+ context_run host "${*} '${__testid}'" || status="${?}"
+ if [ "${status}" = 0 ]; then
+ status_test_ok
+ elif [ "${status}" = 77 ]; then
+ status_test_skip
+ else
+ status_test_fail
+ fi
+ done
+
+ cd ..
+
+ teardown_context_watch ${PANE_HOST} host
+}
diff --git a/test/lib/setup b/test/lib/setup
index 575bc21..5994598 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -350,7 +350,7 @@ setup_migrate() {
sleep 1
- __opts="--vhost-user"
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
@@ -360,7 +360,7 @@ setup_migrate() {
context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
- __opts="--vhost-user"
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
diff --git a/test/lib/term b/test/lib/term
index ed690de..f596364 100755
--- a/test/lib/term
+++ b/test/lib/term
@@ -19,6 +19,7 @@ STATUS_FILE_INDEX=0
STATUS_COLS=
STATUS_PASS=0
STATUS_FAIL=0
+STATUS_SKIPPED=0
PR_RED='\033[1;31m'
PR_GREEN='\033[1;32m'
@@ -28,32 +29,32 @@ PR_NC='\033[0m'
PR_DELAY_INIT=100 # ms
# info() - Highlight test log pane, print message to it and to log file
-# $@: Message to print
+# $*: Message to print
info() {
tmux select-pane -t ${PANE_INFO}
- printf "${@}\n" >> $STATEBASE/log_pipe
- printf "${@}\n" >> "${LOGFILE}"
+ printf "%b\n" "${*}" >> $STATEBASE/log_pipe
+ printf "%b\n" "${*}" >> "${LOGFILE}"
}
# info_n() - Highlight, print message to pane and to log file without newline
-# $@: Message to print
+# $*: Message to print
info_n() {
tmux select-pane -t ${PANE_INFO}
- printf "${@}" >> $STATEBASE/log_pipe
- printf "${@}" >> "${LOGFILE}"
+ printf "%b" "${*}" >> $STATEBASE/log_pipe
+ printf "%b" "${*}" >> "${LOGFILE}"
}
# info_nolog() - Highlight test log pane, print message to it
-# $@: Message to print
+# $*: Message to print
info_nolog() {
tmux select-pane -t ${PANE_INFO}
- printf "${@}\n" >> $STATEBASE/log_pipe
+ printf "%b\n" "${*}" >> $STATEBASE/log_pipe
}
# info_nolog() - Print message to log file
-# $@: Message to print
+# $*: Message to print
log() {
- printf "${@}\n" >> "${LOGFILE}"
+ printf "%b\n" "${*}" >> "${LOGFILE}"
}
# info_nolog_n() - Send message to pane without highlighting it, without newline
@@ -362,8 +363,8 @@ status_test_start() {
info_check() {
switch_pane ${PANE_INFO}
- printf "${PR_YELLOW}?${PR_NC} ${@}" >> $STATEBASE/log_pipe
- printf "? ${@}" >> "${LOGFILE}"
+ printf "%b" "${PR_YELLOW}?${PR_NC} ${*}" >> $STATEBASE/log_pipe
+ printf "? %b" "${*}" >> "${LOGFILE}"
}
# info_check_passed() - Display and log a new line when a check passes
@@ -439,19 +440,21 @@ info_layout() {
# status_test_ok() - Update counter of passed tests, log and display message
status_test_ok() {
STATUS_PASS=$((STATUS_PASS + 1))
- tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
+ tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
info_passed
}
# status_test_fail() - Update counter of failed tests, log and display message
status_test_fail() {
STATUS_FAIL=$((STATUS_FAIL + 1))
- tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)"
+ tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
info_failed
}
# status_test_fail() - Update counter of failed tests, log and display message
status_test_skip() {
+ STATUS_SKIPPED=$((STATUS_SKIPPED + 1))
+ tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)"
info_skipped
}
@@ -704,7 +707,7 @@ term() {
tmux set window-status-current-style 'bg=colour1 fg=colour233 bold'
tmux set status-right '#(TZ="UTC" date -Iseconds)'
- tmux set status-right-length 50
+ tmux set status-right-length 64
tmux set status-right-style 'bg=colour1 fg=colour233 bold'
tmux set history-limit 500000
diff --git a/test/lib/test b/test/lib/test
index 758250a..7349674 100755
--- a/test/lib/test
+++ b/test/lib/test
@@ -20,10 +20,7 @@ test_iperf3s() {
__sctx="${1}"
__port="${2}"
- pane_or_context_run_bg "${__sctx}" \
- 'iperf3 -s -p'${__port}' & echo $! > s.pid' \
-
- sleep 1 # Wait for server to be ready
+ pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid'
}
# test_iperf3k() - Kill iperf3 server
@@ -31,7 +28,7 @@ test_iperf3s() {
test_iperf3k() {
__sctx="${1}"
- pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid'
+ pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)'
sleep 1 # Wait for kernel to free up ports
}
diff --git a/test/memory/passt b/test/memory/passt
index 7e45724..c5142ea 100644
--- a/test/memory/passt
+++ b/test/memory/passt
@@ -51,7 +51,7 @@ guest sed /proc/slabinfo -ne 's/^\([^ ]* *[^ ]* *[^ ]* *[^ ]*\).*/\\\1/p' > /tmp
guest kill \$(cat /tmp/pid)
guest diff -y --suppress-common-lines /tmp/meminfo.before /tmp/meminfo.after || :
guest nm -td -Sr --size-sort -P /bin/passt.avx2 | head -30 | tee /tmp/nm.size
-guest sed /proc/slabinfo -ne 's/\(.*<objsize>\).*$/\1/p' | tail -1; (diff -y --suppress-common-lines /tmp/slabinfo.before /tmp/slabinfo.after | sort -grk8)
+guest sed /proc/slabinfo -ne 's/\(.*<objsize>\).*$/\\\1/p' | tail -1; (diff -y --suppress-common-lines /tmp/slabinfo.before /tmp/slabinfo.after | sort -grk8)
endef
def summary
diff --git a/test/migrate/basic b/test/migrate/basic
index 3f11f7d..bab2d76 100644
--- a/test/migrate/basic
+++ b/test/migrate/basic
@@ -39,8 +39,8 @@ guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR1_6__"
test TCP/IPv4: guest1/guest2 > host
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/migrate/basic_fin b/test/migrate/basic_fin
index aa61ec5..1d92c92 100644
--- a/test/migrate/basic_fin
+++ b/test/migrate/basic_fin
@@ -39,8 +39,8 @@ guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR1_6__"
test TCP/IPv4: guest1, half-close, guest2 > host
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/migrate/iperf3_bidir6 b/test/migrate/iperf3_bidir6
index 4bfefb5..e95eee8 100644
--- a/test/migrate/iperf3_bidir6
+++ b/test/migrate/iperf3_bidir6
@@ -44,8 +44,8 @@ guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR1_6__"
test TCP/IPv6 host <-> guest flood, many flows, during migration
diff --git a/test/migrate/iperf3_in6 b/test/migrate/iperf3_in6
index 16cf504..0e863a4 100644
--- a/test/migrate/iperf3_in6
+++ b/test/migrate/iperf3_in6
@@ -44,8 +44,8 @@ guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR1_6__"
test TCP/IPv6 host to guest throughput during migration
diff --git a/test/migrate/iperf3_many_out6 b/test/migrate/iperf3_many_out6
index 88133f2..179e269 100644
--- a/test/migrate/iperf3_many_out6
+++ b/test/migrate/iperf3_many_out6
@@ -44,8 +44,8 @@ guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR1_6__"
test TCP/IPv6 guest to host flood, many flows, during migration
diff --git a/test/migrate/iperf3_out6 b/test/migrate/iperf3_out6
index 21fbfcd..20e6e95 100644
--- a/test/migrate/iperf3_out6
+++ b/test/migrate/iperf3_out6
@@ -44,8 +44,8 @@ guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR1_6__"
test TCP/IPv6 guest to host throughput during migration
diff --git a/test/migrate/rampstream_in b/test/migrate/rampstream_in
index df333ba..5212dfc 100644
--- a/test/migrate/rampstream_in
+++ b/test/migrate/rampstream_in
@@ -40,8 +40,8 @@ guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR1_6__"
test TCP/IPv4: sequence check, ramps, inbound
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/migrate/rampstream_out b/test/migrate/rampstream_out
index 8ed3229..897396d 100644
--- a/test/migrate/rampstream_out
+++ b/test/migrate/rampstream_out
@@ -40,8 +40,8 @@ guest1 /sbin/dhclient -6 __IFNAME1__
# Wait for DAD to complete on the DHCP address
guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR1_6__"
test TCP/IPv4: sequence check, ramps, outbound
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/passt.mbuto b/test/passt.mbuto
index 5e00132..598c254 100755
--- a/test/passt.mbuto
+++ b/test/passt.mbuto
@@ -28,9 +28,12 @@ KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}"
LINKS="${LINKS:-
ash,dash,bash /init
- ash,dash,bash /bin/sh}"
+ ash,dash,bash /bin/sh
+ sshd /usr/sbin/sshd
+ dhclient /usr/sbin/dhclient
+ sysctl /usr/sbin/sysctl}"
-DIRS="${DIRS} /tmp /usr/sbin /usr/share /var/log /var/lib /etc/ssh /run/sshd /root/.ssh"
+DIRS="${DIRS} /tmp /usr/sbin /usr/bin /usr/share /var/log /var/lib /etc/ssh /run/sshd /root/.ssh"
COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin rampstream,/bin/rampstream rampstream-check.sh,/bin/rampstream-check.sh"
@@ -61,7 +64,9 @@ set >> \$LOG
exit 0
EOF
chmod 755 /sbin/dhclient-script
- ln -s /bin /usr/bin
+ mv /bin/* /usr/bin || :
+ rm -rf /bin
+ ln -s /usr/bin /bin
ln -s /run /var/run
:> /etc/fstab
diff --git a/test/passt.mem.mbuto b/test/passt.mem.mbuto
index 532eae0..7554a43 100755
--- a/test/passt.mem.mbuto
+++ b/test/passt.mem.mbuto
@@ -12,7 +12,7 @@
PROGS="${PROGS:-ash,dash,bash chmod ip mount insmod mkdir ln cat chmod modprobe
grep mknod sed chown sleep bc ls ps mount unshare chroot cp kill diff
- head tail sort tr tee cut nm which switch_root}"
+ head tail sort tr tee cut nm which switch_root mv rm}"
KMODS="${KMODS:- dummy}"
@@ -22,12 +22,14 @@ LINKS="${LINKS:-
ash,dash,bash /init
ash,dash,bash /bin/sh}"
-DIRS="${DIRS} /tmp /sbin"
+DIRS="${DIRS} /tmp /sbin /usr/bin"
COPIES="${COPIES} ../passt.avx2,/bin/passt.avx2"
FIXUP="${FIXUP}"'
-ln -s /bin /usr/bin
+mv /bin/* /usr/bin || :
+rm -rf /bin
+ln -s /usr/bin /bin
chmod 777 /tmp
sh +m
'
diff --git a/test/passt/dhcp b/test/passt/dhcp
index 145f1ba..904faab 100644
--- a/test/passt/dhcp
+++ b/test/passt/dhcp
@@ -61,8 +61,8 @@ guest /sbin/dhclient -6 __IFNAME__
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR6__"
test DHCPv6: route
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/passt/ndp b/test/passt/ndp
index 516cd6b..80b72bb 100644
--- a/test/passt/ndp
+++ b/test/passt/ndp
@@ -25,9 +25,9 @@ check [ -n "__IFNAME__" ]
test SLAAC: prefix
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
gout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
-check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join("/64 ")'
+hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 | tr '\n' ' '
+check echo "__HOST_PREFIX6__" | grep -wq "__PREFIX6__"
test SLAAC: route
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/passt_in_ns/dhcp b/test/passt_in_ns/dhcp
index a38a690..9d89049 100644
--- a/test/passt_in_ns/dhcp
+++ b/test/passt_in_ns/dhcp
@@ -55,8 +55,8 @@ guest /sbin/dhclient -6 __IFNAME__
# Wait for DAD to complete
guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(",")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR6__"
test DHCPv6: route
gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/pasta/dhcp b/test/pasta/dhcp
index d4f3ad5..366935f 100644
--- a/test/pasta/dhcp
+++ b/test/pasta/dhcp
@@ -39,8 +39,8 @@ ns /sbin/dhclient -6 --no-pid __IFNAME__
ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ __ADDR6__ = __HOST_ADDR6__ ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR6__"
test DHCPv6: route
nsout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/pasta/ndp b/test/pasta/ndp
index 952c1ea..1d385c7 100644
--- a/test/pasta/ndp
+++ b/test/pasta/ndp
@@ -24,9 +24,9 @@ ns while ! ip -j -6 addr show dev __IFNAME__ | jq -e '.[].addr_info.[] | select(
test SLAAC: prefix
nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]'
nsout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4
-check [ "__PREFIX6__" = "__HOST_PREFIX6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local]| join("/64 ")'
+hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 | tr '\n' ' '
+check echo "__HOST_PREFIX6__" | grep -wq "__PREFIX6__"
test SLAAC: route
nsout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/test/pasta_options/log_to_file b/test/pasta_options/log_to_file
index 3ead06c..db78b04 100644
--- a/test/pasta_options/log_to_file
+++ b/test/pasta_options/log_to_file
@@ -30,19 +30,19 @@ endef
test Log creation
-set PORTS -t 10001,10002 -u 10001,10002
+set PORTS -t 10001,10002 -u 10001,10002 -T none -U none
set LOG_FILE __STATEDIR__/pasta.log
-passt ./pasta -l __LOG_FILE__ -- /bin/true
+passt ./pasta __PORTS__ -l __LOG_FILE__ -- /bin/true
check [ -s __LOG_FILE__ ]
test Log truncated on creation
-passt ./pasta -l __LOG_FILE__ -- /bin/true & wait
+passt ./pasta __PORTS__ -l __LOG_FILE__ -- /bin/true & wait
pout PID2 echo $!
check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$'
test Maximum log size
-passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done'
+passtb ./pasta __PORTS__ --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done'
sleep 1
flood_log_client
@@ -67,7 +67,7 @@ passt unshare -rUm
passt mkdir __STATEDIR__/t
passt mount -t tmpfs none __STATEDIR__/t
set LOG_FILE __STATEDIR__/t/log
-passt ./pasta --config-net -d -l __LOG_FILE__ --log-size $((100 * 1024))
+passt ./pasta __PORTS__ --config-net -d -l __LOG_FILE__ --log-size $((100 * 1024))
flood_log_server
flood_log_client
diff --git a/test/perf/passt_tcp b/test/perf/passt_tcp
index 5978c49..1a97a63 100644
--- a/test/perf/passt_tcp
+++ b/test/perf/passt_tcp
@@ -87,7 +87,7 @@ lat -
lat -
nsb tcp_crr --nolog -6
gout LAT tcp_crr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p'
-lat __LAT__ 500 400
+lat __LAT__ 550 450
tr TCP throughput over IPv4: guest to host
iperf3s ns 10002
@@ -137,7 +137,7 @@ lat -
lat -
nsb tcp_crr --nolog -4
gout LAT tcp_crr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p'
-lat __LAT__ 500 400
+lat __LAT__ 550 450
tr TCP throughput over IPv6: host to guest
iperf3s guest 10001
@@ -208,6 +208,6 @@ lat -
guestb tcp_crr --nolog -P 10001 -C 10011 -4
sleep 1
nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p'
-lat __LAT__ 500 300
+lat __LAT__ 500 350
te
diff --git a/test/perf/pasta_tcp b/test/perf/pasta_tcp
index bc0de3c..496d0fe 100644
--- a/test/perf/pasta_tcp
+++ b/test/perf/pasta_tcp
@@ -211,7 +211,7 @@ tr TCP throughput over IPv6: host to ns
iperf3s ns 10002
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]'
+nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope != "host" and .scope != "link").local] | .[0]'
bw -
bw -
bw -
diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp
index ab2f3e8..c51bb6c 100644
--- a/test/perf/pasta_udp
+++ b/test/perf/pasta_udp
@@ -39,7 +39,7 @@ iperf3s host 10003
# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header
iperf3 BW ns ::1 10003 __TIME__ __OPTS__ -b 5G -l 1452
-bw __BW__ 1.0 1.5
+bw __BW__ 0.8 1.2
iperf3 BW ns ::1 10003 __TIME__ __OPTS__ -b 10G -l 3972
bw __BW__ 1.2 1.8
iperf3 BW ns ::1 10003 __TIME__ __OPTS__ -b 30G -l 16336
@@ -64,7 +64,7 @@ iperf3s host 10003
# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header
iperf3 BW ns 127.0.0.1 10003 __TIME__ __OPTS__ -b 5G -l 1372
-bw __BW__ 1.0 1.5
+bw __BW__ 0.8 1.2
iperf3 BW ns 127.0.0.1 10003 __TIME__ __OPTS__ -b 10G -l 3972
bw __BW__ 1.2 1.8
iperf3 BW ns 127.0.0.1 10003 __TIME__ __OPTS__ -b 30G -l 16356
@@ -88,7 +88,7 @@ tr UDP throughput over IPv6: host to ns
iperf3s ns 10002
iperf3 BW host ::1 10002 __TIME__ __OPTS__ -b 5G -l 1452
-bw __BW__ 1.0 1.5
+bw __BW__ 0.8 1.2
iperf3 BW host ::1 10002 __TIME__ __OPTS__ -b 10G -l 3972
bw __BW__ 1.2 1.8
iperf3 BW host ::1 10002 __TIME__ __OPTS__ -b 30G -l 16336
@@ -111,7 +111,7 @@ lat __LAT__ 200 150
tr UDP throughput over IPv4: host to ns
iperf3s ns 10002
iperf3 BW host 127.0.0.1 10002 __TIME__ __OPTS__ -b 5G -l 1372
-bw __BW__ 1.0 1.5
+bw __BW__ 0.8 1.2
iperf3 BW host 127.0.0.1 10002 __TIME__ __OPTS__ -b 10G -l 3972
bw __BW__ 1.2 1.8
iperf3 BW host 127.0.0.1 10002 __TIME__ __OPTS__ -b 30G -l 16356
@@ -196,7 +196,7 @@ tr UDP throughput over IPv6: host to ns
iperf3s ns 10002
nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
-nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]'
+nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope != "host" and .scope != "link").local] | .[0]'
iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472
bw __BW__ 0.3 0.5
iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972
diff --git a/test/prepare-distro-img.sh b/test/prepare-distro-img.sh
index 0d967c9..be2386e 100755
--- a/test/prepare-distro-img.sh
+++ b/test/prepare-distro-img.sh
@@ -3,6 +3,10 @@
IMG="$1"
PASST_FILES="$(echo ../*.c ../*.h ../*.sh ../*.1 ../Makefile ../README.md)"
+# This is just a workaround for Fedora and related distributions.
+# Once it gets fixed, we can drop this.
+export LIBGUESTFS_BACKEND=direct
+
virt-edit -a $IMG /lib/systemd/system/serial-getty@.service -e 's/ExecStart=.*/ExecStart=\/sbin\/agetty --autologin root -8 --keep-baud 115200,38400,9600 %I $TERM/g'
guestfish --rw -a $IMG -i <<EOF
diff --git a/test/run b/test/run
index 4e86f30..f858e55 100755
--- a/test/run
+++ b/test/run
@@ -43,6 +43,9 @@ KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"}
COMMIT="$(git log --oneline --no-decorate -1)"
+# Let exeter tests written in Python find their modules
+export PYTHONPATH=${BASEPATH}/exeter/py3
+
. lib/util
. lib/context
. lib/setup
@@ -53,6 +56,7 @@ COMMIT="$(git log --oneline --no-decorate -1)"
. lib/layout_ugly
. lib/test
. lib/video
+. lib/exeter
# cleanup() - Remove temporary files
cleanup() {
@@ -67,11 +71,9 @@ run() {
perf_init
[ ${CI} -eq 1 ] && video_start ci
- setup build
- test build/all
- test build/cppcheck
- test build/clang_tidy
- teardown build
+ exeter smoke/smoke.sh
+ exeter build/build.py
+ exeter build/static_checkers.sh
setup pasta
test pasta/ndp
@@ -202,7 +204,7 @@ skip_distro() {
perf_finish
[ ${CI} -eq 1 ] && video_stop
- log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
+ log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"
pause_continue \
"Press any key to keep test session open" \
@@ -223,6 +225,10 @@ run_selected() {
__setup=
for __test; do
+ if is_exeter "test/${__test}"; then
+ exeter "${__test}"
+ continue
+ fi
# HACK: the migrate tests need the setup repeated for
# each test
if [ "${__test%%/*}" != "${__setup}" -o \
@@ -234,9 +240,9 @@ run_selected() {
test "${__test}"
done
- teardown "${__setup}"
+ [ -n "${__setup}" ] && teardown "${__setup}"
- log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}"
+ log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}"
pause_continue \
"Press any key to keep test session open" \
@@ -307,4 +313,4 @@ fi
tail -n1 ${LOGFILE}
echo "Log at ${LOGFILE}"
-exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\)$/\1/p')
+exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\),.*$/\1/p')
diff --git a/test/smoke/smoke.sh b/test/smoke/smoke.sh
new file mode 100755
index 0000000..a642fb9
--- /dev/null
+++ b/test/smoke/smoke.sh
@@ -0,0 +1,33 @@
+#! /bin/sh
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/smoke/smoke.sh - Basic smoke tests
+#
+# Copyright Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
+
+. $(dirname $0)/../exeter/sh/exeter.sh
+
+PASST=$(dirname $0)/../../passt
+PASTA=$(dirname $0)/../../pasta
+
+exeter_register passt_version $PASST --version
+exeter_set_description passt_version "Check passt --version works"
+
+exeter_register pasta_version $PASTA --version
+exeter_set_description pasta_version "Check pasta --version works"
+
+exeter_register passt_help $PASST --help
+exeter_set_description passt_help "Check passt --help works"
+
+exeter_register pasta_help $PASTA --help
+exeter_set_description pasta_help "Check pasta --help works"
+
+exeter_main "$@"
diff --git a/test/two_guests/basic b/test/two_guests/basic
index e2338ff..cb48bce 100644
--- a/test/two_guests/basic
+++ b/test/two_guests/basic
@@ -45,9 +45,9 @@ guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1;
guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
g2out ADDR2_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME2__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
-hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
-check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
-check [ "__ADDR2_6__" = "__HOST_ADDR6__" ]
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope != "host" and .scope != "link" and .deprecated != true).local] | join(" ")'
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR1_6__"
+check echo "__HOST_ADDR6__" | grep -wq "__ADDR2_6__"
test TCP/IPv4: guest 1 > guest 2
g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
diff --git a/udp.c b/udp.c
index 80520cb..08bec50 100644
--- a/udp.c
+++ b/udp.c
@@ -39,27 +39,30 @@
* could receive packets from multiple flows, so we use a hash table match to
* find the specific flow for a datagram.
*
- * When a UDP flow is initiated from a listening socket we take a duplicate of
- * the socket and store it in uflow->s[INISIDE]. This will last for the
- * lifetime of the flow, even if the original listening socket is closed due to
- * port auto-probing. The duplicate is used to deliver replies back to the
- * originating side.
- *
- * Reply sockets
- * =============
+ * Flow sockets
+ * ============
*
- * When a UDP flow targets a socket, we create a "reply" socket in
+ * When a UDP flow targets a socket, we create a "flow" socket in
* uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
* replies on the target side. This socket is both bound and connected and has
- * EPOLL_TYPE_UDP_REPLY. The connect() means it will only receive datagrams
+ * EPOLL_TYPE_UDP. The connect() means it will only receive datagrams
* associated with this flow, so the epoll reference directly points to the flow
* and we don't need a hash lookup.
*
- * NOTE: it's possible that the reply socket could have a bound address
- * overlapping with an unrelated listening socket. We assume datagrams for the
- * flow will come to the reply socket in preference to a listening socket. The
- * sample program doc/platform-requirements/reuseaddr-priority.c documents and
- * tests that assumption.
+ * When a flow is initiated from a listening socket, we create a "flow" socket
+ * with the same bound address as the listening socket, but also connect()ed to
+ * the flow's peer. This is stored in uflow->s[INISIDE] and will last for the
+ * lifetime of the flow, even if the original listening socket is closed due to
+ * port auto-probing. This flow socket is used to deliver replies back to the
+ * originating side.
+ *
+ * NOTE: A flow socket can have a bound address overlapping with a listening
+ * socket. That will happen naturally for flows initiated from a socket, but is
+ * also possible (though unlikely) for tap initiated flows, depending on the
+ * source port. We assume datagrams for the flow will come to a connect()ed
+ * socket in preference to a listening socket. The sample program
+ * doc/platform-requirements/reuseaddr-priority.c documents and tests that
+ * assumption.
*
* "Spliced" flows
* ===============
@@ -71,8 +74,7 @@
* actually used; it doesn't make sense for datagrams and instead a pair of
* recvmmsg() and sendmmsg() is used to forward the datagrams.
*
- * Note that a spliced flow will have *both* a duplicated listening socket and a
- * reply socket (see above).
+ * Note that a spliced flow will have two flow sockets (see above).
*/
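
A minimal sketch of the scheme described above (illustration only, not passt's
actual helpers flowside_sock_l4() and flowside_connect()): a flow socket shares
the listening socket's bound address, which needs SO_REUSEADDR (the delivery
preference this relies on is what reuseaddr-priority.c tests), and is then
connect()ed to the peer, so the kernel hands this flow's datagrams to it rather
than to the listener.

	/* Needs <sys/socket.h>, <netinet/in.h>, <unistd.h> */
	static int flow_sock_sketch(const struct sockaddr_in *local,
				    const struct sockaddr_in *peer)
	{
		int one = 1;
		int s = socket(AF_INET, SOCK_DGRAM | SOCK_CLOEXEC, 0);

		if (s < 0)
			return -1;

		/* Allow bind() to overlap with the listening socket */
		if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
			       &one, sizeof(one)) < 0 ||
		    bind(s, (const struct sockaddr *)local,
			 sizeof(*local)) < 0 ||
		    connect(s, (const struct sockaddr *)peer,
			    sizeof(*peer)) < 0) {
			close(s);
			return -1;
		}

		return s;	/* now only receives datagrams from @peer */
	}
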
#include <sched.h>
@@ -92,7 +94,6 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
-#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
@@ -113,6 +114,9 @@
#include "flow_table.h"
#include "udp_internal.h"
#include "udp_vu.h"
+#include "epoll_ctl.h"
+
+#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
/* Maximum UDP data to be returned in ICMP messages */
#define ICMP4_MAX_DLEN 8
@@ -129,46 +133,50 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
/* UDP header and data for inbound messages */
static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
-/* Ethernet header for IPv4 frames */
-static struct ethhdr udp4_eth_hdr;
-
-/* Ethernet header for IPv6 frames */
-static struct ethhdr udp6_eth_hdr;
+/* Ethernet headers for IPv4 and IPv6 frames */
+static struct ethhdr udp_eth_hdr[UDP_MAX_FRAMES];
/**
- * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets
+ * struct udp_meta_t - Pre-cooked headers for UDP packets
* @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
* @taph: Tap backend specific header
- * @s_in: Source socket address, filled in by recvmmsg()
- * @tosidx: sidx for the destination side of this datagram's flow
*/
static struct udp_meta_t {
struct ipv6hdr ip6h;
struct iphdr ip4h;
struct tap_hdr taph;
-
- union sockaddr_inany s_in;
- flow_sidx_t tosidx;
}
#ifdef __AVX2__
__attribute__ ((aligned(32)))
#endif
udp_meta[UDP_MAX_FRAMES];
+#define PKTINFO_SPACE \
+ MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \
+ CMSG_SPACE(sizeof(struct in6_pktinfo)))
+
+#define RECVERR_SPACE \
+ MAX(CMSG_SPACE(sizeof(struct sock_extended_err) + \
+ sizeof(struct sockaddr_in)), \
+ CMSG_SPACE(sizeof(struct sock_extended_err) + \
+ sizeof(struct sockaddr_in6)))
+
/**
* enum udp_iov_idx - Indices for the buffers making up a single UDP frame
- * @UDP_IOV_TAP tap specific header
- * @UDP_IOV_ETH Ethernet header
- * @UDP_IOV_IP IP (v4/v6) header
- * @UDP_IOV_PAYLOAD IP payload (UDP header + data)
- * @UDP_NUM_IOVS the number of entries in the iovec array
+ * @UDP_IOV_TAP tap specific header
+ * @UDP_IOV_ETH Ethernet header
+ * @UDP_IOV_IP IP (v4/v6) header
+ * @UDP_IOV_PAYLOAD IP payload (UDP header + data)
+ * @UDP_IOV_ETH_PAD Ethernet (802.3) padding to 60 bytes
+ * @UDP_NUM_IOVS the number of entries in the iovec array
*/
enum udp_iov_idx {
UDP_IOV_TAP,
UDP_IOV_ETH,
UDP_IOV_IP,
UDP_IOV_PAYLOAD,
+ UDP_IOV_ETH_PAD,
UDP_NUM_IOVS,
};
@@ -201,12 +209,13 @@ void udp_portmap_clear(void)
/**
* udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void udp_update_l2_buf(const unsigned char *eth_d)
{
- eth_update_mac(&udp4_eth_hdr, eth_d, eth_s);
- eth_update_mac(&udp6_eth_hdr, eth_d, eth_s);
+ int i;
+
+ for (i = 0; i < UDP_MAX_FRAMES; i++)
+ eth_update_mac(&udp_eth_hdr[i], eth_d, NULL);
}
/**
@@ -229,11 +238,11 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
*siov = IOV_OF_LVALUE(payload->data);
+ tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp_eth_hdr[i]);
tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
tiov[UDP_IOV_PAYLOAD].iov_base = payload;
+ tiov[UDP_IOV_ETH_PAD].iov_base = eth_pad;
- mh->msg_name = &meta->s_in;
- mh->msg_namelen = sizeof(meta->s_in);
mh->msg_iov = siov;
mh->msg_iovlen = 1;
}
@@ -246,49 +255,11 @@ static void udp_iov_init(const struct ctx *c)
{
size_t i;
- udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP);
- udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6);
-
for (i = 0; i < UDP_MAX_FRAMES; i++)
udp_iov_init_one(c, i);
}
/**
- * udp_splice_prepare() - Prepare one datagram for splicing
- * @mmh: Receiving mmsghdr array
- * @idx: Index of the datagram to prepare
- */
-static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx)
-{
- udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len;
-}
-
-/**
- * udp_splice_send() - Send a batch of datagrams from socket to socket
- * @c: Execution context
- * @start: Index of batch's first datagram in udp[46]_l2_buf
- * @n: Number of datagrams in batch
- * @src: Source port for datagram (target side)
- * @dst: Destination port for datagrams (target side)
- * @ref: epoll reference for origin socket
- * @now: Timestamp
- */
-static void udp_splice_send(const struct ctx *c, size_t start, size_t n,
- flow_sidx_t tosidx)
-{
- const struct flowside *toside = flowside_at_sidx(tosidx);
- const struct udp_flow *uflow = udp_at_sidx(tosidx);
- uint8_t topif = pif_at_sidx(tosidx);
- int s = uflow->s[tosidx.sidei];
- socklen_t sl;
-
- pif_sockaddr(c, &udp_splice_to, &sl, topif,
- &toside->eaddr, toside->eport);
-
- sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL);
-}
-
-/**
* udp_update_hdr4() - Update headers for one IPv4 datagram
* @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
* @bp: Pointer to udp_payload_t to update
@@ -377,41 +348,68 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
}
/**
+ * udp_tap_pad() - Calculate padding to send from the zero-filled pad buffer
+ * @iov: Pointer to iovec of frame parts we're about to send
+ */
+static void udp_tap_pad(struct iovec *iov)
+{
+ size_t l2len = iov[UDP_IOV_ETH].iov_len +
+ iov[UDP_IOV_IP].iov_len +
+ iov[UDP_IOV_PAYLOAD].iov_len;
+
+ if (l2len < ETH_ZLEN)
+ iov[UDP_IOV_ETH_PAD].iov_len = ETH_ZLEN - l2len;
+ else
+ iov[UDP_IOV_ETH_PAD].iov_len = 0;
+}
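
As a worked example of the calculation above: with the usual header sizes, an
IPv4 datagram with an empty UDP payload gives l2len = 14 (Ethernet) + 20 (IPv4)
+ 8 (UDP) = 42 bytes, so the UDP_IOV_ETH_PAD entry gets iov_len = ETH_ZLEN - 42
= 18 bytes drawn from the zero-filled eth_pad buffer; any frame already at 60
bytes or longer gets a zero-length padding entry.
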
+
+/**
* udp_tap_prepare() - Convert one datagram into a tap frame
* @mmh: Receiving mmsghdr array
* @idx: Index of the datagram to prepare
+ * @tap_omac: MAC address of remote endpoint as seen from the guest
* @toside: Flowside for destination side
* @no_udp_csum: Do not set UDP checksum
*/
static void udp_tap_prepare(const struct mmsghdr *mmh,
- unsigned idx, const struct flowside *toside,
+ unsigned int idx,
+ const uint8_t *tap_omac,
+ const struct flowside *toside,
bool no_udp_csum)
{
struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
+ struct ethhdr *eh = (*tap_iov)[UDP_IOV_ETH].iov_base;
struct udp_payload_t *bp = &udp_payload[idx];
struct udp_meta_t *bm = &udp_meta[idx];
- size_t l4len;
+ size_t l4len, l2len;
+ eth_update_mac(eh, NULL, tap_omac);
if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
- sizeof(udp6_eth_hdr));
- (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
+
+ l2len = MAX(l4len + sizeof(bm->ip6h) + ETH_HLEN, ETH_ZLEN);
+ tap_hdr_update(&bm->taph, l2len);
+
+ eh->h_proto = htons_constant(ETH_P_IPV6);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
} else {
l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
- sizeof(udp4_eth_hdr));
- (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
+
+ l2len = MAX(l4len + sizeof(bm->ip4h) + ETH_HLEN, ETH_ZLEN);
+ tap_hdr_update(&bm->taph, l2len);
+
+ eh->h_proto = htons_constant(ETH_P_IP);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h);
}
(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
+
+ udp_tap_pad(*tap_iov);
}
/**
- * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer
+ * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
* @c: Execution context
* @ee: Extended error descriptor
* @toside: Destination side of flow
@@ -419,16 +417,18 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
* @in: First bytes (max 8) of original UDP message body
* @dlen: Length of the read part of original UDP message body
*/
-static void udp_send_conn_fail_icmp4(const struct ctx *c,
- const struct sock_extended_err *ee,
- const struct flowside *toside,
- struct in_addr saddr,
- const void *in, size_t dlen)
+static void udp_send_tap_icmp4(const struct ctx *c,
+ const struct sock_extended_err *ee,
+ const struct flowside *toside,
+ struct in_addr saddr,
+ const void *in, size_t dlen)
{
struct in_addr oaddr = toside->oaddr.v4mapped.a4;
struct in_addr eaddr = toside->eaddr.v4mapped.a4;
in_port_t eport = toside->eport;
in_port_t oport = toside->oport;
+ union inany_addr saddr_any;
+ uint8_t tap_omac[ETH_ALEN];
struct {
struct icmphdr icmp4h;
struct iphdr ip4h;
@@ -450,12 +450,15 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
memcpy(&msg.data, in, dlen);
- tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
+ /* Try to obtain the MAC address of the generating node */
+ saddr_any = inany_from_v4(saddr);
+ fwd_neigh_mac_get(c, &saddr_any, tap_omac);
+ tap_icmp4_send(c, saddr, eaddr, &msg, tap_omac, msglen);
}
/**
- * udp_send_conn_fail_icmp6() - Construct and send ICMPv6 to local peer
+ * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
* @c: Execution context
* @ee: Extended error descriptor
* @toside: Destination side of flow
@@ -464,16 +467,17 @@ static void udp_send_conn_fail_icmp4(const struct ctx *c,
* @dlen: Length of the read part of original UDP message body
* @flow: IPv6 flow identifier
*/
-static void udp_send_conn_fail_icmp6(const struct ctx *c,
- const struct sock_extended_err *ee,
- const struct flowside *toside,
- const struct in6_addr *saddr,
- void *in, size_t dlen, uint32_t flow)
+static void udp_send_tap_icmp6(const struct ctx *c,
+ const struct sock_extended_err *ee,
+ const struct flowside *toside,
+ const struct in6_addr *saddr,
+ void *in, size_t dlen, uint32_t flow)
{
const struct in6_addr *oaddr = &toside->oaddr.a6;
const struct in6_addr *eaddr = &toside->eaddr.a6;
in_port_t eport = toside->eport;
in_port_t oport = toside->oport;
+ uint8_t tap_omac[ETH_ALEN];
struct {
struct icmp6_hdr icmp6h;
struct ipv6hdr ip6h;
@@ -495,39 +499,89 @@ static void udp_send_conn_fail_icmp6(const struct ctx *c,
tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
memcpy(&msg.data, in, dlen);
- tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
+ /* Try to obtain the MAC address of the generating node */
+ fwd_neigh_mac_get(c, (union inany_addr *) saddr, tap_omac);
+ tap_icmp6_send(c, saddr, eaddr, &msg, tap_omac, msglen);
+}
+
+/**
+ * udp_pktinfo() - Retrieve packet destination address from cmsg
+ * @msg: msghdr into which message has been received
+ * @dst: (Local) destination address of message in @msg (output)
+ *
+ * Return: 0 on success, -1 if the information was missing (@dst is set to
+ * inany_any6).
+ */
+static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
+{
+ struct cmsghdr *hdr;
+
+ for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) {
+ if (hdr->cmsg_level == IPPROTO_IP &&
+ hdr->cmsg_type == IP_PKTINFO) {
+ const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr);
+
+ *dst = inany_from_v4(i4->ipi_addr);
+ return 0;
+ }
+
+ if (hdr->cmsg_level == IPPROTO_IPV6 &&
+ hdr->cmsg_type == IPV6_PKTINFO) {
+ const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr);
+
+ dst->a6 = i6->ipi6_addr;
+ return 0;
+ }
+ }
+
+ debug("Missing PKTINFO cmsg on datagram");
+ *dst = inany_any6;
+ return -1;
}
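
The control messages parsed here are only delivered if packet-info reception is
enabled on the socket; the sketch below shows the relevant options (illustration
only, the actual setup is presumably done where passt creates its listening
sockets, outside this hunk).

	/* Request IP_PKTINFO / IPV6_PKTINFO cmsgs on UDP socket @s */
	static int enable_pktinfo(int s, sa_family_t af)
	{
		int on = 1;

		if (af == AF_INET)
			return setsockopt(s, IPPROTO_IP, IP_PKTINFO,
					  &on, sizeof(on));

		/* Reception is requested with IPV6_RECVPKTINFO; the cmsg
		 * delivered is IPV6_PKTINFO
		 */
		return setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				  &on, sizeof(on));
	}
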
/**
* udp_sock_recverr() - Receive and clear an error from a socket
* @c: Execution context
- * @ref: epoll reference
+ * @s: Socket to receive errors from
+ * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif: Interface on which the error occurred
+ * (only used if @sidx == FLOW_SIDX_NONE)
+ * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
*
* Return: 1 if error received and processed, 0 if no more errors in queue, < 0
* if there was an error reading the queue
*
* #syscalls recvmsg
*/
-static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
+static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
+ uint8_t pif, in_port_t port)
{
+ char buf[PKTINFO_SPACE + RECVERR_SPACE];
const struct sock_extended_err *ee;
- const struct cmsghdr *hdr;
- union sockaddr_inany saddr;
- char buf[CMSG_SPACE(sizeof(*ee))];
char data[ICMP6_MAX_DLEN];
- int s = ref.fd;
+ struct cmsghdr *hdr;
struct iovec iov = {
.iov_base = data,
.iov_len = sizeof(data)
};
+ union sockaddr_inany src;
struct msghdr mh = {
- .msg_name = &saddr,
- .msg_namelen = sizeof(saddr),
+ .msg_name = &src,
+ .msg_namelen = sizeof(src),
.msg_iov = &iov,
.msg_iovlen = 1,
.msg_control = buf,
.msg_controllen = sizeof(buf),
};
+ const struct flowside *fromside, *toside;
+ union inany_addr offender, otap;
+ char astr[INANY_ADDRSTRLEN];
+ char sastr[SOCKADDR_STRLEN];
+ const struct in_addr *o4;
+ in_port_t offender_port;
+ struct udp_flow *uflow;
+ uint8_t topif;
+ size_t dlen;
ssize_t rc;
rc = recvmsg(s, &mh, MSG_ERRQUEUE);
@@ -544,61 +598,111 @@ static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
return -1;
}
- hdr = CMSG_FIRSTHDR(&mh);
- if (!((hdr->cmsg_level == IPPROTO_IP &&
- hdr->cmsg_type == IP_RECVERR) ||
- (hdr->cmsg_level == IPPROTO_IPV6 &&
- hdr->cmsg_type == IPV6_RECVERR))) {
- err("Unexpected cmsg reading error queue");
+ for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
+ if ((hdr->cmsg_level == IPPROTO_IP &&
+ hdr->cmsg_type == IP_RECVERR) ||
+ (hdr->cmsg_level == IPPROTO_IPV6 &&
+ hdr->cmsg_type == IPV6_RECVERR))
+ break;
+ }
+
+ if (!hdr) {
+ err("Missing RECVERR cmsg in error queue");
return -1;
}
ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
- if (ref.type == EPOLL_TYPE_UDP_REPLY) {
- flow_sidx_t sidx = flow_sidx_opposite(ref.flowside);
- const struct flowside *toside = flowside_at_sidx(sidx);
- size_t dlen = rc;
-
- if (hdr->cmsg_level == IPPROTO_IP) {
- dlen = MIN(dlen, ICMP4_MAX_DLEN);
- udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr,
- data, dlen);
- } else if (hdr->cmsg_level == IPPROTO_IPV6) {
- udp_send_conn_fail_icmp6(c, ee, toside,
- &saddr.sa6.sin6_addr,
- data, dlen, sidx.flowi);
+
+ debug("%s error on UDP socket %i: %s",
+ str_ee_origin(ee), s, strerror_(ee->ee_errno));
+
+ if (!flow_sidx_valid(sidx)) {
+ /* No hint from the socket, determine flow from addresses */
+ union inany_addr dst;
+
+ if (udp_pktinfo(&mh, &dst) < 0) {
+ debug("Missing PKTINFO on UDP error");
+ return 1;
+ }
+
+ sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port);
+ if (!flow_sidx_valid(sidx)) {
+ debug("Ignoring UDP error without flow");
+ return 1;
}
} else {
- trace("Ignoring received IP_RECVERR cmsg on listener socket");
+ pif = pif_at_sidx(sidx);
+ }
+
+ uflow = udp_at_sidx(sidx);
+ ASSERT(uflow);
+ fromside = &uflow->f.side[sidx.sidei];
+ toside = &uflow->f.side[!sidx.sidei];
+ topif = uflow->f.pif[!sidx.sidei];
+ dlen = rc;
+
+ if (inany_from_sockaddr(&offender, &offender_port,
+ SO_EE_OFFENDER(ee)) < 0)
+ goto fail;
+
+ if (pif != PIF_HOST || topif != PIF_TAP)
+ /* XXX Can we support any other cases? */
+ goto fail;
+
+ /* If the offender *is* the endpoint, make sure our translation is
+ * consistent with the flow's translation. This matters if the flow
+ * endpoint has a port specific translation (like --dns-match).
+ */
+ if (inany_equals(&offender, &fromside->eaddr))
+ otap = toside->oaddr;
+ else if (!nat_inbound(c, &offender, &otap))
+ goto fail;
+
+ if (hdr->cmsg_level == IPPROTO_IP &&
+ (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) {
+ dlen = MIN(dlen, ICMP4_MAX_DLEN);
+ udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
+ return 1;
+ }
+
+ if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
+ udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen,
+ FLOW_IDX(uflow));
+ return 1;
}
- debug("%s error on UDP socket %i: %s",
- str_ee_origin(ee), s, strerror_(ee->ee_errno));
+fail:
+ flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s",
+ str_ee_origin(ee),
+ pif_name(pif),
+ sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)),
+ pif_name(topif),
+ inany_ntop(&toside->eaddr, astr, sizeof(astr)));
return 1;
}
/**
* udp_sock_errs() - Process errors on a socket
* @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
+ * @s: Socket to receive errors from
+ * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif: Interface on which the error occurred
+ * (only used if @sidx == FLOW_SIDX_NONE)
+ * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
*
- * Return: Number of errors handled, or < 0 if we have an unrecoverable error
+ * Return: number of errors handled, or < 0 if we have an unrecoverable error
*/
-int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
+static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx,
+ uint8_t pif, in_port_t port)
{
unsigned n_err = 0;
socklen_t errlen;
- int s = ref.fd;
int rc, err;
ASSERT(!c->no_udp);
- if (!(events & EPOLLERR))
- return 0; /* Nothing to do */
-
/* Empty the error queue */
- while ((rc = udp_sock_recverr(c, ref)) > 0)
+ while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0)
n_err += rc;
if (rc < 0)
@@ -626,36 +730,61 @@ int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
}
/**
+ * udp_peek_addr() - Get source address for next packet
+ * @s: Socket to get information from
+ * @src: Socket address (output)
+ * @dst: (Local) destination address (output)
+ *
+ * Return: 0 if no more packets, 1 on success, -ve error code on error
+ */
+static int udp_peek_addr(int s, union sockaddr_inany *src,
+ union inany_addr *dst)
+{
+ char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN];
+ char cmsg[PKTINFO_SPACE];
+ struct msghdr msg = {
+ .msg_name = src,
+ .msg_namelen = sizeof(*src),
+ .msg_control = cmsg,
+ .msg_controllen = sizeof(cmsg),
+ };
+ int rc;
+
+ rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
+ if (rc < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ return 0;
+ return -errno;
+ }
+
+ udp_pktinfo(&msg, dst);
+
+ trace("Peeked UDP datagram: %s -> %s",
+ sockaddr_ntop(src, sastr, sizeof(sastr)),
+ inany_ntop(dst, dstr, sizeof(dstr)));
+
+ return 1;
+}
+
+/**
* udp_sock_recv() - Receive datagrams from a socket
* @c: Execution context
* @s: Socket to receive from
- * @events: epoll events bitmap
- * @mmh mmsghdr array to receive into
+ * @mmh: mmsghdr array to receive into
+ * @n: Maximum number of datagrams to receive
*
- * Return: Number of datagrams received
+ * Return: number of datagrams received
*
* #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
*/
-static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
- struct mmsghdr *mmh)
+static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
{
- /* For not entirely clear reasons (data locality?) pasta gets better
- * throughput if we receive tap datagrams one at a atime. For small
- * splice datagrams throughput is slightly better if we do batch, but
- * it's slightly worse for large splice datagrams. Since we don't know
- * before we receive whether we'll use tap or splice, always go one at a
- * time for pasta mode.
- */
- int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
-
ASSERT(!c->no_udp);
- if (!(events & EPOLLIN))
- return 0;
-
n = recvmmsg(s, mmh, n, 0, NULL);
if (n < 0) {
- err_perror("Error receiving datagrams");
+ trace("Error receiving datagrams: %s", strerror_(errno));
+ /* Bail out and let the EPOLLERR handler deal with it */
return 0;
}
@@ -663,78 +792,126 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events,
}
/**
- * udp_buf_listen_sock_handler() - Handle new data from socket
+ * udp_sock_to_sock() - Forward datagrams from socket to socket
* @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
+ * @from_s: Socket to receive datagrams from
+ * @n: Maximum number of datagrams to forward
+ * @tosidx: Flow & side to forward datagrams to
*
- * #syscalls recvmmsg
+ * #syscalls sendmmsg
*/
-static void udp_buf_listen_sock_handler(const struct ctx *c,
- union epoll_ref ref, uint32_t events,
- const struct timespec *now)
+static void udp_sock_to_sock(const struct ctx *c, int from_s, int n,
+ flow_sidx_t tosidx)
{
- const socklen_t sasize = sizeof(udp_meta[0].s_in);
- int n, i;
+ const struct flowside *toside = flowside_at_sidx(tosidx);
+ const struct udp_flow *uflow = udp_at_sidx(tosidx);
+ uint8_t topif = pif_at_sidx(tosidx);
+ int to_s = uflow->s[tosidx.sidei];
+ int i;
- if (udp_sock_errs(c, ref, events) < 0) {
- err("UDP: Unrecoverable error on listening socket:"
- " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
- /* FIXME: what now? close/re-open socket? */
+ if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0)
return;
+
+ for (i = 0; i < n; i++) {
+ udp_mh_splice[i].msg_hdr.msg_iov->iov_len
+ = udp_mh_recv[i].msg_len;
}
- if ((n = udp_sock_recv(c, ref.fd, events, udp_mh_recv)) <= 0)
+ pif_sockaddr(c, &udp_splice_to, topif,
+ &toside->eaddr, toside->eport);
+
+ sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL);
+}
+
+/**
+ * udp_buf_sock_to_tap() - Forward datagrams from socket to tap
+ * @c: Execution context
+ * @s: Socket to read data from
+ * @n: Maximum number of datagrams to forward
+ * @tosidx: Flow & side to forward data from @s to
+ */
+static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
+ flow_sidx_t tosidx)
+{
+ const struct flowside *toside = flowside_at_sidx(tosidx);
+ struct udp_flow *uflow = udp_at_sidx(tosidx);
+ uint8_t *omac = uflow->f.tap_omac;
+ int i;
+
+ if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
return;
- /* We divide datagrams into batches based on how we need to send them,
- * determined by udp_meta[i].tosidx. To avoid either two passes through
- * the array, or recalculating tosidx for a single entry, we have to
- * populate it one entry *ahead* of the loop counter.
- */
- udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
- udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
- for (i = 0; i < n; ) {
- flow_sidx_t batchsidx = udp_meta[i].tosidx;
- uint8_t batchpif = pif_at_sidx(batchsidx);
- int batchstart = i;
-
- do {
- if (pif_is_socket(batchpif)) {
- udp_splice_prepare(udp_mh_recv, i);
- } else if (batchpif == PIF_TAP) {
- udp_tap_prepare(udp_mh_recv, i,
- flowside_at_sidx(batchsidx),
- false);
+	/* Check whether the neighbour table has a recorded MAC address */
+ if (MAC_IS_UNDEF(omac))
+ fwd_neigh_mac_get(c, &toside->oaddr, omac);
+
+ for (i = 0; i < n; i++)
+ udp_tap_prepare(udp_mh_recv, i, omac, toside, false);
+
+ tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
+}
+
+/**
+ * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket
+ * @c: Execution context
+ * @s: Socket to forward from
+ * @frompif: Interface to which @s belongs
+ * @port: Our (local) port number of @s
+ * @now: Current timestamp
+ */
+void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+ in_port_t port, const struct timespec *now)
+{
+ union sockaddr_inany src;
+ union inany_addr dst;
+ int rc;
+
+ while ((rc = udp_peek_addr(s, &src, &dst)) != 0) {
+ bool discard = false;
+ flow_sidx_t tosidx;
+ uint8_t topif;
+
+ if (rc < 0) {
+ trace("Error peeking at socket address: %s",
+ strerror_(-rc));
+ /* Clear errors & carry on */
+ if (udp_sock_errs(c, s, FLOW_SIDX_NONE,
+ frompif, port) < 0) {
+ err(
+"UDP: Unrecoverable error on listening socket: (%s port %hu)",
+ pif_name(frompif), port);
+ /* FIXME: what now? close/re-open socket? */
}
+ continue;
+ }
- if (++i >= n)
- break;
-
- udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
- &udp_meta[i].s_in,
- now);
- udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
- } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));
-
- if (pif_is_socket(batchpif)) {
- udp_splice_send(c, batchstart, i - batchstart,
- batchsidx);
- } else if (batchpif == PIF_TAP) {
- tap_send_frames(c, &udp_l2_iov[batchstart][0],
- UDP_NUM_IOVS, i - batchstart);
- } else if (flow_sidx_valid(batchsidx)) {
- flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx);
- struct udp_flow *uflow = udp_at_sidx(batchsidx);
+ tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now);
+ topif = pif_at_sidx(tosidx);
+
+ if (pif_is_socket(topif)) {
+ udp_sock_to_sock(c, s, 1, tosidx);
+ } else if (topif == PIF_TAP) {
+ if (c->mode == MODE_VU)
+ udp_vu_sock_to_tap(c, s, 1, tosidx);
+ else
+ udp_buf_sock_to_tap(c, s, 1, tosidx);
+ } else if (flow_sidx_valid(tosidx)) {
+ struct udp_flow *uflow = udp_at_sidx(tosidx);
flow_err(uflow,
"No support for forwarding UDP from %s to %s",
- pif_name(pif_at_sidx(fromsidx)),
- pif_name(batchpif));
+ pif_name(frompif), pif_name(topif));
+ discard = true;
} else {
- debug("Discarding %d datagrams without flow",
- i - batchstart);
+ debug("Discarding datagram without flow");
+ discard = true;
+ }
+
+ if (discard) {
+ struct msghdr msg = { 0 };
+
+ if (recvmsg(s, &msg, MSG_DONTWAIT) < 0)
+ debug_perror("Failed to discard datagram");
}
}
}
@@ -750,87 +927,69 @@ void udp_listen_sock_handler(const struct ctx *c,
union epoll_ref ref, uint32_t events,
const struct timespec *now)
{
- if (c->mode == MODE_VU) {
- udp_vu_listen_sock_handler(c, ref, events, now);
- return;
- }
-
- udp_buf_listen_sock_handler(c, ref, events, now);
+ if (events & (EPOLLERR | EPOLLIN))
+ udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now);
}
/**
- * udp_buf_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_sock_handler() - Handle new data from flow specific socket
* @c: Execution context
* @ref: epoll reference
* @events: epoll events bitmap
* @now: Current timestamp
- *
- * #syscalls recvmmsg
*/
-static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events,
- const struct timespec *now)
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events, const struct timespec *now)
{
- flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
- const struct flowside *toside = flowside_at_sidx(tosidx);
struct udp_flow *uflow = udp_at_sidx(ref.flowside);
- uint8_t topif = pif_at_sidx(tosidx);
- int n, i, from_s;
ASSERT(!c->no_udp && uflow);
- from_s = uflow->s[ref.flowside.sidei];
-
- if (udp_sock_errs(c, ref, events) < 0) {
- flow_err(uflow, "Unrecoverable error on reply socket");
- flow_err_details(uflow);
- udp_flow_close(c, uflow);
- return;
- }
-
- if ((n = udp_sock_recv(c, from_s, events, udp_mh_recv)) <= 0)
- return;
-
- flow_trace(uflow, "Received %d datagrams on reply socket", n);
- uflow->ts = now->tv_sec;
-
- for (i = 0; i < n; i++) {
- if (pif_is_socket(topif))
- udp_splice_prepare(udp_mh_recv, i);
- else if (topif == PIF_TAP)
- udp_tap_prepare(udp_mh_recv, i, toside, false);
- /* Restore sockaddr length clobbered by recvmsg() */
- udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
- }
-
- if (pif_is_socket(topif)) {
- udp_splice_send(c, 0, n, tosidx);
- } else if (topif == PIF_TAP) {
- tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
- } else {
- uint8_t frompif = pif_at_sidx(ref.flowside);
-
- flow_err(uflow, "No support for forwarding UDP from %s to %s",
- pif_name(frompif), pif_name(topif));
+ if (events & EPOLLERR) {
+ if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) {
+ flow_err(uflow, "Unrecoverable error on flow socket");
+ goto fail;
+ }
}
-}
-/**
- * udp_reply_sock_handler() - Handle new data from flow specific socket
- * @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
- */
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now)
-{
- if (c->mode == MODE_VU) {
- udp_vu_reply_sock_handler(c, ref, events, now);
- return;
+ if (events & EPOLLIN) {
+ /* For not entirely clear reasons (data locality?) pasta gets
+ * better throughput if we receive tap datagrams one at a
+ * time. For small splice datagrams throughput is slightly
+ * better if we do batch, but it's slightly worse for large
+ * splice datagrams. Since we don't know the size before we
+ * receive, always go one at a time for pasta mode.
+ */
+ size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
+ flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+ uint8_t topif = pif_at_sidx(tosidx);
+ int s = ref.fd;
+
+ flow_trace(uflow, "Received data on reply socket");
+ uflow->ts = now->tv_sec;
+
+ if (pif_is_socket(topif)) {
+ udp_sock_to_sock(c, ref.fd, n, tosidx);
+ } else if (topif == PIF_TAP) {
+ if (c->mode == MODE_VU) {
+ udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES,
+ tosidx);
+ } else {
+ udp_buf_sock_to_tap(c, s, n, tosidx);
+ }
+ } else {
+ flow_err(uflow,
+ "No support for forwarding UDP from %s to %s",
+ pif_name(pif_at_sidx(ref.flowside)),
+ pif_name(topif));
+ goto fail;
+ }
}
+ return;
- udp_buf_reply_sock_handler(c, ref, events, now);
+fail:
+ flow_err_details(uflow);
+ udp_flow_close(c, uflow);
}
/**
@@ -840,6 +999,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
+ * @ttl: TTL or hop limit for packets to be sent in this call
* @p: Pool of UDP packets, with UDP headers
* @idx: Index of first packet to process
* @now: Current timestamp
@@ -850,23 +1010,28 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
*/
int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr,
- const struct pool *p, int idx, const struct timespec *now)
+ uint8_t ttl, const struct pool *p, int idx,
+ const struct timespec *now)
{
const struct flowside *toside;
struct mmsghdr mm[UIO_MAXIOV];
union sockaddr_inany to_sa;
struct iovec m[UIO_MAXIOV];
+ struct udphdr uh_storage;
const struct udphdr *uh;
struct udp_flow *uflow;
- int i, s, count = 0;
+ int i, j, s, count = 0;
+ struct iov_tail data;
flow_sidx_t tosidx;
in_port_t src, dst;
uint8_t topif;
- socklen_t sl;
ASSERT(!c->no_udp);
- uh = packet_get(p, idx, 0, sizeof(*uh), NULL);
+ if (!packet_get(p, idx, &data))
+ return 1;
+
+ uh = IOV_PEEK_HEADER(&data, uh_storage);
if (!uh)
return 1;
@@ -898,28 +1063,34 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
}
toside = flowside_at_sidx(tosidx);
- s = udp_at_sidx(tosidx)->s[tosidx.sidei];
+ s = uflow->s[tosidx.sidei];
ASSERT(s >= 0);
- pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport);
+ pif_sockaddr(c, &to_sa, topif, &toside->eaddr, toside->eport);
- for (i = 0; i < (int)p->count - idx; i++) {
- struct udphdr *uh_send;
- size_t len;
+ for (i = 0, j = 0; i < (int)p->count - idx && j < UIO_MAXIOV; i++) {
+ const struct udphdr *uh_send;
+
+ if (!packet_get(p, idx + i, &data))
+ return p->count - idx;
- uh_send = packet_get(p, idx + i, 0, sizeof(*uh), &len);
+ uh_send = IOV_REMOVE_HEADER(&data, uh_storage);
if (!uh_send)
return p->count - idx;
mm[i].msg_hdr.msg_name = &to_sa;
- mm[i].msg_hdr.msg_namelen = sl;
+ mm[i].msg_hdr.msg_namelen = socklen_inany(&to_sa);
- if (len) {
- m[i].iov_base = (char *)(uh_send + 1);
- m[i].iov_len = len;
+ if (data.cnt) {
+ int cnt;
- mm[i].msg_hdr.msg_iov = m + i;
- mm[i].msg_hdr.msg_iovlen = 1;
+ cnt = iov_tail_clone(&m[j], UIO_MAXIOV - j, &data);
+ if (cnt < 0)
+ return p->count - idx;
+
+ mm[i].msg_hdr.msg_iov = &m[j];
+ mm[i].msg_hdr.msg_iovlen = cnt;
+ j += cnt;
} else {
mm[i].msg_hdr.msg_iov = NULL;
mm[i].msg_hdr.msg_iovlen = 0;
@@ -929,6 +1100,24 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
mm[i].msg_hdr.msg_controllen = 0;
mm[i].msg_hdr.msg_flags = 0;
+ if (ttl != uflow->ttl[tosidx.sidei]) {
+ uflow->ttl[tosidx.sidei] = ttl;
+ if (af == AF_INET) {
+ if (setsockopt(s, IPPROTO_IP, IP_TTL,
+ &ttl, sizeof(ttl)) < 0)
+ flow_perror(uflow,
+ "setsockopt IP_TTL");
+ } else {
+				/* IPV6_UNICAST_HOPS takes an int */
+ int hop_limit = ttl;
+
+ if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS,
+ &hop_limit, sizeof(hop_limit)) < 0)
+ flow_perror(uflow,
+ "setsockopt IPV6_UNICAST_HOPS");
+ }
+ }
+
count++;
}
@@ -940,72 +1129,64 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
}
/**
- * udp_sock_init() - Initialise listening sockets for a given port
+ * udp_sock_init() - Initialise listening socket for a given port
* @c: Execution context
- * @ns: In pasta mode, if set, bind with loopback address in namespace
+ * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE)
* @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order
*
- * Return: 0 on (partial) success, negative error code on (complete) failure
+ * Return: 0 on success, negative error code on failure
*/
-int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
- const char *ifname, in_port_t port)
+int udp_sock_init(const struct ctx *c, uint8_t pif,
+ const union inany_addr *addr, const char *ifname,
+ in_port_t port)
{
union udp_listen_epoll_ref uref = {
- .pif = ns ? PIF_SPLICE : PIF_HOST,
+ .pif = pif,
.port = port,
};
- int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
+ int (*socks)[NUM_PORTS];
+ int s;
ASSERT(!c->no_udp);
-
- if (!addr && c->ifi4 && c->ifi6 && !ns) {
- int s;
-
- /* Attempt to get a dual stack socket */
- s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
- NULL, ifname, port, uref.u32);
- udp_splice_init[V4][port] = s < 0 ? -1 : s;
- udp_splice_init[V6][port] = s < 0 ? -1 : s;
- if (IN_INTERVAL(0, FD_REF_MAX, s))
+ ASSERT(pif_is_socket(pif));
+
+ if (pif == PIF_HOST)
+ socks = udp_splice_init;
+ else
+ socks = udp_splice_ns;
+
+ if (!c->ifi4) {
+ if (!addr)
+ /* Restrict to v6 only */
+ addr = &inany_any6;
+ else if (inany_v4(addr))
+ /* Nothing to do */
return 0;
}
-
- if ((!addr || inany_v4(addr)) && c->ifi4) {
- if (!ns) {
- r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
- addr ? addr : &inany_any4, ifname,
- port, uref.u32);
-
- udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
- } else {
- r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
- &inany_loopback4, ifname,
- port, uref.u32);
- udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4;
- }
+ if (!c->ifi6) {
+ if (!addr)
+ /* Restrict to v4 only */
+ addr = &inany_any4;
+ else if (!inany_v4(addr))
+ /* Nothing to do */
+ return 0;
}
- if ((!addr || !inany_v4(addr)) && c->ifi6) {
- if (!ns) {
- r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
- addr ? addr : &inany_any6, ifname,
- port, uref.u32);
-
- udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
- } else {
- r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
- &inany_loopback6, ifname,
- port, uref.u32);
- udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6;
- }
+ s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, pif,
+ addr, ifname, port, uref.u32);
+ if (s > FD_REF_MAX) {
+ close(s);
+ s = -EIO;
}
- if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
- return 0;
+ if (!addr || inany_v4(addr))
+ socks[V4][port] = s < 0 ? -1 : s;
+ if (!addr || !inany_v4(addr))
+ socks[V6][port] = s < 0 ? -1 : s;
- return r4 < 0 ? r4 : r6;
+ return s < 0 ? s : 0;
}
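
When @addr is NULL and both address families are enabled, the single
pif_sock_l4() call above records the same file descriptor in both the V4 and V6
slots. A dual-stack UDP socket along the following lines is what makes that
work (minimal sketch, assuming IPV6_V6ONLY is cleared as the v6only parameter
of sock_l4_() allows; not the actual implementation):

	static int dual_stack_udp_sketch(in_port_t port)
	{
		struct sockaddr_in6 a = {
			.sin6_family = AF_INET6,
			.sin6_addr = IN6ADDR_ANY_INIT,
			.sin6_port = htons(port),
		};
		int no = 0;
		int s = socket(AF_INET6, SOCK_DGRAM | SOCK_CLOEXEC, 0);

		if (s < 0)
			return -1;

		/* Accept IPv4 peers too, seen as IPv4-mapped addresses */
		if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
			       &no, sizeof(no)) < 0 ||
		    bind(s, (struct sockaddr *)&a, sizeof(a)) < 0) {
			close(s);
			return -1;
		}

		return s;
	}
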
/**
@@ -1029,6 +1210,26 @@ static void udp_splice_iov_init(void)
}
/**
+ * udp_ns_sock_init() - Init socket to listen for spliced outbound connections
+ * @c: Execution context
+ * @port: Port, host order
+ */
+static void udp_ns_sock_init(const struct ctx *c, in_port_t port)
+{
+ ASSERT(!c->no_udp);
+
+ if (!c->no_bindtodevice) {
+ udp_sock_init(c, PIF_SPLICE, NULL, "lo", port);
+ return;
+ }
+
+ if (c->ifi4)
+ udp_sock_init(c, PIF_SPLICE, &inany_loopback4, NULL, port);
+ if (c->ifi6)
+ udp_sock_init(c, PIF_SPLICE, &inany_loopback6, NULL, port);
+}
+
+/**
* udp_port_rebind() - Rebind ports to match forward maps
* @c: Execution context
* @outbound: True to remap outbound forwards, otherwise inbound
@@ -1040,8 +1241,6 @@ static void udp_port_rebind(struct ctx *c, bool outbound)
int (*socks)[NUM_PORTS] = outbound ? udp_splice_ns : udp_splice_init;
const uint8_t *fmap
= outbound ? c->udp.fwd_out.map : c->udp.fwd_in.map;
- const uint8_t *rmap
- = outbound ? c->udp.fwd_in.map : c->udp.fwd_out.map;
unsigned port;
for (port = 0; port < NUM_PORTS; port++) {
@@ -1059,13 +1258,13 @@ static void udp_port_rebind(struct ctx *c, bool outbound)
continue;
}
- /* Don't loop back our own ports */
- if (bitmap_isset(rmap, port))
- continue;
-
if ((c->ifi4 && socks[V4][port] == -1) ||
- (c->ifi6 && socks[V6][port] == -1))
- udp_sock_init(c, outbound, NULL, NULL, port);
+ (c->ifi6 && socks[V6][port] == -1)) {
+ if (outbound)
+ udp_ns_sock_init(c, port);
+ else
+ udp_sock_init(c, PIF_HOST, NULL, NULL, port);
+ }
}
}
@@ -1088,29 +1287,18 @@ static int udp_port_rebind_outbound(void *arg)
}
/**
- * udp_timer() - Scan activity bitmaps for ports with associated timed events
+ * udp_port_rebind_all() - Rebind ports to match forward maps (in host & ns)
* @c: Execution context
- * @now: Current timestamp
*/
-void udp_timer(struct ctx *c, const struct timespec *now)
+void udp_port_rebind_all(struct ctx *c)
{
- (void)now;
+ ASSERT(c->mode == MODE_PASTA && !c->no_udp);
- ASSERT(!c->no_udp);
-
- if (c->mode == MODE_PASTA) {
- if (c->udp.fwd_out.mode == FWD_AUTO) {
- fwd_scan_ports_udp(&c->udp.fwd_out, &c->udp.fwd_in,
- &c->tcp.fwd_out, &c->tcp.fwd_in);
- NS_CALL(udp_port_rebind_outbound, c);
- }
+ if (c->udp.fwd_out.mode == FWD_AUTO)
+ NS_CALL(udp_port_rebind_outbound, c);
- if (c->udp.fwd_in.mode == FWD_AUTO) {
- fwd_scan_ports_udp(&c->udp.fwd_in, &c->udp.fwd_out,
- &c->tcp.fwd_in, &c->tcp.fwd_out);
- udp_port_rebind(c, false);
- }
- }
+ if (c->udp.fwd_in.mode == FWD_AUTO)
+ udp_port_rebind(c, false);
}
/**
diff --git a/udp.h b/udp.h
index de2df6d..03e8dc5 100644
--- a/udp.h
+++ b/udp.h
@@ -6,21 +6,21 @@
#ifndef UDP_H
#define UDP_H
-#define UDP_TIMER_INTERVAL 1000 /* ms */
-
void udp_portmap_clear(void);
void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
-void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now);
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events, const struct timespec *now);
int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr,
- const struct pool *p, int idx, const struct timespec *now);
-int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
- const char *ifname, in_port_t port);
+ uint8_t ttl, const struct pool *p, int idx,
+ const struct timespec *now);
+int udp_sock_init(const struct ctx *c, uint8_t pif,
+ const union inany_addr *addr, const char *ifname,
+ in_port_t port);
int udp_init(struct ctx *c);
-void udp_timer(struct ctx *c, const struct timespec *now);
-void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
+void udp_port_rebind_all(struct ctx *c);
+void udp_update_l2_buf(const unsigned char *eth_d);
/**
* union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets
diff --git a/udp_flow.c b/udp_flow.c
index c6b8630..8907f2f 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -9,10 +9,13 @@
#include <fcntl.h>
#include <sys/uio.h>
#include <unistd.h>
+#include <netinet/udp.h>
#include "util.h"
#include "passt.h"
#include "flow_table.h"
+#include "udp_internal.h"
+#include "epoll_ctl.h"
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
@@ -34,123 +37,167 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
return &flow->udp;
}
-/*
+/**
* udp_flow_close() - Close and clean up UDP flow
* @c: Execution context
* @uflow: UDP flow
*/
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
{
+ unsigned sidei;
+
if (uflow->closed)
return; /* Nothing to do */
- if (uflow->s[INISIDE] >= 0) {
- /* The listening socket needs to stay in epoll */
- close(uflow->s[INISIDE]);
- uflow->s[INISIDE] = -1;
+ flow_foreach_sidei(sidei) {
+ flow_hash_remove(c, FLOW_SIDX(uflow, sidei));
+ if (uflow->s[sidei] >= 0) {
+ epoll_del(flow_epollfd(&uflow->f), uflow->s[sidei]);
+ close(uflow->s[sidei]);
+ uflow->s[sidei] = -1;
+ }
}
- if (uflow->s[TGTSIDE] >= 0) {
- /* But the flow specific one needs to be removed */
- epoll_del(c, uflow->s[TGTSIDE]);
- close(uflow->s[TGTSIDE]);
- uflow->s[TGTSIDE] = -1;
+ uflow->closed = true;
+}
+
+/**
+ * udp_flow_sock() - Create, bind and connect a flow specific UDP socket
+ * @c: Execution context
+ * @uflow: UDP flow to open socket for
+ * @sidei: Side of @uflow to open socket for
+ *
+ * Return: fd of new socket on success, -ve error code on failure
+ */
+static int udp_flow_sock(const struct ctx *c,
+ struct udp_flow *uflow, unsigned sidei)
+{
+ const struct flowside *side = &uflow->f.side[sidei];
+ uint8_t pif = uflow->f.pif[sidei];
+ union {
+ flow_sidx_t sidx;
+ uint32_t data;
+ } fref = { .sidx = FLOW_SIDX(uflow, sidei) };
+ union epoll_ref ref;
+ int rc;
+ int s;
+
+ s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side);
+ if (s < 0) {
+ flow_dbg_perror(uflow, "Couldn't open flow specific socket");
+ return s;
}
- flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
- if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
- flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
- uflow->closed = true;
+ ref.type = EPOLL_TYPE_UDP;
+ ref.data = fref.data;
+ ref.fd = s;
+
+ flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT);
+
+ rc = epoll_add(flow_epollfd(&uflow->f), EPOLLIN, ref);
+ if (rc < 0) {
+ close(s);
+ return rc;
+ }
+
+ if (flowside_connect(c, s, pif, side) < 0) {
+ rc = -errno;
+
+ epoll_del(flow_epollfd(&uflow->f), s);
+ close(s);
+
+ flow_dbg_perror(uflow, "Couldn't connect flow socket");
+ return rc;
+ }
+
+ /* It's possible, if unlikely, that we could receive some packets in
+ * between the bind() and connect() which may or may not be for this
+ * flow. Being UDP we could just discard them, but it's not ideal.
+ *
+ * There's also a tricky case if a bunch of datagrams for a new flow
+ * arrive in rapid succession, the first going to the original listening
+ * socket and later ones going to this new socket. If we forwarded the
+ * datagrams from the new socket immediately here they would go before
+ * the datagram which established the flow. Again, not strictly wrong
+ * for UDP, but not ideal.
+ *
+ * So, we flag that the new socket is in a transient state where it
+ * might have datagrams for a different flow queued. Before the next
+ * epoll cycle, udp_flow_defer() will flush out any such datagrams, and
+ * thereafter everything on the new socket should be strictly for this
+ * flow.
+ */
+ if (sidei)
+ uflow->flush1 = true;
+ else
+ uflow->flush0 = true;
+
+ return s;
}
/**
* udp_flow_new() - Common setup for a new UDP flow
* @c: Execution context
* @flow: Initiated flow
- * @s_ini: Initiating socket (or -1)
* @now: Timestamp
*
- * Return: UDP specific flow, if successful, NULL on failure
+ * Return: sidx for the target side of the new UDP flow, or FLOW_SIDX_NONE
+ * on failure.
+ *
+ * #syscalls getsockname
*/
static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
- int s_ini, const struct timespec *now)
+ const struct timespec *now)
{
struct udp_flow *uflow = NULL;
const struct flowside *tgt;
- uint8_t tgtpif;
+ unsigned sidei;
if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
goto cancel;
- tgtpif = flow->f.pif[TGTSIDE];
uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
uflow->ts = now->tv_sec;
uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
+ uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
- if (s_ini >= 0) {
- /* When using auto port-scanning the listening port could go
- * away, so we need to duplicate the socket
- */
- uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
- if (uflow->s[INISIDE] < 0) {
- flow_perror(uflow,
- "Couldn't duplicate listening socket");
- goto cancel;
- }
+ flow_foreach_sidei(sidei) {
+ if (pif_is_socket(uflow->f.pif[sidei]))
+ if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0)
+ goto cancel;
}
- if (pif_is_socket(tgtpif)) {
- struct mmsghdr discard[UIO_MAXIOV] = { 0 };
- union {
- flow_sidx_t sidx;
- uint32_t data;
- } fref = {
- .sidx = FLOW_SIDX(flow, TGTSIDE),
- };
- int rc;
-
- uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY,
- tgtpif, tgt, fref.data);
- if (uflow->s[TGTSIDE] < 0) {
- flow_dbg_perror(uflow,
- "Couldn't open socket for spliced flow");
- goto cancel;
- }
-
- if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
- flow_dbg_perror(uflow, "Couldn't connect flow socket");
+ if (uflow->s[TGTSIDE] >= 0 && inany_is_unspecified(&tgt->oaddr)) {
+ /* When we target a socket, we connect() it, but might not
+ * always bind(), leaving the kernel to pick our address. In
+ * that case connect() will implicitly bind() the socket, but we
+ * need to determine its local address so that we can match
+ * reply packets back to the correct flow. Update the flow with
+ * the information from getsockname() */
+ union sockaddr_inany sa;
+ socklen_t sl = sizeof(sa);
+ in_port_t port;
+
+ if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0 ||
+ inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr,
+ &port, &sa) < 0) {
+ flow_perror(uflow, "Unable to determine local address");
goto cancel;
}
-
- /* It's possible, if unlikely, that we could receive some
- * unrelated packets in between the bind() and connect() of this
- * socket. For now we just discard these. We could consider
- * trying to redirect these to an appropriate handler, if we
- * need to.
- */
- rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard),
- MSG_DONTWAIT, NULL);
- if (rc >= ARRAY_SIZE(discard)) {
- flow_dbg(uflow,
- "Too many (%d) spurious reply datagrams", rc);
+ if (port != tgt->oport) {
+ flow_err(uflow, "Unexpected local port");
goto cancel;
- } else if (rc > 0) {
- flow_trace(uflow,
- "Discarded %d spurious reply datagrams", rc);
- } else if (errno != EAGAIN) {
- flow_perror(uflow,
- "Unexpected error discarding datagrams");
}
}
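
The getsockname() handling above boils down to a small pattern (illustrative
helper, not part of the patch): connect() on an unbound datagram socket makes
the kernel pick the local address and port, and getsockname() is how we learn
what it chose.

	static int connected_local(int s, const struct sockaddr *peer,
				   socklen_t peerlen,
				   struct sockaddr_storage *local)
	{
		socklen_t sl = sizeof(*local);

		if (connect(s, peer, peerlen) < 0)
			return -1;

		return getsockname(s, (struct sockaddr *)local, &sl);
	}
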
- flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
-
- /* If the target side is a socket, it will be a reply socket that knows
- * its own flowside. But if it's tap, then we need to look it up by
- * hash.
+ /* Tap sides always need to be looked up by hash. Socket sides don't
+ * always, but sometimes do (receiving packets on a socket not specific
+ * to one flow). Unconditionally hash both sides so all our bases are
+ * covered
*/
- if (!pif_is_socket(tgtpif))
- flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
+ flow_foreach_sidei(sidei)
+ flow_hash_insert(c, FLOW_SIDX(uflow, sidei));
+
FLOW_ACTIVATE(uflow);
return FLOW_SIDX(uflow, TGTSIDE);
@@ -163,9 +210,11 @@ cancel:
}
/**
- * udp_flow_from_sock() - Find or create UDP flow for "listening" socket
+ * udp_flow_from_sock() - Find or create UDP flow for incoming datagram
* @c: Execution context
- * @ref: epoll reference of the receiving socket
+ * @pif: Interface the datagram is arriving from
+ * @dst: Our (local) address to which the datagram is arriving
+ * @port: Our (local) port number to which the datagram is arriving
* @s_in: Source socket address, filled in by recvmmsg()
* @now: Timestamp
*
@@ -174,7 +223,8 @@ cancel:
* Return: sidx for the destination side of the flow for this packet, or
* FLOW_SIDX_NONE if we couldn't find or create a flow.
*/
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
+ const union inany_addr *dst, in_port_t port,
const union sockaddr_inany *s_in,
const struct timespec *now)
{
@@ -183,9 +233,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
union flow *flow;
flow_sidx_t sidx;
- ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN);
-
- sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, s_in, ref.udp.port);
+ sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port);
if ((uflow = udp_at_sidx(sidx))) {
uflow->ts = now->tv_sec;
return flow_sidx_opposite(sidx);
@@ -195,12 +243,11 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
char sastr[SOCKADDR_STRLEN];
debug("Couldn't allocate flow for UDP datagram from %s %s",
- pif_name(ref.udp.pif),
- sockaddr_ntop(s_in, sastr, sizeof(sastr)));
+ pif_name(pif), sockaddr_ntop(s_in, sastr, sizeof(sastr)));
return FLOW_SIDX_NONE;
}
- ini = flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port);
+ ini = flow_initiate_sa(flow, pif, s_in, dst, port);
if (!inany_is_unicast(&ini->eaddr) ||
ini->eport == 0 || ini->oport == 0) {
@@ -213,7 +260,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
return FLOW_SIDX_NONE;
}
- return udp_flow_new(c, flow, ref.fd, now);
+ return udp_flow_new(c, flow, now);
}
/**
@@ -269,17 +316,45 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
return FLOW_SIDX_NONE;
}
- return udp_flow_new(c, flow, -1, now);
+ return udp_flow_new(c, flow, now);
+}
+
+/**
+ * udp_flush_flow() - Flush datagrams that might not be for this flow
+ * @c: Execution context
+ * @uflow: Flow to handle
+ * @sidei: Side of the flow to flush
+ * @now: Current timestamp
+ */
+static void udp_flush_flow(const struct ctx *c,
+ const struct udp_flow *uflow, unsigned sidei,
+ const struct timespec *now)
+{
+ /* We don't know exactly where the datagrams will come from, but we know
+ * they'll have an interface and oport matching this flow */
+ udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei],
+ uflow->f.side[sidei].oport, now);
}
/**
* udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
+ * @c: Execution context
* @uflow: Flow to handle
+ * @now: Current timestamp
*
* Return: true if the connection is ready to free, false otherwise
*/
-bool udp_flow_defer(const struct udp_flow *uflow)
+bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
+ const struct timespec *now)
{
+ if (uflow->flush0) {
+ udp_flush_flow(c, uflow, INISIDE, now);
+ uflow->flush0 = false;
+ }
+ if (uflow->flush1) {
+ udp_flush_flow(c, uflow, TGTSIDE, now);
+ uflow->flush1 = false;
+ }
return uflow->closed;
}
diff --git a/udp_flow.h b/udp_flow.h
index 9a1b059..4c528e9 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -8,9 +8,12 @@
#define UDP_FLOW_H
/**
- * struct udp - Descriptor for a flow of UDP packets
+ * struct udp_flow - Descriptor for a flow of UDP packets
* @f: Generic flow information
+ * @ttl: TTL or hop_limit for both sides
* @closed: Flow is already closed
+ * @flush0: @s[0] may have datagrams queued for other flows
+ * @flush1: @s[1] may have datagrams queued for other flows
* @ts: Activity timestamp
* @s: Socket fd (or -1) for each side of the flow
*/
@@ -18,13 +21,19 @@ struct udp_flow {
/* Must be first element */
struct flow_common f;
- bool closed :1;
+ uint8_t ttl[SIDES];
+
+ bool closed :1,
+ flush0 :1,
+ flush1 :1;
+
time_t ts;
int s[SIDES];
};
struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
+ const union inany_addr *dst, in_port_t port,
const union sockaddr_inany *s_in,
const struct timespec *now);
flow_sidx_t udp_flow_from_tap(const struct ctx *c,
@@ -33,7 +42,8 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
in_port_t srcport, in_port_t dstport,
const struct timespec *now);
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow);
-bool udp_flow_defer(const struct udp_flow *uflow);
+bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
+ const struct timespec *now);
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now);
diff --git a/udp_internal.h b/udp_internal.h
index 3b081f5..96d11cf 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -8,8 +8,6 @@
#include "tap.h" /* needed by udp_meta_t */
-#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
-
/**
* struct udp_payload_t - UDP header and data for inbound messages
* @uh: UDP header
@@ -30,5 +28,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen,
bool no_udp_csum);
-int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events);
+void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+ in_port_t port, const struct timespec *now);
+
#endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index c26a223..c30dcf9 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -40,7 +40,7 @@ static struct vu_virtq_element elem [VIRTQUEUE_MAX_SIZE];
* udp_vu_hdrlen() - return the size of the header in level 2 frame (UDP)
* @v6: Set for IPv6 packet
*
- * Return: Return the size of the header
+ * Return: size of the header
*/
static size_t udp_vu_hdrlen(bool v6)
{
@@ -58,46 +58,25 @@ static size_t udp_vu_hdrlen(bool v6)
}
/**
- * udp_vu_sock_info() - get socket information
- * @s: Socket to get information from
- * @s_in: Socket address (output)
- *
- * Return: 0 if socket address can be read, -1 otherwise
- */
-static int udp_vu_sock_info(int s, union sockaddr_inany *s_in)
-{
- struct msghdr msg = {
- .msg_name = s_in,
- .msg_namelen = sizeof(union sockaddr_inany),
- };
-
- return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
-}
-
-/**
* udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
* @c: Execution context
+ * @vq: virtqueue to use to receive data
* @s: Socket to receive from
- * @events: epoll events bitmap
* @v6: Set for IPv6 connections
* @dlen: Size of received data (output)
*
- * Return: Number of iov entries used to store the datagram
+ * Return: number of iov entries used to store the datagram
*/
-static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
+static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s,
bool v6, ssize_t *dlen)
{
- struct vu_dev *vdev = c->vdev;
- struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+ const struct vu_dev *vdev = c->vdev;
int iov_cnt, idx, iov_used;
+ size_t off, hdrlen, l2len;
struct msghdr msg = { 0 };
- size_t off, hdrlen;
ASSERT(!c->no_udp);
- if (!(events & EPOLLIN))
- return 0;
-
/* compute L2 header length */
hdrlen = udp_vu_hdrlen(v6);
@@ -111,7 +90,7 @@ static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
return 0;
/* reserve space for the headers */
- ASSERT(iov_vu[0].iov_len >= hdrlen);
+ ASSERT(iov_vu[0].iov_len >= MAX(hdrlen, ETH_ZLEN));
iov_vu[0].iov_base = (char *)iov_vu[0].iov_base + hdrlen;
iov_vu[0].iov_len -= hdrlen;
@@ -137,6 +116,10 @@ static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
iov_vu[idx].iov_len = off;
iov_used = idx + !!off;
+ /* pad frame to 60 bytes: first buffer is at least ETH_ZLEN long */
+ l2len = *dlen + hdrlen - sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ vu_pad(&iov_vu[0], l2len);
+
vu_set_vnethdr(vdev, iov_vu[0].iov_base, iov_used);
/* release unused buffers */
@@ -214,125 +197,27 @@ static void udp_vu_csum(const struct flowside *toside, int iov_used)
}
/**
- * udp_vu_listen_sock_handler() - Handle new data from socket
- * @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
- */
-void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now)
-{
- struct vu_dev *vdev = c->vdev;
- struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
- int i;
-
- if (udp_sock_errs(c, ref, events) < 0) {
- err("UDP: Unrecoverable error on listening socket:"
- " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
- return;
- }
-
- for (i = 0; i < UDP_MAX_FRAMES; i++) {
- const struct flowside *toside;
- union sockaddr_inany s_in;
- flow_sidx_t sidx;
- uint8_t pif;
- ssize_t dlen;
- int iov_used;
- bool v6;
-
- if (udp_vu_sock_info(ref.fd, &s_in) < 0)
- break;
-
- sidx = udp_flow_from_sock(c, ref, &s_in, now);
- pif = pif_at_sidx(sidx);
-
- if (pif != PIF_TAP) {
- if (flow_sidx_valid(sidx)) {
- flow_sidx_t fromsidx = flow_sidx_opposite(sidx);
- struct udp_flow *uflow = udp_at_sidx(sidx);
-
- flow_err(uflow,
- "No support for forwarding UDP from %s to %s",
- pif_name(pif_at_sidx(fromsidx)),
- pif_name(pif));
- } else {
- debug("Discarding 1 datagram without flow");
- }
-
- continue;
- }
-
- toside = flowside_at_sidx(sidx);
-
- v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
-
- iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen);
- if (iov_used <= 0)
- break;
-
- udp_vu_prepare(c, toside, dlen);
- if (*c->pcap) {
- udp_vu_csum(toside, iov_used);
- pcap_iov(iov_vu, iov_used,
- sizeof(struct virtio_net_hdr_mrg_rxbuf));
- }
- vu_flush(vdev, vq, elem, iov_used);
- }
-}
-
-/**
- * udp_vu_reply_sock_handler() - Handle new data from flow specific socket
+ * udp_vu_sock_to_tap() - Forward datagrams from socket to tap
* @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
+ * @s: Socket to read data from
+ * @n: Maximum number of datagrams to forward
+ * @tosidx: Flow & side to forward data from @s to
*/
-void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now)
+void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx)
{
- flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
const struct flowside *toside = flowside_at_sidx(tosidx);
- struct udp_flow *uflow = udp_at_sidx(ref.flowside);
- int from_s = uflow->s[ref.flowside.sidei];
+ bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
struct vu_dev *vdev = c->vdev;
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
int i;
- ASSERT(!c->no_udp);
-
- if (udp_sock_errs(c, ref, events) < 0) {
- flow_err(uflow, "Unrecoverable error on reply socket");
- flow_err_details(uflow);
- udp_flow_close(c, uflow);
- return;
- }
-
- for (i = 0; i < UDP_MAX_FRAMES; i++) {
- uint8_t topif = pif_at_sidx(tosidx);
+ for (i = 0; i < n; i++) {
ssize_t dlen;
int iov_used;
- bool v6;
-
- ASSERT(uflow);
-
- if (topif != PIF_TAP) {
- uint8_t frompif = pif_at_sidx(ref.flowside);
-
- flow_err(uflow,
- "No support for forwarding UDP from %s to %s",
- pif_name(frompif), pif_name(topif));
- continue;
- }
-
- v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));
- iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen);
+ iov_used = udp_vu_sock_recv(c, vq, s, v6, &dlen);
if (iov_used <= 0)
break;
- flow_trace(uflow, "Received 1 datagram on reply socket");
- uflow->ts = now->tv_sec;
udp_vu_prepare(c, toside, dlen);
if (*c->pcap) {
diff --git a/udp_vu.h b/udp_vu.h
index ba7018d..576b0e7 100644
--- a/udp_vu.h
+++ b/udp_vu.h
@@ -6,8 +6,8 @@
#ifndef UDP_VU_H
#define UDP_VU_H
-void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now);
-void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now);
+void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref,
+ const struct timespec *now);
+void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx);
+
#endif /* UDP_VU_H */
diff --git a/util.c b/util.c
index 656e86a..2730395 100644
--- a/util.c
+++ b/util.c
@@ -18,7 +18,6 @@
#include <unistd.h>
#include <arpa/inet.h>
#include <net/ethernet.h>
-#include <sys/epoll.h>
#include <sys/uio.h>
#include <fcntl.h>
#include <string.h>
@@ -34,30 +33,32 @@
#include "passt.h"
#include "packet.h"
#include "log.h"
+#include "pcap.h"
+#include "epoll_ctl.h"
+#include "pasta.h"
#ifdef HAS_GETRANDOM
#include <sys/random.h>
#endif
+/* Zero-filled buffer to pad 802.3 frames, up to 60 (ETH_ZLEN) bytes */
+uint8_t eth_pad[ETH_ZLEN] = { 0 };
+
/**
- * sock_l4_sa() - Create and bind socket to socket address, add to epoll list
+ * sock_l4_() - Create and bind socket to socket address
* @c: Execution context
* @type: epoll type
* @sa: Socket address to bind to
- * @sl: Length of @sa
* @ifname: Interface for binding, NULL for any
- * @v6only: Set IPV6_V6ONLY socket option
- * @data: epoll reference portion for protocol handlers
+ * @v6only: If >= 0, set IPV6_V6ONLY socket option to this value
*
* Return: newly created socket, negative error code on failure
*/
-int sock_l4_sa(const struct ctx *c, enum epoll_type type,
- const void *sa, socklen_t sl,
- const char *ifname, bool v6only, uint32_t data)
+static int sock_l4_(const struct ctx *c, enum epoll_type type,
+ const union sockaddr_inany *sa, const char *ifname,
+ int v6only)
{
- sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
- union epoll_ref ref = { .type = type, .data = data };
+ sa_family_t af = sa->sa_family;
bool freebind = false;
- struct epoll_event ev;
int fd, y = 1, ret;
uint8_t proto;
int socktype;
@@ -69,9 +70,8 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
freebind = c->freebind;
break;
case EPOLL_TYPE_UDP_LISTEN:
+ case EPOLL_TYPE_UDP:
freebind = c->freebind;
- /* fallthrough */
- case EPOLL_TYPE_UDP_REPLY:
proto = IPPROTO_UDP;
socktype = SOCK_DGRAM | SOCK_NONBLOCK;
break;
@@ -99,21 +99,27 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return -EBADF;
}
- ref.fd = fd;
-
- if (v6only)
- if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &y, sizeof(y)))
- debug("Failed to set IPV6_V6ONLY on socket %i", fd);
+ if (v6only >= 0) {
+ if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY,
+ &v6only, sizeof(v6only))) {
+ debug("Failed to set IPV6_V6ONLY to %d on socket %i",
+ v6only, fd);
+ }
+ }
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)))
debug("Failed to set SO_REUSEADDR on socket %i", fd);
if (proto == IPPROTO_UDP) {
+ int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
+ int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
- int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
- if (setsockopt(fd, level, opt, &y, sizeof(y)))
+ if (setsockopt(fd, level, recverr, &y, sizeof(y)))
die_perror("Failed to set RECVERR on socket %i", fd);
+
+ if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
+ die_perror("Failed to set PKTINFO on socket %i", fd);
}
if (ifname && *ifname) {
@@ -127,9 +133,10 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
char str[SOCKADDR_STRLEN];
ret = -errno;
- warn("Can't bind %s socket for %s to %s, closing",
- EPOLL_TYPE_STR(proto),
- sockaddr_ntop(sa, str, sizeof(str)), ifname);
+ warn("SO_BINDTODEVICE %s failed for %s on %s: %s",
+ ifname, EPOLL_TYPE_STR(type),
+ sockaddr_ntop(sa, str, sizeof(str)),
+ strerror_(-ret));
close(fd);
return ret;
}
@@ -147,7 +154,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
}
}
- if (bind(fd, sa, sl) < 0) {
+ if (bind(fd, &sa->sa, socklen_inany(sa)) < 0) {
/* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports,
* this is fine. This might also fail for ICMP because of a
@@ -167,18 +174,61 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return ret;
}
- ev.events = EPOLLIN;
- ev.data.u64 = ref.u64;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
- ret = -errno;
- warn("L4 epoll_ctl: %s", strerror_(-ret));
- return ret;
- }
-
return fd;
}
/**
+ * sock_l4() - Create and bind socket to given address
+ * @c: Execution context
+ * @type: epoll type
+ * @sa: Socket address to bind to
+ * @ifname: Interface for binding, NULL for any
+ *
+ * Return: newly created socket, negative error code on failure
+ */
+int sock_l4(const struct ctx *c, enum epoll_type type,
+ const union sockaddr_inany *sa, const char *ifname)
+{
+ int v6only = -1;
+
+ /* The option doesn't exist for IPv4 sockets, and we don't care about it
+ * for IPv6 sockets with a non-wildcard address.
+ */
+ if (sa->sa_family == AF_INET6 &&
+ IN6_IS_ADDR_UNSPECIFIED(&sa->sa6.sin6_addr))
+ v6only = 1;
+
+ return sock_l4_(c, type, sa, ifname, v6only);
+}
+
+/**
+ * sock_l4_dualstack_any() - Create dualstack socket bound to :: and 0.0.0.0
+ * @c: Execution context
+ * @type: epoll type
+ * @port:	Port to bind to (:: and 0.0.0.0)
+ * @ifname: Interface for binding, NULL for any
+ *
+ * Return: newly created socket, negative error code on failure
+ *
+ * A dual stack socket is effectively bound to both :: and 0.0.0.0.
+ */
+int sock_l4_dualstack_any(const struct ctx *c, enum epoll_type type,
+ in_port_t port, const char *ifname)
+{
+ union sockaddr_inany sa = {
+ .sa6.sin6_family = AF_INET6,
+ .sa6.sin6_addr = in6addr_any,
+ .sa6.sin6_port = htons(port),
+ };
+
+ /* Dual stack sockets require IPV6_V6ONLY == 0. Usually that's the
+ * default, but sysctl net.ipv6.bindv6only can change that, so set the
+ * sockopt explicitly.
+ */
+ return sock_l4_(c, type, &sa, ifname, 0);
+}
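For context, a minimal sketch (not part of this patch) of what the two IPV6_V6ONLY values chosen above mean for a socket bound to in6addr_any:

	int s = socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP);
	int v6only = 0;	/* 0: IPv4 peers also reach us, as ::ffff:a.b.c.d */

	/* sock_l4() forces 1 on wildcard binds (IPv6 traffic only, so IPv4
	 * gets its own socket); sock_l4_dualstack_any() forces 0 so a single
	 * socket covers both families regardless of net.ipv6.bindv6only.
	 */
	if (s >= 0 && setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
				 &v6only, sizeof(v6only)))
		perror("IPV6_V6ONLY");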
+
+/**
* sock_unix() - Create and bind AF_UNIX socket
* @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
*
@@ -241,12 +291,13 @@ int sock_unix(char *sock_path)
}
/**
- * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
+ * sock_probe_features() - Probe for socket features we might use
* @c: Execution context
*/
-void sock_probe_mem(struct ctx *c)
+void sock_probe_features(struct ctx *c)
{
int v = INT_MAX / 2, s;
+ const char lo[] = "lo";
socklen_t sl;
s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
@@ -255,6 +306,7 @@ void sock_probe_mem(struct ctx *c)
return;
}
+ /* Check if setting high SO_SNDBUF and SO_RCVBUF is allowed */
sl = sizeof(v);
if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)) ||
getsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, &sl) ||
@@ -267,6 +319,19 @@ void sock_probe_mem(struct ctx *c)
(size_t)v < RCVBUF_BIG)
c->low_rmem = 1;
+ /* Check if SO_BINDTODEVICE is available
+ *
+ * Supported since kernel version 5.7, commit c427bfec18f2 ("net: core:
+ * enable SO_BINDTODEVICE for non-root users"). Some distro kernels may
+ * have backports, of course. Record whether we can use it so that we
+ * can give more useful diagnostics.
+ */
+ if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, lo, sizeof(lo) - 1)) {
+ if (errno != EPERM)
+ warn_perror("Unexpected error probing SO_BINDTODEVICE");
+ c->no_bindtodevice = 1;
+ }
+
close(s);
}
@@ -317,6 +382,7 @@ void bitmap_set(uint8_t *map, unsigned bit)
* @map: Pointer to bitmap
* @bit: Bit number to clear
*/
+/* cppcheck-suppress unusedFunction */
void bitmap_clear(uint8_t *map, unsigned bit)
{
unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit);
@@ -346,6 +412,7 @@ bool bitmap_isset(const uint8_t *map, unsigned bit)
* @a: First operand
* @b: Second operand
*/
+/* cppcheck-suppress unusedFunction */
void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b)
{
unsigned long *dw = (unsigned long *)dst;
@@ -360,7 +427,29 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b)
dst[i] = a[i] | b[i];
}
-/*
+/**
+ * bitmap_and_not() - Logical conjunction with complement (AND NOT) of bitmap
+ * @dst: Pointer to result bitmap
+ * @size: Size of bitmaps, in bytes
+ * @a: First operand
+ * @b: Second operand
+ */
+void bitmap_and_not(uint8_t *dst, size_t size,
+ const uint8_t *a, const uint8_t *b)
+{
+ unsigned long *dw = (unsigned long *)dst;
+ unsigned long *aw = (unsigned long *)a;
+ unsigned long *bw = (unsigned long *)b;
+ size_t i;
+
+ for (i = 0; i < size / sizeof(long); i++, dw++, aw++, bw++)
+ *dw = *aw & ~*bw;
+
+ for (i = size / sizeof(long) * sizeof(long); i < size; i++)
+ dst[i] = a[i] & ~b[i];
+}
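A small usage sketch of the new helper (sizes and values picked arbitrarily for illustration):

	uint8_t a[8] = { 0x0f }, b[8] = { 0x05 }, dst[8];

	bitmap_and_not(dst, sizeof(dst), a, b);
	/* dst[0] == 0x0a: bits set in a but not in b */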
+
+/**
* ns_enter() - Enter configured user (unless already joined) and network ns
* @c: Execution context
*
@@ -467,7 +556,7 @@ void pidfile_write(int fd, pid_t pid)
if (write(fd, pid_buf, n) < 0) {
perror("PID file write");
- _exit(EXIT_FAILURE);
+ passt_exit(EXIT_FAILURE);
}
close(fd);
@@ -495,7 +584,8 @@ int output_file_open(const char *path, int flags)
* @pidfile_fd: Open PID file descriptor
* @devnull_fd: Open file descriptor for /dev/null
*
- * Return: child PID on success, won't return on failure
+ * Return: 0 in the child process on success. The parent process exits.
+ * Does not return in either process on failure (calls _exit).
*/
int __daemon(int pidfile_fd, int devnull_fd)
{
@@ -503,12 +593,12 @@ int __daemon(int pidfile_fd, int devnull_fd)
if (pid == -1) {
perror("fork");
- _exit(EXIT_FAILURE);
+ passt_exit(EXIT_FAILURE);
}
if (pid) {
pidfile_write(pidfile_fd, pid);
- _exit(EXIT_SUCCESS);
+ passt_exit(EXIT_SUCCESS);
}
if (setsid() < 0 ||
@@ -516,7 +606,7 @@ int __daemon(int pidfile_fd, int devnull_fd)
dup2(devnull_fd, STDOUT_FILENO) < 0 ||
dup2(devnull_fd, STDERR_FILENO) < 0 ||
close(devnull_fd))
- _exit(EXIT_FAILURE);
+ passt_exit(EXIT_FAILURE);
return 0;
}
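Given the return semantics documented above, the calling pattern is simply (sketch, not taken from this patch):

	/* Only the child returns here (with 0): the parent writes the PID
	 * file and exits inside __daemon(), so execution continues in the
	 * detached child with stdio redirected to /dev/null.
	 */
	__daemon(pidfile_fd, devnull_fd);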
@@ -525,6 +615,9 @@ int __daemon(int pidfile_fd, int devnull_fd)
* fls() - Find last (most significant) bit set in word
* @x: Word
*
+ * Note: unlike ffs() and other implementations of fls(), notably the one from
+ * the Linux kernel, the starting position is 0 and not 1, that is, fls(1) = 0.
+ *
* Return: position of most significant bit set, starting from 0, -1 if none
*/
int fls(unsigned long x)
@@ -541,6 +634,17 @@ int fls(unsigned long x)
}
/**
+ * ilog2() - Integral part (floor) of binary logarithm (logarithm to the base 2)
+ * @x: Argument
+ *
+ * Return: integral part of binary logarithm of @x, -1 if undefined (if @x is 0)
+ */
+int ilog2(unsigned long x)
+{
+ return fls(x);
+}
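A few worked values for the 0-based convention described above:

	ASSERT(fls(0) == -1);			/* no bit set */
	ASSERT(fls(1) == 0 && fls(8) == 3);	/* most significant set bit */
	ASSERT(ilog2(7) == 2 && ilog2(8) == 3);	/* floor of log2(x) */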
+
+/**
* write_file() - Replace contents of file with a string
* @path: File to write
* @buf: String to write
@@ -573,6 +677,97 @@ int write_file(const char *path, const char *buf)
return len == 0 ? 0 : -1;
}
+/**
+ * read_file() - Read contents of file into a NULL-terminated buffer
+ * @path: Path to file to read
+ * @buf: Buffer to store file contents
+ * @buf_size: Size of buffer
+ *
+ * Return: number of bytes read on success, negative error code on failure
+ */
+static ssize_t read_file(const char *path, char *buf, size_t buf_size)
+{
+ size_t total_read = 0;
+ int fd;
+
+ if (!buf_size)
+ return -EINVAL;
+
+ fd = open(path, O_RDONLY | O_CLOEXEC);
+
+ if (fd < 0)
+ return -errno;
+
+ while (total_read < buf_size) {
+ ssize_t rc = read(fd, buf + total_read, buf_size - total_read);
+
+ if (rc < 0) {
+ int errno_save = errno;
+ close(fd);
+ return -errno_save;
+ }
+
+ if (rc == 0)
+ break;
+
+ total_read += rc;
+ }
+
+ close(fd);
+
+ if (total_read == buf_size) {
+ buf[buf_size - 1] = '\0';
+ return -ENOBUFS;
+ }
+
+ buf[total_read] = '\0';
+
+ return total_read;
+}
+
+/**
+ * read_file_integer() - Read an integer value from a file
+ * @path: Path to file to read
+ * @fallback: Default value if file can't be read
+ *
+ * Return: integer value, @fallback on failure
+ */
+intmax_t read_file_integer(const char *path, intmax_t fallback)
+{
+ ssize_t bytes_read;
+ char buf[BUFSIZ];
+ intmax_t value;
+ char *end;
+
+ bytes_read = read_file(path, buf, sizeof(buf));
+
+ if (bytes_read < 0)
+ goto error;
+
+ if (bytes_read == 0) {
+ debug("Empty file %s", path);
+ goto error;
+ }
+
+ errno = 0;
+ value = strtoimax(buf, &end, 10);
+ if (*end && *end != '\n') {
+ debug("Non-numeric content in %s", path);
+ goto error;
+ }
+ if (errno) {
+ debug("Out of range value in %s: %s", path, buf);
+ goto error;
+ }
+
+ return value;
+
+error:
+ debug("Couldn't read %s, using %"PRIdMAX" as default value",
+ path, fallback);
+ return fallback;
+}
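A usage sketch; the sysctl path and fallback value are arbitrary examples, not taken from this patch:

	intmax_t wmem_max = read_file_integer("/proc/sys/net/core/wmem_max",
					      212992);

	/* If the file is missing or empty, or its content is non-numeric or
	 * out of range, the fallback (212992 here) is returned and a debug
	 * message is logged.
	 */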
+
#ifdef __ia64__
/* Needed by do_clone() below: glibc doesn't export the prototype of __clone2(),
* use the description from clone(2).
@@ -603,7 +798,8 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
#endif
}
-/* write_all_buf() - write all of a buffer to an fd
+/**
+ * write_all_buf() - write all of a buffer to an fd
* @fd: File descriptor
* @buf: Pointer to base of buffer
* @len: Length of buffer
@@ -633,7 +829,8 @@ int write_all_buf(int fd, const void *buf, size_t len)
return 0;
}
-/* write_remainder() - write the tail of an IO vector to an fd
+/**
+ * write_remainder() - write the tail of an IO vector to an fd
* @fd: File descriptor
* @iov: IO vector
* @iovcnt: Number of entries in @iov
@@ -757,7 +954,7 @@ int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip)
* @dst: output buffer, minimum SOCKADDR_STRLEN bytes
* @size: size of buffer at @dst
*
- * Return: On success, a non-null pointer to @dst, NULL on failure
+ * Return: on success, a non-null pointer to @dst, NULL on failure
*/
const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size)
{
@@ -817,7 +1014,7 @@ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size)
* @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes
* @size: Size of buffer at @dst
*
- * Return: On success, a non-null pointer to @dst, NULL on failure
+ * Return: on success, a non-null pointer to @dst, NULL on failure
*/
const char *eth_ntop(const unsigned char *mac, char *dst, size_t size)
{
@@ -834,7 +1031,7 @@ const char *eth_ntop(const unsigned char *mac, char *dst, size_t size)
/** str_ee_origin() - Convert socket extended error origin to a string
* @ee: Socket extended error structure
*
- * Return: Static string describing error origin
+ * Return: static string describing error origin
*/
const char *str_ee_origin(const struct sock_extended_err *ee)
{
@@ -871,7 +1068,9 @@ void close_open_files(int argc, char **argv)
errno = 0;
fd = strtol(optarg, NULL, 0);
- if (errno || fd <= STDERR_FILENO || fd > INT_MAX)
+ if (errno ||
+ (fd != STDIN_FILENO && fd <= STDERR_FILENO) ||
+ fd > INT_MAX)
die("Invalid --fd: %s", optarg);
}
} while (name != -1);
@@ -985,17 +1184,6 @@ void raw_random(void *buf, size_t buflen)
}
/**
- * epoll_del() - Remove a file descriptor from our passt epoll
- * @c: Execution context
- * @fd: File descriptor to remove
- */
-void epoll_del(const struct ctx *c, int fd)
-{
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL);
-
-}
-
-/**
* encode_domain_name() - Encode domain name according to RFC 1035, section 3.1
* @buf: Buffer to fill in with encoded domain name
* @domain_name: Input domain name string with terminator
@@ -1017,3 +1205,86 @@ void encode_domain_name(char *buf, const char *domain_name)
}
p[i] = 0L;
}
+
+/**
+ * abort_with_msg() - Print error message and abort
+ * @fmt: Format string
+ * @...: Format parameters
+ */
+void abort_with_msg(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vlogmsg(true, false, LOG_CRIT, fmt, ap);
+ va_end(ap);
+
+ /* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp,
+ * but that will still get the job done.
+ */
+ abort();
+}
+
+/**
+ * passt_exit() - Perform vital cleanup and exit
+ *
+ * We don't use exit(3) because on some C library versions it can do unexpected
+ * things that hit our seccomp profile (e.g. futex() calls). This is a bespoke
+ * wrapper around _exit(2) performing just the cleanup that we need.
+ *
+ * #syscalls fsync
+ */
+void passt_exit(int status)
+{
+ /* Make sure we don't leave the pcap file truncated */
+ if (pcap_fd != -1 && fsync(pcap_fd))
+ warn_perror("Failed to flush pcap file, it might be truncated");
+
+ /* Make sure we don't leave an incomplete log */
+ if (log_file != -1)
+ (void)fsync(log_file);
+
+ /* Make sure we don't leave any messages incomplete */
+ (void)fflush(stderr);
+ (void)fflush(stdout);
+
+ _exit(status);
+}
+
+/**
+ * clamped_scale() - Scale @x from 100% to f% depending on @y's value
+ * @x: Value to scale
+ * @y: Value determining scaling
+ * @lo: Lower bound for @y (start of y-axis slope)
+ * @hi: Upper bound for @y (end of y-axis slope)
+ * @f: Scaling factor, percent (might be less or more than 100)
+ *
+ * Return: @x scaled by a factor linearly interpolated between 100% and @f%
+ *         as @y moves from @lo to @hi (clamped outside that range)
+ *
+ * In pictures:
+ *
+ * f % -> ,---- * If @y < lo (for example, @y is y0), return @x
+ * /| |
+ * / | | * If @lo < @y < @hi (for example, @y is y1),
+ * / | | return @x scaled by a factor linearly
+ * (100 + f) / 2 % ->/ | | interpolated between 100% and f% depending on
+ * /| | | @y's position between @lo (100%) and @hi (f%)
+ * / | | |
+ * / | | | * If @y > @hi (for example, @y is y2), return
+ * 100 % -> -----' | | | @x * @f / 100
+ * | | | | |
+ * y0 lo y1 hi y2 Example: @f = 150, @lo = 10, @hi = 20, @y = 15,
+ * @x = 1000
+ * -> interpolated factor is 125%
+ * -> return 1250
+ */
+long clamped_scale(long x, long y, long lo, long hi, long f)
+{
+ if (y < lo)
+ return x;
+
+ if (y > hi)
+ return x * f / 100;
+
+ return x - (x * (y - lo) / (hi - lo)) * (100 - f) / 100;
+}
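Working through the example given in the comment above:

	/* f = 150, lo = 10, hi = 20, y = 15, x = 1000:
	 *   x - (x * (y - lo) / (hi - lo)) * (100 - f) / 100
	 * = 1000 - (1000 * 5 / 10) * (100 - 150) / 100
	 * = 1000 - 500 * (-50) / 100
	 * = 1000 + 250 = 1250	(interpolated factor: 125%)
	 */
	long scaled = clamped_scale(1000, 15, 10, 20, 150);	/* 1250 */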
diff --git a/util.h b/util.h
index 0f70f4d..4cbb5da 100644
--- a/util.h
+++ b/util.h
@@ -17,6 +17,7 @@
#include <arpa/inet.h>
#include <unistd.h>
#include <sys/syscall.h>
+#include <net/ethernet.h>
#include "log.h"
@@ -31,9 +32,6 @@
#ifndef SECCOMP_RET_KILL_PROCESS
#define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL
#endif
-#ifndef ETH_MAX_MTU
-#define ETH_MAX_MTU USHRT_MAX
-#endif
#ifndef IP_MAX_MTU
#define IP_MAX_MTU USHRT_MAX
#endif
@@ -64,27 +62,22 @@
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
-#ifdef CPPCHECK_6936
+void abort_with_msg(const char *fmt, ...)
+ __attribute__((format(printf, 1, 2), noreturn));
+
/* Some cppcheck versions get confused by aborts inside a loop, causing
* it to give false positive uninitialised variable warnings later in
* the function, because it doesn't realise the non-initialising path
* already exited. See https://trac.cppcheck.net/ticket/13227
+ *
+ * Therefore, avoid using the usual do while wrapper we use to force the macro
+ * to act like a single statement requiring a ';'.
*/
-#define ASSERT(expr) \
- ((expr) ? (void)0 : abort())
-#else
+#define ASSERT_WITH_MSG(expr, ...) \
+ ((expr) ? (void)0 : abort_with_msg(__VA_ARGS__))
#define ASSERT(expr) \
- do { \
- if (!(expr)) { \
- err("ASSERTION FAILED in %s (%s:%d): %s", \
- __func__, __FILE__, __LINE__, STRINGIFY(expr)); \
- /* This may actually SIGSYS, due to seccomp, \
- * but that will still get the job done \
- */ \
- abort(); \
- } \
- } while (0)
-#endif
+ ASSERT_WITH_MSG((expr), "ASSERTION FAILED in %s (%s:%d): %s", \
+ __func__, __FILE__, __LINE__, STRINGIFY(expr))
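A usage sketch of the new macro; the condition and message are made up for illustration:

	ASSERT_WITH_MSG(fd >= 0, "Invalid descriptor %i in %s", fd, __func__);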
#ifdef P_tmpdir
#define TMPDIR P_tmpdir
@@ -105,8 +98,12 @@
#define FD_PROTO(x, proto) \
(IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x)))
+#define MAC_BROADCAST \
+ ((uint8_t [ETH_ALEN]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff })
#define MAC_ZERO ((uint8_t [ETH_ALEN]){ 0 })
#define MAC_IS_ZERO(addr) (!memcmp((addr), MAC_ZERO, ETH_ALEN))
+#define MAC_UNDEF MAC_BROADCAST
+#define MAC_IS_UNDEF(addr) (!memcmp((addr), MAC_UNDEF, ETH_ALEN))
#ifndef __bswap_constant_16
#define __bswap_constant_16(x) \
@@ -156,11 +153,13 @@
#define ntohll(x) (be64toh((x)))
#define htonll(x) (htobe64((x)))
+extern uint8_t eth_pad[ETH_ZLEN];
+
/**
* ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address
* @p: Pointer to the BE value in memory
*
- * Returns: Host-order value of 32-bit BE quantity at @p
+ * Return: host-order value of 32-bit BE quantity at @p
*/
static inline uint32_t ntohl_unaligned(const void *p)
{
@@ -199,6 +198,9 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
#define SNDBUF_BIG (4ULL * 1024 * 1024)
#define SNDBUF_SMALL (128ULL * 1024)
+#define FD_REF_BITS 24
+#define FD_REF_MAX ((int)MAX_FROM_BITS(FD_REF_BITS))
+
#include <net/if.h>
#include <limits.h>
#include <stdint.h>
@@ -207,18 +209,22 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
#include "packet.h"
struct ctx;
+union sockaddr_inany;
-int sock_l4_sa(const struct ctx *c, enum epoll_type type,
- const void *sa, socklen_t sl,
- const char *ifname, bool v6only, uint32_t data);
+int sock_l4(const struct ctx *c, enum epoll_type type,
+ const union sockaddr_inany *sa, const char *ifname);
+int sock_l4_dualstack_any(const struct ctx *c, enum epoll_type type,
+ in_port_t port, const char *ifname);
int sock_unix(char *sock_path);
-void sock_probe_mem(struct ctx *c);
+void sock_probe_features(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);
void bitmap_set(uint8_t *map, unsigned bit);
void bitmap_clear(uint8_t *map, unsigned bit);
bool bitmap_isset(const uint8_t *map, unsigned bit);
void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b);
+void bitmap_and_not(uint8_t *dst, size_t size,
+ const uint8_t *a, const uint8_t *b);
char *line_read(char *buf, size_t len, int fd);
void ns_enter(const struct ctx *c);
bool ns_is_init(void);
@@ -227,19 +233,22 @@ int output_file_open(const char *path, int flags);
void pidfile_write(int fd, pid_t pid);
int __daemon(int pidfile_fd, int devnull_fd);
int fls(unsigned long x);
+int ilog2(unsigned long x);
int write_file(const char *path, const char *buf);
+intmax_t read_file_integer(const char *path, intmax_t fallback);
int write_all_buf(int fd, const void *buf, size_t len);
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
int read_all_buf(int fd, void *buf, size_t len);
int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip);
void close_open_files(int argc, char **argv);
bool snprintf_check(char *str, size_t size, const char *format, ...);
+long clamped_scale(long x, long y, long lo, long hi, long f);
/**
* af_name() - Return name of an address family
* @af: Address/protocol family (AF_INET or AF_INET6)
*
- * Returns: Name of the protocol family as a string
+ * Return: name of the protocol family as a string
*/
static inline const char *af_name(sa_family_t af)
{
@@ -305,7 +314,6 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
#define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__)
void raw_random(void *buf, size_t buflen);
-void epoll_del(const struct ctx *c, int fd);
/*
* Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror,
@@ -379,6 +387,16 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr,
#define accept4(s, addr, addrlen, flags) \
wrap_accept4((s), (addr), (addrlen), (flags))
+static inline int wrap_getsockname(int sockfd, struct sockaddr *addr,
+/* cppcheck-suppress constParameterPointer */
+ socklen_t *addrlen)
+{
+ sa_init(addr, addrlen);
+ return getsockname(sockfd, addr, addrlen);
+}
+#define getsockname(s, addr, addrlen) \
+ wrap_getsockname((s), (addr), (addrlen))
+
#define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */
void encode_domain_name(char *buf, const char *domain_name);
diff --git a/vhost_user.c b/vhost_user.c
index 105f77a..f9a5646 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -32,8 +32,6 @@
#include <inttypes.h>
#include <time.h>
#include <net/ethernet.h>
-#include <netinet/in.h>
-#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <linux/vhost_types.h>
@@ -45,6 +43,7 @@
#include "vhost_user.h"
#include "pcap.h"
#include "migrate.h"
+#include "epoll_ctl.h"
/* vhost-user version we are compatible with */
#define VHOST_USER_VERSION 1
@@ -61,7 +60,7 @@ void vu_print_capabilities(void)
info("{");
info(" \"type\": \"net\"");
info("}");
- _exit(EXIT_SUCCESS);
+ passt_exit(EXIT_SUCCESS);
}
/**
@@ -137,8 +136,8 @@ static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr)
unsigned int i;
/* Find matching memory region. */
- for (i = 0; i < dev->nregions; i++) {
- const struct vu_dev_region *r = &dev->regions[i];
+ for (i = 0; i < dev->memory.nregions; i++) {
+ const struct vu_dev_region *r = &dev->memory.regions[i];
if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
@@ -183,7 +182,7 @@ static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val)
* @conn_fd: vhost-user command socket
* @vmsg: vhost-user message
*
- * Return: 0 if recvmsg() has been interrupted or if there's no data to read,
+ * Return: 0 if recvmsg() has been interrupted or if there's no data to read,
* 1 if a message has been received
*/
static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg)
@@ -302,13 +301,13 @@ static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg)
* @conn_fd: vhost-user command socket
* @vmsg: vhost-user message
*/
-static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
+static void vu_send_reply(int conn_fd, struct vhost_user_msg *vmsg)
{
- msg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
- msg->hdr.flags |= VHOST_USER_VERSION;
- msg->hdr.flags |= VHOST_USER_REPLY_MASK;
+ vmsg->hdr.flags &= ~VHOST_USER_VERSION_MASK;
+ vmsg->hdr.flags |= VHOST_USER_VERSION;
+ vmsg->hdr.flags |= VHOST_USER_REPLY_MASK;
- vu_message_write(conn_fd, msg);
+ vu_message_write(conn_fd, vmsg);
}
/**
@@ -316,10 +315,10 @@ static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as a reply is requested
+ * Return: true as a reply is requested
*/
static bool vu_get_features_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
uint64_t features =
1ULL << VIRTIO_F_VERSION_1 |
@@ -329,9 +328,9 @@ static bool vu_get_features_exec(struct vu_dev *vdev,
(void)vdev;
- vmsg_set_reply_u64(msg, features);
+ vmsg_set_reply_u64(vmsg, features);
- debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("Sending back to guest u64: 0x%016"PRIx64, vmsg->payload.u64);
return true;
}
@@ -345,7 +344,7 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable)
{
uint16_t i;
- for (i = 0; i < VHOST_USER_MAX_QUEUES; i++)
+ for (i = 0; i < VHOST_USER_MAX_VQS; i++)
vdev->vq[i].enable = enable;
}
@@ -354,14 +353,14 @@ static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_features_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- debug("u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
- vdev->features = msg->payload.u64;
+ vdev->features = vmsg->payload.u64;
/* We only support devices conforming to VIRTIO 1.0 or
* later
*/
@@ -379,13 +378,13 @@ static bool vu_set_features_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_owner_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
(void)vdev;
- (void)msg;
+ (void)vmsg;
return false;
}
@@ -396,7 +395,7 @@ static bool vu_set_owner_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vq: Virtqueue
*
- * Return: True if ring cannot be mapped to our address space
+ * Return: true if ring cannot be mapped to our address space
*/
static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
{
@@ -418,18 +417,18 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*
* #syscalls:vu mmap|mmap2 munmap
*/
static bool vu_set_mem_table_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- struct vhost_user_memory m = msg->payload.memory, *memory = &m;
+ struct vhost_user_memory m = vmsg->payload.memory, *memory = &m;
unsigned int i;
- for (i = 0; i < vdev->nregions; i++) {
- const struct vu_dev_region *r = &vdev->regions[i];
+ for (i = 0; i < vdev->memory.nregions; i++) {
+ const struct vu_dev_region *r = &vdev->memory.regions[i];
if (r->mmap_addr) {
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
@@ -437,12 +436,12 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
r->size + r->mmap_offset);
}
}
- vdev->nregions = memory->nregions;
+ vdev->memory.nregions = memory->nregions;
debug("vhost-user nregions: %u", memory->nregions);
- for (i = 0; i < vdev->nregions; i++) {
+ for (i = 0; i < vdev->memory.nregions; i++) {
struct vhost_user_memory_region *msg_region = &memory->regions[i];
- struct vu_dev_region *dev_region = &vdev->regions[i];
+ struct vu_dev_region *dev_region = &vdev->memory.regions[i];
void *mmap_addr;
debug("vhost-user region %d", i);
@@ -465,7 +464,7 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
*/
mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
PROT_READ | PROT_WRITE, MAP_SHARED |
- MAP_NORESERVE, msg->fds[i], 0);
+ MAP_NORESERVE, vmsg->fds[i], 0);
if (mmap_addr == MAP_FAILED)
die_perror("vhost-user region mmap error");
@@ -474,23 +473,17 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
debug(" mmap_addr: 0x%016"PRIx64,
dev_region->mmap_addr);
- close(msg->fds[i]);
+ close(vmsg->fds[i]);
}
- for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
+ for (i = 0; i < VHOST_USER_MAX_VQS; i++) {
if (vdev->vq[i].vring.desc) {
if (map_ring(vdev, &vdev->vq[i]))
die("remapping queue %d during setmemtable", i);
}
}
- /* As vu_packet_check_range() has no access to the number of
- * memory regions, mark the end of the array with mmap_addr = 0
- */
- ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1);
- vdev->regions[vdev->nregions].mmap_addr = 0;
-
- tap_sock_update_pool(vdev->regions, 0);
+ ASSERT(vdev->memory.nregions < VHOST_USER_MAX_RAM_SLOTS);
return false;
}
@@ -541,7 +534,7 @@ static void vu_log_page(uint8_t *log_table, uint64_t page)
/**
* vu_log_write() - Log memory write
- * @dev: vhost-user device
+ * @vdev: vhost-user device
* @address: Memory address
* @length: Memory size
*/
@@ -566,23 +559,23 @@ void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: true as a reply is requested
*
* #syscalls:vu mmap|mmap2 munmap
*/
static bool vu_set_log_base_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
uint64_t log_mmap_size, log_mmap_offset;
void *base;
int fd;
- if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log))
+ if (vmsg->fd_num != 1 || vmsg->hdr.size != sizeof(vmsg->payload.log))
die("vhost-user: Invalid log_base message");
- fd = msg->fds[0];
- log_mmap_offset = msg->payload.log.mmap_offset;
- log_mmap_size = msg->payload.log.mmap_size;
+ fd = vmsg->fds[0];
+ log_mmap_offset = vmsg->payload.log.mmap_offset;
+ log_mmap_size = vmsg->payload.log.mmap_size;
debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset);
debug("vhost-user log mmap_size: %"PRId64, log_mmap_size);
@@ -599,8 +592,8 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
vdev->log_table = base;
vdev->log_size = log_mmap_size;
- msg->hdr.size = sizeof(msg->payload.u64);
- msg->fd_num = 0;
+ vmsg->hdr.size = sizeof(vmsg->payload.u64);
+ vmsg->fd_num = 0;
return true;
}
@@ -610,18 +603,18 @@ static bool vu_set_log_base_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_log_fd_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- if (msg->fd_num != 1)
+ if (vmsg->fd_num != 1)
die("Invalid log_fd message");
if (vdev->log_call_fd != -1)
close(vdev->log_call_fd);
- vdev->log_call_fd = msg->fds[0];
+ vdev->log_call_fd = vmsg->fds[0];
debug("Got log_call_fd: %d", vdev->log_call_fd);
@@ -633,13 +626,13 @@ static bool vu_set_log_fd_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_num_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int idx = msg->payload.state.index;
- unsigned int num = msg->payload.state.num;
+ unsigned int idx = vmsg->payload.state.index;
+ unsigned int num = vmsg->payload.state.num;
trace("State.index: %u", idx);
trace("State.num: %u", num);
@@ -653,16 +646,16 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
/* We need to copy the payload to vhost_vring_addr structure
- * to access index because address of msg->payload.addr
+ * to access index because address of vmsg->payload.addr
* can be unaligned as it is packed.
*/
- struct vhost_vring_addr addr = msg->payload.addr;
+ struct vhost_vring_addr addr = vmsg->payload.addr;
struct vu_virtq *vq = &vdev->vq[addr.index];
debug("vhost_vring_addr:");
@@ -677,7 +670,7 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
debug(" log_guest_addr: 0x%016" PRIx64,
(uint64_t)addr.log_guest_addr);
- vq->vra = msg->payload.addr;
+ vq->vra = vmsg->payload.addr;
vq->vring.flags = addr.flags;
vq->vring.log_guest_addr = addr.log_guest_addr;
@@ -699,13 +692,13 @@ static bool vu_set_vring_addr_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_base_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int idx = msg->payload.state.index;
- unsigned int num = msg->payload.state.num;
+ unsigned int idx = vmsg->payload.state.index;
+ unsigned int num = vmsg->payload.state.num;
debug("State.index: %u", idx);
debug("State.num: %u", num);
@@ -720,16 +713,16 @@ static bool vu_set_vring_base_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as a reply is requested
+ * Return: true as a reply is requested
*/
static bool vu_get_vring_base_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int idx = msg->payload.state.index;
+ unsigned int idx = vmsg->payload.state.index;
debug("State.index: %u", idx);
- msg->payload.state.num = vdev->vq[idx].last_avail_idx;
- msg->hdr.size = sizeof(msg->payload.state);
+ vmsg->payload.state.num = vdev->vq[idx].last_avail_idx;
+ vmsg->hdr.size = sizeof(vmsg->payload.state);
vdev->vq[idx].started = false;
vdev->vq[idx].vring.avail = 0;
@@ -739,7 +732,7 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev,
vdev->vq[idx].call_fd = -1;
}
if (vdev->vq[idx].kick_fd != -1) {
- epoll_del(vdev->context, vdev->vq[idx].kick_fd);
+ epoll_del(vdev->context->epollfd, vdev->vq[idx].kick_fd);
close(vdev->vq[idx].kick_fd);
vdev->vq[idx].kick_fd = -1;
}
@@ -759,11 +752,8 @@ static void vu_set_watch(const struct vu_dev *vdev, int idx)
.fd = vdev->vq[idx].kick_fd,
.queue = idx
};
- struct epoll_event ev = { 0 };
- ev.data.u64 = ref.u64;
- ev.events = EPOLLIN;
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+ epoll_add(vdev->context->epollfd, EPOLLIN, ref);
}
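epoll_add() comes from the new epoll_ctl.h, whose definition isn't shown in this section; based on the open-coded sequence it replaces here, it is assumed to boil down to roughly this sketch:

	static void epoll_add_sketch(int epollfd, uint32_t events,
				     union epoll_ref ref)
	{
		struct epoll_event ev = { .events = events,
					  .data.u64 = ref.u64 };

		if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ref.fd, &ev) == -1)
			warn_perror("epoll_ctl(EPOLL_CTL_ADD) on fd %i",
				    ref.fd);
	}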
/**
@@ -771,21 +761,21 @@ static void vu_set_watch(const struct vu_dev *vdev, int idx)
* close fds if NOFD bit is set
* @vmsg: vhost-user message
*/
-static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
+static void vu_check_queue_msg_file(struct vhost_user_msg *vmsg)
{
- bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
- int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+ int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- if (idx >= VHOST_USER_MAX_QUEUES)
+ if (idx >= VHOST_USER_MAX_VQS)
die("Invalid vhost-user queue index: %u", idx);
if (nofd) {
- vmsg_close_fds(msg);
+ vmsg_close_fds(vmsg);
return;
}
- if (msg->fd_num != 1)
- die("Invalid fds in vhost-user request: %d", msg->hdr.request);
+ if (vmsg->fd_num != 1)
+ die("Invalid fds in vhost-user request: %d", vmsg->hdr.request);
}
/**
@@ -794,26 +784,26 @@ static void vu_check_queue_msg_file(struct vhost_user_msg *msg)
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
- int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+ int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- debug("u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
- vu_check_queue_msg_file(msg);
+ vu_check_queue_msg_file(vmsg);
if (vdev->vq[idx].kick_fd != -1) {
- epoll_del(vdev->context, vdev->vq[idx].kick_fd);
+ epoll_del(vdev->context->epollfd, vdev->vq[idx].kick_fd);
close(vdev->vq[idx].kick_fd);
vdev->vq[idx].kick_fd = -1;
}
if (!nofd)
- vdev->vq[idx].kick_fd = msg->fds[0];
+ vdev->vq[idx].kick_fd = vmsg->fds[0];
debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx);
@@ -834,17 +824,17 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_call_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
- int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+ int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- debug("u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
- vu_check_queue_msg_file(msg);
+ vu_check_queue_msg_file(vmsg);
if (vdev->vq[idx].call_fd != -1) {
close(vdev->vq[idx].call_fd);
@@ -852,11 +842,11 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
}
if (!nofd)
- vdev->vq[idx].call_fd = msg->fds[0];
+ vdev->vq[idx].call_fd = vmsg->fds[0];
/* in case of I/O hang after reconnecting */
if (vdev->vq[idx].call_fd != -1)
- eventfd_write(msg->fds[0], 1);
+ eventfd_write(vmsg->fds[0], 1);
debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx);
@@ -869,17 +859,17 @@ static bool vu_set_vring_call_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_err_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
- int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+ int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- debug("u64: 0x%016"PRIx64, msg->payload.u64);
+ debug("u64: 0x%016"PRIx64, vmsg->payload.u64);
- vu_check_queue_msg_file(msg);
+ vu_check_queue_msg_file(vmsg);
if (vdev->vq[idx].err_fd != -1) {
close(vdev->vq[idx].err_fd);
@@ -887,7 +877,7 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
}
if (!nofd)
- vdev->vq[idx].err_fd = msg->fds[0];
+ vdev->vq[idx].err_fd = vmsg->fds[0];
return false;
}
@@ -898,10 +888,10 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as a reply is requested
+ * Return: true as a reply is requested
*/
static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
@@ -909,7 +899,7 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
1ULL << VHOST_USER_PROTOCOL_F_RARP;
(void)vdev;
- vmsg_set_reply_u64(msg, features);
+ vmsg_set_reply_u64(vmsg, features);
return true;
}
@@ -919,16 +909,16 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- uint64_t features = msg->payload.u64;
+ uint64_t features = vmsg->payload.u64;
debug("u64: 0x%016"PRIx64, features);
- vdev->protocol_features = msg->payload.u64;
+ vdev->protocol_features = vmsg->payload.u64;
return false;
}
@@ -938,14 +928,17 @@ static bool vu_set_protocol_features_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as a reply is requested
+ * Return: true as a reply is requested
*/
static bool vu_get_queue_num_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
(void)vdev;
- vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES);
+ /* NOLINTNEXTLINE(misc-redundant-expression) */
+ vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_VQS / 2);
+
+	debug("VHOST_USER_GET_QUEUE_NUM: %u", VHOST_USER_MAX_VQS / 2);
return true;
}
@@ -955,18 +948,18 @@ static bool vu_get_queue_num_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int enable = msg->payload.state.num;
- unsigned int idx = msg->payload.state.index;
+ unsigned int enable = vmsg->payload.state.num;
+ unsigned int idx = vmsg->payload.state.index;
debug("State.index: %u", idx);
debug("State.enable: %u", enable);
- if (idx >= VHOST_USER_MAX_QUEUES)
+ if (idx >= VHOST_USER_MAX_VQS)
die("Invalid vring_enable index: %u", idx);
vdev->vq[idx].enable = enable;
@@ -974,17 +967,17 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
}
/**
- * vu_set_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
- * RARP to notify the migration is terminated",
- * but passt doesn't need to update any ARP table,
- * so do nothing to silence QEMU bogus error message
+ * vu_send_rarp_exec() - vhost-user specification says: "Broadcast a fake
+ * RARP to notify the migration is terminated",
+ * but passt doesn't need to update any ARP table,
+ * so do nothing to silence QEMU bogus error message
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: False as no reply is requested
+ * Return: false as no reply is requested
*/
static bool vu_send_rarp_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
char macstr[ETH_ADDRSTRLEN];
@@ -993,7 +986,7 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
/* ignore the command */
debug("Ignore command VHOST_USER_SEND_RARP for %s",
- eth_ntop((unsigned char *)&msg->payload.u64, macstr,
+ eth_ntop((unsigned char *)&vmsg->payload.u64, macstr,
sizeof(macstr)));
return false;
@@ -1004,16 +997,16 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as the reply contains 0 to indicate success
+ * Return: true as the reply contains 0 to indicate success
* and set bit 8 as we don't provide our own fd.
*/
static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- unsigned int direction = msg->payload.transfer_state.direction;
- unsigned int phase = msg->payload.transfer_state.phase;
+ unsigned int direction = vmsg->payload.transfer_state.direction;
+ unsigned int phase = vmsg->payload.transfer_state.phase;
- if (msg->fd_num != 1)
+ if (vmsg->fd_num != 1)
die("Invalid device_state_fd message");
if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED)
@@ -1021,13 +1014,13 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE &&
direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
- die("Invalide device_state_fd direction: %d", direction);
+ die("Invalid device_state_fd direction: %d", direction);
- migrate_request(vdev->context, msg->fds[0],
+ migrate_request(vdev->context, vmsg->fds[0],
direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
/* We don't provide a new fd for the data transfer */
- vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
+ vmsg_set_reply_u64(vmsg, VHOST_USER_VRING_NOFD_MASK);
return true;
}
@@ -1037,13 +1030,13 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
* @vdev: vhost-user device
* @vmsg: vhost-user message
*
- * Return: True as the reply contains the migration result
+ * Return: true as the reply contains the migration result
*/
/* cppcheck-suppress constParameterCallback */
static bool vu_check_device_state_exec(struct vu_dev *vdev,
- struct vhost_user_msg *msg)
+ struct vhost_user_msg *vmsg)
{
- vmsg_set_reply_u64(msg, vdev->context->device_state_result);
+ vmsg_set_reply_u64(vmsg, vdev->context->device_state_result);
return true;
}
@@ -1051,7 +1044,6 @@ static bool vu_check_device_state_exec(struct vu_dev *vdev,
/**
* vu_init() - Initialize vhost-user device structure
* @c: execution context
- * @vdev: vhost-user device
*/
void vu_init(struct ctx *c)
{
@@ -1059,7 +1051,7 @@ void vu_init(struct ctx *c)
c->vdev = &vdev_storage;
c->vdev->context = c;
- for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
+ for (i = 0; i < VHOST_USER_MAX_VQS; i++) {
c->vdev->vq[i] = (struct vu_virtq){
.call_fd = -1,
.kick_fd = -1,
@@ -1082,7 +1074,7 @@ void vu_cleanup(struct vu_dev *vdev)
{
unsigned int i;
- for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
+ for (i = 0; i < VHOST_USER_MAX_VQS; i++) {
struct vu_virtq *vq = &vdev->vq[i];
vq->started = false;
@@ -1097,7 +1089,7 @@ void vu_cleanup(struct vu_dev *vdev)
vq->err_fd = -1;
}
if (vq->kick_fd != -1) {
- epoll_del(vdev->context, vq->kick_fd);
+ epoll_del(vdev->context->epollfd, vq->kick_fd);
close(vq->kick_fd);
vq->kick_fd = -1;
}
@@ -1107,8 +1099,8 @@ void vu_cleanup(struct vu_dev *vdev)
vq->vring.avail = 0;
}
- for (i = 0; i < vdev->nregions; i++) {
- const struct vu_dev_region *r = &vdev->regions[i];
+ for (i = 0; i < vdev->memory.nregions; i++) {
+ const struct vu_dev_region *r = &vdev->memory.regions[i];
if (r->mmap_addr) {
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
@@ -1116,7 +1108,7 @@ void vu_cleanup(struct vu_dev *vdev)
r->size + r->mmap_offset);
}
}
- vdev->nregions = 0;
+ vdev->memory.nregions = 0;
vu_close_log(vdev);
@@ -1134,7 +1126,7 @@ static void vu_sock_reset(struct vu_dev *vdev)
}
static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
- struct vhost_user_msg *msg) = {
+ struct vhost_user_msg *vmsg) = {
[VHOST_USER_GET_FEATURES] = vu_get_features_exec,
[VHOST_USER_SET_FEATURES] = vu_set_features_exec,
[VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec,
@@ -1165,7 +1157,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
*/
void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
{
- struct vhost_user_msg msg = { 0 };
+ struct vhost_user_msg vmsg = { 0 };
bool need_reply, reply_requested;
int ret;
@@ -1174,41 +1166,46 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
return;
}
- ret = vu_message_read_default(fd, &msg);
+ ret = vu_message_read_default(fd, &vmsg);
if (ret == 0) {
vu_sock_reset(vdev);
return;
}
debug("================ Vhost user message ================");
- debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
- msg.hdr.request);
- debug("Flags: 0x%x", msg.hdr.flags);
- debug("Size: %u", msg.hdr.size);
+ debug("Request: %s (%d)", vu_request_to_string(vmsg.hdr.request),
+ vmsg.hdr.request);
+ debug("Flags: 0x%x", vmsg.hdr.flags);
+ debug("Size: %u", vmsg.hdr.size);
- need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
+ need_reply = vmsg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
- if (msg.hdr.request >= 0 && msg.hdr.request < VHOST_USER_MAX &&
- vu_handle[msg.hdr.request])
- reply_requested = vu_handle[msg.hdr.request](vdev, &msg);
+ if (vmsg.hdr.request >= 0 && vmsg.hdr.request < VHOST_USER_MAX &&
+ vu_handle[vmsg.hdr.request])
+ reply_requested = vu_handle[vmsg.hdr.request](vdev, &vmsg);
else
- die("Unhandled request: %d", msg.hdr.request);
+ die("Unhandled request: %d", vmsg.hdr.request);
/* cppcheck-suppress legacyUninitvar */
if (!reply_requested && need_reply) {
- msg.payload.u64 = 0;
- msg.hdr.flags = 0;
- msg.hdr.size = sizeof(msg.payload.u64);
- msg.fd_num = 0;
+ vmsg.payload.u64 = 0;
+ vmsg.hdr.flags = 0;
+ vmsg.hdr.size = sizeof(vmsg.payload.u64);
+ vmsg.fd_num = 0;
reply_requested = true;
}
if (reply_requested)
- vu_send_reply(fd, &msg);
+ vu_send_reply(fd, &vmsg);
- if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
+ if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
vdev->context->device_state_result == 0 &&
!vdev->context->migrate_target) {
- info("Migration complete, exiting");
- _exit(EXIT_SUCCESS);
+ if (vdev->context->migrate_exit) {
+ info("Migration complete, exiting");
+ passt_exit(EXIT_SUCCESS);
+ }
+
+ info("Migration complete");
+ vdev->context->one_off = false;
}
}
diff --git a/vhost_user.h b/vhost_user.h
index 1daacd1..e806a9e 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -184,7 +184,7 @@ union vhost_user_payload {
};
/**
- * struct vhost_user_msg - vhost-use message
+ * struct vhost_user_msg - vhost-user message
* @hdr: Message header
* @payload: Message payload
* @fds: File descriptors associated with the message
@@ -217,7 +217,7 @@ struct vhost_user_msg {
})
/**
- * vu_queue_enabled - Return state of a virtqueue
+ * vu_queue_enabled() - Return state of a virtqueue
* @vq: virtqueue to check
*
* Return: true if the virqueue is enabled, false otherwise
@@ -228,7 +228,7 @@ static inline bool vu_queue_enabled(const struct vu_virtq *vq)
}
/**
- * vu_queue_started - Return state of a virtqueue
+ * vu_queue_started() - Return state of a virtqueue
* @vq: virtqueue to check
*
* Return: true if the virqueue is started, false otherwise
diff --git a/virtio.c b/virtio.c
index bc2b89a..bd388c2 100644
--- a/virtio.c
+++ b/virtio.c
@@ -102,8 +102,8 @@ static void *vu_gpa_to_va(const struct vu_dev *dev, uint64_t *plen,
return NULL;
/* Find matching memory region. */
- for (i = 0; i < dev->nregions; i++) {
- const struct vu_dev_region *r = &dev->regions[i];
+ for (i = 0; i < dev->memory.nregions; i++) {
+ const struct vu_dev_region *r = &dev->memory.regions[i];
if ((guest_addr >= r->gpa) &&
(guest_addr < (r->gpa + r->size))) {
@@ -156,9 +156,9 @@ static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i)
}
/**
- * virtq_used_event - Get location of used event indices
+ * virtq_used_event() - Get location of used event indices
* (only with VIRTIO_F_EVENT_IDX)
- * @vq Virtqueue
+ * @vq: Virtqueue
*
* Return: return the location of the used event index
*/
@@ -170,7 +170,7 @@ static inline uint16_t *virtq_used_event(const struct vu_virtq *vq)
/**
* vring_get_used_event() - Get the used event from the available ring
- * @vq Virtqueue
+ * @vq: Virtqueue
*
* Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set)
* used_event is a performant alternative where the driver
@@ -235,6 +235,7 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
memcpy(desc, orig_desc, read_len);
len -= read_len;
addr += read_len;
+ /* NOLINTNEXTLINE(bugprone-sizeof-expression,cert-arr39-c) */
desc += read_len / sizeof(struct vring_desc);
}
@@ -243,9 +244,9 @@ static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
/**
* enum virtqueue_read_desc_state - State in the descriptor chain
- * @VIRTQUEUE_READ_DESC_ERROR Found an invalid descriptor
- * @VIRTQUEUE_READ_DESC_DONE No more descriptors in the chain
- * @VIRTQUEUE_READ_DESC_MORE there are more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_ERROR: Found an invalid descriptor
+ * @VIRTQUEUE_READ_DESC_DONE: No more descriptors in the chain
+ * @VIRTQUEUE_READ_DESC_MORE: there are more descriptors in the chain
*/
enum virtqueue_read_desc_state {
VIRTQUEUE_READ_DESC_ERROR = -1,
@@ -346,8 +347,9 @@ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
die_perror("Error writing vhost-user queue eventfd");
}
-/* virtq_avail_event() - Get location of available event indices
- * (only with VIRTIO_F_EVENT_IDX)
+/**
+ * virtq_avail_event() - Get location of available event indices
+ * (only with VIRTIO_F_EVENT_IDX)
* @vq: Virtqueue
*
* Return: return the location of the available event index
@@ -420,8 +422,8 @@ static bool virtqueue_map_desc(const struct vu_dev *dev,
}
/**
- * vu_queue_map_desc - Map the virtqueue descriptor ring into our virtual
- * address space
+ * vu_queue_map_desc() - Map the virtqueue descriptor ring into our virtual
+ * address space
* @dev: Vhost-user device
* @vq: Virtqueue
* @idx: First descriptor ring entry to map
@@ -504,7 +506,7 @@ static int vu_queue_map_desc(const struct vu_dev *dev,
* vu_queue_pop() - Pop an entry from the virtqueue
* @dev: Vhost-user device
* @vq: Virtqueue
- * @elem: Virtqueue element to file with the entry information
+ * @elem: Virtqueue element to fill with the entry information
*
* Return: -1 if there is an error, 0 otherwise
*/
@@ -544,7 +546,7 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
}
/**
- * vu_queue_detach_element() - Detach an element from the virqueue
+ * vu_queue_detach_element() - Detach an element from the virtqueue
* @vq: Virtqueue
*/
void vu_queue_detach_element(struct vu_virtq *vq)
@@ -554,7 +556,7 @@ void vu_queue_detach_element(struct vu_virtq *vq)
}
/**
- * vu_queue_unpop() - Push back the previously popped element from the virqueue
+ * vu_queue_unpop() - Push back the previously popped element from the virtqueue
* @vq: Virtqueue
*/
/* cppcheck-suppress unusedFunction */
@@ -568,6 +570,8 @@ void vu_queue_unpop(struct vu_virtq *vq)
* vu_queue_rewind() - Push back a given number of popped elements
* @vq: Virtqueue
* @num: Number of element to unpop
+ *
+ * Return: true on success, false if not
*/
bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
{
diff --git a/virtio.h b/virtio.h
index 7a370bd..12caaa0 100644
--- a/virtio.h
+++ b/virtio.h
@@ -88,7 +88,7 @@ struct vu_dev_region {
uint64_t mmap_addr;
};
-#define VHOST_USER_MAX_QUEUES 2
+#define VHOST_USER_MAX_VQS 2
/*
* Set a reasonable maximum number of ram slots, which will be supported by
@@ -97,10 +97,21 @@ struct vu_dev_region {
#define VHOST_USER_MAX_RAM_SLOTS 32
/**
+ * struct vdev_memory - Describes the shared memory regions for a vhost-user
+ * device
+ * @nregions: Number of shared memory regions
+ * @regions: Guest shared memory regions
+ */
+struct vdev_memory {
+ uint32_t nregions;
+ struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS];
+};
+
+/**
* struct vu_dev - vhost-user device information
* @context: Execution context
- * @nregions: Number of shared memory regions
- * @regions: Guest shared memory regions
+ * @memory: Shared memory regions
+ * @vq: Virtqueues of the device
* @features: Vhost-user features
* @protocol_features: Vhost-user protocol features
* @log_call_fd: Eventfd to report logging update
@@ -109,9 +120,8 @@ struct vu_dev_region {
*/
struct vu_dev {
struct ctx *context;
- uint32_t nregions;
- struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS];
- struct vu_virtq vq[VHOST_USER_MAX_QUEUES];
+ struct vdev_memory memory;
+ struct vu_virtq vq[VHOST_USER_MAX_VQS];
uint64_t features;
uint64_t protocol_features;
int log_call_fd;
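With the two loose fields folded into struct vdev_memory, code that walks the guest memory map now goes through dev->memory, as the context lines at the top of this series show. A hedged sketch of a lookup over the new layout (the helper name is hypothetical; the real translation lives in virtio.c):

#include <stdint.h>

#include "virtio.h"

/* Hypothetical helper: translate a guest physical address by scanning
 * the regions grouped under struct vdev_memory.
 */
static void *gpa_to_va_sketch(const struct vu_dev *dev, uint64_t guest_addr)
{
	unsigned int i;

	for (i = 0; i < dev->memory.nregions; i++) {
		const struct vu_dev_region *r = &dev->memory.regions[i];

		if (guest_addr >= r->gpa && guest_addr < r->gpa + r->size)
			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
			return (void *)(uintptr_t)(guest_addr - r->gpa +
						   r->mmap_addr +
						   r->mmap_offset);
	}

	return NULL;
}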
@@ -140,7 +150,7 @@ struct vu_virtq_element {
* @features: Features set
* @fb: Feature bit to check
*
- * Return: True if the feature bit is set
+ * Return: true if the feature bit is set
*/
static inline bool has_feature(uint64_t features, unsigned int fbit)
{
@@ -150,9 +160,9 @@ static inline bool has_feature(uint64_t features, unsigned int fbit)
/**
* vu_has_feature() - Check if a virtio-net feature is available
* @vdev: Vhost-user device
- * @bit: Feature to check
+ * @fbit: Feature to check
*
- * Return: True if the feature is available
+ * Return: true if the feature is available
*/
static inline bool vu_has_feature(const struct vu_dev *vdev,
unsigned int fbit)
@@ -163,9 +173,9 @@ static inline bool vu_has_feature(const struct vu_dev *vdev,
/**
* vu_has_protocol_feature() - Check if a vhost-user feature is available
* @vdev: Vhost-user device
- * @bit: Feature to check
+ * @fbit: Feature to check
*
- * Return: True if the feature is available
+ * Return: true if the feature is available
*/
/* cppcheck-suppress unusedFunction */
static inline bool vu_has_protocol_feature(const struct vu_dev *vdev,
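Both helpers above boil down to a single bit test against a 64-bit feature mask. Their bodies aren't part of these hunks, so the following is only the customary shape of such a check:

#include <stdbool.h>
#include <stdint.h>

/* Customary virtio-style feature test: is bit 'fbit' set in the mask? */
static inline bool feature_bit_set(uint64_t features, unsigned int fbit)
{
	return !!(features & (UINT64_C(1) << fbit));
}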
diff --git a/vu_common.c b/vu_common.c
index 686a09b..c682498 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -6,7 +6,6 @@
*/
#include <errno.h>
-#include <unistd.h>
#include <sys/uio.h>
#include <sys/eventfd.h>
#include <netinet/if_ether.h>
@@ -19,28 +18,35 @@
#include "pcap.h"
#include "vu_common.h"
#include "migrate.h"
+#include "epoll_ctl.h"
#define VU_MAX_TX_BUFFER_NB 2
/**
* vu_packet_check_range() - Check if a given memory zone is contained in
* a mapped guest memory region
- * @buf: Array of the available memory regions
+ * @memory: Array of the available memory regions
* @ptr: Start of desired data range
- * @size: Length of desired data range
+ * @len: Length of desired data range
*
* Return: 0 if the zone is in a mapped memory region, -1 otherwise
*/
-int vu_packet_check_range(void *buf, const char *ptr, size_t len)
+int vu_packet_check_range(struct vdev_memory *memory,
+ const char *ptr, size_t len)
{
- struct vu_dev_region *dev_region;
+ struct vu_dev_region *dev_region = memory->regions;
+ unsigned int i;
- for (dev_region = buf; dev_region->mmap_addr; dev_region++) {
+ for (i = 0; i < memory->nregions; i++) {
+ uintptr_t base_addr = dev_region[i].mmap_addr +
+ dev_region[i].mmap_offset;
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
- char *m = (char *)(uintptr_t)dev_region->mmap_addr;
+ const char *base = (const char *)base_addr;
- if (m <= ptr &&
- ptr + len <= m + dev_region->mmap_offset + dev_region->size)
+ ASSERT(base_addr >= dev_region[i].mmap_addr);
+
+ if (len <= dev_region[i].size && base <= ptr &&
+ (size_t)(ptr - base) <= dev_region[i].size - len)
return 0;
}
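The rewritten check avoids forming ptr + len, which could overflow: a range [ptr, ptr + len) lies inside [base, base + size) exactly when len <= size, base <= ptr, and ptr - base <= size - len. As a standalone sketch with hypothetical names:

#include <stdbool.h>
#include <stddef.h>

/* Overflow-safe containment test: is [ptr, ptr + len) contained in
 * [base, base + size)?  Neither ptr + len nor base + size is computed.
 */
static bool range_within(const char *base, size_t size,
			 const char *ptr, size_t len)
{
	return len <= size && base <= ptr &&
	       (size_t)(ptr - base) <= size - len;
}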
@@ -70,13 +76,13 @@ void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt
* in the in_sg array.
* @max_elem: Number of virtqueue elements in the array
* @size: Maximum size of the data in the frame
- * @frame_size: The total size of the buffers (output)
+ * @collected: Collected buffer length, up to @size, set on return
*
* Return: number of elements used to contain the frame
*/
int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
struct vu_virtq_element *elem, int max_elem,
- size_t size, size_t *frame_size)
+ size_t size, size_t *collected)
{
size_t current_size = 0;
int elem_cnt = 0;
@@ -107,8 +113,8 @@ int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
break;
}
- if (frame_size)
- *frame_size = current_size;
+ if (collected)
+ *collected = current_size;
return elem_cnt;
}
@@ -159,7 +165,6 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
struct vu_virtq *vq = &vdev->vq[index];
- int hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
int out_sg_count;
int count;
@@ -172,6 +177,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
while (count < VIRTQUEUE_MAX_SIZE &&
out_sg_count + VU_MAX_TX_BUFFER_NB <= VIRTQUEUE_MAX_SIZE) {
int ret;
+ struct iov_tail data;
elem[count].out_num = VU_MAX_TX_BUFFER_NB;
elem[count].out_sg = &out_sg[out_sg_count];
@@ -187,25 +193,10 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
warn("virtio-net transmit queue contains no out buffers");
break;
}
- if (elem[count].out_num == 1) {
- tap_add_packet(vdev->context,
- elem[count].out_sg[0].iov_len - hdrlen,
- (char *)elem[count].out_sg[0].iov_base +
- hdrlen);
- } else {
- /* vnet header can be in a separate iovec */
- if (elem[count].out_num != 2) {
- debug("virtio-net transmit queue contains more than one buffer ([%d]: %u)",
- count, elem[count].out_num);
- } else if (elem[count].out_sg[0].iov_len != (size_t)hdrlen) {
- debug("virtio-net transmit queue entry not aligned on hdrlen ([%d]: %d != %zu)",
- count, hdrlen, elem[count].out_sg[0].iov_len);
- } else {
- tap_add_packet(vdev->context,
- elem[count].out_sg[1].iov_len,
- (char *)elem[count].out_sg[1].iov_base);
- }
- }
+
+ data = IOV_TAIL(elem[count].out_sg, elem[count].out_num, 0);
+ if (IOV_DROP_HEADER(&data, struct virtio_net_hdr_mrg_rxbuf))
+ tap_add_packet(vdev->context, &data, now);
count++;
}
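The manual one-or-two-buffer handling is replaced by an iov_tail view that skips the virtio-net header wherever it happens to end, so the old layout restriction (header inline in the single buffer, or exactly its own separate buffer) goes away. The project's IOV_TAIL()/IOV_DROP_HEADER() macros come from iov.h; the sketch below is only a simplified stand-in for the idea of dropping a fixed-size header from the front of an iovec array:

#include <stddef.h>
#include <sys/uio.h>

/* Simplified stand-in: advance (*iov, *cnt) past 'hdrlen' bytes and
 * report the offset of the payload within the first remaining entry.
 * Returns 0 on success, -1 if the buffers are shorter than the header.
 */
static int drop_header_sketch(const struct iovec **iov, size_t *cnt,
			      size_t *off, size_t hdrlen)
{
	*off = 0;

	while (*cnt) {
		if (hdrlen < (*iov)->iov_len) {
			*off = hdrlen;	/* payload starts here */
			return 0;
		}
		hdrlen -= (*iov)->iov_len;
		(*iov)++;
		(*cnt)--;
	}

	return hdrlen ? -1 : 0;
}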
@@ -302,3 +293,17 @@ err:
return -1;
}
+
+/**
+ * vu_pad() - Pad 802.3 frame to minimum length (60 bytes) if needed
+ * @iov: Buffer in iovec array where end of 802.3 frame is stored
+ * @l2len: Layer-2 length already filled in frame
+ */
+void vu_pad(struct iovec *iov, size_t l2len)
+{
+ if (l2len >= ETH_ZLEN)
+ return;
+
+ memset((char *)iov->iov_base + iov->iov_len, 0, ETH_ZLEN - l2len);
+ iov->iov_len += ETH_ZLEN - l2len;
+}
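One caller-side note on the new helper: vu_pad() writes up to ETH_ZLEN - l2len bytes past iov_len of the last buffer, so that buffer needs enough spare capacity, which the virtqueue buffers it is used with provide. A minimal usage sketch with a local buffer sized for the worst case:

#include <netinet/if_ether.h>	/* ETH_ZLEN */
#include <sys/uio.h>

#include "vu_common.h"

static void pad_example(void)
{
	/* e.g. a 42-byte ARP reply in a buffer with room for padding */
	char frame[ETH_ZLEN] = { 0 };
	struct iovec iov = { .iov_base = frame, .iov_len = 42 };

	vu_pad(&iov, iov.iov_len);	/* iov.iov_len is now ETH_ZLEN */
}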
diff --git a/vu_common.h b/vu_common.h
index f538f23..27fe7e0 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -48,7 +48,7 @@ void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov,
int elem_cnt);
int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
struct vu_virtq_element *elem, int max_elem, size_t size,
- size_t *frame_size);
+ size_t *collected);
void vu_set_vnethdr(const struct vu_dev *vdev,
struct virtio_net_hdr_mrg_rxbuf *vnethdr,
int num_buffers);
@@ -57,5 +57,6 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
const struct timespec *now);
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
+void vu_pad(struct iovec *iov, size_t l2len);
#endif /* VU_COMMON_H */