diff options
-rw-r--r-- | Makefile | 20 | ||||
-rw-r--r-- | checksum.c | 34 | ||||
-rw-r--r-- | checksum.h | 3 | ||||
-rw-r--r-- | conf.c | 161 | ||||
-rw-r--r-- | conf.h | 1 | ||||
-rw-r--r-- | contrib/fedora/passt.spec | 6 | ||||
-rw-r--r-- | contrib/selinux/passt-repair.te | 39 | ||||
-rw-r--r-- | contrib/selinux/passt.te | 25 | ||||
-rw-r--r-- | contrib/selinux/pasta.te | 14 | ||||
-rw-r--r-- | dhcp.c | 64 | ||||
-rw-r--r-- | dhcpv6.c | 99 | ||||
-rw-r--r-- | epoll_type.h | 6 | ||||
-rw-r--r-- | flow.c | 288 | ||||
-rw-r--r-- | flow.h | 26 | ||||
-rw-r--r-- | flow_table.h | 42 | ||||
-rw-r--r-- | icmp.c | 5 | ||||
-rw-r--r-- | iov.c | 1 | ||||
-rw-r--r-- | ip.h | 34 | ||||
-rw-r--r-- | log.c | 51 | ||||
-rw-r--r-- | log.h | 1 | ||||
-rw-r--r-- | migrate.c | 300 | ||||
-rw-r--r-- | migrate.h | 51 | ||||
-rw-r--r-- | ndp.c | 2 | ||||
-rw-r--r-- | netlink.c | 2 | ||||
-rw-r--r-- | packet.c | 40 | ||||
-rw-r--r-- | packet.h | 8 | ||||
-rw-r--r-- | passt-repair.1 | 6 | ||||
-rw-r--r-- | passt-repair.c | 88 | ||||
-rw-r--r-- | passt.1 | 21 | ||||
-rw-r--r-- | passt.c | 39 | ||||
-rw-r--r-- | passt.h | 30 | ||||
-rw-r--r-- | pasta.c | 19 | ||||
-rw-r--r-- | pcap.c | 46 | ||||
-rw-r--r-- | repair.c | 255 | ||||
-rw-r--r-- | repair.h | 17 | ||||
-rwxr-xr-x | seccomp.sh | 2 | ||||
-rw-r--r-- | tap.c | 214 | ||||
-rw-r--r-- | tap.h | 47 | ||||
-rw-r--r-- | tcp.c | 1125 | ||||
-rw-r--r-- | tcp.h | 2 | ||||
-rw-r--r-- | tcp_conn.h | 105 | ||||
-rw-r--r-- | tcp_internal.h | 2 | ||||
-rw-r--r-- | tcp_splice.c | 25 | ||||
-rw-r--r-- | tcp_vu.c | 21 | ||||
-rw-r--r-- | test/.gitignore | 1 | ||||
-rw-r--r-- | test/Makefile | 5 | ||||
-rw-r--r-- | test/lib/layout | 55 | ||||
-rwxr-xr-x | test/lib/setup | 148 | ||||
-rwxr-xr-x | test/lib/test | 55 | ||||
-rw-r--r-- | test/migrate/basic | 59 | ||||
-rw-r--r-- | test/migrate/basic_fin | 62 | ||||
-rw-r--r-- | test/migrate/bidirectional | 64 | ||||
-rw-r--r-- | test/migrate/bidirectional_fin | 64 | ||||
-rw-r--r-- | test/migrate/iperf3_bidir6 | 58 | ||||
-rw-r--r-- | test/migrate/iperf3_in4 | 50 | ||||
-rw-r--r-- | test/migrate/iperf3_in6 | 58 | ||||
-rw-r--r-- | test/migrate/iperf3_many_out6 | 60 | ||||
-rw-r--r-- | test/migrate/iperf3_out4 | 47 | ||||
-rw-r--r-- | test/migrate/iperf3_out6 | 58 | ||||
-rw-r--r-- | test/migrate/rampstream_in | 59 | ||||
-rw-r--r-- | test/migrate/rampstream_out | 55 | ||||
-rwxr-xr-x | test/passt.mbuto | 9 | ||||
-rw-r--r-- | test/passt/dhcp | 15 | ||||
-rwxr-xr-x | test/rampstream-check.sh | 3 | ||||
-rw-r--r-- | test/rampstream.c | 143 | ||||
-rwxr-xr-x | test/run | 42 | ||||
-rw-r--r-- | udp.c | 146 | ||||
-rw-r--r-- | udp_flow.c | 19 | ||||
-rw-r--r-- | udp_internal.h | 2 | ||||
-rw-r--r-- | udp_vu.c | 4 | ||||
-rw-r--r-- | util.c | 86 | ||||
-rw-r--r-- | util.h | 39 | ||||
-rw-r--r-- | vhost_user.c | 70 | ||||
-rw-r--r-- | vhost_user.h | 1 | ||||
-rw-r--r-- | virtio.c | 9 | ||||
-rw-r--r-- | virtio.h | 8 | ||||
-rw-r--r-- | vu_common.c | 60 | ||||
-rw-r--r-- | vu_common.h | 2 |
78 files changed, 4404 insertions, 599 deletions
@@ -38,9 +38,9 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ - ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ - vhost_user.c virtio.c vu_common.c + ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \ + repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \ + udp_vu.c util.c vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c PASST_REPAIR_SRCS = passt-repair.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS) @@ -49,10 +49,10 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ - lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ - siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \ - virtio.h vu_common.h + lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \ + pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \ + tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \ + udp_vu.h util.h vhost_user.h virtio.h vu_common.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} @@ -109,9 +109,9 @@ passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS) valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ - rt_sigreturn getpid gettid kill clock_gettime mmap \ - mmap2 munmap open unlink gettimeofday futex statx \ - readlink + rt_sigreturn getpid gettid kill clock_gettime \ + mmap|mmap2 munmap open unlink gettimeofday futex \ + statx readlink 
valgrind: FLAGS += -g -DVALGRIND valgrind: all @@ -85,7 +85,7 @@ */ /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ __attribute__((optimize("-fno-strict-aliasing"))) -uint32_t sum_16b(const void *buf, size_t len) +static uint32_t sum_16b(const void *buf, size_t len) { const uint16_t *p = buf; uint32_t sum = 0; @@ -107,7 +107,7 @@ uint32_t sum_16b(const void *buf, size_t len) * * Return: 16-bit folded sum */ -uint16_t csum_fold(uint32_t sum) +static uint16_t csum_fold(uint32_t sum) { while (sum >> 16) sum = (sum & 0xffff) + (sum >> 16); @@ -162,6 +162,21 @@ uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol, } /** + * csum() - Compute TCP/IP-style checksum + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 16-bit folded, complemented checksum + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ +static uint16_t csum(const void *buf, size_t len, uint32_t init) +{ + return (uint16_t)~csum_fold(csum_unfolded(buf, len, init)); +} + +/** * csum_udp4() - Calculate and set checksum for a UDP over IPv4 packet * @udp4hr: UDP header, initialised apart from checksum * @saddr: IPv4 source address @@ -483,21 +498,6 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) #endif /* !__AVX2__ */ /** - * csum() - Compute TCP/IP-style checksum - * @buf: Input buffer - * @len: Input length - * @init: Initial 32-bit checksum, 0 for no pre-computed checksum - * - * Return: 16-bit folded, complemented checksum - */ -/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ -__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ -uint16_t csum(const void *buf, size_t len, uint32_t init) -{ - return (uint16_t)~csum_fold(csum_unfolded(buf, len, init)); -} - -/** * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector * @tail: IO vector tail to checksum * 
@init Initial 32-bit checksum, 0 for no pre-computed checksum @@ -11,8 +11,6 @@ struct icmphdr; struct icmp6hdr; struct iov_tail; -uint32_t sum_16b(const void *buf, size_t len); -uint16_t csum_fold(uint32_t sum); uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init); uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol, struct in_addr saddr, struct in_addr daddr); @@ -32,7 +30,6 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, const void *payload, size_t dlen); uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init); -uint16_t csum(const void *buf, size_t len, uint32_t init); uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init); #endif /* CHECKSUM_H */ @@ -820,6 +820,9 @@ static void usage(const char *name, FILE *f, int status) " UNIX domain socket is provided by -s option\n" " --print-capabilities print back-end capabilities in JSON format,\n" " only meaningful for vhost-user mode\n"); + FPRINTF(f, + " --repair-path PATH path for passt-repair(1)\n" + " default: append '.repair' to UNIX domain path\n"); } FPRINTF(f, @@ -858,7 +861,9 @@ static void usage(const char *name, FILE *f, int status) FPRINTF(f, " default: use addresses from /etc/resolv.conf\n"); FPRINTF(f, " -S, --search LIST Space-separated list, search domains\n" - " a single, empty option disables the DNS search list\n"); + " a single, empty option disables the DNS search list\n" + " -H, --hostname NAME Hostname to configure client with\n" + " --fqdn NAME FQDN to configure client with\n"); if (strstr(name, "pasta")) FPRINTF(f, " default: don't use any search list\n"); else @@ -987,6 +992,45 @@ pasta_opts: } /** + * conf_mode() - Determine passt/pasta's operating mode from command line + * @argc: Argument count + * @argv: Command line arguments + * + * Return: mode to operate in, PASTA or PASST + */ +enum passt_modes conf_mode(int argc, char *argv[]) +{ + int vhost_user = 0; + const struct option optvu[] = { + 
{"vhost-user", no_argument, &vhost_user, 1 }, + { 0 }, + }; + char argv0[PATH_MAX], *basearg0; + int name; + + optind = 0; + do { + name = getopt_long(argc, argv, "-:", optvu, NULL); + } while (name != -1); + + if (vhost_user) + return MODE_VU; + + if (argc < 1) + die("Cannot determine argv[0]"); + + strncpy(argv0, argv[0], PATH_MAX - 1); + basearg0 = basename(argv0); + if (strstr(basearg0, "pasta")) + return MODE_PASTA; + + if (strstr(basearg0, "passt")) + return MODE_PASST; + + die("Cannot determine mode, invoke as \"passt\" or \"pasta\""); +} + +/** * conf_print() - Print fundamental configuration parameters * @c: Execution context */ @@ -1243,8 +1287,25 @@ static void conf_nat(const char *arg, struct in_addr *addr4, */ static void conf_open_files(struct ctx *c) { - if (c->mode != MODE_PASTA && c->fd_tap == -1) - c->fd_tap_listen = tap_sock_unix_open(c->sock_path); + if (c->mode != MODE_PASTA && c->fd_tap == -1) { + c->fd_tap_listen = sock_unix(c->sock_path); + + if (c->mode == MODE_VU && strcmp(c->repair_path, "none")) { + if (!*c->repair_path && + snprintf_check(c->repair_path, + sizeof(c->repair_path), "%s.repair", + c->sock_path)) { + warn("passt-repair path %s not usable", + c->repair_path); + c->fd_repair_listen = -1; + } else { + c->fd_repair_listen = sock_unix(c->repair_path); + } + } else { + c->fd_repair_listen = -1; + } + c->fd_repair = -1; + } if (*c->pidfile) { c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY); @@ -1316,6 +1377,7 @@ void conf(struct ctx *c, int argc, char **argv) {"outbound", required_argument, NULL, 'o' }, {"dns", required_argument, NULL, 'D' }, {"search", required_argument, NULL, 'S' }, + {"hostname", required_argument, NULL, 'H' }, {"no-tcp", no_argument, &c->no_tcp, 1 }, {"no-udp", no_argument, &c->no_udp, 1 }, {"no-icmp", no_argument, &c->no_icmp, 1 }, @@ -1357,21 +1419,25 @@ void conf(struct ctx *c, int argc, char **argv) {"host-lo-to-ns-lo", no_argument, NULL, 23 }, {"dns-host", required_argument, NULL, 24 }, 
{"vhost-user", no_argument, NULL, 25 }, + /* vhost-user backend program convention */ {"print-capabilities", no_argument, NULL, 26 }, {"socket-path", required_argument, NULL, 's' }, + {"fqdn", required_argument, NULL, 27 }, + {"repair-path", required_argument, NULL, 28 }, { 0 }, }; + const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:"; const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; bool copy_addrs_opt = false, copy_routes_opt = false; enum fwd_ports_mode fwd_default = FWD_NONE; bool v4_only = false, v6_only = false; unsigned dns4_idx = 0, dns6_idx = 0; + unsigned long max_mtu = IP_MAX_MTU; struct fqdn *dnss = c->dns_search; unsigned int ifi4 = 0, ifi6 = 0; const char *logfile = NULL; - const char *optstring; size_t logsize = 0; char *runas = NULL; long fd_tap_opt; @@ -1382,11 +1448,11 @@ void conf(struct ctx *c, int argc, char **argv) if (c->mode == MODE_PASTA) { c->no_dhcp_dns = c->no_dhcp_dns_search = 1; fwd_default = FWD_AUTO; - optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:46t:u:T:U:"; - } else { - optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:461t:u:"; } + if (tap_l2_max_len(c) - ETH_HLEN < max_mtu) + max_mtu = tap_l2_max_len(c) - ETH_HLEN; + c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t)); c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET; c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET; memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN); @@ -1554,13 +1620,27 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid host nameserver address: %s", optarg); case 25: - if (c->mode == MODE_PASTA) - die("--vhost-user is for passt mode only"); - c->mode = MODE_VU; + /* Already handled in conf_mode() */ + ASSERT(c->mode == MODE_VU); break; case 26: vu_print_capabilities(); break; + case 27: + if (snprintf_check(c->fqdn, PASST_MAXDNAME, + "%s", optarg)) + die("Invalid FQDN: %s", optarg); + break; + case 28: + if (c->mode != MODE_VU && strcmp(optarg, "none")) + 
die("--repair-path is for vhost-user mode only"); + + if (snprintf_check(c->repair_path, + sizeof(c->repair_path), "%s", + optarg)) + die("Invalid passt-repair path: %s", optarg); + + break; case 'd': c->debug = 1; c->quiet = 0; @@ -1579,6 +1659,9 @@ void conf(struct ctx *c, int argc, char **argv) c->foreground = 1; break; case 's': + if (c->mode == MODE_PASTA) + die("-s is for passt / vhost-user mode only"); + ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->sock_path)) @@ -1599,6 +1682,9 @@ void conf(struct ctx *c, int argc, char **argv) *c->sock_path = 0; break; case 'I': + if (c->mode != MODE_PASTA) + die("-I is for pasta mode only"); + ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s", optarg); if (ret <= 0 || ret >= IFNAMSIZ) @@ -1618,20 +1704,24 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid PID file: %s", optarg); break; - case 'm': - errno = 0; - c->mtu = strtol(optarg, NULL, 0); + case 'm': { + unsigned long mtu; + char *e; - if (!c->mtu) { - c->mtu = -1; - break; - } + errno = 0; + mtu = strtoul(optarg, &e, 0); - if (c->mtu < ETH_MIN_MTU || c->mtu > (int)ETH_MAX_MTU || - errno) + if (errno || *e) die("Invalid MTU: %s", optarg); + if (mtu > max_mtu) { + die("MTU %lu too large (max %lu)", + mtu, max_mtu); + } + + c->mtu = mtu; break; + } case 'a': if (inet_pton(AF_INET6, optarg, &c->ip6.addr) && !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr) && @@ -1730,6 +1820,11 @@ void conf(struct ctx *c, int argc, char **argv) die("Cannot use DNS search domain %s", optarg); break; + case 'H': + if (snprintf_check(c->hostname, PASST_MAXDNAME, + "%s", optarg)) + die("Invalid hostname: %s", optarg); + break; case '4': v4_only = true; v6_only = false; @@ -1746,11 +1841,16 @@ void conf(struct ctx *c, int argc, char **argv) break; case 't': case 'u': - case 'T': - case 'U': case 'D': /* Handle these later, once addresses are configured */ break; + case 'T': + case 'U': + if (c->mode != MODE_PASTA) + die("-%c is for 
pasta mode only", name); + + /* Handle properly later, once addresses are configured */ + break; case 'h': usage(argv[0], stdout, EXIT_SUCCESS); break; @@ -1798,9 +1898,21 @@ void conf(struct ctx *c, int argc, char **argv) c->ifi4 = conf_ip4(ifi4, &c->ip4); if (!v4_only) c->ifi6 = conf_ip6(ifi6, &c->ip6); + + if (c->ifi4 && c->mtu < IPV4_MIN_MTU) { + warn("MTU %"PRIu16" is too small for IPv4 (minimum %u)", + c->mtu, IPV4_MIN_MTU); + } + if (c->ifi6 && c->mtu < IPV6_MIN_MTU) { + warn("MTU %"PRIu16" is too small for IPv6 (minimum %u)", + c->mtu, IPV6_MIN_MTU); + } + if ((*c->ip4.ifname_out && !c->ifi4) || (*c->ip6.ifname_out && !c->ifi6)) die("External interface not usable"); + + if (!c->ifi4 && !c->ifi6) { info("No external interface as template, switch to local mode"); @@ -1827,8 +1939,8 @@ void conf(struct ctx *c, int argc, char **argv) if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw)) c->no_dhcp = 1; - /* Inbound port options & DNS can be parsed now (after IPv4/IPv6 - * settings) + /* Inbound port options and DNS can be parsed now, after IPv4/IPv6 + * settings */ fwd_probe_ephemeral(); udp_portmap_clear(); @@ -1922,9 +2034,6 @@ void conf(struct ctx *c, int argc, char **argv) c->no_dhcpv6 = 1; } - if (!c->mtu) - c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)); - get_dns(c); if (!*c->pasta_ifn) { @@ -6,6 +6,7 @@ #ifndef CONF_H #define CONF_H +enum passt_modes conf_mode(int argc, char *argv[]); void conf(struct ctx *c, int argc, char **argv); #endif /* CONF_H */ diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec index 6a83f8b..745cf01 100644 --- a/contrib/fedora/passt.spec +++ b/contrib/fedora/passt.spec @@ -44,7 +44,7 @@ Requires(preun): %{name} Requires(preun): policycoreutils %description selinux -This package adds SELinux enforcement to passt(1) and pasta(1). +This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1). 
%prep %setup -q -n passt-%{git_hash} @@ -82,6 +82,7 @@ make -f %{_datadir}/selinux/devel/Makefile install -p -m 644 -D passt.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp install -p -m 644 -D passt.if %{buildroot}%{_datadir}/selinux/devel/include/distributed/passt.if install -p -m 644 -D pasta.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp +install -p -m 644 -D passt-repair.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp popd %pre selinux @@ -90,11 +91,13 @@ popd %post selinux %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp +%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %postun selinux if [ $1 -eq 0 ]; then %selinux_modules_uninstall -s %{selinuxtype} passt %selinux_modules_uninstall -s %{selinuxtype} pasta + %selinux_modules_uninstall -s %{selinuxtype} passt-repair fi %posttrans selinux @@ -124,6 +127,7 @@ fi %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/devel/include/distributed/passt.if %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp +%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %changelog {{{ passt_git_changelog }}} diff --git a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te index e3ffbcd..7157dfb 100644 --- a/contrib/selinux/passt-repair.te +++ b/contrib/selinux/passt-repair.te @@ -28,12 +28,22 @@ require { type console_device_t; type user_devpts_t; type user_tmp_t; + + # Workaround: passt-repair needs to needs to access socket files + # that passt, started by libvirt, might create under different + # labels, depending on whether passt is started as root or not. + # + # However, libvirt doesn't maintain its own policy, which makes + # updates particularly complicated. 
To avoid breakage in the short + # term, deal with that in passt's own policy. + type qemu_var_run_t; + type virt_var_run_t; } type passt_repair_t; domain_type(passt_repair_t); type passt_repair_exec_t; -files_type(passt_repair_exec_t); +corecmd_executable_file(passt_repair_exec_t); role unconfined_r types passt_repair_t; @@ -41,7 +51,8 @@ allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans en type_transition unconfined_t passt_repair_exec_t:process passt_repair_t; allow unconfined_t passt_repair_t:process transition; -allow passt_repair_t self:capability { dac_override net_admin net_raw }; +allow passt_repair_t self:capability { dac_override dac_read_search net_admin net_raw }; +allow passt_repair_t self:capability2 bpf; allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl }; allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl }; @@ -50,9 +61,27 @@ allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write }; allow passt_repair_t passt_t:unix_stream_socket { connectto read write }; allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write }; -allow passt_repair_t unconfined_t:sock_file { read write }; -allow passt_repair_t passt_t:sock_file { read write }; -allow passt_repair_t user_tmp_t:sock_file { read write }; +allow passt_repair_t user_tmp_t:dir { getattr read search watch }; + +allow passt_repair_t unconfined_t:sock_file { getattr read write }; +allow passt_repair_t passt_t:sock_file { getattr read write }; +allow passt_repair_t user_tmp_t:sock_file { getattr read write }; allow passt_repair_t unconfined_t:tcp_socket { read setopt write }; allow passt_repair_t passt_t:tcp_socket { read setopt write }; + +# Workaround: passt-repair needs to needs to access socket files +# that passt, started by libvirt, might create under different +# labels, depending on whether passt is started as root or not. 
+# +# However, libvirt doesn't maintain its own policy, which makes +# updates particularly complicated. To avoid breakage in the short +# term, deal with that in passt's own policy. +allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write }; +allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write }; + +allow passt_repair_t qemu_var_run_t:dir { getattr read search watch }; +allow passt_repair_t virt_var_run_t:dir { getattr read search watch }; + +allow passt_repair_t qemu_var_run_t:sock_file { getattr read write }; +allow passt_repair_t virt_var_run_t:sock_file { getattr read write }; diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index c6cea34..f8ea672 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -20,9 +20,19 @@ require { type fs_t; type tmp_t; type user_tmp_t; + type user_home_t; type tmpfs_t; type root_t; + # Workaround: passt --vhost-user needs to map guest memory, but + # libvirt doesn't maintain its own policy, which makes updates + # particularly complicated. To avoid breakage in the short term, + # deal with it in passt's own policy. 
+ type svirt_image_t; + type svirt_tmpfs_t; + type svirt_t; + type null_device_t; + class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map }; class dir { search write add_name remove_name mounton }; class chr_file { append read write open getattr ioctl }; @@ -38,7 +48,7 @@ require { type net_conf_t; type proc_net_t; type node_t; - class tcp_socket { create accept listen name_bind name_connect }; + class tcp_socket { create accept listen name_bind name_connect getattr ioctl }; class udp_socket { create accept listen }; class icmp_socket { bind create name_bind node_bind setopt read write }; class sock_file { create unlink write }; @@ -80,6 +90,9 @@ allow passt_t root_t:dir mounton; allow passt_t tmp_t:dir { add_name mounton remove_name write }; allow passt_t tmpfs_t:filesystem mount; allow passt_t fs_t:filesystem unmount; +allow passt_t user_home_t:dir search; +allow passt_t user_tmp_t:fifo_file append; +allow passt_t user_tmp_t:file map; manage_files_pattern(passt_t, user_tmp_t, user_tmp_t) files_pid_filetrans(passt_t, user_tmp_t, file) @@ -119,7 +132,7 @@ corenet_udp_sendrecv_all_ports(passt_t) allow passt_t node_t:icmp_socket { name_bind node_bind }; allow passt_t port_t:icmp_socket name_bind; -allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write }; +allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl }; allow passt_t self:udp_socket { create getopt setopt connect bind read write }; allow passt_t self:icmp_socket { bind create setopt read write }; @@ -127,3 +140,11 @@ allow passt_t user_tmp_t:dir { add_name write }; allow passt_t user_tmp_t:file { create open }; allow passt_t user_tmp_t:sock_file { create read write unlink }; allow passt_t unconfined_t:unix_stream_socket { read write }; + +# Workaround: passt --vhost-user needs to map guest memory, but +# libvirt doesn't maintain its own policy, which makes 
updates +# particularly complicated. To avoid breakage in the short term, +# deal with it in passt's own policy. +allow passt_t svirt_image_t:file { read write map }; +allow passt_t svirt_tmpfs_t:file { read write map }; +allow passt_t null_device_t:chr_file map; diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te index d0ff0cc..89c8043 100644 --- a/contrib/selinux/pasta.te +++ b/contrib/selinux/pasta.te @@ -18,6 +18,7 @@ require { type bin_t; type user_home_t; type user_home_dir_t; + type user_tmp_t; type fs_t; type tmp_t; type tmpfs_t; @@ -56,8 +57,10 @@ require { attribute port_type; type port_t; type http_port_t; + type http_cache_port_t; type ssh_port_t; type reserved_port_t; + type unreserved_port_t; type dns_port_t; type dhcpc_port_t; type chronyd_port_t; @@ -122,8 +125,8 @@ domain_auto_trans(pasta_t, ping_exec_t, ping_t); allow pasta_t nsfs_t:file { open read }; -allow pasta_t user_home_t:dir getattr; -allow pasta_t user_home_t:file { open read getattr setattr }; +allow pasta_t user_home_t:dir { getattr search }; +allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_trans map}; allow pasta_t user_home_dir_t:dir { search getattr open add_name read write }; allow pasta_t user_home_dir_t:file { create open read write }; allow pasta_t tmp_t:dir { add_name mounton remove_name write }; @@ -133,6 +136,11 @@ allow pasta_t root_t:dir mounton; manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t) files_pid_filetrans(pasta_t, pasta_pid_t, file) +allow pasta_t user_tmp_t:dir { add_name remove_name search write }; +allow pasta_t user_tmp_t:fifo_file append; +allow pasta_t user_tmp_t:file { create open write }; +allow pasta_t user_tmp_t:sock_file { create unlink }; + allow pasta_t console_device_t:chr_file { open write getattr ioctl }; allow pasta_t user_devpts_t:chr_file { getattr read write ioctl }; logging_send_syslog_msg(pasta_t) @@ -160,6 +168,8 @@ allow pasta_t self:udp_socket create_stream_socket_perms; allow pasta_t 
reserved_port_t:udp_socket name_bind; allow pasta_t llmnr_port_t:tcp_socket name_bind; allow pasta_t llmnr_port_t:udp_socket name_bind; +allow pasta_t http_cache_port_t:tcp_socket { name_bind name_connect }; +allow pasta_t unreserved_port_t:udp_socket name_bind; corenet_udp_sendrecv_generic_node(pasta_t) corenet_udp_bind_generic_node(pasta_t) allow pasta_t node_t:icmp_socket { name_bind node_bind }; @@ -63,6 +63,11 @@ static struct opt opts[255]; #define OPT_MIN 60 /* RFC 951 */ +/* Total option size (excluding end option) is 576 (RFC 2131), minus + * offset of options (268), minus end option (1). + */ +#define OPT_MAX 307 + /** * dhcp_init() - Initialise DHCP options */ @@ -122,7 +127,7 @@ struct msg { uint8_t sname[64]; uint8_t file[128]; uint32_t magic; - uint8_t o[308]; + uint8_t o[OPT_MAX + 1 /* End option */ ]; } __attribute__((__packed__)); /** @@ -130,15 +135,28 @@ struct msg { * @m: Message to fill * @o: Option number * @offset: Current offset within options field, updated on insertion + * + * Return: false if m has space to write the option, true otherwise */ -static void fill_one(struct msg *m, int o, int *offset) +static bool fill_one(struct msg *m, int o, int *offset) { + size_t slen = opts[o].slen; + + /* If we don't have space to write the option, then just skip */ + if (*offset + 2 /* code and length of option */ + slen > OPT_MAX) + return true; + m->o[*offset] = o; - m->o[*offset + 1] = opts[o].slen; - memcpy(&m->o[*offset + 2], opts[o].s, opts[o].slen); + m->o[*offset + 1] = slen; + + /* Move to option */ + *offset += 2; + + memcpy(&m->o[*offset], opts[o].s, slen); opts[o].sent = 1; - *offset += 2 + opts[o].slen; + *offset += slen; + return false; } /** @@ -159,21 +177,23 @@ static int fill(struct msg *m) * Put it there explicitly, unless requested via option 55. 
*/ if (opts[55].clen > 0 && !memchr(opts[55].c, 53, opts[55].clen)) - fill_one(m, 53, &offset); + if (fill_one(m, 53, &offset)) + debug("DHCP: skipping option 53"); for (i = 0; i < opts[55].clen; i++) { o = opts[55].c[i]; if (opts[o].slen != -1) - fill_one(m, o, &offset); + if (fill_one(m, o, &offset)) + debug("DHCP: skipping option %i", o); } for (o = 0; o < 255; o++) { if (opts[o].slen != -1 && !opts[o].sent) - fill_one(m, o, &offset); + if (fill_one(m, o, &offset)) + debug("DHCP: skipping option %i", o); } m->o[offset++] = 255; - m->o[offset++] = 0; if (offset < OPT_MIN) { memset(&m->o[offset], 0, OPT_MIN - offset); @@ -397,7 +417,7 @@ int dhcp(const struct ctx *c, const struct pool *p) &c->ip4.guest_gw, sizeof(c->ip4.guest_gw)); } - if (c->mtu != -1) { + if (c->mtu) { opts[26].slen = 2; opts[26].s[0] = c->mtu / 256; opts[26].s[1] = c->mtu % 256; @@ -411,6 +431,30 @@ int dhcp(const struct ctx *c, const struct pool *p) if (!opts[6].slen) opts[6].slen = -1; + opt_len = strlen(c->hostname); + if (opt_len > 0) { + opts[12].slen = opt_len; + memcpy(opts[12].s, &c->hostname, opt_len); + } + + opt_len = strlen(c->fqdn); + if (opt_len > 0) { + opt_len += 3 /* flags */ + + 2; /* Length byte for first label, and terminator */ + + if (sizeof(opts[81].s) >= opt_len) { + opts[81].s[0] = 0x4; /* flags (E) */ + opts[81].s[1] = 0xff; /* RCODE1 */ + opts[81].s[2] = 0xff; /* RCODE2 */ + + encode_domain_name((char *)opts[81].s + 3, c->fqdn); + + opts[81].slen = opt_len; + } else { + debug("DHCP: client FQDN option doesn't fit, skipping"); + } + } + if (!c->no_dhcp_dns_search) opt_set_dns_search(c, sizeof(m->o)); @@ -48,6 +48,7 @@ struct opt_hdr { # define STATUS_NOTONLINK htons_constant(4) # define OPT_DNS_SERVERS htons_constant(23) # define OPT_DNS_SEARCH htons_constant(24) +# define OPT_CLIENT_FQDN htons_constant(39) #define STR_NOTONLINK "Prefix not appropriate for link." 
uint16_t l; @@ -58,6 +59,9 @@ struct opt_hdr { sizeof(struct opt_hdr)) #define OPT_VSIZE(x) (sizeof(struct opt_##x) - \ sizeof(struct opt_hdr)) +#define OPT_MAX_SIZE IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \ + sizeof(struct udphdr) + \ + sizeof(struct msg_hdr)) /** * struct opt_client_id - DHCPv6 Client Identifier option @@ -164,6 +168,18 @@ struct opt_dns_search { } __attribute__((packed)); /** + * struct opt_client_fqdn - Client FQDN option (RFC 4704) + * @hdr: Option header + * @flags: Flags described by RFC 4704 + * @domain_name: Client FQDN + */ +struct opt_client_fqdn { + struct opt_hdr hdr; + uint8_t flags; + char domain_name[PASST_MAXDNAME]; +} __attribute__((packed)); + +/** * struct msg_hdr - DHCPv6 client/server message header * @type: DHCP message type * @xid: Transaction ID for message exchange @@ -193,6 +209,7 @@ struct msg_hdr { * @client_id: Client Identifier, variable length * @dns_servers: DNS Recursive Name Server, here just for storage size * @dns_search: Domain Search List, here just for storage size + * @client_fqdn: Client FQDN, variable length */ static struct resp_t { struct msg_hdr hdr; @@ -203,6 +220,7 @@ static struct resp_t { struct opt_client_id client_id; struct opt_dns_servers dns_servers; struct opt_dns_search dns_search; + struct opt_client_fqdn client_fqdn; } __attribute__((__packed__)) resp = { { 0 }, SERVER_ID, @@ -228,6 +246,10 @@ static struct resp_t { { { OPT_DNS_SEARCH, 0, }, { 0 }, }, + + { { OPT_CLIENT_FQDN, 0, }, + 0, { 0 }, + }, }; static const struct opt_status_code sc_not_on_link = { @@ -346,7 +368,6 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset) { struct opt_dns_servers *srv = NULL; struct opt_dns_search *srch = NULL; - char *p = NULL; int i; if (c->no_dhcp_dns) @@ -383,35 +404,82 @@ search: if (!name_len) continue; + name_len += 2; /* Length byte for first label, and terminator */ + if (name_len > + NS_MAXDNAME + 1 /* Length byte for first label */ || + name_len > 255) { + debug("DHCP: 
DNS search name '%s' too long, skipping", + c->dns_search[i].n); + continue; + } + if (!srch) { srch = (struct opt_dns_search *)(buf + offset); offset += sizeof(struct opt_hdr); srch->hdr.t = OPT_DNS_SEARCH; srch->hdr.l = 0; - p = srch->list; } - *p = '.'; - p = stpncpy(p + 1, c->dns_search[i].n, name_len); - p++; - srch->hdr.l += name_len + 2; - offset += name_len + 2; + encode_domain_name(buf + offset, c->dns_search[i].n); + + srch->hdr.l += name_len; + offset += name_len; + } - if (srch) { - for (i = 0; i < srch->hdr.l; i++) { - if (srch->list[i] == '.') { - srch->list[i] = strcspn(srch->list + i + 1, - "."); - } - } + if (srch) srch->hdr.l = htons(srch->hdr.l); - } return offset; } /** + * dhcpv6_client_fqdn_fill() - Fill in client FQDN option + * @c: Execution context + * @buf: Response message buffer where options will be appended + * @offset: Offset in message buffer for new options + * + * Return: updated length of response message buffer. + */ +static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c, + char *buf, int offset) + +{ + struct opt_client_fqdn const *req_opt; + struct opt_client_fqdn *o; + size_t opt_len; + + opt_len = strlen(c->fqdn); + if (opt_len == 0) { + return offset; + } + + opt_len += 2; /* Length byte for first label, and terminator */ + if (opt_len > OPT_MAX_SIZE - (offset + + sizeof(struct opt_hdr) + + 1 /* flags */ )) { + debug("DHCPv6: client FQDN option doesn't fit, skipping"); + return offset; + } + + o = (struct opt_client_fqdn *)(buf + offset); + encode_domain_name(o->domain_name, c->fqdn); + req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 }, + OPT_CLIENT_FQDN); + if (req_opt && req_opt->flags & 0x01 /* S flag */) + o->flags = 0x02 /* O flag */; + else + o->flags = 0x00; + + opt_len++; + + o->hdr.t = OPT_CLIENT_FQDN; + o->hdr.l = htons(opt_len); + + return offset + sizeof(struct opt_hdr) + opt_len; +} + +/** * dhcpv6() - Check if this is a DHCPv6 message, reply as needed * @c: Execution 
context * @p: Packet pool, single packet starting from UDP header @@ -544,6 +612,7 @@ int dhcpv6(struct ctx *c, const struct pool *p, n = offsetof(struct resp_t, client_id) + sizeof(struct opt_hdr) + ntohs(client_id->l); n = dhcpv6_dns_fill(c, (char *)&resp, n); + n = dhcpv6_client_fqdn_fill(p, c, (char *)&resp, n); resp.hdr.xid = mh->xid; diff --git a/epoll_type.h b/epoll_type.h index fd9eac3..7f2a121 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -40,8 +40,10 @@ enum epoll_type { EPOLL_TYPE_VHOST_CMD, /* vhost-user kick event socket */ EPOLL_TYPE_VHOST_KICK, - /* vhost-user migration socket */ - EPOLL_TYPE_VHOST_MIGRATION, + /* TCP_REPAIR helper listening socket */ + EPOLL_TYPE_REPAIR_LISTEN, + /* TCP_REPAIR helper socket */ + EPOLL_TYPE_REPAIR, EPOLL_NUM_TYPES, }; @@ -19,6 +19,7 @@ #include "inany.h" #include "flow.h" #include "flow_table.h" +#include "repair.h" const char *flow_state_str[] = { [FLOW_STATE_FREE] = "FREE", @@ -52,6 +53,13 @@ const uint8_t flow_proto[] = { static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); +#define foreach_established_tcp_flow(flow) \ + flow_foreach_of_type((flow), FLOW_TCP) \ + if (!tcp_flow_is_established(&(flow)->tcp)) \ + /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ + continue; \ + else + /* Global Flow Table */ /** @@ -259,11 +267,13 @@ int flowside_connect(const struct ctx *c, int s, /** flow_log_ - Log flow-related message * @f: flow the message is related to + * @newline: Append newline at the end of the message, if missing * @pri: Log priority * @fmt: Format string * @...: printf-arguments */ -void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) +void flow_log_(const struct flow_common *f, bool newline, int pri, + const char *fmt, ...) { const char *type_or_state; char msg[BUFSIZ]; @@ -279,7 +289,7 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) 
else type_or_state = FLOW_TYPE(f); - logmsg(true, false, pri, + logmsg(newline, false, pri, "Flow %u (%s): %s", flow_idx(f), type_or_state, msg); } @@ -299,7 +309,7 @@ void flow_log_details_(const struct flow_common *f, int pri, const struct flowside *tgt = &f->side[TGTSIDE]; if (state >= FLOW_STATE_TGT) - flow_log_(f, pri, + flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu", pif_name(f->pif[INISIDE]), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), @@ -312,7 +322,7 @@ void flow_log_details_(const struct flow_common *f, int pri, inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)), tgt->eport); else if (state >= FLOW_STATE_INI) - flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?", + flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => ?", pif_name(f->pif[INISIDE]), inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), ini->eport, @@ -333,7 +343,7 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) ASSERT(oldstate < FLOW_NUM_STATES); f->state = state; - flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], + flow_log_(f, true, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], FLOW_STATE(f)); flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate)); @@ -390,9 +400,9 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, * * Return: pointer to the initiating flowside information */ -const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, - const union sockaddr_inany *ssa, - in_port_t dport) +struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, + const union sockaddr_inany *ssa, + in_port_t dport) { struct flowside *ini = &flow->f.side[INISIDE]; @@ -771,7 +781,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) struct flow_free_cluster *free_head = NULL; unsigned *last_next = &flow_first_free; bool timer = false; - unsigned idx; + union flow *flow; if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) { timer = true; @@ -780,8 +790,7 @@ void 
flow_defer_handler(const struct ctx *c, const struct timespec *now) ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */ - for (idx = 0; idx < FLOW_MAX; idx++) { - union flow *flow = &flowtab[idx]; + flow_foreach_slot(flow) { bool closed = false; switch (flow->f.state) { @@ -798,12 +807,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) } else { /* New free cluster, add to chain */ free_head = &flow->free; - *last_next = idx; + *last_next = FLOW_IDX(flow); last_next = &free_head->next; } /* Skip remaining empty entries */ - idx += skip - 1; + flow += skip - 1; continue; } @@ -856,14 +865,15 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) if (free_head) { /* Add slot to current free cluster */ - ASSERT(idx == FLOW_IDX(free_head) + free_head->n); + ASSERT(FLOW_IDX(flow) == + FLOW_IDX(free_head) + free_head->n); free_head->n++; flow->free.n = flow->free.next = 0; } else { /* Create new free cluster */ free_head = &flow->free; free_head->n = 1; - *last_next = idx; + *last_next = FLOW_IDX(flow); last_next = &free_head->next; } } else { @@ -875,6 +885,254 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) } /** + * flow_migrate_source_rollback() - Disable repair mode, return failure + * @c: Execution context + * @bound: No need to roll back flow indices >= @bound + * @ret: Negative error code + * + * Return: @ret + */ +static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret) +{ + union flow *flow; + + debug("...roll back migration"); + + foreach_established_tcp_flow(flow) { + if (FLOW_IDX(flow) >= bound) + break; + if (tcp_flow_repair_off(c, &flow->tcp)) + die("Failed to roll back TCP_REPAIR mode"); + } + + if (repair_flush(c)) + die("Failed to roll back TCP_REPAIR mode"); + + return ret; +} + +/** + * flow_migrate_need_repair() - Do we need to set repair mode for any flow? 
+ * + * Return: true if repair mode is needed, false otherwise + */ +static bool flow_migrate_need_repair(void) +{ + union flow *flow; + + foreach_established_tcp_flow(flow) + return true; + + return false; +} + +/** + * flow_migrate_repair_all() - Turn repair mode on or off for all flows + * @c: Execution context + * @enable: Switch repair mode on if set, off otherwise + * + * Return: 0 on success, negative error code on failure + */ +static int flow_migrate_repair_all(struct ctx *c, bool enable) +{ + union flow *flow; + int rc; + + /* If we don't have a repair helper, there's nothing we can do */ + if (c->fd_repair < 0) + return 0; + + foreach_established_tcp_flow(flow) { + if (enable) + rc = tcp_flow_repair_on(c, &flow->tcp); + else + rc = tcp_flow_repair_off(c, &flow->tcp); + + if (rc) { + debug("Can't %s repair mode: %s", + enable ? "enable" : "disable", strerror_(-rc)); + return flow_migrate_source_rollback(c, FLOW_IDX(flow), + rc); + } + } + + if ((rc = repair_flush(c))) { + debug("Can't %s repair mode: %s", + enable ? 
"enable" : "disable", strerror_(-rc)); + return flow_migrate_source_rollback(c, FLOW_IDX(flow), rc); + } + + return 0; +} + +/** + * flow_migrate_source_pre() - Prepare flows for migration: enable repair mode + * @c: Execution context + * @stage: Migration stage information (unused) + * @fd: Migration file descriptor (unused) + * + * Return: 0 on success, positive error code on failure + */ +int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, + int fd) +{ + int rc; + + (void)stage; + (void)fd; + + if (flow_migrate_need_repair()) + repair_wait(c); + + if ((rc = flow_migrate_repair_all(c, true))) + return -rc; + + return 0; +} + +/** + * flow_migrate_source() - Dump all the remaining information and send data + * @c: Execution context (unused) + * @stage: Migration stage information (unused) + * @fd: Migration file descriptor + * + * Return: 0 on success, positive error code on failure + */ +int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, + int fd) +{ + uint32_t count = 0; + bool first = true; + union flow *flow; + int rc; + + (void)c; + (void)stage; + + /* If we don't have a repair helper, we can't migrate TCP flows */ + if (c->fd_repair >= 0) { + foreach_established_tcp_flow(flow) + count++; + } + + count = htonl(count); + if (write_all_buf(fd, &count, sizeof(count))) { + rc = errno; + err_perror("Can't send flow count (%u)", ntohl(count)); + return flow_migrate_source_rollback(c, FLOW_MAX, rc); + } + + debug("Sending %u flows", ntohl(count)); + + if (!count) + return 0; + + /* Dump and send information that can be stored in the flow table. + * + * Limited rollback options here: if we fail to transfer any data (that + * is, on the first flow), undo everything and resume. Otherwise, the + * stream might now be inconsistent, and we might have closed listening + * TCP sockets, so just terminate. 
+ */ + foreach_established_tcp_flow(flow) { + rc = tcp_flow_migrate_source(fd, &flow->tcp); + if (rc) { + err("Can't send data, flow %u: %s", FLOW_IDX(flow), + strerror_(-rc)); + if (!first) + die("Inconsistent migration state, exiting"); + + return flow_migrate_source_rollback(c, FLOW_MAX, -rc); + } + + first = false; + } + + /* And then "extended" data (including window data we saved previously): + * the target needs to set repair mode on sockets before it can set + * this stuff, but it needs sockets (and flows) for that. + * + * This also closes sockets so that the target can start connecting + * theirs: you can't sendmsg() to queues (using the socket) if the + * socket is not connected (EPIPE), not even in repair mode. And the + * target needs to restore queues now because we're sending the data. + * + * So, no rollback here, just try as hard as we can. Tolerate per-flow + * failures but not if the stream might be inconsistent (reported here + * as EIO). + */ + foreach_established_tcp_flow(flow) { + rc = tcp_flow_migrate_source_ext(fd, &flow->tcp); + if (rc) { + err("Extended data for flow %u: %s", FLOW_IDX(flow), + strerror_(-rc)); + + if (rc == -EIO) + die("Inconsistent migration state, exiting"); + } + } + + return 0; +} + +/** + * flow_migrate_target() - Receive flows and insert in flow table + * @c: Execution context + * @stage: Migration stage information (unused) + * @fd: Migration file descriptor + * + * Return: 0 on success, positive error code on failure + */ +int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, + int fd) +{ + uint32_t count; + unsigned i; + int rc; + + (void)stage; + + if (read_all_buf(fd, &count, sizeof(count))) + return errno; + + count = ntohl(count); + debug("Receiving %u flows", count); + + if (!count) + return 0; + + repair_wait(c); + + if ((rc = flow_migrate_repair_all(c, true))) + return -rc; + + repair_flush(c); + + /* TODO: flow header with type, instead? 
*/ + for (i = 0; i < count; i++) { + rc = tcp_flow_migrate_target(c, fd); + if (rc) { + debug("Migration data failure at flow %u: %s, abort", + i, strerror_(-rc)); + return -rc; + } + } + + repair_flush(c); + + for (i = 0; i < count; i++) { + rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd); + if (rc) { + debug("Migration data failure at flow %u: %s, abort", + i, strerror_(-rc)); + return -rc; + } + } + + return 0; +} + +/** * flow_init() - Initialise flow related data structures */ void flow_init(void) @@ -249,12 +249,20 @@ union flow; void flow_init(void); void flow_defer_handler(const struct ctx *c, const struct timespec *now); +int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, + int fd); +int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, + int fd); +int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, + int fd); +int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, + int fd); -void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) - __attribute__((format(printf, 3, 4))); - -#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, (pri), __VA_ARGS__) +void flow_log_(const struct flow_common *f, bool newline, int pri, + const char *fmt, ...) + __attribute__((format(printf, 4, 5))); +#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, true, (pri), __VA_ARGS__) #define flow_dbg(f, ...) flow_log((f), LOG_DEBUG, __VA_ARGS__) #define flow_err(f, ...) flow_log((f), LOG_ERR, __VA_ARGS__) @@ -264,6 +272,16 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) flow_dbg((f), __VA_ARGS__); \ } while (0) +#define flow_log_perror_(f, pri, ...) \ + do { \ + int errno_ = errno; \ + flow_log_((f), false, (pri), __VA_ARGS__); \ + logmsg(true, true, (pri), ": %s", strerror_(errno_)); \ + } while (0) + +#define flow_dbg_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_DEBUG, __VA_ARGS__) +#define flow_perror(f_, ...) 
flow_log_perror_(&(f_)->f, LOG_ERR, __VA_ARGS__) + void flow_log_details_(const struct flow_common *f, int pri, enum flow_state state); #define flow_log_details(f_, pri) \ diff --git a/flow_table.h b/flow_table.h index eeb6f41..fd2c57b 100644 --- a/flow_table.h +++ b/flow_table.h @@ -50,6 +50,42 @@ extern union flow flowtab[]; #define flow_foreach_sidei(sidei_) \ for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++) + +/** + * flow_foreach_slot() - Step through each flow table entry + * @flow: Takes values of pointer to each flow table entry + * + * Includes FREE slots. + */ +#define flow_foreach_slot(flow) \ + for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++) + +/** + * flow_foreach() - Step through each active flow + * @flow: Takes values of pointer to each active flow + */ +#define flow_foreach(flow) \ + flow_foreach_slot((flow)) \ + if ((flow)->f.state == FLOW_STATE_FREE) \ + (flow) += (flow)->free.n - 1; \ + else if ((flow)->f.state != FLOW_STATE_ACTIVE) { \ + flow_err((flow), "Bad flow state during traversal"); \ + continue; \ + } else + +/** + * flow_foreach_of_type() - Step through each active flow of given type + * @flow: Takes values of pointer to each flow + * @type_: Type of flow to traverse + */ +#define flow_foreach_of_type(flow, type_) \ + flow_foreach((flow)) \ + if ((flow)->f.type != (type_)) \ + /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ + continue; \ + else + + /** flow_idx() - Index of flow from common structure * @f: Common flow fields pointer * @@ -161,9 +197,9 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, sa_family_t af, const void *saddr, in_port_t sport, const void *daddr, in_port_t dport); -const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, - const union sockaddr_inany *ssa, - in_port_t dport); +struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, + const union sockaddr_inany *ssa, + in_port_t dport); const struct flowside *flow_target_af(union flow *flow, uint8_t 
pif, sa_family_t af, const void *saddr, in_port_t sport, @@ -85,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl); if (n < 0) { - flow_err(pingf, "recvfrom() error: %s", strerror_(errno)); + flow_perror(pingf, "recvfrom() error"); return; } @@ -300,8 +300,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0); if (sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) { - flow_dbg(pingf, "failed to relay request to socket: %s", - strerror_(errno)); + flow_dbg_perror(pingf, "failed to relay request to socket"); } else { flow_dbg(pingf, "echo request to socket, ID: %"PRIu16", seq: %"PRIu16, @@ -203,6 +203,7 @@ size_t iov_tail_size(struct iov_tail *tail) * overruns the IO vector, is not contiguous or doesn't have the * requested alignment. */ +/* cppcheck-suppress [staticFunction,unmatchedSuppression] */ void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align) { char *p; @@ -36,13 +36,14 @@ .tos = 0, \ .tot_len = 0, \ .id = 0, \ - .frag_off = 0, \ + .frag_off = htons(IP_DF), \ .ttl = 0xff, \ .protocol = (proto), \ .saddr = 0, \ .daddr = 0, \ } #define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \ + (uint32_t)htons_constant(IP_DF) + \ (uint32_t)htons(0xff00 | (proto))) @@ -90,6 +91,30 @@ struct ipv6_opt_hdr { */ } __attribute__((packed)); /* required for some archs */ +/** + * ip6_set_flow_lbl() - Set flow label in an IPv6 header + * @ip6h: Pointer to IPv6 header, updated + * @flow: Set @ip6h flow label to the low 20 bits of this integer + */ +static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t flow) +{ + ip6h->flow_lbl[0] = (flow >> 16) & 0xf; + ip6h->flow_lbl[1] = (flow >> 8) & 0xff; + ip6h->flow_lbl[2] = (flow >> 0) & 0xff; +} + +/** ip6_get_flow_lbl() - Get flow label from an IPv6 header + * @ip6h: Pointer to IPv6 header + * + * Return: flow label from @ip6h as 
an integer (<= 20 bits) + */ +static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h) +{ + return (ip6h->flow_lbl[0] & 0xf) << 16 | + ip6h->flow_lbl[1] << 8 | + ip6h->flow_lbl[2]; +} + char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto, size_t *dlen); @@ -104,4 +129,11 @@ static const struct in6_addr in6addr_ll_all_nodes = { /* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */ static const struct in_addr in4addr_broadcast = { 0xffffffff }; +#ifndef IPV4_MIN_MTU +#define IPV4_MIN_MTU 68 +#endif +#ifndef IPV6_MIN_MTU +#define IPV6_MIN_MTU 1280 +#endif + #endif /* IP_H */ @@ -56,7 +56,7 @@ bool log_stderr = true; /* Not daemonised, no shell spawned */ * * Return: pointer to @now, or NULL if there was an error retrieving the time */ -const struct timespec *logtime(struct timespec *ts) +static const struct timespec *logtime(struct timespec *ts) { if (clock_gettime(CLOCK_MONOTONIC, ts)) return NULL; @@ -250,6 +250,30 @@ static void logfile_write(bool newline, bool cont, int pri, } /** + * passt_vsyslog() - vsyslog() implementation not using heap memory + * @newline: Append newline at the end of the message, if missing + * @pri: Facility and level map, same as priority for vsyslog() + * @format: Same as vsyslog() format + * @ap: Same as vsyslog() ap + */ +static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap) +{ + char buf[BUFSIZ]; + int n; + + /* Send without timestamp, the system logger should add it */ + n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident); + + n += vsnprintf(buf + n, BUFSIZ - n, format, ap); + + if (newline && format[strlen(format)] != '\n') + n += snprintf(buf + n, BUFSIZ - n, "\n"); + + if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr) + FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n); +} + +/** * vlogmsg() - Print or send messages to log or output files as configured * @newline: Append newline at the end of the message, if missing * @cont: 
Continuation of a previous message, on the same line @@ -257,6 +281,7 @@ static void logfile_write(bool newline, bool cont, int pri, * @format: Message * @ap: Variable argument list */ +/* cppcheck-suppress [staticFunction,unmatchedSuppression] */ void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap) { bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1; @@ -374,30 +399,6 @@ void __setlogmask(int mask) } /** - * passt_vsyslog() - vsyslog() implementation not using heap memory - * @newline: Append newline at the end of the message, if missing - * @pri: Facility and level map, same as priority for vsyslog() - * @format: Same as vsyslog() format - * @ap: Same as vsyslog() ap - */ -void passt_vsyslog(bool newline, int pri, const char *format, va_list ap) -{ - char buf[BUFSIZ]; - int n; - - /* Send without timestamp, the system logger should add it */ - n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident); - - n += vsnprintf(buf + n, BUFSIZ - n, format, ap); - - if (newline && format[strlen(format)] != '\n') - n += snprintf(buf + n, BUFSIZ - n, "\n"); - - if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr) - FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n); -} - -/** * logfile_init() - Open log file and write header with PID, version, path * @name: Identifier for header: passt or pasta * @path: Path to log file @@ -55,7 +55,6 @@ void trace_init(int enable); void __openlog(const char *ident, int option, int facility); void logfile_init(const char *name, const char *path, size_t size); -void passt_vsyslog(bool newline, int pri, const char *format, va_list ap); void __setlogmask(int mask); #endif /* LOG_H */ diff --git a/migrate.c b/migrate.c new file mode 100644 index 0000000..0fca77b --- /dev/null +++ b/migrate.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + 
* for network namespace/tap device mode + * + * migrate.c - Migration sections, layout, and routines + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <errno.h> +#include <sys/uio.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "inany.h" +#include "flow.h" +#include "flow_table.h" + +#include "migrate.h" +#include "repair.h" + +/* Magic identifier for migration data */ +#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 + +/** + * struct migrate_seen_addrs_v1 - Migratable guest addresses for v1 state stream + * @addr6: Observed guest IPv6 address + * @addr6_ll: Observed guest IPv6 link-local address + * @addr4: Observed guest IPv4 address + * @mac: Observed guest MAC address + */ +struct migrate_seen_addrs_v1 { + struct in6_addr addr6; + struct in6_addr addr6_ll; + struct in_addr addr4; + unsigned char mac[ETH_ALEN]; +} __attribute__((packed)); + +/** + * seen_addrs_source_v1() - Copy and send guest observed addresses from source + * @c: Execution context + * @stage: Migration stage, unused + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ +static int seen_addrs_source_v1(struct ctx *c, + const struct migrate_stage *stage, int fd) +{ + struct migrate_seen_addrs_v1 addrs = { + .addr6 = c->ip6.addr_seen, + .addr6_ll = c->ip6.addr_ll_seen, + .addr4 = c->ip4.addr_seen, + }; + + (void)stage; + + memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac)); + + if (write_all_buf(fd, &addrs, sizeof(addrs))) + return errno; + + return 0; +} + +/** + * seen_addrs_target_v1() - Receive and use guest observed addresses on target + * @c: Execution context + * @stage: Migration stage, unused + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int seen_addrs_target_v1(struct ctx *c, + const struct migrate_stage *stage, int fd) 
+{ + struct migrate_seen_addrs_v1 addrs; + + (void)stage; + + if (read_all_buf(fd, &addrs, sizeof(addrs))) + return errno; + + c->ip6.addr_seen = addrs.addr6; + c->ip6.addr_ll_seen = addrs.addr6_ll; + c->ip4.addr_seen = addrs.addr4; + memcpy(c->guest_mac, addrs.mac, sizeof(c->guest_mac)); + + return 0; +} + +/* Stages for version 1 */ +static const struct migrate_stage stages_v1[] = { + { + .name = "observed addresses", + .source = seen_addrs_source_v1, + .target = seen_addrs_target_v1, + }, + { + .name = "prepare flows", + .source = flow_migrate_source_pre, + .target = NULL, + }, + { + .name = "transfer flows", + .source = flow_migrate_source, + .target = flow_migrate_target, + }, + { 0 }, +}; + +/* Supported encoding versions, from latest (most preferred) to oldest */ +static const struct migrate_version versions[] = { + { 1, stages_v1, }, + { 0 }, +}; + +/* Current encoding version */ +#define CURRENT_VERSION (&versions[0]) + +/** + * migrate_source() - Migration as source, send state to hypervisor + * @c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int migrate_source(struct ctx *c, int fd) +{ + const struct migrate_version *v = CURRENT_VERSION; + const struct migrate_header header = { + .magic = htonll_constant(MIGRATE_MAGIC), + .version = htonl(v->id), + .compat_version = htonl(v->id), + }; + const struct migrate_stage *s; + int ret; + + if (write_all_buf(fd, &header, sizeof(header))) { + ret = errno; + err("Can't send migration header: %s, abort", strerror_(ret)); + return ret; + } + + for (s = v->s; s->name; s++) { + if (!s->source) + continue; + + debug("Source side migration stage: %s", s->name); + + if ((ret = s->source(c, s, fd))) { + err("Source migration stage: %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} + +/** + * migrate_target_read_header() - Read header in target + * @fd: Descriptor for state transfer + * + * Return: 
version structure on success, NULL on failure with errno set + */ +static const struct migrate_version *migrate_target_read_header(int fd) +{ + const struct migrate_version *v; + struct migrate_header h; + uint32_t id, compat_id; + + if (read_all_buf(fd, &h, sizeof(h))) + return NULL; + + id = ntohl(h.version); + compat_id = ntohl(h.compat_version); + + debug("Source magic: 0x%016" PRIx64 ", version: %u, compat: %u", + ntohll(h.magic), id, compat_id); + + if (ntohll(h.magic) != MIGRATE_MAGIC || !id || !compat_id) { + err("Invalid incoming device state"); + errno = EINVAL; + return NULL; + } + + for (v = versions; v->id; v++) + if (v->id <= id && v->id >= compat_id) + return v; + + errno = ENOTSUP; + err("Unsupported device state version: %u", id); + return NULL; +} + +/** + * migrate_target() - Migration as target, receive state from hypervisor + * @c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int migrate_target(struct ctx *c, int fd) +{ + const struct migrate_version *v; + const struct migrate_stage *s; + int ret; + + if (!(v = migrate_target_read_header(fd))) + return errno; + + for (s = v->s; s->name; s++) { + if (!s->target) + continue; + + debug("Target side migration stage: %s", s->name); + + if ((ret = s->target(c, s, fd))) { + err("Target migration stage: %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} + +/** + * migrate_init() - Set up things necessary for migration + * @c: Execution context + */ +void migrate_init(struct ctx *c) +{ + c->device_state_result = -1; +} + +/** + * migrate_close() - Close migration channel and connection to passt-repair + * @c: Execution context + */ +void migrate_close(struct ctx *c) +{ + if (c->device_state_fd != -1) { + debug("Closing migration channel, fd: %d", c->device_state_fd); + close(c->device_state_fd); + c->device_state_fd = -1; + c->device_state_result = -1; + } + + repair_close(c); +} + 
+/** + * migrate_request() - Request a migration of device state + * @c: Execution context + * @fd: fd to transfer state + * @target: Are we the target of the migration? + */ +void migrate_request(struct ctx *c, int fd, bool target) +{ + debug("Migration requested, fd: %d (was %d)", fd, c->device_state_fd); + + if (c->device_state_fd != -1) + migrate_close(c); + + c->device_state_fd = fd; + c->migrate_target = target; +} + +/** + * migrate_handler() - Send/receive passt internal state to/from hypervisor + * @c: Execution context + */ +void migrate_handler(struct ctx *c) +{ + int rc; + + if (c->device_state_fd < 0) + return; + + debug("Handling migration request from fd: %d, target: %d", + c->device_state_fd, c->migrate_target); + + if (c->migrate_target) + rc = migrate_target(c, c->device_state_fd); + else + rc = migrate_source(c, c->device_state_fd); + + migrate_close(c); + + c->device_state_result = rc; +} diff --git a/migrate.h b/migrate.h new file mode 100644 index 0000000..2c51cd9 --- /dev/null +++ b/migrate.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef MIGRATE_H +#define MIGRATE_H + +/** + * struct migrate_header - Migration header from source + * @magic: 0xB1BB1D1B0BB1D1B0, network order + * @version: Highest known, target aborts if too old, network order + * @compat_version: Lowest version compatible with @version, target aborts + * if too new, network order + */ +struct migrate_header { + uint64_t magic; + uint32_t version; + uint32_t compat_version; +} __attribute__((packed)); + +/** + * struct migrate_stage - Callbacks and parameters for one stage of migration + * @name: Stage name (for debugging) + * @source: Callback to implement this stage on the source + * @target: Callback to implement this stage on the target + */ +struct migrate_stage { + const char *name; + int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd); 
+ int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd); + + /* Add here separate rollback callbacks if needed */ +}; + +/** + * struct migrate_version - Stages for a particular protocol version + * @id: Version number, host order + * @s: Ordered array of stages, NULL-terminated + */ +struct migrate_version { + uint32_t id; + const struct migrate_stage *s; +}; + +void migrate_init(struct ctx *c); +void migrate_close(struct ctx *c); +void migrate_request(struct ctx *c, int fd, bool target); +void migrate_handler(struct ctx *c); + +#endif /* MIGRATE_H */ @@ -256,7 +256,7 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) ptr = &ra.var[0]; - if (c->mtu != -1) { + if (c->mtu) { struct opt_mtu *mtu = (struct opt_mtu *)ptr; *mtu = (struct opt_mtu) { .header = { @@ -355,7 +355,7 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) * * Return: true if a gateway was found, false otherwise */ -bool nl_route_get_def_multipath(struct rtattr *rta, void *gw) +static bool nl_route_get_def_multipath(struct rtattr *rta, void *gw) { int nh_len = RTA_PAYLOAD(rta); struct rtnexthop *rtnh; @@ -23,23 +23,22 @@ #include "log.h" /** - * packet_check_range() - Check if a packet memory range is valid + * packet_check_range() - Check if a memory range is valid for a pool * @p: Packet pool - * @offset: Offset of data range in packet descriptor + * @ptr: Start of desired data range * @len: Length of desired data range - * @start: Start of the packet descriptor * @func: For tracing: name of calling function * @line: For tracing: caller line of function call * * Return: 0 if the range is valid, -1 otherwise */ -static int packet_check_range(const struct pool *p, size_t offset, size_t len, - const char *start, const char *func, int line) +static int packet_check_range(const struct pool *p, const char *ptr, size_t len, + const char *func, int line) { if (p->buf_size == 0) { int ret; - ret = vu_packet_check_range((void *)p->buf, offset, len, start); + ret = 
vu_packet_check_range((void *)p->buf, ptr, len); if (ret == -1) trace("cannot find region, %s:%i", func, line); @@ -47,16 +46,16 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len, return ret; } - if (start < p->buf) { - trace("packet start %p before buffer start %p, " - "%s:%i", (void *)start, (void *)p->buf, func, line); + if (ptr < p->buf) { + trace("packet range start %p before buffer start %p, %s:%i", + (void *)ptr, (void *)p->buf, func, line); return -1; } - if (start + len + offset > p->buf + p->buf_size) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", start - p->buf + len + offset, - p->buf_size, func, line); + if (ptr + len > p->buf + p->buf_size) { + trace("packet range end %p after buffer end %p, %s:%i", + (void *)(ptr + len), (void *)(p->buf + p->buf_size), + func, line); return -1; } @@ -81,10 +80,10 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; } - if (packet_check_range(p, 0, len, start, func, line)) + if (packet_check_range(p, start, len, func, line)) return; - if (len > UINT16_MAX) { + if (len > PACKET_MAX_LEN) { trace("add packet length %zu, %s:%i", len, func, line); return; } @@ -110,6 +109,8 @@ void packet_add_do(struct pool *p, size_t len, const char *start, void *packet_get_do(const struct pool *p, size_t idx, size_t offset, size_t len, size_t *left, const char *func, int line) { + char *ptr; + if (idx >= p->size || idx >= p->count) { if (func) { trace("packet %zu from pool size: %zu, count: %zu, " @@ -118,7 +119,7 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (len > UINT16_MAX) { + if (len > PACKET_MAX_LEN) { if (func) { trace("packet data length %zu, %s:%i", len, func, line); @@ -135,14 +136,15 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (packet_check_range(p, offset, len, p->pkt[idx].iov_base, - func, line)) + ptr = (char *)p->pkt[idx].iov_base + offset; + + if 
(packet_check_range(p, ptr, len, func, line)) return NULL; if (left) *left = p->pkt[idx].iov_len - offset - len; - return (char *)p->pkt[idx].iov_base + offset; + return ptr; } /** @@ -6,6 +6,9 @@ #ifndef PACKET_H #define PACKET_H +/* Maximum size of a single packet stored in pool, including headers */ +#define PACKET_MAX_LEN UINT16_MAX + /** * struct pool - Generic pool of packets stored in a buffer * @buf: Buffer storing packet descriptors, @@ -21,11 +24,10 @@ struct pool { size_t buf_size; size_t size; size_t count; - struct iovec pkt[1]; + struct iovec pkt[]; }; -int vu_packet_check_range(void *buf, size_t offset, size_t len, - const char *start); +int vu_packet_check_range(void *buf, const char *ptr, size_t len); void packet_add_do(struct pool *p, size_t len, const char *start, const char *func, int line); void *packet_get_do(const struct pool *p, const size_t idx, diff --git a/passt-repair.1 b/passt-repair.1 index 7c1b140..e65aadd 100644 --- a/passt-repair.1 +++ b/passt-repair.1 @@ -16,13 +16,17 @@ .B passt-repair is a privileged helper setting and clearing repair mode on TCP sockets on behalf of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain -socket, specified by \fIPATH\fR. +socket. It can be used to migrate TCP connections between guests without granting additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections, \fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR capability (see \fBcapabilities\fR(7)) to be set or cleared. +If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to +connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file +ending with \fI.repair\fR appears in it, and then attempts to connect to it. 
+ .SH PROTOCOL \fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via diff --git a/passt-repair.c b/passt-repair.c index 614cee0..8bb3f00 100644 --- a/passt-repair.c +++ b/passt-repair.c @@ -16,11 +16,14 @@ * off. Reply by echoing the command. Exit on EOF. */ +#include <sys/inotify.h> #include <sys/prctl.h> #include <sys/types.h> #include <sys/socket.h> +#include <sys/stat.h> #include <sys/un.h> #include <errno.h> +#include <stdbool.h> #include <stddef.h> #include <stdio.h> #include <stdlib.h> @@ -39,6 +42,8 @@ #include "seccomp_repair.h" #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ +#define REPAIR_EXT ".repair" +#define REPAIR_EXT_LEN strlen(REPAIR_EXT) /** * main() - Entry point and whole program with loop @@ -51,6 +56,9 @@ * #syscalls:repair socket s390x:socketcall i686:socketcall * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv * #syscalls:repair sendto sendmsg arm:send ppc64le:send + * #syscalls:repair stat|statx stat64|statx statx + * #syscalls:repair fstat|fstat64 newfstatat|fstatat64 + * #syscalls:repair inotify_init1 inotify_add_watch */ int main(int argc, char **argv) { @@ -58,11 +66,14 @@ int main(int argc, char **argv) __attribute__ ((aligned(__alignof__(struct cmsghdr)))); struct sockaddr_un a = { AF_UNIX, "" }; int fds[SCM_MAX_FD], s, ret, i, n = 0; + bool inotify_dir = false; struct sock_fprog prog; int8_t cmd = INT8_MAX; struct cmsghdr *cmsg; struct msghdr msg; struct iovec iov; + size_t cmsg_len; + struct stat sb; int op; prctl(PR_SET_DUMPABLE, 0); @@ -89,19 +100,77 @@ int main(int argc, char **argv) _exit(2); } - ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); + if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno); + _exit(1); + } + + if ((stat(argv[1], &sb))) { + fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno); + _exit(1); + } + + if ((sb.st_mode & S_IFMT) == S_IFDIR) { + char buf[sizeof(struct 
inotify_event) + NAME_MAX + 1]; + const struct inotify_event *ev; + char path[PATH_MAX + 1]; + ssize_t n; + int fd; + + ev = (struct inotify_event *)buf; + + if ((fd = inotify_init1(IN_CLOEXEC)) < 0) { + fprintf(stderr, "inotify_init1: %i\n", errno); + _exit(1); + } + + if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) { + fprintf(stderr, "inotify_add_watch: %i\n", errno); + _exit(1); + } + + do { + n = read(fd, buf, sizeof(buf)); + if (n < 0) { + fprintf(stderr, "inotify read: %i", errno); + _exit(1); + } + + if (n < (ssize_t)sizeof(*ev)) { + fprintf(stderr, "Short inotify read: %zi", n); + _exit(1); + } + } while (ev->len < REPAIR_EXT_LEN || strlen(ev->name) < REPAIR_EXT_LEN || + memcmp(ev->name + strlen(ev->name) - REPAIR_EXT_LEN, + REPAIR_EXT, REPAIR_EXT_LEN)); + + snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name); + if ((stat(path, &sb))) { + fprintf(stderr, "Can't stat() %s: %i\n", path, errno); + _exit(1); + } + + ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", path); + inotify_dir = true; + } else { + ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); + } + if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) { - fprintf(stderr, "Invalid socket path: %s\n", argv[1]); + fprintf(stderr, "Invalid socket path"); _exit(2); } - if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { - fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno); - _exit(1); + if ((sb.st_mode & S_IFMT) != S_IFSOCK) { + fprintf(stderr, "%s is not a socket\n", a.sun_path); + _exit(2); } - if (connect(s, (struct sockaddr *)&a, sizeof(a))) { - fprintf(stderr, "Failed to connect to %s: %s\n", argv[1], + while (connect(s, (struct sockaddr *)&a, sizeof(a))) { + if (inotify_dir && errno == ECONNREFUSED) + continue; + + fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path, strerror(errno)); _exit(1); } @@ -131,15 +200,16 @@ loop: /* No inverse formula for CMSG_LEN(x), and building one with CMSG_LEN(0) * works but there's no guarantee it does. Search the whole domain.
*/ - for (i = 1; i < SCM_MAX_FD; i++) { + for (i = 1; i <= SCM_MAX_FD; i++) { if (CMSG_LEN(sizeof(int) * i) == cmsg->cmsg_len) { n = i; break; } } if (!n) { + cmsg_len = cmsg->cmsg_len; /* socklen_t is 'unsigned' on musl */ fprintf(stderr, "Invalid ancillary data length %zu from peer\n", - cmsg->cmsg_len); + cmsg_len); _exit(1); } @@ -401,6 +401,16 @@ Enable IPv6-only operation. IPv4 traffic will be ignored. By default, IPv4 operation is enabled as long as at least an IPv4 route and an interface address are configured on a given host interface. +.TP +.BR \-H ", " \-\-hostname " " \fIname +Hostname to configure the client with. +Send \fIname\fR as DHCP option 12 (hostname). + +.TP +.BR \-\-fqdn " " \fIname +FQDN to configure the client with. +Send \fIname\fR as Client FQDN: DHCP option 81 and DHCPv6 option 39. + .SS \fBpasst\fR-only options .TP @@ -419,6 +429,17 @@ Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR. Print back-end capabilities in JSON format, only meaningful for vhost-user mode. .TP +.BR \-\-repair-path " " \fIpath +Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect +to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during +migration. \fB--repair-path none\fR disables this interface (if you need to +specify a socket path called "none" you can prefix the path by \fI./\fR). + +Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path +chosen for the hypervisor UNIX domain socket. No socket is created if not in +\-\-vhost-user mode. + +.TP .BR \-F ", " \-\-fd " " \fIFD Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened in the parent process and \fBpasst\fR inherits it when run as a child. 
This @@ -51,6 +51,8 @@ #include "tcp_splice.h" #include "ndp.h" #include "vu_common.h" +#include "migrate.h" +#include "repair.h" #define EPOLL_EVENTS 8 @@ -75,7 +77,8 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", - [EPOLL_TYPE_VHOST_MIGRATION] = "vhost-user migration socket", + [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", + [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -163,7 +166,7 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) * * #syscalls exit_group */ -void exit_handler(int signal) +static void exit_handler(int signal) { (void)signal; @@ -188,7 +191,6 @@ int main(int argc, char **argv) { struct epoll_event events[EPOLL_EVENTS]; int nfds, i, devnull_fd = -1; - char argv0[PATH_MAX], *name; struct ctx c = { 0 }; struct rlimit limit; struct timespec now; @@ -202,6 +204,7 @@ int main(int argc, char **argv) isolate_initial(argc, argv); c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1; + c.device_state_fd = -1; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; @@ -209,27 +212,18 @@ int main(int argc, char **argv) sigaction(SIGTERM, &sa, NULL); sigaction(SIGQUIT, &sa, NULL); - if (argc < 1) - _exit(EXIT_FAILURE); + c.mode = conf_mode(argc, argv); - strncpy(argv0, argv[0], PATH_MAX - 1); - name = basename(argv0); - if (strstr(name, "pasta")) { + if (c.mode == MODE_PASTA) { sa.sa_handler = pasta_child_handler; if (sigaction(SIGCHLD, &sa, NULL)) die_perror("Couldn't install signal handlers"); - - if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) - die_perror("Couldn't set disposition for SIGPIPE"); - - c.mode = MODE_PASTA; - } else if (strstr(name, "passt")) { - c.mode = MODE_PASST; - } else { - _exit(EXIT_FAILURE); } - madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE); + 
if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) + die_perror("Couldn't set disposition for SIGPIPE"); + + madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE); c.epollfd = epoll_create1(EPOLL_CLOEXEC); if (c.epollfd == -1) @@ -357,8 +351,11 @@ loop: case EPOLL_TYPE_VHOST_KICK: vu_kick_cb(c.vdev, ref, &now); break; - case EPOLL_TYPE_VHOST_MIGRATION: - vu_migrate(c.vdev, eventmask); + case EPOLL_TYPE_REPAIR_LISTEN: + repair_listen_handler(&c, eventmask); + break; + case EPOLL_TYPE_REPAIR: + repair_handler(&c, eventmask); break; default: /* Can't happen */ @@ -368,5 +365,7 @@ loop: post_handler(&c, &now); + migrate_handler(&c); + goto loop; } @@ -20,6 +20,7 @@ union epoll_ref; #include "siphash.h" #include "ip.h" #include "inany.h" +#include "migrate.h" #include "flow.h" #include "icmp.h" #include "fwd.h" @@ -68,12 +69,11 @@ union epoll_ref { static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), "epoll_ref must have same size as epoll_data"); -#define TAP_BUF_BYTES \ - ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE) +/* Large enough for ~128 maximum size frames */ +#define PKT_BUF_BYTES (8UL << 20) #define TAP_MSGS \ - DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) + DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) -#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0) extern char pkt_buf [PKT_BUF_BYTES]; extern char *epoll_type_str[]; @@ -193,6 +193,7 @@ struct ip6_ctx { * @foreground: Run in foreground, don't log to stderr by default * @nofile: Maximum number of open files (ulimit -n) * @sock_path: Path for UNIX domain socket + * @repair_path: TCP_REPAIR helper path, can be "none", empty for default * @pcap: Path for packet capture file * @pidfile: Path to PID file, empty string if not configured * @pidfile_fd: File descriptor for PID file, -1 if none @@ -203,12 +204,16 @@ struct ip6_ctx { * @epollfd: File descriptor for epoll instance * @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any * 
@fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket + * @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any + * @fd_repair: Connected AF_UNIX socket for TCP_REPAIR helper * @our_tap_mac: Pasta/passt's MAC on the tap link * @guest_mac: MAC address of guest or namespace, seen or configured * @hash_secret: 128-bit secret for siphash functions * @ifi4: Template interface for IPv4, -1: none, 0: IPv4 disabled * @ip: IPv4 configuration * @dns_search: DNS search list + * @hostname: Guest hostname + * @fqdn: Guest FQDN * @ifi6: Template interface for IPv6, -1: none, 0: IPv6 disabled * @ip6: IPv6 configuration * @pasta_ifn: Name of namespace interface for pasta @@ -235,6 +240,9 @@ struct ip6_ctx { * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max * @vdev: vhost-user device + * @device_state_fd: Device state migration channel + * @device_state_result: Device state migration result + * @migrate_target: Are we the target, on the next migration request? 
*/ struct ctx { enum passt_modes mode; @@ -244,6 +252,7 @@ struct ctx { int foreground; int nofile; char sock_path[UNIX_PATH_MAX]; + char repair_path[UNIX_PATH_MAX]; char pcap[PATH_MAX]; char pidfile[PATH_MAX]; @@ -260,8 +269,12 @@ struct ctx { int epollfd; int fd_tap_listen; int fd_tap; + int fd_repair_listen; + int fd_repair; unsigned char our_tap_mac[ETH_ALEN]; unsigned char guest_mac[ETH_ALEN]; + uint16_t mtu; + uint64_t hash_secret[2]; int ifi4; @@ -269,6 +282,9 @@ struct ctx { struct fqdn dns_search[MAXDNSRCH]; + char hostname[PASST_MAXDNAME]; + char fqdn[PASST_MAXDNAME]; + int ifi6; struct ip6_ctx ip6; @@ -283,7 +299,6 @@ struct ctx { int no_icmp; struct icmp_ctx icmp; - int mtu; int no_dns; int no_dns_search; int no_dhcp_dns; @@ -300,6 +315,11 @@ struct ctx { int low_rmem; struct vu_dev *vdev; + + /* Migration */ + int device_state_fd; + int device_state_result; + bool migrate_target; }; void proto_update_l2_buf(const unsigned char *eth_d, @@ -169,10 +169,12 @@ void pasta_open_ns(struct ctx *c, const char *netns) * struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd() * @exe: Executable to run * @argv: Command and arguments to run + * @c: Context to read config from */ struct pasta_spawn_cmd_arg { const char *exe; char *const *argv; + struct ctx *c; }; /** @@ -186,6 +188,7 @@ static int pasta_spawn_cmd(void *arg) { char hostname[HOST_NAME_MAX + 1] = HOSTNAME_PREFIX; const struct pasta_spawn_cmd_arg *a; + size_t conf_hostname_len; sigset_t set; /* We run in a detached PID and mount namespace: mount /proc over */ @@ -195,9 +198,15 @@ static int pasta_spawn_cmd(void *arg) if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) warn("Cannot set ping_group_range, ICMP requests might fail"); - if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1, - HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) || - errno == ENAMETOOLONG) { + a = (const struct pasta_spawn_cmd_arg *)arg; + + conf_hostname_len = strlen(a->c->hostname); + if (conf_hostname_len > 0) {
+ if (sethostname(a->c->hostname, conf_hostname_len)) + warn("Unable to set configured hostname"); + } else if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1, + HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) || + errno == ENAMETOOLONG) { hostname[HOST_NAME_MAX] = '\0'; if (sethostname(hostname, strlen(hostname))) warn("Unable to set pasta-prefixed hostname"); @@ -208,7 +217,6 @@ static int pasta_spawn_cmd(void *arg) sigaddset(&set, SIGUSR1); sigwaitinfo(&set, NULL); - a = (const struct pasta_spawn_cmd_arg *)arg; execvp(a->exe, a->argv); die_perror("Failed to start command or shell"); @@ -230,6 +238,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, struct pasta_spawn_cmd_arg arg = { .exe = argv[0], .argv = argv, + .c = c, }; char uidmap[BUFSIZ], gidmap[BUFSIZ]; char *sh_argv[] = { NULL, NULL }; @@ -310,7 +319,7 @@ void pasta_ns_conf(struct ctx *c) if (c->pasta_conf_ns) { unsigned int flags = IFF_UP; - if (c->mtu != -1) + if (c->mtu) nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu); if (c->ifi6) /* Avoid duplicate address detection on link up */ @@ -33,33 +33,12 @@ #include "log.h" #include "pcap.h" #include "iov.h" +#include "tap.h" #define PCAP_VERSION_MINOR 4 static int pcap_fd = -1; -/* See pcap.h from libpcap, or pcap-savefile(5) */ -static const struct { - uint32_t magic; -#define PCAP_MAGIC 0xa1b2c3d4 - - uint16_t major; -#define PCAP_VERSION_MAJOR 2 - - uint16_t minor; -#define PCAP_VERSION_MINOR 4 - - int32_t thiszone; - uint32_t sigfigs; - uint32_t snaplen; - - uint32_t linktype; -#define PCAP_LINKTYPE_ETHERNET 1 -} pcap_hdr = { - PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU, - PCAP_LINKTYPE_ETHERNET -}; - struct pcap_pkthdr { uint32_t tv_sec; uint32_t tv_usec; @@ -162,6 +141,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) */ void pcap_init(struct ctx *c) { + /* See pcap.h from libpcap, or pcap-savefile(5) */ +#define PCAP_MAGIC 0xa1b2c3d4 +#define PCAP_VERSION_MAJOR 2 +#define 
PCAP_VERSION_MINOR 4 +#define PCAP_LINKTYPE_ETHERNET 1 + const struct { + uint32_t magic; + uint16_t major; + uint16_t minor; + + int32_t thiszone; + uint32_t sigfigs; + uint32_t snaplen; + + uint32_t linktype; + } pcap_hdr = { + .magic = PCAP_MAGIC, + .major = PCAP_VERSION_MAJOR, + .minor = PCAP_VERSION_MINOR, + .snaplen = tap_l2_max_len(c), + .linktype = PCAP_LINKTYPE_ETHERNET + }; + if (pcap_fd != -1) return; diff --git a/repair.c b/repair.c new file mode 100644 index 0000000..149fe51 --- /dev/null +++ b/repair.c @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <errno.h> +#include <sys/socket.h> +#include <sys/uio.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "inany.h" +#include "flow.h" +#include "flow_table.h" + +#include "repair.h" + +#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ + +/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */ +#define REPAIR_ACCEPT_TIMEOUT_MS 10 +#define REPAIR_ACCEPT_TIMEOUT_US (REPAIR_ACCEPT_TIMEOUT_MS * 1000) + +/* Pending file descriptors for next repair_flush() call, or command change */ +static int repair_fds[SCM_MAX_FD]; + +/* Pending command: flush pending file descriptors if it changes */ +static int8_t repair_cmd; + +/* Number of pending file descriptors set in @repair_fds */ +static int repair_nfds; + +/** + * repair_sock_init() - Start listening for connections on helper socket + * @c: Execution context + */ +void repair_sock_init(const struct ctx *c) +{ + union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN }; + struct epoll_event ev = { 0 }; + + if (c->fd_repair_listen == 
-1) + return; + + if (listen(c->fd_repair_listen, 0)) { + err_perror("listen() on repair helper socket, won't migrate"); + return; + } + + ref.fd = c->fd_repair_listen; + ev.events = EPOLLIN | EPOLLHUP | EPOLLET; + ev.data.u64 = ref.u64; + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev)) + err_perror("repair helper socket epoll_ctl(), won't migrate"); +} + +/** + * repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket + * @c: Execution context + * @events: epoll events + */ +void repair_listen_handler(struct ctx *c, uint32_t events) +{ + union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR }; + struct epoll_event ev = { 0 }; + struct ucred ucred; + socklen_t len; + + if (events != EPOLLIN) { + debug("Spurious event 0x%04x on TCP_REPAIR helper socket", + events); + return; + } + + len = sizeof(ucred); + + /* Another client is already connected: accept and close right away. */ + if (c->fd_repair != -1) { + int discard = accept4(c->fd_repair_listen, NULL, NULL, + SOCK_NONBLOCK); + + if (discard == -1) + return; + + if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len)) + info("Discarding TCP_REPAIR helper, PID %i", ucred.pid); + + close(discard); + return; + } + + if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) { + debug_perror("accept4() on TCP_REPAIR helper listening socket"); + return; + } + + if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len)) + info("Accepted TCP_REPAIR helper, PID %i", ucred.pid); + + ref.fd = c->fd_repair; + ev.events = EPOLLHUP | EPOLLET; + ev.data.u64 = ref.u64; + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) { + debug_perror("epoll_ctl() on TCP_REPAIR helper socket"); + close(c->fd_repair); + c->fd_repair = -1; + } +} + +/** + * repair_close() - Close connection to TCP_REPAIR helper + * @c: Execution context + */ +void repair_close(struct ctx *c) +{ + debug("Closing TCP_REPAIR helper socket"); + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, 
c->fd_repair, NULL); + close(c->fd_repair); + c->fd_repair = -1; +} + +/** + * repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket + * @c: Execution context + * @events: epoll events + */ +void repair_handler(struct ctx *c, uint32_t events) +{ + (void)events; + + repair_close(c); +} + +/** + * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect + * @c: Execution context + */ +void repair_wait(struct ctx *c) +{ + struct timeval tv = { .tv_sec = 0, + .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) }; + static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000, + ".tv_usec is greater than 1000 * 1000"); + + if (c->fd_repair >= 0 || c->fd_repair_listen == -1) + return; + + if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO, + &tv, sizeof(tv))) { + err_perror("Set timeout on TCP_REPAIR listening socket"); + return; + } + + repair_listen_handler(c, EPOLLIN); + + tv.tv_usec = 0; + if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO, + &tv, sizeof(tv))) + err_perror("Clear timeout on TCP_REPAIR listening socket"); +} + +/** + * repair_flush() - Flush current set of sockets to helper, with current command + * @c: Execution context + * + * Return: 0 on success, negative error code on failure + */ +int repair_flush(struct ctx *c) +{ + char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)] + __attribute__ ((aligned(__alignof__(struct cmsghdr)))) = { 0 }; + struct iovec iov = { &repair_cmd, sizeof(repair_cmd) }; + struct cmsghdr *cmsg; + struct msghdr msg; + int8_t reply; + + if (!repair_nfds) + return 0; + + msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0, + .msg_iov = &iov, .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = CMSG_SPACE(sizeof(int) * + repair_nfds), + .msg_flags = 0 }; + cmsg = CMSG_FIRSTHDR(&msg); + + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds); + memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds); + + repair_nfds = 
0; + + if (sendmsg(c->fd_repair, &msg, 0) < 0) { + int ret = -errno; + err_perror("Failed to send sockets to TCP_REPAIR helper"); + repair_close(c); + return ret; + } + + if (recv(c->fd_repair, &reply, sizeof(reply), 0) < 0) { + int ret = -errno; + err_perror("Failed to receive reply from TCP_REPAIR helper"); + repair_close(c); + return ret; + } + + if (reply != repair_cmd) { + err("Unexpected reply from TCP_REPAIR helper: %d", reply); + repair_close(c); + return -ENXIO; + } + + return 0; +} + +/** + * repair_set() - Add socket to TCP_REPAIR set with given command + * @c: Execution context + * @s: Socket to add + * @cmd: TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP + * + * Return: 0 on success, negative error code on failure + */ +int repair_set(struct ctx *c, int s, int cmd) +{ + int rc; + + if (repair_nfds && repair_cmd != cmd) { + if ((rc = repair_flush(c))) + return rc; + } + + repair_cmd = cmd; + repair_fds[repair_nfds++] = s; + + if (repair_nfds >= SCM_MAX_FD) { + if ((rc = repair_flush(c))) + return rc; + } + + return 0; +} diff --git a/repair.h b/repair.h new file mode 100644 index 0000000..1d37922 --- /dev/null +++ b/repair.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef REPAIR_H +#define REPAIR_H + +void repair_sock_init(const struct ctx *c); +void repair_listen_handler(struct ctx *c, uint32_t events); +void repair_handler(struct ctx *c, uint32_t events); +void repair_close(struct ctx *c); +void repair_wait(struct ctx *c); +int repair_flush(struct ctx *c); +int repair_set(struct ctx *c, int s, int cmd); + +#endif /* REPAIR_H */ @@ -255,7 +255,7 @@ for __p in ${__profiles}; do __calls="${__calls} ${EXTRA_SYSCALLS:-}" __calls="$(filter ${__calls})" - cols="$(stty -a | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null + cols="$(stty -a 2>/dev/null | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null case $cols in 
[0-9]*) col_args="-w ${cols}";; *) col_args="";; esac echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args} @@ -56,11 +56,25 @@ #include "netlink.h" #include "pasta.h" #include "packet.h" +#include "repair.h" #include "tap.h" #include "log.h" #include "vhost_user.h" #include "vu_common.h" +/* Maximum allowed frame lengths (including L2 header) */ + +/* Verify that an L2 frame length limit is large enough to contain the header, + * but small enough to fit in the packet pool + */ +#define CHECK_FRAME_LEN(len) \ + static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \ + #len " has bad value") + +CHECK_FRAME_LEN(L2_MAX_LEN_PASTA); +CHECK_FRAME_LEN(L2_MAX_LEN_PASST); +CHECK_FRAME_LEN(L2_MAX_LEN_VU); + /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); @@ -69,6 +83,25 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ /** + * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode + * @c: Execution context + */ +unsigned long tap_l2_max_len(const struct ctx *c) +{ + /* NOLINTBEGIN(bugprone-branch-clone): values can be the same */ + switch (c->mode) { + case MODE_PASST: + return L2_MAX_LEN_PASST; + case MODE_PASTA: + return L2_MAX_LEN_PASTA; + case MODE_VU: + return L2_MAX_LEN_VU; + } + /* NOLINTEND(bugprone-branch-clone) */ + ASSERT(0); +} + +/** * tap_send_single() - Send a single frame * @c: Execution context * @data: Packet buffer @@ -121,7 +154,7 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c, * * Return: pointer at which to write the packet's payload */ -static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) +void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) { struct ethhdr *eh = (struct ethhdr *)buf; @@ -142,8 +175,8 @@ static void 
*tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) * * Return: pointer at which to write the packet's payload */ -static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, - struct in_addr dst, size_t l4len, uint8_t proto) +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t l4len, uint8_t proto) { uint16_t l3len = l4len + sizeof(*ip4h); @@ -152,17 +185,17 @@ static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, ip4h->tos = 0; ip4h->tot_len = htons(l3len); ip4h->id = 0; - ip4h->frag_off = 0; + ip4h->frag_off = htons(IP_DF); ip4h->ttl = 255; ip4h->protocol = proto; ip4h->saddr = src.s_addr; ip4h->daddr = dst.s_addr; ip4h->check = csum_ip4_header(l3len, proto, src, dst); - return ip4h + 1; + return (char *)ip4h + sizeof(*ip4h); } /** - * tap_udp4_send() - Send UDP over IPv4 packet + * tap_push_uh4() - Build UDPv4 header with checksum * @c: Execution context * @src: IPv4 source address * @sport: UDP source port @@ -170,16 +203,14 @@ static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, * @dport: UDP destination port * @in: UDP payload contents (not including UDP header) * @dlen: UDP payload length (not including UDP header) + * + * Return: pointer at which to write the packet's payload */ -void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, +void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, const void *in, size_t dlen) { size_t l4len = dlen + sizeof(struct udphdr); - char buf[USHRT_MAX]; - struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); - struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP); - char *data = (char *)(uh + 1); const struct iovec iov = { .iov_base = (void *)in, .iov_len = dlen @@ -190,8 +221,30 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, uh->dest = htons(dport); uh->len = htons(l4len); csum_udp4(uh, src, dst, &payload); - 
memcpy(data, in, dlen); + return (char *)uh + sizeof(*uh); +} + +/** + * tap_udp4_send() - Send UDP over IPv4 packet + * @c: Execution context + * @src: IPv4 source address + * @sport: UDP source port + * @dst: IPv4 destination address + * @dport: UDP destination port + * @in: UDP payload contents (not including UDP header) + * @dlen: UDP payload length (not including UDP header) + */ +void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, + struct in_addr dst, in_port_t dport, + const void *in, size_t dlen) +{ + size_t l4len = dlen + sizeof(struct udphdr); + char buf[USHRT_MAX]; + struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); + struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP); + char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen); + memcpy(data, in, dlen); tap_send_single(c, buf, dlen + (data - buf)); } @@ -228,10 +281,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, * * Return: pointer at which to write the packet's payload */ -static void *tap_push_ip6h(struct ipv6hdr *ip6h, - const struct in6_addr *src, - const struct in6_addr *dst, - size_t l4len, uint8_t proto, uint32_t flow) +void *tap_push_ip6h(struct ipv6hdr *ip6h, + const struct in6_addr *src, const struct in6_addr *dst, + size_t l4len, uint8_t proto, uint32_t flow) { ip6h->payload_len = htons(l4len); ip6h->priority = 0; @@ -240,14 +292,12 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h, ip6h->hop_limit = 255; ip6h->saddr = *src; ip6h->daddr = *dst; - ip6h->flow_lbl[0] = (flow >> 16) & 0xf; - ip6h->flow_lbl[1] = (flow >> 8) & 0xff; - ip6h->flow_lbl[2] = (flow >> 0) & 0xff; - return ip6h + 1; + ip6_set_flow_lbl(ip6h, flow); + return (char *)ip6h + sizeof(*ip6h); } /** - * tap_udp6_send() - Send UDP over IPv6 packet + * tap_push_uh6() - Build UDPv6 header with checksum * @c: Execution context * @src: IPv6 source address * @sport: UDP source port @@ -256,18 +306,15 @@ static void *tap_push_ip6h(struct 
ipv6hdr *ip6h, * @flow: Flow label * @in: UDP payload contents (not including UDP header) * @dlen: UDP payload length (not including UDP header) + * + * Return: pointer at which to write the packet's payload */ -void tap_udp6_send(const struct ctx *c, +void *tap_push_uh6(struct udphdr *uh, const struct in6_addr *src, in_port_t sport, const struct in6_addr *dst, in_port_t dport, - uint32_t flow, void *in, size_t dlen) + void *in, size_t dlen) { size_t l4len = dlen + sizeof(struct udphdr); - char buf[USHRT_MAX]; - struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); - struct udphdr *uh = tap_push_ip6h(ip6h, src, dst, - l4len, IPPROTO_UDP, flow); - char *data = (char *)(uh + 1); const struct iovec iov = { .iov_base = in, .iov_len = dlen @@ -278,8 +325,33 @@ void tap_udp6_send(const struct ctx *c, uh->dest = htons(dport); uh->len = htons(l4len); csum_udp6(uh, src, dst, &payload); - memcpy(data, in, dlen); + return (char *)uh + sizeof(*uh); +} +/** + * tap_udp6_send() - Send UDP over IPv6 packet + * @c: Execution context + * @src: IPv6 source address + * @sport: UDP source port + * @dst: IPv6 destination address + * @dport: UDP destination port + * @flow: Flow label + * @in: UDP payload contents (not including UDP header) + * @dlen: UDP payload length (not including UDP header) + */ +void tap_udp6_send(const struct ctx *c, + const struct in6_addr *src, in_port_t sport, + const struct in6_addr *dst, in_port_t dport, + uint32_t flow, void *in, size_t dlen) +{ + size_t l4len = dlen + sizeof(struct udphdr); + char buf[USHRT_MAX]; + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); + struct udphdr *uh = tap_push_ip6h(ip6h, src, dst, + l4len, IPPROTO_UDP, flow); + char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen); + + memcpy(data, in, dlen); tap_send_single(c, buf, dlen + (data - buf)); } @@ -490,6 +562,7 @@ static struct tap4_l4_t { * struct l4_seq6_t - Message sequence for one protocol handler call, IPv6 * @msgs: Count of messages in sequence * 
@protocol: Protocol number + * @flow_lbl: IPv6 flow label * @source: Source port * @dest: Destination port * @saddr: Source address @@ -498,6 +571,7 @@ static struct tap4_l4_t { */ static struct tap6_l4_t { uint8_t protocol; + uint32_t flow_lbl :20; uint16_t source; uint16_t dest; @@ -743,7 +817,7 @@ append: for (k = 0; k < p->count; ) k += tcp_tap_handler(c, PIF_TAP, AF_INET, &seq->saddr, &seq->daddr, - p, k, now); + 0, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; @@ -871,6 +945,7 @@ resume: ((seq)->protocol == (proto) && \ (seq)->source == (uh)->source && \ (seq)->dest == (uh)->dest && \ + (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \ IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \ IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)) @@ -879,6 +954,7 @@ resume: (seq)->protocol = (proto); \ (seq)->source = (uh)->source; \ (seq)->dest = (uh)->dest; \ + (seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \ (seq)->saddr = *saddr; \ (seq)->daddr = *daddr; \ } while (0) @@ -924,7 +1000,7 @@ append: for (k = 0; k < p->count; ) k += tcp_tap_handler(c, PIF_TAP, AF_INET6, &seq->saddr, &seq->daddr, - p, k, now); + seq->flow_lbl, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; @@ -1036,7 +1112,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) do { n = recv(c->fd_tap, pkt_buf + partial_len, - TAP_BUF_BYTES - partial_len, MSG_DONTWAIT); + sizeof(pkt_buf) - partial_len, MSG_DONTWAIT); } while ((n < 0) && errno == EINTR); if (n < 0) { @@ -1053,7 +1129,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) while (n >= (ssize_t)sizeof(uint32_t)) { uint32_t l2len = ntohl_unaligned(p); - if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) { + if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) { err("Bad frame size from guest, resetting connection"); tap_sock_reset(c); return; @@ -1107,8 +1183,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) 
tap_flush_pools(); - for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) { - len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); + for (n = 0; + n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA); + n += len) { + len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA); if (len == 0) { die("EOF on tap device, exiting"); @@ -1126,7 +1204,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) /* Ignore frames of bad length */ if (len < (ssize_t)sizeof(struct ethhdr) || - len > (ssize_t)ETH_MAX_MTU) + len > (ssize_t)L2_MAX_LEN_PASTA) continue; tap_add_packet(c, len, pkt_buf + n); @@ -1152,68 +1230,6 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, } /** - * tap_sock_unix_open() - Create and bind AF_UNIX socket - * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix) - * - * Return: socket descriptor on success, won't return on failure - */ -int tap_sock_unix_open(char *sock_path) -{ - int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - int i; - - if (fd < 0) - die_perror("Failed to open UNIX domain socket"); - - for (i = 1; i < UNIX_SOCK_MAX; i++) { - char *path = addr.sun_path; - int ex, ret; - - if (*sock_path) - memcpy(path, sock_path, UNIX_PATH_MAX); - else if (snprintf_check(path, UNIX_PATH_MAX - 1, - UNIX_SOCK_PATH, i)) - die_perror("Can't build UNIX domain socket path"); - - ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, - 0); - if (ex < 0) - die_perror("Failed to check for UNIX domain conflicts"); - - ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); - if (!ret || (errno != ENOENT && errno != ECONNREFUSED && - errno != EACCES)) { - if (*sock_path) - die("Socket path %s already in use", path); - - close(ex); - continue; - } - close(ex); - - unlink(path); - ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr)); - if (*sock_path && ret) - die_perror("Failed to bind UNIX domain socket"); - - if 
(!ret) - break; - } - - if (i == UNIX_SOCK_MAX) - die_perror("Failed to bind UNIX domain socket"); - - info("UNIX domain socket bound at %s", addr.sun_path); - if (!*sock_path) - memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX); - - return fd; -} - -/** * tap_backend_show_hints() - Give help information to start QEMU * @c: Execution context */ @@ -1423,6 +1439,8 @@ void tap_backend_init(struct ctx *c) tap_sock_tun_init(c); break; case MODE_VU: + repair_sock_init(c); + /* fall through */ case MODE_PASST: tap_sock_unix_init(c); @@ -6,7 +6,32 @@ #ifndef TAP_H #define TAP_H -#define ETH_HDR_INIT(proto) { .h_proto = htons_constant(proto) } +/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header) + * + * The kernel tuntap device imposes a maximum frame size of 65535 including + * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode). + */ +#define L2_MAX_LEN_PASTA USHRT_MAX + +/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header) + * + * The only structural limit the QEMU socket protocol imposes on frames is + * (2^32-1) bytes, but that would be ludicrously long in practice. For now, + * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate + * limit with more precision. + */ +#define L2_MAX_LEN_PASST USHRT_MAX + +/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header) + * + * vhost-user allows multiple buffers per frame, each of which can be quite + * large, so the inherent frame size limit is rather large. Much larger than is + * actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME: + * Work out an appropriate limit with more precision. 
+ */ +#define L2_MAX_LEN_VU USHRT_MAX + +struct udphdr; /** * struct tap_hdr - tap backend specific headers @@ -44,6 +69,23 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = htonl(l2len); } +unsigned long tap_l2_max_len(const struct ctx *c); +void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t l4len, uint8_t proto); +void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport, + struct in_addr dst, in_port_t dport, + const void *in, size_t dlen); +void *tap_push_uh6(struct udphdr *uh, + const struct in6_addr *src, in_port_t sport, + const struct in6_addr *dst, in_port_t dport, + void *in, size_t dlen); +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t l4len, uint8_t proto); +void *tap_push_ip6h(struct ipv6hdr *ip6h, + const struct in6_addr *src, + const struct in6_addr *dst, + size_t l4len, uint8_t proto, uint32_t flow); void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, const void *in, size_t dlen); @@ -51,6 +93,9 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, const void *in, size_t l4len); const struct in6_addr *tap_ip6_daddr(const struct ctx *c, const struct in6_addr *src); +void *tap_push_ip6h(struct ipv6hdr *ip6h, + const struct in6_addr *src, const struct in6_addr *dst, + size_t l4len, uint8_t proto, uint32_t flow); void tap_udp6_send(const struct ctx *c, const struct in6_addr *src, in_port_t sport, const struct in6_addr *dst, in_port_t dport, @@ -280,6 +280,7 @@ #include <stddef.h> #include <string.h> #include <sys/epoll.h> +#include <sys/ioctl.h> #include <sys/socket.h> #include <sys/timerfd.h> #include <sys/types.h> @@ -287,6 +288,8 @@ #include <time.h> #include <arpa/inet.h> +#include <linux/sockios.h> + #include "checksum.h" #include "util.h" #include "iov.h" 
@@ -299,6 +302,7 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "repair.h" #include "linux_dep.h" #include "flow_table.h" @@ -306,6 +310,21 @@ #include "tcp_buf.h" #include "tcp_vu.h" +#ifndef __USE_MISC +/* From Linux UAPI, missing in netinet/tcp.h provided by musl */ +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; +#endif + /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ @@ -326,6 +345,19 @@ ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) #define CONN_HAS(conn, set) (((conn)->events & (set)) == (set)) +/* Buffers to migrate pending data from send and receive queues. No, they don't + * use memory if we don't use them. And we're going away after this, so splurge. + */ +#define TCP_MIGRATE_SND_QUEUE_MAX (64 << 20) +#define TCP_MIGRATE_RCV_QUEUE_MAX (64 << 20) +uint8_t tcp_migrate_snd_queue [TCP_MIGRATE_SND_QUEUE_MAX]; +uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX]; + +#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */ + +/* "Extended" data (not stored in the flow table) for TCP flow migration */ +static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX]; + static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -338,7 +370,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = { "SYN_RCVD", /* approximately maps to TAP_SYN_ACK_SENT */ /* Passive close: */ - "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK", + "CLOSE_WAIT", "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", /* Active close (+5): */ "CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT", }; @@ -519,8 +551,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) fd = timerfd_create(CLOCK_MONOTONIC, 0); if (fd == -1 || fd > FD_REF_MAX) { - flow_dbg(conn, 
"failed to get timer: %s", - strerror_(errno)); + flow_dbg_perror(conn, "failed to get timer"); if (fd > -1) close(fd); conn->timer = -1; @@ -529,8 +560,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) conn->timer = fd; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { - flow_dbg(conn, "failed to add timer: %s", - strerror_(errno)); + flow_dbg_perror(conn, "failed to add timer"); close(conn->timer); conn->timer = -1; return; @@ -555,7 +585,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); if (timerfd_settime(conn->timer, 0, &it, NULL)) - flow_err(conn, "failed to set timer: %s", strerror_(errno)); + flow_perror(conn, "failed to set timer"); } /** @@ -739,24 +769,6 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) } /** - * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values - * @s: Socket, can be -1 to avoid check in the caller - */ -static void tcp_sock_set_bufsize(const struct ctx *c, int s) -{ - int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */ - - if (s == -1) - return; - - if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_RCVBUF to %i", v); - - if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_SNDBUF to %i", v); -} - -/** * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm) * @s: Socket, can be -1 to avoid check in the caller */ @@ -775,7 +787,8 @@ static void tcp_sock_set_nodelay(int s) * @th: TCP header (updated) * @payload: TCP payload */ -void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload) +static void tcp_update_csum(uint32_t psum, struct tcphdr *th, + struct iov_tail *payload) { th->check = 0; psum = csum_unfolded(th, sizeof(*th), psum); @@ -951,9 +964,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn, ip6h->version 
= 6; ip6h->nexthdr = IPPROTO_TCP; - ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf; - ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; - ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; + ip6_set_flow_lbl(ip6h, conn->sock); if (!no_tcp_csum) { psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, @@ -1127,7 +1138,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, if (flags & SYN) { int mss; - if (c->mtu == -1) { + if (!c->mtu) { mss = tinfo.tcpi_snd_mss; } else { mss = c->mtu - sizeof(struct tcphdr); @@ -1202,8 +1213,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->events == CLOSED) return; - if (!tcp_send_flag(c, conn, RST)) - conn_event(c, conn, CLOSED); + tcp_send_flag(c, conn, RST); + conn_event(c, conn, CLOSED); } /** @@ -1278,12 +1289,11 @@ int tcp_conn_pool_sock(int pool[]) /** * tcp_conn_new_sock() - Open and prepare new socket for connection - * @c: Execution context * @af: Address family * * Return: socket number on success, negative code if socket creation failed */ -static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) +static int tcp_conn_new_sock(sa_family_t af) { int s; @@ -1297,7 +1307,6 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) if (s < 0) return -errno; - tcp_sock_set_bufsize(c, s); tcp_sock_set_nodelay(s); return s; @@ -1305,12 +1314,11 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) /** * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace - * @c: Execution context * @af: Address family (AF_INET or AF_INET6) * * Return: Socket fd on success, -errno on failure */ -int tcp_conn_sock(const struct ctx *c, sa_family_t af) +int tcp_conn_sock(sa_family_t af) { int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4; int s; @@ -1321,7 +1329,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af) /* If the pool is empty we just open a new one without refilling the * pool to keep latency down. 
*/ - if ((s = tcp_conn_new_sock(c, af)) >= 0) + if ((s = tcp_conn_new_sock(af)) >= 0) return s; err("TCP: Unable to open socket for new connection: %s", @@ -1375,10 +1383,10 @@ static void tcp_bind_outbound(const struct ctx *c, if (bind(s, &bind_sa.sa, sl)) { char sstr[INANY_ADDRSTRLEN]; - flow_dbg(conn, - "Can't bind TCP outbound socket to %s:%hu: %s", - inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), - tgt->oport, strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind TCP outbound socket to %s:%hu", + inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), + tgt->oport); } } @@ -1387,9 +1395,9 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip4.ifname_out, strlen(c->ip4.ifname_out))) { - flow_dbg(conn, "Can't bind IPv4 TCP socket to" - " interface %s: %s", c->ip4.ifname_out, - strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind IPv4 TCP socket to interface %s", + c->ip4.ifname_out); } } } else if (bind_sa.sa_family == AF_INET6) { @@ -1397,9 +1405,9 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip6.ifname_out, strlen(c->ip6.ifname_out))) { - flow_dbg(conn, "Can't bind IPv6 TCP socket to" - " interface %s: %s", c->ip6.ifname_out, - strerror_(errno)); + flow_dbg_perror(conn, + "Can't bind IPv6 TCP socket to interface %s", + c->ip6.ifname_out); } } } @@ -1462,7 +1470,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, goto cancel; } - if ((s = tcp_conn_sock(c, af)) < 0) + if ((s = tcp_conn_sock(af)) < 0) goto cancel; pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport); @@ -1483,12 +1491,13 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, } else { /* Not a local, bound destination, inconclusive test */ close(s); - if ((s = tcp_conn_sock(c, af)) < 0) + if ((s = tcp_conn_sock(af)) < 0) goto cancel; } conn->sock = s; conn->timer = -1; + conn->listening_sock = -1; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = 
WINDOW_DEFAULT; @@ -1536,12 +1545,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ sl = sizeof(sa); - if (!getsockname(s, &sa.sa, &sl)) { + if (!getsockname(s, &sa.sa, &sl)) inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa); - } else { - err("Failed to get local address for socket: %s", - strerror_(errno)); - } + else + err_perror("Can't get local address for socket %i", s); } FLOW_ACTIVATE(conn); @@ -1664,8 +1671,10 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp_send_flag(c, conn, ACK); tcp_timer_ctl(c, conn); - if (p->count == 1) + if (p->count == 1) { + tcp_tap_window_update(conn, ntohs(th->window)); return 1; + } continue; } @@ -1859,12 +1868,82 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, } /** + * tcp_rst_no_conn() - Send RST in response to a packet with no connection + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @saddr: Source address of the packet we're responding to + * @daddr: Destination address of the packet we're responding to + * @flow_lbl: IPv6 flow label (ignored for IPv4) + * @th: TCP header of the packet we're responding to + * @l4len: Packet length, including TCP header + */ +static void tcp_rst_no_conn(const struct ctx *c, int af, + const void *saddr, const void *daddr, + uint32_t flow_lbl, + const struct tcphdr *th, size_t l4len) +{ + struct iov_tail payload = IOV_TAIL(NULL, 0, 0); + struct tcphdr *rsth; + char buf[USHRT_MAX]; + uint32_t psum = 0; + size_t rst_l2len; + + /* Don't respond to RSTs without a connection */ + if (th->rst) + return; + + if (af == AF_INET) { + struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); + const struct in_addr *rst_src = daddr; + const struct in_addr *rst_dst = saddr; + + rsth = tap_push_ip4h(ip4h, *rst_src, *rst_dst, + sizeof(*rsth), IPPROTO_TCP); + psum = proto_ipv4_header_psum(sizeof(*rsth), IPPROTO_TCP, + *rst_src, *rst_dst); + + 
} else { + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); + const struct in6_addr *rst_src = daddr; + const struct in6_addr *rst_dst = saddr; + + rsth = tap_push_ip6h(ip6h, rst_src, rst_dst, + sizeof(*rsth), IPPROTO_TCP, flow_lbl); + psum = proto_ipv6_header_psum(sizeof(*rsth), IPPROTO_TCP, + rst_src, rst_dst); + } + + memset(rsth, 0, sizeof(*rsth)); + + rsth->source = th->dest; + rsth->dest = th->source; + rsth->rst = 1; + rsth->doff = sizeof(*rsth) / 4UL; + + /* Sequence matching logic from RFC 9293 section 3.10.7.1 */ + if (th->ack) { + rsth->seq = th->ack_seq; + } else { + size_t dlen = l4len - th->doff * 4UL; + uint32_t ack = ntohl(th->seq) + dlen; + + rsth->ack_seq = htonl(ack); + rsth->ack = 1; + } + + tcp_update_csum(psum, rsth, &payload); + rst_l2len = ((char *)rsth - buf) + sizeof(*rsth); + tap_send_single(c, buf, rst_l2len); +} + +/** * tcp_tap_handler() - Handle packets from tap and state transitions * @c: Execution context * @pif: pif on which the packet is arriving * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address + * @flow_lbl: IPv6 flow label (ignored for IPv4) * @p: Pool of TCP packets, with TCP headers * @idx: Index of first packet in pool to process * @now: Current timestamp @@ -1872,7 +1951,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, * Return: count of consumed packets */ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - const void *saddr, const void *daddr, + const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now) { struct tcp_tap_conn *conn; @@ -1905,6 +1984,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (opts && th->syn && !th->ack) tcp_conn_from_tap(c, af, saddr, daddr, th, opts, optlen, now); + else + tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, len); return 1; } @@ -1968,6 +2049,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, 
sa_family_t af, /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + tcp_tap_window_update(conn, ntohs(th->window)); + tcp_data_from_sock(c, conn); if (conn->events & SOCK_FIN_RCVD && conn->seq_ack_from_tap == conn->seq_to_tap) @@ -1987,10 +2070,27 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, ack_due = 1; if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) { + socklen_t sl; + struct tcp_info tinfo; + shutdown(conn->sock, SHUT_WR); conn_event(c, conn, SOCK_FIN_SENT); tcp_send_flag(c, conn, ACK); ack_due = 0; + + /* If we received a FIN, but the socket is in TCP_ESTABLISHED + * state, it must be a migrated socket. The kernel saw the FIN + * on the source socket, but not on the target socket. + * + * Approximate the effect of that FIN: as we're sending a FIN + * out ourselves, the socket is now in a state equivalent to + * LAST_ACK. Now that we sent the FIN out, close it with a RST. 
+ */ + sl = sizeof(tinfo); + getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl); + if (tinfo.tcpi_state == TCP_ESTABLISHED && + conn->events & SOCK_FIN_RCVD) + goto reset; } if (ack_due) @@ -2073,9 +2173,10 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { - const struct flowside *ini; + struct tcp_tap_conn *conn; union sockaddr_inany sa; socklen_t sl = sizeof(sa); + struct flowside *ini; union flow *flow; int s; @@ -2088,15 +2189,25 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; - tcp_sock_set_bufsize(c, s); + conn = (struct tcp_tap_conn *)flow; + conn->listening_sock = ref.fd; + tcp_sock_set_nodelay(s); - /* FIXME: When listening port has a specific bound address, record that - * as our address + /* FIXME: If useful: when the listening port has a specific bound + * address, record that as our address, as implemented for vhost-user + * mode only, below. */ ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, ref.tcp_listen.port); + if (c->mode == MODE_VU) { /* Rebind to same address after migration */ + if (!getsockname(s, &sa.sa, &sl)) + inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa); + else + err_perror("Can't get local address for socket %i", s); + } + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { char sastr[SOCKADDR_STRLEN]; @@ -2151,7 +2262,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * and we just set the timer to a new point in the future: discard it. 
*/ if (timerfd_gettime(conn->timer, &check_armed)) - flow_err(conn, "failed to read timer: %s", strerror_(errno)); + flow_perror(conn, "failed to read timer"); if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) return; @@ -2173,6 +2284,8 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; + if (!conn->wnd_from_tap) + conn->wnd_from_tap = 1; /* Zero-window probe */ if (tcp_set_peek_offset(conn->sock, 0)) { tcp_rst(c, conn); } else { @@ -2191,8 +2304,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. */ if (timerfd_settime(conn->timer, 0, &new, &old)) - flow_err(conn, "failed to set timer: %s", - strerror_(errno)); + flow_perror(conn, "failed to set timer"); if (old.it_value.tv_sec == ACT_TIMEOUT) { flow_dbg(conn, "activity timeout"); @@ -2386,7 +2498,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) * @c: Execution context * @port: Port, host order */ -void tcp_ns_sock_init(const struct ctx *c, in_port_t port) +static void tcp_ns_sock_init(const struct ctx *c, in_port_t port) { ASSERT(!c->no_tcp); @@ -2422,13 +2534,12 @@ static int tcp_ns_socks_init(void *arg) /** * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets - * @c: Execution context * @pool: Pool of sockets to refill * @af: Address family to use * * Return: 0 on success, negative error code if there was at least one error */ -int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) +int tcp_sock_refill_pool(int pool[], sa_family_t af) { int i; @@ -2438,7 +2549,7 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) if (pool[i] >= 0) continue; - if ((fd = tcp_conn_new_sock(c, af)) < 0) + if ((fd = tcp_conn_new_sock(af)) < 0) return fd; pool[i] = fd; @@ -2454,13 +2565,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) static 
void tcp_sock_refill_init(const struct ctx *c) { if (c->ifi4) { - int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET); + int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 host socket pool: %s", strerror_(-rc)); } if (c->ifi6) { - int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6); + int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 host socket pool: %s", strerror_(-rc)); @@ -2645,3 +2756,879 @@ void tcp_timer(struct ctx *c, const struct timespec *now) if (c->mode == MODE_PASTA) tcp_splice_refill(c); } + +/** + * tcp_flow_is_established() - Was the connection established? Includes closing + * @conn: Pointer to the TCP connection structure + * + * Return: true if the connection was established, false otherwise + */ +bool tcp_flow_is_established(const struct tcp_tap_conn *conn) +{ + return conn->events & ESTABLISHED; +} + +/** + * tcp_flow_repair_on() - Enable repair mode for a single TCP flow + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn) +{ + int rc = 0; + + if (conn->sock < 0) + return 0; + + if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON))) + err("Failed to set TCP_REPAIR"); + + return rc; +} + +/** + * tcp_flow_repair_off() - Clear repair mode for a single TCP flow + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn) +{ + int rc = 0; + + if (conn->sock < 0) + return 0; + + if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF))) + err("Failed to clear TCP_REPAIR"); + + return rc; +} + +/** + * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options + * @c: Execution context + * @t: Extended migration data + * 
+ * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t) +{ + struct tcp_info tinfo; + socklen_t sl; + + sl = sizeof(tinfo); + if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { + int rc = -errno; + err_perror("Querying TCP_INFO, socket %i", s); + return rc; + } + + t->snd_ws = tinfo.tcpi_snd_wscale; + t->rcv_ws = tinfo.tcpi_rcv_wscale; + t->tcpi_state = tinfo.tcpi_state; + t->tcpi_options = tinfo.tcpi_options; + + return 0; +} + +/** + * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG + * @c: Execution context + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t) +{ + socklen_t sl = sizeof(t->mss); + + if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &sl)) { + int rc = -errno; + err_perror("Getting MSS, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters + * @c: Execution context + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t) +{ + struct tcp_repair_window wnd; + socklen_t sl = sizeof(wnd); + + if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) { + int rc = -errno; + err_perror("Getting window repair data, socket %i", s); + return rc; + } + + t->snd_wl1 = wnd.snd_wl1; + t->snd_wnd = wnd.snd_wnd; + t->max_window = wnd.max_window; + t->rcv_wnd = wnd.rcv_wnd; + t->rcv_wup = wnd.rcv_wup; + + /* If we received a FIN, we also need to adjust window parameters. + * + * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state. 
+ */ + if (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK) { + t->rcv_wup--; + t->rcv_wnd++; + } + + return 0; +} + +/** + * tcp_flow_repair_wnd() - Restore window parameters from extended data + * @c: Execution context + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t) +{ + struct tcp_repair_window wnd; + + wnd.snd_wl1 = t->snd_wl1; + wnd.snd_wnd = t->snd_wnd; + wnd.max_window = t->max_window; + wnd.rcv_wnd = t->rcv_wnd; + wnd.rcv_wup = t->rcv_wup; + + if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd))) { + int rc = -errno; + err_perror("Setting window data, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_select_queue() - Select queue (receive or send) for next operation + * @s: Socket + * @queue: TCP_RECV_QUEUE or TCP_SEND_QUEUE + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_select_queue(int s, int queue) +{ + if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) { + int rc = -errno; + err_perror("Selecting TCP_SEND_QUEUE, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data + * @s: Socket + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + * + * #syscalls:vu ioctl + */ +static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t) +{ + ssize_t rc; + + if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) { + rc = -errno; + err_perror("Getting send queue size, socket %i", s); + return rc; + } + + if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) { + rc = -errno; + err_perror("Getting not sent count, socket %i", s); + return rc; + } + + /* If we sent a FIN, SIOCOUTQ and SIOCOUTQNSD are one greater than the + * actual pending queue length, because they are based on the sequence + * numbers, not directly on the buffer 
contents. + * + * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state. + */ + if (t->tcpi_state == TCP_FIN_WAIT1 || t->tcpi_state == TCP_FIN_WAIT2 || + t->tcpi_state == TCP_LAST_ACK || t->tcpi_state == TCP_CLOSING) { + if (t->sndq) + t->sndq--; + if (t->notsent) + t->notsent--; + } + + if (t->notsent > t->sndq) { + err("Invalid notsent count socket %i, send: %u, not sent: %u", + s, t->sndq, t->notsent); + return -EINVAL; + } + + if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) { + err("Send queue too large to migrate socket %i: %u bytes", + s, t->sndq); + return -ENOBUFS; + } + + rc = recv(s, tcp_migrate_snd_queue, + MIN(t->sndq, TCP_MIGRATE_SND_QUEUE_MAX), MSG_PEEK); + if (rc < 0) { + if (errno == EAGAIN) { /* EAGAIN means empty */ + rc = 0; + } else { + rc = -errno; + err_perror("Can't read send queue, socket %i", s); + return rc; + } + } + + if ((uint32_t)rc < t->sndq) { + err("Short read migrating send queue"); + return -ENXIO; + } + + t->notsent = MIN(t->notsent, t->sndq); + + return 0; +} + +/** + * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue + * @s: Socket + * @len: Length of data to be restored + * @buf: Buffer with content of pending data queue + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf) +{ + size_t chunk = len; + uint8_t *p = buf; + + while (len > 0) { + ssize_t rc = send(s, p, MIN(len, chunk), 0); + + if (rc < 0) { + if ((errno == ENOBUFS || errno == ENOMEM) && + chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) { + chunk /= 2; + continue; + } + + rc = -errno; + err_perror("Can't write queue, socket %i", s); + return rc; + } + + len -= rc; + p += rc; + } + + return 0; +} + +/** + * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue + * @s: Socket + * @v: Sequence value, set on return + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_seq(int s, uint32_t *v) +{ + socklen_t sl = 
sizeof(*v); + + if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) { + int rc = -errno; + err_perror("Dumping sequence, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_repair_seq() - Restore sequence for pre-selected queue + * @s: Socket + * @v: Sequence value to be set + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_seq(int s, const uint32_t *v) +{ + if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) { + int rc = -errno; + err_perror("Setting sequence, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it + * @s: Socket + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + * + * #syscalls:vu ioctl + */ +static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t) +{ + ssize_t rc; + + if (ioctl(s, SIOCINQ, &t->rcvq) < 0) { + rc = -errno; + err_perror("Get receive queue size, socket %i", s); + return rc; + } + + /* If we received a FIN, SIOCINQ is one greater than the actual number + * of bytes on the queue, because it's based on the sequence number + * rather than directly on the buffer contents. + * + * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state. 
+ */ + if (t->rcvq && + (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK)) + t->rcvq--; + + if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) { + err("Receive queue too large to migrate socket %i: %u bytes", + s, t->rcvq); + return -ENOBUFS; + } + + rc = recv(s, tcp_migrate_rcv_queue, t->rcvq, MSG_PEEK); + if (rc < 0) { + if (errno == EAGAIN) { /* EAGAIN means empty */ + rc = 0; + } else { + rc = -errno; + err_perror("Can't read receive queue for socket %i", s); + return rc; + } + } + + if ((uint32_t)rc < t->rcvq) { + err("Short read migrating receive queue"); + return -ENXIO; + } + + return 0; +} + +/** + * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps) + * @s: Socket + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t) +{ + const struct tcp_repair_opt opts[] = { + { TCPOPT_WINDOW, t->snd_ws + (t->rcv_ws << 16) }, + { TCPOPT_MAXSEG, t->mss }, + { TCPOPT_SACK_PERMITTED, 0 }, + { TCPOPT_TIMESTAMP, 0 }, + }; + socklen_t sl; + + sl = sizeof(opts[0]) * (2 + + !!(t->tcpi_options & TCPI_OPT_SACK) + + !!(t->tcpi_options & TCPI_OPT_TIMESTAMPS)); + + if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) { + int rc = -errno; + err_perror("Setting repair options, socket %i", s); + return rc; + } + + return 0; +} + +/** + * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening + * @fd: Descriptor for state migration + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) +{ + struct tcp_tap_transfer t = { + .retrans = conn->retrans, + .ws_from_tap = conn->ws_from_tap, + .ws_to_tap = conn->ws_to_tap, + .events = conn->events, + + .tap_mss = htonl(MSS_GET(conn)), + + .sndbuf = htonl(conn->sndbuf), + + .flags = conn->flags, + .seq_dup_ack_approx = conn->seq_dup_ack_approx, + 
+ .wnd_from_tap = htons(conn->wnd_from_tap), + .wnd_to_tap = htons(conn->wnd_to_tap), + + .seq_to_tap = htonl(conn->seq_to_tap), + .seq_ack_from_tap = htonl(conn->seq_ack_from_tap), + .seq_from_tap = htonl(conn->seq_from_tap), + .seq_ack_to_tap = htonl(conn->seq_ack_to_tap), + .seq_init_from_tap = htonl(conn->seq_init_from_tap), + }; + + memcpy(&t.pif, conn->f.pif, sizeof(t.pif)); + memcpy(&t.side, conn->f.side, sizeof(t.side)); + + if (write_all_buf(fd, &t, sizeof(t))) { + int rc = -errno; + err_perror("Can't write migration data, socket %i", conn->sock); + return rc; + } + + if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD)) + close(conn->listening_sock); + + return 0; +} + +/** + * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data + * @fd: Descriptor for state migration + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure + */ +int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) +{ + uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; + struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)]; + int s = conn->sock; + int rc; + + /* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode + * weird. 
+ */ + if (tcp_set_peek_offset(s, -1)) { + rc = -errno; + goto fail; + } + + if ((rc = tcp_flow_dump_tinfo(s, t))) + goto fail; + + if ((rc = tcp_flow_dump_mss(s, t))) + goto fail; + + if ((rc = tcp_flow_dump_wnd(s, t))) + goto fail; + + if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE))) + goto fail; + + if ((rc = tcp_flow_dump_sndqueue(s, t))) + goto fail; + + if ((rc = tcp_flow_dump_seq(s, &t->seq_snd))) + goto fail; + + if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE))) + goto fail; + + if ((rc = tcp_flow_dump_rcvqueue(s, t))) + goto fail; + + if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv))) + goto fail; + + close(s); + + /* Adjustments unrelated to FIN segments: sequence numbers we dumped are + * based on the end of the queues. + */ + t->seq_rcv -= t->rcvq; + t->seq_snd -= t->sndq; + + debug("Extended migration data, socket %i sequences send %u receive %u", + s, t->seq_snd, t->seq_rcv); + debug(" pending queues: send %u not sent %u receive %u", + t->sndq, t->notsent, t->rcvq); + debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup); + debug(" SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? 
"enabled" : "disabled", peek_offset); + + /* Endianness fix-ups */ + t->seq_snd = htonl(t->seq_snd); + t->seq_rcv = htonl(t->seq_rcv); + t->sndq = htonl(t->sndq); + t->notsent = htonl(t->notsent); + t->rcvq = htonl(t->rcvq); + + t->snd_wl1 = htonl(t->snd_wl1); + t->snd_wnd = htonl(t->snd_wnd); + t->max_window = htonl(t->max_window); + t->rcv_wnd = htonl(t->rcv_wnd); + t->rcv_wup = htonl(t->rcv_wup); + + if (write_all_buf(fd, t, sizeof(*t))) { + err_perror("Failed to write extended data, socket %i", s); + return -EIO; + } + + if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) { + err_perror("Failed to write send queue data, socket %i", s); + return -EIO; + } + + if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) { + err_perror("Failed to write receive queue data, socket %i", s); + return -EIO; + } + + return 0; + +fail: + /* For any type of failure dumping data, write an invalid extended data + * descriptor that allows us to keep the stream in sync, but tells the + * target to skip the flow. If we fail to transfer data, that's fatal: + * return -EIO in that case (and only in that case). + */ + t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */ + + if (write_all_buf(fd, t, sizeof(*t))) { + err_perror("Failed to write extended data, socket %i", s); + return -EIO; + } + + if (rc == -EIO) /* but not a migration data transfer failure */ + return -ENODATA; + + return rc; +} + +/** + * tcp_flow_repair_socket() - Open and bind socket, request repair mode + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) +{ + sa_family_t af = CONN_V4(conn) ? 
AF_INET : AF_INET6; + const struct flowside *sockside = HOSTFLOW(conn); + union sockaddr_inany a; + socklen_t sl; + int s, rc; + + pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport); + + if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, + IPPROTO_TCP)) < 0) { + rc = -errno; + err_perror("Failed to create socket for migrated flow"); + return rc; + } + s = conn->sock; + + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int))) + debug_perror("Setting SO_REUSEADDR on socket %i", s); + + tcp_sock_set_nodelay(s); + + if (bind(s, &a.sa, sizeof(a))) { + rc = -errno; + err_perror("Failed to bind socket for migrated flow"); + goto err; + } + + if ((rc = tcp_flow_repair_on(c, conn))) + goto err; + + return 0; + +err: + close(s); + conn->sock = -1; + return rc; +} + +/** + * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_connect(const struct ctx *c, + struct tcp_tap_conn *conn) +{ + const struct flowside *tgt = HOSTFLOW(conn); + int rc; + + rc = flowside_connect(c, conn->sock, PIF_HOST, tgt); + if (rc) { + rc = -errno; + err_perror("Failed to connect migrated socket %i", conn->sock); + return rc; + } + + conn->in_epoll = 0; + conn->timer = -1; + conn->listening_sock = -1; + + return 0; +} + +/** + * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert + * @c: Execution context + * @fd: Descriptor for state migration + * + * Return: 0 on success, negative on fatal failure, but 0 on single flow failure + */ +int tcp_flow_migrate_target(struct ctx *c, int fd) +{ + struct tcp_tap_transfer t; + struct tcp_tap_conn *conn; + union flow *flow; + int rc; + + if (!(flow = flow_alloc())) { + err("Flow table full on migration target"); + return 0; + } + + if (read_all_buf(fd, &t, sizeof(t))) { + 
flow_alloc_cancel(flow); + err_perror("Failed to receive migration data"); + return -errno; + } + + flow->f.state = FLOW_STATE_TGT; + memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif)); + memcpy(&flow->f.side, &t.side, sizeof(flow->f.side)); + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + + conn->retrans = t.retrans; + conn->ws_from_tap = t.ws_from_tap; + conn->ws_to_tap = t.ws_to_tap; + conn->events = t.events; + + conn->sndbuf = htonl(t.sndbuf); + + conn->flags = t.flags; + conn->seq_dup_ack_approx = t.seq_dup_ack_approx; + + MSS_SET(conn, ntohl(t.tap_mss)); + + conn->wnd_from_tap = ntohs(t.wnd_from_tap); + conn->wnd_to_tap = ntohs(t.wnd_to_tap); + + conn->seq_to_tap = ntohl(t.seq_to_tap); + conn->seq_ack_from_tap = ntohl(t.seq_ack_from_tap); + conn->seq_from_tap = ntohl(t.seq_from_tap); + conn->seq_ack_to_tap = ntohl(t.seq_ack_to_tap); + conn->seq_init_from_tap = ntohl(t.seq_init_from_tap); + + if ((rc = tcp_flow_repair_socket(c, conn))) { + flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc)); + /* Can't leave the flow in an incomplete state */ + FLOW_ACTIVATE(conn); + return 0; + } + + flow_hash_insert(c, TAP_SIDX(conn)); + FLOW_ACTIVATE(conn); + + return 0; +} + +/** + * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect + * @c: Execution context + * @conn: Connection entry to complete with extra data + * @fd: Descriptor for state migration + * + * Return: 0 on success, negative on fatal failure, but 0 on single flow failure + */ +int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd) +{ + uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; + struct tcp_tap_transfer_ext t; + int s = conn->sock, rc; + + if (read_all_buf(fd, &t, sizeof(t))) { + rc = -errno; + err_perror("Failed to read extended data for socket %i", s); + return rc; + } + + if (!t.tcpi_state) { /* Source wants us to skip this flow */ + flow_err(conn, "Dropping as requested by source"); + goto fail; + } + + /* Endianness 
fix-ups */ + t.seq_snd = ntohl(t.seq_snd); + t.seq_rcv = ntohl(t.seq_rcv); + t.sndq = ntohl(t.sndq); + t.notsent = ntohl(t.notsent); + t.rcvq = ntohl(t.rcvq); + + t.snd_wl1 = ntohl(t.snd_wl1); + t.snd_wnd = ntohl(t.snd_wnd); + t.max_window = ntohl(t.max_window); + t.rcv_wnd = ntohl(t.rcv_wnd); + t.rcv_wup = ntohl(t.rcv_wup); + + debug("Extended migration data, socket %i sequences send %u receive %u", + s, t.seq_snd, t.seq_rcv); + debug(" pending queues: send %u not sent %u receive %u", + t.sndq, t.notsent, t.rcvq); + debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup); + debug(" SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? "enabled" : "disabled", peek_offset); + + if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq || + t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) { + err("Bad queues socket %i, send: %u, not sent: %u, receive: %u", + s, t.sndq, t.notsent, t.rcvq); + return -EINVAL; + } + + if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) { + rc = -errno; + err_perror("Failed to read send queue data, socket %i", s); + return rc; + } + + if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) { + rc = -errno; + err_perror("Failed to read receive queue data, socket %i", s); + return rc; + } + + if (conn->sock < 0) + /* We weren't able to create the socket, discard flow */ + goto fail; + + if (tcp_flow_select_queue(s, TCP_SEND_QUEUE)) + goto fail; + + if (tcp_flow_repair_seq(s, &t.seq_snd)) + goto fail; + + if (tcp_flow_select_queue(s, TCP_RECV_QUEUE)) + goto fail; + + if (tcp_flow_repair_seq(s, &t.seq_rcv)) + goto fail; + + if (tcp_flow_repair_connect(c, conn)) + goto fail; + + if (tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue)) + goto fail; + + if (tcp_flow_select_queue(s, TCP_SEND_QUEUE)) + goto fail; + + if (tcp_flow_repair_queue(s, t.sndq - t.notsent, + tcp_migrate_snd_queue)) + goto fail; + + if (tcp_flow_repair_opt(s, &t)) + goto fail; + + /* If we sent a FIN sent and 
it was acknowledged (TCP_FIN_WAIT2), don't + * send it out, because we already sent it for sure. + * + * Call shutdown(x, SHUT_WR) in repair mode, so that we move to + * FIN_WAIT_1 (tcp_shutdown()) without sending anything + * (goto in tcp_write_xmit()). + */ + if (t.tcpi_state == TCP_FIN_WAIT2) { + int v; + + v = TCP_SEND_QUEUE; + if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) + debug_perror("Selecting repair queue, socket %i", s); + else + shutdown(s, SHUT_WR); + } + + if (tcp_flow_repair_wnd(s, &t)) + goto fail; + + tcp_flow_repair_off(c, conn); + repair_flush(c); + + if (t.notsent) { + if (tcp_flow_repair_queue(s, t.notsent, + tcp_migrate_snd_queue + + (t.sndq - t.notsent))) { + /* This sometimes seems to fail for unclear reasons. + * Don't fail the whole migration, just reset the flow + * and carry on to the next one. + */ + goto fail; + } + } + + /* If we sent a FIN but it wasn't acknowledged yet (TCP_FIN_WAIT1), send + * it out, because we don't know if we already sent it. + * + * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to + * TCP_FIN_WAIT1. 
+ */ + if (t.tcpi_state == TCP_FIN_WAIT1) + shutdown(s, SHUT_WR); + + if (tcp_set_peek_offset(conn->sock, peek_offset)) + goto fail; + + tcp_send_flag(c, conn, ACK); + tcp_data_from_sock(c, conn); + + if ((rc = tcp_epoll_ctl(c, conn))) { + debug("Failed to subscribe to epoll for migrated socket %i: %s", + conn->sock, strerror_(-rc)); + goto fail; + } + + return 0; + +fail: + if (conn->sock >= 0) { + tcp_flow_repair_off(c, conn); + repair_flush(c); + } + + conn->flags = 0; /* Not waiting for ACK, don't schedule timer */ + tcp_rst(c, conn); + + return 0; +} @@ -16,7 +16,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events); int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - const void *saddr, const void *daddr, + const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now); int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, const char *ifname, in_port_t port); @@ -19,6 +19,7 @@ * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @sock: Socket descriptor number * @events: Connection events, implying connection states + * @listening_sock: Listening socket this socket was accept()ed from, or -1 * @timer: timerfd descriptor for timeout events * @flags: Connection flags representing internal attributes * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS @@ -68,6 +69,7 @@ struct tcp_tap_conn { #define CONN_STATE_BITS /* Setting these clears other flags */ \ (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) + int listening_sock; int timer :FD_REF_BITS; @@ -97,6 +99,93 @@ struct tcp_tap_conn { }; /** + * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order + * @pif: Interfaces for each side of the flow + * @side: Addresses and ports for each side of the flow + * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT + * 
@ws_from_tap: Window scaling factor advertised from tap/guest + * @ws_to_tap: Window scaling factor advertised to tap/guest + * @events: Connection events, implying connection states + * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS + * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS + * @flags: Connection flags representing internal attributes + * @seq_dup_ack_approx: Last duplicate ACK number sent to tap + * @wnd_from_tap: Last window size from tap, unscaled (as received) + * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) + * @seq_to_tap: Next sequence for packets to tap + * @seq_ack_from_tap: Last ACK number received from tap + * @seq_from_tap: Next sequence for packets from tap (not actually sent) + * @seq_ack_to_tap: Last ACK number sent to tap + * @seq_init_from_tap: Initial sequence number from tap +*/ +struct tcp_tap_transfer { + uint8_t pif[SIDES]; + struct flowside side[SIDES]; + + uint8_t retrans; + uint8_t ws_from_tap; + uint8_t ws_to_tap; + uint8_t events; + + uint32_t tap_mss; + + uint32_t sndbuf; + + uint8_t flags; + uint8_t seq_dup_ack_approx; + + uint16_t wnd_from_tap; + uint16_t wnd_to_tap; + + uint32_t seq_to_tap; + uint32_t seq_ack_from_tap; + uint32_t seq_from_tap; + uint32_t seq_ack_to_tap; + uint32_t seq_init_from_tap; +} __attribute__((packed, aligned(__alignof__(uint32_t)))); + +/** + * struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order + * @seq_snd: Socket-side send sequence + * @seq_rcv: Socket-side receive sequence + * @sndq: Length of pending send queue (unacknowledged / not sent) + * @notsent: Part of pending send queue that wasn't sent out yet + * @rcvq: Length of pending receive queue + * @mss: Socket-side MSS clamp + * @snd_wl1: Next sequence used in window probe (next sequence - 1) + * @snd_wnd: Socket-side sending window + * @max_window: Window clamp + * @rcv_wnd: Socket-side receive window + * @rcv_wup: rcv_nxt on last window update sent + * @snd_ws: 
Window scaling factor, send + * @rcv_ws: Window scaling factor, receive + * @tcpi_state: Connection state in TCP_INFO style (enum, tcp_states.h) + * @tcpi_options: TCPI_OPT_* constants (timestamps, selective ACK) + */ +struct tcp_tap_transfer_ext { + uint32_t seq_snd; + uint32_t seq_rcv; + + uint32_t sndq; + uint32_t notsent; + uint32_t rcvq; + + uint32_t mss; + + /* We can't just use struct tcp_repair_window: we need network order */ + uint32_t snd_wl1; + uint32_t snd_wnd; + uint32_t max_window; + uint32_t rcv_wnd; + uint32_t rcv_wup; + + uint8_t snd_ws; + uint8_t rcv_ws; + uint8_t tcpi_state; + uint8_t tcpi_options; +} __attribute__((packed, aligned(__alignof__(uint32_t)))); + +/** * struct tcp_splice_conn - Descriptor for a spliced TCP connection * @f: Generic flow information * @s: File descriptor for sockets @@ -140,11 +229,23 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; bool tcp_flow_defer(const struct tcp_tap_conn *conn); + +int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn); +int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn); + +int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn); +int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn); + +int tcp_flow_migrate_target(struct ctx *c, int fd); +int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd); + +bool tcp_flow_is_established(const struct tcp_tap_conn *conn); + bool tcp_splice_flow_defer(struct tcp_splice_conn *conn); void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn); int tcp_conn_pool_sock(int pool[]); -int tcp_conn_sock(const struct ctx *c, sa_family_t af); -int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); +int tcp_conn_sock(sa_family_t af); +int tcp_sock_refill_pool(int pool[], sa_family_t af); void tcp_splice_refill(const struct ctx *c); #endif /* TCP_CONN_H */ diff --git a/tcp_internal.h 
b/tcp_internal.h index 9cf31f5..6f5e054 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -166,8 +166,6 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); struct tcp_info_linux; -void tcp_update_csum(uint32_t psum, struct tcphdr *th, - struct iov_tail *payload); void tcp_fill_headers(const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct iphdr *ip4h, struct ipv6hdr *ip6h, diff --git a/tcp_splice.c b/tcp_splice.c index f048a82..0d10e3d 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -131,8 +131,12 @@ static void tcp_splice_conn_epoll_events(uint16_t events, ev[1].events = EPOLLOUT; } - flow_foreach_sidei(sidei) - ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0; + flow_foreach_sidei(sidei) { + if (events & OUT_WAIT(sidei)) { + ev[sidei].events |= EPOLLOUT; + ev[!sidei].events &= ~EPOLLIN; + } + } } /** @@ -160,7 +164,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) || epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) { int ret = -errno; - flow_err(conn, "ERROR on epoll_ctl(): %s", strerror_(errno)); + flow_perror(conn, "ERROR on epoll_ctl()"); return ret; } @@ -313,8 +317,8 @@ static int tcp_splice_connect_finish(const struct ctx *c, if (conn->pipe[sidei][0] < 0) { if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) { - flow_err(conn, "cannot create %d->%d pipe: %s", - sidei, !sidei, strerror_(errno)); + flow_perror(conn, "cannot create %d->%d pipe", + sidei, !sidei); conn_flag(c, conn, CLOSING); return -EIO; } @@ -351,7 +355,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) int one = 1; if (tgtpif == PIF_HOST) - conn->s[1] = tcp_conn_sock(c, af); + conn->s[1] = tcp_conn_sock(af); else if (tgtpif == PIF_SPLICE) conn->s[1] = tcp_conn_sock_ns(c, af); else @@ -478,8 +482,7 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl); if (rc) - flow_err(conn, "Error retrieving 
SO_ERROR: %s", - strerror_(errno)); + flow_perror(conn, "Error retrieving SO_ERROR"); else flow_trace(conn, "Error event on socket: %s", strerror_(err)); @@ -552,7 +555,7 @@ eintr: if (readlen >= (long)c->tcp.pipe_size * 10 / 100) continue; - if (conn->flags & lowat_set_flag && + if (!(conn->flags & lowat_set_flag) && readlen > (long)c->tcp.pipe_size / 10) { int lowat = c->tcp.pipe_size / 4; @@ -703,13 +706,13 @@ static int tcp_sock_refill_ns(void *arg) ns_enter(c); if (c->ifi4) { - int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET); + int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 ns socket pool: %s", strerror_(-rc)); } if (c->ifi6) { - int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6); + int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 ns socket pool: %s", strerror_(-rc)); @@ -38,7 +38,6 @@ static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1]; static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; static int head[VIRTQUEUE_MAX_SIZE + 1]; -static int head_cnt; /** * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP) @@ -183,7 +182,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) static ssize_t tcp_vu_sock_recv(const struct ctx *c, const struct tcp_tap_conn *conn, bool v6, uint32_t already_sent, size_t fillsize, - int *iov_cnt) + int *iov_cnt, int *head_cnt) { struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; @@ -202,7 +201,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE); elem_cnt = 0; - head_cnt = 0; + *head_cnt = 0; while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) { struct iovec *iov; size_t frame_size, dlen; @@ -221,7 +220,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, ASSERT(iov->iov_len >= hdrlen); iov->iov_base = (char *)iov->iov_base + hdrlen; iov->iov_len -= hdrlen; - 
head[head_cnt++] = elem_cnt; + head[(*head_cnt)++] = elem_cnt; fillsize -= dlen; elem_cnt += cnt; @@ -261,17 +260,18 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, len -= iov->iov_len; } /* adjust head count */ - while (head_cnt > 0 && head[head_cnt - 1] > i) - head_cnt--; + while (*head_cnt > 0 && head[*head_cnt - 1] >= i) + (*head_cnt)--; + /* mark end of array */ - head[head_cnt] = i; + head[*head_cnt] = i; *iov_cnt = i; /* release unused buffers */ vu_queue_rewind(vq, elem_cnt - i); /* restore space for headers in iov */ - for (i = 0; i < head_cnt; i++) { + for (i = 0; i < *head_cnt; i++) { struct iovec *iov = &elem[head[i]].in_sg[0]; iov->iov_base = (char *)iov->iov_base - hdrlen; @@ -357,11 +357,11 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; ssize_t len, previous_dlen; + int i, iov_cnt, head_cnt; size_t hdrlen, fillsize; int v6 = CONN_V6(conn); uint32_t already_sent; const uint16_t *check; - int i, iov_cnt; if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { debug("Got packet, but RX virtqueue not usable yet"); @@ -396,7 +396,8 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) /* collect the buffers from vhost-user and fill them with the * data from the socket */ - len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt); + len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, + &iov_cnt, &head_cnt); if (len < 0) { if (len != -EAGAIN && len != -EWOULDBLOCK) { tcp_rst(c, conn); diff --git a/test/.gitignore b/test/.gitignore index 6dd4790..3573444 100644 --- a/test/.gitignore +++ b/test/.gitignore @@ -8,5 +8,6 @@ QEMU_EFI.fd *.raw.xz *.bin nstool +rampstream guest-key guest-key.pub diff --git a/test/Makefile b/test/Makefile index 5e49047..bf63db8 100644 --- a/test/Makefile +++ b/test/Makefile @@ -52,7 +52,8 @@ UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS) DOWNLOAD_ASSETS = mbuto 
podman \ $(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS) -TESTDATA_ASSETS = small.bin big.bin medium.bin +TESTDATA_ASSETS = small.bin big.bin medium.bin \ + rampstream LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \ $(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \ $(UBUNTU_NEW_IMGS:%=prepared-%) \ @@ -85,7 +86,7 @@ podman/bin/podman: pull-podman guest-key guest-key.pub: ssh-keygen -f guest-key -N '' -mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub $(TESTDATA_ASSETS) +mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub rampstream-check.sh $(TESTDATA_ASSETS) ./mbuto/mbuto -p ./$< -c lz4 -f $@ mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2 diff --git a/test/lib/layout b/test/lib/layout index 4d03572..fddcdc4 100644 --- a/test/lib/layout +++ b/test/lib/layout @@ -135,17 +135,70 @@ layout_two_guests() { get_info_cols pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1 + pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #1" qemu_2 guest_2 + + tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done' + tmux send-keys -t ${PANE_INFO} -N 100 C-m + tmux select-pane -t ${PANE_INFO} -T "test log" + + pane_watch_contexts ${PANE_HOST} host host + pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1 + pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #1" pasta_1 passt_2 + + info_layout "two guests, two passt instances, in namespaces" + + sleep 1 +} + +# layout_migrate() - Two guest panes, two passt panes, two passt-repair panes, +# plus host and log +layout_migrate() { + sleep 1 + + tmux kill-pane -a -t 0 + cmd_write 0 clear + + tmux split-window -v -t passt_test + tmux split-window -h -l '33%' + tmux split-window -h -t passt_test:1.1 + + tmux split-window -h -l '35%' -t passt_test:1.0 + tmux split-window -v -t passt_test:1.0 + + tmux split-window -v -t passt_test:1.4 + tmux split-window -v -t passt_test:1.6 + + tmux split-window -v -t 
passt_test:1.3 + + PANE_GUEST_1=0 + PANE_GUEST_2=1 + PANE_INFO=2 + PANE_MON=3 + PANE_HOST=4 + PANE_PASST_REPAIR_1=5 + PANE_PASST_1=6 + PANE_PASST_REPAIR_2=7 + PANE_PASST_2=8 + + get_info_cols + + pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1 pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2 tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done' tmux send-keys -t ${PANE_INFO} -N 100 C-m tmux select-pane -t ${PANE_INFO} -T "test log" + pane_watch_contexts ${PANE_MON} "QEMU monitor" mon mon + pane_watch_contexts ${PANE_HOST} host host + pane_watch_contexts ${PANE_PASST_REPAIR_1} "passt-repair #1 in namespace #1" repair_1 passt_repair_1 pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1 + + pane_watch_contexts ${PANE_PASST_REPAIR_2} "passt-repair #2 in namespace #2" repair_2 passt_repair_2 pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2 - info_layout "two guests, two passt instances, in namespaces" + info_layout "two guests, two passt + passt-repair instances, in namespaces" sleep 1 } diff --git a/test/lib/setup b/test/lib/setup index 580825f..575bc21 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -49,7 +49,7 @@ setup_passt() { context_run passt "make clean" context_run passt "make valgrind" - context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -P ${STATESETUP}/passt.pid" + context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -H hostname1 --fqdn fqdn1.passt.test -P ${STATESETUP}/passt.pid" # pidfile isn't created until passt is listening wait_for [ -f "${STATESETUP}/passt.pid" ] @@ 
-160,11 +160,11 @@ setup_passt_in_ns() { if [ ${VALGRIND} -eq 1 ]; then context_run passt "make clean" context_run passt "make valgrind" - context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" + context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" else context_run passt "make clean" context_run passt "make" - context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" + context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" fi wait_for [ -f "${STATESETUP}/passt.pid" ] @@ -243,7 +243,7 @@ setup_two_guests() { [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" - context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001" + context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} --fqdn fqdn1.passt.test -H hostname1 -t 10001 -u 10001" wait_for [ -f "${STATESETUP}/passt_1.pid" ] __opts= @@ -252,7 +252,7 @@ 
setup_two_guests() { [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" - context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004" + context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} --hostname hostname2 --fqdn fqdn2 -t 10004 -u 10004" wait_for [ -f "${STATESETUP}/passt_2.pid" ] __vmem="$((${MEM_KIB} / 1024 / 4))" @@ -305,6 +305,117 @@ setup_two_guests() { context_setup_guest guest_2 ${GUEST_2_CID} } +# setup_migrate() - Set up two namespace, run qemu, passt/passt-repair in both +setup_migrate() { + context_setup_host host + context_setup_host mon + context_setup_host pasta_1 + context_setup_host pasta_2 + + layout_migrate + + # Ports: + # + # guest #1 | guest #2 | ns #1 | host + # --------- |-----------|-----------|------------ + # 10001 as server | | to guest | to ns #1 + # 10002 | | as server | to ns #1 + # 10003 | | to init | as server + # 10004 | as server | to guest | to ns #1 + + __opts= + [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/pasta_1.pcap" + [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + + __map_host4=192.0.2.1 + __map_host6=2001:db8:9a55::1 + __map_ns4=192.0.2.2 + __map_ns6=2001:db8:9a55::2 + + # Option 1: send stuff via spliced path in pasta + # context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002 -T 10003 -u 10001,10002 -U 10003 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold" + # Option 2: send stuff via tap (--map-guest-addr) instead (useful to see capture of full migration) + context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002,10004 -T 10003 -u 10001,10002,10004 -U 10003 --map-guest-addr ${__map_host4} --map-guest-addr ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold" + context_setup_nstool passt_1 ${STATESETUP}/ns1.hold + 
context_setup_nstool passt_repair_1 ${STATESETUP}/ns1.hold + + context_setup_nstool passt_2 ${STATESETUP}/ns1.hold + context_setup_nstool passt_repair_2 ${STATESETUP}/ns1.hold + + context_setup_nstool qemu_1 ${STATESETUP}/ns1.hold + context_setup_nstool qemu_2 ${STATESETUP}/ns1.hold + + __ifname="$(context_run qemu_1 "ip -j link show | jq -rM '.[] | select(.link_type == \"ether\").ifname'")" + + sleep 1 + + __opts="--vhost-user" + [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap" + [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + + context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001" + wait_for [ -f "${STATESETUP}/passt_1.pid" ] + + context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair" + + __opts="--vhost-user" + [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap" + [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + + context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004" + wait_for [ -f "${STATESETUP}/passt_2.pid" ] + + context_run_bg passt_repair_2 "./passt-repair ${STATESETUP}/passt_2.socket.repair" + + __vmem="512M" # Keep migration fast + __qemu_netdev1=" \ + -chardev socket,id=c,path=${STATESETUP}/passt_1.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + __qemu_netdev2=" \ + -chardev socket,id=c,path=${STATESETUP}/passt_2.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + + GUEST_1_CID=94557 + context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \ + ' -M accel=kvm:tcg' \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ + 
' -initrd '${INITRAMFS}' -nographic -serial stdio' \ + ' -nodefaults' \ + ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ + " ${__qemu_netdev1}" \ + " -pidfile ${STATESETUP}/qemu_1.pid" \ + " -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" \ + " -monitor unix:${STATESETUP}/qemu_1_mon.sock,server,nowait" + + GUEST_2_CID=94558 + context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \ + ' -M accel=kvm:tcg' \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ + ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ + ' -nodefaults' \ + ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ + " ${__qemu_netdev2}" \ + " -pidfile ${STATESETUP}/qemu_2.pid" \ + " -device vhost-vsock-pci,guest-cid=$GUEST_2_CID" \ + " -monitor unix:${STATESETUP}/qemu_2_mon.sock,server,nowait" \ + " -incoming tcp:0:20005" + + context_setup_guest guest_1 ${GUEST_1_CID} + # Only available after migration: + ( context_setup_guest guest_2 ${GUEST_2_CID} & ) +} + # teardown_context_watch() - Remove contexts and stop panes watching them # $1: Pane number watching # $@: Context names @@ -375,7 +486,8 @@ teardown_two_guests() { context_wait pasta_1 context_wait pasta_2 - rm -f "${STATESETUP}/passt__[12].pid" "${STATESETUP}/pasta_[12].pid" + rm "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid" + rm "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid" teardown_context_watch ${PANE_HOST} host teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1 @@ -384,6 +496,30 @@ teardown_two_guests() { teardown_context_watch ${PANE_PASST_2} pasta_2 passt_2 } +# teardown_migrate() - Exit namespaces, kill qemu processes, passt and pasta +teardown_migrate() { + ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_1.pid") + ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_2.pid") + context_wait qemu_1 + context_wait qemu_2 + + ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/passt_2.pid") + context_wait passt_1 + context_wait 
passt_2 + ${NSTOOL} stop "${STATESETUP}/ns1.hold" + context_wait pasta_1 + + rm -f "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid" + rm -f "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid" + + teardown_context_watch ${PANE_HOST} host + + teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1 + teardown_context_watch ${PANE_GUEST_2} qemu_2 guest_2 + teardown_context_watch ${PANE_PASST_1} pasta_1 passt_1 + teardown_context_watch ${PANE_PASST_2} pasta_1 passt_2 +} + # teardown_demo_passt() - Exit namespace, kill qemu, passt and pasta teardown_demo_passt() { tmux send-keys -t ${PANE_GUEST} "C-c" diff --git a/test/lib/test b/test/lib/test index e6726be..7349674 100755 --- a/test/lib/test +++ b/test/lib/test @@ -20,10 +20,7 @@ test_iperf3s() { __sctx="${1}" __port="${2}" - pane_or_context_run_bg "${__sctx}" \ - 'iperf3 -s -p'${__port}' & echo $! > s.pid' \ - - sleep 1 # Wait for server to be ready + pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid' } # test_iperf3k() - Kill iperf3 server @@ -31,7 +28,7 @@ test_iperf3s() { test_iperf3k() { __sctx="${1}" - pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid); rm s.pid' + pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)' sleep 1 # Wait for kernel to free up ports } @@ -68,6 +65,45 @@ test_iperf3() { TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )" } +# test_iperf3m() - Ugly helper for iperf3 directive, guest migration variant +# $1: Variable name: to put the measure bandwidth into +# $2: Initial source/client context +# $3: Second source/client context the guest is moving to +# $4: Destination name or address for client +# $5: Port number, ${i} is translated to process index +# $6: Run time, in seconds +# $7: Client options +test_iperf3m() { + __var="${1}"; shift + __cctx="${1}"; shift + __cctx2="${1}"; shift + __dest="${1}"; shift + __port="${1}"; shift + __time="${1}"; shift + + pane_or_context_run "${__cctx}" 'rm -f c.json' + + # A 1s wait for 
connection on what's basically a local link + # indicates something is pretty wrong + __timeout=1000 + pane_or_context_run_bg "${__cctx}" \ + 'iperf3 -J -c '${__dest}' -p '${__port} \ + ' --connect-timeout '${__timeout} \ + ' -t'${__time}' -i0 '"${@}"' > c.json' \ + + __jval=".end.sum_received.bits_per_second" + + sleep $((${__time} + 3)) + + pane_or_context_output "${__cctx2}" \ + 'cat c.json' + + __bw=$(pane_or_context_output "${__cctx2}" \ + 'cat c.json | jq -rMs "map('${__jval}') | add"') + + TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )" +} + test_one_line() { __line="${1}" @@ -177,6 +213,12 @@ test_one_line() { "guest2w") pane_or_context_wait guest_2 || TEST_ONE_nok=1 ;; + "mon") + pane_or_context_run mon "${__arg}" || TEST_ONE_nok=1 + ;; + "monb") + pane_or_context_run_bg mon "${__arg}" + ;; "ns") pane_or_context_run ns "${__arg}" || TEST_ONE_nok=1 ;; @@ -292,6 +334,9 @@ test_one_line() { "iperf3") test_iperf3 ${__arg} ;; + "iperf3m") + test_iperf3m ${__arg} + ;; "set") TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__arg%% *}__" "${__arg#* }")" ;; diff --git a/test/migrate/basic b/test/migrate/basic new file mode 100644 index 0000000..3f11f7d --- /dev/null +++ b/test/migrate/basic @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/basic - Check basic migration functionality +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout 
HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: guest1/guest2 > host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc +sleep 1 +# Option 1: via spliced path in pasta, namespace to host +# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003 +# Option 2: via --map-guest-addr (tap) in pasta, namespace to host +guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006 +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] diff --git a/test/migrate/basic_fin b/test/migrate/basic_fin new file mode 100644 index 
0000000..aa61ec5 --- /dev/null +++ b/test/migrate/basic_fin @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/basic_fin - Outbound traffic across migration, half-closed socket +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: guest1, 
half-close, guest2 > host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' + +hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg +#hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc + +#sleep 20 +# Option 1: via spliced path in pasta, namespace to host +# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003 +# Option 2: via --map-guest-addr (tap) in pasta, namespace to host +guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006 +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] diff --git a/test/migrate/bidirectional b/test/migrate/bidirectional new file mode 100644 index 0000000..4c04081 --- /dev/null +++ b/test/migrate/bidirectional @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/bidirectional - Check migration with messages in both directions +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out 
ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4: guest1/guest2 > host, host > guest1/guest2 +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' + +hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc +guest1b socat -u TCP4-LISTEN:10001 OPEN:msg,create,trunc +sleep 1 + +guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006 +hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001 +sleep 1 +guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock +host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +sleep 2 +guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null +host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null + +hostw +# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1, +# use sleep 1 for the moment +sleep 1 + +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] + +g2out MSG cat msg +check [ "__MSG__" = "Dear guest 1, you are now guest 2" ] diff --git a/test/migrate/bidirectional_fin b/test/migrate/bidirectional_fin new file mode 100644 index 0000000..1c13527 --- /dev/null +++ b/test/migrate/bidirectional_fin @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/bidirectional_fin - Both directions, half-closed sockets +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq 
dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4: guest1/guest2 <- (half closed) -> host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' + +hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg +guest1b echo FIN | socat TCP4-LISTEN:10001,shut-down STDIO,ignoreeof > msg +sleep 1 + +guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006 +hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001 +sleep 1 +guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock +host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +sleep 2 +guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null +host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null + +hostw +# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1, +# use sleep 1 for the moment +sleep 1 + +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] + +g2out MSG cat msg +check [ "__MSG__" = "Dear guest 1, 
you are now guest 2" ] diff --git a/test/migrate/iperf3_bidir6 b/test/migrate/iperf3_bidir6 new file mode 100644 index 0000000..4bfefb5 --- /dev/null +++ b/test/migrate/iperf3_bidir6 @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_bidir6 - Migration behaviour with many bidirectional flows +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 128 +set TIME 3 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N --bidir + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr 
show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 host <-> guest flood, many flows, during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_in4 b/test/migrate/iperf3_in4 new file mode 100644 index 0000000..c5f3916 --- /dev/null +++ b/test/migrate/iperf3_in4 @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_in4 - Migration behaviour under inbound IPv4 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +guest1 /sbin/sysctl -w net.core.rmem_max=33554432 +guest1 /sbin/sysctl -w net.core.wmem_max=33554432 + +set THREADS 1 +set TIME 4 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4 host to guest throughput during migration + 
+monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_in6 b/test/migrate/iperf3_in6 new file mode 100644 index 0000000..16cf504 --- /dev/null +++ b/test/migrate/iperf3_in6 @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_in6 - Migration behaviour under inbound IPv6 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 4 +set TIME 3 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done 
+g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 host to guest throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_many_out6 b/test/migrate/iperf3_many_out6 new file mode 100644 index 0000000..88133f2 --- /dev/null +++ b/test/migrate/iperf3_many_out6 @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_many_out6 - Migration behaviour with many outbound flows +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 16 +set TIME 3 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N -l 1M + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr 
show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 guest to host flood, many flows, during migration + +test TCP/IPv6 host to guest throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_out4 b/test/migrate/iperf3_out4 new file mode 100644 index 0000000..968057b --- /dev/null +++ b/test/migrate/iperf3_out4 @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_out4 - Migration behaviour under outbound IPv4 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 6 +set TIME 2 +set OMIT 0.1 +set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM 
'.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4 guest to host throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_out6 b/test/migrate/iperf3_out6 new file mode 100644 index 0000000..21fbfcd --- /dev/null +++ b/test/migrate/iperf3_out6 @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_out6 - Migration behaviour under outbound IPv6 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 6 +set TIME 2 +set OMIT 0.1 +set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 
__IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 guest to host throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/rampstream_in b/test/migrate/rampstream_in new file mode 100644 index 0000000..df333ba --- /dev/null +++ b/test/migrate/rampstream_in @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/rampstream_in - Check sequence correctness with inbound ramp +# +# Copyright (c) 2025 Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 +set RAMPS 6000000 + +test Interface name +g1out IFNAME1 ip -j link 
show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: sequence check, ramps, inbound +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +guest1b socat -u TCP4-LISTEN:10001 EXEC:"rampstream-check.sh __RAMPS__" +sleep 1 +hostb socat -u EXEC:"test/rampstream send __RAMPS__" TCP4:__ADDR1__:10001 + +sleep 1 + +monb echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw + +guest2 cat rampstream.err +guest2 [ $(cat rampstream.status) -eq 0 ] diff --git a/test/migrate/rampstream_out b/test/migrate/rampstream_out new file mode 100644 index 0000000..8ed3229 --- /dev/null +++ b/test/migrate/rampstream_out @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - 
Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/rampstream_out - Check sequence correctness with outbound ramp +# +# Copyright (c) 2025 Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 +set RAMPS 6000000 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: sequence check, ramps, outbound +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +hostb socat -u 
TCP4-LISTEN:10006 EXEC:"test/rampstream check __RAMPS__" +sleep 1 +guest1b socat -u EXEC:"rampstream send __RAMPS__" TCP4:__MAP_HOST4__:10006 +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw diff --git a/test/passt.mbuto b/test/passt.mbuto index d4d57cb..5e00132 100755 --- a/test/passt.mbuto +++ b/test/passt.mbuto @@ -13,7 +13,8 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl - nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump}" + nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump + env}" # OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and # sshd-session the per-session program. We need the latter as well, and the path @@ -31,7 +32,7 @@ LINKS="${LINKS:- DIRS="${DIRS} /tmp /usr/sbin /usr/share /var/log /var/lib /etc/ssh /run/sshd /root/.ssh" -COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin" +COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin rampstream,/bin/rampstream rampstream-check.sh,/bin/rampstream-check.sh" FIXUP="${FIXUP}"' mv /sbin/* /usr/sbin || : @@ -41,6 +42,7 @@ FIXUP="${FIXUP}"' #!/bin/sh LOG=/var/log/dhclient-script.log echo \${reason} \${interface} >> \$LOG +env >> \$LOG set >> \$LOG [ -n "\${new_interface_mtu}" ] && ip link set dev \${interface} mtu \${new_interface_mtu} @@ -54,7 +56,8 @@ set >> \$LOG [ -n "\${new_ip6_address}" ] && ip addr add \${new_ip6_address}/\${new_ip6_prefixlen} dev \${interface} [ -n "\${new_dhcp6_name_servers}" ] && for d in \${new_dhcp6_name_servers}; do echo "nameserver \${d}%\${interface}" >> /etc/resolv.conf; done [ -n "\${new_dhcp6_domain_search}" ] && (printf "search"; for d in \${new_dhcp6_domain_search}; do printf " %s" "\${d}"; done; printf 
"\n") >> /etc/resolv.conf -[ -n "\${new_host_name}" ] && hostname "\${new_host_name}" +[ -n "\${new_host_name}" ] && echo "\${new_host_name}" > /tmp/new_host_name +[ -n "\${new_fqdn_fqdn}" ] && echo "\${new_fqdn_fqdn}" > /tmp/new_fqdn_fqdn exit 0 EOF chmod 755 /sbin/dhclient-script diff --git a/test/passt/dhcp b/test/passt/dhcp index 9925ab9..145f1ba 100644 --- a/test/passt/dhcp +++ b/test/passt/dhcp @@ -11,7 +11,7 @@ # Copyright (c) 2021 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -gtools ip jq dhclient sed tr +gtools ip jq dhclient sed tr hostname htools ip jq sed tr head test Interface name @@ -47,7 +47,16 @@ gout SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^searc hout HOST_SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' check [ "__SEARCH__" = "__HOST_SEARCH__" ] +test DHCP: Hostname +gout NEW_HOST_NAME cat /tmp/new_host_name +check [ "__NEW_HOST_NAME__" = "hostname1" ] + +test DHCP: Client FQDN +gout NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn +check [ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ] + test DHCPv6: address +guest rm /tmp/new_fqdn_fqdn guest /sbin/dhclient -6 __IFNAME__ # Wait for DAD to complete guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done @@ -70,3 +79,7 @@ test DHCPv6: search list gout SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' hout HOST_SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' check [ "__SEARCH6__" = "__HOST_SEARCH6__" ] + +test DHCPv6: Hostname +gout NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn +check [ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ] diff --git a/test/rampstream-check.sh b/test/rampstream-check.sh new file mode 100755 index 0000000..c27acdb --- /dev/null +++ b/test/rampstream-check.sh @@ -0,0 +1,3 @@ +#! 
/bin/sh + +(rampstream check "$@" 2>&1; echo $? > rampstream.status) | tee rampstream.err diff --git a/test/rampstream.c b/test/rampstream.c new file mode 100644 index 0000000..8d81296 --- /dev/null +++ b/test/rampstream.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* rampstream - Generate and check a stream of bytes in a ramp pattern + * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <sys/types.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> + +/* Length of the repeating ramp. This is deliberately not a "round" number so + * that we're very likely to misalign with likely block or chunk sizes of the + * transport. That means we'll detect gaps in the stream, even if they occur + * neatly on block boundaries. Specifically this is the largest 8-bit prime. */ +#define RAMPLEN 251 + +#define INTERVAL 10000 + +#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0]))) + +#define die(...) 
\ + do { \ + fprintf(stderr, "rampstream: " __VA_ARGS__); \ + exit(1); \ + } while (0) + +static void usage(void) +{ + die("Usage:\n" + " rampstream send <number>\n" + " Generate a ramp pattern of bytes on stdout, repeated <number>\n" + " times\n" + " rampstream check <number>\n" + " Check a ramp pattern of bytes on stdin, repeater <number>\n" + " times\n"); +} + +static void ramp_send(unsigned long long num, const uint8_t *ramp) +{ + unsigned long long i; + + for (i = 0; i < num; i++) { + int off = 0; + ssize_t rc; + + if (i % INTERVAL == 0) + fprintf(stderr, "%llu...\r", i); + + while (off < RAMPLEN) { + rc = write(1, ramp + off, RAMPLEN - off); + if (rc < 0) { + if (errno == EINTR || + errno == EAGAIN || + errno == EWOULDBLOCK) + continue; + die("Error writing ramp: %s\n", + strerror(errno)); + } + if (rc == 0) + die("Zero length write\n"); + off += rc; + } + } +} + +static void ramp_check(unsigned long long num, const uint8_t *ramp) +{ + unsigned long long i; + + for (i = 0; i < num; i++) { + uint8_t buf[RAMPLEN]; + int off = 0; + ssize_t rc; + + if (i % INTERVAL == 0) + fprintf(stderr, "%llu...\r", i); + + while (off < RAMPLEN) { + rc = read(0, buf + off, RAMPLEN - off); + if (rc < 0) { + if (errno == EINTR || + errno == EAGAIN || + errno == EWOULDBLOCK) + continue; + die("Error reading ramp: %s\n", + strerror(errno)); + } + if (rc == 0) + die("Unexpected EOF, ramp %llu, byte %d\n", + i, off); + off += rc; + } + + if (memcmp(buf, ramp, sizeof(buf)) != 0) { + int j, k; + + for (j = 0; j < RAMPLEN; j++) + if (buf[j] != ramp[j]) + break; + for (k = j; k < RAMPLEN && k < j + 16; k++) + fprintf(stderr, + "Byte %d: expected 0x%02x, got 0x%02x\n", + k, ramp[k], buf[k]); + die("Data mismatch, ramp %llu, byte %d\n", i, j); + } + } +} + +int main(int argc, char *argv[]) +{ + const char *subcmd = argv[1]; + unsigned long long num; + uint8_t ramp[RAMPLEN]; + char *e; + int i; + + if (argc < 2) + usage(); + + errno = 0; + num = strtoull(argv[2], &e, 0); + if (*e || errno) 
+ usage(); + + /* Initialize the ramp block */ + for (i = 0; i < RAMPLEN; i++) + ramp[i] = i; + + if (strcmp(subcmd, "send") == 0) + ramp_send(num, ramp); + else if (strcmp(subcmd, "check") == 0) + ramp_check(num, ramp); + else + usage(); + + exit(0); +} @@ -130,6 +130,43 @@ run() { test two_guests_vu/basic teardown two_guests + setup migrate + test migrate/basic + teardown migrate + setup migrate + test migrate/basic_fin + teardown migrate + setup migrate + test migrate/bidirectional + teardown migrate + setup migrate + test migrate/bidirectional_fin + teardown migrate + setup migrate + test migrate/iperf3_out4 + teardown migrate + setup migrate + test migrate/iperf3_out6 + teardown migrate + setup migrate + test migrate/iperf3_in4 + teardown migrate + setup migrate + test migrate/iperf3_in6 + teardown migrate + setup migrate + test migrate/iperf3_bidir6 + teardown migrate + setup migrate + test migrate/iperf3_many_out6 + teardown migrate + setup migrate + test migrate/rampstream_in + teardown migrate + setup migrate + test migrate/rampstream_out + teardown migrate + VALGRIND=0 VHOST_USER=0 setup passt_in_ns @@ -186,7 +223,10 @@ run_selected() { __setup= for __test; do - if [ "${__test%%/*}" != "${__setup}" ]; then + # HACK: the migrate tests need the setup repeated for + # each test + if [ "${__test%%/*}" != "${__setup}" -o \ + "${__test%%/*}" = "migrate" ]; then [ -n "${__setup}" ] && teardown "${__setup}" __setup="${__test%%/*}" setup "${__setup}" @@ -87,6 +87,8 @@ #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include <netinet/icmp6.h> #include <stdint.h> #include <stddef.h> #include <string.h> @@ -112,6 +114,12 @@ #include "udp_internal.h" #include "udp_vu.h" +/* Maximum UDP data to be returned in ICMP messages */ +#define ICMP4_MAX_DLEN 8 +#define ICMP6_MAX_DLEN (IPV6_MIN_MTU \ + - sizeof(struct udphdr) \ + - sizeof(struct ipv6hdr)) + /* "Spliced" sockets indexed by bound port (host order) */ static 
int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; static int udp_splice_init[IP_VERSIONS][NUM_PORTS]; @@ -403,24 +411,120 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, } /** + * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer + * @c: Execution context + * @ee: Extended error descriptor + * @toside: Destination side of flow + * @saddr: Address of ICMP generating node + * @in: First bytes (max 8) of original UDP message body + * @dlen: Length of the read part of original UDP message body + */ +static void udp_send_conn_fail_icmp4(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + struct in_addr saddr, + const void *in, size_t dlen) +{ + struct in_addr oaddr = toside->oaddr.v4mapped.a4; + struct in_addr eaddr = toside->eaddr.v4mapped.a4; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + struct { + struct icmphdr icmp4h; + struct iphdr ip4h; + struct udphdr uh; + char data[ICMP4_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + size_t l4len = dlen + sizeof(struct udphdr); + + ASSERT(dlen <= ICMP4_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp4h.type = ee->ee_type; + msg.icmp4h.code = ee->ee_code; + if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED) + msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info); + + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, l4len, IPPROTO_UDP); + tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + tap_icmp4_send(c, saddr, eaddr, &msg, msglen); +} + + +/** + * udp_send_conn_fail_icmp6() - Construct and send ICMPv6 to local peer + * @c: Execution context + * @ee: Extended error descriptor + * @toside: Destination side of flow + * @saddr: Address of ICMP generating node + * @in: First bytes (max 1232) of original UDP message 
body + * @dlen: Length of the read part of original UDP message body + * @flow: IPv6 flow identifier + */ +static void udp_send_conn_fail_icmp6(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + const struct in6_addr *saddr, + void *in, size_t dlen, uint32_t flow) +{ + const struct in6_addr *oaddr = &toside->oaddr.a6; + const struct in6_addr *eaddr = &toside->eaddr.a6; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + struct { + struct icmp6_hdr icmp6h; + struct ipv6hdr ip6h; + struct udphdr uh; + char data[ICMP6_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + size_t l4len = dlen + sizeof(struct udphdr); + + ASSERT(dlen <= ICMP6_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp6h.icmp6_type = ee->ee_type; + msg.icmp6h.icmp6_code = ee->ee_code; + if (ee->ee_type == ICMP6_PACKET_TOO_BIG) + msg.icmp6h.icmp6_dataun.icmp6_un_data32[0] = htonl(ee->ee_info); + + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip6h(&msg.ip6h, eaddr, oaddr, l4len, IPPROTO_UDP, flow); + tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + tap_icmp6_send(c, saddr, eaddr, &msg, msglen); +} + +/** * udp_sock_recverr() - Receive and clear an error from a socket - * @s: Socket to receive from + * @c: Execution context + * @ref: epoll reference * * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 * if there was an error reading the queue * * #syscalls recvmsg */ -static int udp_sock_recverr(int s) +static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref) { const struct sock_extended_err *ee; const struct cmsghdr *hdr; + union sockaddr_inany saddr; char buf[CMSG_SPACE(sizeof(*ee))]; + char data[ICMP6_MAX_DLEN]; + int s = ref.fd; + struct iovec iov = { + .iov_base = data, + .iov_len = sizeof(data) + }; struct msghdr mh = { - 
.msg_name = NULL, - .msg_namelen = 0, - .msg_iov = NULL, - .msg_iovlen = 0, + .msg_name = &saddr, + .msg_namelen = sizeof(saddr), + .msg_iov = &iov, + .msg_iovlen = 1, .msg_control = buf, .msg_controllen = sizeof(buf), }; @@ -450,8 +554,23 @@ static int udp_sock_recverr(int s) } ee = (const struct sock_extended_err *)CMSG_DATA(hdr); - - /* TODO: When possible propagate and otherwise handle errors */ + if (ref.type == EPOLL_TYPE_UDP_REPLY) { + flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); + const struct flowside *toside = flowside_at_sidx(sidx); + size_t dlen = rc; + + if (hdr->cmsg_level == IPPROTO_IP) { + dlen = MIN(dlen, ICMP4_MAX_DLEN); + udp_send_conn_fail_icmp4(c, ee, toside, saddr.sa4.sin_addr, + data, dlen); + } else if (hdr->cmsg_level == IPPROTO_IPV6) { + udp_send_conn_fail_icmp6(c, ee, toside, + &saddr.sa6.sin6_addr, + data, dlen, sidx.flowi); + } + } else { + trace("Ignoring received IP_RECVERR cmsg on listener socket"); + } debug("%s error on UDP socket %i: %s", str_ee_origin(ee), s, strerror_(ee->ee_errno)); @@ -461,15 +580,16 @@ static int udp_sock_recverr(int s) /** * udp_sock_errs() - Process errors on a socket * @c: Execution context - * @s: Socket to receive from + * @ref: epoll reference * @events: epoll events bitmap * * Return: Number of errors handled, or < 0 if we have an unrecoverable error */ -int udp_sock_errs(const struct ctx *c, int s, uint32_t events) +int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events) { unsigned n_err = 0; socklen_t errlen; + int s = ref.fd; int rc, err; ASSERT(!c->no_udp); @@ -478,7 +598,7 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events) return 0; /* Nothing to do */ /* Empty the error queue */ - while ((rc = udp_sock_recverr(s)) > 0) + while ((rc = udp_sock_recverr(c, ref)) > 0) n_err += rc; if (rc < 0) @@ -558,7 +678,7 @@ static void udp_buf_listen_sock_handler(const struct ctx *c, const socklen_t sasize = sizeof(udp_meta[0].s_in); int n, i; - if (udp_sock_errs(c, 
ref.fd, events) < 0) { + if (udp_sock_errs(c, ref, events) < 0) { err("UDP: Unrecoverable error on listening socket:" " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); /* FIXME: what now? close/re-open socket? */ @@ -661,7 +781,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, from_s = uflow->s[ref.flowside.sidei]; - if (udp_sock_errs(c, from_s, events) < 0) { + if (udp_sock_errs(c, ref, events) < 0) { flow_err(uflow, "Unrecoverable error on reply socket"); flow_err_details(uflow); udp_flow_close(c, uflow); @@ -93,9 +93,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, */ uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0); if (uflow->s[INISIDE] < 0) { - flow_err(uflow, - "Couldn't duplicate listening socket: %s", - strerror_(errno)); + flow_perror(uflow, + "Couldn't duplicate listening socket"); goto cancel; } } @@ -113,16 +112,13 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, tgtpif, tgt, fref.data); if (uflow->s[TGTSIDE] < 0) { - flow_dbg(uflow, - "Couldn't open socket for spliced flow: %s", - strerror_(errno)); + flow_dbg_perror(uflow, + "Couldn't open socket for spliced flow"); goto cancel; } if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) { - flow_dbg(uflow, - "Couldn't connect flow socket: %s", - strerror_(errno)); + flow_dbg_perror(uflow, "Couldn't connect flow socket"); goto cancel; } @@ -142,9 +138,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, flow_trace(uflow, "Discarded %d spurious reply datagrams", rc); } else if (errno != EAGAIN) { - flow_err(uflow, - "Unexpected error discarding datagrams: %s", - strerror_(errno)); + flow_perror(uflow, + "Unexpected error discarding datagrams"); } } diff --git a/udp_internal.h b/udp_internal.h index cc80e30..3b081f5 100644 --- a/udp_internal.h +++ b/udp_internal.h @@ -30,5 +30,5 @@ size_t udp_update_hdr4(struct iphdr 
*ip4h, struct udp_payload_t *bp, size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen, bool no_udp_csum); -int udp_sock_errs(const struct ctx *c, int s, uint32_t events); +int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events); #endif /* UDP_INTERNAL_H */ @@ -227,7 +227,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; int i; - if (udp_sock_errs(c, ref.fd, events) < 0) { + if (udp_sock_errs(c, ref, events) < 0) { err("UDP: Unrecoverable error on listening socket:" " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); return; @@ -302,7 +302,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, ASSERT(!c->no_udp); - if (udp_sock_errs(c, from_s, events) < 0) { + if (udp_sock_errs(c, ref, events) < 0) { flow_err(uflow, "Unrecoverable error on reply socket"); flow_err_details(uflow); udp_flow_close(c, uflow); @@ -179,6 +179,68 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, } /** + * sock_unix() - Create and bind AF_UNIX socket + * @sock_path: Socket path. 
If empty, set on return (UNIX_SOCK_PATH as prefix) + * + * Return: socket descriptor on success, won't return on failure + */ +int sock_unix(char *sock_path) +{ + int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + int i; + + if (fd < 0) + die_perror("Failed to open UNIX domain socket"); + + for (i = 1; i < UNIX_SOCK_MAX; i++) { + char *path = addr.sun_path; + int ex, ret; + + if (*sock_path) + memcpy(path, sock_path, UNIX_PATH_MAX); + else if (snprintf_check(path, UNIX_PATH_MAX - 1, + UNIX_SOCK_PATH, i)) + die_perror("Can't build UNIX domain socket path"); + + ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, + 0); + if (ex < 0) + die_perror("Failed to check for UNIX domain conflicts"); + + ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); + if (!ret || (errno != ENOENT && errno != ECONNREFUSED && + errno != EACCES)) { + if (*sock_path) + die("Socket path %s already in use", path); + + close(ex); + continue; + } + close(ex); + + unlink(path); + ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr)); + if (*sock_path && ret) + die_perror("Failed to bind UNIX domain socket"); + + if (!ret) + break; + } + + if (i == UNIX_SOCK_MAX) + die_perror("Failed to bind UNIX domain socket"); + + info("UNIX domain socket bound at %s", addr.sun_path); + if (!*sock_path) + memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX); + + return fd; +} + +/** * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed * @c: Execution context */ @@ -930,4 +992,28 @@ void raw_random(void *buf, size_t buflen) void epoll_del(const struct ctx *c, int fd) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); + +} + +/** + * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1 + * @buf: Buffer to fill in with encoded domain name + * @domain_name: Input domain name string with terminator + * + * The buffer's 'buf' size has to be >= strlen(domain_name) + 2 + */ 
+void encode_domain_name(char *buf, const char *domain_name) +{ + size_t i; + char *p; + + buf[0] = strcspn(domain_name, "."); + p = buf + 1; + for (i = 0; domain_name[i]; i++) { + if (domain_name[i] == '.') + p[i] = strcspn(domain_name + i + 1, "."); + else + p[i] = domain_name[i]; + } + p[i] = 0L; } @@ -31,12 +31,6 @@ #ifndef SECCOMP_RET_KILL_PROCESS #define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL #endif -#ifndef ETH_MAX_MTU -#define ETH_MAX_MTU USHRT_MAX -#endif -#ifndef ETH_MIN_MTU -#define ETH_MIN_MTU 68 -#endif #ifndef IP_MAX_MTU #define IP_MAX_MTU USHRT_MAX #endif @@ -122,14 +116,43 @@ (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) #endif +#ifndef __bswap_constant_32 +#define __bswap_constant_32(x) \ + ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) +#endif + +#ifndef __bswap_constant_64 +#define __bswap_constant_64(x) \ + ((((x) & 0xff00000000000000ULL) >> 56) | \ + (((x) & 0x00ff000000000000ULL) >> 40) | \ + (((x) & 0x0000ff0000000000ULL) >> 24) | \ + (((x) & 0x000000ff00000000ULL) >> 8) | \ + (((x) & 0x00000000ff000000ULL) << 8) | \ + (((x) & 0x0000000000ff0000ULL) << 24) | \ + (((x) & 0x000000000000ff00ULL) << 40) | \ + (((x) & 0x00000000000000ffULL) << 56)) +#endif + #if __BYTE_ORDER == __BIG_ENDIAN #define htons_constant(x) (x) #define htonl_constant(x) (x) +#define htonll_constant(x) (x) +#define ntohs_constant(x) (x) +#define ntohl_constant(x) (x) +#define ntohll_constant(x) (x) #else #define htons_constant(x) (__bswap_constant_16(x)) #define htonl_constant(x) (__bswap_constant_32(x)) +#define htonll_constant(x) (__bswap_constant_64(x)) +#define ntohs_constant(x) (__bswap_constant_16(x)) +#define ntohl_constant(x) (__bswap_constant_32(x)) +#define ntohll_constant(x) (__bswap_constant_64(x)) #endif +#define ntohll(x) (be64toh((x))) +#define htonll(x) (htobe64((x))) + /** * ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address * @p: Pointer to the BE 
value in memory @@ -185,6 +208,7 @@ struct ctx; int sock_l4_sa(const struct ctx *c, enum epoll_type type, const void *sa, socklen_t sl, const char *ifname, bool v6only, uint32_t data); +int sock_unix(char *sock_path); void sock_probe_mem(struct ctx *c); long timespec_diff_ms(const struct timespec *a, const struct timespec *b); int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b); @@ -352,4 +376,7 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr, #define accept4(s, addr, addrlen, flags) \ wrap_accept4((s), (addr), (addrlen), (flags)) +#define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */ +void encode_domain_name(char *buf, const char *domain_name); + #endif /* UTIL_H */ diff --git a/vhost_user.c b/vhost_user.c index 159f0b3..105f77a 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -44,6 +44,7 @@ #include "tap.h" #include "vhost_user.h" #include "pcap.h" +#include "migrate.h" /* vhost-user version we are compatible with */ #define VHOST_USER_VERSION 1 @@ -516,7 +517,7 @@ static void vu_close_log(struct vu_dev *vdev) * vu_log_kick() - Inform the front-end that the log has been modified * @vdev: vhost-user device */ -void vu_log_kick(const struct vu_dev *vdev) +static void vu_log_kick(const struct vu_dev *vdev) { if (vdev->log_call_fd != -1) { int rc; @@ -731,6 +732,7 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev, msg->hdr.size = sizeof(msg->payload.state); vdev->vq[idx].started = false; + vdev->vq[idx].vring.avail = 0; if (vdev->vq[idx].call_fd != -1) { close(vdev->vq[idx].call_fd); @@ -998,36 +1000,6 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev, } /** - * vu_set_migration_watch() - Add the migration file descriptor to epoll - * @vdev: vhost-user device - * @fd: File descriptor to add - * @direction: Direction of the migration (save or load backend state) - */ -static void vu_set_migration_watch(const struct vu_dev *vdev, int fd, - uint32_t direction) -{ - union epoll_ref ref = { - .type = 
EPOLL_TYPE_VHOST_MIGRATION, - .fd = fd, - }; - struct epoll_event ev = { 0 }; - - ev.data.u64 = ref.u64; - switch (direction) { - case VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE: - ev.events = EPOLLOUT; - break; - case VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD: - ev.events = EPOLLIN; - break; - default: - ASSERT(0); - } - - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); -} - -/** * vu_set_device_state_fd_exec() - Set the device state migration channel * @vdev: vhost-user device * @vmsg: vhost-user message @@ -1051,16 +1023,8 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD) die("Invalide device_state_fd direction: %d", direction); - if (vdev->device_state_fd != -1) { - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - } - - vdev->device_state_fd = msg->fds[0]; - vdev->device_state_result = -1; - vu_set_migration_watch(vdev, vdev->device_state_fd, direction); - - debug("Got device_state_fd: %d", vdev->device_state_fd); + migrate_request(vdev->context, msg->fds[0], + direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD); /* We don't provide a new fd for the data transfer */ vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK); @@ -1075,12 +1039,11 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, * * Return: True as the reply contains the migration result */ +/* cppcheck-suppress constParameterCallback */ static bool vu_check_device_state_exec(struct vu_dev *vdev, struct vhost_user_msg *msg) { - (void)vdev; - - vmsg_set_reply_u64(msg, vdev->device_state_result); + vmsg_set_reply_u64(msg, vdev->context->device_state_result); return true; } @@ -1106,8 +1069,8 @@ void vu_init(struct ctx *c) } c->vdev->log_table = NULL; c->vdev->log_call_fd = -1; - c->vdev->device_state_fd = -1; - c->vdev->device_state_result = -1; + + migrate_init(c); } @@ -1157,12 +1120,8 @@ void vu_cleanup(struct vu_dev *vdev) vu_close_log(vdev); - if (vdev->device_state_fd 
!= -1) { - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - vdev->device_state_result = -1; - } + /* If we lose the VU dev, we also lose our migration channel */ + migrate_close(vdev->context); } /** @@ -1245,4 +1204,11 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) if (reply_requested) vu_send_reply(fd, &msg); + + if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE && + vdev->context->device_state_result == 0 && + !vdev->context->migrate_target) { + info("Migration complete, exiting"); + _exit(EXIT_SUCCESS); + } } diff --git a/vhost_user.h b/vhost_user.h index e769cb1..1daacd1 100644 --- a/vhost_user.h +++ b/vhost_user.h @@ -241,7 +241,6 @@ static inline bool vu_queue_started(const struct vu_virtq *vq) void vu_print_capabilities(void); void vu_init(struct ctx *c); void vu_cleanup(struct vu_dev *vdev); -void vu_log_kick(const struct vu_dev *vdev); void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length); void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events); @@ -286,7 +286,7 @@ static int virtqueue_read_next_desc(const struct vring_desc *desc, * * Return: true if the virtqueue is empty, false otherwise */ -bool vu_queue_empty(struct vu_virtq *vq) +static bool vu_queue_empty(struct vu_virtq *vq) { if (!vq->vring.avail) return true; @@ -671,9 +671,10 @@ static void vu_log_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq, * @len: Size of the element * @idx: Used ring entry index */ -void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq, - unsigned int index, unsigned int len, - unsigned int idx) +static void vu_queue_fill_by_index(const struct vu_dev *vdev, + struct vu_virtq *vq, + unsigned int index, unsigned int len, + unsigned int idx) { struct vring_used_elem uelem; @@ -106,8 +106,6 @@ struct vu_dev_region { * @log_call_fd: Eventfd to report logging update * @log_size: Size of the logging memory region * 
@log_table: Base of the logging memory region - * @device_state_fd: Device state migration channel - * @device_state_result: Device state migration result */ struct vu_dev { struct ctx *context; @@ -119,8 +117,6 @@ struct vu_dev { int log_call_fd; uint64_t log_size; uint8_t *log_table; - int device_state_fd; - int device_state_result; }; /** @@ -178,16 +174,12 @@ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, return has_feature(vdev->protocol_features, fbit); } -bool vu_queue_empty(struct vu_virtq *vq); void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq); int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem); void vu_queue_detach_element(struct vu_virtq *vq); void vu_queue_unpop(struct vu_virtq *vq); bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num); -void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq, - unsigned int index, unsigned int len, - unsigned int idx); void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq, const struct vu_virtq_element *elem, unsigned int len, unsigned int idx); diff --git a/vu_common.c b/vu_common.c index ab04d31..686a09b 100644 --- a/vu_common.c +++ b/vu_common.c @@ -5,6 +5,7 @@ * common_vu.c - vhost-user common UDP and TCP functions */ +#include <errno.h> #include <unistd.h> #include <sys/uio.h> #include <sys/eventfd.h> @@ -17,6 +18,7 @@ #include "vhost_user.h" #include "pcap.h" #include "vu_common.h" +#include "migrate.h" #define VU_MAX_TX_BUFFER_NB 2 @@ -24,14 +26,12 @@ * vu_packet_check_range() - Check if a given memory zone is contained in * a mapped guest memory region * @buf: Array of the available memory regions - * @offset: Offset of data range in packet descriptor + * @ptr: Start of desired data range * @size: Length of desired data range - * @start: Start of the packet descriptor * * Return: 0 if the zone is in a mapped memory region, -1 otherwise */ -int vu_packet_check_range(void *buf, size_t offset, 
size_t len, - const char *start) +int vu_packet_check_range(void *buf, const char *ptr, size_t len) { struct vu_dev_region *dev_region; @@ -39,9 +39,8 @@ int vu_packet_check_range(void *buf, size_t offset, size_t len, /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ char *m = (char *)(uintptr_t)dev_region->mmap_addr; - if (m <= start && - start + offset + len <= m + dev_region->mmap_offset + - dev_region->size) + if (m <= ptr && + ptr + len <= m + dev_region->mmap_offset + dev_region->size) return 0; } @@ -303,50 +302,3 @@ err: return -1; } - -/** - * vu_migrate() - Send/receive passt insternal state to/from QEMU - * @vdev: vhost-user device - * @events: epoll events - */ -void vu_migrate(struct vu_dev *vdev, uint32_t events) -{ - int ret; - - /* TODO: collect/set passt internal state - * and use vdev->device_state_fd to send/receive it - */ - debug("vu_migrate fd %d events %x", vdev->device_state_fd, events); - if (events & EPOLLOUT) { - debug("Saving backend state"); - - /* send some stuff */ - ret = write(vdev->device_state_fd, "PASST", 6); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - vdev->device_state_result = ret == -1 ? -1 : 0; - /* Closing the file descriptor signals the end of transfer */ - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } else if (events & EPOLLIN) { - char buf[6]; - - debug("Loading backend state"); - /* read some stuff */ - ret = read(vdev->device_state_fd, buf, sizeof(buf)); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - if (ret != sizeof(buf)) { - vdev->device_state_result = -1; - } else { - ret = strncmp(buf, "PASST", sizeof(buf)); - vdev->device_state_result = ret == 0 ? 0 : -1; - } - } else if (events & EPOLLHUP) { - debug("Closing migration channel"); - - /* The end of file signals the end of the transfer. 
*/ - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } -} diff --git a/vu_common.h b/vu_common.h index d56c021..f538f23 100644 --- a/vu_common.h +++ b/vu_common.h @@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, const struct timespec *now); int vu_send_single(const struct ctx *c, const void *buf, size_t size); -void vu_migrate(struct vu_dev *vdev, uint32_t events); + #endif /* VU_COMMON_H */ |