diff options
161 files changed, 18922 insertions, 5440 deletions
diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..78f177a --- /dev/null +++ b/.clang-format @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 11. +# +# For more information, see: +# +# Documentation/dev-tools/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeInheritanceComma: false +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeComma +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +Cpp11BracedListStyle: false 
+DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: false + +# Taken from: +# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | LC_ALL=C sort -u +ForEachMacros: + - 'for_each_nst' + +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +IndentGotoLabels: false +IndentPPDirectives: None +IndentWidth: 8 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +PenaltyBreakAssignment: 10 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +SortUsingDeclarations: false +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatementsExceptForEachMacros +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... 
diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..9d346ec --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,93 @@ +--- +Checks: + - "clang-diagnostic-*,clang-analyzer-*,*,-modernize-*" + + # TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed + - "-clang-analyzer-valist.Uninitialized" + + # Dubious value, would kill readability + - "-cppcoreguidelines-init-variables" + + # Dubious value over the compiler's built-in warning. Would + # increase verbosity. + - "-bugprone-assignment-in-if-condition" + + # Debatable whether these improve readability, right now it would look + # like a mess + - "-google-readability-braces-around-statements" + - "-hicpp-braces-around-statements" + - "-readability-braces-around-statements" + + # TODO: in most cases they are justified, but probably not everywhere + # + - "-readability-magic-numbers" + - "-cppcoreguidelines-avoid-magic-numbers" + + # TODO: this is Linux-only for the moment, nice to fix eventually + - "-llvmlibc-restrict-system-libc-headers" + + # Those are needed for syscalls, epoll_wait flags, etc. 
+ - "-hicpp-signed-bitwise" + + # Probably not doable to implement this without plain memcpy(), memset() + - "-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling" + + # TODO: not really important, but nice to fix eventually + - "-llvm-include-order" + + # Dubious value, would kill readability + - "-readability-isolate-declaration" + + # TODO: nice to fix eventually + - "-bugprone-narrowing-conversions" + - "-cppcoreguidelines-narrowing-conversions" + + # TODO: check, fix, and more in general constify wherever possible + - "-cppcoreguidelines-avoid-non-const-global-variables" + + # TODO: check paths where it might make sense to improve performance + - "-altera-unroll-loops" + - "-altera-id-dependent-backward-branch" + + # Not much can be done about them other than being careful + - "-bugprone-easily-swappable-parameters" + + # TODO: split reported functions + - "-readability-function-cognitive-complexity" + + # "Poor" alignment needed for structs reflecting message formats/headers + - "-altera-struct-pack-align" + + # TODO: check again if multithreading is implemented + - "-concurrency-mt-unsafe" + + # Complains about any identifier <3 characters, reasonable for + # globals, pointlessly verbose for locals and parameters. + - "-readability-identifier-length" + + # Wants to include headers which *directly* provide the things + # we use. That sounds nice, but means it will often want an OS + # specific header instead of a mostly standard one, such as + # <linux/limits.h> instead of <limits.h>. + - "-misc-include-cleaner" + + # Wants to replace all #defines of integers with enums. Kind of + # makes sense when those defines form an enum-like set, but + # weird for cases like standalone constants, and causes other + # awkwardness for a bunch of cases we use + - "-cppcoreguidelines-macro-to-enum" + + # It's been a couple of centuries since multiplication has been granted + # precedence over addition in modern mathematical notation. 
Adding + # parentheses to reinforce that certainly won't improve readability. + - "-readability-math-missing-parentheses" +WarningsAsErrors: "*" +HeaderFileExtensions: + - h +ImplementationFileExtensions: + - c +HeaderFilterRegex: "" +FormatStyle: none +CheckOptions: + bugprone-suspicious-string-compare.WarnOnImplicitComparison: "false" +SystemHeaders: false @@ -0,0 +1,3 @@ +CompileFlags: + # Don't try to interpret our headers as C++' + Add: [-xc, -Wall] @@ -3,8 +3,10 @@ /passt.avx2 /pasta /pasta.avx2 +/passt-repair /qrap /pasta.1 /seccomp.h +/seccomp_repair.h /c*.json README.plain.md @@ -15,65 +15,47 @@ VERSION ?= $(shell git describe --tags HEAD 2>/dev/null || echo "unknown\ versio # the IPv6 socket API? (Linux does) DUAL_STACK_SOCKETS := 1 -RLIMIT_STACK_VAL := $(shell /bin/sh -c 'ulimit -s') -ifeq ($(RLIMIT_STACK_VAL),unlimited) -RLIMIT_STACK_VAL := 1024 -endif - TARGET ?= $(shell $(CC) -dumpmachine) +$(if $(TARGET),,$(error Failed to get target architecture)) # Get 'uname -m'-like architecture description for target -TARGET_ARCH := $(shell echo $(TARGET) | cut -f1 -d- | tr [A-Z] [a-z]) -TARGET_ARCH := $(shell echo $(TARGET_ARCH) | sed 's/powerpc/ppc/') - -AUDIT_ARCH := $(shell echo $(TARGET_ARCH) | tr [a-z] [A-Z] | sed 's/^ARM.*/ARM/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/I[456]86/I386/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/MIPS64EL/MIPSEL64/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/HPPA/PARISC/') -AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/SH4/SH/') +TARGET_ARCH := $(firstword $(subst -, ,$(TARGET))) +TARGET_ARCH := $(patsubst [:upper:],[:lower:],$(TARGET_ARCH)) +TARGET_ARCH := $(patsubst arm%,arm,$(TARGET_ARCH)) +TARGET_ARCH := $(subst powerpc,ppc,$(TARGET_ARCH)) + +# On some systems enabling optimization also enables source fortification, +# automagically. Do not override it. 
+FORTIFY_FLAG := +ifeq ($(shell $(CC) -O2 -dM -E - < /dev/null 2>&1 | grep ' _FORTIFY_SOURCE ' > /dev/null; echo $$?),1) +FORTIFY_FLAG := -D_FORTIFY_SOURCE=2 +endif -FLAGS := -Wall -Wextra -Wno-format-zero-length +FLAGS := -Wall -Wextra -Wno-format-zero-length -Wformat-security FLAGS += -pedantic -std=c11 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -FLAGS += -D_FORTIFY_SOURCE=2 -O2 -pie -fPIE +FLAGS += $(FORTIFY_FLAG) -O2 -pie -fPIE FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) -FLAGS += -DNETNS_RUN_DIR=\"/run/netns\" -FLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH) -FLAGS += -DRLIMIT_STACK_VAL=$(RLIMIT_STACK_VAL) -FLAGS += -DARCH=\"$(TARGET_ARCH)\" FLAGS += -DVERSION=\"$(VERSION)\" FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ - ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_splice.c udp.c util.c + ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \ + repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \ + udp_vu.c util.c vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c -SRCS = $(PASST_SRCS) $(QRAP_SRCS) +PASST_REPAIR_SRCS = passt-repair.c +SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS) -MANPAGES = passt.1 pasta.1 qrap.1 +MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ - lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ - siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h + lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \ + pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \ + tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \ + udp_vu.h util.h vhost_user.h virtio.h vu_common.h HEADERS = $(PASST_HEADERS) seccomp.h -C := 
\#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - FLAGS += -DHAS_SND_WND -endif - -C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 }; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - FLAGS += -DHAS_BYTES_ACKED -endif - -C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_min_rtt = 0 }; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - FLAGS += -DHAS_MIN_RTT -endif - C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) FLAGS += -DHAS_GETRANDOM @@ -83,11 +65,6 @@ ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - -o - >/dev/null 2>&1; ec FLAGS += -fstack-protector-strong endif -C := \#define _GNU_SOURCE\n\#include <fcntl.h>\nint x = FALLOC_FL_COLLAPSE_RANGE; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - EXTRA_SYSCALLS += fallocate -endif - prefix ?= /usr/local exec_prefix ?= $(prefix) bindir ?= $(exec_prefix)/bin @@ -97,9 +74,9 @@ mandir ?= $(datarootdir)/man man1dir ?= $(mandir)/man1 ifeq ($(TARGET_ARCH),x86_64) -BIN := passt passt.avx2 pasta pasta.avx2 qrap +BIN := passt passt.avx2 pasta pasta.avx2 qrap passt-repair else -BIN := passt pasta qrap +BIN := passt pasta qrap passt-repair endif all: $(BIN) $(MANPAGES) docs @@ -108,7 +85,10 @@ static: FLAGS += -static -DGLIBC_NO_STATIC_NSS static: clean all seccomp.h: seccomp.sh $(PASST_SRCS) $(PASST_HEADERS) - @ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh $(PASST_SRCS) $(PASST_HEADERS) + @ EXTRA_SYSCALLS="$(EXTRA_SYSCALLS)" ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp.h $(PASST_SRCS) $(PASST_HEADERS) + +seccomp_repair.h: seccomp.sh $(PASST_REPAIR_SRCS) + @ ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp_repair.h $(PASST_REPAIR_SRCS) 
passt: $(PASST_SRCS) $(HEADERS) $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_SRCS) -o passt $(LDFLAGS) @@ -124,17 +104,21 @@ pasta.avx2 pasta.1 pasta: pasta%: passt% ln -sf $< $@ qrap: $(QRAP_SRCS) passt.h - $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS) + $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS) + +passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h + $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS) valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ - getpid gettid kill clock_gettime mmap \ - munmap open unlink gettimeofday futex + rt_sigreturn getpid gettid kill clock_gettime \ + mmap|mmap2 munmap open unlink gettimeofday futex \ + statx readlink valgrind: FLAGS += -g -DVALGRIND valgrind: all .PHONY: clean clean: - $(RM) $(BIN) *~ *.o seccomp.h pasta.1 \ + $(RM) $(BIN) *~ *.o seccomp.h seccomp_repair.h pasta.1 \ passt.tar passt.tar.gz *.deb *.rpm \ passt.pid README.plain.md @@ -188,111 +172,11 @@ docs: README.md done < README.md; \ ) > README.plain.md -# Checkers currently disabled for clang-tidy: -# - llvmlibc-restrict-system-libc-headers -# TODO: this is Linux-only for the moment, nice to fix eventually -# -# - bugprone-macro-parentheses -# - google-readability-braces-around-statements -# - hicpp-braces-around-statements -# - readability-braces-around-statements -# Debatable whether that improves readability, right now it would look -# like a mess -# -# - readability-magic-numbers -# - cppcoreguidelines-avoid-magic-numbers -# TODO: in most cases they are justified, but probably not everywhere -# -# - clang-analyzer-valist.Uninitialized -# TODO: enable once https://bugs.llvm.org/show_bug.cgi?id=41311 is fixed -# -# - clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling -# Probably not doable to impement this without plain memcpy(), memset() -# -# - cppcoreguidelines-init-variables -# Dubious value, would kill 
readability -# -# - hicpp-signed-bitwise -# Those are needed for syscalls, epoll_wait flags, etc. -# -# - llvm-include-order -# TODO: not really important, but nice to fix eventually -# -# - readability-isolate-declaration -# Dubious value, would kill readability -# -# - bugprone-narrowing-conversions -# - cppcoreguidelines-narrowing-conversions -# TODO: nice to fix eventually -# -# - cppcoreguidelines-avoid-non-const-global-variables -# TODO: check, fix, and more in general constify wherever possible -# -# - altera-unroll-loops -# - altera-id-dependent-backward-branch -# TODO: check paths where it might make sense to improve performance -# -# - bugprone-easily-swappable-parameters -# Not much can be done about them other than being careful -# -# - readability-function-cognitive-complexity -# TODO: split reported functions -# -# - altera-struct-pack-align -# "Poor" alignment needed for structs reflecting message formats/headers -# -# - concurrency-mt-unsafe -# TODO: check again if multithreading is implemented -# -# - readability-identifier-length -# Complains about any identifier <3 characters, reasonable for -# globals, pointlessly verbose for locals and parameters. -# -# - bugprone-assignment-in-if-condition -# Dubious value over the compiler's built-in warning. Would -# increase verbosity. -# -# - misc-include-cleaner -# Wants to include headers which *directly* provide the things -# we use. That sounds nice, but means it will often want a OS -# specific header instead of a mostly standard one, such as -# <linux/limits.h> instead of <limits.h>. 
- -clang-tidy: $(SRCS) $(HEADERS) - clang-tidy -checks=*,-modernize-*,\ - -clang-analyzer-valist.Uninitialized,\ - -cppcoreguidelines-init-variables,\ - -bugprone-assignment-in-if-condition,\ - -bugprone-macro-parentheses,\ - -google-readability-braces-around-statements,\ - -hicpp-braces-around-statements,\ - -readability-braces-around-statements,\ - -readability-magic-numbers,\ - -llvmlibc-restrict-system-libc-headers,\ - -hicpp-signed-bitwise,\ - -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\ - -llvm-include-order,\ - -cppcoreguidelines-avoid-magic-numbers,\ - -readability-isolate-declaration,\ - -bugprone-narrowing-conversions,\ - -cppcoreguidelines-narrowing-conversions,\ - -cppcoreguidelines-avoid-non-const-global-variables,\ - -altera-unroll-loops,-altera-id-dependent-backward-branch,\ - -bugprone-easily-swappable-parameters,\ - -readability-function-cognitive-complexity,\ - -altera-struct-pack-align,\ - -concurrency-mt-unsafe,\ - -readability-identifier-length,\ - -misc-include-cleaner \ - -config='{CheckOptions: [{key: bugprone-suspicious-string-compare.WarnOnImplicitComparison, value: "false"}]}' \ - --warnings-as-errors=* $(SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -DCLANG_TIDY_58992 +clang-tidy: $(PASST_SRCS) $(HEADERS) + clang-tidy $(PASST_SRCS) -- $(filter-out -pie,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \ + -DCLANG_TIDY_58992 -SYSTEM_INCLUDES := /usr/include $(wildcard /usr/include/$(TARGET)) -ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1) -VER := $(shell $(CC) -dumpversion) -SYSTEM_INCLUDES += /usr/lib/gcc/$(TARGET)/$(VER)/include -endif -cppcheck: $(SRCS) $(HEADERS) +cppcheck: $(PASST_SRCS) $(HEADERS) if cppcheck --check-level=exhaustive /dev/null > /dev/null 2>&1; then \ CPPCHECK_EXHAUSTIVE="--check-level=exhaustive"; \ else \ @@ -301,11 +185,8 @@ cppcheck: $(SRCS) $(HEADERS) cppcheck --std=c11 --error-exitcode=1 --enable=all --force \ --inconclusive --library=posix --quiet \ $${CPPCHECK_EXHAUSTIVE} \ - 
$(SYSTEM_INCLUDES:%=-I%) \ - $(SYSTEM_INCLUDES:%=--config-exclude=%) \ - $(SYSTEM_INCLUDES:%=--suppress=*:%/*) \ - $(SYSTEM_INCLUDES:%=--suppress=unmatchedSuppression:%/*) \ --inline-suppr \ + --suppress=missingIncludeSystem \ --suppress=unusedStructMember \ - $(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) \ - . + $(filter -D%,$(FLAGS) $(CFLAGS) $(CPPFLAGS)) -D CPPCHECK_6936 \ + $(PASST_SRCS) $(HEADERS) @@ -321,7 +321,7 @@ speeding up local connections, and usually requiring NAT. _pasta_: protocol * ✅ 4 to 50 times IPv4 TCP throughput of existing, conceptually similar solutions depending on MTU (UDP and IPv6 hard to compare) -* 🛠 [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for +* ✅ [_vhost-user_ support](https://bugs.passt.top/show_bug.cgi?id=25) for maximum one copy on every data path and lower request-response latency * ⌚ [multithreading](https://bugs.passt.top/show_bug.cgi?id=13) * ⌚ [raw IP socket support](https://bugs.passt.top/show_bug.cgi?id=14) if @@ -338,20 +338,24 @@ speeding up local connections, and usually requiring NAT. 
_pasta_: [_slirp4netns_ replacement](/passt/tree/slirp4netns.sh) * ✅ out-of-tree patch for [Kata Containers](/passt/tree/contrib/kata-containers) available -* ⌚ drop-in replacement for VPNKit (rootless Docker) +* ✅ rootless Docker + [network back-end](https://docs.docker.com/engine/security/rootless/#networking-errors) + via moby/rootlesskit ### Availability * official packages for: + * ✅ [Alpine Linux](https://pkgs.alpinelinux.org/packages?name=passt) * ✅ [Arch Linux](https://archlinux.org/packages/extra/x86_64/passt/) ([aarch64](https://archlinuxarm.org/packages/aarch64/passt), [i486](https://www.archlinux32.org/packages/?q=passt)) * ✅ [CentOS Stream](https://gitlab.com/redhat/centos-stream/rpms/passt) * ✅ [Debian](https://tracker.debian.org/pkg/passt) * ✅ [Fedora](https://src.fedoraproject.org/rpms/passt) * ✅ [Gentoo](https://packages.gentoo.org/packages/net-misc/passt) + * ✅ [GNU Guix](https://packages.guix.gnu.org/packages/passt/) + * ✅ [openSUSE](https://build.opensuse.org/package/requests/Virtualization:containers/passt) * ✅ [Ubuntu](https://launchpad.net/ubuntu/+source/passt) * ✅ [Void Linux](https://voidlinux.org/packages/?q=passt) * unofficial packages for: * ✅ [EPEL, Mageia](https://copr.fedorainfracloud.org/coprs/sbrivio/passt/) - * 🛠 [openSUSE](https://build.opensuse.org/package/show/Virtualization:containers/passt) * ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64 static builds for other RPM-based distributions * ✅ unofficial [packages](https://passt.top/builds/latest/x86_64/) from x86_64 @@ -396,7 +400,7 @@ services: and nameserver using SLAAC * [DHCPv6 server](/passt/tree/dhcpv6.c): a simple implementation handing out one single IPv6 address to the guest or namespace, - namely, the the same address as the first one configured for the upstream host + namely, the same address as the first one configured for the upstream host interface, and passing the nameservers configured on the host ## Addresses @@ -18,6 +18,9 @@ 
#include <string.h> #include <unistd.h> +#include "log.h" +#include "util.h" + /** * arch_avx2_exec() - Switch to AVX2 build if supported * @argv: Arguments from command line @@ -28,10 +31,8 @@ void arch_avx2_exec(char **argv) char exe[PATH_MAX] = { 0 }; const char *p; - if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) { - perror("readlink /proc/self/exe"); - exit(EXIT_FAILURE); - } + if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) + die_perror("Failed to read own /proc/self/exe link"); p = strstr(exe, ".avx2"); if (p && strlen(p) == strlen(".avx2")) @@ -40,9 +41,12 @@ void arch_avx2_exec(char **argv) if (__builtin_cpu_supports("avx2")) { char new_path[PATH_MAX + sizeof(".avx2")]; - snprintf(new_path, PATH_MAX + sizeof(".avx2"), "%s.avx2", exe); - execve(new_path, argv, environ); - perror("Can't run AVX2 build, using non-AVX2 version"); + if (snprintf_check(new_path, PATH_MAX + sizeof(".avx2"), + "%s.avx2", exe)) + die_perror("Can't build AVX2 executable path"); + + execv(new_path, argv); + warn_perror("Can't run AVX2 build, using non-AVX2 version"); } } #else @@ -43,7 +43,7 @@ int arp(const struct ctx *c, const struct pool *p) struct ethhdr *eh; struct arphdr *ah; struct arpmsg *am; - size_t len; + size_t l2len; eh = packet_get(p, 0, 0, sizeof(*eh), NULL); ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL); @@ -59,30 +59,28 @@ int arp(const struct ctx *c, const struct pool *p) ah->ar_op != htons(ARPOP_REQUEST)) return 1; - /* Discard announcements (but not 0.0.0.0 "probes"): we might have the - * same IP address, hide that. - */ - if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) && + /* Discard announcements, but not 0.0.0.0 "probes" */ + if (memcmp(am->sip, &in4addr_any, sizeof(am->sip)) && !memcmp(am->sip, am->tip, sizeof(am->sip))) return 1; - /* Don't resolve our own address, either. */ + /* Don't resolve the guest's assigned address, either. 
*/ if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip))) return 1; ah->ar_op = htons(ARPOP_REPLY); memcpy(am->tha, am->sha, sizeof(am->tha)); - memcpy(am->sha, c->mac, sizeof(am->sha)); + memcpy(am->sha, c->our_tap_mac, sizeof(am->sha)); memcpy(swap, am->tip, sizeof(am->tip)); memcpy(am->tip, am->sip, sizeof(am->tip)); memcpy(am->sip, swap, sizeof(am->sip)); - len = sizeof(*eh) + sizeof(*ah) + sizeof(*am); + l2len = sizeof(*eh) + sizeof(*ah) + sizeof(*am); memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest)); - memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); - tap_send_single(c, eh, len); + tap_send_single(c, eh, l2len); return 1; } @@ -59,6 +59,7 @@ #include "util.h" #include "ip.h" #include "checksum.h" +#include "iov.h" /* Checksums are optional for UDP over IPv4, so we usually just set * them to 0. Change this to 1 to calculate real UDP over IPv4 @@ -84,7 +85,7 @@ */ /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ __attribute__((optimize("-fno-strict-aliasing"))) -uint32_t sum_16b(const void *buf, size_t len) +static uint32_t sum_16b(const void *buf, size_t len) { const uint16_t *p = buf; uint32_t sum = 0; @@ -106,7 +107,7 @@ uint32_t sum_16b(const void *buf, size_t len) * * Return: 16-bit folded sum */ -uint16_t csum_fold(uint32_t sum) +static uint16_t csum_fold(uint32_t sum) { while (sum >> 16) sum = (sum & 0xffff) + (sum >> 16); @@ -116,19 +117,19 @@ uint16_t csum_fold(uint32_t sum) /** * csum_ip4_header() - Calculate IPv4 header checksum - * @tot_len: IPv4 payload length (data + IP header, network order) - * @protocol: Protocol number (network order) - * @saddr: IPv4 source address (network order) - * @daddr: IPv4 destination address (network order) + * @l3len: IPv4 packet length (host order) + * @protocol: Protocol number + * @saddr: IPv4 source address + * @daddr: IPv4 destination address * * Return: 16-bit folded sum of the IPv4 header */ -uint16_t csum_ip4_header(uint16_t 
tot_len, uint8_t protocol, +uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol, struct in_addr saddr, struct in_addr daddr) { uint32_t sum = L2_BUF_IP4_PSUM(protocol); - sum += tot_len; + sum += htons(l3len); sum += (saddr.s_addr >> 16) & 0xffff; sum += saddr.s_addr & 0xffff; sum += (daddr.s_addr >> 16) & 0xffff; @@ -140,13 +141,13 @@ uint16_t csum_ip4_header(uint16_t tot_len, uint8_t protocol, /** * proto_ipv4_header_psum() - Calculates the partial checksum of an * IPv4 header for UDP or TCP - * @tot_len: IPv4 Payload length (host order) - * @proto: Protocol number (host order) - * @saddr: Source address (network order) - * @daddr: Destination address (network order) + * @l4len: IPv4 Payload length (host order) + * @proto: Protocol number + * @saddr: Source address + * @daddr: Destination address * Returns: Partial checksum of the IPv4 header */ -uint32_t proto_ipv4_header_psum(uint16_t tot_len, uint8_t protocol, +uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol, struct in_addr saddr, struct in_addr daddr) { uint32_t psum = htons(protocol); @@ -155,32 +156,47 @@ uint32_t proto_ipv4_header_psum(uint16_t tot_len, uint8_t protocol, psum += saddr.s_addr & 0xffff; psum += (daddr.s_addr >> 16) & 0xffff; psum += daddr.s_addr & 0xffff; - psum += htons(tot_len); + psum += htons(l4len); return psum; } /** + * csum() - Compute TCP/IP-style checksum + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 16-bit folded, complemented checksum + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ +static uint16_t csum(const void *buf, size_t len, uint32_t init) +{ + return (uint16_t)~csum_fold(csum_unfolded(buf, len, init)); +} + +/** * csum_udp4() - Calculate and set checksum for a UDP over IPv4 packet * @udp4hr: UDP header, initialised apart from checksum * @saddr: IPv4 source address * @daddr: IPv4 
destination address - * @payload: ICMPv4 packet payload - * @len: Length of @payload (not including UDP) + * @data: UDP payload (as IO vector tail) */ void csum_udp4(struct udphdr *udp4hr, struct in_addr saddr, struct in_addr daddr, - const void *payload, size_t len) + struct iov_tail *data) { /* UDP checksums are optional, so don't bother */ udp4hr->check = 0; if (UDP4_REAL_CHECKSUMS) { - uint16_t tot_len = len + sizeof(struct udphdr); - uint32_t psum = proto_ipv4_header_psum(tot_len, IPPROTO_UDP, + uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr); + uint32_t psum = proto_ipv4_header_psum(l4len, IPPROTO_UDP, saddr, daddr); + psum = csum_unfolded(udp4hr, sizeof(struct udphdr), psum); - udp4hr->check = csum(payload, len, psum); + udp4hr->check = csum_iov_tail(data, psum); } } @@ -188,9 +204,9 @@ void csum_udp4(struct udphdr *udp4hr, * csum_icmp4() - Calculate and set checksum for an ICMP packet * @icmp4hr: ICMP header, initialised apart from checksum * @payload: ICMP packet payload - * @len: Length of @payload (not including ICMP header) + * @dlen: Length of @payload (not including ICMP header) */ -void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t len) +void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen) { uint32_t psum; @@ -199,16 +215,16 @@ void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t len) /* Partial checksum for ICMP header alone */ psum = sum_16b(icmp4hr, sizeof(*icmp4hr)); - icmp4hr->checksum = csum(payload, len, psum); + icmp4hr->checksum = csum(payload, dlen, psum); } /** * proto_ipv6_header_psum() - Calculates the partial checksum of an * IPv6 header for UDP or TCP * @payload_len: IPv6 payload length (host order) - * @proto: Protocol number (host order) - * @saddr: Source address (network order) - * @daddr: Destination address (network order) + * @proto: Protocol number + * @saddr: Source address + * @daddr: Destination address * Returns: Partial checksum of the IPv6 header */ 
uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol, @@ -226,19 +242,22 @@ uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol, /** * csum_udp6() - Calculate and set checksum for a UDP over IPv6 packet * @udp6hr: UDP header, initialised apart from checksum - * @payload: UDP packet payload - * @len: Length of @payload (not including UDP header) + * @saddr: Source address + * @daddr: Destination address + * @data: UDP payload (as IO vector tail) */ void csum_udp6(struct udphdr *udp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, - const void *payload, size_t len) + struct iov_tail *data) { - uint32_t psum = proto_ipv6_header_psum(len + sizeof(struct udphdr), - IPPROTO_UDP, saddr, daddr); + uint16_t l4len = iov_tail_size(data) + sizeof(struct udphdr); + uint32_t psum = proto_ipv6_header_psum(l4len, IPPROTO_UDP, + saddr, daddr); + udp6hr->check = 0; psum = csum_unfolded(udp6hr, sizeof(struct udphdr), psum); - udp6hr->check = csum(payload, len, psum); + udp6hr->check = csum_iov_tail(data, psum); } /** @@ -247,21 +266,19 @@ void csum_udp6(struct udphdr *udp6hr, * @saddr: IPv6 source address * @daddr: IPv6 destination address * @payload: ICMP packet payload - * @len: Length of @payload (not including ICMPv6 header) + * @dlen: Length of @payload (not including ICMPv6 header) */ void csum_icmp6(struct icmp6hdr *icmp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, - const void *payload, size_t len) + const void *payload, size_t dlen) { - /* Partial checksum for the pseudo-IPv6 header */ - uint32_t psum = sum_16b(saddr, sizeof(*saddr)) + - sum_16b(daddr, sizeof(*daddr)) + - htons(len + sizeof(*icmp6hr)) + htons(IPPROTO_ICMPV6); + uint32_t psum = proto_ipv6_header_psum(dlen + sizeof(*icmp6hr), + IPPROTO_ICMPV6, saddr, daddr); icmp6hr->icmp6_cksum = 0; /* Add in partial checksum for the ICMPv6 header alone */ psum += sum_16b(icmp6hr, sizeof(*icmp6hr)); - icmp6hr->icmp6_cksum = csum(payload, len, psum); + 
icmp6hr->icmp6_cksum = csum(payload, dlen, psum); } #ifdef __AVX2__ @@ -450,7 +467,8 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) intptr_t align = ROUND_UP((intptr_t)buf, sizeof(__m256i)); unsigned int pad = align - (intptr_t)buf; - if (len < pad) + /* Don't mix sum_16b() and csum_avx2() with odd padding lengths */ + if (pad & 1 || len < pad) pad = len; if (pad) @@ -480,36 +498,23 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) #endif /* !__AVX2__ */ /** - * csum() - Compute TCP/IP-style checksum - * @buf: Input buffer - * @len: Input length - * @init: Initial 32-bit checksum, 0 for no pre-computed checksum - * - * Return: 16-bit folded, complemented checksum - */ -/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ -__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ -uint16_t csum(const void *buf, size_t len, uint32_t init) -{ - return (uint16_t)~csum_fold(csum_unfolded(buf, len, init)); -} - -/** - * csum_iov() - Calculates the unfolded checksum over an array of IO vectors - * - * @iov Pointer to the array of IO vectors - * @n Length of the array + * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector + * @tail: IO vector tail to checksum * @init Initial 32-bit checksum, 0 for no pre-computed checksum * * Return: 16-bit folded, complemented checksum */ -/* cppcheck-suppress unusedFunction */ -uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init) +uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init) { - unsigned int i; - - for (i = 0; i < n; i++) - init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init); - + if (iov_tail_prune(tail)) { + size_t i; + + init = csum_unfolded((char *)tail->iov[0].iov_base + tail->off, + tail->iov[0].iov_len - tail->off, init); + for (i = 1; i < tail->cnt; i++) { + const struct iovec *iov = &tail->iov[i]; + init = csum_unfolded(iov->iov_base, iov->iov_len, init); + } + } return (uint16_t)~csum_fold(init); } @@ -9,29 
+9,27 @@ struct udphdr; struct icmphdr; struct icmp6hdr; +struct iov_tail; -uint32_t sum_16b(const void *buf, size_t len); -uint16_t csum_fold(uint32_t sum); uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init); -uint16_t csum_ip4_header(uint16_t tot_len, uint8_t protocol, +uint16_t csum_ip4_header(uint16_t l3len, uint8_t protocol, struct in_addr saddr, struct in_addr daddr); -uint32_t proto_ipv4_header_psum(uint16_t tot_len, uint8_t protocol, +uint32_t proto_ipv4_header_psum(uint16_t l4len, uint8_t protocol, struct in_addr saddr, struct in_addr daddr); void csum_udp4(struct udphdr *udp4hr, struct in_addr saddr, struct in_addr daddr, - const void *payload, size_t len); -void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t len); + struct iov_tail *data); +void csum_icmp4(struct icmphdr *icmp4hr, const void *payload, size_t dlen); uint32_t proto_ipv6_header_psum(uint16_t payload_len, uint8_t protocol, const struct in6_addr *saddr, const struct in6_addr *daddr); void csum_udp6(struct udphdr *udp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, - const void *payload, size_t len); + struct iov_tail *data); void csum_icmp6(struct icmp6hdr *icmp6hr, const struct in6_addr *saddr, const struct in6_addr *daddr, - const void *payload, size_t len); + const void *payload, size_t dlen); uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init); -uint16_t csum(const void *buf, size_t len, uint32_t init); -uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init); +uint16_t csum_iov_tail(struct iov_tail *tail, uint32_t init); #endif /* CHECKSUM_H */ @@ -16,6 +16,7 @@ #include <errno.h> #include <fcntl.h> #include <getopt.h> +#include <libgen.h> #include <string.h> #include <sched.h> #include <sys/types.h> @@ -38,12 +39,30 @@ #include "ip.h" #include "passt.h" #include "netlink.h" +#include "tap.h" #include "udp.h" #include "tcp.h" #include "pasta.h" #include "lineread.h" #include "isolation.h" #include "log.h" 
+#include "vhost_user.h" + +#define NETNS_RUN_DIR "/run/netns" + +#define IP4_LL_GUEST_ADDR (struct in_addr){ htonl_constant(0xa9fe0201) } + /* 169.254.2.1, libslirp default: 10.0.2.1 */ + +#define IP4_LL_GUEST_GW (struct in_addr){ htonl_constant(0xa9fe0202) } + /* 169.254.2.2, libslirp default: 10.0.2.2 */ + +#define IP4_LL_PREFIX_LEN 16 + +#define IP6_LL_GUEST_GW (struct in6_addr) \ + {{{ 0xfe, 0x80, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0x01 }}} + +const char *pasta_default_ifn = "tap0"; /** * next_chunk - Return the next piece of a string delimited by a character @@ -106,6 +125,75 @@ static int parse_port_range(const char *s, char **endptr, } /** + * conf_ports_range_except() - Set up forwarding for a range of ports minus a + * bitmap of exclusions + * @c: Execution context + * @optname: Short option name, t, T, u, or U + * @optarg: Option argument (port specification) + * @fwd: Pointer to @fwd_ports to be updated + * @addr: Listening address + * @ifname: Listening interface + * @first: First port to forward + * @last: Last port to forward + * @exclude: Bitmap of ports to exclude + * @to: Port to translate @first to when forwarding + * @weak: Ignore errors, as long as at least one port is mapped + */ +static void conf_ports_range_except(const struct ctx *c, char optname, + const char *optarg, struct fwd_ports *fwd, + const union inany_addr *addr, + const char *ifname, + uint16_t first, uint16_t last, + const uint8_t *exclude, uint16_t to, + bool weak) +{ + bool bound_one = false; + unsigned i; + int ret; + + if (first == 0) { + die("Can't forward port 0 for option '-%c %s'", + optname, optarg); + } + + for (i = first; i <= last; i++) { + if (bitmap_isset(exclude, i)) + continue; + + if (bitmap_isset(fwd->map, i)) { + warn( +"Altering mapping of already mapped port number: %s", optarg); + } + + bitmap_set(fwd->map, i); + fwd->delta[i] = to - first; + + if (optname == 't') + ret = tcp_sock_init(c, addr, ifname, i); + else if (optname == 'u') + ret = 
udp_sock_init(c, 0, addr, ifname, i); + else + /* No way to check in advance for -T and -U */ + ret = 0; + + if (ret == -ENFILE || ret == -EMFILE) { + die("Can't open enough sockets for port specifier: %s", + optarg); + } + + if (!ret) { + bound_one = true; + } else if (!weak) { + die("Failed to bind port %u (%s) for option '-%c %s'", + i, strerror_(-ret), optname, optarg); + } + } + + if (!bound_one) + die("Failed to bind any port for '-%c %s'", optname, optarg); +} + +/** * conf_ports() - Parse port configuration options, initialise UDP/TCP sockets * @c: Execution context * @optname: Short option name, t, T, u, or U @@ -115,13 +203,11 @@ static int parse_port_range(const char *s, char **endptr, static void conf_ports(const struct ctx *c, char optname, const char *optarg, struct fwd_ports *fwd) { - char addr_buf[sizeof(struct in6_addr)] = { 0 }, *addr = addr_buf; + union inany_addr addr_buf = inany_any6, *addr = &addr_buf; char buf[BUFSIZ], *spec, *ifname = NULL, *p; - bool exclude_only = true, bound_one = false; uint8_t exclude[PORT_BITMAP_SIZE] = { 0 }; - sa_family_t af = AF_UNSPEC; + bool exclude_only = true; unsigned i; - int ret; if (!strcmp(optarg, "none")) { if (fwd->mode) @@ -131,6 +217,11 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, return; } + if ((optname == 't' || optname == 'T') && c->no_tcp) + die("TCP port forwarding requested but TCP is disabled"); + if ((optname == 'u' || optname == 'U') && c->no_udp) + die("UDP port forwarding requested but UDP is disabled"); + if (!strcmp(optarg, "auto")) { if (fwd->mode) goto mode_conflict; @@ -146,33 +237,20 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, if (fwd->mode) goto mode_conflict; - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("'all' port forwarding is only allowed for passt"); fwd->mode = FWD_ALL; - memset(fwd->map, 0xff, PORT_EPHEMERAL_MIN / 8); - - for (i = 0; i < PORT_EPHEMERAL_MIN; i++) { - if (optname == 't') { - 
ret = tcp_sock_init(c, AF_UNSPEC, NULL, NULL, - i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } else if (optname == 'u') { - ret = udp_sock_init(c, 0, AF_UNSPEC, NULL, NULL, - i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } - } - if (!bound_one) - goto bind_all_fail; + /* Exclude ephemeral ports */ + for (i = 0; i < NUM_PORTS; i++) + if (fwd_port_is_ephemeral(i)) + bitmap_set(exclude, i); + conf_ports_range_except(c, optname, optarg, fwd, + NULL, NULL, + 1, NUM_PORTS - 1, exclude, + 1, true); return; } @@ -203,14 +281,20 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, } - if (ifname == buf + 1) /* Interface without address */ + if (ifname == buf + 1) { /* Interface without address */ addr = NULL; - else if (inet_pton(AF_INET, buf, addr)) - af = AF_INET; - else if (inet_pton(AF_INET6, buf, addr)) - af = AF_INET6; - else - goto bad; + } else { + p = buf; + + /* Allow square brackets for IPv4 too for convenience */ + if (*p == '[' && p[strlen(p) - 1] == ']') { + p[strlen(p) - 1] = '\0'; + p++; + } + + if (!inany_pton(p, addr)) + goto bad; + } } else { spec = buf; @@ -243,33 +327,15 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, } while ((p = next_chunk(p, ','))); if (exclude_only) { - for (i = 0; i < PORT_EPHEMERAL_MIN; i++) { - if (bitmap_isset(exclude, i)) - continue; - - bitmap_set(fwd->map, i); - - if (optname == 't') { - ret = tcp_sock_init(c, af, addr, ifname, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } else if (optname == 'u') { - ret = udp_sock_init(c, 0, af, addr, ifname, i); - if (ret == -ENFILE || ret == -EMFILE) - goto enfile; - if (!ret) - bound_one = true; - } else { - /* No way to check in advance for -T and -U */ - bound_one = true; - } - } - - if (!bound_one) - goto bind_all_fail; - + /* Exclude ephemeral ports */ + for (i = 0; i < NUM_PORTS; 
i++) + if (fwd_port_is_ephemeral(i)) + bitmap_set(exclude, i); + + conf_ports_range_except(c, optname, optarg, fwd, + addr, ifname, + 1, NUM_PORTS - 1, exclude, + 1, true); return; } @@ -298,94 +364,147 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, if ((*p != '\0') && (*p != ',')) /* Garbage after the ranges */ goto bad; - for (i = orig_range.first; i <= orig_range.last; i++) { - if (bitmap_isset(fwd->map, i)) - warn( -"Altering mapping of already mapped port number: %s", optarg); - - if (bitmap_isset(exclude, i)) - continue; - - bitmap_set(fwd->map, i); - - fwd->delta[i] = mapped_range.first - orig_range.first; - - ret = 0; - if (optname == 't') - ret = tcp_sock_init(c, af, addr, ifname, i); - else if (optname == 'u') - ret = udp_sock_init(c, 0, af, addr, ifname, i); - if (ret) - goto bind_fail; - } + conf_ports_range_except(c, optname, optarg, fwd, + addr, ifname, + orig_range.first, orig_range.last, + exclude, + mapped_range.first, false); } while ((p = next_chunk(p, ','))); return; -enfile: - die("Can't open enough sockets for port specifier: %s", optarg); bad: die("Invalid port specifier %s", optarg); mode_conflict: die("Port forwarding mode '%s' conflicts with previous mode", optarg); -bind_fail: - die("Failed to bind port %u (%s) for option '-%c %s', exiting", - i, strerror(-ret), optname, optarg); -bind_all_fail: - die("Failed to bind any port for '-%c %s', exiting", optname, optarg); } /** * add_dns4() - Possibly add the IPv4 address of a DNS resolver to configuration * @c: Execution context - * @addr: Address found in /etc/resolv.conf - * @conf: Pointer to reference of current entry in array of IPv4 resolvers + * @addr: Guest nameserver IPv4 address + * @idx: Index of free entry in array of IPv4 resolvers + * + * Return: Number of entries added (0 or 1) */ -static void add_dns4(struct ctx *c, const struct in_addr *addr, - struct in_addr **conf) +static unsigned add_dns4(struct ctx *c, const struct in_addr *addr, + 
unsigned idx) { - /* Guest or container can only access local addresses via redirect */ - if (IN4_IS_ADDR_LOOPBACK(addr)) { - if (!c->no_map_gw) { - **conf = c->ip4.gw; - (*conf)++; - - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) - c->ip4.dns_match = c->ip4.gw; - } - } else { - **conf = *addr; - (*conf)++; - } + if (idx >= ARRAY_SIZE(c->ip4.dns)) + return 0; - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)) - c->ip4.dns_host = *addr; + c->ip4.dns[idx] = *addr; + return 1; } /** * add_dns6() - Possibly add the IPv6 address of a DNS resolver to configuration * @c: Execution context - * @addr: Address found in /etc/resolv.conf - * @conf: Pointer to reference of current entry in array of IPv6 resolvers + * @addr: Guest nameserver IPv6 address + * @idx: Index of free entry in array of IPv6 resolvers + * + * Return: Number of entries added (0 or 1) + */ +static unsigned add_dns6(struct ctx *c, const struct in6_addr *addr, + unsigned idx) +{ + if (idx >= ARRAY_SIZE(c->ip6.dns)) + return 0; + + c->ip6.dns[idx] = *addr; + return 1; +} + +/** + * add_dns_resolv4() - Possibly add one IPv4 nameserver from host's resolv.conf + * @c: Execution context + * @ns: Nameserver address + * @idx: Pointer to index of current IPv4 resolver entry, set on return */ -static void add_dns6(struct ctx *c, - struct in6_addr *addr, struct in6_addr **conf) +static void add_dns_resolv4(struct ctx *c, struct in_addr *ns, unsigned *idx) { - /* Guest or container can only access local addresses via redirect */ - if (IN6_IS_ADDR_LOOPBACK(addr)) { - if (!c->no_map_gw) { - memcpy(*conf, &c->ip6.gw, sizeof(**conf)); - (*conf)++; - - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) - memcpy(&c->ip6.dns_match, addr, sizeof(*addr)); + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host)) + c->ip4.dns_host = *ns; + + /* Special handling if guest or container can only access local + * addresses via redirect, or if the host gateway is also a resolver and + * we shadow its address + */ + if (IN4_IS_ADDR_LOOPBACK(ns) 
|| + IN4_ARE_ADDR_EQUAL(ns, &c->ip4.map_host_loopback)) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match)) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) + return; /* Address unreachable */ + + *ns = c->ip4.map_host_loopback; + c->ip4.dns_match = c->ip4.map_host_loopback; + } else { + /* No general host mapping, but requested for DNS + * (--dns-forward and --no-map-gw): advertise resolver + * address from --dns-forward, and map that to loopback + */ + *ns = c->ip4.dns_match; } - } else { - memcpy(*conf, addr, sizeof(**conf)); - (*conf)++; } + *idx += add_dns4(c, ns, *idx); +} + +/** + * add_dns_resolv6() - Possibly add one IPv6 nameserver from host's resolv.conf + * @c: Execution context + * @ns: Nameserver address + * @idx: Pointer to index of current IPv6 resolver entry, set on return + */ +static void add_dns_resolv6(struct ctx *c, struct in6_addr *ns, unsigned *idx) +{ if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) - c->ip6.dns_host = *addr; + c->ip6.dns_host = *ns; + + /* Special handling if guest or container can only access local + * addresses via redirect, or if the host gateway is also a resolver and + * we shadow its address + */ + if (IN6_IS_ADDR_LOOPBACK(ns) || + IN6_ARE_ADDR_EQUAL(ns, &c->ip6.map_host_loopback)) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match)) { + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) + return; /* Address unreachable */ + + *ns = c->ip6.map_host_loopback; + c->ip6.dns_match = c->ip6.map_host_loopback; + } else { + /* No general host mapping, but requested for DNS + * (--dns-forward and --no-map-gw): advertise resolver + * address from --dns-forward, and map that to loopback + */ + *ns = c->ip6.dns_match; + } + } + + *idx += add_dns6(c, ns, *idx); +} + +/** + * add_dns_resolv() - Possibly add ns from host resolv.conf to configuration + * @c: Execution context + * @nameserver: Nameserver address string from /etc/resolv.conf + * @idx4: Pointer to index of current entry in array of IPv4 resolvers + * 
@idx6: Pointer to index of current entry in array of IPv6 resolvers + * + * @idx4 or @idx6 may be NULL, in which case resolvers of the corresponding type + * are ignored. + */ +static void add_dns_resolv(struct ctx *c, const char *nameserver, + unsigned *idx4, unsigned *idx6) +{ + struct in6_addr ns6; + struct in_addr ns4; + + if (idx4 && inet_pton(AF_INET, nameserver, &ns4)) + add_dns_resolv4(c, &ns4, idx4); + + if (idx6 && inet_pton(AF_INET6, nameserver, &ns6)) + add_dns_resolv6(c, &ns6, idx6); } /** @@ -394,18 +513,16 @@ static void add_dns6(struct ctx *c, */ static void get_dns(struct ctx *c) { - struct in6_addr *dns6 = &c->ip6.dns[0], dns6_tmp; - struct in_addr *dns4 = &c->ip4.dns[0], dns4_tmp; int dns4_set, dns6_set, dnss_set, dns_set, fd; + unsigned dns4_idx = 0, dns6_idx = 0; struct fqdn *s = c->dns_search; struct lineread resolvconf; - unsigned int added = 0; + ssize_t line_len; char *line, *end; const char *p; - int line_len; - dns4_set = !c->ifi4 || !IN4_IS_ADDR_UNSPECIFIED(dns4); - dns6_set = !c->ifi6 || !IN6_IS_ADDR_UNSPECIFIED(dns6); + dns4_set = !c->ifi4 || !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[0]); + dns6_set = !c->ifi6 || !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[0]); dnss_set = !!*s->n || c->no_dns_search; dns_set = (dns4_set && dns6_set) || c->no_dns; @@ -426,19 +543,9 @@ static void get_dns(struct ctx *c) if (end) *end = 0; - if (!dns4_set && - dns4 - &c->ip4.dns[0] < ARRAY_SIZE(c->ip4.dns) - 1 - && inet_pton(AF_INET, p + 1, &dns4_tmp)) { - add_dns4(c, &dns4_tmp, &dns4); - added++; - } - - if (!dns6_set && - dns6 - &c->ip6.dns[0] < ARRAY_SIZE(c->ip6.dns) - 1 - && inet_pton(AF_INET6, p + 1, &dns6_tmp)) { - add_dns6(c, &dns6_tmp, &dns6); - added++; - } + add_dns_resolv(c, p + 1, + dns4_set ? NULL : &dns4_idx, + dns6_set ? 
NULL : &dns6_idx); } else if (!dnss_set && strstr(line, "search ") == line && s == c->dns_search) { end = strpbrk(line, "\n"); @@ -452,7 +559,7 @@ static void get_dns(struct ctx *c) while (s - c->dns_search < ARRAY_SIZE(c->dns_search) - 1 /* cppcheck-suppress strtokCalled */ && (p = strtok(NULL, " \t"))) { - strncpy(s->n, p, sizeof(c->dns_search[0])); + strncpy(s->n, p, sizeof(c->dns_search[0]) - 1); s++; *s->n = 0; } @@ -460,12 +567,12 @@ static void get_dns(struct ctx *c) } if (line_len < 0) - warn("Error reading /etc/resolv.conf: %s", strerror(errno)); + warn_perror("Error reading /etc/resolv.conf"); close(fd); out: if (!dns_set) { - if (!added) + if (!(dns4_idx + dns6_idx)) warn("Couldn't get any nameserver address"); if (c->no_dhcp_dns) @@ -516,9 +623,6 @@ static void conf_netns_opt(char *netns, const char *arg) static void conf_pasta_ns(int *netns_only, char *userns, char *netns, int optind, int argc, char *argv[]) { - if (*netns_only && *userns) - die("Both --userns and --netns-only given"); - if (*netns && optind != argc) die("Both --netns and PID or command given"); @@ -532,10 +636,15 @@ static void conf_pasta_ns(int *netns_only, char *userns, char *netns, if (pidval < 0 || pidval > INT_MAX) die("Invalid PID %s", argv[optind]); - snprintf(netns, PATH_MAX, "/proc/%ld/ns/net", pidval); - if (!*userns) - snprintf(userns, PATH_MAX, "/proc/%ld/ns/user", - pidval); + if (snprintf_check(netns, PATH_MAX, + "/proc/%ld/ns/net", pidval)) + die_perror("Can't build netns path"); + + if (!*userns) { + if (snprintf_check(userns, PATH_MAX, + "/proc/%ld/ns/user", pidval)) + die_perror("Can't build userns path"); + } } } @@ -573,26 +682,25 @@ static int conf_ip4_prefix(const char *arg) * conf_ip4() - Verify or detect IPv4 support, get relevant addresses * @ifi: Host interface to attempt (0 to determine one) * @ip4: IPv4 context (will be written) - * @mac: MAC address to use (written if unset) * * Return: Interface index for IPv4, or 0 on failure. 
*/ -static unsigned int conf_ip4(unsigned int ifi, - struct ip4_ctx *ip4, unsigned char *mac) +static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4) { if (!ifi) ifi = nl_get_ext_if(nl_sock, AF_INET); if (!ifi) { - info("Couldn't pick external interface: disabling IPv4"); + debug("Failed to detect external interface for IPv4"); return 0; } - if (IN4_IS_ADDR_UNSPECIFIED(&ip4->gw)) { - int rc = nl_route_get_def(nl_sock, ifi, AF_INET, &ip4->gw); + if (IN4_IS_ADDR_UNSPECIFIED(&ip4->guest_gw)) { + int rc = nl_route_get_def(nl_sock, ifi, AF_INET, + &ip4->guest_gw); if (rc < 0) { - err("Couldn't discover IPv4 gateway address: %s", - strerror(-rc)); + debug("Couldn't discover IPv4 gateway address: %s", + strerror_(-rc)); return 0; } } @@ -601,8 +709,8 @@ static unsigned int conf_ip4(unsigned int ifi, int rc = nl_addr_get(nl_sock, ifi, AF_INET, &ip4->addr, &ip4->prefix_len, NULL); if (rc < 0) { - err("Couldn't discover IPv4 address: %s", - strerror(-rc)); + debug("Couldn't discover IPv4 address: %s", + strerror_(-rc)); return 0; } } @@ -619,35 +727,37 @@ static unsigned int conf_ip4(unsigned int ifi, ip4->prefix_len = 32; } - memcpy(&ip4->addr_seen, &ip4->addr, sizeof(ip4->addr_seen)); + ip4->addr_seen = ip4->addr; - if (MAC_IS_ZERO(mac)) { - int rc = nl_link_get_mac(nl_sock, ifi, mac); - if (rc < 0) { - char ifname[IFNAMSIZ]; - err("Couldn't discover MAC for %s: %s", - if_indextoname(ifi, ifname), strerror(-rc)); - return 0; - } - } + ip4->our_tap_addr = ip4->guest_gw; - if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr) || - MAC_IS_ZERO(mac)) + if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr)) return 0; return ifi; } /** + * conf_ip4_local() - Configure IPv4 addresses and attributes for local mode + * @ip4: IPv4 context (will be written) + */ +static void conf_ip4_local(struct ip4_ctx *ip4) +{ + ip4->addr_seen = ip4->addr = IP4_LL_GUEST_ADDR; + ip4->our_tap_addr = ip4->guest_gw = IP4_LL_GUEST_GW; + ip4->prefix_len = IP4_LL_PREFIX_LEN; + + ip4->no_copy_addrs = ip4->no_copy_routes 
= true; +} + +/** * conf_ip6() - Verify or detect IPv6 support, get relevant addresses * @ifi: Host interface to attempt (0 to determine one) * @ip6: IPv6 context (will be written) - * @mac: MAC address to use (written if unset) * * Return: Interface index for IPv6, or 0 on failure. */ -static unsigned int conf_ip6(unsigned int ifi, - struct ip6_ctx *ip6, unsigned char *mac) +static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6) { int prefix_len = 0; int rc; @@ -656,249 +766,311 @@ static unsigned int conf_ip6(unsigned int ifi, ifi = nl_get_ext_if(nl_sock, AF_INET6); if (!ifi) { - info("Couldn't pick external interface: disabling IPv6"); + debug("Failed to detect external interface for IPv6"); return 0; } - if (IN6_IS_ADDR_UNSPECIFIED(&ip6->gw)) { - rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->gw); + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->guest_gw)) { + rc = nl_route_get_def(nl_sock, ifi, AF_INET6, &ip6->guest_gw); if (rc < 0) { - err("Couldn't discover IPv6 gateway address: %s", - strerror(-rc)); + debug("Couldn't discover IPv6 gateway address: %s", + strerror_(-rc)); return 0; } } rc = nl_addr_get(nl_sock, ifi, AF_INET6, IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ? 
&ip6->addr : NULL, - &prefix_len, &ip6->addr_ll); + &prefix_len, &ip6->our_tap_ll); if (rc < 0) { - err("Couldn't discover IPv6 address: %s", strerror(-rc)); + debug("Couldn't discover IPv6 address: %s", strerror_(-rc)); return 0; } - memcpy(&ip6->addr_seen, &ip6->addr, sizeof(ip6->addr)); - memcpy(&ip6->addr_ll_seen, &ip6->addr_ll, sizeof(ip6->addr_ll)); + ip6->addr_seen = ip6->addr; - if (MAC_IS_ZERO(mac)) { - rc = nl_link_get_mac(nl_sock, ifi, mac); - if (rc < 0) { - char ifname[IFNAMSIZ]; - err("Couldn't discover MAC for %s: %s", - if_indextoname(ifi, ifname), strerror(-rc)); - return 0; - } - } + if (IN6_IS_ADDR_LINKLOCAL(&ip6->guest_gw)) + ip6->our_tap_ll = ip6->guest_gw; if (IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) || - IN6_IS_ADDR_UNSPECIFIED(&ip6->addr_ll) || - MAC_IS_ZERO(mac)) + IN6_IS_ADDR_UNSPECIFIED(&ip6->our_tap_ll)) return 0; return ifi; } /** - * print_usage() - Print usage, exit with given status code + * conf_ip6_local() - Configure IPv6 addresses and attributes for local mode + * @ip6: IPv6 context (will be written) + */ +static void conf_ip6_local(struct ip6_ctx *ip6) +{ + ip6->our_tap_ll = ip6->guest_gw = IP6_LL_GUEST_GW; + + ip6->no_copy_addrs = ip6->no_copy_routes = true; +} + +/** + * usage() - Print usage, exit with given status code * @name: Executable name - * @status: Status code for exit() + * @f: Stream to print usage info to + * @status: Status code for _exit() */ -static void print_usage(const char *name, int status) +static void usage(const char *name, FILE *f, int status) { if (strstr(name, "pasta")) { - info("Usage: %s [OPTION]... [COMMAND] [ARGS]...", name); - info(" %s [OPTION]... PID", name); - info(" %s [OPTION]... --netns [PATH|NAME]", name); - info(""); - info("Without PID or --netns, run the given command or a"); - info("default shell in a new network and user namespace, and"); - info("connect it via pasta."); + FPRINTF(f, "Usage: %s [OPTION]... [COMMAND] [ARGS]...\n", name); + FPRINTF(f, " %s [OPTION]... 
PID\n", name); + FPRINTF(f, " %s [OPTION]... --netns [PATH|NAME]\n", name); + FPRINTF(f, + "\n" + "Without PID or --netns, run the given command or a\n" + "default shell in a new network and user namespace, and\n" + "connect it via pasta.\n"); } else { - info("Usage: %s [OPTION]...", name); + FPRINTF(f, "Usage: %s [OPTION]...\n", name); } - info(""); - - - info( " -d, --debug Be verbose"); - info( " --trace Be extra verbose, implies --debug"); - info( " -q, --quiet Don't print informational messages"); - info( " -f, --foreground Don't run in background"); - info( " default: run in background if started from a TTY"); - info( " -e, --stderr Log to stderr too"); - info( " default: log to system logger only if started from a TTY"); - info( " -l, --log-file PATH Log (only) to given file"); - info( " --log-size BYTES Maximum size of log file"); - info( " default: 1 MiB"); - info( " --runas UID|UID:GID Run as given UID, GID, which can be"); - info( " numeric, or login and group names"); - info( " default: drop to user \"nobody\""); - info( " -h, --help Display this help message and exit"); - info( " --version Show version and exit"); + + FPRINTF(f, + "\n" + " -d, --debug Be verbose\n" + " --trace Be extra verbose, implies --debug\n" + " -q, --quiet Don't print informational messages\n" + " -f, --foreground Don't run in background\n" + " default: run in background\n" + " -l, --log-file PATH Log (only) to given file\n" + " --log-size BYTES Maximum size of log file\n" + " default: 1 MiB\n" + " --runas UID|UID:GID Run as given UID, GID, which can be\n" + " numeric, or login and group names\n" + " default: drop to user \"nobody\"\n" + " -h, --help Display this help message and exit\n" + " --version Show version and exit\n"); if (strstr(name, "pasta")) { - info( " -I, --ns-ifname NAME namespace interface name"); - info( " default: same interface name as external one"); + FPRINTF(f, + " -I, --ns-ifname NAME namespace interface name\n" + " default: same interface name as external 
one\n"); } else { - info( " -s, --socket PATH UNIX domain socket path"); - info( " default: probe free path starting from " - UNIX_SOCK_PATH, 1); + FPRINTF(f, + " -s, --socket, --socket-path PATH UNIX domain socket path\n" + " default: probe free path starting from " + UNIX_SOCK_PATH "\n", 1); + FPRINTF(f, + " --vhost-user Enable vhost-user mode\n" + " UNIX domain socket is provided by -s option\n" + " --print-capabilities print back-end capabilities in JSON format,\n" + " only meaningful for vhost-user mode\n"); + FPRINTF(f, + " --repair-path PATH path for passt-repair(1)\n" + " default: append '.repair' to UNIX domain path\n"); } - info( " -F, --fd FD Use FD as pre-opened connected socket"); - info( " -p, --pcap FILE Log tap-facing traffic to pcap file"); - info( " -P, --pid FILE Write own PID to the given file"); - info( " -m, --mtu MTU Assign MTU via DHCP/NDP"); - info( " a zero value disables assignment"); - info( " default: 65520: maximum 802.3 MTU minus 802.3 header"); - info( " length, rounded to 32 bits (IPv4 words)"); - info( " -a, --address ADDR Assign IPv4 or IPv6 address ADDR"); - info( " can be specified zero to two times (for IPv4 and IPv6)"); - info( " default: use addresses from interface with default route"); - info( " -n, --netmask MASK Assign IPv4 MASK, dot-decimal or bits"); - info( " default: netmask from matching address on the host"); - info( " -M, --mac-addr ADDR Use source MAC address ADDR"); - info( " default: MAC address from interface with default route"); - info( " -g, --gateway ADDR Pass IPv4 or IPv6 address as gateway"); - info( " default: gateway from interface with default route"); - info( " -i, --interface NAME Interface for addresses and routes"); - info( " default: from --outbound-if4 and --outbound-if6, if any"); - info( " otherwise interface with first default route"); - info( " -o, --outbound ADDR Bind to address as outbound source"); - info( " can be specified zero to two times (for IPv4 and IPv6)"); - info( " default: use 
source address from routing tables"); - info( " --outbound-if4 NAME Bind to outbound interface for IPv4"); - info( " default: use interface from default route"); - info( " --outbound-if6 NAME Bind to outbound interface for IPv6"); - info( " default: use interface from default route"); - info( " -D, --dns ADDR Use IPv4 or IPv6 address as DNS"); - info( " can be specified multiple times"); - info( " a single, empty option disables DNS information"); + FPRINTF(f, + " -F, --fd FD Use FD as pre-opened connected socket\n" + " -p, --pcap FILE Log tap-facing traffic to pcap file\n" + " -P, --pid FILE Write own PID to the given file\n" + " -m, --mtu MTU Assign MTU via DHCP/NDP\n" + " a zero value disables assignment\n" + " default: 65520: maximum 802.3 MTU minus 802.3 header\n" + " length, rounded to 32 bits (IPv4 words)\n" + " -a, --address ADDR Assign IPv4 or IPv6 address ADDR\n" + " can be specified zero to two times (for IPv4 and IPv6)\n" + " default: use addresses from interface with default route\n" + " -n, --netmask MASK Assign IPv4 MASK, dot-decimal or bits\n" + " default: netmask from matching address on the host\n" + " -M, --mac-addr ADDR Use source MAC address ADDR\n" + " default: 9a:55:9a:55:9a:55 (locally administered)\n" + " -g, --gateway ADDR Pass IPv4 or IPv6 address as gateway\n" + " default: gateway from interface with default route\n" + " -i, --interface NAME Interface for addresses and routes\n" + " default: from --outbound-if4 and --outbound-if6, if any\n" + " otherwise interface with first default route\n" + " -o, --outbound ADDR Bind to address as outbound source\n" + " can be specified zero to two times (for IPv4 and IPv6)\n" + " default: use source address from routing tables\n" + " --outbound-if4 NAME Bind to outbound interface for IPv4\n" + " default: use interface from default route\n" + " --outbound-if6 NAME Bind to outbound interface for IPv6\n" + " default: use interface from default route\n" + " -D, --dns ADDR Use IPv4 or IPv6 address as 
DNS\n" + " can be specified multiple times\n" + " a single, empty option disables DNS information\n"); if (strstr(name, "pasta")) - info( " default: don't use any addresses"); + FPRINTF(f, " default: don't use any addresses\n"); else - info( " default: use addresses from /etc/resolv.conf"); - - info( " -S, --search LIST Space-separated list, search domains"); - info( " a single, empty option disables the DNS search list"); + FPRINTF(f, " default: use addresses from /etc/resolv.conf\n"); + FPRINTF(f, + " -S, --search LIST Space-separated list, search domains\n" + " a single, empty option disables the DNS search list\n" + " -H, --hostname NAME Hostname to configure client with\n" + " --fqdn NAME FQDN to configure client with\n"); if (strstr(name, "pasta")) - info( " default: don't use any search list"); + FPRINTF(f, " default: don't use any search list\n"); else - info( " default: use search list from /etc/resolv.conf"); + FPRINTF(f, " default: use search list from /etc/resolv.conf\n"); if (strstr(name, "pasta")) - info(" --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP"); + FPRINTF(f, " --dhcp-dns \tPass DNS list via DHCP/DHCPv6/NDP\n"); else - info(" --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP"); + FPRINTF(f, " --no-dhcp-dns No DNS list in DHCP/DHCPv6/NDP\n"); if (strstr(name, "pasta")) - info(" --dhcp-search Pass list via DHCP/DHCPv6/NDP"); + FPRINTF(f, " --dhcp-search Pass list via DHCP/DHCPv6/NDP\n"); else - info(" --no-dhcp-search No list in DHCP/DHCPv6/NDP"); - - info( " --dns-forward ADDR Forward DNS queries sent to ADDR"); - info( " can be specified zero to two times (for IPv4 and IPv6)"); - info( " default: don't forward DNS queries"); - - info( " --no-tcp Disable TCP protocol handler"); - info( " --no-udp Disable UDP protocol handler"); - info( " --no-icmp Disable ICMP/ICMPv6 protocol handler"); - info( " --no-dhcp Disable DHCP server"); - info( " --no-ndp Disable NDP responses"); - info( " --no-dhcpv6 Disable DHCPv6 server"); - info( " --no-ra Disable 
router advertisements"); - info( " --no-map-gw Don't map gateway address to host"); - info( " -4, --ipv4-only Enable IPv4 operation only"); - info( " -6, --ipv6-only Enable IPv6 operation only"); + FPRINTF(f, " --no-dhcp-search No list in DHCP/DHCPv6/NDP\n"); + + FPRINTF(f, + " --map-host-loopback ADDR Translate ADDR to refer to host\n" + " can be specified zero to two times (for IPv4 and IPv6)\n" + " default: gateway address\n" + " --map-guest-addr ADDR Translate ADDR to guest's address\n" + " can be specified zero to two times (for IPv4 and IPv6)\n" + " default: none\n" + " --dns-forward ADDR Forward DNS queries sent to ADDR\n" + " can be specified zero to two times (for IPv4 and IPv6)\n" + " default: don't forward DNS queries\n" + " --dns-host ADDR Host nameserver to direct queries to\n" + " can be specified zero to two times (for IPv4 and IPv6)\n" + " default: first nameserver from host's /etc/resolv.conf\n" + " --no-tcp Disable TCP protocol handler\n" + " --no-udp Disable UDP protocol handler\n" + " --no-icmp Disable ICMP/ICMPv6 protocol handler\n" + " --no-dhcp Disable DHCP server\n" + " --no-ndp Disable NDP responses\n" + " --no-dhcpv6 Disable DHCPv6 server\n" + " --no-ra Disable router advertisements\n" + " --freebind Bind to any address for forwarding\n" + " --no-map-gw Don't map gateway address to host\n" + " -4, --ipv4-only Enable IPv4 operation only\n" + " -6, --ipv6-only Enable IPv6 operation only\n"); if (strstr(name, "pasta")) goto pasta_opts; - info( " -1, --one-off Quit after handling one single client"); - info( " -t, --tcp-ports SPEC TCP port forwarding to guest"); - info( " can be specified multiple times"); - info( " SPEC can be:"); - info( " 'none': don't forward any ports"); - info( " 'all': forward all unbound, non-ephemeral ports"); - info( " a comma-separated list, optionally ranged with '-'"); - info( " and optional target ports after ':', with optional"); - info( " address specification suffixed by '/' and optional"); - info( " interface 
prefixed by '%%'. Ranges can be reduced by"); - info( " excluding ports or ranges prefixed by '~'"); - info( " Examples:"); - info( " -t 22 Forward local port 22 to 22 on guest"); - info( " -t 22:23 Forward local port 22 to 23 on guest"); - info( " -t 22,25 Forward ports 22, 25 to ports 22, 25"); - info( " -t 22-80 Forward ports 22 to 80"); - info( " -t 22-80:32-90 Forward ports 22 to 80 to"); - info( " corresponding port numbers plus 10"); - info( " -t 192.0.2.1/5 Bind port 5 of 192.0.2.1 to guest"); - info( " -t 5-25,~10-20 Forward ports 5 to 9, and 21 to 25"); - info( " -t ~25 Forward all ports except for 25"); - info( " default: none"); - info( " -u, --udp-ports SPEC UDP port forwarding to guest"); - info( " SPEC is as described for TCP above"); - info( " default: none"); - - exit(status); + FPRINTF(f, + " -1, --one-off Quit after handling one single client\n" + " -t, --tcp-ports SPEC TCP port forwarding to guest\n" + " can be specified multiple times\n" + " SPEC can be:\n" + " 'none': don't forward any ports\n" + " 'all': forward all unbound, non-ephemeral ports\n" + " a comma-separated list, optionally ranged with '-'\n" + " and optional target ports after ':', with optional\n" + " address specification suffixed by '/' and optional\n" + " interface prefixed by '%%'. 
Ranges can be reduced by\n" + " excluding ports or ranges prefixed by '~'\n" + " Examples:\n" + " -t 22 Forward local port 22 to 22 on guest\n" + " -t 22:23 Forward local port 22 to 23 on guest\n" + " -t 22,25 Forward ports 22, 25 to ports 22, 25\n" + " -t 22-80 Forward ports 22 to 80\n" + " -t 22-80:32-90 Forward ports 22 to 80 to\n" + " corresponding port numbers plus 10\n" + " -t 192.0.2.1/5 Bind port 5 of 192.0.2.1 to guest\n" + " -t 5-25,~10-20 Forward ports 5 to 9, and 21 to 25\n" + " -t ~25 Forward all ports except for 25\n" + " default: none\n" + " -u, --udp-ports SPEC UDP port forwarding to guest\n" + " SPEC is as described for TCP above\n" + " default: none\n"); + + (void)fflush(f); + _exit(status); pasta_opts: - info( " -t, --tcp-ports SPEC TCP port forwarding to namespace"); - info( " can be specified multiple times"); - info( " SPEC can be:"); - info( " 'none': don't forward any ports"); - info( " 'auto': forward all ports currently bound in namespace"); - info( " a comma-separated list, optionally ranged with '-'"); - info( " and optional target ports after ':', with optional"); - info( " address specification suffixed by '/' and optional"); - info( " interface prefixed by '%%'. 
Examples:"); - info( " -t 22 Forward local port 22 to port 22 in netns"); - info( " -t 22:23 Forward local port 22 to port 23"); - info( " -t 22,25 Forward ports 22, 25 to ports 22, 25"); - info( " -t 22-80 Forward ports 22 to 80"); - info( " -t 22-80:32-90 Forward ports 22 to 80 to"); - info( " corresponding port numbers plus 10"); - info( " -t 192.0.2.1/5 Bind port 5 of 192.0.2.1 to namespace"); - info( " -t 5-25,~10-20 Forward ports 5 to 9, and 21 to 25"); - info( " -t ~25 Forward all bound ports except for 25"); - info( " default: auto"); - info( " IPv6 bound ports are also forwarded for IPv4"); - info( " -u, --udp-ports SPEC UDP port forwarding to namespace"); - info( " SPEC is as described for TCP above"); - info( " default: auto"); - info( " IPv6 bound ports are also forwarded for IPv4"); - info( " unless specified, with '-t auto', UDP ports with numbers"); - info( " corresponding to forwarded TCP port numbers are"); - info( " forwarded too"); - info( " -T, --tcp-ns SPEC TCP port forwarding to init namespace"); - info( " SPEC is as described above"); - info( " default: auto"); - info( " -U, --udp-ns SPEC UDP port forwarding to init namespace"); - info( " SPEC is as described above"); - info( " default: auto"); - info( " --userns NSPATH Target user namespace to join"); - info( " --netns PATH|NAME Target network namespace to join"); - info( " --netns-only Don't join existing user namespace"); - info( " implied if PATH or NAME are given without --userns"); - info( " --no-netns-quit Don't quit if filesystem-bound target"); - info( " network namespace is deleted"); - info( " --config-net Configure tap interface in namespace"); - info( " --no-copy-routes DEPRECATED:"); - info( " Don't copy all routes to namespace"); - info( " --no-copy-addrs DEPRECATED:"); - info( " Don't copy all addresses to namespace"); - info( " --ns-mac-addr ADDR Set MAC address on tap interface"); - - exit(status); + FPRINTF(f, + " -t, --tcp-ports SPEC TCP port forwarding to namespace\n" + " 
can be specified multiple times\n" + " SPEC can be:\n" + " 'none': don't forward any ports\n" + " 'auto': forward all ports currently bound in namespace\n" + " a comma-separated list, optionally ranged with '-'\n" + " and optional target ports after ':', with optional\n" + " address specification suffixed by '/' and optional\n" + " interface prefixed by '%%'. Examples:\n" + " -t 22 Forward local port 22 to port 22 in netns\n" + " -t 22:23 Forward local port 22 to port 23\n" + " -t 22,25 Forward ports 22, 25 to ports 22, 25\n" + " -t 22-80 Forward ports 22 to 80\n" + " -t 22-80:32-90 Forward ports 22 to 80 to\n" + " corresponding port numbers plus 10\n" + " -t 192.0.2.1/5 Bind port 5 of 192.0.2.1 to namespace\n" + " -t 5-25,~10-20 Forward ports 5 to 9, and 21 to 25\n" + " -t ~25 Forward all bound ports except for 25\n" + " default: auto\n" + " IPv6 bound ports are also forwarded for IPv4\n" + " -u, --udp-ports SPEC UDP port forwarding to namespace\n" + " SPEC is as described for TCP above\n" + " default: auto\n" + " IPv6 bound ports are also forwarded for IPv4\n" + " unless specified, with '-t auto', UDP ports with numbers\n" + " corresponding to forwarded TCP port numbers are\n" + " forwarded too\n" + " -T, --tcp-ns SPEC TCP port forwarding to init namespace\n" + " SPEC is as described above\n" + " default: auto\n" + " -U, --udp-ns SPEC UDP port forwarding to init namespace\n" + " SPEC is as described above\n" + " default: auto\n" + " --host-lo-to-ns-lo Translate host-loopback forwards to\n" + " namespace loopback\n" + " --userns NSPATH Target user namespace to join\n" + " --netns PATH|NAME Target network namespace to join\n" + " --netns-only Don't join existing user namespace\n" + " implied if PATH or NAME are given without --userns\n" + " --no-netns-quit Don't quit if filesystem-bound target\n" + " network namespace is deleted\n" + " --config-net Configure tap interface in namespace\n" + " --no-copy-routes DEPRECATED:\n" + " Don't copy all routes to namespace\n" 
+ " --no-copy-addrs DEPRECATED:\n" + " Don't copy all addresses to namespace\n" + " --ns-mac-addr ADDR Set MAC address on tap interface\n" + " --no-splice Disable inbound socket splicing\n"); + + (void)fflush(f); + _exit(status); } /** - * usage() - Print usage and exit with failure - * @name: Executable name + * conf_mode() - Determine passt/pasta's operating mode from command line + * @argc: Argument count + * @argv: Command line arguments + * + * Return: mode to operate in, PASTA or PASST */ -static void usage(const char *name) +enum passt_modes conf_mode(int argc, char *argv[]) { - print_usage(name, EXIT_FAILURE); + int vhost_user = 0; + const struct option optvu[] = { + {"vhost-user", no_argument, &vhost_user, 1 }, + { 0 }, + }; + char argv0[PATH_MAX], *basearg0; + int name; + + optind = 0; + do { + name = getopt_long(argc, argv, "-:", optvu, NULL); + } while (name != -1); + + if (vhost_user) + return MODE_VU; + + if (argc < 1) + die("Cannot determine argv[0]"); + + strncpy(argv0, argv[0], PATH_MAX - 1); + basearg0 = basename(argv0); + if (strstr(basearg0, "pasta")) + return MODE_PASTA; + + if (strstr(basearg0, "passt")) + return MODE_PASST; + + die("Cannot determine mode, invoke as \"passt\" or \"pasta\""); } /** @@ -907,15 +1079,18 @@ static void usage(const char *name) */ static void conf_print(const struct ctx *c) { - char buf4[INET_ADDRSTRLEN], buf6[INET6_ADDRSTRLEN], ifn[IFNAMSIZ]; + char buf4[INET_ADDRSTRLEN], buf6[INET6_ADDRSTRLEN]; + char bufmac[ETH_ADDRSTRLEN], ifn[IFNAMSIZ]; int i; - info("Template interface: %s%s%s%s%s", - c->ifi4 ? if_indextoname(c->ifi4, ifn) : "", - c->ifi4 ? " (IPv4)" : "", - (c->ifi4 && c->ifi6) ? ", " : "", - c->ifi6 ? if_indextoname(c->ifi6, ifn) : "", - c->ifi6 ? " (IPv6)" : ""); + if (c->ifi4 > 0 || c->ifi6 > 0) { + info("Template interface: %s%s%s%s%s", + c->ifi4 > 0 ? if_indextoname(c->ifi4, ifn) : "", + c->ifi4 > 0 ? " (IPv4)" : "", + (c->ifi4 && c->ifi6) ? ", " : "", + c->ifi6 > 0 ? 
if_indextoname(c->ifi6, ifn) : "", + c->ifi6 > 0 ? " (IPv6)" : ""); + } if (*c->ip4.ifname_out || *c->ip6.ifname_out) { info("Outbound interface: %s%s%s%s%s", @@ -941,11 +1116,14 @@ static void conf_print(const struct ctx *c) info("Namespace interface: %s", c->pasta_ifn); info("MAC:"); - info(" host: %02x:%02x:%02x:%02x:%02x:%02x", - c->mac[0], c->mac[1], c->mac[2], - c->mac[3], c->mac[4], c->mac[5]); + info(" host: %s", eth_ntop(c->our_tap_mac, bufmac, sizeof(bufmac))); if (c->ifi4) { + if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) + info(" NAT to host 127.0.0.1: %s", + inet_ntop(AF_INET, &c->ip4.map_host_loopback, + buf4, sizeof(buf4))); + if (!c->no_dhcp) { uint32_t mask; @@ -957,7 +1135,8 @@ static void conf_print(const struct ctx *c) info(" mask: %s", inet_ntop(AF_INET, &mask, buf4, sizeof(buf4))); info(" router: %s", - inet_ntop(AF_INET, &c->ip4.gw, buf4, sizeof(buf4))); + inet_ntop(AF_INET, &c->ip4.guest_gw, + buf4, sizeof(buf4))); } for (i = 0; !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns[i]); i++) { @@ -975,11 +1154,16 @@ static void conf_print(const struct ctx *c) } if (c->ifi6) { + if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) + info(" NAT to host ::1: %s", + inet_ntop(AF_INET6, &c->ip6.map_host_loopback, + buf6, sizeof(buf6))); + if (!c->no_ndp && !c->no_dhcpv6) info("NDP/DHCPv6:"); - else if (!c->no_ndp) - info("DHCPv6:"); else if (!c->no_dhcpv6) + info("DHCPv6:"); + else if (!c->no_ndp) info("NDP:"); else goto dns6; @@ -987,9 +1171,10 @@ static void conf_print(const struct ctx *c) info(" assign: %s", inet_ntop(AF_INET6, &c->ip6.addr, buf6, sizeof(buf6))); info(" router: %s", - inet_ntop(AF_INET6, &c->ip6.gw, buf6, sizeof(buf6))); + inet_ntop(AF_INET6, &c->ip6.guest_gw, buf6, sizeof(buf6))); info(" our link-local: %s", - inet_ntop(AF_INET6, &c->ip6.addr_ll, buf6, sizeof(buf6))); + inet_ntop(AF_INET6, &c->ip6.our_tap_ll, + buf6, sizeof(buf6))); dns6: for (i = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[i]); i++) { @@ -1089,16 +1274,14 @@ 
static void conf_ugid(char *runas, uid_t *uid, gid_t *gid) return; /* ...otherwise use nobody:nobody */ - warn("Don't run as root. Changing to nobody..."); + warn("Started as root, will change to nobody."); { #ifndef GLIBC_NO_STATIC_NSS const struct passwd *pw; /* cppcheck-suppress getpwnamCalled */ pw = getpwnam("nobody"); - if (!pw) { - perror("getpwnam"); - exit(EXIT_FAILURE); - } + if (!pw) + die_perror("Can't get password file entry for nobody"); *uid = pw->pw_uid; *gid = pw->pw_gid; @@ -1110,6 +1293,106 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid) } /** + * conf_nat() - Parse --map-host-loopback or --map-guest-addr option + * @arg: String argument to option + * @addr4: IPv4 to update with parsed address + * @addr6: IPv6 to update with parsed address + * @no_map_gw: --no-map-gw flag, or NULL, updated for "none" argument + */ +static void conf_nat(const char *arg, struct in_addr *addr4, + struct in6_addr *addr6, int *no_map_gw) +{ + if (strcmp(arg, "none") == 0) { + *addr4 = in4addr_any; + *addr6 = in6addr_any; + if (no_map_gw) + *no_map_gw = 1; + + return; + } + + if (inet_pton(AF_INET6, arg, addr6) && + !IN6_IS_ADDR_UNSPECIFIED(addr6) && + !IN6_IS_ADDR_LOOPBACK(addr6) && + !IN6_IS_ADDR_MULTICAST(addr6)) + return; + + if (inet_pton(AF_INET, arg, addr4) && + !IN4_IS_ADDR_UNSPECIFIED(addr4) && + !IN4_IS_ADDR_LOOPBACK(addr4) && + !IN4_IS_ADDR_MULTICAST(addr4)) + return; + + die("Invalid address to remap to host: %s", optarg); +} + +/** + * conf_open_files() - Open files as requested by configuration + * @c: Execution context + */ +static void conf_open_files(struct ctx *c) +{ + if (c->mode != MODE_PASTA && c->fd_tap == -1) { + c->fd_tap_listen = sock_unix(c->sock_path); + + if (c->mode == MODE_VU && strcmp(c->repair_path, "none")) { + if (!*c->repair_path && + snprintf_check(c->repair_path, + sizeof(c->repair_path), "%s.repair", + c->sock_path)) { + warn("passt-repair path %s not usable", + c->repair_path); + c->fd_repair_listen = -1; + } else { 
+ c->fd_repair_listen = sock_unix(c->repair_path); + } + } else { + c->fd_repair_listen = -1; + } + c->fd_repair = -1; + } + + if (*c->pidfile) { + c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY); + if (c->pidfile_fd < 0) + die_perror("Couldn't open PID file %s", c->pidfile); + } +} + +/** + * parse_mac() - Parse a MAC address from a string + * @mac: Binary MAC address, initialised on success + * @str: String to parse + * + * Parses @str as an Ethernet MAC address stored in @mac on success. Exits on + * failure. + */ +static void parse_mac(unsigned char mac[ETH_ALEN], const char *str) +{ + size_t i; + + if (strlen(str) != (ETH_ALEN * 3 - 1)) + goto fail; + + for (i = 0; i < ETH_ALEN; i++) { + const char *octet = str + 3 * i; + unsigned long b; + char *end; + + errno = 0; + b = strtoul(octet, &end, 16); + if (b > UCHAR_MAX || errno || end != octet + 2 || + *end != ((i == ETH_ALEN - 1) ? '\0' : ':')) + goto fail; + mac[i] = b; + } + return; + +fail: + die("Invalid MAC address: %s", str); +} + +/** * conf() - Process command-line arguments and set configuration * @c: Execution context * @argc: Argument count @@ -1117,7 +1400,7 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid) */ void conf(struct ctx *c, int argc, char **argv) { - int netns_only = 0; + int netns_only = 0, no_map_gw = 0; const struct option options[] = { {"debug", no_argument, NULL, 'd' }, {"quiet", no_argument, NULL, 'q' }, @@ -1139,6 +1422,7 @@ void conf(struct ctx *c, int argc, char **argv) {"outbound", required_argument, NULL, 'o' }, {"dns", required_argument, NULL, 'D' }, {"search", required_argument, NULL, 'S' }, + {"hostname", required_argument, NULL, 'H' }, {"no-tcp", no_argument, &c->no_tcp, 1 }, {"no-udp", no_argument, &c->no_udp, 1 }, {"no-icmp", no_argument, &c->no_icmp, 1 }, @@ -1146,7 +1430,9 @@ void conf(struct ctx *c, int argc, char **argv) {"no-dhcpv6", no_argument, &c->no_dhcpv6, 1 }, {"no-ndp", no_argument, &c->no_ndp, 1 }, {"no-ra", no_argument, &c->no_ra, 1 }, - 
{"no-map-gw", no_argument, &c->no_map_gw, 1 }, + {"no-splice", no_argument, &c->no_splice, 1 }, + {"freebind", no_argument, &c->freebind, 1 }, + {"no-map-gw", no_argument, &no_map_gw, 1 }, {"ipv4-only", no_argument, NULL, '4' }, {"ipv6-only", no_argument, NULL, '6' }, {"one-off", no_argument, NULL, '1' }, @@ -1156,7 +1442,6 @@ void conf(struct ctx *c, int argc, char **argv) {"udp-ns", required_argument, NULL, 'U' }, {"userns", required_argument, NULL, 2 }, {"netns", required_argument, NULL, 3 }, - {"netns-only", no_argument, &netns_only, 1 }, {"ns-mac-addr", required_argument, NULL, 4 }, {"dhcp-dns", no_argument, NULL, 5 }, {"no-dhcp-dns", no_argument, NULL, 6 }, @@ -1173,35 +1458,51 @@ void conf(struct ctx *c, int argc, char **argv) {"config-net", no_argument, NULL, 17 }, {"no-copy-routes", no_argument, NULL, 18 }, {"no-copy-addrs", no_argument, NULL, 19 }, + {"netns-only", no_argument, NULL, 20 }, + {"map-host-loopback", required_argument, NULL, 21 }, + {"map-guest-addr", required_argument, NULL, 22 }, + {"host-lo-to-ns-lo", no_argument, NULL, 23 }, + {"dns-host", required_argument, NULL, 24 }, + {"vhost-user", no_argument, NULL, 25 }, + + /* vhost-user backend program convention */ + {"print-capabilities", no_argument, NULL, 26 }, + {"socket-path", required_argument, NULL, 's' }, + {"fqdn", required_argument, NULL, 27 }, + {"repair-path", required_argument, NULL, 28 }, { 0 }, }; + const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:"; + const char *logname = (c->mode == MODE_PASTA) ? 
"pasta" : "passt"; char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; bool copy_addrs_opt = false, copy_routes_opt = false; - struct in6_addr *dns6 = c->ip6.dns, dns6_tmp; - struct in_addr *dns4 = c->ip4.dns, dns4_tmp; enum fwd_ports_mode fwd_default = FWD_NONE; bool v4_only = false, v6_only = false; + unsigned dns4_idx = 0, dns6_idx = 0; + unsigned long max_mtu = IP_MAX_MTU; struct fqdn *dnss = c->dns_search; unsigned int ifi4 = 0, ifi6 = 0; const char *logfile = NULL; - const char *optstring; - int name, ret, b, i; size_t logsize = 0; char *runas = NULL; + long fd_tap_opt; + int name, ret; uid_t uid; gid_t gid; if (c->mode == MODE_PASTA) { c->no_dhcp_dns = c->no_dhcp_dns_search = 1; fwd_default = FWD_AUTO; - optstring = "dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:46t:u:T:U:"; - } else { - optstring = "dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:461t:u:"; } - c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = 0; - c->udp.fwd_in.f.mode = c->udp.fwd_out.f.mode = 0; + if (tap_l2_max_len(c) - ETH_HLEN < max_mtu) + max_mtu = tap_l2_max_len(c) - ETH_HLEN; + c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t)); + c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET; + c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET; + memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN); + optind = 0; do { name = getopt_long(argc, argv, optstring, options, NULL); @@ -1217,6 +1518,8 @@ void conf(struct ctx *c, int argc, char **argv) if (ret <= 0 || ret >= (int)sizeof(userns)) die("Invalid userns: %s", optarg); + netns_only = 0; + break; case 3: if (c->mode != MODE_PASTA) @@ -1228,14 +1531,7 @@ void conf(struct ctx *c, int argc, char **argv) if (c->mode != MODE_PASTA) die("--ns-mac-addr is for pasta mode only"); - for (i = 0; i < ETH_ALEN; i++) { - errno = 0; - b = strtol(optarg + (intptr_t)i * 3, NULL, 16); - if (b < 0 || b > UCHAR_MAX || errno) - die("Invalid MAC address: %s", optarg); - - c->mac_guest[i] = b; - } + parse_mac(c->guest_mac, optarg); break; case 5: if (c->mode != MODE_PASTA) @@ -1244,7 +1540,7 @@ void 
conf(struct ctx *c, int argc, char **argv) c->no_dhcp_dns = 0; break; case 6: - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("--no-dhcp-dns is for passt mode only"); c->no_dhcp_dns = 1; @@ -1256,20 +1552,18 @@ void conf(struct ctx *c, int argc, char **argv) c->no_dhcp_dns_search = 0; break; case 8: - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("--no-dhcp-search is for passt mode only"); c->no_dhcp_dns_search = 1; break; case 9: - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match) && - inet_pton(AF_INET6, optarg, &c->ip6.dns_match) && + if (inet_pton(AF_INET6, optarg, &c->ip6.dns_match) && !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match) && !IN6_IS_ADDR_LOOPBACK(&c->ip6.dns_match)) break; - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) && - inet_pton(AF_INET, optarg, &c->ip4.dns_match) && + if (inet_pton(AF_INET, optarg, &c->ip4.dns_match) && !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) && !IN4_IS_ADDR_BROADCAST(&c->ip4.dns_match) && !IN4_IS_ADDR_LOOPBACK(&c->ip4.dns_match)) @@ -1284,24 +1578,13 @@ void conf(struct ctx *c, int argc, char **argv) c->no_netns_quit = 1; break; case 11: - if (c->trace) - die("Multiple --trace options given"); - - if (c->quiet) - die("Either --trace or --quiet"); - c->trace = c->debug = 1; + c->quiet = 0; break; case 12: - if (runas) - die("Multiple --runas options given"); - runas = optarg; break; case 13: - if (logsize) - die("Multiple --log-size options given"); - errno = 0; logsize = strtol(optarg, NULL, 0); @@ -1310,14 +1593,12 @@ void conf(struct ctx *c, int argc, char **argv) break; case 14: - fprintf(stdout, - c->mode == MODE_PASST ? "passt " : "pasta "); - fprintf(stdout, VERSION_BLOB); - exit(EXIT_SUCCESS); + FPRINTF(stdout, + c->mode == MODE_PASTA ? 
"pasta " : "passt "); + FPRINTF(stdout, VERSION_BLOB); + (void)fflush(stdout); + _exit(EXIT_SUCCESS); case 15: - if (*c->ip4.ifname_out) - die("Redundant outbound interface: %s", optarg); - ret = snprintf(c->ip4.ifname_out, sizeof(c->ip4.ifname_out), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->ip4.ifname_out)) @@ -1325,9 +1606,6 @@ void conf(struct ctx *c, int argc, char **argv) break; case 16: - if (*c->ip6.ifname_out) - die("Redundant outbound interface: %s", optarg); - ret = snprintf(c->ip6.ifname_out, sizeof(c->ip6.ifname_out), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->ip6.ifname_out)) @@ -1345,83 +1623,114 @@ void conf(struct ctx *c, int argc, char **argv) die("--no-copy-routes is for pasta mode only"); warn("--no-copy-routes will be dropped soon"); - c->no_copy_routes = copy_routes_opt = true; + c->ip4.no_copy_routes = c->ip6.no_copy_routes = true; + copy_routes_opt = true; break; case 19: if (c->mode != MODE_PASTA) die("--no-copy-addrs is for pasta mode only"); warn("--no-copy-addrs will be dropped soon"); - c->no_copy_addrs = copy_addrs_opt = true; + c->ip4.no_copy_addrs = c->ip6.no_copy_addrs = true; + copy_addrs_opt = true; break; - case 'd': - if (c->debug) - die("Multiple --debug options given"); - - if (c->quiet) - die("Either --debug or --quiet"); + case 20: + if (c->mode != MODE_PASTA) + die("--netns-only is for pasta mode only"); - c->debug = 1; + netns_only = 1; + *userns = 0; break; - case 'e': - if (logfile) - die("Can't log to both file and stderr"); + case 21: + conf_nat(optarg, &c->ip4.map_host_loopback, + &c->ip6.map_host_loopback, &no_map_gw); + break; + case 22: + conf_nat(optarg, &c->ip4.map_guest_addr, + &c->ip6.map_guest_addr, NULL); + break; + case 23: + if (c->mode != MODE_PASTA) + die("--host-lo-to-ns-lo is for pasta mode only"); + c->host_lo_to_ns_lo = 1; + break; + case 24: + if (inet_pton(AF_INET6, optarg, &c->ip6.dns_host) && + !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_host)) + break; - if (c->force_stderr) - 
die("Multiple --stderr options given"); + if (inet_pton(AF_INET, optarg, &c->ip4.dns_host) && + !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_host) && + !IN4_IS_ADDR_BROADCAST(&c->ip4.dns_host)) + break; - c->force_stderr = 1; + die("Invalid host nameserver address: %s", optarg); + case 25: + /* Already handled in conf_mode() */ + ASSERT(c->mode == MODE_VU); break; - case 'l': - if (c->force_stderr) - die("Can't log to both stderr and file"); + case 26: + vu_print_capabilities(); + break; + case 27: + if (snprintf_check(c->fqdn, PASST_MAXDNAME, + "%s", optarg)) + die("Invalid FQDN: %s", optarg); + break; + case 28: + if (c->mode != MODE_VU && strcmp(optarg, "none")) + die("--repair-path is for vhost-user mode only"); - if (logfile) - die("Multiple --log-file options given"); + if (snprintf_check(c->repair_path, + sizeof(c->repair_path), "%s", + optarg)) + die("Invalid passt-repair path: %s", optarg); + break; + case 'd': + c->debug = 1; + c->quiet = 0; + break; + case 'e': + warn("--stderr will be dropped soon"); + break; + case 'l': logfile = optarg; break; case 'q': - if (c->quiet) - die("Multiple --quiet options given"); - - if (c->debug) - die("Either --debug or --quiet"); - c->quiet = 1; + c->debug = c->trace = 0; break; case 'f': - if (c->foreground) - die("Multiple --foreground options given"); - c->foreground = 1; break; case 's': - if (*c->sock_path) - die("Multiple --socket options given"); + if (c->mode == MODE_PASTA) + die("-s is for passt / vhost-user mode only"); - ret = snprintf(c->sock_path, UNIX_SOCK_MAX - 1, "%s", + ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->sock_path)) die("Invalid socket path: %s", optarg); + c->fd_tap = -1; break; case 'F': - if (c->fd_tap >= 0) - die("Multiple --fd options given"); - errno = 0; - c->fd_tap = strtol(optarg, NULL, 0); + fd_tap_opt = strtol(optarg, NULL, 0); - if (c->fd_tap < 0 || errno) + if (errno || + (fd_tap_opt != STDIN_FILENO && fd_tap_opt <= 
STDERR_FILENO) || + fd_tap_opt > INT_MAX) die("Invalid --fd: %s", optarg); + c->fd_tap = fd_tap_opt; c->one_off = true; - + *c->sock_path = 0; break; case 'I': - if (*c->pasta_ifn) - die("Multiple --ns-ifname options given"); + if (c->mode != MODE_PASTA) + die("-I is for pasta mode only"); ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s", optarg); @@ -1430,61 +1739,57 @@ void conf(struct ctx *c, int argc, char **argv) break; case 'p': - if (*c->pcap) - die("Multiple --pcap options given"); - ret = snprintf(c->pcap, sizeof(c->pcap), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->pcap)) die("Invalid pcap path: %s", optarg); break; case 'P': - if (*c->pid_file) - die("Multiple --pid options given"); - - ret = snprintf(c->pid_file, sizeof(c->pid_file), "%s", + ret = snprintf(c->pidfile, sizeof(c->pidfile), "%s", optarg); - if (ret <= 0 || ret >= (int)sizeof(c->pid_file)) + if (ret <= 0 || ret >= (int)sizeof(c->pidfile)) die("Invalid PID file: %s", optarg); break; - case 'm': - if (c->mtu) - die("Multiple --mtu options given"); + case 'm': { + unsigned long mtu; + char *e; errno = 0; - c->mtu = strtol(optarg, NULL, 0); + mtu = strtoul(optarg, &e, 0); - if (!c->mtu) { - c->mtu = -1; - break; - } - - if (c->mtu < ETH_MIN_MTU || c->mtu > (int)ETH_MAX_MTU || - errno) + if (errno || *e) die("Invalid MTU: %s", optarg); + if (mtu > max_mtu) { + die("MTU %lu too large (max %lu)", + mtu, max_mtu); + } + + c->mtu = mtu; break; + } case 'a': - if (c->mode == MODE_PASTA) - c->no_copy_addrs = 1; - - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr) && - inet_pton(AF_INET6, optarg, &c->ip6.addr) && + if (inet_pton(AF_INET6, optarg, &c->ip6.addr) && !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr) && !IN6_IS_ADDR_LOOPBACK(&c->ip6.addr) && !IN6_IS_ADDR_V4MAPPED(&c->ip6.addr) && !IN6_IS_ADDR_V4COMPAT(&c->ip6.addr) && - !IN6_IS_ADDR_MULTICAST(&c->ip6.addr)) + !IN6_IS_ADDR_MULTICAST(&c->ip6.addr)) { + if (c->mode == MODE_PASTA) + c->ip6.no_copy_addrs = true; break; + } - if 
(IN4_IS_ADDR_UNSPECIFIED(&c->ip4.addr) && - inet_pton(AF_INET, optarg, &c->ip4.addr) && + if (inet_pton(AF_INET, optarg, &c->ip4.addr) && !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.addr) && !IN4_IS_ADDR_BROADCAST(&c->ip4.addr) && !IN4_IS_ADDR_LOOPBACK(&c->ip4.addr) && - !IN4_IS_ADDR_MULTICAST(&c->ip4.addr)) + !IN4_IS_ADDR_MULTICAST(&c->ip4.addr)) { + if (c->mode == MODE_PASTA) + c->ip4.no_copy_addrs = true; break; + } die("Invalid address: %s", optarg); break; @@ -1495,45 +1800,34 @@ void conf(struct ctx *c, int argc, char **argv) break; case 'M': - for (i = 0; i < ETH_ALEN; i++) { - errno = 0; - b = strtol(optarg + (intptr_t)i * 3, NULL, 16); - if (b < 0 || b > UCHAR_MAX || errno) - die("Invalid MAC address: %s", optarg); - - c->mac[i] = b; - } + parse_mac(c->our_tap_mac, optarg); break; case 'g': - if (c->mode == MODE_PASTA) - c->no_copy_routes = 1; - - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.gw) && - inet_pton(AF_INET6, optarg, &c->ip6.gw) && - !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.gw) && - !IN6_IS_ADDR_LOOPBACK(&c->ip6.gw)) + if (inet_pton(AF_INET6, optarg, &c->ip6.guest_gw) && + !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.guest_gw) && + !IN6_IS_ADDR_LOOPBACK(&c->ip6.guest_gw)) { + if (c->mode == MODE_PASTA) + c->ip6.no_copy_routes = true; break; + } - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.gw) && - inet_pton(AF_INET, optarg, &c->ip4.gw) && - !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.gw) && - !IN4_IS_ADDR_BROADCAST(&c->ip4.gw) && - !IN4_IS_ADDR_LOOPBACK(&c->ip4.gw)) + if (inet_pton(AF_INET, optarg, &c->ip4.guest_gw) && + !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw) && + !IN4_IS_ADDR_BROADCAST(&c->ip4.guest_gw) && + !IN4_IS_ADDR_LOOPBACK(&c->ip4.guest_gw)) { + if (c->mode == MODE_PASTA) + c->ip4.no_copy_routes = true; break; + } die("Invalid gateway address: %s", optarg); break; case 'i': - if (ifi4 || ifi6) - die("Redundant interface: %s", optarg); - if (!(ifi4 = ifi6 = if_nametoindex(optarg))) - die("Invalid interface name %s: %s", optarg, - strerror(errno)); + die_perror("Invalid interface name %s", 
optarg); break; case 'o': - if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_out) && - inet_pton(AF_INET6, optarg, &c->ip6.addr_out) && + if (inet_pton(AF_INET6, optarg, &c->ip6.addr_out) && !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_out) && !IN6_IS_ADDR_LOOPBACK(&c->ip6.addr_out) && !IN6_IS_ADDR_V4MAPPED(&c->ip6.addr_out) && @@ -1541,8 +1835,7 @@ void conf(struct ctx *c, int argc, char **argv) !IN6_IS_ADDR_MULTICAST(&c->ip6.addr_out)) break; - if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.addr_out) && - inet_pton(AF_INET, optarg, &c->ip4.addr_out) && + if (inet_pton(AF_INET, optarg, &c->ip4.addr_out) && !IN4_IS_ADDR_UNSPECIFIED(&c->ip4.addr_out) && !IN4_IS_ADDR_BROADCAST(&c->ip4.addr_out) && !IN4_IS_ADDR_MULTICAST(&c->ip4.addr_out)) @@ -1551,49 +1844,16 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid or redundant outbound address: %s", optarg); break; - case 'D': - if (!strcmp(optarg, "none")) { - if (c->no_dns) - die("Redundant DNS options"); - - if (dns4 - c->ip4.dns || dns6 - c->ip6.dns) - die("Conflicting DNS options"); - - c->no_dns = 1; - break; - } - - if (c->no_dns) - die("Conflicting DNS options"); - - if (dns4 - &c->ip4.dns[0] < ARRAY_SIZE(c->ip4.dns) && - inet_pton(AF_INET, optarg, &dns4_tmp)) { - add_dns4(c, &dns4_tmp, &dns4); - break; - } - - if (dns6 - &c->ip6.dns[0] < ARRAY_SIZE(c->ip6.dns) && - inet_pton(AF_INET6, optarg, &dns6_tmp)) { - add_dns6(c, &dns6_tmp, &dns6); - break; - } - - die("Cannot use DNS address %s", optarg); - break; case 'S': if (!strcmp(optarg, "none")) { - if (c->no_dns_search) - die("Redundant DNS search options"); + c->no_dns_search = 1; - if (dnss != c->dns_search) - die("Conflicting DNS search options"); + memset(c->dns_search, 0, sizeof(c->dns_search)); - c->no_dns_search = 1; break; } - if (c->no_dns_search) - die("Conflicting DNS search options"); + c->no_dns_search = 0; if (dnss - c->dns_search < ARRAY_SIZE(c->dns_search)) { ret = snprintf(dnss->n, sizeof(*c->dns_search), @@ -1607,43 +1867,49 @@ void conf(struct ctx *c, int 
argc, char **argv) die("Cannot use DNS search domain %s", optarg); break; + case 'H': + if (snprintf_check(c->hostname, PASST_MAXDNAME, + "%s", optarg)) + die("Invalid hostname: %s", optarg); + break; case '4': v4_only = true; + v6_only = false; break; case '6': v6_only = true; + v4_only = false; break; case '1': - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("--one-off is for passt mode only"); - if (c->one_off) - die("Redundant --one-off option"); - c->one_off = true; break; case 't': case 'u': + case 'D': + /* Handle these later, once addresses are configured */ + break; case 'T': case 'U': - /* Handle these later, once addresses are configured */ + if (c->mode != MODE_PASTA) + die("-%c is for pasta mode only", name); + + /* Handle properly later, once addresses are configured */ break; case 'h': - log_to_stdout = 1; - print_usage(argv[0], EXIT_SUCCESS); + usage(argv[0], stdout, EXIT_SUCCESS); break; case '?': default: - usage(argv[0]); + usage(argv[0], stderr, EXIT_FAILURE); break; } } while (name != -1); - if (v4_only && v6_only) - die("Options ipv4-only and ipv6-only are mutually exclusive"); - - if (*c->sock_path && c->fd_tap >= 0) - die("Options --socket and --fd are mutually exclusive"); + if (c->mode != MODE_PASTA) + c->no_splice = 1; if (c->mode == MODE_PASTA && !c->pasta_conf_ns) { if (copy_routes_opt) @@ -1660,14 +1926,11 @@ void conf(struct ctx *c, int argc, char **argv) conf_ugid(runas, &uid, &gid); - if (logfile) { - logfile_init(c->mode == MODE_PASST ? "passt" : "pasta", - logfile, logsize); - } + if (logfile) + logfile_init(logname, logfile, logsize); + else + __openlog(logname, 0, LOG_DAEMON); - /* Once the log mask is not LOG_EARLY, we will no longer log to stderr - * if there was a log file specified. 
- */ if (c->debug) __setlogmask(LOG_UPTO(LOG_DEBUG)); else if (c->quiet) @@ -1675,32 +1938,102 @@ void conf(struct ctx *c, int argc, char **argv) else __setlogmask(LOG_UPTO(LOG_INFO)); + log_conf_parsed = true; /* Stop printing everything */ + nl_sock_init(c, false); if (!v6_only) - c->ifi4 = conf_ip4(ifi4, &c->ip4, c->mac); + c->ifi4 = conf_ip4(ifi4, &c->ip4); if (!v4_only) - c->ifi6 = conf_ip6(ifi6, &c->ip6, c->mac); - if ((!c->ifi4 && !c->ifi6) || - (*c->ip4.ifname_out && !c->ifi4) || + c->ifi6 = conf_ip6(ifi6, &c->ip6); + + if (c->ifi4 && c->mtu < IPV4_MIN_MTU) { + warn("MTU %"PRIu16" is too small for IPv4 (minimum %u)", + c->mtu, IPV4_MIN_MTU); + } + if (c->ifi6 && c->mtu < IPV6_MIN_MTU) { + warn("MTU %"PRIu16" is too small for IPv6 (minimum %u)", + c->mtu, IPV6_MIN_MTU); + } + + if ((*c->ip4.ifname_out && !c->ifi4) || (*c->ip6.ifname_out && !c->ifi6)) die("External interface not usable"); - if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.gw)) - c->no_map_gw = c->no_dhcp = 1; - if (c->ifi6 && IN6_IS_ADDR_UNSPECIFIED(&c->ip6.gw)) - c->no_map_gw = 1; + if (!c->ifi4 && !c->ifi6) { + info("No external interface as template, switch to local mode"); + + conf_ip4_local(&c->ip4); + c->ifi4 = -1; + + conf_ip6_local(&c->ip6); + c->ifi6 = -1; - /* Inbound port options can be parsed now (after IPv4/IPv6 settings) */ + if (!*c->pasta_ifn) { + strncpy(c->pasta_ifn, pasta_default_ifn, + sizeof(c->pasta_ifn) - 1); + } + } + + if (c->ifi4 && !no_map_gw && + IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback)) + c->ip4.map_host_loopback = c->ip4.guest_gw; + + if (c->ifi6 && !no_map_gw && + IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback)) + c->ip6.map_host_loopback = c->ip6.guest_gw; + + if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw)) + c->no_dhcp = 1; + + /* Inbound port options and DNS can be parsed now, after IPv4/IPv6 + * settings + */ + fwd_probe_ephemeral(); udp_portmap_clear(); - optind = 1; + optind = 0; do { name = getopt_long(argc, argv, optstring, options, 
NULL); - if (name == 't') + if (name == 't') { conf_ports(c, name, optarg, &c->tcp.fwd_in); - else if (name == 'u') - conf_ports(c, name, optarg, &c->udp.fwd_in.f); + } else if (name == 'u') { + conf_ports(c, name, optarg, &c->udp.fwd_in); + } else if (name == 'D') { + struct in6_addr dns6_tmp; + struct in_addr dns4_tmp; + + if (!strcmp(optarg, "none")) { + c->no_dns = 1; + + dns4_idx = 0; + memset(c->ip4.dns, 0, sizeof(c->ip4.dns)); + c->ip4.dns[0] = (struct in_addr){ 0 }; + c->ip4.dns_match = (struct in_addr){ 0 }; + c->ip4.dns_host = (struct in_addr){ 0 }; + + dns6_idx = 0; + memset(c->ip6.dns, 0, sizeof(c->ip6.dns)); + c->ip6.dns_match = (struct in6_addr){ 0 }; + c->ip6.dns_host = (struct in6_addr){ 0 }; + + continue; + } + + c->no_dns = 0; + + if (inet_pton(AF_INET, optarg, &dns4_tmp)) { + dns4_idx += add_dns4(c, &dns4_tmp, dns4_idx); + continue; + } + + if (inet_pton(AF_INET6, optarg, &dns6_tmp)) { + dns6_idx += add_dns6(c, &dns6_tmp, dns6_idx); + continue; + } + + die("Cannot use DNS address %s", optarg); + } } while (name != -1); if (c->mode == MODE_PASTA) @@ -1708,6 +2041,8 @@ void conf(struct ctx *c, int argc, char **argv) else if (optind != argc) die("Extra non-option argument: %s", argv[optind]); + conf_open_files(c); /* Before any possible setuid() / setgid() */ + isolate_user(uid, gid, !netns_only, userns, c->mode); if (c->pasta_conf_ns) @@ -1726,14 +2061,14 @@ void conf(struct ctx *c, int argc, char **argv) nl_sock_init(c, true); /* ...and outbound port options now that namespaces are set up. 
*/ - optind = 1; + optind = 0; do { name = getopt_long(argc, argv, optstring, options, NULL); if (name == 'T') conf_ports(c, name, optarg, &c->tcp.fwd_out); else if (name == 'U') - conf_ports(c, name, optarg, &c->udp.fwd_out.f); + conf_ports(c, name, optarg, &c->udp.fwd_out); } while (name != -1); if (!c->ifi4) @@ -1742,17 +2077,16 @@ void conf(struct ctx *c, int argc, char **argv) if (!c->ifi6) { c->no_ndp = 1; c->no_dhcpv6 = 1; + } else if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) { + c->no_dhcpv6 = 1; } - if (!c->mtu) - c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)); - get_dns(c); if (!*c->pasta_ifn) { - if (c->ifi4) + if (c->ifi4 > 0) if_indextoname(c->ifi4, c->pasta_ifn); - else + else if (c->ifi6 > 0) if_indextoname(c->ifi6, c->pasta_ifn); } @@ -1760,10 +2094,10 @@ void conf(struct ctx *c, int argc, char **argv) c->tcp.fwd_in.mode = fwd_default; if (!c->tcp.fwd_out.mode) c->tcp.fwd_out.mode = fwd_default; - if (!c->udp.fwd_in.f.mode) - c->udp.fwd_in.f.mode = fwd_default; - if (!c->udp.fwd_out.f.mode) - c->udp.fwd_out.f.mode = fwd_default; + if (!c->udp.fwd_in.mode) + c->udp.fwd_in.mode = fwd_default; + if (!c->udp.fwd_out.mode) + c->udp.fwd_out.mode = fwd_default; fwd_scan_ports_init(c); @@ -6,6 +6,7 @@ #ifndef CONF_H #define CONF_H +enum passt_modes conf_mode(int argc, char *argv[]); void conf(struct ctx *c, int argc, char **argv); #endif /* CONF_H */ diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt index 6bb25e0..43fd63f 100644 --- a/contrib/apparmor/abstractions/passt +++ b/contrib/apparmor/abstractions/passt @@ -26,13 +26,16 @@ capability sys_ptrace, / r, # isolate_prefork(), isolation.c - mount options=(rw, runbindable) /, + mount options=(rw, runbindable) -> /, + mount "" -> "/", mount "" -> "/tmp/", pivot_root "/tmp/" -> "/tmp/", umount "/", owner @{PROC}/@{pid}/uid_map r, # conf_ugid() + @{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral() + network netlink raw, # nl_sock_init_do(), 
netlink.c network inet stream, # tcp.c diff --git a/contrib/apparmor/abstractions/pasta b/contrib/apparmor/abstractions/pasta index a890391..9f73bee 100644 --- a/contrib/apparmor/abstractions/pasta +++ b/contrib/apparmor/abstractions/pasta @@ -27,8 +27,9 @@ @{PROC}/@{pid}/net/udp r, @{PROC}/@{pid}/net/udp6 r, - @{run}/user/@{uid}/netns/* r, # pasta_open_ns(), pasta.c + @{run}/user/@{uid}/** rw, # pasta_open_ns() + @{PROC}/[0-9]*/ns/ r, # pasta_netns_quit_init(), @{PROC}/[0-9]*/ns/net r, # pasta_wait_for_ns(), @{PROC}/[0-9]*/ns/user r, # conf_pasta_ns() @@ -42,3 +43,5 @@ /{usr/,}bin/** Ux, /usr/bin/pasta.avx2 ix, # arch_avx2_exec(), arch.c + + ptrace r, # pasta_open_ns() diff --git a/contrib/apparmor/usr.bin.passt b/contrib/apparmor/usr.bin.passt index 564f82f..62a4514 100644 --- a/contrib/apparmor/usr.bin.passt +++ b/contrib/apparmor/usr.bin.passt @@ -19,9 +19,33 @@ profile passt /usr/bin/passt{,.avx2} { include <abstractions/passt> # Alternatively: include <abstractions/user-tmp> - owner /tmp/** w, # tap_sock_unix_init(), pcap(), - # write_pidfile(), + owner /tmp/** w, # tap_sock_unix_open(), + # tap_sock_unix_init(), pcap(), + # pidfile_open(), + # pidfile_write(), # logfile_init() - owner @{HOME}/** w, # pcap(), write_pidfile() + owner @{HOME}/** w, # pcap(), pidfile_open(), + # pidfile_write() + + # Workaround: libvirt's profile comes with a passt subprofile which includes, + # in turn, <abstractions/passt>, and adds libvirt-specific rules on top, to + # allow passt (when started by libvirtd) to write socket and PID files in the + # location requested by libvirtd itself, and to execute passt itself. + # + # However, when libvirt runs as unprivileged user, the mechanism based on + # virt-aa-helper, designed to build per-VM profiles as guests are started, + # doesn't work. The helper needs to create and load profiles on the fly, which + # can't be done by unprivileged users, of course. 
+ # + # As a result, libvirtd runs unconfined if guests are started by unprivileged + # users, starting passt unconfined as well, which means that passt runs under + # its own stand-alone profile (this one), which implies in turn that execve() + # of /usr/bin/passt is not allowed, and socket and PID files can't be written. + # + # Duplicate libvirt-specific rules here as long as this is not solved in + # libvirt's profile itself. + /usr/bin/passt r, + owner @{run}/user/[0-9]*/libvirt/qemu/run/passt/* rw, + owner @{run}/libvirt/qemu/passt/* rw, } diff --git a/contrib/apparmor/usr.bin.passt-repair b/contrib/apparmor/usr.bin.passt-repair new file mode 100644 index 0000000..901189d --- /dev/null +++ b/contrib/apparmor/usr.bin.passt-repair @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# contrib/apparmor/usr.bin.passt-repair - AppArmor profile for passt-repair(1) +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +abi <abi/3.0>, + +#include <tunables/global> + +profile passt-repair /usr/bin/passt-repair { + #include <abstractions/base> + /** rw, # passt's ".repair" socket might be anywhere + unix (connect, receive, send) type=stream, + + capability dac_override, # connect to passt's socket as root + capability net_admin, # currently needed for TCP_REPAIR socket option + capability net_raw, # what TCP_REPAIR should require instead + + network unix stream, # connect and use UNIX domain socket + network inet stream, # use TCP sockets +} diff --git a/contrib/apparmor/usr.bin.pasta b/contrib/apparmor/usr.bin.pasta index e5ee4df..2483968 100644 --- a/contrib/apparmor/usr.bin.pasta +++ b/contrib/apparmor/usr.bin.pasta @@ -19,9 +19,13 @@ profile pasta /usr/bin/pasta{,.avx2} flags=(attach_disconnected) { include <abstractions/pasta> # Alternatively: include 
<abstractions/user-tmp> - owner /tmp/** w, # tap_sock_unix_init(), pcap(), - # write_pidfile(), - # logfile_init() + /tmp/** rw, # tap_sock_unix_open(), + # tap_sock_unix_init(), pcap(), + # pidfile_open(), + # pidfile_write(), + # logfile_init(), + # pasta_open_ns() - owner @{HOME}/** w, # pcap(), write_pidfile() + owner @{HOME}/** w, # pcap(), pidfile_open(), + # pidfile_write() } diff --git a/contrib/fedora/passt.spec b/contrib/fedora/passt.spec index 7950fb9..663289f 100644 --- a/contrib/fedora/passt.spec +++ b/contrib/fedora/passt.spec @@ -9,6 +9,7 @@ %global git_hash {{{ git_head }}} %global selinuxtype targeted +%global selinux_policy_version 41.41 Name: passt Version: {{{ git_version }}} @@ -33,18 +34,22 @@ for network namespaces: traffic is forwarded using a tap interface inside the namespace, without the need to create further interfaces on the host, hence not requiring any capabilities or privileges. -%package selinux -BuildArch: noarch -Summary: SELinux support for passt and pasta -Requires: %{name} = %{version}-%{release} -Requires: selinux-policy -Requires(post): %{name} -Requires(post): policycoreutils -Requires(preun): %{name} -Requires(preun): policycoreutils +%package selinux +BuildArch: noarch +Summary: SELinux support for passt and pasta +Requires: selinux-policy-%{selinuxtype} +Requires: container-selinux +Requires(post): selinux-policy-%{selinuxtype} +Requires(post): container-selinux +Requires(post): policycoreutils +Requires(post): libselinux-utils +Requires(preun): policycoreutils +BuildRequires: selinux-policy-devel +BuildRequires: pkgconfig(systemd) +Recommends: selinux-policy-%{selinuxtype} >= %{selinux_policy_version} %description selinux -This package adds SELinux enforcement to passt(1) and pasta(1). +This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1). 
%prep %setup -q -n passt-%{git_hash} @@ -82,23 +87,33 @@ make -f %{_datadir}/selinux/devel/Makefile install -p -m 644 -D passt.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt.pp install -p -m 644 -D passt.if %{buildroot}%{_datadir}/selinux/devel/include/distributed/passt.if install -p -m 644 -D pasta.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp +install -p -m 644 -D passt-repair.pp %{buildroot}%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp popd %pre selinux %selinux_relabel_pre -s %{selinuxtype} %post selinux -%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp -%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp +%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %postun selinux if [ $1 -eq 0 ]; then - %selinux_modules_uninstall -s %{selinuxtype} passt - %selinux_modules_uninstall -s %{selinuxtype} pasta + %selinux_modules_uninstall -s %{selinuxtype} passt pasta passt-repair fi %posttrans selinux %selinux_relabel_post -s %{selinuxtype} +# %selinux_relabel_post calls fixfiles(8) with the previous file_contexts file +# (see selabel_file(5)) in order to restore only the file contexts which +# actually changed. However, as file_contexts doesn't support %{USERID} +# substitutions, this will not work for specific file contexts that pasta needs +# to have under /run/user. +# +# Restore those explicitly, hiding errors from restorecon(8): we can't pass a +# path that's more specific than this, but at the same time /run/user often +# contains FUSE mountpoints that can't be accessed as root, leading to +# "Permission denied" messages, but not failures. 
+restorecon -R /run/user 2>/dev/null %files %license LICENSES/{GPL-2.0-or-later.txt,BSD-3-Clause.txt} @@ -108,9 +123,11 @@ fi %{_bindir}/passt %{_bindir}/pasta %{_bindir}/qrap +%{_bindir}/passt-repair %{_mandir}/man1/passt.1* %{_mandir}/man1/pasta.1* %{_mandir}/man1/qrap.1* +%{_mandir}/man1/passt-repair.1* %ifarch x86_64 %{_bindir}/passt.avx2 %{_mandir}/man1/passt.avx2.1* @@ -122,6 +139,7 @@ fi %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/devel/include/distributed/passt.if %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp +%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %changelog {{{ passt_git_changelog }}} diff --git a/contrib/fedora/rpkg.macros b/contrib/fedora/rpkg.macros index c226c84..c98b791 100644 --- a/contrib/fedora/rpkg.macros +++ b/contrib/fedora/rpkg.macros @@ -29,7 +29,11 @@ function passt_git_changelog_entry { [ -z "${__from}" ] && __from="$(git rev-list --max-parents=0 HEAD)" __date="$(git log --pretty="format:%cI" "${__to}" -1)" - __author="$(git log -1 --pretty="format:%an <%ae>" ${__to} -- contrib/fedora)" + __author="Stefano Brivio <sbrivio@redhat.com>" + # Use: + # __author="$(git log -1 --pretty="format:%an <%ae>" ${__to} -- contrib/fedora)" + # if you want the author of changelog entries to match the latest + # author for contrib/fedora printf "* %s %s - %s\n" "$(date "+%a %b %e %Y" -d "${__date}")" "${__author}" "$(git_version "${__to}")-1" diff --git a/contrib/selinux/passt-repair.fc b/contrib/selinux/passt-repair.fc new file mode 100644 index 0000000..bcd526e --- /dev/null +++ b/contrib/selinux/passt-repair.fc @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# contrib/selinux/passt-repair.fc - SELinux: File Context for passt-repair +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +/usr/bin/passt-repair system_u:object_r:passt_repair_exec_t:s0 diff --git 
a/contrib/selinux/passt-repair.te b/contrib/selinux/passt-repair.te new file mode 100644 index 0000000..7157dfb --- /dev/null +++ b/contrib/selinux/passt-repair.te @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# contrib/selinux/passt-repair.te - SELinux: Type Enforcement for passt-repair +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +policy_module(passt-repair, 0.1) + +require { + type unconfined_t; + type passt_t; + role unconfined_r; + class process transition; + + class file { read execute execute_no_trans entrypoint open map }; + class capability { dac_override net_admin net_raw }; + class chr_file { append open getattr read write ioctl }; + + class unix_stream_socket { create connect sendto }; + class sock_file { read write }; + + class tcp_socket { read setopt write }; + + type console_device_t; + type user_devpts_t; + type user_tmp_t; + + # Workaround: passt-repair needs to needs to access socket files + # that passt, started by libvirt, might create under different + # labels, depending on whether passt is started as root or not. + # + # However, libvirt doesn't maintain its own policy, which makes + # updates particularly complicated. To avoid breakage in the short + # term, deal with that in passt's own policy. 
+ type qemu_var_run_t; + type virt_var_run_t; +} + +type passt_repair_t; +domain_type(passt_repair_t); +type passt_repair_exec_t; +corecmd_executable_file(passt_repair_exec_t); + +role unconfined_r types passt_repair_t; + +allow passt_repair_t passt_repair_exec_t:file { read execute execute_no_trans entrypoint open map }; +type_transition unconfined_t passt_repair_exec_t:process passt_repair_t; +allow unconfined_t passt_repair_t:process transition; + +allow passt_repair_t self:capability { dac_override dac_read_search net_admin net_raw }; +allow passt_repair_t self:capability2 bpf; + +allow passt_repair_t console_device_t:chr_file { append open getattr read write ioctl }; +allow passt_repair_t user_devpts_t:chr_file { append open getattr read write ioctl }; + +allow passt_repair_t unconfined_t:unix_stream_socket { connectto read write }; +allow passt_repair_t passt_t:unix_stream_socket { connectto read write }; +allow passt_repair_t user_tmp_t:unix_stream_socket { connectto read write }; + +allow passt_repair_t user_tmp_t:dir { getattr read search watch }; + +allow passt_repair_t unconfined_t:sock_file { getattr read write }; +allow passt_repair_t passt_t:sock_file { getattr read write }; +allow passt_repair_t user_tmp_t:sock_file { getattr read write }; + +allow passt_repair_t unconfined_t:tcp_socket { read setopt write }; +allow passt_repair_t passt_t:tcp_socket { read setopt write }; + +# Workaround: passt-repair needs to needs to access socket files +# that passt, started by libvirt, might create under different +# labels, depending on whether passt is started as root or not. +# +# However, libvirt doesn't maintain its own policy, which makes +# updates particularly complicated. To avoid breakage in the short +# term, deal with that in passt's own policy. 
+allow passt_repair_t qemu_var_run_t:unix_stream_socket { connectto read write }; +allow passt_repair_t virt_var_run_t:unix_stream_socket { connectto read write }; + +allow passt_repair_t qemu_var_run_t:dir { getattr read search watch }; +allow passt_repair_t virt_var_run_t:dir { getattr read search watch }; + +allow passt_repair_t qemu_var_run_t:sock_file { getattr read write }; +allow passt_repair_t virt_var_run_t:sock_file { getattr read write }; diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te index bbb0917..eb9ce72 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te @@ -20,9 +20,19 @@ require { type fs_t; type tmp_t; type user_tmp_t; + type user_home_t; type tmpfs_t; type root_t; + # Workaround: passt --vhost-user needs to map guest memory, but + # libvirt doesn't maintain its own policy, which makes updates + # particularly complicated. To avoid breakage in the short term, + # deal with it in passt's own policy. + type svirt_image_t; + type svirt_tmpfs_t; + type svirt_t; + type null_device_t; + class file { ioctl getattr setattr create read write unlink open relabelto execute execute_no_trans map }; class dir { search write add_name remove_name mounton }; class chr_file { append read write open getattr ioctl }; @@ -38,8 +48,8 @@ require { type net_conf_t; type proc_net_t; type node_t; - class tcp_socket { create accept listen name_bind name_connect }; - class udp_socket { create accept listen }; + class tcp_socket { create accept listen name_bind name_connect getattr ioctl }; + class udp_socket { create accept listen getattr }; class icmp_socket { bind create name_bind node_bind setopt read write }; class sock_file { create unlink write }; @@ -47,9 +57,8 @@ require { type port_t; type http_port_t; - type passwd_file_t; - class netlink_route_socket { bind create nlmsg_read }; + type sysctl_net_t; class capability { sys_tty_config setuid setgid }; class cap_userns { setpcap sys_admin sys_ptrace }; @@ -81,6 +90,9 @@ allow passt_t 
root_t:dir mounton; allow passt_t tmp_t:dir { add_name mounton remove_name write }; allow passt_t tmpfs_t:filesystem mount; allow passt_t fs_t:filesystem unmount; +allow passt_t user_home_t:dir search; +allow passt_t user_tmp_t:fifo_file append; +allow passt_t user_tmp_t:file map; manage_files_pattern(passt_t, user_tmp_t, user_tmp_t) files_pid_filetrans(passt_t, user_tmp_t, file) @@ -95,8 +107,7 @@ allow passt_t self:capability { sys_tty_config setpcap net_bind_service setuid s allow passt_t self:cap_userns { setpcap sys_admin sys_ptrace }; allow passt_t self:user_namespace create; -allow passt_t passwd_file_t:file read_file_perms; -sssd_search_lib(passt_t) +auth_read_passwd(passt_t) allow passt_t proc_net_t:file read; allow passt_t net_conf_t:file { open read }; @@ -104,6 +115,8 @@ allow passt_t net_conf_t:lnk_file read; allow passt_t tmp_t:sock_file { create unlink write }; allow passt_t self:netlink_route_socket { bind create nlmsg_read read write setopt }; kernel_search_network_sysctl(passt_t) +allow passt_t sysctl_net_t:dir search; +allow passt_t sysctl_net_t:file { open read }; corenet_tcp_bind_all_nodes(passt_t) corenet_udp_bind_all_nodes(passt_t) @@ -119,11 +132,19 @@ corenet_udp_sendrecv_all_ports(passt_t) allow passt_t node_t:icmp_socket { name_bind node_bind }; allow passt_t port_t:icmp_socket name_bind; -allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write }; -allow passt_t self:udp_socket { create getopt setopt connect bind read write }; +allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr ioctl }; +allow passt_t self:udp_socket { create getopt setopt connect bind read write getattr }; allow passt_t self:icmp_socket { bind create setopt read write }; allow passt_t user_tmp_t:dir { add_name write }; allow passt_t user_tmp_t:file { create open }; allow passt_t user_tmp_t:sock_file { create read write unlink }; allow passt_t 
unconfined_t:unix_stream_socket { read write }; + +# Workaround: passt --vhost-user needs to map guest memory, but +# libvirt doesn't maintain its own policy, which makes updates +# particularly complicated. To avoid breakage in the short term, +# deal with it in passt's own policy. +allow passt_t svirt_image_t:file { read write map }; +allow passt_t svirt_tmpfs_t:file { read write map }; +allow passt_t null_device_t:chr_file map; diff --git a/contrib/selinux/pasta.fc b/contrib/selinux/pasta.fc index 41ee46d..e4aefc4 100644 --- a/contrib/selinux/pasta.fc +++ b/contrib/selinux/pasta.fc @@ -8,7 +8,9 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -/usr/bin/pasta system_u:object_r:pasta_exec_t:s0 -/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0 -/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0 -/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0 +/usr/bin/pasta system_u:object_r:pasta_exec_t:s0 +/usr/bin/pasta.avx2 system_u:object_r:pasta_exec_t:s0 +/tmp/pasta\.pcap system_u:object_r:pasta_log_t:s0 +/var/run/pasta\.pid system_u:object_r:pasta_pid_t:s0 +/run/user/%{USERID}/netns system_u:object_r:ifconfig_var_run_t:s0 +/run/user/%{USERID}/containers/networks/rootless-netns system_u:object_r:ifconfig_var_run_t:s0 diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te index 0ceda06..9440d05 100644 --- a/contrib/selinux/pasta.te +++ b/contrib/selinux/pasta.te @@ -18,6 +18,7 @@ require { type bin_t; type user_home_t; type user_home_dir_t; + type user_tmp_t; type fs_t; type tmp_t; type tmpfs_t; @@ -56,8 +57,10 @@ require { attribute port_type; type port_t; type http_port_t; + type http_cache_port_t; type ssh_port_t; type reserved_port_t; + type unreserved_port_t; type dns_port_t; type dhcpc_port_t; type chronyd_port_t; @@ -68,9 +71,6 @@ require { type system_dbusd_t; type systemd_hostnamed_t; type systemd_systemctl_exec_t; - type passwd_file_t; - type sssd_public_t; - type sssd_var_lib_t; class dbus send_msg; class system 
module_request; class system status; @@ -89,6 +89,15 @@ require { class capability { sys_tty_config setuid setgid }; class cap_userns { setpcap sys_admin sys_ptrace net_bind_service net_admin }; class user_namespace create; + + # Container requires + attribute_role usernetctl_roles; + role container_user_r; + role staff_r; + role user_r; + type container_runtime_t; + type container_t; + type systemd_user_runtimedir_t; } type pasta_t; @@ -113,10 +122,12 @@ init_daemon_domain(pasta_t, pasta_exec_t) allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read_search net_admin sys_resource setuid setgid }; allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service }; +# pasta only calls setuid and setgid with the current UID and GID, so this +# denial is harmless. See https://bugzilla.redhat.com/show_bug.cgi?id=2330512#c10 +dontaudit pasta_t self:cap_userns { setgid setuid }; allow pasta_t self:user_namespace create; -allow pasta_t passwd_file_t:file read_file_perms; -sssd_search_lib(pasta_t) +auth_read_passwd(pasta_t) domain_auto_trans(pasta_t, bin_t, unconfined_t); domain_auto_trans(pasta_t, shell_exec_t, unconfined_t); @@ -126,17 +137,22 @@ domain_auto_trans(pasta_t, ping_exec_t, ping_t); allow pasta_t nsfs_t:file { open read }; -allow pasta_t user_home_t:dir getattr; -allow pasta_t user_home_t:file { open read getattr setattr }; +allow pasta_t user_home_t:dir { getattr search }; +allow pasta_t user_home_t:file { open read getattr setattr execute execute_no_trans map}; allow pasta_t user_home_dir_t:dir { search getattr open add_name read write }; allow pasta_t user_home_dir_t:file { create open read write }; allow pasta_t tmp_t:dir { add_name mounton remove_name write }; -allow pasta_t tmpfs_t:filesystem mount; +allow pasta_t tmpfs_t:filesystem { getattr mount }; allow pasta_t fs_t:filesystem unmount; allow pasta_t root_t:dir mounton; manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t) files_pid_filetrans(pasta_t, 
pasta_pid_t, file) +allow pasta_t user_tmp_t:dir { add_name remove_name search write }; +allow pasta_t user_tmp_t:fifo_file append; +allow pasta_t user_tmp_t:file { create open write }; +allow pasta_t user_tmp_t:sock_file { create unlink }; + allow pasta_t console_device_t:chr_file { open write getattr ioctl }; allow pasta_t user_devpts_t:chr_file { getattr read write ioctl }; logging_send_syslog_msg(pasta_t) @@ -152,6 +168,11 @@ allow pasta_t tmp_t:sock_file { create unlink write }; allow pasta_t self:tcp_socket create_stream_socket_perms; corenet_tcp_sendrecv_generic_node(pasta_t) corenet_tcp_bind_generic_node(pasta_t) +allow pasta_t container_runtime_t:dir { open read search }; +allow pasta_t container_runtime_t:fifo_file { getattr write }; +allow pasta_t container_runtime_t:file read; +allow pasta_t container_runtime_t:lnk_file read; +allow pasta_t container_t:lnk_file read; allow pasta_t pasta_port_t:tcp_socket { name_bind name_connect }; allow pasta_t pasta_port_t:udp_socket { name_bind }; allow pasta_t http_port_t:tcp_socket { name_bind name_connect }; @@ -164,6 +185,8 @@ allow pasta_t self:udp_socket create_stream_socket_perms; allow pasta_t reserved_port_t:udp_socket name_bind; allow pasta_t llmnr_port_t:tcp_socket name_bind; allow pasta_t llmnr_port_t:udp_socket name_bind; +allow pasta_t http_cache_port_t:tcp_socket { name_bind name_connect }; +allow pasta_t unreserved_port_t:udp_socket name_bind; corenet_udp_sendrecv_generic_node(pasta_t) corenet_udp_bind_generic_node(pasta_t) allow pasta_t node_t:icmp_socket { name_bind node_bind }; @@ -175,15 +198,12 @@ allow pasta_t init_t:lnk_file read; allow pasta_t init_t:unix_stream_socket connectto; allow pasta_t init_t:dbus send_msg; allow pasta_t init_t:system status; -allow pasta_t unconfined_t:dir search; +allow pasta_t unconfined_t:dir { read search }; allow pasta_t unconfined_t:file read; allow pasta_t unconfined_t:lnk_file read; -allow pasta_t passwd_file_t:file { getattr open read }; allow pasta_t 
self:process { setpgid setcap }; allow pasta_t shell_exec_t:file { execute execute_no_trans map }; -allow pasta_t sssd_var_lib_t:dir search; -allow pasta_t sssd_public_t:dir search; allow pasta_t hostname_exec_t:file { execute execute_no_trans getattr open read map }; allow pasta_t system_dbusd_t:unix_stream_socket connectto; allow pasta_t system_dbusd_t:dbus send_msg; @@ -196,11 +216,9 @@ allow pasta_t ifconfig_var_run_t:dir { read search watch }; allow pasta_t self:tun_socket create; allow pasta_t tun_tap_device_t:chr_file { ioctl open read write }; allow pasta_t sysctl_net_t:dir search; -allow pasta_t sysctl_net_t:file { open write }; +allow pasta_t sysctl_net_t:file { open read write }; allow pasta_t kernel_t:system module_request; -allow pasta_t nsfs_t:file read; - allow pasta_t proc_t:dir mounton; allow pasta_t proc_t:filesystem mount; allow pasta_t net_conf_t:lnk_file read; @@ -211,3 +229,29 @@ allow pasta_t ifconfig_t:process { noatsecure rlimitinh siginh }; allow pasta_t netutils_t:process { noatsecure rlimitinh siginh }; allow pasta_t ping_t:process { noatsecure rlimitinh siginh }; allow pasta_t user_tty_device_t:chr_file { append read write }; +allow pasta_t user_devpts_t:chr_file { append read write }; + +# Allow network administration commands for non-privileged users +roleattribute container_user_r usernetctl_roles; +roleattribute staff_r usernetctl_roles; +roleattribute user_r usernetctl_roles; +role usernetctl_roles types pasta_t; + +# Make pasta in a container run under the pasta_t context +type_transition container_runtime_t pasta_exec_t : process pasta_t; +allow container_runtime_t pasta_t:process transition; + +# Label the user network namespace files +type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns"; +type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns"; +allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write }; +allow pasta_t ifconfig_var_run_t:file { create open 
write }; +allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir; + +# Allow pasta to bind to any port +bool pasta_bind_all_ports true; +if (pasta_bind_all_ports) { + allow pasta_t port_type:icmp_socket { accept getopt name_bind }; + allow pasta_t port_type:tcp_socket { accept getopt name_bind name_connect }; + allow pasta_t port_type:udp_socket { accept getopt name_bind }; +} @@ -36,9 +36,9 @@ /** * struct opt - DHCP option * @sent: Convenience flag, set while filling replies - * @slen: Length of option defined for server + * @slen: Length of option defined for server, -1 if not going to be sent * @s: Option payload from server - * @clen: Length of option received from client + * @clen: Length of option received from client, -1 if not received * @c: Option payload from client */ struct opt { @@ -63,11 +63,21 @@ static struct opt opts[255]; #define OPT_MIN 60 /* RFC 951 */ +/* Total option size (excluding end option) is 576 (RFC 2131), minus + * offset of options (268), minus end option (1). 
+ */ +#define OPT_MAX 307 + /** * dhcp_init() - Initialise DHCP options */ void dhcp_init(void) { + int i; + + for (i = 0; i < ARRAY_SIZE(opts); i++) + opts[i].slen = -1; + opts[1] = (struct opt) { 0, 4, { 0 }, 0, { 0 }, }; /* Mask */ opts[3] = (struct opt) { 0, 4, { 0 }, 0, { 0 }, }; /* Router */ opts[51] = (struct opt) { 0, 4, { 0xff, @@ -107,6 +117,8 @@ struct msg { uint32_t xid; uint16_t secs; uint16_t flags; +#define FLAG_BROADCAST htons_constant(0x8000) + uint32_t ciaddr; struct in_addr yiaddr; uint32_t siaddr; @@ -115,7 +127,7 @@ struct msg { uint8_t sname[64]; uint8_t file[128]; uint32_t magic; - uint8_t o[308]; + uint8_t o[OPT_MAX + 1 /* End option */ ]; } __attribute__((__packed__)); /** @@ -123,15 +135,28 @@ struct msg { * @m: Message to fill * @o: Option number * @offset: Current offset within options field, updated on insertion + * + * Return: false if m has space to write the option, true otherwise */ -static void fill_one(struct msg *m, int o, int *offset) +static bool fill_one(struct msg *m, int o, int *offset) { + size_t slen = opts[o].slen; + + /* If we don't have space to write the option, then just skip */ + if (*offset + 2 /* code and length of option */ + slen > OPT_MAX) + return true; + m->o[*offset] = o; - m->o[*offset + 1] = opts[o].slen; - memcpy(&m->o[*offset + 2], opts[o].s, opts[o].slen); + m->o[*offset + 1] = slen; + + /* Move to option */ + *offset += 2; + + memcpy(&m->o[*offset], opts[o].s, slen); opts[o].sent = 1; - *offset += 2 + opts[o].slen; + *offset += slen; + return false; } /** @@ -144,9 +169,6 @@ static int fill(struct msg *m) { int i, o, offset = 0; - m->op = BOOTREPLY; - m->secs = 0; - for (o = 0; o < 255; o++) opts[o].sent = 0; @@ -154,22 +176,24 @@ static int fill(struct msg *m) * option 53 at the beginning of the list. * Put it there explicitly, unless requested via option 55. 
*/ - if (!memchr(opts[55].c, 53, opts[55].clen)) - fill_one(m, 53, &offset); + if (opts[55].clen > 0 && !memchr(opts[55].c, 53, opts[55].clen)) + if (fill_one(m, 53, &offset)) + debug("DHCP: skipping option 53"); for (i = 0; i < opts[55].clen; i++) { o = opts[55].c[i]; - if (opts[o].slen) - fill_one(m, o, &offset); + if (opts[o].slen != -1) + if (fill_one(m, o, &offset)) + debug("DHCP: skipping option %i", o); } for (o = 0; o < 255; o++) { - if (opts[o].slen && !opts[o].sent) - fill_one(m, o, &offset); + if (opts[o].slen != -1 && !opts[o].sent) + if (fill_one(m, o, &offset)) + debug("DHCP: skipping option %i", o); } m->o[offset++] = 255; - m->o[offset++] = 0; if (offset < OPT_MIN) { memset(&m->o[offset], 0, OPT_MIN - offset); @@ -264,6 +288,9 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len) ".\xc0"); } } + + if (!opts[119].slen) + opts[119].slen = -1; } /** @@ -275,13 +302,15 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len) */ int dhcp(const struct ctx *c, const struct pool *p) { - size_t mlen, len, offset = 0, opt_len, opt_off = 0; + size_t mlen, dlen, offset = 0, opt_len, opt_off = 0; + char macstr[ETH_ADDRSTRLEN]; + struct in_addr mask, dst; const struct ethhdr *eh; const struct iphdr *iph; const struct udphdr *uh; - struct in_addr mask; + struct msg const *m; + struct msg reply; unsigned int i; - struct msg *m; eh = packet_get(p, 0, offset, sizeof(*eh), NULL); offset += sizeof(*eh); @@ -310,8 +339,27 @@ int dhcp(const struct ctx *c, const struct pool *p) m->op != BOOTREQUEST) return -1; + reply.op = BOOTREPLY; + reply.htype = m->htype; + reply.hlen = m->hlen; + reply.hops = 0; + reply.xid = m->xid; + reply.secs = 0; + reply.flags = m->flags; + reply.ciaddr = m->ciaddr; + reply.yiaddr = c->ip4.addr; + reply.siaddr = 0; + reply.giaddr = m->giaddr; + memcpy(&reply.chaddr, m->chaddr, sizeof(reply.chaddr)); + memset(&reply.sname, 0, sizeof(reply.sname)); + memset(&reply.file, 0, sizeof(reply.file)); + reply.magic = 
m->magic; + offset += offsetof(struct msg, o); + for (i = 0; i < ARRAY_SIZE(opts); i++) + opts[i].clen = -1; + while (opt_off + 2 < opt_len) { const uint8_t *olen, *val; uint8_t *type; @@ -330,39 +378,46 @@ int dhcp(const struct ctx *c, const struct pool *p) opt_off += *olen + 2; } - if (opts[53].c[0] == DHCPDISCOVER) { - info("DHCP: offer to discover"); - opts[53].s[0] = DHCPOFFER; - } else if (opts[53].c[0] == DHCPREQUEST || !opts[53].clen) { - info("%s: ack to request", opts[53].clen ? "DHCP" : "BOOTP"); + opts[80].slen = -1; + if (opts[53].clen > 0 && opts[53].c[0] == DHCPDISCOVER) { + if (opts[80].clen == -1) { + info("DHCP: offer to discover"); + opts[53].s[0] = DHCPOFFER; + } else { + info("DHCP: ack to discover (Rapid Commit)"); + opts[53].s[0] = DHCPACK; + opts[80].slen = 0; + } + } else if (opts[53].clen <= 0 || opts[53].c[0] == DHCPREQUEST) { + info("%s: ack to request", /* DHCP needs a valid message type */ + (opts[53].clen <= 0) ? "BOOTP" : "DHCP"); opts[53].s[0] = DHCPACK; } else { return -1; } - info(" from %02x:%02x:%02x:%02x:%02x:%02x", - m->chaddr[0], m->chaddr[1], m->chaddr[2], - m->chaddr[3], m->chaddr[4], m->chaddr[5]); + info(" from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr))); - m->yiaddr = c->ip4.addr; mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len)); - memcpy(opts[1].s, &mask, sizeof(mask)); - memcpy(opts[3].s, &c->ip4.gw, sizeof(c->ip4.gw)); - memcpy(opts[54].s, &c->ip4.gw, sizeof(c->ip4.gw)); + memcpy(opts[1].s, &mask, sizeof(mask)); + memcpy(opts[3].s, &c->ip4.guest_gw, sizeof(c->ip4.guest_gw)); + memcpy(opts[54].s, &c->ip4.our_tap_addr, sizeof(c->ip4.our_tap_addr)); /* If the gateway is not on the assigned subnet, send an option 121 * (Classless Static Routing) adding a dummy route to it. 
*/ if ((c->ip4.addr.s_addr & mask.s_addr) - != (c->ip4.gw.s_addr & mask.s_addr)) { + != (c->ip4.guest_gw.s_addr & mask.s_addr)) { /* a.b.c.d/32:0.0.0.0, 0:a.b.c.d */ opts[121].slen = 14; opts[121].s[0] = 32; - memcpy(opts[121].s + 1, &c->ip4.gw, sizeof(c->ip4.gw)); - memcpy(opts[121].s + 10, &c->ip4.gw, sizeof(c->ip4.gw)); + memcpy(opts[121].s + 1, + &c->ip4.guest_gw, sizeof(c->ip4.guest_gw)); + memcpy(opts[121].s + 10, + &c->ip4.guest_gw, sizeof(c->ip4.guest_gw)); } - if (c->mtu != -1) { + if (c->mtu) { opts[26].slen = 2; opts[26].s[0] = c->mtu / 256; opts[26].s[1] = c->mtu % 256; @@ -373,12 +428,44 @@ int dhcp(const struct ctx *c, const struct pool *p) ((struct in_addr *)opts[6].s)[i] = c->ip4.dns[i]; opts[6].slen += sizeof(uint32_t); } + if (!opts[6].slen) + opts[6].slen = -1; + + opt_len = strlen(c->hostname); + if (opt_len > 0) { + opts[12].slen = opt_len; + memcpy(opts[12].s, &c->hostname, opt_len); + } + + opt_len = strlen(c->fqdn); + if (opt_len > 0) { + opt_len += 3 /* flags */ + + 2; /* Length byte for first label, and terminator */ + + if (sizeof(opts[81].s) >= opt_len) { + opts[81].s[0] = 0x4; /* flags (E) */ + opts[81].s[1] = 0xff; /* RCODE1 */ + opts[81].s[2] = 0xff; /* RCODE2 */ + + encode_domain_name((char *)opts[81].s + 3, c->fqdn); + + opts[81].slen = opt_len; + } else { + debug("DHCP: client FQDN option doesn't fit, skipping"); + } + } if (!c->no_dhcp_dns_search) opt_set_dns_search(c, sizeof(m->o)); - len = offsetof(struct msg, o) + fill(m); - tap_udp4_send(c, c->ip4.gw, 67, c->ip4.addr, 68, m, len); + dlen = offsetof(struct msg, o) + fill(&reply); + + if (m->flags & FLAG_BROADCAST) + dst = in4addr_broadcast; + else + dst = c->ip4.addr; + + tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, &reply, dlen); return 1; } @@ -48,6 +48,7 @@ struct opt_hdr { # define STATUS_NOTONLINK htons_constant(4) # define OPT_DNS_SERVERS htons_constant(23) # define OPT_DNS_SEARCH htons_constant(24) +# define OPT_CLIENT_FQDN htons_constant(39) #define STR_NOTONLINK 
"Prefix not appropriate for link." uint16_t l; @@ -58,6 +59,9 @@ struct opt_hdr { sizeof(struct opt_hdr)) #define OPT_VSIZE(x) (sizeof(struct opt_##x) - \ sizeof(struct opt_hdr)) +#define OPT_MAX_SIZE IPV6_MIN_MTU - (sizeof(struct ipv6hdr) + \ + sizeof(struct udphdr) + \ + sizeof(struct msg_hdr)) /** * struct opt_client_id - DHCPv6 Client Identifier option @@ -140,7 +144,9 @@ struct opt_ia_addr { struct opt_status_code { struct opt_hdr hdr; uint16_t code; - char status_msg[sizeof(STR_NOTONLINK) - 1]; + /* "nonstring" is only supported since clang 23 */ + /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ + __attribute__((nonstring)) char status_msg[sizeof(STR_NOTONLINK) - 1]; } __attribute__((packed)); /** @@ -164,6 +170,18 @@ struct opt_dns_search { } __attribute__((packed)); /** + * struct opt_client_fqdn - Client FQDN option (RFC 4704) + * @hdr: Option header + * @flags: Flags described by RFC 4704 + * @domain_name: Client FQDN + */ +struct opt_client_fqdn { + struct opt_hdr hdr; + uint8_t flags; + char domain_name[PASST_MAXDNAME]; +} __attribute__((packed)); + +/** * struct msg_hdr - DHCPv6 client/server message header * @type: DHCP message type * @xid: Transaction ID for message exchange @@ -193,6 +211,7 @@ struct msg_hdr { * @client_id: Client Identifier, variable length * @dns_servers: DNS Recursive Name Server, here just for storage size * @dns_search: Domain Search List, here just for storage size + * @client_fqdn: Client FQDN, variable length */ static struct resp_t { struct msg_hdr hdr; @@ -203,6 +222,7 @@ static struct resp_t { struct opt_client_id client_id; struct opt_dns_servers dns_servers; struct opt_dns_search dns_search; + struct opt_client_fqdn client_fqdn; } __attribute__((__packed__)) resp = { { 0 }, SERVER_ID, @@ -228,6 +248,10 @@ static struct resp_t { { { OPT_DNS_SEARCH, 0, }, { 0 }, }, + + { { OPT_CLIENT_FQDN, 0, }, + 0, { 0 }, + }, }; static const struct opt_status_code sc_not_on_link = { @@ -296,45 +320,42 @@ static struct opt_hdr 
*dhcpv6_opt(const struct pool *p, size_t *offset, static struct opt_hdr *dhcpv6_ia_notonlink(const struct pool *p, struct in6_addr *la) { + int ia_types[2] = { OPT_IA_NA, OPT_IA_TA }, *ia_type; + const struct opt_ia_addr *opt_addr; char buf[INET6_ADDRSTRLEN]; struct in6_addr req_addr; - struct opt_hdr *ia, *h; + const struct opt_hdr *h; + struct opt_hdr *ia; size_t offset; - int ia_type; - ia_type = OPT_IA_NA; -ia_ta: - offset = 0; - while ((ia = dhcpv6_opt(p, &offset, ia_type))) { - if (ntohs(ia->l) < OPT_VSIZE(ia_na)) - return NULL; + foreach(ia_type, ia_types) { + offset = 0; + while ((ia = dhcpv6_opt(p, &offset, *ia_type))) { + if (ntohs(ia->l) < OPT_VSIZE(ia_na)) + return NULL; - offset += sizeof(struct opt_ia_na); + offset += sizeof(struct opt_ia_na); - while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) { - struct opt_ia_addr *opt_addr = (struct opt_ia_addr *)h; + while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) { + if (ntohs(h->l) != OPT_VSIZE(ia_addr)) + return NULL; - if (ntohs(h->l) != OPT_VSIZE(ia_addr)) - return NULL; + opt_addr = (const struct opt_ia_addr *)h; + req_addr = opt_addr->addr; + if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) + goto err; - memcpy(&req_addr, &opt_addr->addr, sizeof(req_addr)); - if (!IN6_ARE_ADDR_EQUAL(la, &req_addr)) { - info("DHCPv6: requested address %s not on link", - inet_ntop(AF_INET6, &req_addr, - buf, sizeof(buf))); - return ia; + offset += sizeof(struct opt_ia_addr); } - - offset += sizeof(struct opt_ia_addr); } } - if (ia_type == OPT_IA_NA) { - ia_type = OPT_IA_TA; - goto ia_ta; - } - return NULL; + +err: + info("DHCPv6: requested address %s not on link", + inet_ntop(AF_INET6, &req_addr, buf, sizeof(buf))); + return ia; } /** @@ -349,7 +370,6 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int offset) { struct opt_dns_servers *srv = NULL; struct opt_dns_search *srch = NULL; - char *p = NULL; int i; if (c->no_dhcp_dns) @@ -363,7 +383,7 @@ static size_t dhcpv6_dns_fill(const struct ctx *c, char *buf, int 
offset) srv->hdr.l = 0; } - memcpy(&srv->addr[i], &c->ip6.dns[i], sizeof(srv->addr[i])); + srv->addr[i] = c->ip6.dns[i]; srv->hdr.l += sizeof(srv->addr[i]); offset += sizeof(srv->addr[i]); } @@ -386,35 +406,82 @@ search: if (!name_len) continue; + name_len += 2; /* Length byte for first label, and terminator */ + if (name_len > + NS_MAXDNAME + 1 /* Length byte for first label */ || + name_len > 255) { + debug("DHCP: DNS search name '%s' too long, skipping", + c->dns_search[i].n); + continue; + } + if (!srch) { srch = (struct opt_dns_search *)(buf + offset); offset += sizeof(struct opt_hdr); srch->hdr.t = OPT_DNS_SEARCH; srch->hdr.l = 0; - p = srch->list; } - *p = '.'; - p = stpncpy(p + 1, c->dns_search[i].n, name_len); - p++; - srch->hdr.l += name_len + 2; - offset += name_len + 2; + encode_domain_name(buf + offset, c->dns_search[i].n); + + srch->hdr.l += name_len; + offset += name_len; + } - if (srch) { - for (i = 0; i < srch->hdr.l; i++) { - if (srch->list[i] == '.') { - srch->list[i] = strcspn(srch->list + i + 1, - "."); - } - } + if (srch) srch->hdr.l = htons(srch->hdr.l); - } return offset; } /** + * dhcpv6_client_fqdn_fill() - Fill in client FQDN option + * @c: Execution context + * @buf: Response message buffer where options will be appended + * @offset: Offset in message buffer for new options + * + * Return: updated length of response message buffer. 
+ */ +static size_t dhcpv6_client_fqdn_fill(const struct pool *p, const struct ctx *c, + char *buf, int offset) + +{ + struct opt_client_fqdn const *req_opt; + struct opt_client_fqdn *o; + size_t opt_len; + + opt_len = strlen(c->fqdn); + if (opt_len == 0) { + return offset; + } + + opt_len += 2; /* Length byte for first label, and terminator */ + if (opt_len > OPT_MAX_SIZE - (offset + + sizeof(struct opt_hdr) + + 1 /* flags */ )) { + debug("DHCPv6: client FQDN option doesn't fit, skipping"); + return offset; + } + + o = (struct opt_client_fqdn *)(buf + offset); + encode_domain_name(o->domain_name, c->fqdn); + req_opt = (struct opt_client_fqdn *)dhcpv6_opt(p, &(size_t){ 0 }, + OPT_CLIENT_FQDN); + if (req_opt && req_opt->flags & 0x01 /* S flag */) + o->flags = 0x02 /* O flag */; + else + o->flags = 0x00; + + opt_len++; + + o->hdr.t = OPT_CLIENT_FQDN; + o->hdr.l = htons(opt_len); + + return offset + sizeof(struct opt_hdr) + opt_len; +} + +/** * dhcpv6() - Check if this is a DHCPv6 message, reply as needed * @c: Execution context * @p: Packet pool, single packet starting from UDP header @@ -426,11 +493,11 @@ search: int dhcpv6(struct ctx *c, const struct pool *p, const struct in6_addr *saddr, const struct in6_addr *daddr) { - struct opt_hdr *ia, *bad_ia, *client_id; - const struct opt_hdr *server_id; + const struct opt_hdr *client_id, *server_id, *ia; const struct in6_addr *src; const struct msg_hdr *mh; const struct udphdr *uh; + struct opt_hdr *bad_ia; size_t mlen, n; uh = packet_get(p, 0, 0, sizeof(*uh), &mlen); @@ -451,10 +518,7 @@ int dhcpv6(struct ctx *c, const struct pool *p, c->ip6.addr_ll_seen = *saddr; - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - src = &c->ip6.gw; - else - src = &c->ip6.addr_ll; + src = &c->ip6.our_tap_ll; mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL); if (!mh) @@ -550,6 +614,7 @@ int dhcpv6(struct ctx *c, const struct pool *p, n = offsetof(struct resp_t, client_id) + sizeof(struct opt_hdr) + ntohs(client_id->l); n = dhcpv6_dns_fill(c, 
(char *)&resp, n); + n = dhcpv6_client_fqdn_fill(p, c, (char *)&resp, n); resp.hdr.xid = mh->xid; @@ -574,8 +639,10 @@ void dhcpv6_init(const struct ctx *c) resp.server_id.duid_time = duid_time; resp_not_on_link.server_id.duid_time = duid_time; - memcpy(resp.server_id.duid_lladdr, c->mac, sizeof(c->mac)); - memcpy(resp_not_on_link.server_id.duid_lladdr, c->mac, sizeof(c->mac)); + memcpy(resp.server_id.duid_lladdr, + c->our_tap_mac, sizeof(c->our_tap_mac)); + memcpy(resp_not_on_link.server_id.duid_lladdr, + c->our_tap_mac, sizeof(c->our_tap_mac)); resp.ia_addr.addr = c->ip6.addr; } diff --git a/doc/migration/.gitignore b/doc/migration/.gitignore new file mode 100644 index 0000000..59cb765 --- /dev/null +++ b/doc/migration/.gitignore @@ -0,0 +1,2 @@ +/source +/target diff --git a/doc/migration/Makefile b/doc/migration/Makefile new file mode 100644 index 0000000..04f6891 --- /dev/null +++ b/doc/migration/Makefile @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +TARGETS = source target +CFLAGS = -Wall -Wextra -pedantic + +all: $(TARGETS) + +$(TARGETS): %: %.c + +clean: + rm -f $(TARGETS) diff --git a/doc/migration/README b/doc/migration/README new file mode 100644 index 0000000..375603b --- /dev/null +++ b/doc/migration/README @@ -0,0 +1,51 @@ +<!--- +SPDX-License-Identifier: GPL-2.0-or-later +Copyright (c) 2025 Red Hat GmbH +Author: Stefano Brivio <sbrivio@redhat.com> +--> + +Migration +========= + +These test programs show a migration of a TCP connection from one process to +another using the TCP_REPAIR socket option. 
+ +The two processes are a mock of the matching implementation in passt(1), and run +unprivileged, so they rely on the passt-repair helper to connect to them and set +or clear TCP_REPAIR on the connection socket, transferred to the helper using +SCM_RIGHTS. + +The passt-repair helper needs to have the CAP_NET_ADMIN capability, or run as +root. + +Example of usage +---------------- + +* Start the test server + + $ nc -l 9999 + +* Start the source side of the TCP client (mock of the source instance of passt) + + $ ./source 127.0.0.1 9999 9998 /tmp/repair.sock + +* The client sends a test string, and waits for a connection from passt-repair + + # passt-repair /tmp/repair.sock + +* The socket is now in repair mode, and `source` dumps sequences, then exits + + sending sequence: 3244673313 + receiving sequence: 2250449386 + +* Continue the connection on the target side, restarting from those sequences + + $ ./target 127.0.0.1 9999 9998 /tmp/repair.sock 3244673313 2250449386 + +* The target side now waits for a connection from passt-repair + + # passt-repair /tmp/repair.sock + +* The target side asks passt-repair to switch the socket to repair mode, sets up + the TCP sequences, then asks passt-repair to clear repair mode, and sends a + test string to the server diff --git a/doc/migration/source.c b/doc/migration/source.c new file mode 100644 index 0000000..d44ebf1 --- /dev/null +++ b/doc/migration/source.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * doc/migration/source.c - Mock of TCP migration source, use with passt-repair + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> 
+#include <limits.h> +#include <unistd.h> +#include <netdb.h> +#include <netinet/tcp.h> + +int main(int argc, char **argv) +{ + struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } }; + struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0, + NULL, NULL, NULL }; + struct sockaddr_un a_helper = { AF_UNIX, { 0 } }; + int seq, s, s_helper; + int8_t cmd; + struct iovec iov = { &cmd, sizeof(cmd) }; + char buf[CMSG_SPACE(sizeof(int))]; + struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 }; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + socklen_t seqlen = sizeof(int); + struct addrinfo *r; + + (void)argc; + + if (argc != 5) { + fprintf(stderr, "%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH\n", + argv[0]); + return -1; + } + + strcpy(a_helper.sun_path, argv[4]); + getaddrinfo(argv[1], argv[2], &hints, &r); + + /* Connect socket to server and send some data */ + s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP); + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int)); + bind(s, (struct sockaddr *)&a, sizeof(a)); + connect(s, r->ai_addr, r->ai_addrlen); + send(s, "before migration\n", sizeof("before migration\n"), 0); + + /* Wait for helper */ + s_helper = socket(AF_UNIX, SOCK_STREAM, 0); + unlink(a_helper.sun_path); + bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper)); + listen(s_helper, 1); + s_helper = accept(s_helper, NULL, NULL); + + /* Set up message for helper, with socket */ + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &s, sizeof(s)); + + /* Send command to helper: turn repair mode on, wait for reply */ + cmd = TCP_REPAIR_ON; + sendmsg(s_helper, &msg, 0); + recv(s_helper, &((int8_t){ 0 }), 1, 0); + + /* Terminate helper */ + close(s_helper); + + /* Get sending sequence */ + seq = TCP_SEND_QUEUE; + setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)); + getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen); + 
fprintf(stdout, "%u ", seq); + + /* Get receiving sequence */ + seq = TCP_RECV_QUEUE; + setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)); + getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, &seqlen); + fprintf(stdout, "%u\n", seq); +} diff --git a/doc/migration/target.c b/doc/migration/target.c new file mode 100644 index 0000000..f7d3108 --- /dev/null +++ b/doc/migration/target.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * doc/migration/target.c - Mock of TCP migration target, use with passt-repair + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <unistd.h> +#include <netdb.h> +#include <netinet/tcp.h> + +int main(int argc, char **argv) +{ + struct sockaddr_in a = { AF_INET, htons(atoi(argv[3])), { 0 }, { 0 } }; + struct addrinfo hints = { 0, AF_UNSPEC, SOCK_STREAM, 0, 0, + NULL, NULL, NULL }; + struct sockaddr_un a_helper = { AF_UNIX, { 0 } }; + int s, s_helper, seq; + int8_t cmd; + struct iovec iov = { &cmd, sizeof(cmd) }; + char buf[CMSG_SPACE(sizeof(int))]; + struct msghdr msg = { NULL, 0, &iov, 1, buf, sizeof(buf), 0 }; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + struct addrinfo *r; + + (void)argc; + + strcpy(a_helper.sun_path, argv[4]); + getaddrinfo(argv[1], argv[2], &hints, &r); + + if (argc != 7) { + fprintf(stderr, + "%s DST_ADDR DST_PORT SRC_PORT HELPER_PATH SSEQ RSEQ\n", + argv[0]); + return -1; + } + + /* Prepare socket, bind to source port */ + s = socket(r->ai_family, SOCK_STREAM, IPPROTO_TCP); + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &((int){ 1 }), sizeof(int)); + bind(s, (struct sockaddr *)&a, sizeof(a)); + + /* 
Wait for helper */ + s_helper = socket(AF_UNIX, SOCK_STREAM, 0); + unlink(a_helper.sun_path); + bind(s_helper, (struct sockaddr *)&a_helper, sizeof(a_helper)); + listen(s_helper, 1); + s_helper = accept(s_helper, NULL, NULL); + + /* Set up message for helper, with socket */ + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &s, sizeof(s)); + + /* Send command to helper: turn repair mode on, wait for reply */ + cmd = TCP_REPAIR_ON; + sendmsg(s_helper, &msg, 0); + recv(s_helper, &((int){ 0 }), 1, 0); + + /* Set sending sequence */ + seq = TCP_SEND_QUEUE; + setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)); + seq = atoi(argv[5]); + setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq)); + + /* Set receiving sequence */ + seq = TCP_RECV_QUEUE; + setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &seq, sizeof(seq)); + seq = atoi(argv[6]); + setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq)); + + /* Connect setting kernel state only, without actual SYN / handshake */ + connect(s, r->ai_addr, r->ai_addrlen); + + /* Send command to helper: turn repair mode off, wait for reply */ + cmd = TCP_REPAIR_OFF; + sendmsg(s_helper, &msg, 0); + + recv(s_helper, &((int8_t){ 0 }), 1, 0); + + /* Terminate helper */ + close(s_helper); + + /* Send some more data */ + send(s, "after migration\n", sizeof("after migration\n"), 0); +} diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore new file mode 100644 index 0000000..f6272cf --- /dev/null +++ b/doc/platform-requirements/.gitignore @@ -0,0 +1,4 @@ +/listen-vs-repair +/reuseaddr-priority +/recv-zero +/udp-close-dup diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile new file mode 100644 index 0000000..83930ef --- /dev/null +++ b/doc/platform-requirements/Makefile @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Copyright Red Hat +# Author: David Gibson 
<david@gibson.dropbear.id.au> + +TARGETS = reuseaddr-priority recv-zero udp-close-dup listen-vs-repair +SRCS = reuseaddr-priority.c recv-zero.c udp-close-dup.c listen-vs-repair.c +CFLAGS = -Wall + +all: cppcheck clang-tidy $(TARGETS:%=check-%) + +$(TARGETS): %: %.c common.c common.h + +check-%: % + ./$< + +cppcheck: + cppcheck --std=c11 --error-exitcode=1 --enable=all --force \ + --check-level=exhaustive --inline-suppr \ + --inconclusive --library=posix --quiet \ + --suppress=missingIncludeSystem \ + $(SRCS) + +clang-tidy: + clang-tidy --checks=*,\ + -altera-id-dependent-backward-branch,\ + -altera-unroll-loops,\ + -bugprone-easily-swappable-parameters,\ + -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\ + -concurrency-mt-unsafe,\ + -cppcoreguidelines-avoid-non-const-global-variables,\ + -cppcoreguidelines-init-variables,\ + -cppcoreguidelines-macro-to-enum,\ + -google-readability-braces-around-statements,\ + -hicpp-braces-around-statements,\ + -llvmlibc-restrict-system-libc-headers,\ + -misc-include-cleaner,\ + -modernize-macro-to-enum,\ + -readability-braces-around-statements,\ + -readability-identifier-length,\ + -readability-isolate-declaration \ + $(SRCS) + +clean: + rm -f $(TARGETS) *.o *~ diff --git a/doc/platform-requirements/README b/doc/platform-requirements/README new file mode 100644 index 0000000..3914d22 --- /dev/null +++ b/doc/platform-requirements/README @@ -0,0 +1,18 @@ +Platform Requirements +===================== + +TODO: document the various Linux-specific features we currently require + + +Test Programs +------------- + +In some places we rely on quite specific behaviour of sockets. +Although Linux, at least, seems to behave as required, it's not always +clear from the available documentation if this is required by POSIX or +some other specification. + +To specifically document those expectations this directory has some +test programs which explicitly check for the behaviour we need. 
+When/if we attempt a port to a new platform, running these to check +behaviour would be a good place to start. diff --git a/doc/platform-requirements/common.c b/doc/platform-requirements/common.c new file mode 100644 index 0000000..d687377 --- /dev/null +++ b/doc/platform-requirements/common.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* common.c + * + * Common helper functions for testing SO_REUSEADDR behaviour + * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +#include <errno.h> +#include <netinet/in.h> +#include <string.h> +#include <sys/socket.h> + +#include "common.h" + +int sock_reuseaddr(void) +{ + int y = 1; + int s; + + + s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (s < 0) + die("socket(): %s\n", strerror(errno)); + + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) , 0) + die("SO_REUSEADDR: %s\n", strerror(errno)); + + return s; +} + +/* Send a token via the given connected socket */ +void send_token(int s, long token) +{ + ssize_t rc; + + rc = send(s, &token, sizeof(token), 0); + if (rc < 0) + die("send(): %s\n", strerror(errno)); + if (rc < sizeof(token)) + die("short send()\n"); +} + +/* Attempt to receive a token via the given socket. 
+ * + * Returns true if we received the token, false if we got an EAGAIN, dies in any + * other case */ +bool recv_token(int s, long token) +{ + ssize_t rc; + long buf; + + rc = recv(s, &buf, sizeof(buf), MSG_DONTWAIT); + if (rc < 0) { + if (errno == EWOULDBLOCK) + return false; + die("recv(): %s\n", strerror(errno)); + } + if (rc < sizeof(buf)) + die("short recv()\n"); + if (buf != token) + die("data mismatch\n"); + return true; +} diff --git a/doc/platform-requirements/common.h b/doc/platform-requirements/common.h new file mode 100644 index 0000000..e85fc2b --- /dev/null +++ b/doc/platform-requirements/common.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* common.h + * + * Useful shared functions + * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ +#ifndef REUSEADDR_COMMON_H +#define REUSEADDR_COMMON_H + +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +__attribute__((format(printf, 1, 2), noreturn)) +static inline void die(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + (void)vfprintf(stderr, fmt, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + +#if __BYTE_ORDER == __BIG_ENDIAN +#define htons_constant(x) (x) +#define htonl_constant(x) (x) +#else +#define htons_constant(x) (__bswap_constant_16(x)) +#define htonl_constant(x) (__bswap_constant_32(x)) +#endif + +#define SOCKADDR_INIT(addr, port) \ + { \ + .sin_family = AF_INET, \ + .sin_addr = { .s_addr = htonl_constant(addr) }, \ + .sin_port = htons_constant(port), \ + } + +int sock_reuseaddr(void); +void send_token(int s, long token); +bool recv_token(int s, long token); + +#endif /* REUSEADDR_COMMON_H */ diff --git a/doc/platform-requirements/listen-vs-repair.c b/doc/platform-requirements/listen-vs-repair.c new file mode 100644 index 0000000..d31fe3f --- /dev/null +++ b/doc/platform-requirements/listen-vs-repair.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* listen-vs-repair.c + * + * Do listening sockets have address conflicts with sockets under repair + * ==================================================================== + * + * When we accept() an incoming connection the accept()ed socket will have the + * same local address as the listening socket. This can be a complication on + * migration. On the migration target we've already set up listening sockets + * according to the command line. However to restore connections that we're + * migrating in we need to bind the new sockets to the same address, which would + * be an address conflict on the face of it. This test program verifies that + * enabling repair mode before bind() correctly suppresses that conflict. 
+ * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */ +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <errno.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "common.h" + +#define PORT 13256U +#define CPORT 13257U + +/* 127.0.0.1:PORT */ +static const struct sockaddr_in addr = SOCKADDR_INIT(INADDR_LOOPBACK, PORT); + +/* 127.0.0.1:CPORT */ +static const struct sockaddr_in caddr = SOCKADDR_INIT(INADDR_LOOPBACK, CPORT); + +/* Put ourselves into a network sandbox */ +static void net_sandbox(void) +{ + /* NOLINTNEXTLINE(altera-struct-pack-align) */ + const struct req_t { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + } __attribute__((packed)) req = { + .nlh.nlmsg_type = RTM_NEWLINK, + .nlh.nlmsg_flags = NLM_F_REQUEST, + .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_seq = 1, + .ifm.ifi_family = AF_UNSPEC, + .ifm.ifi_index = 1, + .ifm.ifi_flags = IFF_UP, + .ifm.ifi_change = IFF_UP, + }; + int nl; + + if (unshare(CLONE_NEWUSER | CLONE_NEWNET)) + die("unshare(): %s\n", strerror(errno)); + + /* Bring up lo in the new netns */ + nl = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (nl < 0) + die("Can't create netlink socket: %s\n", strerror(errno)); + + if (send(nl, &req, sizeof(req), 0) < 0) + die("Netlink send(): %s\n", strerror(errno)); + close(nl); +} + +static void check(void) +{ + int s1, s2, op; + + s1 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s1 < 0) + die("socket() 1: %s\n", strerror(errno)); + + if (bind(s1, (struct sockaddr *)&addr, sizeof(addr))) + die("bind() 1: %s\n", strerror(errno)); + + if (listen(s1, 0)) + die("listen(): %s\n", strerror(errno)); + + s2 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); 
+ if (s2 < 0) + die("socket() 2: %s\n", strerror(errno)); + + op = TCP_REPAIR_ON; + if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op))) + die("TCP_REPAIR: %s\n", strerror(errno)); + + if (bind(s2, (struct sockaddr *)&addr, sizeof(addr))) + die("bind() 2: %s\n", strerror(errno)); + + if (connect(s2, (struct sockaddr *)&caddr, sizeof(caddr))) + die("connect(): %s\n", strerror(errno)); + + op = TCP_REPAIR_OFF_NO_WP; + if (setsockopt(s2, SOL_TCP, TCP_REPAIR, &op, sizeof(op))) + die("TCP_REPAIR: %s\n", strerror(errno)); + + close(s1); + close(s2); +} + +int main(int argc, char *argv[]) +{ + (void)argc; + (void)argv; + + net_sandbox(); + + check(); + + printf("Repair mode appears to properly suppress conflicts with listening sockets\n"); + + exit(0); +} diff --git a/doc/platform-requirements/recv-zero.c b/doc/platform-requirements/recv-zero.c new file mode 100644 index 0000000..2a2a561 --- /dev/null +++ b/doc/platform-requirements/recv-zero.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* recv-zero.c + * + * Verify that we're able to discard datagrams by recv()ing into a zero-length + * buffer. 
+ * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +#include <arpa/inet.h> +#include <errno.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "common.h" + +#define DSTPORT 13257U + +enum discard_method { + DISCARD_NULL_BUF, + DISCARD_ZERO_IOV, + DISCARD_NULL_IOV, + NUM_METHODS, +}; + +/* 127.0.0.1:DSTPORT */ +static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT); + +static void test_discard(enum discard_method method) +{ + struct iovec zero_iov = { .iov_base = NULL, .iov_len = 0, }; + struct msghdr mh_zero = { + .msg_iov = &zero_iov, + .msg_iovlen = 1, + }; + struct msghdr mh_null = { + .msg_iov = NULL, + .msg_iovlen = 0, + }; + long token1, token2; + int recv_s, send_s; + ssize_t rc; + + token1 = random(); + token2 = random(); + + recv_s = sock_reuseaddr(); + if (bind(recv_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0) + die("bind(): %s\n", strerror(errno)); + + send_s = sock_reuseaddr(); + if (connect(send_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0) + die("connect(): %s\n", strerror(errno)); + + send_token(send_s, token1); + send_token(send_s, token2); + + switch (method) { + case DISCARD_NULL_BUF: + /* cppcheck-suppress nullPointer */ + rc = recv(recv_s, NULL, 0, MSG_DONTWAIT); + if (rc < 0) + die("discarding recv(): %s\n", strerror(errno)); + break; + + case DISCARD_ZERO_IOV: + rc = recvmsg(recv_s, &mh_zero, MSG_DONTWAIT); + if (rc < 0) + die("recvmsg() with zero-length buffer: %s\n", + strerror(errno)); + if (!((unsigned)mh_zero.msg_flags & MSG_TRUNC)) + die("Missing MSG_TRUNC flag\n"); + break; + + case DISCARD_NULL_IOV: + rc = recvmsg(recv_s, &mh_null, MSG_DONTWAIT); + if (rc < 0) + die("recvmsg() with zero-length iov: %s\n", + strerror(errno)); + if (!((unsigned)mh_null.msg_flags & MSG_TRUNC)) + die("Missing MSG_TRUNC flag\n"); + break; + + default: + die("Bad 
method\n"); + } + + recv_token(recv_s, token2); + + /* cppcheck-suppress nullPointer */ + rc = recv(recv_s, NULL, 0, MSG_DONTWAIT); + if (rc < 0 && errno != EAGAIN) + die("redundant discarding recv(): %s\n", strerror(errno)); + if (rc >= 0) + die("Unexpected receive: rc=%zd\n", rc); +} + +int main(int argc, char *argv[]) +{ + enum discard_method method; + + (void)argc; + (void)argv; + + for (method = 0; method < NUM_METHODS; method++) + test_discard(method); + + printf("Discarding datagrams with 0-length receives seems to work\n"); + + exit(0); +} diff --git a/doc/platform-requirements/reuseaddr-priority.c b/doc/platform-requirements/reuseaddr-priority.c new file mode 100644 index 0000000..af39a39 --- /dev/null +++ b/doc/platform-requirements/reuseaddr-priority.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* reuseaddr-priority.c + * + * Verify which SO_REUSEADDR UDP sockets get priority to receive + * ============================================================= + * + * SO_REUSEADDR allows multiple sockets to bind to overlapping addresses, so + * there can be multiple sockets eligible to receive the same packet. The exact + * semantics of which socket will receive in this circumstance isn't very well + * documented. + * + * This program verifies that things behave the way we expect. Specifically we + * expect: + * + * - If both a connected and an unconnected socket could receive a datagram, the + * connected one will receive it in preference to the unconnected one. + * + * - If an unconnected socket bound to a specific address and an unconnected + * socket bound to the "any" address (0.0.0.0 or ::) could receive a datagram, + * then the one with a specific address will receive it in preference to the + * other. + * + * These should be true regardless of the order the sockets are created in, or + * the order they're polled in. 
+ * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +#include <arpa/inet.h> +#include <errno.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "common.h" + +#define SRCPORT 13246U +#define DSTPORT 13247U + +/* Different cases for receiving socket configuration */ +enum sock_type { + /* Socket is bound to 0.0.0.0:DSTPORT and not connected */ + SOCK_BOUND_ANY, + + /* Socket is bound to 127.0.0.1:DSTPORT and not connected */ + SOCK_BOUND_LO, + + /* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */ + SOCK_CONNECTED, + + NUM_SOCK_TYPES, +}; + +typedef enum sock_type order_t[NUM_SOCK_TYPES]; + +static order_t orders[] = { + {0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0}, +}; + +/* 127.0.0.2 */ +#define INADDR_LOOPBACK2 ((in_addr_t)(0x7f000002)) + +/* 0.0.0.0:DSTPORT */ +static const struct sockaddr_in any_dst = SOCKADDR_INIT(INADDR_ANY, DSTPORT); +/* 127.0.0.1:DSTPORT */ +static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT); + +/* 127.0.0.2:DSTPORT */ +static const struct sockaddr_in lo2_dst = SOCKADDR_INIT(INADDR_LOOPBACK2, DSTPORT); + +/* 127.0.0.1:SRCPORT */ +static const struct sockaddr_in lo_src = SOCKADDR_INIT(INADDR_LOOPBACK, SRCPORT); + +/* Random token to send in datagram */ +static long token; + +/* Get a socket of the specified type for receiving */ +static int sock_recv(enum sock_type type) +{ + const struct sockaddr *connect_sa = NULL; + const struct sockaddr *bind_sa = NULL; + int s; + + s = sock_reuseaddr(); + + switch (type) { + case SOCK_CONNECTED: + connect_sa = (struct sockaddr *)&lo_src; + /* fallthrough */ + case SOCK_BOUND_ANY: + bind_sa = (struct sockaddr *)&any_dst; + break; + + case SOCK_BOUND_LO: + bind_sa = (struct sockaddr *)&lo_dst; + break; + + default: + die("bug"); + } + + if (bind_sa) + if (bind(s, bind_sa, sizeof(struct 
sockaddr_in)) < 0) + die("bind(): %s\n", strerror(errno)); + if (connect_sa) + if (connect(s, connect_sa, sizeof(struct sockaddr_in)) < 0) + die("connect(): %s\n", strerror(errno)); + + return s; +} + +/* Get a socket suitable for sending to the given type of receiving socket */ +static int sock_send(enum sock_type type) +{ + const struct sockaddr *connect_sa = NULL; + const struct sockaddr *bind_sa = NULL; + int s; + + s = sock_reuseaddr(); + + switch (type) { + case SOCK_BOUND_ANY: + connect_sa = (struct sockaddr *)&lo2_dst; + break; + + case SOCK_CONNECTED: + bind_sa = (struct sockaddr *)&lo_src; + /* fallthrough */ + case SOCK_BOUND_LO: + connect_sa = (struct sockaddr *)&lo_dst; + break; + + default: + die("bug"); + } + + if (bind_sa) + if (bind(s, bind_sa, sizeof(struct sockaddr_in)) < 0) + die("bind(): %s\n", strerror(errno)); + if (connect_sa) + if (connect(s, connect_sa, sizeof(struct sockaddr_in)) < 0) + die("connect(): %s\n", strerror(errno)); + + return s; +} + +/* Check for expected behaviour with one specific ordering for various operations: + * + * @recv_create_order: Order to create receiving sockets in + * @send_create_order: Order to create sending sockets in + * @test_order: Order to test the behaviour of different types + * @recv_order: Order to check the receiving sockets + */ +static void check_one_order(const order_t recv_create_order, + const order_t send_create_order, + const order_t test_order, + const order_t recv_order) +{ + int rs[NUM_SOCK_TYPES]; + int ss[NUM_SOCK_TYPES]; + int nfds = 0; + int i, j; + + for (i = 0; i < NUM_SOCK_TYPES; i++) { + enum sock_type t = recv_create_order[i]; + int s; + + s = sock_recv(t); + if (s >= nfds) + nfds = s + 1; + + rs[t] = s; + } + + for (i = 0; i < NUM_SOCK_TYPES; i++) { + enum sock_type t = send_create_order[i]; + + ss[t] = sock_send(t); + } + + for (i = 0; i < NUM_SOCK_TYPES; i++) { + enum sock_type ti = test_order[i]; + int recv_via = -1; + + send_token(ss[ti], token); + + for (j = 0; j < 
NUM_SOCK_TYPES; j++) { + enum sock_type tj = recv_order[j]; + + if (recv_token(rs[tj], token)) { + if (recv_via != -1) + die("Received token more than once\n"); + recv_via = tj; + } + } + + if (recv_via == -1) + die("Didn't receive token at all\n"); + if (recv_via != ti) + die("Received token via unexpected socket\n"); + } + + for (i = 0; i < NUM_SOCK_TYPES; i++) { + close(rs[i]); + close(ss[i]); + } +} + +static void check_all_orders(void) +{ + int norders = sizeof(orders) / sizeof(orders[0]); + int i, j, k, l; + + for (i = 0; i < norders; i++) + for (j = 0; j < norders; j++) + for (k = 0; k < norders; k++) + for (l = 0; l < norders; l++) + check_one_order(orders[i], orders[j], + orders[k], orders[l]); +} + +int main(int argc, char *argv[]) +{ + (void)argc; + (void)argv; + + token = random(); + + check_all_orders(); + + printf("SO_REUSEADDR receive priorities seem to work as expected\n"); + + exit(0); +} diff --git a/doc/platform-requirements/udp-close-dup.c b/doc/platform-requirements/udp-close-dup.c new file mode 100644 index 0000000..99060fc --- /dev/null +++ b/doc/platform-requirements/udp-close-dup.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* udp-close-dup.c + * + * Verify that closing one dup() of a UDP socket won't stop other dups from + * receiving packets. 
+ * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +#include <arpa/inet.h> +#include <errno.h> +#include <fcntl.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "common.h" + +#define DSTPORT 13257U + +/* 127.0.0.1:DSTPORT */ +static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT); + +enum dup_method { + DUP_DUP, + DUP_FCNTL, + NUM_METHODS, +}; + +static void test_close_dup(enum dup_method method) +{ + long token; + int s1, s2, send_s; + ssize_t rc; + + s1 = sock_reuseaddr(); + if (bind(s1, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0) + die("bind(): %s\n", strerror(errno)); + + send_s = sock_reuseaddr(); + if (connect(send_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0) + die("connect(): %s\n", strerror(errno)); + + /* Receive before duplicating */ + token = random(); + send_token(send_s, token); + recv_token(s1, token); + + switch (method) { + case DUP_DUP: + /* NOLINTNEXTLINE(android-cloexec-dup) */ + s2 = dup(s1); + if (s2 < 0) + die("dup(): %s\n", strerror(errno)); + break; + case DUP_FCNTL: + s2 = fcntl(s1, F_DUPFD_CLOEXEC, 0); + if (s2 < 0) + die("F_DUPFD_CLOEXEC: %s\n", strerror(errno)); + break; + default: + die("Bad method\n"); + } + + /* Receive via original handle */ + token = random(); + send_token(send_s, token); + recv_token(s1, token); + + /* Receive via duplicated handle */ + token = random(); + send_token(send_s, token); + recv_token(s2, token); + + /* Close duplicate */ + rc = close(s2); + if (rc < 0) + die("close() dup: %s\n", strerror(errno)); + + /* Receive after closing duplicate */ + token = random(); + send_token(send_s, token); + recv_token(s1, token); +} + +int main(int argc, char *argv[]) +{ + enum dup_method method; + + (void)argc; + (void)argv; + + for (method = 0; method < NUM_METHODS; method++) + test_close_dup(method); + + printf("Closing dup()ed UDP 
sockets seems to work as expected\n"); + + exit(0); +} diff --git a/epoll_type.h b/epoll_type.h new file mode 100644 index 0000000..12ac59b --- /dev/null +++ b/epoll_type.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +#ifndef EPOLL_TYPE_H +#define EPOLL_TYPE_H + +/** + * enum epoll_type - Different types of fds we poll over + */ +enum epoll_type { + /* Special value to indicate an invalid type */ + EPOLL_TYPE_NONE = 0, + /* Connected TCP sockets */ + EPOLL_TYPE_TCP, + /* Connected TCP sockets (spliced) */ + EPOLL_TYPE_TCP_SPLICE, + /* Listening TCP sockets */ + EPOLL_TYPE_TCP_LISTEN, + /* timerfds used for TCP timers */ + EPOLL_TYPE_TCP_TIMER, + /* UDP "listening" sockets */ + EPOLL_TYPE_UDP_LISTEN, + /* UDP socket for a specific flow */ + EPOLL_TYPE_UDP, + /* ICMP/ICMPv6 ping sockets */ + EPOLL_TYPE_PING, + /* inotify fd watching for end of netns (pasta) */ + EPOLL_TYPE_NSQUIT_INOTIFY, + /* timer fd watching for end of netns, fallback for inotify (pasta) */ + EPOLL_TYPE_NSQUIT_TIMER, + /* tuntap character device */ + EPOLL_TYPE_TAP_PASTA, + /* socket connected to qemu */ + EPOLL_TYPE_TAP_PASST, + /* socket listening for qemu socket connections */ + EPOLL_TYPE_TAP_LISTEN, + /* vhost-user command socket */ + EPOLL_TYPE_VHOST_CMD, + /* vhost-user kick event socket */ + EPOLL_TYPE_VHOST_KICK, + /* TCP_REPAIR helper listening socket */ + EPOLL_TYPE_REPAIR_LISTEN, + /* TCP_REPAIR helper socket */ + EPOLL_TYPE_REPAIR, + + EPOLL_NUM_TYPES, +}; + +#endif /* EPOLL_TYPE_H */ @@ -5,9 +5,11 @@ * Tracking for logical "flows" of packets. 
*/ +#include <errno.h> #include <stdint.h> #include <stdio.h> #include <unistd.h> +#include <sched.h> #include <string.h> #include "util.h" @@ -17,6 +19,18 @@ #include "inany.h" #include "flow.h" #include "flow_table.h" +#include "repair.h" + +const char *flow_state_str[] = { + [FLOW_STATE_FREE] = "FREE", + [FLOW_STATE_NEW] = "NEW", + [FLOW_STATE_INI] = "INI", + [FLOW_STATE_TGT] = "TGT", + [FLOW_STATE_TYPED] = "TYPED", + [FLOW_STATE_ACTIVE] = "ACTIVE", +}; +static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES, + "flow_state_str[] doesn't match enum flow_state"); const char *flow_type_str[] = { [FLOW_TYPE_NONE] = "<none>", @@ -24,6 +38,7 @@ const char *flow_type_str[] = { [FLOW_TCP_SPLICE] = "TCP connection (spliced)", [FLOW_PING4] = "ICMP ping sequence", [FLOW_PING6] = "ICMPv6 ping sequence", + [FLOW_UDP] = "UDP flow", }; static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, "flow_type_str[] doesn't match enum flow_type"); @@ -33,51 +48,19 @@ const uint8_t flow_proto[] = { [FLOW_TCP_SPLICE] = IPPROTO_TCP, [FLOW_PING4] = IPPROTO_ICMP, [FLOW_PING6] = IPPROTO_ICMPV6, + [FLOW_UDP] = IPPROTO_UDP, }; static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); -/* Global Flow Table */ +#define foreach_established_tcp_flow(flow) \ + flow_foreach_of_type((flow), FLOW_TCP) \ + if (!tcp_flow_is_established(&(flow)->tcp)) \ + /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ + continue; \ + else -/** - * DOC: Theory of Operation - flow entry life cycle - * - * An individual flow table entry moves through these logical states, usually in - * this order. 
- * - * FREE - Part of the general pool of free flow table entries - * Operations: - * - flow_alloc() finds an entry and moves it to ALLOC state - * - * ALLOC - A tentatively allocated entry - * Operations: - * - flow_alloc_cancel() returns the entry to FREE state - * - FLOW_START() set the entry's type and moves to START state - * Caveats: - * - It's not safe to write fields in the flow entry - * - It's not safe to allocate further entries with flow_alloc() - * - It's not safe to return to the main epoll loop (use FLOW_START() - * to move to START state before doing so) - * - It's not safe to use flow_*() logging functions - * - * START - An entry being prepared by flow type specific code - * Operations: - * - Flow type specific fields may be accessed - * - flow_*() logging functions - * - flow_alloc_cancel() returns the entry to FREE state - * Caveats: - * - Returning to the main epoll loop or allocating another entry - * with flow_alloc() implicitly moves the entry to ACTIVE state. - * - * ACTIVE - An active flow entry managed by flow type specific code - * Operations: - * - Flow type specific fields may be accessed - * - flow_*() logging functions - * - Flow may be expired by returning 'true' from flow type specific - * deferred or timer handler. This will return it to FREE state. - * Caveats: - * - It's not safe to call flow_alloc_cancel() - */ +/* Global Flow Table */ /** * DOC: Theory of Operation - allocating and freeing flow entries @@ -98,7 +81,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, * * Free cluster list * flow_first_free gives the index of the first (lowest index) free cluster. - * Each free cluster has the index of the next free cluster, or MAX_FLOW if + * Each free cluster has the index of the next free cluster, or FLOW_MAX if * it is the last free cluster. Together these form a linked list of free * clusters, in strictly increasing order of index. 
* @@ -132,18 +115,167 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, unsigned flow_first_free; union flow flowtab[FLOW_MAX]; +static const union flow *flow_new_entry; /* = NULL */ + +/* Hash table to index it */ +#define FLOW_HASH_LOAD 70 /* % */ +#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD)) + +/* Table for lookup from flowside information */ +static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE]; + +static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX, +"Safe linear probing requires hash table with more entries than the number of sides in the flow table"); /* Last time the flow timers ran */ static struct timespec flow_timer_run; +/** flowside_from_af() - Initialise flowside from addresses + * @side: flowside to initialise + * @af: Address family (AF_INET or AF_INET6) + * @eaddr: Endpoint address (pointer to in_addr or in6_addr) + * @eport: Endpoint port + * @oaddr: Our address (pointer to in_addr or in6_addr) + * @oport: Our port + */ +static void flowside_from_af(struct flowside *side, sa_family_t af, + const void *eaddr, in_port_t eport, + const void *oaddr, in_port_t oport) +{ + if (oaddr) + inany_from_af(&side->oaddr, af, oaddr); + else + side->oaddr = inany_any6; + side->oport = oport; + + if (eaddr) + inany_from_af(&side->eaddr, af, eaddr); + else + side->eaddr = inany_any6; + side->eport = eport; +} + +/** + * struct flowside_sock_args - Parameters for flowside_sock_splice() + * @c: Execution context + * @fd: Filled in with new socket fd + * @err: Filled in with errno if something failed + * @type: Socket epoll type + * @sa: Socket address + * @sl: Length of @sa + * @data: epoll reference data + */ +struct flowside_sock_args { + const struct ctx *c; + int fd; + int err; + enum epoll_type type; + const struct sockaddr *sa; + socklen_t sl; + const char *path; + uint32_t data; +}; + +/** flowside_sock_splice() - Create and bind socket for PIF_SPLICE based on flowside + * @arg: Argument as a struct flowside_sock_args + * + * Return: 0 
+ */ +static int flowside_sock_splice(void *arg) +{ + struct flowside_sock_args *a = arg; + + ns_enter(a->c); + + a->fd = sock_l4_sa(a->c, a->type, a->sa, a->sl, NULL, + a->sa->sa_family == AF_INET6, a->data); + a->err = errno; + + return 0; +} + +/** flowside_sock_l4() - Create and bind socket based on flowside + * @c: Execution context + * @type: Socket epoll type + * @pif: Interface for this socket + * @tgt: Target flowside + * @data: epoll reference portion for protocol handlers + * + * Return: socket fd of protocol @proto bound to our address and port from @tgt + * (if specified). + */ +int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, + const struct flowside *tgt, uint32_t data) +{ + const char *ifname = NULL; + union sockaddr_inany sa; + socklen_t sl; + + ASSERT(pif_is_socket(pif)); + + pif_sockaddr(c, &sa, &sl, pif, &tgt->oaddr, tgt->oport); + + switch (pif) { + case PIF_HOST: + if (inany_is_loopback(&tgt->oaddr)) + ifname = NULL; + else if (sa.sa_family == AF_INET) + ifname = c->ip4.ifname_out; + else if (sa.sa_family == AF_INET6) + ifname = c->ip6.ifname_out; + + return sock_l4_sa(c, type, &sa, sl, ifname, + sa.sa_family == AF_INET6, data); + + case PIF_SPLICE: { + struct flowside_sock_args args = { + .c = c, .type = type, + .sa = &sa.sa, .sl = sl, .data = data, + }; + NS_CALL(flowside_sock_splice, &args); + errno = args.err; + return args.fd; + } + + default: + /* If we add new socket pifs, they'll need to be implemented + * here + */ + ASSERT(0); + } +} + +/** flowside_connect() - Connect a socket based on flowside + * @c: Execution context + * @s: Socket to connect + * @pif: Target pif + * @tgt: Target flowside + * + * Connect @s to the endpoint address and port from @tgt. 
+ * + * Return: 0 on success, negative on error + */ +int flowside_connect(const struct ctx *c, int s, + uint8_t pif, const struct flowside *tgt) +{ + union sockaddr_inany sa; + socklen_t sl; + + pif_sockaddr(c, &sa, &sl, pif, &tgt->eaddr, tgt->eport); + return connect(s, &sa.sa, sl); +} + /** flow_log_ - Log flow-related message * @f: flow the message is related to + * @newline: Append newline at the end of the message, if missing * @pri: Log priority * @fmt: Format string * @...: printf-arguments */ -void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) +void flow_log_(const struct flow_common *f, bool newline, int pri, + const char *fmt, ...) { + const char *type_or_state; char msg[BUFSIZ]; va_list args; @@ -151,40 +283,232 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) (void)vsnprintf(msg, sizeof(msg), fmt, args); va_end(args); - logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); + /* Show type if it's set, otherwise the state */ + if (f->state < FLOW_STATE_TYPED) + type_or_state = FLOW_STATE(f); + else + type_or_state = FLOW_TYPE(f); + + logmsg(newline, false, pri, + "Flow %u (%s): %s", flow_idx(f), type_or_state, msg); +} + +/** flow_log_details_() - Log the details of a flow + * @f: flow to log + * @pri: Log priority + * @state: State to log details according to + * + * Logs the details of the flow: endpoints, interfaces, type etc. 
+ */ +void flow_log_details_(const struct flow_common *f, int pri, + enum flow_state state) +{ + char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN]; + char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN]; + const struct flowside *ini = &f->side[INISIDE]; + const struct flowside *tgt = &f->side[TGTSIDE]; + + if (state >= FLOW_STATE_TGT) + flow_log_(f, true, pri, + "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu", + pif_name(f->pif[INISIDE]), + inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), + ini->eport, + inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)), + ini->oport, + pif_name(f->pif[TGTSIDE]), + inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)), + tgt->oport, + inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)), + tgt->eport); + else if (state >= FLOW_STATE_INI) + flow_log_(f, true, pri, "%s [%s]:%hu -> [%s]:%hu => ?", + pif_name(f->pif[INISIDE]), + inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), + ini->eport, + inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)), + ini->oport); +} + +/** + * flow_set_state() - Change flow's state + * @f: Flow changing state + * @state: New state + */ +static void flow_set_state(struct flow_common *f, enum flow_state state) +{ + uint8_t oldstate = f->state; + + ASSERT(state < FLOW_NUM_STATES); + ASSERT(oldstate < FLOW_NUM_STATES); + + f->state = state; + flow_log_(f, true, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], + FLOW_STATE(f)); + + flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate)); +} + +/** + * flow_initiate_() - Move flow to INI, setting pif[INISIDE] + * @flow: Flow to change state + * @pif: pif of the initiating side + */ +static void flow_initiate_(union flow *flow, uint8_t pif) +{ + struct flow_common *f = &flow->f; + + ASSERT(pif != PIF_NONE); + ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW); + ASSERT(f->type == FLOW_TYPE_NONE); + ASSERT(f->pif[INISIDE] == PIF_NONE && f->pif[TGTSIDE] == PIF_NONE); + + f->pif[INISIDE] = pif; + flow_set_state(f, FLOW_STATE_INI); +} + +/** + * flow_initiate_af() - Move flow 
to INI, setting INISIDE details + * @flow: Flow to change state + * @pif: pif of the initiating side + * @af: Address family of @saddr and @daddr + * @saddr: Source address (pointer to in_addr or in6_addr) + * @sport: Source port + * @daddr: Destination address (pointer to in_addr or in6_addr) + * @dport: Destination port + * + * Return: pointer to the initiating flowside information + */ +const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, + sa_family_t af, + const void *saddr, in_port_t sport, + const void *daddr, in_port_t dport) +{ + struct flowside *ini = &flow->f.side[INISIDE]; + + flowside_from_af(ini, af, saddr, sport, daddr, dport); + flow_initiate_(flow, pif); + return ini; +} + +/** + * flow_initiate_sa() - Move flow to INI, setting INISIDE details + * @flow: Flow to change state + * @pif: pif of the initiating side + * @ssa: Source socket address + * @daddr: Destination address (may be NULL) + * @dport: Destination port + * + * Return: pointer to the initiating flowside information + */ +struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, + const union sockaddr_inany *ssa, + const union inany_addr *daddr, + in_port_t dport) +{ + struct flowside *ini = &flow->f.side[INISIDE]; + + if (inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa) < 0) { + char str[SOCKADDR_STRLEN]; + + ASSERT_WITH_MSG(0, "Bad socket address %s", + sockaddr_ntop(ssa, str, sizeof(str))); + } + if (daddr) + ini->oaddr = *daddr; + else if (inany_v4(&ini->eaddr)) + ini->oaddr = inany_any4; + else + ini->oaddr = inany_any6; + ini->oport = dport; + flow_initiate_(flow, pif); + return ini; } /** - * flow_start() - Set flow type for new flow and log - * @flow: Flow to set type for - * @type: Type for new flow - * @iniside: Which side initiated the new flow + * flow_target() - Determine where flow should forward to, and move to TGT + * @c: Execution context + * @flow: Flow to forward + * @proto: Protocol * - * Return: @flow + * Return: pointer to the target 
flowside information + */ +struct flowside *flow_target(const struct ctx *c, union flow *flow, + uint8_t proto) +{ + char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN]; + struct flow_common *f = &flow->f; + const struct flowside *ini = &f->side[INISIDE]; + struct flowside *tgt = &f->side[TGTSIDE]; + uint8_t tgtpif = PIF_NONE; + + ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_INI); + ASSERT(f->type == FLOW_TYPE_NONE); + ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] == PIF_NONE); + ASSERT(flow->f.state == FLOW_STATE_INI); + + switch (f->pif[INISIDE]) { + case PIF_TAP: + tgtpif = fwd_nat_from_tap(c, proto, ini, tgt); + break; + + case PIF_SPLICE: + tgtpif = fwd_nat_from_splice(c, proto, ini, tgt); + break; + + case PIF_HOST: + tgtpif = fwd_nat_from_host(c, proto, ini, tgt); + break; + + default: + flow_err(flow, "No rules to forward %s [%s]:%hu -> [%s]:%hu", + pif_name(f->pif[INISIDE]), + inany_ntop(&ini->eaddr, estr, sizeof(estr)), + ini->eport, + inany_ntop(&ini->oaddr, fstr, sizeof(fstr)), + ini->oport); + } + + if (tgtpif == PIF_NONE) + return NULL; + + f->pif[TGTSIDE] = tgtpif; + flow_set_state(f, FLOW_STATE_TGT); + return tgt; +} + +/** + * flow_set_type() - Set type and move to TYPED + * @flow: Flow to change state + * @type: New flow type to assign * - * Should be called before setting any flow type specific fields in the flow - * table entry. + * Return: pointer to the modified flow structure. 
*/ -union flow *flow_start(union flow *flow, enum flow_type type, - unsigned iniside) +union flow *flow_set_type(union flow *flow, enum flow_type type) { - (void)iniside; - flow->f.type = type; - flow_dbg(flow, "START %s", flow_type_str[flow->f.type]); + struct flow_common *f = &flow->f; + + ASSERT(type != FLOW_TYPE_NONE); + ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_TGT); + ASSERT(f->type == FLOW_TYPE_NONE); + ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE); + + f->type = type; + flow_set_state(f, FLOW_STATE_TYPED); return flow; } /** - * flow_end() - Clear flow type for finished flow and log - * @flow: Flow to clear + * flow_activate() - Move flow to ACTIVE + * @f: Flow to change state */ -static void flow_end(union flow *flow) +void flow_activate(struct flow_common *f) { - if (flow->f.type == FLOW_TYPE_NONE) - return; /* Nothing to do */ + ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED); + ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE); - flow_dbg(flow, "END %s", flow_type_str[flow->f.type]); - flow->f.type = FLOW_TYPE_NONE; + flow_set_state(f, FLOW_STATE_ACTIVE); + flow_new_entry = NULL; } /** @@ -196,9 +520,12 @@ union flow *flow_alloc(void) { union flow *flow = &flowtab[flow_first_free]; + ASSERT(!flow_new_entry); + if (flow_first_free >= FLOW_MAX) return NULL; + ASSERT(flow->f.state == FLOW_STATE_FREE); ASSERT(flow->f.type == FLOW_TYPE_NONE); ASSERT(flow->free.n >= 1); ASSERT(flow_first_free + flow->free.n <= FLOW_MAX); @@ -221,7 +548,10 @@ union flow *flow_alloc(void) flow_first_free = flow->free.next; } + flow_new_entry = flow; memset(flow, 0, sizeof(*flow)); + flow_set_state(&flow->f, FLOW_STATE_NEW); + return flow; } @@ -233,15 +563,234 @@ union flow *flow_alloc(void) */ void flow_alloc_cancel(union flow *flow) { + ASSERT(flow_new_entry == flow); + ASSERT(flow->f.state == FLOW_STATE_NEW || + flow->f.state == FLOW_STATE_INI || + flow->f.state == FLOW_STATE_TGT || + flow->f.state == 
FLOW_STATE_TYPED); ASSERT(flow_first_free > FLOW_IDX(flow)); - flow_end(flow); + flow_set_state(&flow->f, FLOW_STATE_FREE); + memset(flow, 0, sizeof(*flow)); + /* Put it back in a length 1 free cluster, don't attempt to fully * reverse flow_alloc()s steps. This will get folded together the next * time flow_defer_handler runs anyway() */ flow->free.n = 1; flow->free.next = flow_first_free; flow_first_free = FLOW_IDX(flow); + flow_new_entry = NULL; +} + +/** + * flow_hash() - Calculate hash value for one side of a flow + * @c: Execution context + * @proto: Protocol of this flow (IP L4 protocol number) + * @pif: pif of the side to hash + * @side: Flowside (must not have unspecified parts) + * + * Return: hash value + */ +static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif, + const struct flowside *side) +{ + struct siphash_state state = SIPHASH_INIT(c->hash_secret); + + inany_siphash_feed(&state, &side->oaddr); + inany_siphash_feed(&state, &side->eaddr); + + return siphash_final(&state, 38, (uint64_t)proto << 40 | + (uint64_t)pif << 32 | + (uint64_t)side->oport << 16 | + (uint64_t)side->eport); +} + +/** + * flow_sidx_hash() - Calculate hash value for given side of a given flow + * @c: Execution context + * @sidx: Flow & side index to get hash for + * + * Return: hash value, of the flow & side represented by @sidx + */ +static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx) +{ + const struct flow_common *f = &flow_at_sidx(sidx)->f; + const struct flowside *side = &f->side[sidx.sidei]; + uint8_t pif = f->pif[sidx.sidei]; + + ASSERT(pif != PIF_NONE); + return flow_hash(c, FLOW_PROTO(f), pif, side); +} + +/** + * flow_hash_probe_() - Find hash bucket for a flow, given hash + * @hash: Raw hash value for flow & side + * @sidx: Flow and side to find bucket for + * + * Return: If @sidx is in the hash table, its current bucket, otherwise a + * suitable free bucket for it. 
+ */ +static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx) +{ + unsigned b = hash % FLOW_HASH_SIZE; + + /* Linear probing */ + while (flow_sidx_valid(flow_hashtab[b]) && + !flow_sidx_eq(flow_hashtab[b], sidx)) + b = mod_sub(b, 1, FLOW_HASH_SIZE); + + return b; +} + +/** + * flow_hash_probe() - Find hash bucket for a flow + * @c: Execution context + * @sidx: Flow and side to find bucket for + * + * Return: If @sidx is in the hash table, its current bucket, otherwise a + * suitable free bucket for it. + */ +static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx) +{ + return flow_hash_probe_(flow_sidx_hash(c, sidx), sidx); +} + +/** + * flow_hash_insert() - Insert side of a flow into hash table + * @c: Execution context + * @sidx: Flow & side index + * + * Return: raw (un-modded) hash value of side of flow + */ +uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx) +{ + uint64_t hash = flow_sidx_hash(c, sidx); + unsigned b = flow_hash_probe_(hash, sidx); + + flow_hashtab[b] = sidx; + flow_dbg(flow_at_sidx(sidx), "Side %u hash table insert: bucket: %u", + sidx.sidei, b); + + return hash; +} + +/** + * flow_hash_remove() - Drop side of a flow from the hash table + * @c: Execution context + * @sidx: Side of flow to remove + */ +void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx) +{ + unsigned b = flow_hash_probe(c, sidx), s; + + if (!flow_sidx_valid(flow_hashtab[b])) + return; /* Redundant remove */ + + flow_dbg(flow_at_sidx(sidx), "Side %u hash table remove: bucket: %u", + sidx.sidei, b); + + /* Scan the remainder of the cluster */ + for (s = mod_sub(b, 1, FLOW_HASH_SIZE); + flow_sidx_valid(flow_hashtab[s]); + s = mod_sub(s, 1, FLOW_HASH_SIZE)) { + unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE; + + if (!mod_between(h, s, b, FLOW_HASH_SIZE)) { + /* flow_hashtab[s] can live in flow_hashtab[b]'s slot */ + debug("hash table remove: shuffle %u -> %u", s, b); + flow_hashtab[b] = 
flow_hashtab[s]; + b = s; + } + } + + flow_hashtab[b] = FLOW_SIDX_NONE; +} + +/** + * flowside_lookup() - Look for a matching flowside in the flow table + * @c: Execution context + * @proto: Protocol of the flow (IP L4 protocol number) + * @pif: pif to look for in the table + * @side: Flowside to look for in the table + * + * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found + */ +static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto, + uint8_t pif, const struct flowside *side) +{ + flow_sidx_t sidx; + union flow *flow; + unsigned b; + + b = flow_hash(c, proto, pif, side) % FLOW_HASH_SIZE; + while ((sidx = flow_hashtab[b], flow = flow_at_sidx(sidx)) && + !(FLOW_PROTO(&flow->f) == proto && + flow->f.pif[sidx.sidei] == pif && + flowside_eq(&flow->f.side[sidx.sidei], side))) + b = mod_sub(b, 1, FLOW_HASH_SIZE); + + return flow_hashtab[b]; +} + +/** + * flow_lookup_af() - Look up a flow given addressing information + * @c: Execution context + * @proto: Protocol of the flow (IP L4 protocol number) + * @pif: Interface of the flow + * @af: Address family, AF_INET or AF_INET6 + * @eaddr: Guest side endpoint address (guest local address) + * @oaddr: Our guest side address (guest remote address) + * @eport: Guest side endpoint port (guest local port) + * @oport: Our guest side port (guest remote port) + * + * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found + */ +flow_sidx_t flow_lookup_af(const struct ctx *c, + uint8_t proto, uint8_t pif, sa_family_t af, + const void *eaddr, const void *oaddr, + in_port_t eport, in_port_t oport) +{ + struct flowside side; + + flowside_from_af(&side, af, eaddr, eport, oaddr, oport); + return flowside_lookup(c, proto, pif, &side); +} + +/** + * flow_lookup_sa() - Look up a flow given an endpoint socket address + * @c: Execution context + * @proto: Protocol of the flow (IP L4 protocol number) + * @pif: Interface of the flow + * @esa: Socket address of the endpoint + * @oaddr: Our address 
(may be NULL) + * @oport: Our port number + * + * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found + */ +flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, + const void *esa, + const union inany_addr *oaddr, in_port_t oport) +{ + struct flowside side = { + .oport = oport, + }; + + if (inany_from_sockaddr(&side.eaddr, &side.eport, esa) < 0) { + char str[SOCKADDR_STRLEN]; + + warn("Flow lookup on bad socket address %s", + sockaddr_ntop(esa, str, sizeof(str))); + return FLOW_SIDX_NONE; + } + + if (oaddr) + side.oaddr = *oaddr; + else if (inany_v4(&side.eaddr)) + side.oaddr = inany_any4; + else + side.oaddr = inany_any6; + + return flowside_lookup(c, proto, pif, &side); } /** @@ -253,79 +802,111 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) { struct flow_free_cluster *free_head = NULL; unsigned *last_next = &flow_first_free; + bool to_free[FLOW_MAX] = { 0 }; bool timer = false; - unsigned idx; + union flow *flow; if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) { timer = true; flow_timer_run = *now; } - for (idx = 0; idx < FLOW_MAX; idx++) { - union flow *flow = &flowtab[idx]; - bool closed = false; - - if (flow->f.type == FLOW_TYPE_NONE) { - unsigned skip = flow->free.n; - - /* First entry of a free cluster must have n >= 1 */ - ASSERT(skip); - - if (free_head) { - /* Merge into preceding free cluster */ - free_head->n += flow->free.n; - flow->free.n = flow->free.next = 0; - } else { - /* New free cluster, add to chain */ - free_head = &flow->free; - *last_next = idx; - last_next = &free_head->next; - } + ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */ - /* Skip remaining empty entries */ - idx += skip - 1; - continue; - } + /* Check which flows we might need to close first, but don't free them + * yet as it's not safe to do that in the middle of flow_foreach(). 
+ */ + flow_foreach(flow) { + bool closed = false; switch (flow->f.type) { case FLOW_TYPE_NONE: ASSERT(false); break; case FLOW_TCP: - closed = tcp_flow_defer(flow); + closed = tcp_flow_defer(&flow->tcp); break; case FLOW_TCP_SPLICE: - closed = tcp_splice_flow_defer(flow); + closed = tcp_splice_flow_defer(&flow->tcp_splice); if (!closed && timer) - tcp_splice_timer(c, flow); + tcp_splice_timer(c, &flow->tcp_splice); break; case FLOW_PING4: case FLOW_PING6: if (timer) - closed = icmp_ping_timer(c, flow, now); + closed = icmp_ping_timer(c, &flow->ping, now); + break; + case FLOW_UDP: + closed = udp_flow_defer(c, &flow->udp, now); + if (!closed && timer) + closed = udp_flow_timer(c, &flow->udp, now); break; default: /* Assume other flow types don't need any handling */ ; } - if (closed) { - flow_end(flow); + to_free[FLOW_IDX(flow)] = closed; + } + + /* Second step: actually free the flows */ + flow_foreach_slot(flow) { + switch (flow->f.state) { + case FLOW_STATE_FREE: { + unsigned skip = flow->free.n; + + /* First entry of a free cluster must have n >= 1 */ + ASSERT(skip); if (free_head) { - /* Add slot to current free cluster */ - ASSERT(idx == FLOW_IDX(free_head) + free_head->n); - free_head->n++; + /* Merge into preceding free cluster */ + free_head->n += flow->free.n; flow->free.n = flow->free.next = 0; } else { - /* Create new free cluster */ + /* New free cluster, add to chain */ free_head = &flow->free; - free_head->n = 1; - *last_next = idx; + *last_next = FLOW_IDX(flow); last_next = &free_head->next; } - } else { - free_head = NULL; + + /* Skip remaining empty entries */ + flow += skip - 1; + continue; + } + + case FLOW_STATE_NEW: + case FLOW_STATE_INI: + case FLOW_STATE_TGT: + case FLOW_STATE_TYPED: + /* Incomplete flow at end of cycle */ + ASSERT(false); + break; + + case FLOW_STATE_ACTIVE: + if (to_free[FLOW_IDX(flow)]) { + flow_set_state(&flow->f, FLOW_STATE_FREE); + memset(flow, 0, sizeof(*flow)); + + if (free_head) { + /* Add slot to current free 
cluster */ + ASSERT(FLOW_IDX(flow) == + FLOW_IDX(free_head) + free_head->n); + free_head->n++; + flow->free.n = flow->free.next = 0; + } else { + /* Create new free cluster */ + free_head = &flow->free; + free_head->n = 1; + *last_next = FLOW_IDX(flow); + last_next = &free_head->next; + } + } else { + free_head = NULL; + } + break; + + default: + ASSERT(false); } } @@ -333,11 +914,265 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) } /** + * flow_migrate_source_rollback() - Disable repair mode, return failure + * @c: Execution context + * @bound: No need to roll back flow indices >= @bound + * @ret: Negative error code + * + * Return: @ret + */ +static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret) +{ + union flow *flow; + + debug("...roll back migration"); + + foreach_established_tcp_flow(flow) { + if (FLOW_IDX(flow) >= bound) + break; + if (tcp_flow_repair_off(c, &flow->tcp)) + die("Failed to roll back TCP_REPAIR mode"); + } + + if (repair_flush(c)) + die("Failed to roll back TCP_REPAIR mode"); + + return ret; +} + +/** + * flow_migrate_need_repair() - Do we need to set repair mode for any flow? 
+ * + * Return: true if repair mode is needed, false otherwise + */ +static bool flow_migrate_need_repair(void) +{ + union flow *flow; + + foreach_established_tcp_flow(flow) + return true; + + return false; +} + +/** + * flow_migrate_repair_all() - Turn repair mode on or off for all flows + * @c: Execution context + * @enable: Switch repair mode on if set, off otherwise + * + * Return: 0 on success, negative error code on failure + */ +static int flow_migrate_repair_all(struct ctx *c, bool enable) +{ + union flow *flow; + int rc; + + /* If we don't have a repair helper, there's nothing we can do */ + if (c->fd_repair < 0) + return 0; + + foreach_established_tcp_flow(flow) { + if (enable) + rc = tcp_flow_repair_on(c, &flow->tcp); + else + rc = tcp_flow_repair_off(c, &flow->tcp); + + if (rc) { + debug("Can't %s repair mode: %s", + enable ? "enable" : "disable", strerror_(-rc)); + return flow_migrate_source_rollback(c, FLOW_IDX(flow), + rc); + } + } + + if ((rc = repair_flush(c))) { + debug("Can't %s repair mode: %s", + enable ? 
"enable" : "disable", strerror_(-rc)); + return flow_migrate_source_rollback(c, FLOW_IDX(flow), rc); + } + + return 0; +} + +/** + * flow_migrate_source_pre() - Prepare flows for migration: enable repair mode + * @c: Execution context + * @stage: Migration stage information (unused) + * @fd: Migration file descriptor (unused) + * + * Return: 0 on success, positive error code on failure + */ +int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, + int fd) +{ + int rc; + + (void)stage; + (void)fd; + + if (flow_migrate_need_repair()) + repair_wait(c); + + if ((rc = flow_migrate_repair_all(c, true))) + return -rc; + + return 0; +} + +/** + * flow_migrate_source() - Dump all the remaining information and send data + * @c: Execution context (unused) + * @stage: Migration stage information (unused) + * @fd: Migration file descriptor + * + * Return: 0 on success, positive error code on failure + */ +int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, + int fd) +{ + uint32_t count = 0; + bool first = true; + union flow *flow; + int rc; + + (void)c; + (void)stage; + + /* If we don't have a repair helper, we can't migrate TCP flows */ + if (c->fd_repair >= 0) { + foreach_established_tcp_flow(flow) + count++; + } + + count = htonl(count); + if (write_all_buf(fd, &count, sizeof(count))) { + rc = errno; + err_perror("Can't send flow count (%u)", ntohl(count)); + return flow_migrate_source_rollback(c, FLOW_MAX, rc); + } + + debug("Sending %u flows", ntohl(count)); + + if (!count) + return 0; + + /* Dump and send information that can be stored in the flow table. + * + * Limited rollback options here: if we fail to transfer any data (that + * is, on the first flow), undo everything and resume. Otherwise, the + * stream might now be inconsistent, and we might have closed listening + * TCP sockets, so just terminate. 
+ */ + foreach_established_tcp_flow(flow) { + rc = tcp_flow_migrate_source(fd, &flow->tcp); + if (rc) { + flow_err(flow, "Can't send data: %s", + strerror_(-rc)); + if (!first) + die("Inconsistent migration state, exiting"); + + return flow_migrate_source_rollback(c, FLOW_MAX, -rc); + } + + first = false; + } + + /* And then "extended" data (including window data we saved previously): + * the target needs to set repair mode on sockets before it can set + * this stuff, but it needs sockets (and flows) for that. + * + * This also closes sockets so that the target can start connecting + * theirs: you can't sendmsg() to queues (using the socket) if the + * socket is not connected (EPIPE), not even in repair mode. And the + * target needs to restore queues now because we're sending the data. + * + * So, no rollback here, just try as hard as we can. Tolerate per-flow + * failures but not if the stream might be inconsistent (reported here + * as EIO). + */ + foreach_established_tcp_flow(flow) { + rc = tcp_flow_migrate_source_ext(fd, &flow->tcp); + if (rc) { + flow_err(flow, "Can't send extended data: %s", + strerror_(-rc)); + + if (rc == -EIO) + die("Inconsistent migration state, exiting"); + } + } + + return 0; +} + +/** + * flow_migrate_target() - Receive flows and insert in flow table + * @c: Execution context + * @stage: Migration stage information (unused) + * @fd: Migration file descriptor + * + * Return: 0 on success, positive error code on failure + */ +int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, + int fd) +{ + uint32_t count; + unsigned i; + int rc; + + (void)stage; + + if (read_all_buf(fd, &count, sizeof(count))) + return errno; + + count = ntohl(count); + debug("Receiving %u flows", count); + + if (!count) + return 0; + + if ((rc = repair_wait(c))) + return -rc; + + if ((rc = flow_migrate_repair_all(c, true))) + return -rc; + + repair_flush(c); + + /* TODO: flow header with type, instead? 
*/ + for (i = 0; i < count; i++) { + rc = tcp_flow_migrate_target(c, fd); + if (rc) { + flow_dbg(FLOW(i), "Migration data failure, abort: %s", + strerror_(-rc)); + return -rc; + } + } + + repair_flush(c); + + for (i = 0; i < count; i++) { + rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd); + if (rc) { + flow_dbg(FLOW(i), "Migration data failure, abort: %s", + strerror_(-rc)); + return -rc; + } + } + + return 0; +} + +/** * flow_init() - Initialise flow related data structures */ void flow_init(void) { + unsigned b; + /* Initial state is a single free cluster containing the whole table */ flowtab[0].free.n = FLOW_MAX; flowtab[0].free.next = FLOW_MAX; + + for (b = 0; b < FLOW_HASH_SIZE; b++) + flow_hashtab[b] = FLOW_SIDX_NONE; } @@ -10,6 +10,98 @@ #define FLOW_TIMER_INTERVAL 1000 /* ms */ /** + * enum flow_state - States of a flow table entry + * + * An individual flow table entry moves through these states, usually in this + * order. + * General rules: + * - Code outside flow.c should never write common fields of union flow. + * - The state field may always be read. 
+ * + * FREE - Part of the general pool of free flow table entries + * Operations: + * - flow_alloc() finds an entry and moves it to NEW + * + * NEW - Freshly allocated, uninitialised entry + * Operations: + * - flow_alloc_cancel() returns the entry to FREE + * - flow_initiate() sets the entry's INISIDE details and moves to + * INI + * - FLOW_SET_TYPE() sets the entry's type and moves to TYPED + * Caveats: + * - No fields other than state may be accessed + * - At most one entry may be NEW, INI, TGT or TYPED at a time, so + * it's unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE + * - You may not return to the main epoll loop while any flow is NEW + * + * INI - An entry with INISIDE common information completed + * Operations: + * - Common fields related to INISIDE may be read + * - flow_alloc_cancel() returns the entry to FREE + * - flow_target() sets the entry's TGTSIDE details and moves to TGT + * Caveats: + * - Other common fields may not be read + * - Type specific fields may not be read or written + * - At most one entry may be NEW, INI, TGT or TYPED at a time, so + * it's unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE + * - You may not return to the main epoll loop while any flow is INI + * + * TGT - An entry with only INISIDE and TGTSIDE common information completed + * Operations: + * - Common fields related to INISIDE & TGTSIDE may be read + * - flow_alloc_cancel() returns the entry to FREE + * - FLOW_SET_TYPE() sets the entry's type and moves to TYPED + * Caveats: + * - Other common fields may not be read + * - Type specific fields may not be read or written + * - At most one entry may be NEW, INI, TGT or TYPED at a time, so + * it's unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE + * - You may not return to the main epoll loop while any flow is TGT + * + * TYPED - Generic info initialised, type specific initialisation underway + * Operations: + * - All common fields 
may be read + * - Type specific fields may be read and written + * - flow_alloc_cancel() returns the entry to FREE + * - FLOW_ACTIVATE() moves the entry to ACTIVE + * Caveats: + * - At most one entry may be NEW, INI, TGT or TYPED at a time, so + * it's unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE + * - You may not return to the main epoll loop while any flow is + * TYPED + * + * ACTIVE - An active, fully-initialised flow entry + * Operations: + * - All common fields may be read + * - Type specific fields may be read and written + * - Flow returns to FREE when it expires, signalled by returning + * 'true' from flow type specific deferred or timer handler + * Caveats: + * - flow_alloc_cancel() may not be called on it + */ +enum flow_state { + FLOW_STATE_FREE, + FLOW_STATE_NEW, + FLOW_STATE_INI, + FLOW_STATE_TGT, + FLOW_STATE_TYPED, + FLOW_STATE_ACTIVE, + + FLOW_NUM_STATES, +}; +#define FLOW_STATE_BITS 8 +static_assert(FLOW_NUM_STATES <= (1 << FLOW_STATE_BITS), + "Too many flow states for FLOW_STATE_BITS"); + +extern const char *flow_state_str[]; +#define FLOW_STATE(f) \ + ((f)->state < FLOW_NUM_STATES ? flow_state_str[(f)->state] : "?") + +/** * enum flow_type - Different types of packet flows we track */ enum flow_type { @@ -23,9 +115,14 @@ enum flow_type { FLOW_PING4, /* ICMPv6 echo requests from guest to host and matching replies back */ FLOW_PING6, + /* UDP pseudo-connection */ + FLOW_UDP, FLOW_NUM_TYPES, }; +#define FLOW_TYPE_BITS 8 +static_assert(FLOW_NUM_TYPES <= (1 << FLOW_TYPE_BITS), + "Too many flow types for FLOW_TYPE_BITS"); extern const char *flow_type_str[]; #define FLOW_TYPE(f) \ @@ -35,12 +132,66 @@ extern const uint8_t flow_proto[]; #define FLOW_PROTO(f) \ ((f)->type < FLOW_NUM_TYPES ? 
flow_proto[(f)->type] : 0) +#define SIDES 2 + +#define INISIDE 0 /* Initiating side index */ +#define TGTSIDE 1 /* Target side index */ + +/** + * struct flowside - Address information for one side of a flow + * @eaddr: Endpoint address (remote address from passt's PoV) + * @oaddr: Our address (local address from passt's PoV) + * @eport: Endpoint port + * @oport: Our port + */ +struct flowside { + union inany_addr oaddr; + union inany_addr eaddr; + in_port_t oport; + in_port_t eport; +}; + +/** + * flowside_eq() - Check if two flowsides are equal + * @left, @right: Flowsides to compare + * + * Return: true if equal, false otherwise + */ +static inline bool flowside_eq(const struct flowside *left, + const struct flowside *right) +{ + return inany_equals(&left->eaddr, &right->eaddr) && + left->eport == right->eport && + inany_equals(&left->oaddr, &right->oaddr) && + left->oport == right->oport; +} + +int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, + const struct flowside *tgt, uint32_t data); +int flowside_connect(const struct ctx *c, int s, + uint8_t pif, const struct flowside *tgt); + /** * struct flow_common - Common fields for packet flows + * @state: State of the flow table entry * @type: Type of packet flow + * @pif[]: Interface for each side of the flow + * @side[]: Information for each side of the flow */ struct flow_common { +#ifdef __GNUC__ + enum flow_state state:FLOW_STATE_BITS; + enum flow_type type:FLOW_TYPE_BITS; +#else + uint8_t state; + static_assert(sizeof(uint8_t) * 8 >= FLOW_STATE_BITS, + "Not enough bits for state field"); uint8_t type; + static_assert(sizeof(uint8_t) * 8 >= FLOW_TYPE_BITS, + "Not enough bits for type field"); +#endif + uint8_t pif[SIDES]; + struct flowside side[SIDES]; }; #define FLOW_INDEX_BITS 17 /* 128k - 1 */ @@ -49,24 +200,30 @@ struct flow_common { #define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */ #define FLOW_FILE_PRESSURE 30 /* % of c->nofile */ -union flow *flow_start(union flow *flow, enum 
flow_type type, - unsigned iniside); -#define FLOW_START(flow_, t_, var_, i_) \ - (&flow_start((flow_), (t_), (i_))->var_) - /** * struct flow_sidx - ID for one side of a specific flow - * @side: Side referenced (0 or 1) - * @flow: Index of flow referenced + * @sidei: Index of side referenced (0 or 1) + * @flowi: Index of flow referenced */ typedef struct flow_sidx { - unsigned side :1; - unsigned flow :FLOW_INDEX_BITS; + unsigned sidei :1; + unsigned flowi :FLOW_INDEX_BITS; } flow_sidx_t; static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t), "flow_sidx_t must fit within 32 bits"); -#define FLOW_SIDX_NONE ((flow_sidx_t){ .flow = FLOW_MAX }) +#define FLOW_SIDX_NONE ((flow_sidx_t){ .flowi = FLOW_MAX }) + +/** + * flow_sidx_valid() - Test if a sidx is valid + * @sidx: sidx value + * + * Return: true if @sidx refers to a valid flow & side + */ +static inline bool flow_sidx_valid(flow_sidx_t sidx) +{ + return sidx.flowi < FLOW_MAX; +} /** * flow_sidx_eq() - Test if two sidx values are equal @@ -76,19 +233,37 @@ static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t), */ static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) { - return (a.flow == b.flow) && (a.side == b.side); + return (a.flowi == b.flowi) && (a.sidei == b.sidei); } +uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx); +void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx); +flow_sidx_t flow_lookup_af(const struct ctx *c, + uint8_t proto, uint8_t pif, sa_family_t af, + const void *eaddr, const void *oaddr, + in_port_t eport, in_port_t oport); +flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, + const void *esa, + const union inany_addr *oaddr, in_port_t oport); + union flow; void flow_init(void); void flow_defer_handler(const struct ctx *c, const struct timespec *now); +int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, + int fd); +int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, + int fd); +int 
flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, + int fd); +int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, + int fd); -void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) - __attribute__((format(printf, 3, 4))); - -#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, (pri), __VA_ARGS__) +void flow_log_(const struct flow_common *f, bool newline, int pri, + const char *fmt, ...) + __attribute__((format(printf, 4, 5))); +#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, true, (pri), __VA_ARGS__) #define flow_dbg(f, ...) flow_log((f), LOG_DEBUG, __VA_ARGS__) #define flow_err(f, ...) flow_log((f), LOG_ERR, __VA_ARGS__) @@ -98,4 +273,21 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) flow_dbg((f), __VA_ARGS__); \ } while (0) +#define flow_log_perror_(f, pri, ...) \ + do { \ + int errno_ = errno; \ + flow_log_((f), false, (pri), __VA_ARGS__); \ + logmsg(true, true, (pri), ": %s", strerror_(errno_)); \ + } while (0) + +#define flow_dbg_perror(f_, ...) flow_log_perror_(&(f_)->f, LOG_DEBUG, __VA_ARGS__) +#define flow_perror(f_, ...) 
flow_log_perror_(&(f_)->f, LOG_ERR, __VA_ARGS__) + +void flow_log_details_(const struct flow_common *f, int pri, + enum flow_state state); +#define flow_log_details(f_, pri) \ + flow_log_details_(&((f_)->f), (pri), (f_)->f.state) +#define flow_dbg_details(f_) flow_log_details((f_), LOG_DEBUG) +#define flow_err_details(f_) flow_log_details((f_), LOG_ERR) + #endif /* FLOW_H */ diff --git a/flow_table.h b/flow_table.h index b7e5529..3f3f4b7 100644 --- a/flow_table.h +++ b/flow_table.h @@ -9,6 +9,7 @@ #include "tcp_conn.h" #include "icmp_flow.h" +#include "udp_flow.h" /** * struct flow_free_cluster - Information about a cluster of free entries @@ -35,76 +36,184 @@ union flow { struct tcp_tap_conn tcp; struct tcp_splice_conn tcp_splice; struct icmp_ping_flow ping; + struct udp_flow udp; }; /* Global Flow Table */ extern unsigned flow_first_free; extern union flow flowtab[]; +/** + * flow_foreach_sidei() - 'for' type macro to step through each side of flow + * @sidei_: Takes value INISIDE, then TGTSIDE + */ +#define flow_foreach_sidei(sidei_) \ + for ((sidei_) = INISIDE; (sidei_) < SIDES; (sidei_)++) + + +/** + * flow_foreach_slot() - Step through each flow table entry + * @flow: Takes values of pointer to each flow table entry + * + * Includes FREE slots. 
+ */ +#define flow_foreach_slot(flow) \ + for ((flow) = flowtab; FLOW_IDX(flow) < FLOW_MAX; (flow)++) + +/** + * flow_foreach() - Step through each active flow + * @flow: Takes values of pointer to each active flow + */ +#define flow_foreach(flow) \ + flow_foreach_slot((flow)) \ + if ((flow)->f.state == FLOW_STATE_FREE) \ + (flow) += (flow)->free.n - 1; \ + else if ((flow)->f.state != FLOW_STATE_ACTIVE) { \ + flow_err((flow), "Bad flow state during traversal"); \ + continue; \ + } else -/** flow_idx - Index of flow from common structure +/** + * flow_foreach_of_type() - Step through each active flow of given type + * @flow: Takes values of pointer to each flow + * @type_: Type of flow to traverse + */ +#define flow_foreach_of_type(flow, type_) \ + flow_foreach((flow)) \ + if ((flow)->f.type != (type_)) \ + /* NOLINTNEXTLINE(bugprone-branch-clone) */ \ + continue; \ + else + + +/** flow_idx() - Index of flow from common structure * @f: Common flow fields pointer * * Return: index of @f in the flow table */ static inline unsigned flow_idx(const struct flow_common *f) { + /* NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */ return (union flow *)f - flowtab; } -/** FLOW_IDX - Find the index of a flow +/** FLOW_IDX() - Find the index of a flow * @f_: Flow pointer, either union flow * or protocol specific * * Return: index of @f in the flow table */ #define FLOW_IDX(f_) (flow_idx(&(f_)->f)) -/** FLOW - Flow entry at a given index +/** FLOW() - Flow entry at a given index * @idx: Flow index * * Return: pointer to entry @idx in the flow table */ #define FLOW(idx) (&flowtab[(idx)]) -/** flow_at_sidx - Flow entry for a given sidx +/** flow_at_sidx() - Flow entry for a given sidx * @sidx: Flow & side index * * Return: pointer to the corresponding flow entry, or NULL */ static inline union flow *flow_at_sidx(flow_sidx_t sidx) { - if (sidx.flow >= FLOW_MAX) + if (!flow_sidx_valid(sidx)) + return NULL; + return FLOW(sidx.flowi); +} + +/** pif_at_sidx() - Interface for a 
given flow and side + * @sidx: Flow & side index + * + * Return: pif for the flow & side given by @sidx + */ +static inline uint8_t pif_at_sidx(flow_sidx_t sidx) +{ + const union flow *flow = flow_at_sidx(sidx); + + if (!flow) + return PIF_NONE; + return flow->f.pif[sidx.sidei]; +} + +/** flowside_at_sidx() - Retrieve a specific flowside + * @sidx: Flow & side index + * + * Return: Flowside for the flow & side given by @sidx + */ +static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx) +{ + const union flow *flow = flow_at_sidx(sidx); + + if (!flow) return NULL; - return FLOW(sidx.flow); + + return &flow->f.side[sidx.sidei]; } -/** flow_sidx_t - Index of one side of a flow from common structure +/** flow_sidx_opposite() - Get the other side of the same flow + * @sidx: Flow & side index + * + * Return: sidx for the other side of the same flow as @sidx + */ +static inline flow_sidx_t flow_sidx_opposite(flow_sidx_t sidx) +{ + if (!flow_sidx_valid(sidx)) + return FLOW_SIDX_NONE; + + return (flow_sidx_t){.flowi = sidx.flowi, .sidei = !sidx.sidei}; +} + +/** flow_sidx() - Index of one side of a flow from common structure * @f: Common flow fields pointer - * @side: Which side to refer to (0 or 1) + * @sidei: Which side to refer to (0 or 1) * * Return: index of @f and @side in the flow table */ static inline flow_sidx_t flow_sidx(const struct flow_common *f, - int side) + unsigned sidei) { /* cppcheck-suppress [knownConditionTrueFalse, unmatchedSuppression] */ - ASSERT(side == !!side); + ASSERT(sidei == !!sidei); return (flow_sidx_t){ - .side = side, - .flow = flow_idx(f), + .sidei = sidei, + .flowi = flow_idx(f), }; } -/** FLOW_SIDX - Find the index of one side of a flow +/** FLOW_SIDX() - Find the index of one side of a flow * @f_: Flow pointer, either union flow * or protocol specific - * @side: Which side to index (0 or 1) + * @sidei: Which side to index (0 or 1) * * Return: index of @f and @side in the flow table */ -#define FLOW_SIDX(f_, side) 
(flow_sidx(&(f_)->f, (sidei))) union flow *flow_alloc(void); void flow_alloc_cancel(union flow *flow); +const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, + sa_family_t af, + const void *saddr, in_port_t sport, + const void *daddr, in_port_t dport); +struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, + const union sockaddr_inany *ssa, + const union inany_addr *daddr, + in_port_t dport); +const struct flowside *flow_target_af(union flow *flow, uint8_t pif, + sa_family_t af, + const void *saddr, in_port_t sport, + const void *daddr, in_port_t dport); +struct flowside *flow_target(const struct ctx *c, union flow *flow, + uint8_t proto); + +union flow *flow_set_type(union flow *flow, enum flow_type type); +#define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_) + +void flow_activate(struct flow_common *f); +#define FLOW_ACTIVATE(flow_) \ + (flow_activate(&(flow_)->f)) + #endif /* FLOW_TABLE_H */ @@ -25,6 +25,81 @@ #include "fwd.h" #include "passt.h" #include "lineread.h" +#include "flow_table.h" + +/* Ephemeral port range: values from RFC 6335 */ +static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14); +static in_port_t fwd_ephemeral_max = NUM_PORTS - 1; + +#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range" + +/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral + * + * Work out what ports the host thinks are ephemeral and record it for later + * use by fwd_port_is_ephemeral(). If we're unable to probe, assume the range + * recommended by RFC 6335. 
+ */ +void fwd_probe_ephemeral(void) +{ + char *line, *tab, *end; + struct lineread lr; + long min, max; + ssize_t len; + int fd; + + fd = open(PORT_RANGE_SYSCTL, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + warn_perror("Unable to open %s", PORT_RANGE_SYSCTL); + return; + } + + lineread_init(&lr, fd); + len = lineread_get(&lr, &line); + close(fd); + + if (len < 0) + goto parse_err; + + tab = strchr(line, '\t'); + if (!tab) + goto parse_err; + *tab = '\0'; + + errno = 0; + min = strtol(line, &end, 10); + if (*end || errno) + goto parse_err; + + errno = 0; + max = strtol(tab + 1, &end, 10); + if (*end || errno) + goto parse_err; + + if (min < 0 || min >= (long)NUM_PORTS || + max < 0 || max >= (long)NUM_PORTS) + goto parse_err; + + fwd_ephemeral_min = min; + fwd_ephemeral_max = max; + + return; + +parse_err: + warn("Unable to parse %s", PORT_RANGE_SYSCTL); +} + +/** + * fwd_port_is_ephemeral() - Is port number ephemeral? + * @port: Port number + * + * Return: true if @port is ephemeral, that is may be allocated by the kernel as + * a local port for outgoing connections or datagrams, but should not be + * used for binding services to. 
+ */ +bool fwd_port_is_ephemeral(in_port_t port) +{ + return (port >= fwd_ephemeral_min) && (port <= fwd_ephemeral_max); +} /* See enum in kernel's include/net/tcp_states.h */ #define UDP_LISTEN 0x07 @@ -38,7 +113,7 @@ * @exclude: Bitmap of ports to exclude from setting (and clear) * * #syscalls:pasta lseek - * #syscalls:pasta ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek + * #syscalls:pasta ppc64le:_llseek ppc64:_llseek arm:_llseek */ static void procfs_scan_listen(int fd, unsigned int lstate, uint8_t *map, const uint8_t *exclude) @@ -52,7 +127,7 @@ static void procfs_scan_listen(int fd, unsigned int lstate, return; if (lseek(fd, 0, SEEK_SET)) { - warn("lseek() failed on /proc/net file: %s", strerror(errno)); + warn_perror("lseek() failed on /proc/net file"); return; } @@ -128,18 +203,18 @@ void fwd_scan_ports_init(struct ctx *c) c->tcp.fwd_in.scan4 = c->tcp.fwd_in.scan6 = -1; c->tcp.fwd_out.scan4 = c->tcp.fwd_out.scan6 = -1; - c->udp.fwd_in.f.scan4 = c->udp.fwd_in.f.scan6 = -1; - c->udp.fwd_out.f.scan4 = c->udp.fwd_out.f.scan6 = -1; + c->udp.fwd_in.scan4 = c->udp.fwd_in.scan6 = -1; + c->udp.fwd_out.scan4 = c->udp.fwd_out.scan6 = -1; if (c->tcp.fwd_in.mode == FWD_AUTO) { c->tcp.fwd_in.scan4 = open_in_ns(c, "/proc/net/tcp", flags); c->tcp.fwd_in.scan6 = open_in_ns(c, "/proc/net/tcp6", flags); fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out); } - if (c->udp.fwd_in.f.mode == FWD_AUTO) { - c->udp.fwd_in.f.scan4 = open_in_ns(c, "/proc/net/udp", flags); - c->udp.fwd_in.f.scan6 = open_in_ns(c, "/proc/net/udp6", flags); - fwd_scan_ports_udp(&c->udp.fwd_in.f, &c->udp.fwd_out.f, + if (c->udp.fwd_in.mode == FWD_AUTO) { + c->udp.fwd_in.scan4 = open_in_ns(c, "/proc/net/udp", flags); + c->udp.fwd_in.scan6 = open_in_ns(c, "/proc/net/udp6", flags); + fwd_scan_ports_udp(&c->udp.fwd_in, &c->udp.fwd_out, &c->tcp.fwd_in, &c->tcp.fwd_out); } if (c->tcp.fwd_out.mode == FWD_AUTO) { @@ -147,10 +222,335 @@ void fwd_scan_ports_init(struct ctx *c) c->tcp.fwd_out.scan6 = 
open("/proc/net/tcp6", flags); fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in); } - if (c->udp.fwd_out.f.mode == FWD_AUTO) { - c->udp.fwd_out.f.scan4 = open("/proc/net/udp", flags); - c->udp.fwd_out.f.scan6 = open("/proc/net/udp6", flags); - fwd_scan_ports_udp(&c->udp.fwd_out.f, &c->udp.fwd_in.f, + if (c->udp.fwd_out.mode == FWD_AUTO) { + c->udp.fwd_out.scan4 = open("/proc/net/udp", flags); + c->udp.fwd_out.scan6 = open("/proc/net/udp6", flags); + fwd_scan_ports_udp(&c->udp.fwd_out, &c->udp.fwd_in, &c->tcp.fwd_out, &c->tcp.fwd_in); } } + +/** + * is_dns_flow() - Determine if flow appears to be a DNS request + * @proto: Protocol (IP L4 protocol number) + * @ini: Flow address information of the initiating side + * + * Return: true if the flow appears to be directed at a dns server, that is a + * TCP or UDP flow to port 53 (domain) or port 853 (domain-s) + */ +static bool is_dns_flow(uint8_t proto, const struct flowside *ini) +{ + return ((proto == IPPROTO_UDP) || (proto == IPPROTO_TCP)) && + ((ini->oport == 53) || (ini->oport == 853)); +} + +/** + * fwd_guest_accessible4() - Is IPv4 address guest-accessible + * @c: Execution context + * @addr: Host visible IPv4 address + * + * Return: true if @addr on the host is accessible to the guest without + * translation, false otherwise + */ +static bool fwd_guest_accessible4(const struct ctx *c, + const struct in_addr *addr) +{ + if (IN4_IS_ADDR_LOOPBACK(addr)) + return false; + + /* In socket interfaces 0.0.0.0 generally means "any" or unspecified, + * however on the wire it can mean "this host on this network". Since + * that has a different meaning for host and guest, we can't let it + * through untranslated. 
+ */ + if (IN4_IS_ADDR_UNSPECIFIED(addr)) + return false; + + /* For IPv4, addr_seen is initialised to addr, so is always a valid + * address + */ + if (IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr) || + IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr_seen)) + return false; + + return true; +} + +/** + * fwd_guest_accessible6() - Is IPv6 address guest-accessible + * @c: Execution context + * @addr: Host visible IPv6 address + * + * Return: true if @addr on the host is accessible to the guest without + * translation, false otherwise + */ +static bool fwd_guest_accessible6(const struct ctx *c, + const struct in6_addr *addr) +{ + if (IN6_IS_ADDR_LOOPBACK(addr)) + return false; + + if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr)) + return false; + + /* For IPv6, addr_seen starts unspecified, because we don't know what LL + * address the guest will take until we see it. Only check against it + * if it has been set to a real address. + */ + if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen) && + IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr_seen)) + return false; + + return true; +} + +/** + * fwd_guest_accessible() - Is IPv[46] address guest-accessible + * @c: Execution context + * @addr: Host visible IPv[46] address + * + * Return: true if @addr on the host is accessible to the guest without + * translation, false otherwise + */ +static bool fwd_guest_accessible(const struct ctx *c, + const union inany_addr *addr) +{ + const struct in_addr *a4 = inany_v4(addr); + + if (a4) + return fwd_guest_accessible4(c, a4); + + return fwd_guest_accessible6(c, &addr->a6); +} + +/** + * nat_outbound() - Apply address translation for outbound (TAP to HOST) + * @c: Execution context + * @addr: Input address (as seen on TAP interface) + * @translated: Output address (as seen on HOST interface) + * + * Only handles translations that depend *only* on the address. Anything + * related to specific ports or flows is handled elsewhere. 
+ */ +static void nat_outbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated) +{ + if (inany_equals4(addr, &c->ip4.map_host_loopback)) + *translated = inany_loopback4; + else if (inany_equals6(addr, &c->ip6.map_host_loopback)) + *translated = inany_loopback6; + else if (inany_equals4(addr, &c->ip4.map_guest_addr)) + *translated = inany_from_v4(c->ip4.addr); + else if (inany_equals6(addr, &c->ip6.map_guest_addr)) + translated->a6 = c->ip6.addr; + else + *translated = *addr; +} + +/** + * fwd_nat_from_tap() - Determine to forward a flow from the tap interface + * @c: Execution context + * @proto: Protocol (IP L4 protocol number) + * @ini: Flow address information of the initiating side + * @tgt: Flow address information on the target side (updated) + * + * Return: pif of the target interface to forward the flow to, PIF_NONE if the + * flow cannot or should not be forwarded at all. + */ +uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, + const struct flowside *ini, struct flowside *tgt) +{ + if (is_dns_flow(proto, ini) && + inany_equals4(&ini->oaddr, &c->ip4.dns_match)) + tgt->eaddr = inany_from_v4(c->ip4.dns_host); + else if (is_dns_flow(proto, ini) && + inany_equals6(&ini->oaddr, &c->ip6.dns_match)) + tgt->eaddr.a6 = c->ip6.dns_host; + else + nat_outbound(c, &ini->oaddr, &tgt->eaddr); + + tgt->eport = ini->oport; + + /* The relevant addr_out controls the host side source address. This + * may be unspecified, which allows the kernel to pick an address. 
+ */ + if (inany_v4(&tgt->eaddr)) + tgt->oaddr = inany_from_v4(c->ip4.addr_out); + else + tgt->oaddr.a6 = c->ip6.addr_out; + + /* Let the kernel pick a host side source port */ + tgt->oport = 0; + if (proto == IPPROTO_UDP) { + /* But for UDP we preserve the source port */ + tgt->oport = ini->eport; + } + + return PIF_HOST; +} + +/** + * fwd_nat_from_splice() - Determine to forward a flow from the splice interface + * @c: Execution context + * @proto: Protocol (IP L4 protocol number) + * @ini: Flow address information of the initiating side + * @tgt: Flow address information on the target side (updated) + * + * Return: pif of the target interface to forward the flow to, PIF_NONE if the + * flow cannot or should not be forwarded at all. + */ +uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, + const struct flowside *ini, struct flowside *tgt) +{ + if (!inany_is_loopback(&ini->eaddr) || + (!inany_is_loopback(&ini->oaddr) && !inany_is_unspecified(&ini->oaddr))) { + char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN]; + + debug("Non loopback address on %s: [%s]:%hu -> [%s]:%hu", + pif_name(PIF_SPLICE), + inany_ntop(&ini->eaddr, estr, sizeof(estr)), ini->eport, + inany_ntop(&ini->oaddr, fstr, sizeof(fstr)), ini->oport); + return PIF_NONE; + } + + if (inany_v4(&ini->eaddr)) + tgt->eaddr = inany_loopback4; + else + tgt->eaddr = inany_loopback6; + + /* Preserve the specific loopback address used, but let the kernel pick + * a source port on the target side + */ + tgt->oaddr = ini->eaddr; + tgt->oport = 0; + + tgt->eport = ini->oport; + if (proto == IPPROTO_TCP) + tgt->eport += c->tcp.fwd_out.delta[tgt->eport]; + else if (proto == IPPROTO_UDP) + tgt->eport += c->udp.fwd_out.delta[tgt->eport]; + + /* Let the kernel pick a host side source port */ + tgt->oport = 0; + if (proto == IPPROTO_UDP) + /* But for UDP preserve the source port */ + tgt->oport = ini->eport; + + return PIF_HOST; +} + +/** + * nat_inbound() - Apply address translation for inbound (HOST to 
TAP) + * @c: Execution context + * @addr: Input address (as seen on HOST interface) + * @translated: Output address (as seen on TAP interface) + * + * Return: true on success, false if it couldn't translate the address + * + * Only handles translations that depend *only* on the address. Anything + * related to specific ports or flows is handled elsewhere. + */ +bool nat_inbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated) +{ + if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) && + inany_equals4(addr, &in4addr_loopback)) { + /* Specifically 127.0.0.1, not 127.0.0.0/8 */ + *translated = inany_from_v4(c->ip4.map_host_loopback); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) && + inany_equals6(addr, &in6addr_loopback)) { + translated->a6 = c->ip6.map_host_loopback; + } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) && + inany_equals4(addr, &c->ip4.addr)) { + *translated = inany_from_v4(c->ip4.map_guest_addr); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) && + inany_equals6(addr, &c->ip6.addr)) { + translated->a6 = c->ip6.map_guest_addr; + } else if (fwd_guest_accessible(c, addr)) { + *translated = *addr; + } else { + return false; + } + + return true; +} + +/** + * fwd_nat_from_host() - Determine to forward a flow from the host interface + * @c: Execution context + * @proto: Protocol (IP L4 protocol number) + * @ini: Flow address information of the initiating side + * @tgt: Flow address information on the target side (updated) + * + * Return: pif of the target interface to forward the flow to, PIF_NONE if the + * flow cannot or should not be forwarded at all. 
+ */ +uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, + const struct flowside *ini, struct flowside *tgt) +{ + /* Common for spliced and non-spliced cases */ + tgt->eport = ini->oport; + if (proto == IPPROTO_TCP) + tgt->eport += c->tcp.fwd_in.delta[tgt->eport]; + else if (proto == IPPROTO_UDP) + tgt->eport += c->udp.fwd_in.delta[tgt->eport]; + + if (!c->no_splice && inany_is_loopback(&ini->eaddr) && + (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) { + /* spliceable */ + + /* The traffic will go over the guest's 'lo' interface, but by + * default use its external address, so we don't inadvertently + * expose services that listen only on the guest's loopback + * address. That can be overridden by --host-lo-to-ns-lo which + * will instead forward to the loopback address in the guest. + * + * In either case, let the kernel pick the source address to + * match. + */ + if (inany_v4(&ini->eaddr)) { + if (c->host_lo_to_ns_lo) + tgt->eaddr = inany_loopback4; + else + tgt->eaddr = inany_from_v4(c->ip4.addr_seen); + tgt->oaddr = inany_any4; + } else { + if (c->host_lo_to_ns_lo) + tgt->eaddr = inany_loopback6; + else + tgt->eaddr.a6 = c->ip6.addr_seen; + tgt->oaddr = inany_any6; + } + + /* Let the kernel pick source port */ + tgt->oport = 0; + if (proto == IPPROTO_UDP) + /* But for UDP preserve the source port */ + tgt->oport = ini->eport; + + return PIF_SPLICE; + } + + if (!nat_inbound(c, &ini->eaddr, &tgt->oaddr)) { + if (inany_v4(&ini->eaddr)) { + if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr)) + /* No source address we can use */ + return PIF_NONE; + tgt->oaddr = inany_from_v4(c->ip4.our_tap_addr); + } else { + tgt->oaddr.a6 = c->ip6.our_tap_ll; + } + } + tgt->oport = ini->eport; + + if (inany_v4(&tgt->oaddr)) { + tgt->eaddr = inany_from_v4(c->ip4.addr_seen); + } else { + if (inany_is_linklocal6(&tgt->oaddr)) + tgt->eaddr.a6 = c->ip6.addr_ll_seen; + else + tgt->eaddr.a6 = c->ip6.addr_seen; + } + + return PIF_TAP; +} @@ -7,10 +7,17 @@ #ifndef FWD_H #define 
FWD_H +union inany_addr; +struct flowside; + /* Number of ports for both TCP and UDP */ #define NUM_PORTS (1U << 16) +void fwd_probe_ephemeral(void); +bool fwd_port_is_ephemeral(in_port_t port); + enum fwd_ports_mode { + FWD_UNSET = 0, FWD_SPEC = 1, FWD_NONE, FWD_AUTO, @@ -41,4 +48,13 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev, const struct fwd_ports *tcp_rev); void fwd_scan_ports_init(struct ctx *c); +bool nat_inbound(const struct ctx *c, const union inany_addr *addr, + union inany_addr *translated); +uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, + const struct flowside *ini, struct flowside *tgt); +uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, + const struct flowside *ini, struct flowside *tgt); +uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, + const struct flowside *ini, struct flowside *tgt); + #endif /* FWD_H */ diff --git a/hooks/pre-push b/hooks/pre-push index 33a2052..8dbfa5f 100755 --- a/hooks/pre-push +++ b/hooks/pre-push @@ -56,6 +56,7 @@ cd .. make pkgs scp passt passt.avx2 passt.1 qrap qrap.1 "${USER_HOST}:${BIN}" scp pasta pasta.avx2 pasta.1 "${USER_HOST}:${BIN}" +scp passt-repair passt-repair.1 "${USER_HOST}:${BIN}" ssh "${USER_HOST}" "rm -f ${BIN}/*.deb" ssh "${USER_HOST}" "rm -f ${BIN}/*.rpm" @@ -45,14 +45,23 @@ #define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ #define ICMP_NUM_IDS (1U << 16) -/* Sides of a flow as we use them for ping streams */ -#define SOCKSIDE 0 -#define TAPSIDE 1 +/** + * ping_at_sidx() - Get ping specific flow at given sidx + * @sidx: Flow and side to retrieve + * + * Return: ping specific flow at @sidx, or NULL of @sidx is invalid. 
Asserts if + * the flow at @sidx is not FLOW_PING4 or FLOW_PING6 + */ +static struct icmp_ping_flow *ping_at_sidx(flow_sidx_t sidx) +{ + union flow *flow = flow_at_sidx(sidx); -#define PINGF(idx) (&(FLOW(idx)->ping)) + if (!flow) + return NULL; -/* Indexed by ICMP echo identifier */ -static struct icmp_ping_flow *icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS]; + ASSERT(flow->f.type == FLOW_PING4 || flow->f.type == FLOW_PING6); + return &flow->ping; +} /** * icmp_sock_handler() - Handle new data from ICMP or ICMPv6 socket @@ -61,7 +70,8 @@ static struct icmp_ping_flow *icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS]; */ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) { - struct icmp_ping_flow *pingf = PINGF(ref.flowside.flow); + struct icmp_ping_flow *pingf = ping_at_sidx(ref.flowside); + const struct flowside *ini = &pingf->f.side[INISIDE]; union sockaddr_inany sr; socklen_t sl = sizeof(sr); char buf[USHRT_MAX]; @@ -75,7 +85,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl); if (n < 0) { - flow_err(pingf, "recvfrom() error: %s", strerror(errno)); + flow_perror(pingf, "recvfrom() error"); return; } @@ -87,7 +97,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) goto unexpected; /* Adjust packet back to guest-side ID */ - ih4->un.echo.id = htons(pingf->id); + ih4->un.echo.id = htons(ini->eport); seq = ntohs(ih4->un.echo.sequence); } else if (pingf->f.type == FLOW_PING6) { struct icmp6hdr *ih6 = (struct icmp6hdr *)buf; @@ -97,7 +107,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) goto unexpected; /* Adjust packet back to guest-side ID */ - ih6->icmp6_identifier = htons(pingf->id); + ih6->icmp6_identifier = htons(ini->eport); seq = ntohs(ih6->icmp6_sequence); } else { ASSERT(0); @@ -112,13 +122,20 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) } flow_dbg(pingf, "echo reply to tap, ID: %"PRIu16", seq: %"PRIu16, - pingf->id, seq); + 
ini->eport, seq); + + if (pingf->f.type == FLOW_PING4) { + const struct in_addr *saddr = inany_v4(&ini->oaddr); + const struct in_addr *daddr = inany_v4(&ini->eaddr); + + ASSERT(saddr && daddr); /* Must have IPv4 addresses */ + tap_icmp4_send(c, *saddr, *daddr, buf, n); + } else if (pingf->f.type == FLOW_PING6) { + const struct in6_addr *saddr = &ini->oaddr.a6; + const struct in6_addr *daddr = &ini->eaddr.a6; - if (pingf->f.type == FLOW_PING4) - tap_icmp4_send(c, sr.sa4.sin_addr, tap_ip4_daddr(c), buf, n); - else if (pingf->f.type == FLOW_PING6) - tap_icmp6_send(c, &sr.sa6.sin6_addr, - tap_ip6_daddr(c, &sr.sa6.sin6_addr), buf, n); + tap_icmp6_send(c, saddr, daddr, buf, n); + } return; unexpected: @@ -133,56 +150,54 @@ unexpected: static void icmp_ping_close(const struct ctx *c, const struct icmp_ping_flow *pingf) { - uint16_t id = pingf->id; - - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL); + epoll_del(c, pingf->sock); close(pingf->sock); - - if (pingf->f.type == FLOW_PING4) - icmp_id_map[V4][id] = NULL; - else - icmp_id_map[V6][id] = NULL; + flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE)); } /** * icmp_ping_new() - Prepare a new ping socket for a new id * @c: Execution context - * @id_sock: Pointer to ping flow entry slot in icmp_id_map[] to update * @af: Address family, AF_INET or AF_INET6 * @id: ICMP id for the new socket + * @saddr: Source address + * @daddr: Destination address * * Return: Newly opened ping flow, or NULL on failure */ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, - struct icmp_ping_flow **id_sock, - sa_family_t af, uint16_t id) + sa_family_t af, uint16_t id, + const void *saddr, const void *daddr) { + uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6; uint8_t flowtype = af == AF_INET ? 
FLOW_PING4 : FLOW_PING6; union epoll_ref ref = { .type = EPOLL_TYPE_PING }; union flow *flow = flow_alloc(); struct icmp_ping_flow *pingf; - const void *bind_addr; - const char *bind_if; + const struct flowside *tgt; if (!flow) return NULL; - pingf = FLOW_START(flow, flowtype, ping, TAPSIDE); - - pingf->seq = -1; - pingf->id = id; + flow_initiate_af(flow, PIF_TAP, af, saddr, id, daddr, id); + if (!(tgt = flow_target(c, flow, proto))) + goto cancel; - if (af == AF_INET) { - bind_addr = &c->ip4.addr_out; - bind_if = c->ip4.ifname_out; - } else { - bind_addr = &c->ip6.addr_out; - bind_if = c->ip6.ifname_out; + if (flow->f.pif[TGTSIDE] != PIF_HOST) { + flow_err(flow, "No support for forwarding %s from %s to %s", + proto == IPPROTO_ICMP ? "ICMP" : "ICMPv6", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[TGTSIDE])); + goto cancel; } - ref.flowside = FLOW_SIDX(flow, SOCKSIDE); - pingf->sock = sock_l4(c, af, flow_proto[flowtype], bind_addr, bind_if, - 0, ref.data); + pingf = FLOW_SET_TYPE(flow, flowtype, ping); + + pingf->seq = -1; + + ref.flowside = FLOW_SIDX(flow, TGTSIDE); + pingf->sock = flowside_sock_l4(c, EPOLL_TYPE_PING, PIF_HOST, + tgt, ref.data); if (pingf->sock < 0) { warn("Cannot open \"ping\" socket. You might need to:"); @@ -196,7 +211,9 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, flow_dbg(pingf, "new socket %i for echo ID %"PRIu16, pingf->sock, id); - *id_sock = pingf; + flow_hash_insert(c, FLOW_SIDX(pingf, INISIDE)); + + FLOW_ACTIVATE(pingf); return pingf; @@ -221,11 +238,14 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now) { - union sockaddr_inany sa = { .sa_family = af }; - const socklen_t sl = af == AF_INET ? 
sizeof(sa.sa4) : sizeof(sa.sa6); - struct icmp_ping_flow *pingf, **id_sock; + struct icmp_ping_flow *pingf; + const struct flowside *tgt; + union sockaddr_inany sa; + size_t dlen, l4len; uint16_t id, seq; - size_t plen; + union flow *flow; + uint8_t proto; + socklen_t sl; void *pkt; (void)saddr; @@ -234,49 +254,53 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (af == AF_INET) { const struct icmphdr *ih; - if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &plen))) + if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen))) return 1; ih = (struct icmphdr *)pkt; - plen += sizeof(*ih); + l4len = dlen + sizeof(*ih); if (ih->type != ICMP_ECHO) return 1; + proto = IPPROTO_ICMP; id = ntohs(ih->un.echo.id); - id_sock = &icmp_id_map[V4][id]; seq = ntohs(ih->un.echo.sequence); - sa.sa4.sin_addr = *(struct in_addr *)daddr; } else if (af == AF_INET6) { const struct icmp6hdr *ih; - if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &plen))) + if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &dlen))) return 1; ih = (struct icmp6hdr *)pkt; - plen += sizeof(*ih); + l4len = dlen + sizeof(*ih); if (ih->icmp6_type != ICMPV6_ECHO_REQUEST) return 1; + proto = IPPROTO_ICMPV6; id = ntohs(ih->icmp6_identifier); - id_sock = &icmp_id_map[V6][id]; seq = ntohs(ih->icmp6_sequence); - sa.sa6.sin6_addr = *(struct in6_addr *)daddr; - sa.sa6.sin6_scope_id = c->ifi6; } else { ASSERT(0); } - if (!(pingf = *id_sock)) - if (!(pingf = icmp_ping_new(c, id_sock, af, id))) - return 1; + flow = flow_at_sidx(flow_lookup_af(c, proto, PIF_TAP, + af, saddr, daddr, id, id)); + if (flow) + pingf = &flow->ping; + else if (!(pingf = icmp_ping_new(c, af, id, saddr, daddr))) + return 1; + + tgt = &pingf->f.side[TGTSIDE]; + + ASSERT(flow_proto[pingf->f.type] == proto); pingf->ts = now->tv_sec; - if (sendto(pingf->sock, pkt, plen, MSG_NOSIGNAL, &sa.sa, sl) < 0) { - flow_dbg(pingf, "failed to relay request to socket: %s", - strerror(errno)); + pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, 0); + if 
(sendto(pingf->sock, pkt, l4len, MSG_NOSIGNAL, &sa.sa, sl) < 0) { + flow_dbg_perror(pingf, "failed to relay request to socket"); } else { flow_dbg(pingf, "echo request to socket, ID: %"PRIu16", seq: %"PRIu16, @@ -289,16 +313,14 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, /** * icmp_ping_timer() - Handler for timed events related to a given flow * @c: Execution context - * @flow: flow table entry to check for timeout + * @pingf: Ping flow to check for timeout * @now: Current timestamp * * Return: true if the flow is ready to free, false otherwise */ -bool icmp_ping_timer(const struct ctx *c, union flow *flow, +bool icmp_ping_timer(const struct ctx *c, const struct icmp_ping_flow *pingf, const struct timespec *now) { - const struct icmp_ping_flow *pingf = &flow->ping; - if (now->tv_sec - pingf->ts <= ICMP_ECHO_TIMEOUT) return false; diff --git a/icmp_flow.h b/icmp_flow.h index 5a2eed9..fb93801 100644 --- a/icmp_flow.h +++ b/icmp_flow.h @@ -13,7 +13,6 @@ * @seq: Last sequence number sent to tap, host order, -1: not sent yet * @sock: "ping" socket * @ts: Last associated activity from tap, seconds - * @id: ICMP id for the flow as seen by the guest */ struct icmp_ping_flow { /* Must be first element */ @@ -22,10 +21,9 @@ struct icmp_ping_flow { int seq; int sock; time_t ts; - uint16_t id; }; -bool icmp_ping_timer(const struct ctx *c, union flow *flow, +bool icmp_ping_timer(const struct ctx *c, const struct icmp_ping_flow *pingf, const struct timespec *now); #endif /* ICMP_FLOW_H */ @@ -17,21 +17,8 @@ #include "siphash.h" #include "inany.h" -const union inany_addr inany_loopback4 = { - .v4mapped = { - .zero = { 0 }, - .one = { 0xff, 0xff, }, - .a4 = IN4ADDR_LOOPBACK_INIT, - }, -}; - -const union inany_addr inany_any4 = { - .v4mapped = { - .zero = { 0 }, - .one = { 0xff, 0xff, }, - .a4 = IN4ADDR_ANY_INIT, - }, -}; +const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT); +const union inany_addr inany_any4 = 
INANY_INIT4(IN4ADDR_ANY_INIT); /** inany_ntop - Convert an IPv[46] address to text format * @src: IPv[46] address @@ -49,3 +36,23 @@ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size) return inet_ntop(AF_INET6, &src->a6, dst, size); } + +/** inany_pton - Parse an IPv[46] address from text format + * @src: IPv[46] address + * @dst: output buffer, filled with parsed address + * + * Return: On success, 1, if no parseable address is found, 0 + */ +int inany_pton(const char *src, union inany_addr *dst) +{ + if (inet_pton(AF_INET, src, &dst->v4mapped.a4)) { + memset(&dst->v4mapped.zero, 0, sizeof(dst->v4mapped.zero)); + memset(&dst->v4mapped.one, 0xff, sizeof(dst->v4mapped.one)); + return 1; + } + + if (inet_pton(AF_INET6, src, &dst->a6)) + return 1; + + return 0; +} @@ -43,6 +43,17 @@ extern const union inany_addr inany_any4; #define in4addr_loopback (inany_loopback4.v4mapped.a4) #define in4addr_any (inany_any4.v4mapped.a4) +#define INANY_INIT4(a4init) { \ + .v4mapped = { \ + .zero = { 0 }, \ + .one = { 0xff, 0xff }, \ + .a4 = a4init, \ + }, \ + } + +#define inany_from_v4(a4) \ + ((union inany_addr)INANY_INIT4((a4))) + /** union sockaddr_inany - Either a sockaddr_in or a sockaddr_in6 * @sa_family: Address family, AF_INET or AF_INET6 * @sa: Plain struct sockaddr (useful to avoid casts) @@ -79,6 +90,54 @@ static inline bool inany_equals(const union inany_addr *a, return IN6_ARE_ADDR_EQUAL(&a->a6, &b->a6); } +/** inany_equals4 - Compare an IPv[46] address to an IPv4 address + * @a: IPv[46] addresses + * @b: IPv4 address + * + * Return: true if @a and @b are the same address + */ +static inline bool inany_equals4(const union inany_addr *a, + const struct in_addr *b) +{ + const struct in_addr *a4 = inany_v4(a); + + return a4 && IN4_ARE_ADDR_EQUAL(a4, b); +} + +/** inany_equals6 - Compare an IPv[46] address to an IPv6 address + * @a: IPv[46] addresses + * @b: IPv6 address + * + * Return: true if @a and @b are the same address + */ +static inline 
bool inany_equals6(const union inany_addr *a, + const struct in6_addr *b) +{ + return IN6_ARE_ADDR_EQUAL(&a->a6, b); +} + +/** inany_is_loopback4() - Check if address is IPv4 loopback + * @a: IPv[46] address + * + * Return: true if @a is in 127.0.0.1/8 + */ +static inline bool inany_is_loopback4(const union inany_addr *a) +{ + const struct in_addr *v4 = inany_v4(a); + + return v4 && IN4_IS_ADDR_LOOPBACK(v4); +} + +/** inany_is_loopback6() - Check if address is IPv6 loopback + * @a: IPv[46] address + * + * Return: true if @a is in ::1 + */ +static inline bool inany_is_loopback6(const union inany_addr *a) +{ + return IN6_IS_ADDR_LOOPBACK(&a->a6); +} + /** inany_is_loopback() - Check if address is loopback * @a: IPv[46] address * @@ -86,9 +145,29 @@ static inline bool inany_equals(const union inany_addr *a, */ static inline bool inany_is_loopback(const union inany_addr *a) { + return inany_is_loopback4(a) || inany_is_loopback6(a); +} + +/** inany_is_unspecified4() - Check if address is unspecified IPv4 + * @a: IPv[46] address + * + * Return: true if @a is 0.0.0.0 + */ +static inline bool inany_is_unspecified4(const union inany_addr *a) +{ const struct in_addr *v4 = inany_v4(a); - return IN6_IS_ADDR_LOOPBACK(&a->a6) || (v4 && IN4_IS_ADDR_LOOPBACK(v4)); + return v4 && IN4_IS_ADDR_UNSPECIFIED(v4); +} + +/** inany_is_unspecified6() - Check if address is unspecified IPv6 + * @a: IPv[46] address + * + * Return: true if @a is :: + */ +static inline bool inany_is_unspecified6(const union inany_addr *a) +{ + return IN6_IS_ADDR_UNSPECIFIED(&a->a6); } /** inany_is_unspecified() - Check if address is unspecified @@ -98,10 +177,19 @@ static inline bool inany_is_loopback(const union inany_addr *a) */ static inline bool inany_is_unspecified(const union inany_addr *a) { - const struct in_addr *v4 = inany_v4(a); + return inany_is_unspecified4(a) || inany_is_unspecified6(a); +} - return IN6_IS_ADDR_UNSPECIFIED(&a->a6) || - (v4 && IN4_IS_ADDR_UNSPECIFIED(v4)); +/* FIXME: consider 
handling of IPv4 link-local addresses */ + +/** inany_is_linklocal6() - Check if address is link-local IPv6 + * @a: IPv[46] address + * + * Return: true if @a is in fe80::/10 (IPv6 link local unicast) + */ +static inline bool inany_is_linklocal6(const union inany_addr *a) +{ + return IN6_IS_ADDR_LINKLOCAL(&a->a6); } /** inany_is_multicast() - Check if address is multicast or broadcast @@ -123,7 +211,6 @@ static inline bool inany_is_multicast(const union inany_addr *a) * * Return: true if @a is specified and a unicast address */ -/* cppcheck-suppress unusedFunction */ static inline bool inany_is_unicast(const union inany_addr *a) { return !inany_is_unspecified(a) && !inany_is_multicast(a); @@ -150,23 +237,30 @@ static inline void inany_from_af(union inany_addr *aa, } /** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr - * @aa: Pointer to store IPv[46] address + * @dst: Pointer to store IPv[46] address (output) * @port: Pointer to store port number, host order - * @addr: AF_INET or AF_INET6 socket address + * @addr: Socket address + * + * Return: 0 on success, -1 on error (bad address family) */ -static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port, - const union sockaddr_inany *sa) +static inline int inany_from_sockaddr(union inany_addr *dst, in_port_t *port, + const void *addr) { + const union sockaddr_inany *sa = (const union sockaddr_inany *)addr; + if (sa->sa_family == AF_INET6) { - inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr); + inany_from_af(dst, AF_INET6, &sa->sa6.sin6_addr); *port = ntohs(sa->sa6.sin6_port); - } else if (sa->sa_family == AF_INET) { - inany_from_af(aa, AF_INET, &sa->sa4.sin_addr); + return 0; + } + + if (sa->sa_family == AF_INET) { + inany_from_af(dst, AF_INET, &sa->sa4.sin_addr); *port = ntohs(sa->sa4.sin_port); - } else { - /* Not valid to call with other address families */ - ASSERT(0); + return 0; } + + return -1; } /** inany_siphash_feed- Fold IPv[46] address into an in-progress 
siphash @@ -183,5 +277,6 @@ static inline void inany_siphash_feed(struct siphash_state *state, #define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN) const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size); +int inany_pton(const char *src, union inany_addr *dst); #endif /* INANY_H */ @@ -26,7 +26,8 @@ #include "iov.h" -/* iov_skip_bytes() - Skip leading bytes of an IO vector +/** + * iov_skip_bytes() - Skip leading bytes of an IO vector * @iov: IO vector * @n: Number of entries in @iov * @skip: Number of leading bytes of @iov to skip @@ -56,8 +57,8 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, } /** - * iov_from_buf - Copy data from a buffer to an I/O vector (struct iovec) - * efficiently. + * iov_from_buf() - Copy data from a buffer to an I/O vector (struct iovec) + * efficiently. * * @iov: Pointer to the array of struct iovec describing the * scatter/gather I/O vector. @@ -68,7 +69,6 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, * * Returns: The number of bytes successfully copied. */ -/* cppcheck-suppress unusedFunction */ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, const void *buf, size_t bytes) { @@ -97,8 +97,8 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, } /** - * iov_to_buf - Copy data from a scatter/gather I/O vector (struct iovec) to - * a buffer efficiently. + * iov_to_buf() - Copy data from a scatter/gather I/O vector (struct iovec) to + * a buffer efficiently. * * @iov: Pointer to the array of struct iovec describing the scatter/gather * I/O vector. @@ -137,8 +137,8 @@ size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, } /** - * iov_size - Calculate the total size of a scatter/gather I/O vector - * (struct iovec). + * iov_size() - Calculate the total size of a scatter/gather I/O vector + * (struct iovec). * * @iov: Pointer to the array of struct iovec describing the * scatter/gather I/O vector. 
@@ -158,40 +158,93 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt) } /** - * iov_copy - Copy data from one scatter/gather I/O vector (struct iovec) to - * another. - * - * @dst_iov: Pointer to the destination array of struct iovec describing - * the scatter/gather I/O vector to copy to. - * @dst_iov_cnt: Number of elements in the destination iov array. - * @iov: Pointer to the source array of struct iovec describing - * the scatter/gather I/O vector to copy from. - * @iov_cnt: Number of elements in the source iov array. - * @offset: Offset within the source iov from where copying should start. - * @bytes: Total number of bytes to copy from iov to dst_iov. - * - * Returns: The number of elements successfully copied to the destination - * iov array. + * iov_tail_prune() - Remove any unneeded buffers from an IOV tail + * @tail: IO vector tail (modified) + * + * If an IOV tail's offset is large enough, it may not include any bytes from + * the first (or first several) buffers in the underlying IO vector. Modify the + * tail's representation so it contains the same logical bytes, but only + * includes buffers that are actually needed. This will avoid stepping through + * unnecessary elements of the underlying IO vector on future operations. 
+ * + * Return: true if the tail still contains any bytes, otherwise false */ -/* cppcheck-suppress unusedFunction */ -unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt, - const struct iovec *iov, size_t iov_cnt, - size_t offset, size_t bytes) +bool iov_tail_prune(struct iov_tail *tail) { - unsigned int i, j; + size_t i; - i = iov_skip_bytes(iov, iov_cnt, offset, &offset); + i = iov_skip_bytes(tail->iov, tail->cnt, tail->off, &tail->off); + tail->iov += i; + tail->cnt -= i; - /* copying data */ - for (j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) { - size_t len = MIN(bytes, iov[i].iov_len - offset); + return !!tail->cnt; +} - dst_iov[j].iov_base = (char *)iov[i].iov_base + offset; - dst_iov[j].iov_len = len; - j++; - bytes -= len; - offset = 0; - } +/** + * iov_tail_size - Calculate the total size of an IO vector tail + * @tail: IO vector tail + * + * Returns: The total size in bytes. + */ +size_t iov_tail_size(struct iov_tail *tail) +{ + iov_tail_prune(tail); + return iov_size(tail->iov, tail->cnt) - tail->off; +} + +/** + * iov_peek_header_() - Get pointer to a header from an IOV tail + * @tail: IOV tail to get header from + * @len: Length of header to get, in bytes + * @align: Required alignment of header, in bytes + * + * @tail may be pruned, but will represent the same bytes as before. + * + * Returns: Pointer to the first @len logical bytes of the tail, NULL if that + * overruns the IO vector, is not contiguous or doesn't have the + * requested alignment. 
+ */ +/* cppcheck-suppress [staticFunction,unmatchedSuppression] */ +void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align) +{ + char *p; + + if (!iov_tail_prune(tail)) + return NULL; /* Nothing left */ + + if (tail->off + len < tail->off) + return NULL; /* Overflow */ + + if (tail->off + len > tail->iov[0].iov_len) + return NULL; /* Not contiguous */ + + p = (char *)tail->iov[0].iov_base + tail->off; + if ((uintptr_t)p % align) + return NULL; /* not aligned */ + + return p; +} + +/** + * iov_remove_header_() - Remove a header from an IOV tail + * @tail: IOV tail to remove header from (modified) + * @len: Length of header to remove, in bytes + * @align: Required alignment of header, in bytes + * + * On success, @tail is updated so that it longer includes the bytes of the + * returned header. + * + * Returns: Pointer to the first @len logical bytes of the tail, NULL if that + * overruns the IO vector, is not contiguous or doesn't have the + * requested alignment. + */ +void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align) +{ + char *p = iov_peek_header_(tail, len, align); + + if (!p) + return NULL; - return j; + tail->off = tail->off + len; + return p; } @@ -18,6 +18,9 @@ #include <unistd.h> #include <string.h> +#define IOV_OF_LVALUE(lval) \ + (struct iovec){ .iov_base = &(lval), .iov_len = sizeof(lval) } + size_t iov_skip_bytes(const struct iovec *iov, size_t n, size_t skip, size_t *offset); size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, @@ -25,7 +28,80 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, void *buf, size_t bytes); size_t iov_size(const struct iovec *iov, size_t iov_cnt); -unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt, - const struct iovec *iov, size_t iov_cnt, - size_t offset, size_t bytes); + +/* + * DOC: Theory of Operation, struct iov_tail + * + * Sometimes a single logical network frame is split 
across multiple buffers, + * represented by an IO vector (struct iovec[]). We often want to process this + * one header / network layer at a time. So, it's useful to maintain a "tail" + * of the vector representing the parts we haven't yet extracted. + * + * The headers we extract need not line up with buffer boundaries (though we do + * assume they're contiguous within a single buffer for now). So, we could + * represent that tail as another struct iovec[], but that would mean copying + * the whole array of struct iovecs, just so we can adjust the offset and length + * on the first one. + * + * So, instead represent the tail as pointer into an existing struct iovec[], + * with an explicit offset for where the "tail" starts within it. If we extract + * enough headers that some buffers of the original vector no longer contain + * part of the tail, we (lazily) advance our struct iovec * to the first buffer + * we still need, and adjust the vector length and offset to match. + */ + +/** + * struct iov_tail - An IO vector which may have some headers logically removed + * @iov: IO vector + * @cnt: Number of entries in @iov + * @off: Current offset in @iov + */ +struct iov_tail { + const struct iovec *iov; + size_t cnt, off; +}; + +/** + * IOV_TAIL() - Create a new IOV tail + * @iov_: IO vector to create tail from + * @cnt_: Length of the IO vector at @iov_ + * @off_: Byte offset in the IO vector where the tail begins + */ +#define IOV_TAIL(iov_, cnt_, off_) \ + (struct iov_tail){ .iov = (iov_), .cnt = (cnt_), .off = (off_) } + +bool iov_tail_prune(struct iov_tail *tail); +size_t iov_tail_size(struct iov_tail *tail); +void *iov_peek_header_(struct iov_tail *tail, size_t len, size_t align); +void *iov_remove_header_(struct iov_tail *tail, size_t len, size_t align); + +/** + * IOV_PEEK_HEADER() - Get typed pointer to a header from an IOV tail + * @tail_: IOV tail to get header from + * @type_: Data type of the header + * + * @tail_ may be pruned, but will represent the 
same bytes as before. + * + * Returns: Pointer of type (@type_ *) located at the start of @tail_, NULL if + * we can't get a contiguous and aligned pointer. + */ +#define IOV_PEEK_HEADER(tail_, type_) \ + ((type_ *)(iov_peek_header_((tail_), \ + sizeof(type_), __alignof__(type_)))) + +/** + * IOV_REMOVE_HEADER() - Remove and return typed header from an IOV tail + * @tail_: IOV tail to remove header from (modified) + * @type_: Data type of the header to remove + * + * On success, @tail_ is updated so that it longer includes the bytes of the + * returned header. + * + * Returns: Pointer of type (@type_ *) located at the old start of @tail_, NULL + * if we can't get a contiguous and aligned pointer. + */ +#define IOV_REMOVE_HEADER(tail_, type_) \ + ((type_ *)(iov_remove_header_((tail_), \ + sizeof(type_), __alignof__(type_)))) + #endif /* IOVEC_H */ @@ -24,6 +24,11 @@ #define IN4ADDR_ANY_INIT \ { .s_addr = htonl_constant(INADDR_ANY) } +#define IN4_IS_ADDR_LINKLOCAL(a) \ + ((ntohl(((struct in_addr *)(a))->s_addr) >> 16) == 0xa9fe) +#define IN4_IS_PREFIX_LINKLOCAL(a, len) \ + ((len) >= 16 && IN4_IS_ADDR_LINKLOCAL(a)) + #define L2_BUF_IP4_INIT(proto) \ { \ .version = 4, \ @@ -31,15 +36,20 @@ .tos = 0, \ .tot_len = 0, \ .id = 0, \ - .frag_off = 0, \ + .frag_off = htons(IP_DF), \ .ttl = 0xff, \ .protocol = (proto), \ .saddr = 0, \ .daddr = 0, \ } #define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \ + (uint32_t)htons_constant(IP_DF) + \ (uint32_t)htons(0xff00 | (proto))) + +#define IN6_IS_PREFIX_LINKLOCAL(a, len) \ + ((len) >= 10 && IN6_IS_ADDR_LINKLOCAL(a)) + #define L2_BUF_IP6_INIT(proto) \ { \ .priority = 0, \ @@ -81,6 +91,49 @@ struct ipv6_opt_hdr { */ } __attribute__((packed)); /* required for some archs */ +/** + * ip6_set_flow_lbl() - Set flow label in an IPv6 header + * @ip6h: Pointer to IPv6 header, updated + * @flow: Set @ip6h flow label to the low 20 bits of this integer + */ +static inline void ip6_set_flow_lbl(struct ipv6hdr *ip6h, uint32_t 
flow) +{ + ip6h->flow_lbl[0] = (flow >> 16) & 0xf; + ip6h->flow_lbl[1] = (flow >> 8) & 0xff; + ip6h->flow_lbl[2] = (flow >> 0) & 0xff; +} + +/** ip6_get_flow_lbl() - Get flow label from an IPv6 header + * @ip6h: Pointer to IPv6 header + * + * Return: flow label from @ip6h as an integer (<= 20 bits) + */ +static inline uint32_t ip6_get_flow_lbl(const struct ipv6hdr *ip6h) +{ + return (ip6h->flow_lbl[0] & 0xf) << 16 | + ip6h->flow_lbl[1] << 8 | + ip6h->flow_lbl[2]; +} + char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto, size_t *dlen); + +/* IPv6 link-local all-nodes multicast address, ff02::1 */ +static const struct in6_addr in6addr_ll_all_nodes = { + .s6_addr = { + 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + }, +}; + +/* IPv4 Limited Broadcast (RFC 919, Section 7), 255.255.255.255 */ +static const struct in_addr in4addr_broadcast = { 0xffffffff }; + +#ifndef IPV4_MIN_MTU +#define IPV4_MIN_MTU 68 +#endif +#ifndef IPV6_MIN_MTU +#define IPV6_MIN_MTU 1280 +#endif + #endif /* IP_H */ diff --git a/isolation.c b/isolation.c index f394e93..bbcd23b 100644 --- a/isolation.c +++ b/isolation.c @@ -29,7 +29,8 @@ * * Executed immediately after startup, drops capabilities we don't * need at any point during execution (or which we gain back when we - * need by joining other namespaces). + * need by joining other namespaces), and closes any leaked file we + * might have inherited from the parent process. * * 2. 
isolate_user() * ================= @@ -105,7 +106,7 @@ static void drop_caps_ep_except(uint64_t keep) int i; if (syscall(SYS_capget, &hdr, data)) - die("Couldn't get current capabilities: %s", strerror(errno)); + die_perror("Couldn't get current capabilities"); for (i = 0; i < CAP_WORDS; i++) { uint32_t mask = keep >> (32 * i); @@ -115,7 +116,7 @@ static void drop_caps_ep_except(uint64_t keep) } if (syscall(SYS_capset, &hdr, data)) - die("Couldn't drop capabilities: %s", strerror(errno)); + die_perror("Couldn't drop capabilities"); } /** @@ -128,7 +129,7 @@ static void drop_caps_ep_except(uint64_t keep) * additional layer of protection. Executing this requires * CAP_SETPCAP, which we will have within our userns. * - * Note that dropping capabilites from the bounding set limits + * Note that dropping capabilities from the bounding set limits * exec()ed processes, but does not remove them from the effective or * permitted sets, so it doesn't reduce our own capabilities. */ @@ -152,30 +153,31 @@ static void clamp_caps(void) */ if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) && errno != EINVAL && errno != EPERM) - die("Couldn't drop cap %i from bounding set: %s", - i, strerror(errno)); + die_perror("Couldn't drop cap %i from bounding set", i); } if (syscall(SYS_capget, &hdr, data)) - die("Couldn't get current capabilities: %s", strerror(errno)); + die_perror("Couldn't get current capabilities"); for (i = 0; i < CAP_WORDS; i++) data[i].inheritable = 0; if (syscall(SYS_capset, &hdr, data)) - die("Couldn't drop inheritable capabilities: %s", - strerror(errno)); + die_perror("Couldn't drop inheritable capabilities"); } /** - * isolate_initial() - Early, config independent self isolation + * isolate_initial() - Early, mostly config independent self isolation + * @argc: Argument count + * @argv: Command line options: only --fd (if present) is relevant here * * Should: * - drop unneeded capabilities - * Musn't: - * - remove filesytem access (we need to access files during setup) + * - 
close all open files except for standard streams and the one from --fd + * Mustn't: + * - remove filesystem access (we need to access files during setup) */ -void isolate_initial(void) +void isolate_initial(int argc, char **argv) { uint64_t keep; @@ -192,7 +194,7 @@ void isolate_initial(void) * * It's debatable whether it's useful to drop caps when we * retain SETUID and SYS_ADMIN, but we might as well. We drop - * further capabilites in isolate_user() and + * further capabilities in isolate_user() and * isolate_prefork(). */ keep = BIT(CAP_NET_BIND_SERVICE) | BIT(CAP_SETUID) | BIT(CAP_SETGID) | @@ -209,6 +211,8 @@ void isolate_initial(void) keep |= BIT(CAP_SETFCAP) | BIT(CAP_SYS_PTRACE); drop_caps_ep_except(keep); + + close_open_files(argc, argv); } /** @@ -234,34 +238,30 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns, if (setgroups(0, NULL)) { /* If we don't have CAP_SETGID, this will EPERM */ if (errno != EPERM) - die("Can't drop supplementary groups: %s", - strerror(errno)); + die_perror("Can't drop supplementary groups"); } if (setgid(gid) != 0) - die("Can't set GID to %u: %s", gid, strerror(errno)); + die_perror("Can't set GID to %u", gid); if (setuid(uid) != 0) - die("Can't set UID to %u: %s", uid, strerror(errno)); + die_perror("Can't set UID to %u", uid); if (*userns) { /* If given a userns, join it */ int ufd; ufd = open(userns, O_RDONLY | O_CLOEXEC); if (ufd < 0) - die("Couldn't open user namespace %s: %s", - userns, strerror(errno)); + die_perror("Couldn't open user namespace %s", userns); if (setns(ufd, CLONE_NEWUSER) != 0) - die("Couldn't enter user namespace %s: %s", - userns, strerror(errno)); + die_perror("Couldn't enter user namespace %s", userns); close(ufd); } else if (use_userns) { /* Create and join a new userns */ if (unshare(CLONE_NEWUSER) != 0) - die("Couldn't create user namespace: %s", - strerror(errno)); + die_perror("Couldn't create user namespace"); } /* Joining a new userns gives us full capabilities; 
drop the @@ -312,38 +312,38 @@ int isolate_prefork(const struct ctx *c) * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody * ever gets around seccomp profiles -- there's no harm in passing it. */ - if (!c->foreground || c->mode == MODE_PASST) + if (!c->foreground || c->mode != MODE_PASTA) flags |= CLONE_NEWPID; if (unshare(flags)) { - perror("unshare"); + err_perror("Failed to detach isolating namespaces"); return -errno; } if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL)) { - perror("mount /"); + err_perror("Failed to remount /"); return -errno; } if (mount("", TMPDIR, "tmpfs", MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, "nr_inodes=2,nr_blocks=0")) { - perror("mount tmpfs"); + err_perror("Failed to mount empty tmpfs for pivot_root()"); return -errno; } if (chdir(TMPDIR)) { - perror("chdir"); + err_perror("Failed to change directory into empty tmpfs"); return -errno; } if (syscall(SYS_pivot_root, ".", ".")) { - perror("pivot_root"); + err_perror("Failed to pivot_root() into empty tmpfs"); return -errno; } if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW)) { - perror("umount2"); + err_perror("Failed to unmount original root filesystem"); return -errno; } @@ -379,17 +379,24 @@ void isolate_postfork(const struct ctx *c) prctl(PR_SET_DUMPABLE, 0); - if (c->mode == MODE_PASST) { + switch (c->mode) { + case MODE_PASST: prog.len = (unsigned short)ARRAY_SIZE(filter_passt); prog.filter = filter_passt; - } else { + break; + case MODE_PASTA: prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); prog.filter = filter_pasta; + break; + case MODE_VU: + prog.len = (unsigned short)ARRAY_SIZE(filter_vu); + prog.filter = filter_vu; + break; + default: + ASSERT(0); } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || - prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { - perror("prctl"); - exit(EXIT_FAILURE); - } + prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) + die_perror("Failed to apply seccomp filter"); } diff --git a/isolation.h b/isolation.h index 
846b2af..80bb68d 100644 --- a/isolation.h +++ b/isolation.h @@ -7,7 +7,7 @@ #ifndef ISOLATION_H #define ISOLATION_H -void isolate_initial(void); +void isolate_initial(int argc, char **argv); void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns, enum passt_modes mode); int isolate_prefork(const struct ctx *c); @@ -39,13 +39,11 @@ void lineread_init(struct lineread *lr, int fd) * * Return: length of line in bytes, -1 if no line was found */ -static int peek_line(struct lineread *lr, bool eof) +static ssize_t peek_line(struct lineread *lr, bool eof) { char *nl; /* Sanity checks (which also document invariants) */ - ASSERT(lr->count >= 0); - ASSERT(lr->next_line >= 0); ASSERT(lr->next_line + lr->count >= lr->next_line); ASSERT(lr->next_line + lr->count <= LINEREAD_BUFFER_SIZE); @@ -74,13 +72,13 @@ static int peek_line(struct lineread *lr, bool eof) * * Return: Length of line read on success, 0 on EOF, negative on error */ -int lineread_get(struct lineread *lr, char **line) +ssize_t lineread_get(struct lineread *lr, char **line) { bool eof = false; - int line_len; + ssize_t line_len; while ((line_len = peek_line(lr, eof)) < 0) { - int rc; + ssize_t rc; if ((lr->next_line + lr->count) == LINEREAD_BUFFER_SIZE) { /* No space at end */ @@ -18,14 +18,15 @@ * @buf: Buffer storing data read from file. 
*/ struct lineread { - int fd; int next_line; - int count; + int fd; + ssize_t next_line; + ssize_t count; /* One extra byte for possible trailing \0 */ char buf[LINEREAD_BUFFER_SIZE+1]; }; void lineread_init(struct lineread *lr, int fd); -int lineread_get(struct lineread *lr, char **line); +ssize_t lineread_get(struct lineread *lr, char **line); #endif /* _LINEREAD_H */ diff --git a/linux_dep.h b/linux_dep.h new file mode 100644 index 0000000..240f50a --- /dev/null +++ b/linux_dep.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * + * Declarations for Linux specific dependencies + */ + +#ifndef LINUX_DEP_H +#define LINUX_DEP_H + +/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt() + * + * Largely derived from include/linux/tcp.h in the Linux kernel + * + * Some fields returned by TCP_INFO have been there for ages and are shared with + * BSD. struct tcp_info from netinet/tcp.h has only those fields. There are + * also many Linux specific extensions to the structure, which are only found + * in the linux/tcp.h version of struct tcp_info. + * + * We want to use some of those extension fields, when available. We can test + * for availability in the runtime kernel using the length returned from + * getsockopt(). However, we won't necessarily be compiled against the same + * kernel headers as we'll run with, so compiling directly against linux/tcp.h + * means wrapping every field access in an #ifdef whose #else does the same + * thing as when the field is missing at runtime. This rapidly gets messy. + * + * Instead we define here struct tcp_info_linux which includes all the Linux + * extensions that we want to use. This is taken from v6.11 of the kernel. 
+ */ +struct tcp_info_linux { + uint8_t tcpi_state; + uint8_t tcpi_ca_state; + uint8_t tcpi_retransmits; + uint8_t tcpi_probes; + uint8_t tcpi_backoff; + uint8_t tcpi_options; + uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; + uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; + + uint32_t tcpi_rto; + uint32_t tcpi_ato; + uint32_t tcpi_snd_mss; + uint32_t tcpi_rcv_mss; + + uint32_t tcpi_unacked; + uint32_t tcpi_sacked; + uint32_t tcpi_lost; + uint32_t tcpi_retrans; + uint32_t tcpi_fackets; + + /* Times. */ + uint32_t tcpi_last_data_sent; + uint32_t tcpi_last_ack_sent; + uint32_t tcpi_last_data_recv; + uint32_t tcpi_last_ack_recv; + + /* Metrics. */ + uint32_t tcpi_pmtu; + uint32_t tcpi_rcv_ssthresh; + uint32_t tcpi_rtt; + uint32_t tcpi_rttvar; + uint32_t tcpi_snd_ssthresh; + uint32_t tcpi_snd_cwnd; + uint32_t tcpi_advmss; + uint32_t tcpi_reordering; + + uint32_t tcpi_rcv_rtt; + uint32_t tcpi_rcv_space; + + uint32_t tcpi_total_retrans; + + /* Linux extensions */ + uint64_t tcpi_pacing_rate; + uint64_t tcpi_max_pacing_rate; + uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ + uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */ + uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */ + + uint32_t tcpi_notsent_bytes; + uint32_t tcpi_min_rtt; + uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ + uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ + + uint64_t tcpi_delivery_rate; + + uint64_t tcpi_busy_time; /* Time (usec) busy sending data */ + uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */ + uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + uint32_t tcpi_delivered; + uint32_t tcpi_delivered_ce; + + uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ + uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ + uint32_t tcpi_dsack_dups; 
/* RFC4898 tcpEStatsStackDSACKDups */ + uint32_t tcpi_reord_seen; /* reordering events seen */ + + uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */ + + uint32_t tcpi_snd_wnd; /* peer's advertised receive window after + * scaling (bytes) + */ + uint32_t tcpi_rcv_wnd; /* local advertised receive window after + * scaling (bytes) + */ + + uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */ + + uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including + * SYN/SYN-ACK and recurring timeouts. + */ + uint16_t tcpi_total_rto_recoveries; /* Total number of RTO + * recoveries, including any + * unfinished recovery. + */ + uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries + * in milliseconds, including any + * unfinished recovery. + */ +}; + +#include <linux/falloc.h> + +#ifndef FALLOC_FL_COLLAPSE_RANGE +#define FALLOC_FL_COLLAPSE_RANGE 0x08 +#endif + +#include <linux/close_range.h> + +/* glibc < 2.34 and musl as of 1.2.5 need these */ +#ifndef SYS_close_range +#define SYS_close_range 436 +#endif +#ifndef CLOSE_RANGE_UNSHARE /* Linux kernel < 5.9 */ +#define CLOSE_RANGE_UNSHARE (1U << 1) +#endif + +__attribute__ ((weak)) +/* cppcheck-suppress funcArgNamesDifferent */ +int close_range(unsigned int first, unsigned int last, int flags) { + return syscall(SYS_close_range, first, last, flags); +} + +#endif /* LINUX_DEP_H */ @@ -26,17 +26,14 @@ #include <stdarg.h> #include <sys/socket.h> +#include "linux_dep.h" #include "log.h" #include "util.h" #include "passt.h" -/* LOG_EARLY means we don't know yet: log everything. 
LOG_EMERG is unused */ -#define LOG_EARLY LOG_MASK(LOG_EMERG) - static int log_sock = -1; /* Optional socket to system logger */ static char log_ident[BUFSIZ]; /* Identifier string for openlog() */ -static int log_mask = LOG_EARLY; /* Current log priority mask */ -static int log_opt; /* Options for openlog() */ +static int log_mask; /* Current log priority mask */ static int log_file = -1; /* Optional log file descriptor */ static size_t log_size; /* Maximum log file size in bytes */ @@ -44,50 +41,47 @@ static size_t log_written; /* Currently used bytes in log file */ static size_t log_cut_size; /* Bytes to cut at start on rotation */ static char log_header[BUFSIZ]; /* File header, written back on cuts */ -static time_t log_start; /* Start timestamp */ -int log_trace; /* --trace mode enabled */ -int log_to_stdout; /* Print to stdout instead of stderr */ - -void vlogmsg(int pri, const char *format, va_list ap) -{ - bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1; - bool early_print = LOG_PRI(log_mask) == LOG_EARLY; - FILE *out = log_to_stdout ? 
stdout : stderr; - struct timespec tp; - - if (debug_print) { - clock_gettime(CLOCK_REALTIME, &tp); - fprintf(out, "%lli.%04lli: ", - (long long int)tp.tv_sec - log_start, - (long long int)tp.tv_nsec / (100L * 1000)); - } - - if ((log_mask & LOG_MASK(LOG_PRI(pri))) || early_print) { - va_list ap2; +struct timespec log_start; /* Start timestamp */ - va_copy(ap2, ap); /* Don't clobber ap, we need it again */ - if (log_file != -1) - logfile_write(pri, format, ap2); - else if (!(log_mask & LOG_MASK(LOG_DEBUG))) - passt_vsyslog(pri, format, ap2); +int log_trace; /* --trace mode enabled */ +bool log_conf_parsed; /* Logging options already parsed */ +bool log_stderr = true; /* Not daemonised, no shell spawned */ - va_end(ap2); - } +#define LL_STRLEN (sizeof("-9223372036854775808")) +#define LOGTIME_STRLEN (LL_STRLEN + 5) - if (debug_print || (early_print && !(log_opt & LOG_PERROR))) { - (void)vfprintf(out, format, ap); - if (format[strlen(format)] != '\n') - fprintf(out, "\n"); - } +/** + * logtime() - Get the current time for logging purposes + * @ts: Buffer into which to store the timestamp + * + * Return: pointer to @ts on success, or NULL if there was + * an error retrieving the time + */ +static const struct timespec *logtime(struct timespec *ts) +{ + if (clock_gettime(CLOCK_MONOTONIC, ts)) + return NULL; + return ts; } -void logmsg(int pri, const char *format, ...) 
+/** + * logtime_fmt() - Format timestamp into a string for the log + * @buf: Buffer into which to format the time + * @size: Size of @buf + * @ts: Time to format (or NULL on error) + * + * Return: number of characters written to @buf (excluding \0) + */ +static int logtime_fmt(char *buf, size_t size, const struct timespec *ts) { - va_list ap; + if (ts) { + int64_t delta = timespec_diff_us(ts, &log_start); - va_start(ap, format); - vlogmsg(pri, format, ap); - va_end(ap); + return snprintf(buf, size, "%lli.%04lli", delta / 1000000LL, + (delta / 100LL) % 10000); + } + + return snprintf(buf, size, "<error>"); } /* Prefixes for log file messages, indexed by priority */ @@ -101,126 +95,11 @@ const char *logfile_prefix[] = { }; /** - * trace_init() - Set log_trace depending on trace (debug) mode - * @enable: Tracing debug mode enabled if non-zero - */ -void trace_init(int enable) -{ - log_trace = enable; -} - -/** - * __openlog() - Non-optional openlog() implementation, for custom vsyslog() - * @ident: openlog() identity (program name) - * @option: openlog() options - * @facility: openlog() facility (LOG_DAEMON) - */ -void __openlog(const char *ident, int option, int facility) -{ - struct timespec tp; - - clock_gettime(CLOCK_REALTIME, &tp); - log_start = tp.tv_sec; - - if (log_sock < 0) { - struct sockaddr_un a = { .sun_family = AF_UNIX, }; - - log_sock = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0); - if (log_sock < 0) - return; - - strncpy(a.sun_path, _PATH_LOG, sizeof(a.sun_path)); - if (connect(log_sock, (const struct sockaddr *)&a, sizeof(a))) { - close(log_sock); - log_sock = -1; - return; - } - } - - log_mask |= facility; - strncpy(log_ident, ident, sizeof(log_ident) - 1); - log_opt = option; -} - -/** - * __setlogmask() - setlogmask() wrapper, to allow custom vsyslog() - * @mask: Same as setlogmask() mask - */ -void __setlogmask(int mask) -{ - log_mask = mask; - setlogmask(mask); -} - -/** - * passt_vsyslog() - vsyslog() implementation not using heap memory - * 
@pri: Facility and level map, same as priority for vsyslog() - * @format: Same as vsyslog() format - * @ap: Same as vsyslog() ap - */ -void passt_vsyslog(int pri, const char *format, va_list ap) -{ - int prefix_len, n; - char buf[BUFSIZ]; - - /* Send without timestamp, the system logger should add it */ - n = prefix_len = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident); - - n += vsnprintf(buf + n, BUFSIZ - n, format, ap); - - if (format[strlen(format)] != '\n') - n += snprintf(buf + n, BUFSIZ - n, "\n"); - - if (log_opt & LOG_PERROR) - fprintf(stderr, "%s", buf + prefix_len); - - if (log_sock >= 0 && send(log_sock, buf, n, 0) != n) - fprintf(stderr, "Failed to send %i bytes to syslog\n", n); -} - -/** - * logfile_init() - Open log file and write header with PID, version, path - * @name: Identifier for header: passt or pasta - * @path: Path to log file - * @size: Maximum size of log file: log_cut_size is calculatd here - */ -void logfile_init(const char *name, const char *path, size_t size) -{ - char nl = '\n', exe[PATH_MAX] = { 0 }; - int n; - - if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) { - perror("readlink /proc/self/exe"); - exit(EXIT_FAILURE); - } - - log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC, - S_IRUSR | S_IWUSR); - if (log_file == -1) - die("Couldn't open log file %s: %s", path, strerror(errno)); - - log_size = size ? 
size : LOGFILE_SIZE_DEFAULT; - - n = snprintf(log_header, sizeof(log_header), "%s " VERSION ": %s (%i)", - name, exe, getpid()); - - if (write(log_file, log_header, n) <= 0 || - write(log_file, &nl, 1) <= 0) { - perror("Couldn't write to log file\n"); - exit(EXIT_FAILURE); - } - - /* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */ - log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE); -} - -#ifdef FALLOC_FL_COLLAPSE_RANGE -/** * logfile_rotate_fallocate() - Write header, set log_written after fallocate() * @fd: Log file descriptor * @now: Current timestamp * - * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek + * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek i686:_llseek */ static void logfile_rotate_fallocate(int fd, const struct timespec *now) { @@ -233,10 +112,8 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now) if (read(fd, buf, BUFSIZ) == -1) return; - n = snprintf(buf, BUFSIZ, - "%s - log truncated at %lli.%04lli", log_header, - (long long int)(now->tv_sec - log_start), - (long long int)(now->tv_nsec / (100L * 1000))); + n = snprintf(buf, BUFSIZ, "%s - log truncated at ", log_header); + n += logtime_fmt(buf + n, BUFSIZ - n, now); /* Avoid partial lines by padding the header with spaces */ nl = memchr(buf + n + 1, '\n', BUFSIZ - n - 1); @@ -250,14 +127,13 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *now) log_written -= log_cut_size; } -#endif /* FALLOC_FL_COLLAPSE_RANGE */ /** * logfile_rotate_move() - Fallback: move recent entries toward start, then cut * @fd: Log file descriptor * @now: Current timestamp * - * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek + * #syscalls lseek ppc64le:_llseek ppc64:_llseek arm:_llseek * #syscalls ftruncate */ static void logfile_rotate_move(int fd, const struct timespec *now) @@ -266,10 +142,10 @@ static void logfile_rotate_move(int fd, const struct timespec *now) 
char buf[BUFSIZ]; const char *nl; - header_len = snprintf(buf, BUFSIZ, - "%s - log truncated at %lli.%04lli\n", log_header, - (long long int)(now->tv_sec - log_start), - (long long int)(now->tv_nsec / (100L * 1000))); + header_len = snprintf(buf, BUFSIZ, "%s - log truncated at ", + log_header); + header_len += logtime_fmt(buf + header_len, BUFSIZ - header_len, now); + if (lseek(fd, 0, SEEK_SET) == -1) return; if (write(fd, buf, header_len) == -1) @@ -322,21 +198,17 @@ out: * * Return: 0 on success, negative error code on failure * - * #syscalls fcntl - * - * fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there + * #syscalls fcntl fallocate */ static int logfile_rotate(int fd, const struct timespec *now) { if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */)) return -errno; -#ifdef FALLOC_FL_COLLAPSE_RANGE /* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */ if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size)) logfile_rotate_fallocate(fd, now); else -#endif logfile_rotate_move(fd, now); if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND)) @@ -347,32 +219,213 @@ static int logfile_rotate(int fd, const struct timespec *now) /** * logfile_write() - Write entry to log file, trigger rotation if full + * @newline: Append newline at the end of the message, if missing + * @cont: Continuation of a previous message, on the same line * @pri: Facility and level map, same as priority for vsyslog() + * @now: Timestamp * @format: Same as vsyslog() format * @ap: Same as vsyslog() ap */ -void logfile_write(int pri, const char *format, va_list ap) +static void logfile_write(bool newline, bool cont, int pri, + const struct timespec *now, + const char *format, va_list ap) { - struct timespec now; char buf[BUFSIZ]; - int n; - - if (clock_gettime(CLOCK_REALTIME, &now)) - return; + int n = 0; - n = snprintf(buf, BUFSIZ, "%lli.%04lli: %s", - (long long int)(now.tv_sec - log_start), - (long long int)(now.tv_nsec / (100L * 1000)), - 
logfile_prefix[pri]); + if (!cont) { + n += logtime_fmt(buf, BUFSIZ, now); + n += snprintf(buf + n, BUFSIZ - n, ": %s", logfile_prefix[pri]); + } n += vsnprintf(buf + n, BUFSIZ - n, format, ap); - if (format[strlen(format)] != '\n') + if (newline && format[strlen(format)] != '\n') n += snprintf(buf + n, BUFSIZ - n, "\n"); - if ((log_written + n >= log_size) && logfile_rotate(log_file, &now)) + if ((log_written + n >= log_size) && logfile_rotate(log_file, now)) return; if ((n = write(log_file, buf, n)) >= 0) log_written += n; } + +/** + * passt_vsyslog() - vsyslog() implementation not using heap memory + * @newline: Append newline at the end of the message, if missing + * @pri: Facility and level map, same as priority for vsyslog() + * @format: Same as vsyslog() format + * @ap: Same as vsyslog() ap + */ +static void passt_vsyslog(bool newline, int pri, const char *format, va_list ap) +{ + char buf[BUFSIZ]; + int n; + + /* Send without timestamp, the system logger should add it */ + n = snprintf(buf, BUFSIZ, "<%i> %s: ", pri, log_ident); + + n += vsnprintf(buf + n, BUFSIZ - n, format, ap); + + if (newline && format[strlen(format)] != '\n') + n += snprintf(buf + n, BUFSIZ - n, "\n"); + + if (log_sock >= 0 && send(log_sock, buf, n, 0) != n && log_stderr) + FPRINTF(stderr, "Failed to send %i bytes to syslog\n", n); +} + +/** + * vlogmsg() - Print or send messages to log or output files as configured + * @newline: Append newline at the end of the message, if missing + * @cont: Continuation of a previous message, on the same line + * @pri: Facility and level map, same as priority for vsyslog() + * @format: Message + * @ap: Variable argument list + */ +/* cppcheck-suppress [staticFunction,unmatchedSuppression] */ +void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap) +{ + bool debug_print = (log_mask & LOG_MASK(LOG_DEBUG)) && log_file == -1; + const struct timespec *now; + struct timespec ts; + + now = logtime(&ts); + + if (debug_print && !cont) { 
+ char timestr[LOGTIME_STRLEN]; + + logtime_fmt(timestr, sizeof(timestr), now); + FPRINTF(stderr, "%s: ", timestr); + } + + if ((log_mask & LOG_MASK(LOG_PRI(pri))) || !log_conf_parsed) { + va_list ap2; + + va_copy(ap2, ap); /* Don't clobber ap, we need it again */ + if (log_file != -1) + logfile_write(newline, cont, pri, now, format, ap2); + else if (!(log_mask & LOG_MASK(LOG_DEBUG))) + passt_vsyslog(newline, pri, format, ap2); + + va_end(ap2); + } + + if (debug_print || !log_conf_parsed || + (log_stderr && (log_mask & LOG_MASK(LOG_PRI(pri))))) { + (void)vfprintf(stderr, format, ap); + if (newline && format[strlen(format)] != '\n') + FPRINTF(stderr, "\n"); + } +} + +/** + * logmsg() - vlogmsg() wrapper for variable argument lists + * @newline: Append newline at the end of the message, if missing + * @cont: Continuation of a previous message, on the same line + * @pri: Facility and level map, same as priority for vsyslog() + * @format: Message + */ +void logmsg(bool newline, bool cont, int pri, const char *format, ...) +{ + va_list ap; + + va_start(ap, format); + vlogmsg(newline, cont, pri, format, ap); + va_end(ap); +} + +/** + * logmsg_perror() - vlogmsg() wrapper with perror()-like functionality + * @pri: Facility and level map, same as priority for vsyslog() + * @format: Message + */ +void logmsg_perror(int pri, const char *format, ...) 
+{ + int errno_copy = errno; + va_list ap; + + va_start(ap, format); + vlogmsg(false, false, pri, format, ap); + va_end(ap); + + logmsg(true, true, pri, ": %s", strerror_(errno_copy)); +} + +/** + * trace_init() - Set log_trace depending on trace (debug) mode + * @enable: Tracing debug mode enabled if non-zero + */ +void trace_init(int enable) +{ + log_trace = enable; +} + +/** + * __openlog() - Non-optional openlog() implementation, for custom vsyslog() + * @ident: openlog() identity (program name) + * @option: openlog() options, unused + * @facility: openlog() facility (LOG_DAEMON) + */ +void __openlog(const char *ident, int option, int facility) +{ + (void)option; + + if (log_sock < 0) { + struct sockaddr_un a = { .sun_family = AF_UNIX, }; + + log_sock = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0); + if (log_sock < 0) + return; + + strncpy(a.sun_path, _PATH_LOG, sizeof(a.sun_path)); + if (connect(log_sock, (const struct sockaddr *)&a, sizeof(a))) { + close(log_sock); + log_sock = -1; + return; + } + } + + log_mask |= facility; + strncpy(log_ident, ident, sizeof(log_ident) - 1); +} + +/** + * __setlogmask() - setlogmask() wrapper, to allow custom vsyslog() + * @mask: Same as setlogmask() mask + */ +void __setlogmask(int mask) +{ + log_mask = mask; + setlogmask(mask); +} + +/** + * logfile_init() - Open log file and write header with PID, version, path + * @name: Identifier for header: passt or pasta + * @path: Path to log file + * @size: Maximum size of log file: log_cut_size is calculated here + */ +void logfile_init(const char *name, const char *path, size_t size) +{ + char nl = '\n', exe[PATH_MAX] = { 0 }; + int n; + + if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) + die_perror("Failed to read own /proc/self/exe link"); + + log_file = output_file_open(path, O_APPEND | O_RDWR); + if (log_file == -1) + die_perror("Couldn't open log file %s", path); + + log_size = size ? 
size : LOGFILE_SIZE_DEFAULT; + + n = snprintf(log_header, sizeof(log_header), "%s " VERSION ": %s (%i)", + name, exe, getpid()); + + if (write(log_file, log_header, n) <= 0 || + write(log_file, &nl, 1) <= 0) + die_perror("Couldn't write to log file"); + + /* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */ + log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE); +} @@ -6,29 +6,46 @@ #ifndef LOG_H #define LOG_H +#include <stdbool.h> #include <syslog.h> #define LOGFILE_SIZE_DEFAULT (1024 * 1024UL) #define LOGFILE_CUT_RATIO 30 /* When full, cut ~30% size */ #define LOGFILE_SIZE_MIN (5UL * MAX(BUFSIZ, PAGE_SIZE)) -void vlogmsg(int pri, const char *format, va_list ap); -void logmsg(int pri, const char *format, ...) +void vlogmsg(bool newline, bool cont, int pri, const char *format, va_list ap); +void logmsg(bool newline, bool cont, int pri, const char *format, ...) + __attribute__((format(printf, 4, 5))); +void logmsg_perror(int pri, const char *format, ...) __attribute__((format(printf, 2, 3))); -#define err(...) logmsg(LOG_ERR, __VA_ARGS__) -#define warn(...) logmsg(LOG_WARNING, __VA_ARGS__) -#define info(...) logmsg(LOG_INFO, __VA_ARGS__) -#define debug(...) logmsg(LOG_DEBUG, __VA_ARGS__) +#define err(...) logmsg(true, false, LOG_ERR, __VA_ARGS__) +#define warn(...) logmsg(true, false, LOG_WARNING, __VA_ARGS__) +#define info(...) logmsg(true, false, LOG_INFO, __VA_ARGS__) +#define debug(...) logmsg(true, false, LOG_DEBUG, __VA_ARGS__) + +#define err_perror(...) logmsg_perror( LOG_ERR, __VA_ARGS__) +#define warn_perror(...) logmsg_perror( LOG_WARNING, __VA_ARGS__) +#define info_perror(...) logmsg_perror( LOG_INFO, __VA_ARGS__) +#define debug_perror(...) logmsg_perror( LOG_DEBUG, __VA_ARGS__) #define die(...) \ do { \ err(__VA_ARGS__); \ - exit(EXIT_FAILURE); \ + _exit(EXIT_FAILURE); \ + } while (0) + +#define die_perror(...) 
\ + do { \ + err_perror(__VA_ARGS__); \ + _exit(EXIT_FAILURE); \ } while (0) extern int log_trace; -extern int log_to_stdout; +extern bool log_conf_parsed; +extern bool log_stderr; +extern struct timespec log_start; + void trace_init(int enable); #define trace(...) \ do { \ @@ -38,8 +55,6 @@ void trace_init(int enable); void __openlog(const char *ident, int option, int facility); void logfile_init(const char *name, const char *path, size_t size); -void passt_vsyslog(int pri, const char *format, va_list ap); -void logfile_write(int pri, const char *format, va_list ap); void __setlogmask(int mask); #endif /* LOG_H */ diff --git a/migrate.c b/migrate.c new file mode 100644 index 0000000..48d63a0 --- /dev/null +++ b/migrate.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * migrate.c - Migration sections, layout, and routines + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <errno.h> +#include <sys/uio.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "inany.h" +#include "flow.h" +#include "flow_table.h" + +#include "migrate.h" +#include "repair.h" + +/* Magic identifier for migration data */ +#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 + +/** + * struct migrate_seen_addrs_v1 - Migratable guest addresses for v1 state stream + * @addr6: Observed guest IPv6 address + * @addr6_ll: Observed guest IPv6 link-local address + * @addr4: Observed guest IPv4 address + * @mac: Observed guest MAC address + */ +struct migrate_seen_addrs_v1 { + struct in6_addr addr6; + struct in6_addr addr6_ll; + struct in_addr addr4; + unsigned char mac[ETH_ALEN]; +} __attribute__((packed)); + +/** + * seen_addrs_source_v1() - Copy and send guest observed addresses from source + * @c: Execution context + * @stage: Migration stage, unused 
+ * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ +static int seen_addrs_source_v1(struct ctx *c, + const struct migrate_stage *stage, int fd) +{ + struct migrate_seen_addrs_v1 addrs = { + .addr6 = c->ip6.addr_seen, + .addr6_ll = c->ip6.addr_ll_seen, + .addr4 = c->ip4.addr_seen, + }; + + (void)stage; + + memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac)); + + if (write_all_buf(fd, &addrs, sizeof(addrs))) + return errno; + + return 0; +} + +/** + * seen_addrs_target_v1() - Receive and use guest observed addresses on target + * @c: Execution context + * @stage: Migration stage, unused + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int seen_addrs_target_v1(struct ctx *c, + const struct migrate_stage *stage, int fd) +{ + struct migrate_seen_addrs_v1 addrs; + + (void)stage; + + if (read_all_buf(fd, &addrs, sizeof(addrs))) + return errno; + + c->ip6.addr_seen = addrs.addr6; + c->ip6.addr_ll_seen = addrs.addr6_ll; + c->ip4.addr_seen = addrs.addr4; + memcpy(c->guest_mac, addrs.mac, sizeof(c->guest_mac)); + + return 0; +} + +/* Stages for version 2 */ +static const struct migrate_stage stages_v2[] = { + { + .name = "observed addresses", + .source = seen_addrs_source_v1, + .target = seen_addrs_target_v1, + }, + { + .name = "prepare flows", + .source = flow_migrate_source_pre, + .target = NULL, + }, + { + .name = "transfer flows", + .source = flow_migrate_source, + .target = flow_migrate_target, + }, + { 0 }, +}; + +/* Supported encoding versions, from latest (most preferred) to oldest */ +static const struct migrate_version versions[] = { + { 2, stages_v2, }, + /* v1 was released, but not widely used. It had bad endianness for the + * MSS and omitted timestamps, which meant it usually wouldn't work. + * Therefore we don't attempt to support compatibility with it. 
+ */ + { 0 }, +}; + +/* Current encoding version */ +#define CURRENT_VERSION (&versions[0]) + +/** + * migrate_source() - Migration as source, send state to hypervisor + * @c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int migrate_source(struct ctx *c, int fd) +{ + const struct migrate_version *v = CURRENT_VERSION; + const struct migrate_header header = { + .magic = htonll_constant(MIGRATE_MAGIC), + .version = htonl(v->id), + .compat_version = htonl(v->id), + }; + const struct migrate_stage *s; + int ret; + + if (write_all_buf(fd, &header, sizeof(header))) { + ret = errno; + err("Can't send migration header: %s, abort", strerror_(ret)); + return ret; + } + + for (s = v->s; s->name; s++) { + if (!s->source) + continue; + + debug("Source side migration stage: %s", s->name); + + if ((ret = s->source(c, s, fd))) { + err("Source migration stage: %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} + +/** + * migrate_target_read_header() - Read header in target + * @fd: Descriptor for state transfer + * + * Return: version structure on success, NULL on failure with errno set + */ +static const struct migrate_version *migrate_target_read_header(int fd) +{ + const struct migrate_version *v; + struct migrate_header h; + uint32_t id, compat_id; + + if (read_all_buf(fd, &h, sizeof(h))) + return NULL; + + id = ntohl(h.version); + compat_id = ntohl(h.compat_version); + + debug("Source magic: 0x%016" PRIx64 ", version: %u, compat: %u", + ntohll(h.magic), id, compat_id); + + if (ntohll(h.magic) != MIGRATE_MAGIC || !id || !compat_id) { + err("Invalid incoming device state"); + errno = EINVAL; + return NULL; + } + + for (v = versions; v->id; v++) + if (v->id <= id && v->id >= compat_id) + return v; + + errno = ENOTSUP; + err("Unsupported device state version: %u", id); + return NULL; +} + +/** + * migrate_target() - Migration as target, receive state from 
hypervisor + * @c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int migrate_target(struct ctx *c, int fd) +{ + const struct migrate_version *v; + const struct migrate_stage *s; + int ret; + + if (!(v = migrate_target_read_header(fd))) + return errno; + + for (s = v->s; s->name; s++) { + if (!s->target) + continue; + + debug("Target side migration stage: %s", s->name); + + if ((ret = s->target(c, s, fd))) { + err("Target migration stage: %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} + +/** + * migrate_init() - Set up things necessary for migration + * @c: Execution context + */ +void migrate_init(struct ctx *c) +{ + c->device_state_result = -1; +} + +/** + * migrate_close() - Close migration channel and connection to passt-repair + * @c: Execution context + */ +void migrate_close(struct ctx *c) +{ + if (c->device_state_fd != -1) { + debug("Closing migration channel, fd: %d", c->device_state_fd); + close(c->device_state_fd); + c->device_state_fd = -1; + c->device_state_result = -1; + } + + repair_close(c); +} + +/** + * migrate_request() - Request a migration of device state + * @c: Execution context + * @fd: fd to transfer state + * @target: Are we the target of the migration? 
+ */ +void migrate_request(struct ctx *c, int fd, bool target) +{ + debug("Migration requested, fd: %d (was %d)", fd, c->device_state_fd); + + if (c->device_state_fd != -1) + migrate_close(c); + + c->device_state_fd = fd; + c->migrate_target = target; +} + +/** + * migrate_handler() - Send/receive passt internal state to/from hypervisor + * @c: Execution context + */ +void migrate_handler(struct ctx *c) +{ + int rc; + + if (c->device_state_fd < 0) + return; + + debug("Handling migration request from fd: %d, target: %d", + c->device_state_fd, c->migrate_target); + + if (c->migrate_target) + rc = migrate_target(c, c->device_state_fd); + else + rc = migrate_source(c, c->device_state_fd); + + migrate_close(c); + + c->device_state_result = rc; +} diff --git a/migrate.h b/migrate.h new file mode 100644 index 0000000..2c51cd9 --- /dev/null +++ b/migrate.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef MIGRATE_H +#define MIGRATE_H + +/** + * struct migrate_header - Migration header from source + * @magic: 0xB1BB1D1B0BB1D1B0, network order + * @version: Highest known, target aborts if too old, network order + * @compat_version: Lowest version compatible with @version, target aborts + * if too new, network order + */ +struct migrate_header { + uint64_t magic; + uint32_t version; + uint32_t compat_version; +} __attribute__((packed)); + +/** + * struct migrate_stage - Callbacks and parameters for one stage of migration + * @name: Stage name (for debugging) + * @source: Callback to implement this stage on the source + * @target: Callback to implement this stage on the target + */ +struct migrate_stage { + const char *name; + int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd); + int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd); + + /* Add here separate rollback callbacks if needed */ +}; + +/** + * struct migrate_version 
- Stages for a particular protocol version + * @id: Version number, host order + * @s: Ordered array of stages, NULL-terminated + */ +struct migrate_version { + uint32_t id; + const struct migrate_stage *s; +}; + +void migrate_init(struct ctx *c); +void migrate_close(struct ctx *c); +void migrate_request(struct ctx *c, int fd, bool target); +void migrate_handler(struct ctx *c); + +#endif /* MIGRATE_H */ @@ -33,161 +33,400 @@ #include "tap.h" #include "log.h" +#define RT_LIFETIME 65535 + #define RS 133 #define RA 134 #define NS 135 #define NA 136 +enum ndp_option_types { + OPT_SRC_L2_ADDR = 1, + OPT_TARGET_L2_ADDR = 2, + OPT_PREFIX_INFO = 3, + OPT_MTU = 5, + OPT_RDNSS_TYPE = 25, + OPT_DNSSL_TYPE = 31, +}; + /** - * ndp() - Check for NDP solicitations, reply as needed + * struct opt_header - Option header + * @type: Option type + * @len: Option length, in units of 8 bytes +*/ +struct opt_header { + uint8_t type; + uint8_t len; +} __attribute__((packed)); + +/** + * struct opt_l2_addr - Link-layer address + * @header: Option header + * @mac: MAC address + */ +struct opt_l2_addr { + struct opt_header header; + unsigned char mac[ETH_ALEN]; +} __attribute__((packed)); + +/** + * struct ndp_na - NDP Neighbor Advertisement (NA) message + * @ih: ICMPv6 header + * @target_addr: Target IPv6 address + * @target_l2_addr: Target link-layer address + */ +struct ndp_na { + struct icmp6hdr ih; + struct in6_addr target_addr; + struct opt_l2_addr target_l2_addr; +} __attribute__((packed)); + +/** + * struct opt_prefix_info - Prefix Information option + * @header: Option header + * @prefix_len: The number of leading bits in the Prefix that are valid + * @prefix_flags: Flags associated with the prefix + * @valid_lifetime: Valid lifetime (ms) + * @pref_lifetime: Preferred lifetime (ms) + * @reserved: Unused + */ +struct opt_prefix_info { + struct opt_header header; + uint8_t prefix_len; + uint8_t prefix_flags; + uint32_t valid_lifetime; + uint32_t pref_lifetime; + uint32_t reserved; +} 
__attribute__((packed)); + +/** + * struct opt_mtu - Maximum transmission unit (MTU) option + * @header: Option header + * @reserved: Unused + * @value: MTU value, network order + */ +struct opt_mtu { + struct opt_header header; + uint16_t reserved; + uint32_t value; +} __attribute__((packed)); + +/** + * struct rdnss - Recursive DNS Server (RDNSS) option + * @header: Option header + * @reserved: Unused + * @lifetime: Validity time (s) + * @dns: List of DNS server addresses + */ +struct opt_rdnss { + struct opt_header header; + uint16_t reserved; + uint32_t lifetime; + struct in6_addr dns[MAXNS + 1]; +} __attribute__((packed)); + +/** + * struct dnssl - DNS Search List (DNSSL) option + * @header: Option header + * @reserved: Unused + * @lifetime: Validity time (s) + * @domains: List of NULL-seperated search domains + */ +struct opt_dnssl { + struct opt_header header; + uint16_t reserved; + uint32_t lifetime; + unsigned char domains[MAXDNSRCH * NS_MAXDNAME]; +} __attribute__((packed)); + +/** + * struct ndp_ra - NDP Router Advertisement (RA) message + * @ih: ICMPv6 header + * @reachable: Reachability time, after confirmation (ms) + * @retrans: Time between retransmitted NS messages (ms) + * @prefix_info: Prefix Information option + * @prefix: IPv6 prefix + * @mtu: MTU option + * @source_ll: Target link-layer address + * @var: Variable fields + */ +struct ndp_ra { + struct icmp6hdr ih; + uint32_t reachable; + uint32_t retrans; + struct opt_prefix_info prefix_info; + struct in6_addr prefix; + struct opt_l2_addr source_ll; + + unsigned char var[sizeof(struct opt_mtu) + sizeof(struct opt_rdnss) + + sizeof(struct opt_dnssl)]; +} __attribute__((packed, aligned(__alignof__(struct in6_addr)))); + +/** + * struct ndp_ns - NDP Neighbor Solicitation (NS) message + * @ih: ICMPv6 header + * @target_addr: Target IPv6 address + */ +struct ndp_ns { + struct icmp6hdr ih; + struct in6_addr target_addr; +} __attribute__((packed, aligned(__alignof__(struct in6_addr)))); + +/** + * 
ndp_send() - Send an NDP message * @c: Execution context - * @ih: ICMPv6 header - * @saddr Source IPv6 address - * - * Return: 0 if not handled here, 1 if handled, -1 on failure + * @dst: IPv6 address to send the message to + * @buf: ICMPv6 header + message payload + * @l4len: Length of message, including ICMPv6 header */ -int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr) +static void ndp_send(const struct ctx *c, const struct in6_addr *dst, + const void *buf, size_t l4len) { - const struct in6_addr *rsaddr; /* src addr for reply */ - char buf[BUFSIZ] = { 0 }; - struct ipv6hdr *ip6hr; - struct icmp6hdr *ihr; - struct ethhdr *ehr; - unsigned char *p; - size_t len; + const struct in6_addr *src = &c->ip6.our_tap_ll; - if (ih->icmp6_type < RS || ih->icmp6_type > NA) - return 0; + tap_icmp6_send(c, src, dst, buf, l4len); +} - if (c->no_ndp) - return 1; +/** + * ndp_na() - Send an NDP Neighbour Advertisement (NA) message + * @c: Execution context + * @dst: IPv6 address to send the NA to + * @addr: IPv6 address to advertise + */ +static void ndp_na(const struct ctx *c, const struct in6_addr *dst, + const struct in6_addr *addr) +{ + struct ndp_na na = { + .ih = { + .icmp6_type = NA, + .icmp6_code = 0, + .icmp6_router = 1, + .icmp6_solicited = 1, + .icmp6_override = 1, + }, + .target_addr = *addr, + .target_l2_addr = { + .header = { + .type = OPT_TARGET_L2_ADDR, + .len = 1, + }, + } + }; - ehr = (struct ethhdr *)buf; - ip6hr = (struct ipv6hdr *)(ehr + 1); - ihr = (struct icmp6hdr *)(ip6hr + 1); + memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN); - if (ih->icmp6_type == NS) { - if (IN6_IS_ADDR_UNSPECIFIED(saddr)) - return 1; + ndp_send(c, dst, &na, sizeof(na)); +} - info("NDP: received NS, sending NA"); - ihr->icmp6_type = NA; - ihr->icmp6_code = 0; - ihr->icmp6_router = 1; - ihr->icmp6_solicited = 1; - ihr->icmp6_override = 1; - - p = (unsigned char *)(ihr + 1); - memcpy(p, ih + 1, sizeof(struct in6_addr)); /* target address */ - p += 
16; - *p++ = 2; /* target ll */ - *p++ = 1; /* length */ - memcpy(p, c->mac, ETH_ALEN); - p += 6; - } else if (ih->icmp6_type == RS) { - size_t dns_s_len = 0; - int i, n; +/** + * ndp_ra() - Send an NDP Router Advertisement (RA) message + * @c: Execution context + * @dst: IPv6 address to send the RA to + */ +static void ndp_ra(const struct ctx *c, const struct in6_addr *dst) +{ + struct ndp_ra ra = { + .ih = { + .icmp6_type = RA, + .icmp6_code = 0, + .icmp6_hop_limit = 255, + /* RFC 8319 */ + .icmp6_rt_lifetime = htons_constant(RT_LIFETIME), + .icmp6_addrconf_managed = 1, + }, + .prefix_info = { + .header = { + .type = OPT_PREFIX_INFO, + .len = 4, + }, + .prefix_len = 64, + .prefix_flags = 0xc0, /* prefix flags: L, A */ + .valid_lifetime = ~0U, + .pref_lifetime = ~0U, + }, + .prefix = c->ip6.addr, + .source_ll = { + .header = { + .type = OPT_SRC_L2_ADDR, + .len = 1, + }, + }, + }; + unsigned char *ptr = NULL; - if (c->no_ra) - return 1; + ptr = &ra.var[0]; - info("NDP: received RS, sending RA"); - ihr->icmp6_type = RA; - ihr->icmp6_code = 0; - ihr->icmp6_hop_limit = 255; - ihr->icmp6_rt_lifetime = htons(65535); /* RFC 8319 */ - ihr->icmp6_addrconf_managed = 1; - - p = (unsigned char *)(ihr + 1); - p += 8; /* reachable, retrans time */ - *p++ = 3; /* prefix */ - *p++ = 4; /* length */ - *p++ = 64; /* prefix length */ - *p++ = 0xc0; /* prefix flags: L, A */ - *(uint32_t *)p = (uint32_t)~0U; /* lifetime */ - p += 4; - *(uint32_t *)p = (uint32_t)~0U; /* preferred lifetime */ - p += 8; - memcpy(p, &c->ip6.addr, 8); /* prefix */ - p += 16; - - if (c->mtu != -1) { - *p++ = 5; /* type */ - *p++ = 1; /* length */ - p += 2; /* reserved */ - *(uint32_t *)p = htonl(c->mtu); /* MTU */ - p += 4; - } + if (c->mtu) { + struct opt_mtu *mtu = (struct opt_mtu *)ptr; + *mtu = (struct opt_mtu) { + .header = { + .type = OPT_MTU, + .len = 1, + }, + .value = htonl(c->mtu), + }; + ptr += sizeof(struct opt_mtu); + } - if (c->no_dhcp_dns) - goto dns_done; + if (!c->no_dhcp_dns) { + size_t 
dns_s_len = 0; + int i, n; for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns[n]); n++); if (n) { - *p++ = 25; /* RDNSS */ - *p++ = 1 + 2 * n; /* length */ - p += 2; /* reserved */ - *(uint32_t *)p = (uint32_t)~0U; /* lifetime */ - p += 4; - + struct opt_rdnss *rdnss = (struct opt_rdnss *)ptr; + *rdnss = (struct opt_rdnss) { + .header = { + .type = OPT_RDNSS_TYPE, + .len = 1 + 2 * n, + }, + .lifetime = ~0U, + }; for (i = 0; i < n; i++) { - memcpy(p, &c->ip6.dns[i], 16); /* address */ - p += 16; + rdnss->dns[i] = c->ip6.dns[i]; } + ptr += offsetof(struct opt_rdnss, dns) + + i * sizeof(rdnss->dns[0]); for (n = 0; *c->dns_search[n].n; n++) dns_s_len += strlen(c->dns_search[n].n) + 2; } if (!c->no_dhcp_dns_search && dns_s_len) { - *p++ = 31; /* DNSSL */ - *p++ = (dns_s_len + 8 - 1) / 8 + 1; /* length */ - p += 2; /* reserved */ - *(uint32_t *)p = (uint32_t)~0U; /* lifetime */ - p += 4; + struct opt_dnssl *dnssl = (struct opt_dnssl *)ptr; + *dnssl = (struct opt_dnssl) { + .header = { + .type = OPT_DNSSL_TYPE, + .len = DIV_ROUND_UP(dns_s_len, 8) + 1, + }, + .lifetime = ~0U, + }; + ptr = dnssl->domains; for (i = 0; i < n; i++) { + size_t len; char *dot; - *(p++) = '.'; + *(ptr++) = '.'; + + len = sizeof(dnssl->domains) - + (ptr - dnssl->domains); - strncpy((char *)p, c->dns_search[i].n, - sizeof(buf) - - ((intptr_t)p - (intptr_t)buf)); - for (dot = (char *)p - 1; *dot; dot++) { + strncpy((char *)ptr, c->dns_search[i].n, len); + for (dot = (char *)ptr - 1; *dot; dot++) { if (*dot == '.') *dot = strcspn(dot + 1, "."); } - p += strlen(c->dns_search[i].n); - *(p++) = 0; + ptr += strlen(c->dns_search[i].n); + *(ptr++) = 0; } - memset(p, 0, 8 - dns_s_len % 8); /* padding */ - p += 8 - dns_s_len % 8; + memset(ptr, 0, 8 - dns_s_len % 8); /* padding */ + ptr += 8 - dns_s_len % 8; } + } + + memcpy(&ra.source_ll.mac, c->our_tap_mac, ETH_ALEN); -dns_done: - *p++ = 1; /* source ll */ - *p++ = 1; /* length */ - memcpy(p, c->mac, ETH_ALEN); - p += 6; - } else { + /* 
NOLINTNEXTLINE(clang-analyzer-security.PointerSub) */ + ndp_send(c, dst, &ra, ptr - (unsigned char *)&ra); +} + +/** + * ndp() - Check for NDP solicitations, reply as needed + * @c: Execution context + * @saddr: Source IPv6 address + * @p: Packet pool + * + * Return: 0 if not handled here, 1 if handled, -1 on failure + */ +int ndp(const struct ctx *c, const struct icmp6hdr *ih, + const struct in6_addr *saddr, const struct pool *p) +{ + if (ih->icmp6_type < RS || ih->icmp6_type > NA) + return 0; + + if (c->no_ndp) return 1; - } - len = (uintptr_t)p - (uintptr_t)ihr - sizeof(*ihr); + if (ih->icmp6_type == NS) { + const struct ndp_ns *ns; + + ns = packet_get(p, 0, 0, sizeof(struct ndp_ns), NULL); + if (!ns) + return -1; - if (IN6_IS_ADDR_LINKLOCAL(saddr)) - c->ip6.addr_ll_seen = *saddr; - else - c->ip6.addr_seen = *saddr; + if (IN6_IS_ADDR_UNSPECIFIED(saddr)) + return 1; + + info("NDP: received NS, sending NA"); - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - rsaddr = &c->ip6.gw; - else - rsaddr = &c->ip6.addr_ll; + ndp_na(c, saddr, &ns->target_addr); + } else if (ih->icmp6_type == RS) { + if (c->no_ra) + return 1; - tap_icmp6_send(c, rsaddr, saddr, ihr, len + sizeof(*ihr)); + info("NDP: received RS, sending RA"); + ndp_ra(c, saddr); + } return 1; } + +/* Default interval between unsolicited RAs (seconds) */ +#define DEFAULT_MAX_RTR_ADV_INTERVAL 600 /* RFC 4861, 6.2.1 */ + +/* Minimum required interval between RAs (seconds) */ +#define MIN_DELAY_BETWEEN_RAS 3 /* RFC 4861, 10 */ + +static time_t next_ra; + +/** + * ndp_timer() - Send unsolicited NDP messages if necessary + * @c: Execution context + * @now: Current (monotonic) time + */ +void ndp_timer(const struct ctx *c, const struct timespec *now) +{ + time_t max_rtr_adv_interval = DEFAULT_MAX_RTR_ADV_INTERVAL; + time_t min_rtr_adv_interval, interval; + + if (c->fd_tap < 0 || c->no_ra || now->tv_sec < next_ra) + return; + + /* We must advertise before the route's lifetime expires */ + max_rtr_adv_interval = 
MIN(max_rtr_adv_interval, RT_LIFETIME - 1); + + /* But we must not go smaller than the minimum delay */ + max_rtr_adv_interval = MAX(max_rtr_adv_interval, MIN_DELAY_BETWEEN_RAS); + + /* RFC 4861, 6.2.1 */ + min_rtr_adv_interval = MAX(max_rtr_adv_interval / 3, + MIN_DELAY_BETWEEN_RAS); + + /* As required by RFC 4861, we randomise the interval between + * unsolicited RAs. This is to prevent multiple routers on a link + * getting synchronised (e.g. after booting a bunch of routers at once) + * and causing flurries of RAs at the same time. + * + * This random doesn't need to be cryptographically strong, so random(3) + * is fine. Other routers on the link also want to avoid + * synchronisation, and anything malicious has much easier ways to cause + * trouble. + * + * The modulus also makes this not strictly a uniform distribution, but, + * again, it's close enough for our purposes. + */ + interval = min_rtr_adv_interval + + random() % (max_rtr_adv_interval - min_rtr_adv_interval); + + if (!next_ra) + goto first; + + info("NDP: sending unsolicited RA, next in %llds", (long long)interval); + + ndp_ra(c, &in6addr_ll_all_nodes); + +first: + next_ra = now->tv_sec + interval; +} @@ -6,6 +6,10 @@ #ifndef NDP_H #define NDP_H -int ndp(struct ctx *c, const struct icmp6hdr *ih, const struct in6_addr *saddr); +struct icmp6hdr; + +int ndp(const struct ctx *c, const struct icmp6hdr *ih, + const struct in6_addr *saddr, const struct pool *p); +void ndp_timer(const struct ctx *c, const struct timespec *now); #endif /* NDP_H */ @@ -33,8 +33,13 @@ #include "util.h" #include "passt.h" #include "log.h" +#include "ip.h" #include "netlink.h" +/* Same as RTA_NEXT() but for nexthops: RTNH_NEXT() doesn't take 'attrlen' */ +#define RTNH_NEXT_AND_DEC(rtnh, attrlen) \ + ((attrlen) -= RTNH_ALIGN((rtnh)->rtnh_len), RTNH_NEXT(rtnh)) + /* Netlink expects a buffer of at least 8kiB or the system page size, * whichever is larger. 32kiB is recommended for more efficient. 
* Since the largest page size on any remotely common Linux setup is @@ -128,7 +133,7 @@ static uint32_t nl_send(int s, void *req, uint16_t type, n = send(s, req, len, 0); if (n < 0) - die("netlink: Failed to send(): %s", strerror(errno)); + die_perror("netlink: Failed to send()"); else if (n < len) die("netlink: Short send (%zd of %zd bytes)", n, len); @@ -184,7 +189,7 @@ static struct nlmsghdr *nl_next(int s, char *buf, struct nlmsghdr *nh, ssize_t * *n = recv(s, buf, NLBUFSIZ, 0); if (*n < 0) - die("netlink: Failed to recv(): %s", strerror(errno)); + die_perror("netlink: Failed to recv()"); nh = (struct nlmsghdr *)buf; if (!NLMSG_OK(nh, *n)) @@ -264,12 +269,12 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) size_t na; /* Look for an interface with a default route first, failing that, look - * for any interface with a route, and pick it only if it's the only - * interface with a route. + * for any interface with a route, and pick the first one, if any. */ seq = nl_send(s, &req, RTM_GETROUTE, NLM_F_DUMP, sizeof(req)); nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWROUTE) { struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh); + const void *dst = NULL; unsigned thisifi = 0; if (rtm->rtm_family != af) @@ -284,12 +289,27 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) rtnh = (struct rtnexthop *)RTA_DATA(rta); thisifi = rtnh->rtnh_ifindex; + } else if (rta->rta_type == RTA_DST) { + dst = RTA_DATA(rta); } } if (!thisifi) continue; /* No interface for this route */ + /* Skip 'lo': we should test IFF_LOOPBACK, but keep it simple */ + if (thisifi == 1) + continue; + + /* Skip routes to link-local addresses */ + if (af == AF_INET && dst && + IN4_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len)) + continue; + + if (af == AF_INET6 && dst && + IN6_IS_PREFIX_LINKLOCAL(dst, rtm->rtm_dst_len)) + continue; + if (rtm->rtm_dst_len == 0) { /* Default route */ ndef++; @@ -304,25 +324,26 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) } if (status < 0) - warn("netlink: 
RTM_GETROUTE failed: %s", strerror(-status)); + warn("netlink: RTM_GETROUTE failed: %s", strerror_(-status)); if (defifi) { - if (ndef > 1) + if (ndef > 1) { info("Multiple default %s routes, picked first", - af == AF_INET ? "IPv4" : "IPv6"); + af_name(af)); + } return defifi; } if (anyifi) { - if (nany == 1) - return anyifi; - - info("Multiple interfaces with %s routes, use -i to select one", - af == AF_INET ? "IPv4" : "IPv6"); + if (nany > 1) { + info("Multiple interfaces with %s routes, picked first", + af_name(af)); + } + return anyifi; } if (!nany) - info("No interfaces with %s routes", af == AF_INET ? "IPv4" : "IPv6"); + info("No interfaces with usable %s routes", af_name(af)); return 0; } @@ -334,14 +355,15 @@ unsigned int nl_get_ext_if(int s, sa_family_t af) * * Return: true if a gateway was found, false otherwise */ -bool nl_route_get_def_multipath(struct rtattr *rta, void *gw) +static bool nl_route_get_def_multipath(struct rtattr *rta, void *gw) { + int nh_len = RTA_PAYLOAD(rta); struct rtnexthop *rtnh; bool found = false; int hops = -1; for (rtnh = (struct rtnexthop *)RTA_DATA(rta); - RTNH_OK(rtnh, RTA_PAYLOAD(rta)); rtnh = RTNH_NEXT(rtnh)) { + RTNH_OK(rtnh, nh_len); rtnh = RTNH_NEXT_AND_DEC(rtnh, nh_len)) { size_t len = rtnh->rtnh_len - sizeof(*rtnh); struct rtattr *rta_inner; @@ -536,32 +558,76 @@ int nl_route_dup(int s_src, unsigned int ifi_src, NLMSG_OK(nh, left) && (status = nl_status(nh, left, seq)) > 0; nh = NLMSG_NEXT(nh, left)) { struct rtmsg *rtm = (struct rtmsg *)NLMSG_DATA(nh); + bool discard = false; struct rtattr *rta; size_t na; if (nh->nlmsg_type != RTM_NEWROUTE) continue; - dup_routes++; - for (rta = RTM_RTA(rtm), na = RTM_PAYLOAD(nh); RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) { + /* RTA_OIF and RTA_MULTIPATH attributes carry the + * identifier of a host interface. If they match the + * host interface we're copying from, change them to + * match the corresponding identifier in the target + * namespace. 
+ * + * If RTA_OIF doesn't match (NETLINK_GET_STRICT_CHK not + * available), or if any interface index in nexthop + * objects differ from the host interface, discard the + * route altogether. + */ if (rta->rta_type == RTA_OIF) { - /* The host obviously list's the host interface - * id here, we need to change it to the - * namespace's interface id - */ + if (*(unsigned int *)RTA_DATA(rta) != ifi_src) { + discard = true; + break; + } + *(unsigned int *)RTA_DATA(rta) = ifi_dst; - } else if (rta->rta_type == RTA_PREFSRC) { - /* Host routes might include a preferred source - * address, which must be one of the host's - * addresses. However, with -a pasta will use a - * different namespace address, making such a - * route invalid in the namespace. Strip off - * RTA_PREFSRC attributes to avoid that. */ + } else if (rta->rta_type == RTA_MULTIPATH) { + int nh_len = RTA_PAYLOAD(rta); + struct rtnexthop *rtnh; + + for (rtnh = (struct rtnexthop *)RTA_DATA(rta); + RTNH_OK(rtnh, nh_len); + rtnh = RTNH_NEXT_AND_DEC(rtnh, nh_len)) { + int src = (int)ifi_src; + + if (rtnh->rtnh_ifindex != src) { + discard = true; + break; + } + + rtnh->rtnh_ifindex = ifi_dst; + } + + if (discard) + break; + } else if (rta->rta_type == RTA_PREFSRC || + rta->rta_type == RTA_NH_ID) { + /* Strip RTA_PREFSRC attributes: host routes + * might include a preferred source address, + * which must be one of the host's addresses. + * However, with -a, pasta will use a different + * namespace address, making such a route + * invalid in the namespace. + * + * Strip RTA_NH_ID attributes: host routes set + * up via routing protocols (e.g. OSPF) might + * contain a nexthop ID (and not nexthop + * objects, which are taken care of in the + * RTA_MULTIPATH case above) that's not valid + * in the target namespace. 
+ */ rta->rta_type = RTA_UNSPEC; } } + + if (discard) + nh->nlmsg_type = NLMSG_NOOP; + else + dup_routes++; } if (!NLMSG_OK(nh, left)) { @@ -602,7 +668,8 @@ int nl_route_dup(int s_src, unsigned int ifi_src, rc = nl_do(s_dst, nh, RTM_NEWROUTE, (flags & ~NLM_F_DUMP_FILTERED) | NLM_F_CREATE, nh->nlmsg_len); - if (rc < 0 && rc != -ENETUNREACH && rc != -EEXIST) + if (rc < 0 && rc != -EEXIST && + rc != -ENETUNREACH && rc != -EHOSTUNREACH) return rc; } } @@ -611,6 +678,63 @@ int nl_route_dup(int s_src, unsigned int ifi_src, } /** + * nl_addr_set_ll_nodad() - Set IFA_F_NODAD on IPv6 link-local addresses + * @s: Netlink socket + * @ifi: Interface index in target namespace + * + * Return: 0 on success, negative error code on failure + */ +int nl_addr_set_ll_nodad(int s, unsigned int ifi) +{ + struct req_t { + struct nlmsghdr nlh; + struct ifaddrmsg ifa; + } req = { + .ifa.ifa_family = AF_INET6, + .ifa.ifa_index = ifi, + }; + uint32_t seq, last_seq = 0; + ssize_t status, ret = 0; + struct nlmsghdr *nh; + char buf[NLBUFSIZ]; + + seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req)); + nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) { + struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh); + struct rtattr *rta; + size_t na; + + if (ifa->ifa_index != ifi || ifa->ifa_scope != RT_SCOPE_LINK) + continue; + + ifa->ifa_flags |= IFA_F_NODAD; + + for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na); + rta = RTA_NEXT(rta, na)) { + /* If 32-bit flags are used, add IFA_F_NODAD there */ + if (rta->rta_type == IFA_FLAGS) + *(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD; + } + + last_seq = nl_send(s, nh, RTM_NEWADDR, NLM_F_REPLACE, + nh->nlmsg_len); + } + + if (status < 0) + ret = status; + + for (seq = seq + 1; seq <= last_seq; seq++) { + nl_foreach(nh, status, s, buf, seq) + warn("netlink: Unexpected response message"); + + if (!ret && status < 0) + ret = status; + } + + return ret; +} + +/** * nl_addr_get() - Get most specific global address, given interface and 
family * @s: Netlink socket * @ifi: Interface index in outer network namespace @@ -619,7 +743,7 @@ int nl_route_dup(int s_src, unsigned int ifi_src, * @prefix_len: Mask or prefix length, to fill (for IPv4) * @addr_l: Link-scoped address to fill (for IPv6) * - * Return: 9 on success, negative error code on failure + * Return: 0 on success, negative error code on failure */ int nl_addr_get(int s, unsigned int ifi, sa_family_t af, void *addr, int *prefix_len, void *addr_l) @@ -643,12 +767,13 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af, struct rtattr *rta; size_t na; - if (ifa->ifa_index != ifi) + if (ifa->ifa_index != ifi || ifa->ifa_flags & IFA_F_DEPRECATED) continue; for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) { - if (rta->rta_type != IFA_ADDRESS) + if ((af == AF_INET && rta->rta_type != IFA_LOCAL) || + (af == AF_INET6 && rta->rta_type != IFA_ADDRESS)) continue; if (af == AF_INET && ifa->ifa_prefixlen > prefix_max) { @@ -676,7 +801,54 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af, } /** - * nl_add_set() - Set IP addresses for given interface and address family + * nl_addr_get_ll() - Get first IPv6 link-local address for a given interface + * @s: Netlink socket + * @ifi: Interface index in outer network namespace + * @addr: Link-local address to fill + * + * Return: 0 on success, negative error code on failure + */ +int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr) +{ + struct req_t { + struct nlmsghdr nlh; + struct ifaddrmsg ifa; + } req = { + .ifa.ifa_family = AF_INET6, + .ifa.ifa_index = ifi, + }; + struct nlmsghdr *nh; + bool found = false; + char buf[NLBUFSIZ]; + ssize_t status; + uint32_t seq; + + seq = nl_send(s, &req, RTM_GETADDR, NLM_F_DUMP, sizeof(req)); + nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWADDR) { + struct ifaddrmsg *ifa = (struct ifaddrmsg *)NLMSG_DATA(nh); + struct rtattr *rta; + size_t na; + + if (ifa->ifa_index != ifi || ifa->ifa_scope != 
RT_SCOPE_LINK || + found) + continue; + + for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na); + rta = RTA_NEXT(rta, na)) { + if (rta->rta_type != IFA_ADDRESS) + continue; + + if (!found) { + memcpy(addr, RTA_DATA(rta), RTA_PAYLOAD(rta)); + found = true; + } + } + } + return status; +} + +/** + * nl_addr_set() - Set IP addresses for given interface and address family * @s: Netlink socket * @ifi: Interface index * @af: Address family @@ -779,10 +951,13 @@ int nl_addr_dup(int s_src, unsigned int ifi_src, ifa = (struct ifaddrmsg *)NLMSG_DATA(nh); if (rc < 0 || ifa->ifa_scope == RT_SCOPE_LINK || - ifa->ifa_index != ifi_src) + ifa->ifa_index != ifi_src || + ifa->ifa_flags & IFA_F_DEPRECATED) continue; ifa->ifa_index = ifi_dst; + /* Same as nl_addr_set(), but here it's more than a default */ + ifa->ifa_flags |= IFA_F_NODAD; for (rta = IFA_RTA(ifa), na = IFA_PAYLOAD(nh); RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) { @@ -790,6 +965,10 @@ int nl_addr_dup(int s_src, unsigned int ifi_src, if (rta->rta_type == IFA_LABEL || rta->rta_type == IFA_CACHEINFO) rta->rta_type = IFA_UNSPEC; + + /* If 32-bit flags are used, add IFA_F_NODAD there */ + if (rta->rta_type == IFA_FLAGS) + *(uint32_t *)RTA_DATA(rta) |= IFA_F_NODAD; } rc = nl_do(s_dst, nh, RTM_NEWADDR, @@ -845,7 +1024,6 @@ int nl_link_get_mac(int s, unsigned int ifi, void *mac) /** * nl_link_set_mac() - Set link MAC address * @s: Netlink socket - * @ns: Use netlink socket in namespace * @ifi: Interface index * @mac: MAC address to set * @@ -871,14 +1049,14 @@ int nl_link_set_mac(int s, unsigned int ifi, const void *mac) } /** - * nl_link_up() - Bring link up + * nl_link_set_mtu() - Set link MTU * @s: Netlink socket * @ifi: Interface index - * @mtu: If non-zero, set interface MTU + * @mtu: Interface MTU * * Return: 0 on success, negative error code on failure */ -int nl_link_up(int s, unsigned int ifi, int mtu) +int nl_link_set_mtu(int s, unsigned int ifi, int mtu) { struct req_t { struct nlmsghdr nlh; @@ -888,17 
+1066,35 @@ int nl_link_up(int s, unsigned int ifi, int mtu) } req = { .ifm.ifi_family = AF_UNSPEC, .ifm.ifi_index = ifi, - .ifm.ifi_flags = IFF_UP, - .ifm.ifi_change = IFF_UP, .rta.rta_type = IFLA_MTU, .rta.rta_len = RTA_LENGTH(sizeof(unsigned int)), .mtu = mtu, }; - ssize_t len = sizeof(req); - if (!mtu) - /* Shorten request to drop MTU attribute */ - len = offsetof(struct req_t, rta); + return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req)); +} + +/** + * nl_link_set_flags() - Set link flags + * @s: Netlink socket + * @ifi: Interface index + * @set: Device flags to set + * @change: Mask of device flag changes + * + * Return: 0 on success, negative error code on failure + */ +int nl_link_set_flags(int s, unsigned int ifi, + unsigned int set, unsigned int change) +{ + struct req_t { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + } req = { + .ifm.ifi_family = AF_UNSPEC, + .ifm.ifi_index = ifi, + .ifm.ifi_flags = set, + .ifm.ifi_change = change, + }; - return nl_do(s, &req, RTM_NEWLINK, 0, len); + return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req)); } @@ -19,10 +19,14 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af, void *addr, int *prefix_len, void *addr_l); int nl_addr_set(int s, unsigned int ifi, sa_family_t af, const void *addr, int prefix_len); +int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr); +int nl_addr_set_ll_nodad(int s, unsigned int ifi); int nl_addr_dup(int s_src, unsigned int ifi_src, int s_dst, unsigned int ifi_dst, sa_family_t af); int nl_link_get_mac(int s, unsigned int ifi, void *mac); int nl_link_set_mac(int s, unsigned int ifi, const void *mac); -int nl_link_up(int s, unsigned int ifi, int mtu); +int nl_link_set_mtu(int s, unsigned int ifi, int mtu); +int nl_link_set_flags(int s, unsigned int ifi, + unsigned int set, unsigned int change); #endif /* NETLINK_H */ @@ -23,11 +23,73 @@ #include "log.h" /** + * packet_check_range() - Check if a memory range is valid for a pool + * @p: Packet pool + * @ptr: Start of desired 
data range + * @len: Length of desired data range + * @func: For tracing: name of calling function + * @line: For tracing: caller line of function call + * + * Return: 0 if the range is valid, -1 otherwise + */ +static int packet_check_range(const struct pool *p, const char *ptr, size_t len, + const char *func, int line) +{ + if (len > PACKET_MAX_LEN) { + debug("packet range length %zu (max %zu), %s:%i", + len, PACKET_MAX_LEN, func, line); + return -1; + } + + if (p->buf_size == 0) { + int ret; + + ret = vu_packet_check_range((void *)p->buf, ptr, len); + + if (ret == -1) + debug("cannot find region, %s:%i", func, line); + + return ret; + } + + if (ptr < p->buf) { + debug("packet range start %p before buffer start %p, %s:%i", + (void *)ptr, (void *)p->buf, func, line); + return -1; + } + + if (len > p->buf_size) { + debug("packet range length %zu larger than buffer %zu, %s:%i", + len, p->buf_size, func, line); + return -1; + } + + if ((size_t)(ptr - p->buf) > p->buf_size - len) { + debug("packet range %p, len %zu after buffer end %p, %s:%i", + (void *)ptr, len, (void *)(p->buf + p->buf_size), + func, line); + return -1; + } + + return 0; +} +/** + * pool_full() - Is a packet pool full? 
+ * @p: Pointer to packet pool + * + * Return: true if the pool is full, false if more packets can be added + */ +bool pool_full(const struct pool *p) +{ + return p->count >= p->size; +} + +/** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool * @len: Length of new descriptor * @start: Start of data - * @func: For tracing: name of calling function, NULL means no trace() + * @func: For tracing: name of calling function * @line: For tracing: caller line of function call */ void packet_add_do(struct pool *p, size_t len, const char *start, @@ -35,97 +97,88 @@ void packet_add_do(struct pool *p, size_t len, const char *start, { size_t idx = p->count; - if (idx >= p->size) { - trace("add packet index %zu to pool with size %zu, %s:%i", + if (pool_full(p)) { + debug("add packet index %zu to pool with size %zu, %s:%i", idx, p->size, func, line); return; } - if (start < p->buf) { - trace("add packet start %p before buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } - - if (start + len > p->buf + p->buf_size) { - trace("add packet start %p, length: %zu, buffer end %p, %s:%i", - (void *)start, len, (void *)(p->buf + p->buf_size), - func, line); - return; - } - - if (len > UINT16_MAX) { - trace("add packet length %zu, %s:%i", len, func, line); + if (packet_check_range(p, start, len, func, line)) return; - } - -#if UINTPTR_MAX == UINT64_MAX - if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { - trace("add packet start %p, buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } -#endif - p->pkt[idx].offset = start - p->buf; - p->pkt[idx].len = len; + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len; p->count++; } /** - * packet_get_do() - Get data range from packet descriptor from given pool + * packet_get_try_do() - Get data range from packet descriptor from given pool * @p: Packet pool * @idx: Index of packet descriptor in pool * @offset: Offset of data range 
in packet descriptor * @len: Length of desired data range * @left: Length of available data after range, set on return, can be NULL - * @func: For tracing: name of calling function, NULL means no trace() + * @func: For tracing: name of calling function * @line: For tracing: caller line of function call * * Return: pointer to start of data range, NULL on invalid range or descriptor */ -void *packet_get_do(const struct pool *p, size_t idx, size_t offset, - size_t len, size_t *left, const char *func, int line) +void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset, + size_t len, size_t *left, const char *func, int line) { - if (idx >= p->size || idx >= p->count) { - if (func) { - trace("packet %zu from pool size: %zu, count: %zu, " - "%s:%i", idx, p->size, p->count, func, line); - } - return NULL; - } + char *ptr; - if (len > UINT16_MAX || len + offset > UINT32_MAX) { - if (func) { - trace("packet data length %zu, offset %zu, %s:%i", - len, offset, func, line); - } - return NULL; - } + ASSERT_WITH_MSG(p->count <= p->size, + "Corrupt pool count: %zu, size: %zu, %s:%i", + p->count, p->size, func, line); - if (p->pkt[idx].offset + len + offset > p->buf_size) { - if (func) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", p->pkt[idx].offset + len + offset, - p->buf_size, func, line); - } + if (idx >= p->count) { + debug("packet %zu from pool count: %zu, %s:%i", + idx, p->count, func, line); return NULL; } - if (len + offset > p->pkt[idx].len) { - if (func) { - trace("data length %zu, offset %zu from length %u, " - "%s:%i", len, offset, p->pkt[idx].len, - func, line); - } + if (offset > p->pkt[idx].iov_len || + len > (p->pkt[idx].iov_len - offset)) return NULL; - } + + ptr = (char *)p->pkt[idx].iov_base + offset; + + ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line), + "Corrupt packet pool, %s:%i", func, line); if (left) - *left = p->pkt[idx].len - offset - len; + *left = p->pkt[idx].iov_len - offset - len; + + return ptr; +} 
+ +/** + * packet_get_do() - Get data range from packet descriptor from given pool + * @p: Packet pool + * @idx: Index of packet descriptor in pool + * @offset: Offset of data range in packet descriptor + * @len: Length of desired data range + * @left: Length of available data after range, set on return, can be NULL + * @func: For tracing: name of calling function + * @line: For tracing: caller line of function call + * + * Return: as packet_get_try_do() but log a trace message when returning NULL + */ +void *packet_get_do(const struct pool *p, const size_t idx, + size_t offset, size_t len, size_t *left, + const char *func, int line) +{ + void *r = packet_get_try_do(p, idx, offset, len, left, func, line); + + if (!r) { + trace("missing packet data length %zu, offset %zu from " + "length %zu, %s:%i", + len, offset, p->pkt[idx].iov_len, func, line); + } - return p->buf + p->pkt[idx].offset + offset; + return r; } /** @@ -6,20 +6,17 @@ #ifndef PACKET_H #define PACKET_H -/** - * struct desc - Generic offset-based descriptor within buffer - * @offset: Offset of descriptor relative to buffer start, 32-bit limit - * @len: Length of descriptor, host order, 16-bit limit - */ -struct desc { - uint32_t offset; - uint16_t len; -}; +#include <stdbool.h> + +/* Maximum size of a single packet stored in pool, including headers */ +#define PACKET_MAX_LEN ((size_t)UINT16_MAX) /** * struct pool - Generic pool of packets stored in a buffer - * @buf: Buffer storing packet descriptors - * @buf_size: Total size of buffer + * @buf: Buffer storing packet descriptors, + * a struct vu_dev_region array for passt vhost-user mode + * @buf_size: Total size of buffer, + * 0 for passt vhost-user mode * @size: Number of usable descriptors for the pool * @count: Number of used descriptors for the pool * @pkt: Descriptors: see macros below @@ -29,32 +26,36 @@ struct pool { size_t buf_size; size_t size; size_t count; - struct desc pkt[1]; + struct iovec pkt[]; }; +int vu_packet_check_range(void *buf, 
const char *ptr, size_t len); void packet_add_do(struct pool *p, size_t len, const char *start, const char *func, int line); +void *packet_get_try_do(const struct pool *p, const size_t idx, + size_t offset, size_t len, size_t *left, + const char *func, int line); void *packet_get_do(const struct pool *p, const size_t idx, size_t offset, size_t len, size_t *left, const char *func, int line); +bool pool_full(const struct pool *p); void pool_flush(struct pool *p); #define packet_add(p, len, start) \ packet_add_do(p, len, start, __func__, __LINE__) +#define packet_get_try(p, idx, offset, len, left) \ + packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__) #define packet_get(p, idx, offset, len, left) \ packet_get_do(p, idx, offset, len, left, __func__, __LINE__) -#define packet_get_try(p, idx, offset, len, left) \ - packet_get_do(p, idx, offset, len, left, NULL, 0) - #define PACKET_POOL_DECL(_name, _size, _buf) \ struct _name ## _t { \ char *buf; \ size_t buf_size; \ size_t size; \ size_t count; \ - struct desc pkt[_size]; \ + struct iovec pkt[_size]; \ } #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \ diff --git a/passt-repair.1 b/passt-repair.1 new file mode 100644 index 0000000..e65aadd --- /dev/null +++ b/passt-repair.1 @@ -0,0 +1,74 @@ +.\" SPDX-License-Identifier: GPL-2.0-or-later +.\" Copyright (c) 2025 Red Hat GmbH +.\" Author: Stefano Brivio <sbrivio@redhat.com> +.TH passt-repair 1 + +.SH NAME +.B passt-repair +\- Helper setting TCP_REPAIR socket options for \fBpasst\fR(1) + +.SH SYNOPSIS +.B passt-repair +\fIPATH\fR + +.SH DESCRIPTION + +.B passt-repair +is a privileged helper setting and clearing repair mode on TCP sockets on behalf +of \fBpasst\fR(1), as instructed via single-byte commands over a UNIX domain +socket. 
+ +It can be used to migrate TCP connections between guests without granting +additional capabilities to \fBpasst\fR(1) itself: to migrate TCP connections, +\fBpasst\fR(1) leverages repair mode, which needs the \fBCAP_NET_ADMIN\fR +capability (see \fBcapabilities\fR(7)) to be set or cleared. + +If \fIPATH\fR represents a UNIX domain socket, \fBpasst-repair\fR(1) attempts to +connect to it. If it is a directory, \fBpasst-repair\fR(1) waits until a file +ending with \fI.repair\fR appears in it, and then attempts to connect to it. + +.SH PROTOCOL + +\fBpasst-repair\fR(1) connects to \fBpasst\fR(1) using the socket specified via +\fI--repair-path\fR option in \fBpasst\fR(1) itself. By default, the name is the +same as the UNIX domain socket used for guest communication, suffixed by +\fI.repair\fR. + +The messages consist of one 8-bit signed integer that can be \fITCP_REPAIR_ON\fR +(1), \fITCP_REPAIR_OFF\fR (0), or \fITCP_REPAIR_OFF_NO_WP\fR (-1), as defined by +the Linux kernel user API, and one to SCM_MAX_FD (253) sockets as SCM_RIGHTS +(see \fBunix\fR(7)) ancillary message, sent by the server, \fBpasst\fR(1). + +The client, \fBpasst-repair\fR(1), replies with the same byte (and no ancillary +message) to indicate success, and closes the connection on failure. + +The server closes the connection on error or completion. + +.SH NOTES + +\fBpasst-repair\fR(1) can be granted the \fBCAP_NET_ADMIN\fR capability +(preferred, as it limits privileges to the strictly necessary ones), or it can +be run as root. + +.SH AUTHOR + +Stefano Brivio <sbrivio@redhat.com>. + +.SH REPORTING BUGS + +Please report issues on the bug tracker at https://bugs.passt.top/, or +send a message to the passt-user@passt.top mailing list, see +https://lists.passt.top/. + +.SH COPYRIGHT + +Copyright (c) 2025 Red Hat GmbH. 
+ +\fBpasst-repair\fR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by the Free +Software Foundation, either version 2 of the License, or (at your option) any +later version. + +.SH SEE ALSO + +\fBpasst\fR(1), \fBqemu\fR(1), \fBcapabilities\fR(7), \fBunix\fR(7). diff --git a/passt-repair.c b/passt-repair.c new file mode 100644 index 0000000..8c59d7e --- /dev/null +++ b/passt-repair.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * passt-repair.c - Privileged helper to set/clear TCP_REPAIR on sockets + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + * + * Connect to passt via UNIX domain socket, receive sockets via SCM_RIGHTS along + * with byte commands mapping to TCP_REPAIR values, and switch repair mode on or + * off. Reply by echoing the command. Exit on EOF.
+ */ + +#include <sys/inotify.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/un.h> +#include <errno.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include <unistd.h> +#include <netdb.h> + +#include <netinet/tcp.h> + +#include <linux/audit.h> +#include <linux/capability.h> +#include <linux/filter.h> +#include <linux/seccomp.h> + +#include "seccomp_repair.h" + +#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ +#define REPAIR_EXT ".repair" +#define REPAIR_EXT_LEN strlen(REPAIR_EXT) + +/** + * main() - Entry point and whole program with loop + * @argc: Argument count, must be 2 + * @argv: Argument: path of UNIX domain socket to connect to + * + * Return: 0 on success (EOF), 1 on error, 2 on usage error + * + * #syscalls:repair connect setsockopt write close exit_group + * #syscalls:repair socket s390x:socketcall i686:socketcall + * #syscalls:repair recvfrom recvmsg arm:recv ppc64le:recv + * #syscalls:repair sendto sendmsg arm:send ppc64le:send + * #syscalls:repair stat|statx stat64|statx statx + * #syscalls:repair fstat|fstat64 newfstatat|fstatat64 + * #syscalls:repair inotify_init1 inotify_add_watch + */ +int main(int argc, char **argv) +{ + char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)] + __attribute__ ((aligned(__alignof__(struct cmsghdr)))); + struct sockaddr_un a = { AF_UNIX, "" }; + int fds[SCM_MAX_FD], s, ret, i, n = 0; + bool inotify_dir = false; + struct sock_fprog prog; + int8_t cmd = INT8_MAX; + struct cmsghdr *cmsg; + struct msghdr msg; + struct iovec iov; + size_t cmsg_len; + struct stat sb; + int op; + + prctl(PR_SET_DUMPABLE, 0); + + prog.len = (unsigned short)sizeof(filter_repair) / + sizeof(filter_repair[0]); + prog.filter = filter_repair; + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || + prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { + fprintf(stderr, "Failed to 
apply seccomp filter\n"); + _exit(1); + } + + iov = (struct iovec){ &cmd, sizeof(cmd) }; + msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0, + .msg_iov = &iov, .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = sizeof(buf), + .msg_flags = 0 }; + cmsg = CMSG_FIRSTHDR(&msg); + + if (argc != 2) { + fprintf(stderr, "Usage: %s PATH\n", argv[0]); + _exit(2); + } + + if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno); + _exit(1); + } + + if ((stat(argv[1], &sb))) { + fprintf(stderr, "Can't stat() %s: %i\n", argv[1], errno); + _exit(1); + } + + if ((sb.st_mode & S_IFMT) == S_IFDIR) { + char buf[sizeof(struct inotify_event) + NAME_MAX + 1] + __attribute__ ((aligned(__alignof__(struct inotify_event)))); + const struct inotify_event *ev = NULL; + char path[PATH_MAX + 1]; + bool found = false; + ssize_t n; + int fd; + + if ((fd = inotify_init1(IN_CLOEXEC)) < 0) { + fprintf(stderr, "inotify_init1: %i\n", errno); + _exit(1); + } + + if (inotify_add_watch(fd, argv[1], IN_CREATE) < 0) { + fprintf(stderr, "inotify_add_watch: %i\n", errno); + _exit(1); + } + + do { + char *p; + + n = read(fd, buf, sizeof(buf)); + if (n < 0) { + fprintf(stderr, "inotify read: %i\n", errno); + _exit(1); + } + buf[n - 1] = '\0'; + + if (n < (ssize_t)sizeof(*ev)) { + fprintf(stderr, "Short inotify read: %zi\n", n); + continue; + } + + for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) { + ev = (const struct inotify_event *)p; + + if (ev->len >= REPAIR_EXT_LEN && + !memcmp(ev->name + + strnlen(ev->name, ev->len) - + REPAIR_EXT_LEN, + REPAIR_EXT, REPAIR_EXT_LEN)) { + found = true; + break; + } + } + } while (!found); + + if (ev->len > NAME_MAX + 1 || ev->name[ev->len - 1] != '\0') { + fprintf(stderr, "Invalid filename from inotify\n"); + _exit(1); + } + + snprintf(path, sizeof(path), "%s/%s", argv[1], ev->name); + if ((stat(path, &sb))) { + fprintf(stderr, "Can't stat() %s: %i\n", path, errno); + _exit(1); + } + + ret = 
snprintf(a.sun_path, sizeof(a.sun_path), "%s", path); + inotify_dir = true; + } else { + ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); + } + + if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) { + fprintf(stderr, "Invalid socket path\n"); + _exit(2); + } + + if ((sb.st_mode & S_IFMT) != S_IFSOCK) { + fprintf(stderr, "%s is not a socket\n", a.sun_path); + _exit(2); + } + + while (connect(s, (struct sockaddr *)&a, sizeof(a))) { + if (inotify_dir && errno == ECONNREFUSED) + continue; + + fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path, + strerror(errno)); + _exit(1); + } + +loop: + ret = recvmsg(s, &msg, 0); + if (ret < 0) { + if (errno == ECONNRESET) { + ret = 0; + } else { + fprintf(stderr, "Failed to read message: %i\n", errno); + _exit(1); + } + } + + if (!ret) /* Done */ + _exit(0); + + if (!cmsg || + cmsg->cmsg_len < CMSG_LEN(sizeof(int)) || + cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) || + cmsg->cmsg_type != SCM_RIGHTS) { + fprintf(stderr, "No/bad ancillary data from peer\n"); + _exit(1); + } + + /* No inverse formula for CMSG_LEN(x), and building one with CMSG_LEN(0) + * works but there's no guarantee it does. Search the whole domain. 
+ */ + for (i = 1; i <= SCM_MAX_FD; i++) { + if (CMSG_LEN(sizeof(int) * i) == cmsg->cmsg_len) { + n = i; + break; + } + } + if (!n) { + cmsg_len = cmsg->cmsg_len; /* socklen_t is 'unsigned' on musl */ + fprintf(stderr, "Invalid ancillary data length %zu from peer\n", + cmsg_len); + _exit(1); + } + + memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n); + + if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF && + cmd != TCP_REPAIR_OFF_NO_WP) { + fprintf(stderr, "Unsupported command 0x%04x\n", cmd); + _exit(1); + } + + op = cmd; + + for (i = 0; i < n; i++) { + if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &op, sizeof(op))) { + fprintf(stderr, + "Setting TCP_REPAIR to %i on socket %i: %s\n", + op, fds[i], strerror(errno)); + _exit(1); + } + + /* Close _our_ copy */ + close(fds[i]); + } + + /* Confirm setting by echoing the command back */ + if (send(s, &cmd, sizeof(cmd), 0) < 0) { + fprintf(stderr, "Reply to %i: %s\n", op, strerror(errno)); + _exit(1); + } + + goto loop; + + return 0; +} @@ -73,6 +73,9 @@ for performance reasons. .SH OPTIONS +Unless otherwise noted below, \fBif conflicting or multiple options are given, +the last one takes effect.\fR + .TP .BR \-d ", " \-\-debug Be verbose, don't log to the system logger. @@ -92,14 +95,18 @@ detached PID namespace after starting, because the PID itself cannot change. Default is to fork into background. .TP -.BR \-e ", " \-\-stderr -Log to standard error too. -Default is to log to the system logger only, if started from an interactive -terminal, and to both system logger and standard error otherwise. +.BR \-e ", " \-\-stderr " " (DEPRECATED) +This option has no effect, and is maintained for compatibility purposes only. + +Note that this configuration option is \fBdeprecated\fR and will be removed in a +future version. .TP .BR \-l ", " \-\-log-file " " \fIPATH\fR -Log to file \fIPATH\fR, not to standard error, and not to the system logger. +Log to file \fIPATH\fR, and not to the system logger. 
+ +Specifying this option multiple times does \fInot\fR lead to multiple log files: +the last given option takes effect. .TP .BR \-\-log-size " " \fISIZE\fR @@ -128,6 +135,9 @@ Show version and exit. Capture tap-facing (that is, guest-side or namespace-side) network packets to \fIfile\fR in \fBpcap\fR format. +Specifying this option multiple times does \fInot\fR lead to multiple capture +files: the last given option takes effect. + .TP .BR \-P ", " \-\-pid " " \fIfile Write own PID to \fIfile\fR once initialisation is done, before forking to @@ -149,8 +159,10 @@ This option can be specified zero (for defaults) to two times (once for IPv4, once for IPv6). By default, assigned IPv4 and IPv6 addresses are taken from the host interfaces with the first default route, if any, for the corresponding IP version. If no -default routes are available and there is just one interface with any route, -that interface will be chosen instead. +default routes are available and there is any interface with any route for a +given IP version, the first of these interfaces will be chosen instead. If no +such interface exists, the link-local address 169.254.2.1 is assigned for IPv4, +and no additional address will be assigned for IPv6. .TP .BR \-n ", " \-\-netmask " " \fImask @@ -164,8 +176,7 @@ according to the CIDR block of the assigned address (RFC 4632). .BR \-M ", " \-\-mac-addr " " \fIaddr Use source MAC address \fIaddr\fR when communicating to the guest or to the target namespace. -Default is to use the MAC address of the interface with the first IPv4 default -route on the host. +Default is the locally administered MAC address 9a:55:9a:55:9a:55. .TP .BR \-g ", " \-\-gateway " " \fIaddr @@ -178,7 +189,9 @@ first default route, if any, for the corresponding IP version. If the default route is a multipath one, the gateway is the first nexthop router returned by the kernel which has the highest weight in the set of paths.
If no default routes are available and there is just one interface with any route, that -interface will be chosen instead. +interface will be chosen instead. If no such interface exists, the link-local +address 169.254.2.2 is used for IPv4, and the link-local address fe80::1 is used +for IPv6. Note: these addresses are also used as source address for packets directed to the guest or to the target namespace having a loopback or local source address, @@ -193,7 +206,9 @@ Default is to use the interfaces specified by \fB--outbound-if4\fR and If no interfaces are given, the interface with the first default routes for each IP version is selected. If no default routes are available and there is just one -interface with any route, that interface will be chosen instead. +interface with any route, that interface will be chosen instead. If no such +interface exists, host interfaces will be ignored for the purposes of assigning +addresses and routes, and link-local addresses will be used instead. .TP .BR \-o ", " \-\-outbound " " \fIaddr @@ -212,7 +227,8 @@ derive IPv4 addresses and routes. By default, the interface given by the default route is selected. If no default routes are available and there is just one interface with any route, that -interface will be chosen instead. +interface will be chosen instead. If no such interface exists, outbound sockets +will not be bound to any specific interface. .TP .BR \-\-outbound-if6 " " \fIname @@ -222,23 +238,37 @@ derive IPv6 addresses and routes. By default, the interface given by the default route is selected. If no default routes are available and there is just one interface with any route, that -interface will be chosen instead. +interface will be chosen instead. If no such interface exists, outbound sockets +will not be bound to any specific interface. 
.TP .BR \-D ", " \-\-dns " " \fIaddr -Use \fIaddr\fR (IPv4 or IPv6) for DHCP, DHCPv6, NDP or DNS forwarding, as -configured (see options \fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR, -\fB--dns-forward\fR) instead of reading addresses from \fI/etc/resolv.conf\fR. -This option can be specified multiple times. Specifying \fB-D none\fR disables -usage of DNS addresses altogether. +Instruct the guest (via DHCP, DHCPv6 or NDP) to use \fIaddr\fR (IPv4 +or IPv6) as a nameserver, as configured (see options +\fB--no-dhcp-dns\fR, \fB--dhcp-dns\fR) instead of reading addresses +from \fI/etc/resolv.conf\fR. This option can be specified multiple +times. Specifying \fB-D none\fR disables usage of DNS addresses +altogether. Unlike addresses from \fI/etc/resolv.conf\fR, \fIaddr\fR +is given to the guest without remapping. For example \fB--dns +127.0.0.1\fR will instruct the guest to use itself as nameserver, not +the host. .TP .BR \-\-dns-forward " " \fIaddr -Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the first -configured DNS resolver (with corresponding IP version). Mapping is limited to -UDP traffic directed to port 53, and DNS answers are translated back with a -reverse mapping. -This option can be specified zero to two times (once for IPv4, once for IPv6). +Map \fIaddr\fR (IPv4 or IPv6) as seen from guest or namespace to the +nameserver (with corresponding IP version) specified by the +\fB\-\-dns-host\fR option. Maps only UDP and TCP traffic to port 53 or +port 853. Replies are translated back with a reverse mapping. This +option can be specified zero to two times (once for IPv4, once for +IPv6). + +.TP +.BR \-\-dns-host " " \fIaddr +Configure the host nameserver which guest or namespace queries to the +\fB\-\-dns-forward\fR address will be redirected to. This option can +be specified zero to two times (once for IPv4, once for IPv6). +By default, the first nameserver from the host's +\fI/etc/resolv.conf\fR.
.TP .BR \-S ", " \-\-search " " \fIlist @@ -314,6 +344,30 @@ Disable Router Advertisements. Router Solicitations coming from guest or target namespace will be ignored. .TP +.BR \-\-freebind +Allow any binding address to be specified for \fB-t\fR and \fB-u\fR +options. Usually binding addresses must be addresses currently +configured on the host. With \fB\-\-freebind\fR, the +\fBIP_FREEBIND\fR or \fBIPV6_FREEBIND\fR socket option is enabled +allowing any address to be used. This is typically used to bind +addresses which might be configured on the host in future, at which +point the forwarding will immediately start operating. + +.TP +.BR \-\-map-host-loopback " " \fIaddr +Translate \fIaddr\fR to refer to the host. Packets from the guest to +\fIaddr\fR will be redirected to the host. On the host such packets +will appear to have both source and destination of 127.0.0.1 or ::1. + +If \fIaddr\fR is 'none', no address is mapped (this implies +\fB--no-map-gw\fR). Only one IPv4 and one IPv6 address can be +translated, if the option is specified multiple times, the last one +takes effect. + +Default is to translate the guest's default gateway address, unless +\fB--no-map-gw\fR is given, in which case no address is mapped. + +.TP .BR \-\-no-map-gw Don't remap TCP connections and untracked UDP traffic, with the gateway address as destination, to the host. Implied if there is no gateway on the selected @@ -321,6 +375,21 @@ default route, or if there is no default route, for any of the enabled address families. .TP +.BR \-\-map-guest-addr " " \fIaddr +Translate \fIaddr\fR in the guest to be equal to the guest's assigned +address on the host. That is, packets from the guest to \fIaddr\fR +will be redirected to the address assigned to the guest with \fB-a\fR, +or by default the host's global address. This allows the guest to +access services available on the host's global address, even though its +own address shadows that of the host. 
+ +If \fIaddr\fR is 'none', no address is mapped. Only one IPv4 and one +IPv6 address can be translated, and if the option is specified +multiple times, the last one for each address type takes effect. + +By default, mapping happens as described for the \-\-map-host-loopback option. + +.TP .BR \-4 ", " \-\-ipv4-only Enable IPv4-only operation. IPv6 traffic will be ignored. By default, IPv6 operation is enabled as long as at least an IPv6 route and an @@ -332,16 +401,45 @@ Enable IPv6-only operation. IPv4 traffic will be ignored. By default, IPv4 operation is enabled as long as at least an IPv4 route and an interface address are configured on a given host interface. +.TP +.BR \-H ", " \-\-hostname " " \fIname +Hostname to configure the client with. +Send \fIname\fR as DHCP option 12 (hostname). + +.TP +.BR \-\-fqdn " " \fIname +FQDN to configure the client with. +Send \fIname\fR as Client FQDN: DHCP option 81 and DHCPv6 option 39. + .SS \fBpasst\fR-only options .TP -.BR \-s ", " \-\-socket " " \fIpath +.BR \-s ", " \-\-socket-path ", " \-\-socket " " \fIpath Path for UNIX domain socket used by \fBqemu\fR(1) or \fBqrap\fR(1) to connect to \fBpasst\fR. Default is to probe a free socket, not accepting connections, starting from \fI/tmp/passt_1.socket\fR to \fI/tmp/passt_64.socket\fR. .TP +.BR \-\-vhost-user +Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR. + +.TP +.BR \-\-print-capabilities +Print back-end capabilities in JSON format, only meaningful for vhost-user mode. + +.TP +.BR \-\-repair-path " " \fIpath +Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect +to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during +migration. \fB--repair-path none\fR disables this interface (if you need to +specify a socket path called "none" you can prefix the path by \fI./\fR). 
+ +Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path +chosen for the hypervisor UNIX domain socket. No socket is created if not in +\-\-vhost-user mode. + +.TP .BR \-F ", " \-\-fd " " \fIFD Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened in the parent process and \fBpasst\fR inherits it when run as a child. This @@ -442,6 +540,7 @@ Default is \fBnone\fR. .BR \-I ", " \-\-ns-ifname " " \fIname Name of tap interface to be created in target namespace. By default, the same interface name as the external, routable interface is used. +If no such interface exists, the name \fItap0\fR will be used instead. .TP .BR \-t ", " \-\-tcp-ports " " \fIspec @@ -544,6 +643,13 @@ Configure UDP port forwarding from target namespace to init namespace. Default is \fBauto\fR. .TP +.BR \-\-host-lo-to-ns-lo +If specified, connections forwarded with \fB\-t\fR and \fB\-u\fR from +the host's loopback address will appear on the loopback address in the +guest as well. Without this option such forwarded packets will appear +to come from the guest's public address. + +.TP .BR \-\-userns " " \fIspec Target user namespace to join, as a path. If PID is given, without this option, the user namespace will be the one of the corresponding process. @@ -579,7 +685,7 @@ or sourced from the host, and bring up the tap interface. .BR \-\-no-copy-routes " " (DEPRECATED) With \-\-config-net, do not copy all the routes associated to the interface we derive addresses and routes from: set up only the default gateway. Implied by --g, \-\-gateway. +-g, \-\-gateway, for the corresponding IP version only. Default is to copy all the routing entries from the interface in the outer namespace to the target namespace, translating the output interface attribute to @@ -594,7 +700,7 @@ below. .BR \-\-no-copy-addrs " " (DEPRECATED) With \-\-config-net, do not copy all the addresses associated to the interface we derive addresses and routes from: set up a single one. 
Implied by \-a, -\-\-address. +\-\-address, for the corresponding IP version only. Default is to copy all the addresses, except for link-local ones, from the interface from the outer namespace to the target namespace. @@ -610,6 +716,11 @@ Configure MAC address \fIaddr\fR on the tap interface in the namespace. Default is to let the tap driver build a pseudorandom hardware address. +.TP +.BR \-\-no-splice +Disable the bypass path for inbound, local traffic. See the section \fBHandling +of local traffic in pasta\fR in the \fBNOTES\fR for more details. + .SH EXAMPLES .SS \fBpasta @@ -820,26 +931,31 @@ root@localhost's password: .SH NOTES -.SS Handling of traffic with local destination and source addresses - -Both \fBpasst\fR and \fBpasta\fR can bind on ports with a local address, -depending on the configuration. Local destination or source addresses need to be -changed before packets are delivered to the guest or target namespace: most -operating systems would drop packets received from non-loopback interfaces with -local addresses, and it would also be impossible for guest or target namespace -to route answers back. - -For convenience, and somewhat arbitrarily, the source address on these packets -is translated to the address of the default IPv4 or IPv6 gateway (if any) -- -this is known to be an existing, valid address on the same subnet. - -Loopback destination addresses are instead translated to the observed external -address of the guest or target namespace. For IPv6 packets, if usage of a -link-local address by guest or namespace has ever been observed, and the -original destination address is also a link-local address, the observed -link-local address is used. Otherwise, the observed global address is used. For -both IPv4 and IPv6, if no addresses have been seen yet, the configured addresses -will be used instead. 
+.SS Handling of traffic with loopback destination and source addresses + +Both \fBpasst\fR and \fBpasta\fR can bind on ports with a loopback +address (127.0.0.0/8 or ::1), depending on the configuration. Loopback +destination or source addresses need to be changed before packets are +delivered to the guest or target namespace: most operating systems +would drop packets received with loopback addresses on non-loopback +interfaces, and it would also be impossible for guest or target +namespace to route answers back. + +For convenience, the source address on these packets is translated to +the address specified by the \fB\-\-map-host-loopback\fR option (with +some exceptions in pasta mode, see next section below). If not +specified this defaults, somewhat arbitrarily, to the address of +the default IPv4 or IPv6 gateway (if any) -- this is known to be an +existing, valid address on the same subnet. If \fB\-\-no-map-gw\fR or +\fB\-\-map-host-loopback none\fR are specified this translation is +disabled and packets with loopback addresses are simply dropped. + +Loopback destination addresses are translated to the observed external +address of the guest or target namespace. For IPv6, the observed +link-local address is used if the translated source address is +link-local, otherwise the observed global address is used. For both +IPv4 and IPv6, if no addresses have been seen yet, the configured +addresses will be used instead. For example, if \fBpasst\fR or \fBpasta\fR receive a connection from 127.0.0.1, with destination 127.0.0.10, and the default IPv4 gateway is 192.0.2.1, while @@ -847,11 +963,15 @@ the last observed source address from guest or namespace is 192.0.2.2, this will be translated to a connection from 192.0.2.1 to 192.0.2.2. 
Similarly, for traffic coming from guest or namespace, packets with destination -address corresponding to the default gateway will have their destination address -translated to a loopback address, if and only if a packet, in the opposite -direction, with a loopback destination or source address, port-wise matching for -UDP, or connection-wise for TCP, has been recently forwarded to guest or -namespace. This behaviour can be disabled with \-\-no\-map\-gw. +address corresponding to the \fB\-\-map-host-loopback\fR address will have their +destination address translated to a loopback address. + +As an exception, traffic identified as DNS, originally directed to the +\fB\-\-map-host-loopback\fR address, if this address matches a resolver address +on the host, is \fBnot\fR translated to loopback, but rather handled in the same +way as if specified as \-\-dns-forward address, if no such option was given. +In the common case where the host gateway also acts as a resolver, this avoids that +the host mapping shadows the gateway/resolver itself. .SS Handling of local traffic in pasta @@ -867,8 +987,15 @@ and the new socket using the \fBsplice\fR(2) system call, and for UDP, a pair of \fBrecvmmsg\fR(2) and \fBsendmmsg\fR(2) system calls deals with packet transfers. -This bypass only applies to local connections and traffic, because it's not -possible to bind sockets to foreign addresses. +Because it's not possible to bind sockets to foreign addresses, this +bypass only applies to local connections and traffic. It also means +that the address translation differs slightly from passt mode. +Connections from loopback to loopback on the host will appear to come +from the target namespace's public address within the guest, unless +\fB\-\-host-lo-to-ns-lo\fR is specified, in which case they will +appear to come from loopback in the namespace as well. 
The latter +behaviour used to be the default, but is usually undesirable, since it +can unintentionally expose namespace local services to the host. .SS Binding to low numbered ports (well-known or system ports, up to 1023) @@ -953,6 +1080,20 @@ If the sending window cannot be queried, it will always be announced as the current sending buffer size to guest or target namespace. This might affect throughput of TCP connections. +.SS Local mode for disconnected setups + +If \fBpasst\fR and \fBpasta\fR fail to find a host interface with a configured +address, other than loopback addresses, they will, obviously, not attempt to +source addresses or routes from the host. + +In this case, unless configured otherwise, they will assign the IPv4 link-local +address 169.254.2.1 to the guest or target namespace, and no IPv6 address. The +notion of the guest or target namespace IPv6 address is derived from the first +link-local address observed. + +Default gateways will be assigned as the link-local address 169.254.2.2 for +IPv4, and as the link-local address fe80::1 for IPv6. + .SH LIMITATIONS Currently, IGMP/MLD proxying (RFC 4605) and support for SCTP (RFC 4960) are not @@ -977,8 +1118,8 @@ https://passt.top/passt/lists. Copyright (c) 2020-2022 Red Hat GmbH. \fBpasst\fR and \fBpasta\fR are free software: you can redistribute them and/or -modify them under the terms of the GNU Affero General Public License as -published by the Free Software Foundation, either version 3 of the License, or +modify them under the terms of the GNU General Public License as +published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. 
.SH SEE ALSO @@ -35,9 +35,7 @@ #include <syslog.h> #include <sys/prctl.h> #include <netinet/if_ether.h> -#ifdef HAS_GETRANDOM -#include <sys/random.h> -#endif +#include <libgen.h> #include "util.h" #include "passt.h" @@ -51,6 +49,10 @@ #include "arch.h" #include "log.h" #include "tcp_splice.h" +#include "ndp.h" +#include "vu_common.h" +#include "migrate.h" +#include "repair.h" #define EPOLL_EVENTS 8 @@ -65,13 +67,18 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TCP_SPLICE] = "connected spliced TCP socket", [EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket", [EPOLL_TYPE_TCP_TIMER] = "TCP timer", - [EPOLL_TYPE_UDP] = "UDP socket", + [EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket", + [EPOLL_TYPE_UDP] = "UDP flow socket", [EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket", [EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch", [EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch", [EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device", [EPOLL_TYPE_TAP_PASST] = "connected qemu socket", [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", + [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", + [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", + [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", + [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -83,7 +90,7 @@ static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, */ static void post_handler(struct ctx *c, const struct timespec *now) { -#define CALL_PROTO_HANDLER(c, now, lc, uc) \ +#define CALL_PROTO_HANDLER(lc, uc) \ do { \ extern void \ lc ## _defer_handler (struct ctx *c) \ @@ -102,47 +109,31 @@ static void post_handler(struct ctx *c, const struct timespec *now) } while (0) /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ - CALL_PROTO_HANDLER(c, now, tcp, TCP); + CALL_PROTO_HANDLER(tcp, TCP); /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ - 
CALL_PROTO_HANDLER(c, now, udp, UDP); + CALL_PROTO_HANDLER(udp, UDP); flow_defer_handler(c, now); #undef CALL_PROTO_HANDLER + + if (!c->no_ndp) + ndp_timer(c, now); } /** - * secret_init() - Create secret value for SipHash calculations + * random_init() - Initialise things based on random data * @c: Execution context */ -static void secret_init(struct ctx *c) +static void random_init(struct ctx *c) { -#ifndef HAS_GETRANDOM - int dev_random = open("/dev/random", O_RDONLY); - unsigned int random_read = 0; - - while (dev_random && random_read < sizeof(c->hash_secret)) { - int ret = read(dev_random, - (uint8_t *)&c->hash_secret + random_read, - sizeof(c->hash_secret) - random_read); + unsigned int seed; - if (ret == -1 && errno == EINTR) - continue; + /* Create secret value for SipHash calculations */ + raw_random(&c->hash_secret, sizeof(c->hash_secret)); - if (ret <= 0) - break; - - random_read += ret; - } - if (dev_random >= 0) - close(dev_random); - if (random_read < sizeof(c->hash_secret)) { -#else - if (getrandom(&c->hash_secret, sizeof(c->hash_secret), - GRND_RANDOM) < 0) { -#endif /* !HAS_GETRANDOM */ - perror("TCP initial sequence getrandom"); - exit(EXIT_FAILURE); - } + /* Seed pseudo-RNG for things that need non-cryptographic random */ + raw_random(&seed, sizeof(seed)); + srandom(seed); } /** @@ -175,11 +166,11 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) * * #syscalls exit_group */ -void exit_handler(int signal) +static void exit_handler(int signal) { (void)signal; - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); } /** @@ -190,28 +181,30 @@ void exit_handler(int signal) * Return: non-zero on failure * * #syscalls read write writev - * #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close - * #syscalls recvfrom sendto shutdown - * #syscalls armv6l:recv armv7l:recv ppc64le:recv - * #syscalls armv6l:send armv7l:send ppc64le:send - * #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait 
epoll_pwait - * #syscalls clock_gettime armv6l:clock_gettime64 armv7l:clock_gettime64 + * #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close + * #syscalls bind connect recvfrom sendto shutdown + * #syscalls arm:recv ppc64le:recv arm:send ppc64le:send + * #syscalls accept4 accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait + * #syscalls clock_gettime arm:clock_gettime64 i686:clock_gettime64 */ int main(int argc, char **argv) { - int nfds, i, devnull_fd = -1, pidfile_fd = -1; struct epoll_event events[EPOLL_EVENTS]; - char *log_name, argv0[PATH_MAX], *name; + int nfds, i, devnull_fd = -1; struct ctx c = { 0 }; struct rlimit limit; struct timespec now; struct sigaction sa; + if (clock_gettime(CLOCK_MONOTONIC, &log_start)) + die_perror("Failed to get CLOCK_MONOTONIC time"); + arch_avx2_exec(argv); - isolate_initial(); + isolate_initial(argc, argv); - c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1; + c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1; + c.device_state_fd = -1; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; @@ -219,51 +212,30 @@ int main(int argc, char **argv) sigaction(SIGTERM, &sa, NULL); sigaction(SIGQUIT, &sa, NULL); - if (argc < 1) - exit(EXIT_FAILURE); - - strncpy(argv0, argv[0], PATH_MAX - 1); - name = basename(argv0); - if (strstr(name, "pasta")) { - __openlog(log_name = "pasta", LOG_PERROR, LOG_DAEMON); + c.mode = conf_mode(argc, argv); + if (c.mode == MODE_PASTA) { sa.sa_handler = pasta_child_handler; - if (sigaction(SIGCHLD, &sa, NULL)) { - die("Couldn't install signal handlers: %s", - strerror(errno)); - } - - if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { - die("Couldn't set disposition for SIGPIPE: %s", - strerror(errno)); - } - - c.mode = MODE_PASTA; - } else if (strstr(name, "passt")) { - __openlog(log_name = "passt", LOG_PERROR, LOG_DAEMON); - - c.mode = MODE_PASST; - } else { - exit(EXIT_FAILURE); + if (sigaction(SIGCHLD, &sa, NULL)) + die_perror("Couldn't install signal handlers"); } - madvise(pkt_buf, 
TAP_BUF_BYTES, MADV_HUGEPAGE); + if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) + die_perror("Couldn't set disposition for SIGPIPE"); + + madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE); c.epollfd = epoll_create1(EPOLL_CLOEXEC); - if (c.epollfd == -1) { - perror("epoll_create1"); - exit(EXIT_FAILURE); - } + if (c.epollfd == -1) + die_perror("Failed to create epoll file descriptor"); + + if (getrlimit(RLIMIT_NOFILE, &limit)) + die_perror("Failed to get maximum value of open files limit"); - if (getrlimit(RLIMIT_NOFILE, &limit)) { - perror("getrlimit"); - exit(EXIT_FAILURE); - } c.nofile = limit.rlim_cur = limit.rlim_max; - if (setrlimit(RLIMIT_NOFILE, &limit)) { - perror("setrlimit"); - exit(EXIT_FAILURE); - } + if (setrlimit(RLIMIT_NOFILE, &limit)) + die_perror("Failed to set current limit for open files"); + sock_probe_mem(&c); conf(&c, argc, argv); @@ -271,18 +243,19 @@ int main(int argc, char **argv) pasta_netns_quit_init(&c); - tap_sock_init(&c); + tap_backend_init(&c); - secret_init(&c); + random_init(&c); - clock_gettime(CLOCK_MONOTONIC, &now); + if (clock_gettime(CLOCK_MONOTONIC, &now)) + die_perror("Failed to get CLOCK_MONOTONIC time"); flow_init(); if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c))) - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); - proto_update_l2_buf(c.mac_guest, c.mac); + proto_update_l2_buf(c.guest_mac, c.our_tap_mac); if (c.ifi4 && !c.no_dhcp) dhcp_init(); @@ -293,56 +266,46 @@ int main(int argc, char **argv) pcap_init(&c); if (!c.foreground) { - if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0) { - perror("/dev/null open"); - exit(EXIT_FAILURE); - } - } - - if (*c.pid_file) { - if ((pidfile_fd = open(c.pid_file, - O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC, - S_IRUSR | S_IWUSR)) < 0) { - perror("PID file open"); - exit(EXIT_FAILURE); - } + if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0) + die_perror("Failed to open /dev/null"); } if (isolate_prefork(&c)) die("Failed to sandbox process, exiting"); - if 
(!c.force_stderr && !isatty(fileno(stderr))) - __openlog(log_name, 0, LOG_DAEMON); - - if (!c.foreground) - __daemon(pidfile_fd, devnull_fd); - else - write_pidfile(pidfile_fd, getpid()); + if (!c.foreground) { + __daemon(c.pidfile_fd, devnull_fd); + log_stderr = false; + } else { + pidfile_write(c.pidfile_fd, getpid()); + } - if (pasta_child_pid) + if (pasta_child_pid) { kill(pasta_child_pid, SIGUSR1); + log_stderr = false; + } isolate_postfork(&c); timer_init(&c, &now); loop: - /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ + /* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */ /* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */ nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL); - if (nfds == -1 && errno != EINTR) { - perror("epoll_wait"); - exit(EXIT_FAILURE); - } + /* NOLINTEND(bugprone-branch-clone) */ + if (nfds == -1 && errno != EINTR) + die_perror("epoll_wait() failed in main loop"); - clock_gettime(CLOCK_MONOTONIC, &now); + if (clock_gettime(CLOCK_MONOTONIC, &now)) + err_perror("Failed to get CLOCK_MONOTONIC time"); for (i = 0; i < nfds; i++) { union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64); uint32_t eventmask = events[i].events; trace("%s: epoll event on %s %i (events: 0x%08x)", - c.mode == MODE_PASST ? "passt" : "pasta", + c.mode == MODE_PASTA ? 
"pasta" : "passt", EPOLL_TYPE_STR(ref.type), ref.fd, eventmask); switch (ref.type) { @@ -373,12 +336,27 @@ loop: case EPOLL_TYPE_TCP_TIMER: tcp_timer_handler(&c, ref); break; + case EPOLL_TYPE_UDP_LISTEN: + udp_listen_sock_handler(&c, ref, eventmask, &now); + break; case EPOLL_TYPE_UDP: udp_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); break; + case EPOLL_TYPE_VHOST_CMD: + vu_control_handler(c.vdev, c.fd_tap, eventmask); + break; + case EPOLL_TYPE_VHOST_KICK: + vu_kick_cb(c.vdev, ref, &now); + break; + case EPOLL_TYPE_REPAIR_LISTEN: + repair_listen_handler(&c, eventmask); + break; + case EPOLL_TYPE_REPAIR: + repair_handler(&c, eventmask); + break; default: /* Can't happen */ ASSERT(0); @@ -387,5 +365,7 @@ loop: post_handler(&c, &now); + migrate_handler(&c); + goto loop; } @@ -9,26 +9,6 @@ #define UNIX_SOCK_MAX 100 #define UNIX_SOCK_PATH "/tmp/passt_%i.socket" -/** - * struct tap_msg - Generic message descriptor for arrays of messages - * @pkt_buf_offset: Offset from @pkt_buf - * @len: Message length, with L2 headers - */ -struct tap_msg { - uint32_t pkt_buf_offset; - uint16_t len; -}; - -/** - * struct tap_l4_msg - Layer-4 message descriptor for protocol handlers - * @pkt_buf_offset: Offset of message from @pkt_buf - * @l4_len: Length of Layer-4 payload, host order - */ -struct tap_l4_msg { - uint32_t pkt_buf_offset; - uint16_t l4_len; -}; - union epoll_ref; #include <stdbool.h> @@ -37,43 +17,23 @@ union epoll_ref; #include "pif.h" #include "packet.h" +#include "siphash.h" +#include "ip.h" +#include "inany.h" +#include "migrate.h" #include "flow.h" #include "icmp.h" #include "fwd.h" #include "tcp.h" #include "udp.h" +#include "vhost_user.h" -/** - * enum epoll_type - Different types of fds we poll over +/* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0 + * (unicast) and bit 1 of byte 1 must be 1 (locally administered). Otherwise + * it's arbitrary. 
*/ -enum epoll_type { - /* Special value to indicate an invalid type */ - EPOLL_TYPE_NONE = 0, - /* Connected TCP sockets */ - EPOLL_TYPE_TCP, - /* Connected TCP sockets (spliced) */ - EPOLL_TYPE_TCP_SPLICE, - /* Listening TCP sockets */ - EPOLL_TYPE_TCP_LISTEN, - /* timerfds used for TCP timers */ - EPOLL_TYPE_TCP_TIMER, - /* UDP sockets */ - EPOLL_TYPE_UDP, - /* ICMP/ICMPv6 ping sockets */ - EPOLL_TYPE_PING, - /* inotify fd watching for end of netns (pasta) */ - EPOLL_TYPE_NSQUIT_INOTIFY, - /* timer fd watching for end of netns, fallback for inotify (pasta) */ - EPOLL_TYPE_NSQUIT_TIMER, - /* tuntap character device */ - EPOLL_TYPE_TAP_PASTA, - /* socket connected to qemu */ - EPOLL_TYPE_TAP_PASST, - /* socket listening for qemu socket connections */ - EPOLL_TYPE_TAP_LISTEN, - - EPOLL_NUM_TYPES, -}; +#define MAC_OUR_LAA \ + ((uint8_t [ETH_ALEN]){0x9a, 0x55, 0x9a, 0x55, 0x9a, 0x55}) /** * union epoll_ref - Breakdown of reference for epoll fd bookkeeping @@ -85,6 +45,7 @@ enum epoll_type { * @icmp: ICMP-specific reference part * @data: Data handled by protocol handlers * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone + * @queue: vhost-user queue index for this fd * @u64: Opaque reference for epoll_ctl() and epoll_wait() */ union epoll_ref { @@ -97,9 +58,10 @@ union epoll_ref { uint32_t flow; flow_sidx_t flowside; union tcp_listen_epoll_ref tcp_listen; - union udp_epoll_ref udp; + union udp_listen_epoll_ref udp; uint32_t data; int nsdir_fd; + int queue; }; }; uint64_t u64; @@ -107,13 +69,9 @@ union epoll_ref { static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), "epoll_ref must have same size as epoll_data"); -#define TAP_BUF_BYTES \ - ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE) -#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t)) -#define TAP_MSGS \ - DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) +/* Large enough for ~128 maximum size frames */ +#define 
PKT_BUF_BYTES (8UL << 20) -#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0) extern char pkt_buf [PKT_BUF_BYTES]; extern char *epoll_type_str[]; @@ -137,58 +95,89 @@ struct fqdn { enum passt_modes { MODE_PASST, MODE_PASTA, + MODE_VU, }; /** * struct ip4_ctx - IPv4 execution context - * @addr: IPv4 address for external, routable interface + * @addr: IPv4 address assigned to guest * @addr_seen: Latest IPv4 address seen as source from tap * @prefixlen: IPv4 prefix length (netmask) - * @gw: Default IPv4 gateway, network order - * @dns: DNS addresses for DHCP, zero-terminated, network order - * @dns_match: Forward DNS query if sent to this address, network order - * @dns_host: Use this DNS on the host for forwarding, network order + * @guest_gw: IPv4 gateway as seen by the guest + * @map_host_loopback: Outbound connections to this address are NATted to the + * host's 127.0.0.1 + * @map_guest_addr: Outbound connections to this address are NATted to the + * guest's assigned address + * @dns: DNS addresses for DHCP, zero-terminated + * @dns_match: Forward DNS query if sent to this address + * @our_tap_addr: IPv4 address for passt's use on tap + * @dns_host: Use this DNS on the host for forwarding * @addr_out: Optional source address for outbound traffic * @ifname_out: Optional interface name to bind outbound sockets to + * @no_copy_routes: Don't copy all routes when configuring target namespace + * @no_copy_addrs: Don't copy all addresses when configuring namespace */ struct ip4_ctx { + /* PIF_TAP addresses */ struct in_addr addr; struct in_addr addr_seen; int prefix_len; - struct in_addr gw; + struct in_addr guest_gw; + struct in_addr map_host_loopback; + struct in_addr map_guest_addr; struct in_addr dns[MAXNS + 1]; struct in_addr dns_match; - struct in_addr dns_host; + struct in_addr our_tap_addr; + /* PIF_HOST addresses */ + struct in_addr dns_host; struct in_addr addr_out; + char ifname_out[IFNAMSIZ]; + + bool no_copy_routes; + bool no_copy_addrs; }; /** * struct ip6_ctx - 
IPv6 execution context - * @addr: IPv6 address for external, routable interface - * @addr_ll: Link-local IPv6 address on external, routable interface + * @addr: IPv6 address assigned to guest * @addr_seen: Latest IPv6 global/site address seen as source from tap * @addr_ll_seen: Latest IPv6 link-local address seen as source from tap - * @gw: Default IPv6 gateway + * @guest_gw: IPv6 gateway as seen by the guest + * @map_host_loopback: Outbound connections to this address are NATted to the + * host's [::1] + * @map_guest_addr: Outbound connections to this address are NATted to the + * guest's assigned address * @dns: DNS addresses for DHCPv6 and NDP, zero-terminated * @dns_match: Forward DNS query if sent to this address + * @our_tap_ll: Link-local IPv6 address for passt's use on tap * @dns_host: Use this DNS on the host for forwarding * @addr_out: Optional source address for outbound traffic * @ifname_out: Optional interface name to bind outbound sockets to + * @no_copy_routes: Don't copy all routes when configuring target namespace + * @no_copy_addrs: Don't copy all addresses when configuring namespace */ struct ip6_ctx { + /* PIF_TAP addresses */ struct in6_addr addr; - struct in6_addr addr_ll; struct in6_addr addr_seen; struct in6_addr addr_ll_seen; - struct in6_addr gw; + struct in6_addr guest_gw; + struct in6_addr map_host_loopback; + struct in6_addr map_guest_addr; struct in6_addr dns[MAXNS + 1]; struct in6_addr dns_match; - struct in6_addr dns_host; + struct in6_addr our_tap_ll; + /* PIF_HOST addresses */ + struct in6_addr dns_host; struct in6_addr addr_out; + char ifname_out[IFNAMSIZ]; + + bool no_copy_routes; + bool no_copy_addrs; }; #include <netinet/if_ether.h> @@ -200,11 +189,12 @@ struct ip6_ctx { * @trace: Enable tracing (extra debug) mode * @quiet: Don't print informational messages * @foreground: Run in foreground, don't log to stderr by default - * @force_stderr: Force logging to stderr * @nofile: Maximum number of open files (ulimit -n) * 
@sock_path: Path for UNIX domain socket + * @repair_path: TCP_REPAIR helper path, can be "none", empty for default * @pcap: Path for packet capture file - * @pid_file: Path to PID file, empty string if not configured + * @pidfile: Path to PID file, empty string if not configured + * @pidfile_fd: File descriptor for PID file, -1 if none * @pasta_netns_fd: File descriptor for network namespace in pasta mode * @no_netns_quit: In pasta mode, don't exit if fs-bound namespace is gone * @netns_base: Base name for fs-bound namespace, if any, in pasta mode @@ -212,19 +202,21 @@ struct ip6_ctx { * @epollfd: File descriptor for epoll instance * @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any * @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket - * @mac: Host MAC address - * @mac_guest: MAC address of guest or namespace, seen or configured + * @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any + * @fd_repair: Connected AF_UNIX socket for TCP_REPAIR helper + * @our_tap_mac: Pasta/passt's MAC on the tap link + * @guest_mac: MAC address of guest or namespace, seen or configured * @hash_secret: 128-bit secret for siphash functions - * @ifi4: Index of template interface for IPv4, 0 if IPv4 disabled + * @ifi4: Template interface for IPv4, -1: none, 0: IPv4 disabled * @ip: IPv4 configuration * @dns_search: DNS search list - * @ifi6: Index of template interface for IPv6, 0 if IPv6 disabled + * @hostname: Guest hostname + * @fqdn: Guest FQDN + * @ifi6: Template interface for IPv6, -1: none, 0: IPv6 disabled * @ip6: IPv6 configuration * @pasta_ifn: Name of namespace interface for pasta * @pasta_ifi: Index of namespace interface for pasta * @pasta_conf_ns: Configure namespace after creating it - * @no_copy_routes: Don't copy all routes when configuring target namespace - * @no_copy_addrs: Don't copy all addresses when configuring namespace * @no_tcp: Disable TCP operation * @tcp: Context for TCP protocol handler * @no_tcp: Disable 
UDP operation @@ -240,9 +232,15 @@ struct ip6_ctx { * @no_dhcpv6: Disable DHCPv6 server * @no_ndp: Disable NDP handler altogether * @no_ra: Disable router advertisements - * @no_map_gw: Don't map connections, untracked UDP to gateway to host + * @no_splice: Disable socket splicing for inbound traffic + * @host_lo_to_ns_lo: Map host loopback addresses to ns loopback addresses + * @freebind: Allow binding of non-local addresses for forwarding * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max + * @vdev: vhost-user device + * @device_state_fd: Device state migration channel + * @device_state_result: Device state migration result + * @migrate_target: Are we the target, on the next migration request? */ struct ctx { enum passt_modes mode; @@ -250,11 +248,14 @@ struct ctx { int trace; int quiet; int foreground; - int force_stderr; int nofile; char sock_path[UNIX_PATH_MAX]; + char repair_path[UNIX_PATH_MAX]; char pcap[PATH_MAX]; - char pid_file[PATH_MAX]; + + char pidfile[PATH_MAX]; + int pidfile_fd; + int one_off; int pasta_netns_fd; @@ -266,23 +267,28 @@ struct ctx { int epollfd; int fd_tap_listen; int fd_tap; - unsigned char mac[ETH_ALEN]; - unsigned char mac_guest[ETH_ALEN]; + int fd_repair_listen; + int fd_repair; + unsigned char our_tap_mac[ETH_ALEN]; + unsigned char guest_mac[ETH_ALEN]; + uint16_t mtu; + uint64_t hash_secret[2]; - unsigned int ifi4; + int ifi4; struct ip4_ctx ip4; struct fqdn dns_search[MAXDNSRCH]; - unsigned int ifi6; + char hostname[PASST_MAXDNAME]; + char fqdn[PASST_MAXDNAME]; + + int ifi6; struct ip6_ctx ip6; char pasta_ifn[IF_NAMESIZE]; unsigned int pasta_ifi; int pasta_conf_ns; - int no_copy_routes; - int no_copy_addrs; int no_tcp; struct tcp_ctx tcp; @@ -291,7 +297,6 @@ struct ctx { int no_icmp; struct icmp_ctx icmp; - int mtu; int no_dns; int no_dns_search; int no_dhcp_dns; @@ -300,10 +305,19 @@ struct ctx { int no_dhcpv6; int no_ndp; int no_ra; - int no_map_gw; + int no_splice; + int host_lo_to_ns_lo; + int 
freebind; int low_wmem; int low_rmem; + + struct vu_dev *vdev; + + /* Migration */ + int device_state_fd; + int device_state_result; + bool migrate_target; }; void proto_update_l2_buf(const unsigned char *eth_d, @@ -12,8 +12,8 @@ * Author: Stefano Brivio <sbrivio@redhat.com> * * #syscalls:pasta clone waitid exit exit_group rt_sigprocmask - * #syscalls:pasta rt_sigreturn|sigreturn armv6l:sigreturn armv7l:sigreturn - * #syscalls:pasta ppc64:sigreturn s390x:sigreturn + * #syscalls:pasta rt_sigreturn|sigreturn + * #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn i686:sigreturn */ #include <sched.h> @@ -50,19 +50,20 @@ #include "netlink.h" #include "log.h" +#define HOSTNAME_PREFIX "pasta-" + /* PID of child, in case we created a namespace */ int pasta_child_pid; /** * pasta_child_handler() - Exit once shell exits (if we started it), reap clones - * @signal: Unused, handler deals with SIGCHLD only + * @signal: Signal number; this handler deals with SIGCHLD only */ void pasta_child_handler(int signal) { + int errno_save = errno; siginfo_t infop; - (void)signal; - if (signal != SIGCHLD) return; @@ -70,12 +71,12 @@ void pasta_child_handler(int signal) !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) { if (infop.si_pid == pasta_child_pid) { if (infop.si_code == CLD_EXITED) - exit(infop.si_status); + _exit(infop.si_status); /* If killed by a signal, si_status is the number. * Follow common shell convention of returning it + 128. 
*/ - exit(infop.si_status + 128); + _exit(infop.si_status + 128); /* Nothing to do, detached PID namespace going away */ } @@ -83,6 +84,8 @@ void pasta_child_handler(int signal) waitid(P_ALL, 0, NULL, WEXITED | WNOHANG); waitid(P_ALL, 0, NULL, WEXITED | WNOHANG); + + errno = errno_save; } /** @@ -97,7 +100,9 @@ static int pasta_wait_for_ns(void *arg) int flags = O_RDONLY | O_CLOEXEC; char ns[PATH_MAX]; - snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid); + if (snprintf_check(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid)) + die_perror("Can't build netns path"); + do { while ((c->pasta_netns_fd = open(ns, flags)) < 0) { if (errno != ENOENT) @@ -138,17 +143,15 @@ void pasta_open_ns(struct ctx *c, const char *netns) int nfd = -1; nfd = open(netns, O_RDONLY | O_CLOEXEC); - if (nfd < 0) { - die("Couldn't open network namespace %s: %s", - netns, strerror(errno)); - } + if (nfd < 0) + die_perror("Couldn't open network namespace %s", netns); c->pasta_netns_fd = nfd; NS_CALL(ns_check, c); if (c->pasta_netns_fd < 0) - die("Couldn't switch to pasta namespaces: %s", strerror(errno)); + die_perror("Couldn't switch to pasta namespaces"); if (!c->no_netns_quit) { char buf[PATH_MAX] = { 0 }; @@ -164,10 +167,12 @@ void pasta_open_ns(struct ctx *c, const char *netns) * struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd() * @exe: Executable to run * @argv: Command and arguments to run + * @ctx: Context to read config from */ struct pasta_spawn_cmd_arg { const char *exe; char *const *argv; + struct ctx *c; }; /** @@ -176,28 +181,43 @@ struct pasta_spawn_cmd_arg { * * Return: this function never returns */ +/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ static int pasta_spawn_cmd(void *arg) { + char hostname[HOST_NAME_MAX + 1] = HOSTNAME_PREFIX; const struct pasta_spawn_cmd_arg *a; + size_t conf_hostname_len; sigset_t set; /* We run in a detached PID and mount namespace: mount /proc over */ if (mount("", "/proc", "proc", 0, NULL)) - 
warn("Couldn't mount /proc: %s", strerror(errno)); + warn_perror("Couldn't mount /proc"); if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) warn("Cannot set ping_group_range, ICMP requests might fail"); + a = (const struct pasta_spawn_cmd_arg *)arg; + + conf_hostname_len = strlen(a->c->hostname); + if (conf_hostname_len > 0) { + if (sethostname(a->c->hostname, conf_hostname_len)) + warn("Unable to set configured hostname"); + } else if (!gethostname(hostname + sizeof(HOSTNAME_PREFIX) - 1, + HOST_NAME_MAX + 1 - sizeof(HOSTNAME_PREFIX)) || + errno == ENAMETOOLONG) { + hostname[HOST_NAME_MAX] = '\0'; + if (sethostname(hostname, strlen(hostname))) + warn("Unable to set pasta-prefixed hostname"); + } + /* Wait for the parent to be ready: see main() */ sigemptyset(&set); sigaddset(&set, SIGUSR1); sigwaitinfo(&set, NULL); - a = (const struct pasta_spawn_cmd_arg *)arg; execvp(a->exe, a->argv); - perror("execvp"); - exit(EXIT_FAILURE); + die_perror("Failed to start command or shell"); } /** @@ -211,12 +231,14 @@ static int pasta_spawn_cmd(void *arg) void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, int argc, char *argv[]) { + char ns_fn_stack[NS_FN_STACK_SIZE] + __attribute__ ((aligned(__alignof__(max_align_t)))); struct pasta_spawn_cmd_arg arg = { .exe = argv[0], .argv = argv, + .c = c, }; char uidmap[BUFSIZ], gidmap[BUFSIZ]; - char ns_fn_stack[NS_FN_STACK_SIZE]; char *sh_argv[] = { NULL, NULL }; char sh_arg0[PATH_MAX + 1]; sigset_t set; @@ -226,8 +248,11 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, c->quiet = 1; /* Configure user and group mappings */ - snprintf(uidmap, BUFSIZ, "0 %u 1", uid); - snprintf(gidmap, BUFSIZ, "0 %u 1", gid); + if (snprintf_check(uidmap, BUFSIZ, "0 %u 1", uid)) + die_perror("Can't build uidmap"); + + if (snprintf_check(gidmap, BUFSIZ, "0 %u 1", gid)) + die_perror("Can't build gidmap"); if (write_file("/proc/self/uid_map", uidmap) || write_file("/proc/self/setgroups", "deny") || @@ -259,14 +284,12 @@ void 
pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD, (void *)&arg); - if (pasta_child_pid == -1) { - perror("clone"); - exit(EXIT_FAILURE); - } + if (pasta_child_pid == -1) + die_perror("Failed to clone process with detached namespaces"); NS_CALL(pasta_wait_for_ns, c); if (c->pasta_netns_fd < 0) - die("Failed to join network namespace: %s", strerror(errno)); + die_perror("Failed to join network namespace"); } /** @@ -277,25 +300,33 @@ void pasta_ns_conf(struct ctx *c) { int rc = 0; - rc = nl_link_up(nl_sock_ns, 1 /* lo */, 0); + rc = nl_link_set_flags(nl_sock_ns, 1 /* lo */, IFF_UP, IFF_UP); if (rc < 0) die("Couldn't bring up loopback interface in namespace: %s", - strerror(-rc)); + strerror_(-rc)); /* Get or set MAC in target namespace */ - if (MAC_IS_ZERO(c->mac_guest)) - nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest); + if (MAC_IS_ZERO(c->guest_mac)) + nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac); else - rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest); + rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->guest_mac); if (rc < 0) die("Couldn't set MAC address in namespace: %s", - strerror(-rc)); + strerror_(-rc)); if (c->pasta_conf_ns) { - nl_link_up(nl_sock_ns, c->pasta_ifi, c->mtu); + unsigned int flags = IFF_UP; + + if (c->mtu) + nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu); + + if (c->ifi6) /* Avoid duplicate address detection on link up */ + flags |= IFF_NOARP; + + nl_link_set_flags(nl_sock_ns, c->pasta_ifi, flags, flags); if (c->ifi4) { - if (c->no_copy_addrs) { + if (c->ip4.no_copy_addrs) { rc = nl_addr_set(nl_sock_ns, c->pasta_ifi, AF_INET, &c->ip4.addr, @@ -308,12 +339,13 @@ void pasta_ns_conf(struct ctx *c) if (rc < 0) { die("Couldn't set IPv4 address(es) in namespace: %s", - strerror(-rc)); + strerror_(-rc)); } - if (c->no_copy_routes) { + if (c->ip4.no_copy_routes) { rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi, - AF_INET, &c->ip4.gw); + AF_INET, + &c->ip4.guest_gw); } else { 
rc = nl_route_dup(nl_sock, c->ifi4, nl_sock_ns, c->pasta_ifi, AF_INET); @@ -321,14 +353,34 @@ void pasta_ns_conf(struct ctx *c) if (rc < 0) { die("Couldn't set IPv4 route(s) in guest: %s", - strerror(-rc)); + strerror_(-rc)); } } if (c->ifi6) { - if (c->no_copy_addrs) { - rc = nl_addr_set(nl_sock_ns, c->pasta_ifi, - AF_INET6, &c->ip6.addr, 64); + rc = nl_addr_get_ll(nl_sock_ns, c->pasta_ifi, + &c->ip6.addr_ll_seen); + if (rc < 0) { + warn("Can't get LL address from namespace: %s", + strerror_(-rc)); + } + + rc = nl_addr_set_ll_nodad(nl_sock_ns, c->pasta_ifi); + if (rc < 0) { + warn("Can't set nodad for LL in namespace: %s", + strerror_(-rc)); + } + + /* We dodged DAD: re-enable neighbour solicitations */ + nl_link_set_flags(nl_sock_ns, c->pasta_ifi, + 0, IFF_NOARP); + + if (c->ip6.no_copy_addrs) { + if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) { + rc = nl_addr_set(nl_sock_ns, + c->pasta_ifi, AF_INET6, + &c->ip6.addr, 64); + } } else { rc = nl_addr_dup(nl_sock, c->ifi6, nl_sock_ns, c->pasta_ifi, @@ -337,12 +389,13 @@ void pasta_ns_conf(struct ctx *c) if (rc < 0) { die("Couldn't set IPv6 address(es) in namespace: %s", - strerror(-rc)); + strerror_(-rc)); } - if (c->no_copy_routes) { + if (c->ip6.no_copy_routes) { rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi, - AF_INET6, &c->ip6.gw); + AF_INET6, + &c->ip6.guest_gw); } else { rc = nl_route_dup(nl_sock, c->ifi6, nl_sock_ns, c->pasta_ifi, @@ -351,12 +404,12 @@ void pasta_ns_conf(struct ctx *c) if (rc < 0) { die("Couldn't set IPv6 route(s) in guest: %s", - strerror(-rc)); + strerror_(-rc)); } } } - proto_update_l2_buf(c->mac_guest, NULL); + proto_update_l2_buf(c->guest_mac, NULL); } /** @@ -370,12 +423,12 @@ static int pasta_netns_quit_timer(void) struct itimerspec it = { { 1, 0 }, { 1, 0 } }; /* one-second interval */ if (fd == -1) { - err("timerfd_create(): %s", strerror(errno)); + err_perror("Failed to create timerfd for quit timer"); return -errno; } if (timerfd_settime(fd, 0, &it, NULL) < 0) { - 
err("timerfd_settime(): %s", strerror(errno)); + err_perror("Failed to set interval for quit timer"); close(fd); return -errno; } @@ -389,29 +442,29 @@ static int pasta_netns_quit_timer(void) */ void pasta_netns_quit_init(const struct ctx *c) { - union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY }; struct epoll_event ev = { .events = EPOLLIN }; int flags = O_NONBLOCK | O_CLOEXEC; struct statfs s = { 0 }; bool try_inotify = true; int fd = -1, dir_fd; + union epoll_ref ref; if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base) return; if ((dir_fd = open(c->netns_dir, O_CLOEXEC | O_RDONLY)) < 0) - die("netns dir open: %s, exiting", strerror(errno)); + die("netns dir open: %s, exiting", strerror_(errno)); if (fstatfs(dir_fd, &s) || s.f_type == DEVPTS_SUPER_MAGIC || s.f_type == PROC_SUPER_MAGIC || s.f_type == SYSFS_MAGIC) try_inotify = false; if (try_inotify && (fd = inotify_init1(flags)) < 0) - warn("inotify_init1(): %s, use a timer", strerror(errno)); + warn("inotify_init1(): %s, use a timer", strerror_(errno)); if (fd >= 0 && inotify_add_watch(fd, c->netns_dir, IN_DELETE) < 0) { warn("inotify_add_watch(): %s, use a timer", - strerror(errno)); + strerror_(errno)); close(fd); fd = -1; } @@ -425,6 +478,7 @@ void pasta_netns_quit_init(const struct ctx *c) ref.type = EPOLL_TYPE_NSQUIT_TIMER; } else { close(dir_fd); + ref.type = EPOLL_TYPE_NSQUIT_INOTIFY; } if (fd > FD_REF_MAX) @@ -442,17 +496,23 @@ void pasta_netns_quit_init(const struct ctx *c) */ void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd) { - char buf[sizeof(struct inotify_event) + NAME_MAX + 1]; - const struct inotify_event *in_ev = (struct inotify_event *)buf; + char buf[sizeof(struct inotify_event) + NAME_MAX + 1] + __attribute__ ((aligned(__alignof__(struct inotify_event)))); + const struct inotify_event *ev; + ssize_t n; + char *p; - if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev)) + if ((n = read(inotify_fd, buf, sizeof(buf))) < (ssize_t)sizeof(*ev)) 
return; - if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base))) - return; + for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) { + ev = (const struct inotify_event *)p; - info("Namespace %s is gone, exiting", c->netns_base); - exit(EXIT_SUCCESS); + if (!strncmp(ev->name, c->netns_base, sizeof(c->netns_base))) { + info("Namespace %s is gone, exiting", c->netns_base); + _exit(EXIT_SUCCESS); + } + } } /** @@ -468,7 +528,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref) n = read(ref.fd, &expirations, sizeof(expirations)); if (n < 0) - die("Namespace watch timer read() error: %s", strerror(errno)); + die_perror("Namespace watch timer read() error"); if ((size_t)n < sizeof(expirations)) warn("Namespace watch timer: short read(): %zi", n); @@ -478,7 +538,7 @@ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref) return; info("Namespace %s is gone, exiting", c->netns_base); - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); } close(fd); @@ -33,33 +33,12 @@ #include "log.h" #include "pcap.h" #include "iov.h" +#include "tap.h" #define PCAP_VERSION_MINOR 4 static int pcap_fd = -1; -/* See pcap.h from libpcap, or pcap-savefile(5) */ -static const struct { - uint32_t magic; -#define PCAP_MAGIC 0xa1b2c3d4 - - uint16_t major; -#define PCAP_VERSION_MAJOR 2 - - uint16_t minor; -#define PCAP_VERSION_MINOR 4 - - int32_t thiszone; - uint32_t sigfigs; - uint32_t snaplen; - - uint32_t linktype; -#define PCAP_LINKTYPE_ETHERNET 1 -} pcap_hdr = { - PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU, - PCAP_LINKTYPE_ETHERNET -}; - struct pcap_pkthdr { uint32_t tv_sec; uint32_t tv_usec; @@ -73,42 +52,39 @@ struct pcap_pkthdr { * @iovcnt: Number of buffers (@iov entries) in frame * @offset: Byte offset of the L2 headers within @iov * @now: Timestamp - * - * Returns: 0 on success, -errno on error writing to the file */ static void pcap_frame(const struct iovec *iov, size_t iovcnt, size_t offset, const struct timespec *now) 
{ - size_t len = iov_size(iov, iovcnt) - offset; + size_t l2len = iov_size(iov, iovcnt) - offset; struct pcap_pkthdr h = { .tv_sec = now->tv_sec, .tv_usec = DIV_ROUND_CLOSEST(now->tv_nsec, 1000), - .caplen = len, - .len = len + .caplen = l2len, + .len = l2len }; - struct iovec hiov = { &h, sizeof(h) }; - if (write_remainder(pcap_fd, &hiov, 1, 0) < 0 || - write_remainder(pcap_fd, iov, iovcnt, offset) < 0) { - debug("Cannot log packet, length %zu: %s", - len, strerror(errno)); - } + if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 || + write_remainder(pcap_fd, iov, iovcnt, offset) < 0) + debug_perror("Cannot log packet, length %zu", l2len); } /** * pcap() - Capture a single frame to pcap file * @pkt: Pointer to data buffer, including L2 headers - * @len: L2 packet length + * @l2len: L2 frame length */ -void pcap(const char *pkt, size_t len) +void pcap(const char *pkt, size_t l2len) { - struct iovec iov = { (char *)pkt, len }; - struct timespec now; + struct iovec iov = { (char *)pkt, l2len }; + struct timespec now = { 0 }; if (pcap_fd == -1) return; - clock_gettime(CLOCK_REALTIME, &now); + if (clock_gettime(CLOCK_REALTIME, &now)) + err_perror("Failed to get CLOCK_REALTIME time"); + pcap_frame(&iov, 1, 0, &now); } @@ -122,36 +98,38 @@ void pcap(const char *pkt, size_t len) void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, size_t offset) { - struct timespec now; + struct timespec now = { 0 }; unsigned int i; if (pcap_fd == -1) return; - clock_gettime(CLOCK_REALTIME, &now); + if (clock_gettime(CLOCK_REALTIME, &now)) + err_perror("Failed to get CLOCK_REALTIME time"); for (i = 0; i < n; i++) pcap_frame(iov + i * frame_parts, frame_parts, offset, &now); } -/* - * pcap_iov - Write packet data described by an I/O vector +/** + * pcap_iov() - Write packet data described by an I/O vector * to a pcap file descriptor. 
- * * @iov: Pointer to the array of struct iovec describing the I/O vector * containing packet data to write, including L2 header * @iovcnt: Number of buffers (@iov entries) + * @offset: Offset of the L2 frame within the full data length */ -/* cppcheck-suppress unusedFunction */ -void pcap_iov(const struct iovec *iov, size_t iovcnt) +void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) { - struct timespec now; + struct timespec now = { 0 }; if (pcap_fd == -1) return; - clock_gettime(CLOCK_REALTIME, &now); - pcap_frame(iov, iovcnt, 0, &now); + if (clock_gettime(CLOCK_REALTIME, &now)) + err_perror("Failed to get CLOCK_REALTIME time"); + + pcap_frame(iov, iovcnt, offset, &now); } /** @@ -160,7 +138,28 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt) */ void pcap_init(struct ctx *c) { - int flags = O_WRONLY | O_CREAT | O_TRUNC; + /* See pcap.h from libpcap, or pcap-savefile(5) */ +#define PCAP_MAGIC 0xa1b2c3d4 +#define PCAP_VERSION_MAJOR 2 +#define PCAP_VERSION_MINOR 4 +#define PCAP_LINKTYPE_ETHERNET 1 + const struct { + uint32_t magic; + uint16_t major; + uint16_t minor; + + int32_t thiszone; + uint32_t sigfigs; + uint32_t snaplen; + + uint32_t linktype; + } pcap_hdr = { + .magic = PCAP_MAGIC, + .major = PCAP_VERSION_MAJOR, + .minor = PCAP_VERSION_MINOR, + .snaplen = tap_l2_max_len(c), + .linktype = PCAP_LINKTYPE_ETHERNET + }; if (pcap_fd != -1) return; @@ -168,15 +167,14 @@ void pcap_init(struct ctx *c) if (!*c->pcap) return; - flags |= c->foreground ? 
O_CLOEXEC : 0; - pcap_fd = open(c->pcap, flags, S_IRUSR | S_IWUSR); + pcap_fd = output_file_open(c->pcap, O_WRONLY); if (pcap_fd == -1) { - perror("open"); + err_perror("Couldn't open pcap file %s", c->pcap); return; } info("Saving packet capture to %s", c->pcap); if (write(pcap_fd, &pcap_hdr, sizeof(pcap_hdr)) < 0) - warn("Cannot write PCAP header: %s", strerror(errno)); + warn_perror("Cannot write PCAP header"); } @@ -6,10 +6,10 @@ #ifndef PCAP_H #define PCAP_H -void pcap(const char *pkt, size_t len); +void pcap(const char *pkt, size_t l2len); void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, size_t offset); -void pcap_iov(const struct iovec *iov, size_t iovcnt); +void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset); void pcap_init(struct ctx *c); #endif /* PCAP_H */ @@ -7,9 +7,14 @@ #include <stdint.h> #include <assert.h> +#include <netinet/in.h> #include "util.h" #include "pif.h" +#include "siphash.h" +#include "ip.h" +#include "inany.h" +#include "passt.h" const char *pif_type_str[] = { [PIF_NONE] = "<none>", @@ -19,3 +24,80 @@ const char *pif_type_str[] = { }; static_assert(ARRAY_SIZE(pif_type_str) == PIF_NUM_TYPES, "pif_type_str[] doesn't match enum pif_type"); + + +/** pif_sockaddr() - Construct a socket address suitable for an interface + * @c: Execution context + * @sa: Pointer to sockaddr to fill in + * @sl: Updated to relevant length of initialised @sa + * @pif: Interface to create the socket address + * @addr: IPv[46] address + * @port: Port (host byte order) + */ +void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl, + uint8_t pif, const union inany_addr *addr, in_port_t port) +{ + const struct in_addr *v4 = inany_v4(addr); + + ASSERT(pif_is_socket(pif)); + + if (v4) { + sa->sa_family = AF_INET; + sa->sa4.sin_addr = *v4; + sa->sa4.sin_port = htons(port); + memset(&sa->sa4.sin_zero, 0, sizeof(sa->sa4.sin_zero)); + *sl = sizeof(sa->sa4); + } else { + sa->sa_family = AF_INET6; + 
sa->sa6.sin6_addr = addr->a6; + sa->sa6.sin6_port = htons(port); + if (pif == PIF_HOST && IN6_IS_ADDR_LINKLOCAL(&addr->a6)) + sa->sa6.sin6_scope_id = c->ifi6; + else + sa->sa6.sin6_scope_id = 0; + sa->sa6.sin6_flowinfo = 0; + *sl = sizeof(sa->sa6); + } +} + +/** pif_sock_l4() - Open a socket bound to an address on a specified interface + * @c: Execution context + * @type: Socket epoll type + * @pif: Interface for this socket + * @addr: Address to bind to, or NULL for dual-stack any + * @ifname: Interface for binding, NULL for any + * @port: Port number to bind to (host byte order) + * @data: epoll reference portion for protocol handlers + * + * NOTE: For namespace pifs, this must be called having already entered the + * relevant namespace. + * + * Return: newly created socket, negative error code on failure + */ +int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, + const union inany_addr *addr, const char *ifname, + in_port_t port, uint32_t data) +{ + union sockaddr_inany sa = { + .sa6.sin6_family = AF_INET6, + .sa6.sin6_addr = in6addr_any, + .sa6.sin6_port = htons(port), + }; + socklen_t sl; + + ASSERT(pif_is_socket(pif)); + + if (pif == PIF_SPLICE) { + /* Sanity checks */ + ASSERT(!ifname); + ASSERT(addr && inany_is_loopback(addr)); + } + + if (!addr) + return sock_l4_sa(c, type, &sa, sizeof(sa.sa6), + ifname, false, data); + + pif_sockaddr(c, &sa, &sl, pif, addr, port); + return sock_l4_sa(c, type, &sa, sl, + ifname, sa.sa_family == AF_INET6, data); +} @@ -7,6 +7,9 @@ #ifndef PIF_H #define PIF_H +union inany_addr; +union sockaddr_inany; + /** * enum pif_type - Type of passt/pasta interface ("pif") * @@ -38,10 +41,26 @@ static inline const char *pif_type(enum pif_type pt) return "?"; } -/* cppcheck-suppress unusedFunction */ static inline const char *pif_name(uint8_t pif) { return pif_type(pif); } +/** + * pif_is_socket() - Is interface implemented via L4 sockets? 
+ * @pif: pif to check + * + * Return: true of @pif is an L4 socket based interface, otherwise false + */ +static inline bool pif_is_socket(uint8_t pif) +{ + return pif == PIF_HOST || pif == PIF_SPLICE; +} + +void pif_sockaddr(const struct ctx *c, union sockaddr_inany *sa, socklen_t *sl, + uint8_t pif, const union inany_addr *addr, in_port_t port); +int pif_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, + const union inany_addr *addr, const char *ifname, + in_port_t port, uint32_t data); + #endif /* PIF_H */ @@ -66,8 +66,8 @@ issues to Stefano Brivio <sbrivio@redhat.com>. Copyright (c) 2020-2021 Red Hat GmbH. \fBqrap\fR is free software: you can redistribute is and/or modify it under the -terms of the GNU Affero General Public License as published by the Free Software -Foundation, either version 3 of the License, or (at your option) any later +terms of the GNU General Public License as published by the Free Software +Foundation, either version 2 of the License, or (at your option) any later version. 
.SH SEE ALSO diff --git a/repair.c b/repair.c new file mode 100644 index 0000000..f6b1bf3 --- /dev/null +++ b/repair.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <errno.h> +#include <sys/socket.h> +#include <sys/uio.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "inany.h" +#include "flow.h" +#include "flow_table.h" + +#include "repair.h" + +#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */ + +/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */ +#define REPAIR_ACCEPT_TIMEOUT_MS 10 +#define REPAIR_ACCEPT_TIMEOUT_US (REPAIR_ACCEPT_TIMEOUT_MS * 1000) + +/* Pending file descriptors for next repair_flush() call, or command change */ +static int repair_fds[SCM_MAX_FD]; + +/* Pending command: flush pending file descriptors if it changes */ +static int8_t repair_cmd; + +/* Number of pending file descriptors set in @repair_fds */ +static int repair_nfds; + +/** + * repair_sock_init() - Start listening for connections on helper socket + * @c: Execution context + */ +void repair_sock_init(const struct ctx *c) +{ + union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN }; + struct epoll_event ev = { 0 }; + + if (c->fd_repair_listen == -1) + return; + + if (listen(c->fd_repair_listen, 0)) { + err_perror("listen() on repair helper socket, won't migrate"); + return; + } + + ref.fd = c->fd_repair_listen; + ev.events = EPOLLIN | EPOLLHUP | EPOLLET; + ev.data.u64 = ref.u64; + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev)) + err_perror("repair helper socket epoll_ctl(), won't migrate"); +} + +/** + * 
repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket + * @c: Execution context + * @events: epoll events + * + * Return: 0 on valid event with new connected socket, error code on failure + */ +int repair_listen_handler(struct ctx *c, uint32_t events) +{ + union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR }; + struct epoll_event ev = { 0 }; + struct ucred ucred; + socklen_t len; + int rc; + + if (events != EPOLLIN) { + debug("Spurious event 0x%04x on TCP_REPAIR helper socket", + events); + return EINVAL; + } + + len = sizeof(ucred); + + /* Another client is already connected: accept and close right away. */ + if (c->fd_repair != -1) { + int discard = accept4(c->fd_repair_listen, NULL, NULL, + SOCK_NONBLOCK); + + if (discard == -1) + return errno; + + if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len)) + info("Discarding TCP_REPAIR helper, PID %i", ucred.pid); + + close(discard); + return EEXIST; + } + + if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) { + rc = errno; + debug_perror("accept4() on TCP_REPAIR helper listening socket"); + return rc; + } + + if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len)) + info("Accepted TCP_REPAIR helper, PID %i", ucred.pid); + + ref.fd = c->fd_repair; + ev.events = EPOLLHUP | EPOLLET; + ev.data.u64 = ref.u64; + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) { + rc = errno; + debug_perror("epoll_ctl() on TCP_REPAIR helper socket"); + close(c->fd_repair); + c->fd_repair = -1; + return rc; + } + + return 0; +} + +/** + * repair_close() - Close connection to TCP_REPAIR helper + * @c: Execution context + */ +void repair_close(struct ctx *c) +{ + debug("Closing TCP_REPAIR helper socket"); + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_repair, NULL); + close(c->fd_repair); + c->fd_repair = -1; +} + +/** + * repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket + * @c: Execution context + * @events: epoll events + */ +void 
repair_handler(struct ctx *c, uint32_t events) +{ + (void)events; + + repair_close(c); +} + +/** + * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect + * @c: Execution context + * + * Return: 0 on success or if already connected, error code on failure + */ +int repair_wait(struct ctx *c) +{ + struct timeval tv = { .tv_sec = 0, + .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) }; + int rc; + + static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000, + ".tv_usec is greater than 1000 * 1000"); + + if (c->fd_repair >= 0) + return 0; + + if (c->fd_repair_listen == -1) + return ENOENT; + + if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO, + &tv, sizeof(tv))) { + rc = errno; + err_perror("Set timeout on TCP_REPAIR listening socket"); + return rc; + } + + rc = repair_listen_handler(c, EPOLLIN); + + tv.tv_usec = 0; + if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO, + &tv, sizeof(tv))) + err_perror("Clear timeout on TCP_REPAIR listening socket"); + + return rc; +} + +/** + * repair_flush() - Flush current set of sockets to helper, with current command + * @c: Execution context + * + * Return: 0 on success, negative error code on failure + */ +int repair_flush(struct ctx *c) +{ + char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)] + __attribute__ ((aligned(__alignof__(struct cmsghdr)))) = { 0 }; + struct iovec iov = { &repair_cmd, sizeof(repair_cmd) }; + struct cmsghdr *cmsg; + struct msghdr msg; + int8_t reply; + + if (!repair_nfds) + return 0; + + msg = (struct msghdr){ .msg_name = NULL, .msg_namelen = 0, + .msg_iov = &iov, .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = CMSG_SPACE(sizeof(int) * + repair_nfds), + .msg_flags = 0 }; + cmsg = CMSG_FIRSTHDR(&msg); + + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds); + memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds); + + repair_nfds = 0; + + if (sendmsg(c->fd_repair, &msg, 0) < 0) { + int ret = -errno; + 
err_perror("Failed to send sockets to TCP_REPAIR helper"); + repair_close(c); + return ret; + } + + if (recv(c->fd_repair, &reply, sizeof(reply), 0) < 0) { + int ret = -errno; + err_perror("Failed to receive reply from TCP_REPAIR helper"); + repair_close(c); + return ret; + } + + if (reply != repair_cmd) { + err("Unexpected reply from TCP_REPAIR helper: %d", reply); + repair_close(c); + return -ENXIO; + } + + return 0; +} + +/** + * repair_set() - Add socket to TCP_REPAIR set with given command + * @c: Execution context + * @s: Socket to add + * @cmd: TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP + * + * Return: 0 on success, negative error code on failure + */ +int repair_set(struct ctx *c, int s, int cmd) +{ + int rc; + + if (repair_nfds && repair_cmd != cmd) { + if ((rc = repair_flush(c))) + return rc; + } + + repair_cmd = cmd; + repair_fds[repair_nfds++] = s; + + if (repair_nfds >= SCM_MAX_FD) { + if ((rc = repair_flush(c))) + return rc; + } + + return 0; +} diff --git a/repair.h b/repair.h new file mode 100644 index 0000000..ab27e67 --- /dev/null +++ b/repair.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef REPAIR_H +#define REPAIR_H + +void repair_sock_init(const struct ctx *c); +int repair_listen_handler(struct ctx *c, uint32_t events); +void repair_handler(struct ctx *c, uint32_t events); +void repair_close(struct ctx *c); +int repair_wait(struct ctx *c); +int repair_flush(struct ctx *c); +int repair_set(struct ctx *c, int s, int cmd); + +#endif /* REPAIR_H */ @@ -14,12 +14,23 @@ # Author: Stefano Brivio <sbrivio@redhat.com> TMP="$(mktemp)" -IN="$@" OUT="$(mktemp)" +OUT_FINAL="${1}" +shift +IN="$@" [ -z "${ARCH}" ] && ARCH="$(uname -m)" [ -z "${CC}" ] && CC="cc" +AUDIT_ARCH="AUDIT_ARCH_$(echo ${ARCH} | tr [a-z] [A-Z] \ + | sed 's/^ARM.*/ARM/' \ + | sed 's/I[456]86/I386/' \ + | sed 's/PPC64/PPC/' \ + | sed 's/PPCLE/PPC64LE/' \ + | sed 
's/MIPS64EL/MIPSEL64/' \ + | sed 's/HPPA/PARISC/' \ + | sed 's/SH4/SH/')" + HEADER="/* This file was automatically generated by $(basename ${0}) */ #ifndef AUDIT_ARCH_PPC64LE @@ -29,11 +40,11 @@ HEADER="/* This file was automatically generated by $(basename ${0}) */ # Prefix for each profile: check that 'arch' in seccomp_data is matching PRE=' struct sock_filter filter_@PROFILE@[] = { - /* cppcheck-suppress badBitmaskCheck */ + /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))), - BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, PASST_AUDIT_ARCH, 0, @KILL@), - /* cppcheck-suppress badBitmaskCheck */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @AUDIT_ARCH@, 0, @KILL@), + /* cppcheck-suppress [badBitmaskCheck, unmatchedSuppression] */ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))), @@ -233,7 +244,8 @@ gen_profile() { sub ${__i} CALL "NR:${__nr}" "NAME:${__name}" "ALLOW:${__allow}" done - finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" + finish PRE "PROFILE:${__profile}" "KILL:$(( __statements + 1))" \ + "AUDIT_ARCH:${AUDIT_ARCH}" } printf '%s\n' "${HEADER}" > "${OUT}" @@ -242,7 +254,10 @@ for __p in ${__profiles}; do __calls="$(sed -n 's/[\t ]*\*[\t ]*#syscalls\(:'"${__p}"'\|\)[\t ]\{1,\}\(.*\)/\2/p' ${IN})" __calls="${__calls} ${EXTRA_SYSCALLS:-}" __calls="$(filter ${__calls})" - echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t + + cols="$(stty -a 2>/dev/null | sed -n 's/.*columns \([0-9]*\).*/\1/p' || :)" 2>/dev/null + case $cols in [0-9]*) col_args="-w ${cols}";; *) col_args="";; esac + echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t ${col_args} # Pad here to keep gen_profile() "simple" __count=0 @@ -255,4 +270,4 @@ for __p in ${__profiles}; do gen_profile "${__p}" ${__calls} done -mv "${OUT}" seccomp.h +mv "${OUT}" "${OUT_FINAL}" @@ -115,10 +115,4 @@ static inline uint64_t siphash_final(struct siphash_state 
*state, return state->v[0] ^ state->v[1] ^ state->v[2] ^ state->v[3]; } -uint64_t siphash_8b(const uint8_t *in, const uint64_t *k); -uint64_t siphash_12b(const uint8_t *in, const uint64_t *k); -uint64_t siphash_20b(const uint8_t *in, const uint64_t *k); -uint64_t siphash_32b(const uint8_t *in, const uint64_t *k); -uint64_t siphash_36b(const uint8_t *in, const uint64_t *k); - #endif /* SIPHASH_H */ @@ -56,50 +56,100 @@ #include "netlink.h" #include "pasta.h" #include "packet.h" +#include "repair.h" #include "tap.h" #include "log.h" +#include "vhost_user.h" +#include "vu_common.h" + +/* Maximum allowed frame lengths (including L2 header) */ + +/* Verify that an L2 frame length limit is large enough to contain the header, + * but small enough to fit in the packet pool + */ +#define CHECK_FRAME_LEN(len) \ + static_assert((len) >= ETH_HLEN && (len) <= PACKET_MAX_LEN, \ + #len " has bad value") + +CHECK_FRAME_LEN(L2_MAX_LEN_PASTA); +CHECK_FRAME_LEN(L2_MAX_LEN_PASST); +CHECK_FRAME_LEN(L2_MAX_LEN_VU); + +/* We try size the packet pools so that we can use a single batch for the entire + * packet buffer. This might be exceeded for vhost-user, though, which uses its + * own buffers rather than pkt_buf. + * + * This is just a tuning parameter, the code will work with slightly more + * overhead if it's incorrect. So, we estimate based on the minimum practical + * frame size - an empty UDP datagram - rather than the minimum theoretical + * frame size. 
+ * + * FIXME: Profile to work out how big this actually needs to be to amortise + * per-batch syscall overheads + */ +#define TAP_MSGS_IP4 \ + DIV_ROUND_UP(sizeof(pkt_buf), \ + ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr)) +#define TAP_MSGS_IP6 \ + DIV_ROUND_UP(sizeof(pkt_buf), \ + ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct udphdr)) /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ -static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); -static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); +static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS_IP4, pkt_buf); +static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ /** - * tap_send_single() - Send a single frame + * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode * @c: Execution context - * @data: Packet buffer - * @len: Total L2 packet length */ -void tap_send_single(const struct ctx *c, const void *data, size_t len) +unsigned long tap_l2_max_len(const struct ctx *c) { - uint32_t vnet_len = htonl(len); - struct iovec iov[2]; - size_t iovcnt = 0; - - if (c->mode == MODE_PASST) { - iov[iovcnt].iov_base = &vnet_len; - iov[iovcnt].iov_len = sizeof(vnet_len); - iovcnt++; + /* NOLINTBEGIN(bugprone-branch-clone): values can be the same */ + switch (c->mode) { + case MODE_PASST: + return L2_MAX_LEN_PASST; + case MODE_PASTA: + return L2_MAX_LEN_PASTA; + case MODE_VU: + return L2_MAX_LEN_VU; } + /* NOLINTEND(bugprone-branch-clone) */ + ASSERT(0); - iov[iovcnt].iov_base = (void *)data; - iov[iovcnt].iov_len = len; - iovcnt++; - - tap_send_frames(c, iov, iovcnt, 1); + return 0; /* Unreachable, for cppcheck's sake */ } /** - * tap_ip4_daddr() - Normal IPv4 destination address for inbound packets + * tap_send_single() - Send a single frame * @c: Execution context - * - * Return: IPv4 address, network order + * @data: Packet 
buffer + * @l2len: Total L2 packet length */ -struct in_addr tap_ip4_daddr(const struct ctx *c) +void tap_send_single(const struct ctx *c, const void *data, size_t l2len) { - return c->ip4.addr_seen; + uint32_t vnet_len = htonl(l2len); + struct iovec iov[2]; + size_t iovcnt = 0; + + switch (c->mode) { + case MODE_PASST: + iov[iovcnt] = IOV_OF_LVALUE(vnet_len); + iovcnt++; + /* fall through */ + case MODE_PASTA: + iov[iovcnt].iov_base = (void *)data; + iov[iovcnt].iov_len = l2len; + iovcnt++; + + tap_send_frames(c, iov, iovcnt, 1); + break; + case MODE_VU: + vu_send_single(c, data, l2len); + break; + } } /** @@ -125,13 +175,13 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c, * * Return: pointer at which to write the packet's payload */ -static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) +void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) { struct ethhdr *eh = (struct ethhdr *)buf; /* TODO: ARP table lookup */ - memcpy(eh->h_dest, c->mac_guest, ETH_ALEN); - memcpy(eh->h_source, c->mac, ETH_ALEN); + memcpy(eh->h_dest, c->guest_mac, ETH_ALEN); + memcpy(eh->h_source, c->our_tap_mac, ETH_ALEN); eh->h_proto = ntohs(proto); return eh + 1; } @@ -139,57 +189,84 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) /** * tap_push_ip4h() - Build IPv4 header for inbound packet, with checksum * @c: Execution context - * @src: IPv4 source address, network order - * @dst: IPv4 destination address, network order - * @len: L4 payload length + * @src: IPv4 source address + * @dst: IPv4 destination address + * @l4len: IPv4 payload length * @proto: L4 protocol number * * Return: pointer at which to write the packet's payload */ -static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, - struct in_addr dst, size_t len, uint8_t proto) +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t l4len, uint8_t proto) { + uint16_t l3len = l4len + sizeof(*ip4h); + ip4h->version = 
4; ip4h->ihl = sizeof(struct iphdr) / 4; ip4h->tos = 0; - ip4h->tot_len = htons(len + sizeof(*ip4h)); + ip4h->tot_len = htons(l3len); ip4h->id = 0; - ip4h->frag_off = 0; + ip4h->frag_off = htons(IP_DF); ip4h->ttl = 255; ip4h->protocol = proto; ip4h->saddr = src.s_addr; ip4h->daddr = dst.s_addr; - ip4h->check = csum_ip4_header(ip4h->tot_len, proto, src, dst); - return ip4h + 1; + ip4h->check = csum_ip4_header(l3len, proto, src, dst); + return (char *)ip4h + sizeof(*ip4h); } /** - * tap_udp4_send() - Send UDP over IPv4 packet + * tap_push_uh4() - Build UDPv4 header with checksum * @c: Execution context * @src: IPv4 source address * @sport: UDP source port * @dst: IPv4 destination address * @dport: UDP destination port * @in: UDP payload contents (not including UDP header) - * @len: UDP payload length (not including UDP header) + * @dlen: UDP payload length (not including UDP header) + * + * Return: pointer at which to write the packet's payload */ -void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, +void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, - const void *in, size_t len) + const void *in, size_t dlen) { - size_t udplen = len + sizeof(struct udphdr); - char buf[USHRT_MAX]; - struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); - struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, udplen, IPPROTO_UDP); - char *data = (char *)(uh + 1); + size_t l4len = dlen + sizeof(struct udphdr); + const struct iovec iov = { + .iov_base = (void *)in, + .iov_len = dlen + }; + struct iov_tail payload = IOV_TAIL(&iov, 1, 0); uh->source = htons(sport); uh->dest = htons(dport); - uh->len = htons(udplen); - csum_udp4(uh, src, dst, in, len); - memcpy(data, in, len); + uh->len = htons(l4len); + csum_udp4(uh, src, dst, &payload); + return (char *)uh + sizeof(*uh); +} - tap_send_single(c, buf, len + (data - buf)); +/** + * tap_udp4_send() - Send UDP over IPv4 packet + * @c: Execution context + * @src: 
IPv4 source address + * @sport: UDP source port + * @dst: IPv4 destination address + * @dport: UDP destination port + * @in: UDP payload contents (not including UDP header) + * @dlen: UDP payload length (not including UDP header) + */ +void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, + struct in_addr dst, in_port_t dport, + const void *in, size_t dlen) +{ + size_t l4len = dlen + sizeof(struct udphdr); + char buf[USHRT_MAX]; + struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); + struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP); + char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen); + + memcpy(data, in, dlen); + tap_send_single(c, buf, dlen + (data - buf)); } /** @@ -198,20 +275,20 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, * @src: IPv4 source address * @dst: IPv4 destination address * @in: ICMP packet, including ICMP header - * @len: ICMP packet length, including ICMP header + * @l4len: ICMP packet length, including ICMP header */ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, - const void *in, size_t len) + const void *in, size_t l4len) { char buf[USHRT_MAX]; struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst, - len, IPPROTO_ICMP); + l4len, IPPROTO_ICMP); - memcpy(icmp4h, in, len); - csum_icmp4(icmp4h, icmp4h + 1, len - sizeof(*icmp4h)); + memcpy(icmp4h, in, l4len); + csum_icmp4(icmp4h, icmp4h + 1, l4len - sizeof(*icmp4h)); - tap_send_single(c, buf, len + ((char *)icmp4h - buf)); + tap_send_single(c, buf, l4len + ((char *)icmp4h - buf)); } /** @@ -219,32 +296,29 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, * @c: Execution context * @src: IPv6 source address * @dst: IPv6 destination address - * @len: L4 payload length + * @l4len: L4 payload length * @proto: L4 protocol number * @flow: IPv6 flow identifier * * Return: pointer at which to 
write the packet's payload */ -static void *tap_push_ip6h(struct ipv6hdr *ip6h, - const struct in6_addr *src, - const struct in6_addr *dst, - size_t len, uint8_t proto, uint32_t flow) +void *tap_push_ip6h(struct ipv6hdr *ip6h, + const struct in6_addr *src, const struct in6_addr *dst, + size_t l4len, uint8_t proto, uint32_t flow) { - ip6h->payload_len = htons(len); + ip6h->payload_len = htons(l4len); ip6h->priority = 0; ip6h->version = 6; ip6h->nexthdr = proto; ip6h->hop_limit = 255; ip6h->saddr = *src; ip6h->daddr = *dst; - ip6h->flow_lbl[0] = (flow >> 16) & 0xf; - ip6h->flow_lbl[1] = (flow >> 8) & 0xff; - ip6h->flow_lbl[2] = (flow >> 0) & 0xff; - return ip6h + 1; + ip6_set_flow_lbl(ip6h, flow); + return (char *)ip6h + sizeof(*ip6h); } /** - * tap_udp6_send() - Send UDP over IPv6 packet + * tap_push_uh6() - Build UDPv6 header with checksum * @c: Execution context * @src: IPv6 source address * @sport: UDP source port @@ -252,27 +326,54 @@ static void *tap_push_ip6h(struct ipv6hdr *ip6h, * @dport: UDP destination port * @flow: Flow label * @in: UDP payload contents (not including UDP header) - * @len: UDP payload length (not including UDP header) + * @dlen: UDP payload length (not including UDP header) + * + * Return: pointer at which to write the packet's payload + */ +void *tap_push_uh6(struct udphdr *uh, + const struct in6_addr *src, in_port_t sport, + const struct in6_addr *dst, in_port_t dport, + void *in, size_t dlen) +{ + size_t l4len = dlen + sizeof(struct udphdr); + const struct iovec iov = { + .iov_base = in, + .iov_len = dlen + }; + struct iov_tail payload = IOV_TAIL(&iov, 1, 0); + + uh->source = htons(sport); + uh->dest = htons(dport); + uh->len = htons(l4len); + csum_udp6(uh, src, dst, &payload); + return (char *)uh + sizeof(*uh); +} + +/** + * tap_udp6_send() - Send UDP over IPv6 packet + * @c: Execution context + * @src: IPv6 source address + * @sport: UDP source port + * @dst: IPv6 destination address + * @dport: UDP destination port + * @flow: Flow 
label + * @in: UDP payload contents (not including UDP header) + * @dlen: UDP payload length (not including UDP header) */ void tap_udp6_send(const struct ctx *c, const struct in6_addr *src, in_port_t sport, const struct in6_addr *dst, in_port_t dport, - uint32_t flow, const void *in, size_t len) + uint32_t flow, void *in, size_t dlen) { - size_t udplen = len + sizeof(struct udphdr); + size_t l4len = dlen + sizeof(struct udphdr); char buf[USHRT_MAX]; struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); struct udphdr *uh = tap_push_ip6h(ip6h, src, dst, - udplen, IPPROTO_UDP, flow); - char *data = (char *)(uh + 1); - - uh->source = htons(sport); - uh->dest = htons(dport); - uh->len = htons(udplen); - csum_udp6(uh, src, dst, in, len); - memcpy(data, in, len); + l4len, IPPROTO_UDP, flow); + char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen); - tap_send_single(c, buf, len + (data - buf)); + memcpy(data, in, dlen); + tap_send_single(c, buf, dlen + (data - buf)); } /** @@ -281,21 +382,21 @@ void tap_udp6_send(const struct ctx *c, * @src: IPv6 source address * @dst: IPv6 destination address * @in: ICMP packet, including ICMP header - * @len: ICMP packet length, including ICMP header + * @l4len: ICMP packet length, including ICMP header */ void tap_icmp6_send(const struct ctx *c, const struct in6_addr *src, const struct in6_addr *dst, - const void *in, size_t len) + const void *in, size_t l4len) { char buf[USHRT_MAX]; struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); - struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, len, + struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, l4len, IPPROTO_ICMPV6, 0); - memcpy(icmp6h, in, len); - csum_icmp6(icmp6h, src, dst, icmp6h + 1, len - sizeof(*icmp6h)); + memcpy(icmp6h, in, l4len); + csum_icmp6(icmp6h, src, dst, icmp6h + 1, l4len - sizeof(*icmp6h)); - tap_send_single(c, buf, len + ((char *)icmp6h - buf)); + tap_send_single(c, buf, l4len + ((char *)icmp6h - buf)); } /** @@ -324,7 +425,7 @@ static 
size_t tap_send_frames_pasta(const struct ctx *c, size_t framelen = iov_size(iov + i, bufs_per_frame); if (rc < 0) { - debug("tap write: %s", strerror(errno)); + debug_perror("tap write"); switch (errno) { case EAGAIN: @@ -334,6 +435,7 @@ static size_t tap_send_frames_pasta(const struct ctx *c, case EINTR: case ENOBUFS: case ENOSPC: + case EIO: /* interface down? */ break; default: die("Write error on tap device, exiting"); @@ -386,7 +488,7 @@ static size_t tap_send_frames_passt(const struct ctx *c, size_t rembufs = bufs_per_frame - (i % bufs_per_frame); if (write_remainder(c->fd_tap, &iov[i], rembufs, buf_offset) < 0) { - err("tap: partial frame send: %s", strerror(errno)); + err_perror("tap: partial frame send"); return i; } i += rembufs; @@ -415,10 +517,18 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, if (!nframes) return 0; - if (c->mode == MODE_PASST) - m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); - else + switch (c->mode) { + case MODE_PASTA: m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes); + break; + case MODE_PASST: + m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); + break; + case MODE_VU: + /* fall through */ + default: + ASSERT(0); + } if (m < nframes) debug("tap: failed to send %zu frames of %zu", @@ -451,6 +561,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4 * @msgs: Count of messages in sequence * @protocol: Protocol number + * @ttl: Time to live * @source: Source port * @dest: Destination port * @saddr: Source address @@ -459,6 +570,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); */ static struct tap4_l4_t { uint8_t protocol; + uint8_t ttl; uint16_t source; uint16_t dest; @@ -473,14 +585,17 @@ static struct tap4_l4_t { * struct l4_seq6_t - Message sequence for one protocol handler call, IPv6 * @msgs: Count of messages in sequence * @protocol: Protocol number + * @flow_lbl: IPv6 flow label * @source: Source 
port * @dest: Destination port * @saddr: Source address * @daddr: Destination address + * @hop_limit: Hop limit * @msg: Array of messages that can be handled in a single call */ static struct tap6_l4_t { uint8_t protocol; + uint32_t flow_lbl :20; uint16_t source; uint16_t dest; @@ -488,6 +603,8 @@ static struct tap6_l4_t { struct in6_addr saddr; struct in6_addr daddr; + uint8_t hop_limit; + struct pool_l4_t p; } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */]; @@ -589,21 +706,21 @@ static int tap4_handler(struct ctx *c, const struct pool *in, i = 0; resume: for (seq_count = 0, seq = NULL; i < in->count; i++) { - size_t l2_len, l3_len, hlen, l4_len; + size_t l2len, l3len, hlen, l4len; const struct ethhdr *eh; const struct udphdr *uh; struct iphdr *iph; const char *l4h; - packet_get(in, i, 0, 0, &l2_len); + packet_get(in, i, 0, 0, &l2len); - eh = packet_get(in, i, 0, sizeof(*eh), &l3_len); + eh = packet_get(in, i, 0, sizeof(*eh), &l3len); if (!eh) continue; if (ntohs(eh->h_proto) == ETH_P_ARP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); - packet_add(pkt, l2_len, (char *)eh); + packet_add(pkt, l2len, (char *)eh); arp(c, pkt); continue; } @@ -613,15 +730,15 @@ resume: continue; hlen = iph->ihl * 4UL; - if (hlen < sizeof(*iph) || htons(iph->tot_len) > l3_len || - hlen > l3_len) + if (hlen < sizeof(*iph) || htons(iph->tot_len) > l3len || + hlen > l3len) continue; /* We don't handle IP fragments, drop them */ if (tap4_is_fragment(iph, now)) continue; - l4_len = htons(iph->tot_len) - hlen; + l4len = htons(iph->tot_len) - hlen; if (IN4_IS_ADDR_LOOPBACK(&iph->saddr) || IN4_IS_ADDR_LOOPBACK(&iph->daddr)) { @@ -636,19 +753,19 @@ resume: if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr) c->ip4.addr_seen.s_addr = iph->saddr; - l4h = packet_get(in, i, sizeof(*eh) + hlen, l4_len, NULL); + l4h = packet_get(in, i, sizeof(*eh) + hlen, l4len, NULL); if (!l4h) continue; if (iph->protocol == 
IPPROTO_ICMP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); if (c->no_icmp) continue; tap_packet_debug(iph, NULL, NULL, 0, NULL, 1); - packet_add(pkt, l4_len, l4h); + packet_add(pkt, l4len, l4h); icmp_tap_handler(c, PIF_TAP, AF_INET, &iph->saddr, &iph->daddr, pkt, now); @@ -660,9 +777,9 @@ resume: continue; if (iph->protocol == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); - packet_add(pkt, l2_len, (char *)eh); + packet_add(pkt, l2len, (char *)eh); if (dhcp(c, pkt)) continue; } @@ -673,18 +790,20 @@ resume: continue; } -#define L4_MATCH(iph, uh, seq) \ - (seq->protocol == iph->protocol && \ - seq->source == uh->source && seq->dest == uh->dest && \ - seq->saddr.s_addr == iph->saddr && seq->daddr.s_addr == iph->daddr) +#define L4_MATCH(iph, uh, seq) \ + ((seq)->protocol == (iph)->protocol && \ + (seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \ + (seq)->saddr.s_addr == (iph)->saddr && \ + (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl) #define L4_SET(iph, uh, seq) \ do { \ - seq->protocol = iph->protocol; \ - seq->source = uh->source; \ - seq->dest = uh->dest; \ - seq->saddr.s_addr = iph->saddr; \ - seq->daddr.s_addr = iph->daddr; \ + (seq)->protocol = (iph)->protocol; \ + (seq)->source = (uh)->source; \ + (seq)->dest = (uh)->dest; \ + (seq)->saddr.s_addr = (iph)->saddr; \ + (seq)->daddr.s_addr = (iph)->daddr; \ + (seq)->ttl = (iph)->ttl; \ } while (0) if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV) @@ -711,7 +830,7 @@ resume: #undef L4_SET append: - packet_add((struct pool *)&seq->p, l4_len, l4h); + packet_add((struct pool *)&seq->p, l4len, l4h); } for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) { @@ -726,14 +845,14 @@ append: for (k = 0; k < p->count; ) k += tcp_tap_handler(c, PIF_TAP, AF_INET, &seq->saddr, &seq->daddr, - p, k, now); + 0, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { 
if (c->no_udp) continue; for (k = 0; k < p->count; ) k += udp_tap_handler(c, PIF_TAP, AF_INET, &seq->saddr, &seq->daddr, - p, k, now); + seq->ttl, p, k, now); } } @@ -763,7 +882,7 @@ static int tap6_handler(struct ctx *c, const struct pool *in, i = 0; resume: for (seq_count = 0, seq = NULL; i < in->count; i++) { - size_t l4_len, plen, check; + size_t l4len, plen, check; struct in6_addr *saddr, *daddr; const struct ethhdr *eh; const struct udphdr *uh; @@ -786,7 +905,7 @@ resume: if (plen != check) continue; - if (!(l4h = ipv6_l4hdr(in, i, sizeof(*eh), &proto, &l4_len))) + if (!(l4h = ipv6_l4hdr(in, i, sizeof(*eh), &proto, &l4len))) continue; if (IN6_IS_ADDR_LOOPBACK(saddr) || IN6_IS_ADDR_LOOPBACK(daddr)) { @@ -804,38 +923,42 @@ resume: if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen)) { c->ip6.addr_seen = *saddr; } + + if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) + c->ip6.addr = *saddr; } else if (!IN6_IS_ADDR_UNSPECIFIED(saddr)){ c->ip6.addr_seen = *saddr; } if (proto == IPPROTO_ICMPV6) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); if (c->no_icmp) continue; - if (l4_len < sizeof(struct icmp6hdr)) + if (l4len < sizeof(struct icmp6hdr)) continue; - if (ndp(c, (struct icmp6hdr *)l4h, saddr)) + packet_add(pkt, l4len, l4h); + + if (ndp(c, (struct icmp6hdr *)l4h, saddr, pkt)) continue; tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); - packet_add(pkt, l4_len, l4h); icmp_tap_handler(c, PIF_TAP, AF_INET6, saddr, daddr, pkt, now); continue; } - if (l4_len < sizeof(*uh)) + if (l4len < sizeof(*uh)) continue; uh = (struct udphdr *)l4h; if (proto == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); - packet_add(pkt, l4_len, l4h); + packet_add(pkt, l4len, l4h); if (dhcpv6(c, pkt, saddr, daddr)) continue; @@ -847,18 +970,23 @@ resume: } #define L4_MATCH(ip6h, proto, uh, seq) \ - (seq->protocol == proto && \ - seq->source == uh->source && seq->dest == uh->dest && \ 
- IN6_ARE_ADDR_EQUAL(&seq->saddr, saddr) && \ - IN6_ARE_ADDR_EQUAL(&seq->daddr, daddr)) + ((seq)->protocol == (proto) && \ + (seq)->source == (uh)->source && \ + (seq)->dest == (uh)->dest && \ + (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \ + IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \ + IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \ + (seq)->hop_limit == (ip6h)->hop_limit) #define L4_SET(ip6h, proto, uh, seq) \ do { \ - seq->protocol = proto; \ - seq->source = uh->source; \ - seq->dest = uh->dest; \ - seq->saddr = *saddr; \ - seq->daddr = *daddr; \ + (seq)->protocol = (proto); \ + (seq)->source = (uh)->source; \ + (seq)->dest = (uh)->dest; \ + (seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \ + (seq)->saddr = *saddr; \ + (seq)->daddr = *daddr; \ + (seq)->hop_limit = (ip6h)->hop_limit; \ } while (0) if (seq && L4_MATCH(ip6h, proto, uh, seq) && @@ -886,7 +1014,7 @@ resume: #undef L4_SET append: - packet_add((struct pool *)&seq->p, l4_len, l4h); + packet_add((struct pool *)&seq->p, l4len, l4h); } for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) { @@ -902,14 +1030,14 @@ append: for (k = 0; k < p->count; ) k += tcp_tap_handler(c, PIF_TAP, AF_INET6, &seq->saddr, &seq->daddr, - p, k, now); + seq->flow_lbl, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; for (k = 0; k < p->count; ) k += udp_tap_handler(c, PIF_TAP, AF_INET6, &seq->saddr, &seq->daddr, - p, k, now); + seq->hop_limit, p, k, now); } } @@ -920,248 +1048,294 @@ append: } /** - * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket + * tap_flush_pools() - Flush both IPv4 and IPv6 packet pools + */ +void tap_flush_pools(void) +{ + pool_flush(pool_tap4); + pool_flush(pool_tap6); +} + +/** + * tap_handler() - IPv4/IPv6 and ARP packet handler for tap file descriptor + * @c: Execution context + * @now: Current timestamp + */ +void tap_handler(struct ctx *c, const struct timespec *now) +{ + tap4_handler(c, pool_tap4, now); + tap6_handler(c, pool_tap6, now); +} + +/** + 
* tap_add_packet() - Queue/capture packet, update notion of guest MAC address * @c: Execution context + * @l2len: Total L2 packet length + * @p: Packet buffer + * @now: Current timestamp */ -static void tap_sock_reset(struct ctx *c) +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, + const struct timespec *now) { - if (c->one_off) { - info("Client closed connection, exiting"); - exit(EXIT_SUCCESS); + const struct ethhdr *eh; + + pcap(p, l2len); + + eh = (struct ethhdr *)p; + + if (memcmp(c->guest_mac, eh->h_source, ETH_ALEN)) { + memcpy(c->guest_mac, eh->h_source, ETH_ALEN); + proto_update_l2_buf(c->guest_mac, NULL); + } + + switch (ntohs(eh->h_proto)) { + case ETH_P_ARP: + case ETH_P_IP: + if (pool_full(pool_tap4)) { + tap4_handler(c, pool_tap4, now); + pool_flush(pool_tap4); + } + packet_add(pool_tap4, l2len, p); + break; + case ETH_P_IPV6: + if (pool_full(pool_tap6)) { + tap6_handler(c, pool_tap6, now); + pool_flush(pool_tap6); + } + packet_add(pool_tap6, l2len, p); + break; + default: + break; } +} + +/** + * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket + * @c: Execution context + */ +void tap_sock_reset(struct ctx *c) +{ + info("Client connection closed%s", c->one_off ? 
", exiting" : ""); + + if (c->one_off) + _exit(EXIT_SUCCESS); /* Close the connected socket, wait for a new connection */ - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); + epoll_del(c, c->fd_tap); close(c->fd_tap); c->fd_tap = -1; + if (c->mode == MODE_VU) + vu_cleanup(c->vdev); } /** - * tap_handler_passt() - Packet handler for AF_UNIX file descriptor + * tap_passt_input() - Handler for new data on the socket to qemu * @c: Execution context - * @events: epoll events * @now: Current timestamp */ -void tap_handler_passt(struct ctx *c, uint32_t events, - const struct timespec *now) +static void tap_passt_input(struct ctx *c, const struct timespec *now) { - const struct ethhdr *eh; - ssize_t n, rem; + static const char *partial_frame; + static ssize_t partial_len = 0; + ssize_t n; char *p; - if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { - tap_sock_reset(c); - return; - } + tap_flush_pools(); -redo: - p = pkt_buf; - rem = 0; + if (partial_len) { + /* We have a partial frame from an earlier pass. Move it to the + * start of the buffer, top up with new data, then process all + * of it. + */ + memmove(pkt_buf, partial_frame, partial_len); + } - pool_flush(pool_tap4); - pool_flush(pool_tap6); + do { + n = recv(c->fd_tap, pkt_buf + partial_len, + sizeof(pkt_buf) - partial_len, MSG_DONTWAIT); + } while ((n < 0) && errno == EINTR); - n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT); if (n < 0) { - if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) + if (errno != EAGAIN && errno != EWOULDBLOCK) { + err_perror("Receive error on guest connection, reset"); tap_sock_reset(c); + } return; } - while (n > (ssize_t)sizeof(uint32_t)) { - ssize_t len = ntohl(*(uint32_t *)p); + p = pkt_buf; + n += partial_len; - p += sizeof(uint32_t); - n -= sizeof(uint32_t); + while (n >= (ssize_t)sizeof(uint32_t)) { + uint32_t l2len = ntohl_unaligned(p); - /* At most one packet might not fit in a single read, and this - * needs to be blocking. 
- */ - if (len > n) { - rem = recv(c->fd_tap, p + n, len - n, 0); - if ((n += rem) != len) - return; + if (l2len < sizeof(struct ethhdr) || l2len > L2_MAX_LEN_PASST) { + err("Bad frame size from guest, resetting connection"); + tap_sock_reset(c); + return; } - /* Complete the partial read above before discarding a malformed - * frame, otherwise the stream will be inconsistent. - */ - if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU) - goto next; - - pcap(p, len); - - eh = (struct ethhdr *)p; + if (l2len + sizeof(uint32_t) > (size_t)n) + /* Leave this incomplete frame for later */ + break; - if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL); - } + p += sizeof(uint32_t); + n -= sizeof(uint32_t); - switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - case ETH_P_IP: - packet_add(pool_tap4, len, p); - break; - case ETH_P_IPV6: - packet_add(pool_tap6, len, p); - break; - default: - break; - } + tap_add_packet(c, l2len, p, now); -next: - p += len; - n -= len; + p += l2len; + n -= l2len; } - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + partial_len = n; + partial_frame = p; - /* We can't use EPOLLET otherwise. 
*/ - if (rem) - goto redo; + tap_handler(c, now); } /** - * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor + * tap_handler_passt() - Event handler for AF_UNIX file descriptor * @c: Execution context * @events: epoll events * @now: Current timestamp */ -void tap_handler_pasta(struct ctx *c, uint32_t events, +void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now) { - ssize_t n, len; - int ret; + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { + tap_sock_reset(c); + return; + } - if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) - die("Disconnect event on /dev/net/tun device, exiting"); + if (events & EPOLLIN) + tap_passt_input(c, now); +} -redo: - n = 0; +/** + * tap_pasta_input() - Handler for new data on the socket to hypervisor + * @c: Execution context + * @now: Current timestamp + */ +static void tap_pasta_input(struct ctx *c, const struct timespec *now) +{ + ssize_t n, len; - pool_flush(pool_tap4); - pool_flush(pool_tap6); -restart: - while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) { - const struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n); + tap_flush_pools(); - if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU) { - n += len; - continue; - } + for (n = 0; + n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA); + n += len) { + len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA); - pcap(pkt_buf + n, len); + if (len == 0) { + die("EOF on tap device, exiting"); + } else if (len < 0) { + if (errno == EINTR) { + len = 0; + continue; + } - if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL); - } + if (errno == EAGAIN && errno == EWOULDBLOCK) + break; /* all done for now */ - switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - case ETH_P_IP: - packet_add(pool_tap4, len, pkt_buf + n); - break; - case ETH_P_IPV6: - packet_add(pool_tap6, len, pkt_buf + n); - break; - default: - break; + die("Error on 
tap device, exiting"); } - if ((n += len) == TAP_BUF_BYTES) - break; - } - - if (len < 0 && errno == EINTR) - goto restart; + /* Ignore frames of bad length */ + if (len < (ssize_t)sizeof(struct ethhdr) || + len > (ssize_t)L2_MAX_LEN_PASTA) + continue; - ret = errno; + tap_add_packet(c, len, pkt_buf + n, now); + } - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + tap_handler(c, now); +} - if (len > 0 || ret == EAGAIN) - return; +/** + * tap_handler_pasta() - Packet handler for /dev/net/tun file descriptor + * @c: Execution context + * @events: epoll events + * @now: Current timestamp + */ +void tap_handler_pasta(struct ctx *c, uint32_t events, + const struct timespec *now) +{ + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) + die("Disconnect event on /dev/net/tun device, exiting"); - if (n == TAP_BUF_BYTES) - goto redo; + if (events & EPOLLIN) + tap_pasta_input(c, now); +} - die("Error on tap device, exiting"); +/** + * tap_backend_show_hints() - Give help information to start QEMU + * @c: Execution context + */ +static void tap_backend_show_hints(struct ctx *c) +{ + switch (c->mode) { + case MODE_PASTA: + /* No hints */ + break; + case MODE_PASST: + info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); + info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", + c->sock_path); + info("or qrap, for earlier qemu versions:"); + info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); + break; + case MODE_VU: + info("You can start qemu with:"); + info(" kvm ... 
-chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n", + c->sock_path); + break; + } } /** - * tap_sock_unix_init() - Create and bind AF_UNIX socket, listen for connection + * tap_sock_unix_init() - Start listening for connections on AF_UNIX socket * @c: Execution context */ -static void tap_sock_unix_init(struct ctx *c) +static void tap_sock_unix_init(const struct ctx *c) { - int fd = socket(AF_UNIX, SOCK_STREAM, 0); union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN }; struct epoll_event ev = { 0 }; - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - int i; - - if (fd < 0) - die("UNIX socket: %s", strerror(errno)); - - /* In passt mode, we don't know the guest's MAC until it sends - * us packets. Use the broadcast address so our first packets - * will reach it. - */ - memset(&c->mac_guest, 0xff, sizeof(c->mac_guest)); - - for (i = 1; i < UNIX_SOCK_MAX; i++) { - char *path = addr.sun_path; - int ex, ret; - - if (*c->sock_path) - memcpy(path, c->sock_path, UNIX_PATH_MAX); - else - snprintf(path, UNIX_PATH_MAX - 1, UNIX_SOCK_PATH, i); - - ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); - if (ex < 0) - die("UNIX domain socket check: %s", strerror(errno)); - - ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); - if (!ret || (errno != ENOENT && errno != ECONNREFUSED && - errno != EACCES)) { - if (*c->sock_path) - die("Socket path %s already in use", path); - - close(ex); - continue; - } - close(ex); - - unlink(path); - if (!bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) || - *c->sock_path) - break; - } - if (i == UNIX_SOCK_MAX) - die("UNIX socket bind: %s", strerror(errno)); + listen(c->fd_tap_listen, 0); - info("UNIX domain socket bound at %s\n", addr.sun_path); - - listen(fd, 0); - - ref.fd = c->fd_tap_listen = fd; + ref.fd = c->fd_tap_listen; ev.events = EPOLLIN | EPOLLET; 
ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev); +} + +/** + * tap_start_connection() - start a new connection + * @c: Execution context + */ +static void tap_start_connection(const struct ctx *c) +{ + struct epoll_event ev = { 0 }; + union epoll_ref ref = { 0 }; + + ref.fd = c->fd_tap; + switch (c->mode) { + case MODE_PASST: + ref.type = EPOLL_TYPE_TAP_PASST; + break; + case MODE_PASTA: + ref.type = EPOLL_TYPE_TAP_PASTA; + break; + case MODE_VU: + ref.type = EPOLL_TYPE_VHOST_CMD; + break; + } - info("You can now start qemu (>= 7.2, with commit 13c6be96618c):"); - info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", - addr.sun_path); - info("or qrap, for earlier qemu versions:"); - info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); + ev.events = EPOLLIN | EPOLLRDHUP; + ev.data.u64 = ref.u64; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } /** @@ -1171,8 +1345,6 @@ static void tap_sock_unix_init(struct ctx *c) */ void tap_listen_handler(struct ctx *c, uint32_t events) { - union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST }; - struct epoll_event ev = { 0 }; int v = INT_MAX / 2; struct ucred ucred; socklen_t len; @@ -1211,10 +1383,7 @@ void tap_listen_handler(struct ctx *c, uint32_t events) setsockopt(c->fd_tap, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v))) trace("tap: failed to set SO_SNDBUF to %i", v); - ref.fd = c->fd_tap; - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); + tap_start_connection(c); } /** @@ -1238,11 +1407,11 @@ static int tap_ns_tun(void *arg) fd = open("/dev/net/tun", flags); if (fd < 0) - die("Failed to open() /dev/net/tun: %s", strerror(errno)); + die_perror("Failed to open() /dev/net/tun"); - rc = ioctl(fd, TUNSETIFF, &ifr); + rc = ioctl(fd, (int)TUNSETIFF, &ifr); if (rc < 0) - die("TUNSETIFF failed: %s", strerror(errno)); + die_perror("TUNSETIFF ioctl on 
/dev/net/tun failed"); if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn))) die("Tap device opened but no network interface found"); @@ -1258,59 +1427,70 @@ static int tap_ns_tun(void *arg) */ static void tap_sock_tun_init(struct ctx *c) { - union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASTA }; - struct epoll_event ev = { 0 }; - NS_CALL(tap_ns_tun, c); if (c->fd_tap == -1) die("Failed to set up tap device in namespace"); pasta_ns_conf(c); - ref.fd = c->fd_tap; - ev.events = EPOLLIN | EPOLLRDHUP; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); + tap_start_connection(c); } /** - * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor - * @c: Execution context + * tap_sock_update_pool() - Set the buffer base and size for the pool of packets + * @base: Buffer base + * @size Buffer size */ -void tap_sock_init(struct ctx *c) +void tap_sock_update_pool(void *base, size_t size) { - size_t sz = sizeof(pkt_buf); int i; - pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz); - pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz); + pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS_IP4, base, size); + pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS_IP6, base, size); for (i = 0; i < TAP_SEQS; i++) { - tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); - tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); + tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size); + tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, base, size); } +} - if (c->fd_tap != -1) { /* Passed as --fd */ - struct epoll_event ev = { 0 }; - union epoll_ref ref; +/** + * tap_backend_init() - Create and set up AF_UNIX socket or + * tuntap file descriptor + * @c: Execution context + */ +void tap_backend_init(struct ctx *c) +{ + if (c->mode == MODE_VU) { + tap_sock_update_pool(NULL, 0); + vu_init(c); + } else { + tap_sock_update_pool(pkt_buf, sizeof(pkt_buf)); + } + if (c->fd_tap != -1) { /* Passed as 
--fd */ ASSERT(c->one_off); - ref.fd = c->fd_tap; - if (c->mode == MODE_PASST) - ref.type = EPOLL_TYPE_TAP_PASST; - else - ref.type = EPOLL_TYPE_TAP_PASTA; - - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); + tap_start_connection(c); return; } - if (c->mode == MODE_PASST) { - if (c->fd_tap_listen == -1) - tap_sock_unix_init(c); - } else { + switch (c->mode) { + case MODE_PASTA: tap_sock_tun_init(c); + break; + case MODE_VU: + repair_sock_init(c); + /* fall through */ + case MODE_PASST: + tap_sock_unix_init(c); + + /* In passt mode, we don't know the guest's MAC address until it + * sends us packets. Use the broadcast address so that our + * first packets will reach it. + */ + memset(&c->guest_mac, 0xff, sizeof(c->guest_mac)); + break; } + + tap_backend_show_hints(c); } @@ -6,73 +6,104 @@ #ifndef TAP_H #define TAP_H +/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header) + * + * The kernel tuntap device imposes a maximum frame size of 65535 including + * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode). + */ +#define L2_MAX_LEN_PASTA USHRT_MAX + +/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header) + * + * The only structural limit the QEMU socket protocol imposes on frames is + * (2^32-1) bytes, but that would be ludicrously long in practice. For now, + * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate + * limit with more precision. + */ +#define L2_MAX_LEN_PASST USHRT_MAX + +/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header) + * + * vhost-user allows multiple buffers per frame, each of which can be quite + * large, so the inherent frame size limit is rather large. Much larger than is + * actually useful for IP. For now limit arbitrarily to 65535 bytes. FIXME: + * Work out an appropriate limit with more precision. 
+ */ +#define L2_MAX_LEN_VU USHRT_MAX + +struct udphdr; + /** - * struct tap_hdr - L2 and tap specific headers + * struct tap_hdr - tap backend specific headers * @vnet_len: Frame length (for qemu socket transport) - * @eh: Ethernet header */ struct tap_hdr { uint32_t vnet_len; - struct ethhdr eh; } __attribute__((packed)); -#define TAP_HDR_INIT(proto) { .eh.h_proto = htons_constant(proto) } - -static inline size_t tap_hdr_len_(const struct ctx *c) -{ - if (c->mode == MODE_PASST) - return sizeof(struct tap_hdr); - else - return sizeof(struct ethhdr); -} - /** - * tap_frame_base() - Find start of tap frame + * tap_hdr_iov() - struct iovec for a tap header * @c: Execution context - * @taph: Pointer to L2 and tap specific header buffer + * @taph: Pointer to tap specific header buffer * - * Returns: pointer to the start of tap frame - suitable for an - * iov_base to be passed to tap_send_frames()) + * Returns: A struct iovec covering the correct portion of @taph to use as the + * tap specific header in the current configuration. */ -static inline void *tap_frame_base(const struct ctx *c, struct tap_hdr *taph) +static inline struct iovec tap_hdr_iov(const struct ctx *c, + struct tap_hdr *thdr) { - return (char *)(taph + 1) - tap_hdr_len_(c); + return (struct iovec){ + .iov_base = thdr, + .iov_len = c->mode == MODE_PASST ? 
sizeof(*thdr) : 0, + }; } /** - * tap_frame_len() - Finalize tap frame and return total length - * @c: Execution context - * @taph: Tap header to finalize - * @plen: L3 packet length (excludes L2 and tap specific headers) - * - * Returns: length of the tap frame including L2 and tap specific - * headers - suitable for an iov_len to be passed to - * tap_send_frames() + * tap_hdr_update() - Update the tap specific header for a frame + * @taph: Tap specific header buffer to update + * @l2len: Frame length (including L2 headers) */ -static inline size_t tap_frame_len(const struct ctx *c, struct tap_hdr *taph, - size_t plen) +static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) { - if (c->mode == MODE_PASST) - taph->vnet_len = htonl(plen + sizeof(taph->eh)); - return plen + tap_hdr_len_(c); + if (thdr) + thdr->vnet_len = htonl(l2len); } -struct in_addr tap_ip4_daddr(const struct ctx *c); +unsigned long tap_l2_max_len(const struct ctx *c); +void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t l4len, uint8_t proto); +void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport, + struct in_addr dst, in_port_t dport, + const void *in, size_t dlen); +void *tap_push_uh6(struct udphdr *uh, + const struct in6_addr *src, in_port_t sport, + const struct in6_addr *dst, in_port_t dport, + void *in, size_t dlen); +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t l4len, uint8_t proto); +void *tap_push_ip6h(struct ipv6hdr *ip6h, + const struct in6_addr *src, + const struct in6_addr *dst, + size_t l4len, uint8_t proto, uint32_t flow); void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, struct in_addr dst, in_port_t dport, - const void *in, size_t len); + const void *in, size_t dlen); void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, - const void *in, size_t 
len); + const void *in, size_t l4len); const struct in6_addr *tap_ip6_daddr(const struct ctx *c, const struct in6_addr *src); +void *tap_push_ip6h(struct ipv6hdr *ip6h, + const struct in6_addr *src, const struct in6_addr *dst, + size_t l4len, uint8_t proto, uint32_t flow); void tap_udp6_send(const struct ctx *c, const struct in6_addr *src, in_port_t sport, const struct in6_addr *dst, in_port_t dport, - uint32_t flow, const void *in, size_t len); + uint32_t flow, void *in, size_t dlen); void tap_icmp6_send(const struct ctx *c, const struct in6_addr *src, const struct in6_addr *dst, - const void *in, size_t len); -void tap_send_single(const struct ctx *c, const void *data, size_t len); + const void *in, size_t l4len); +void tap_send_single(const struct ctx *c, const void *data, size_t l2len); size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t bufs_per_frame, size_t nframes); void eth_update_mac(struct ethhdr *eh, @@ -82,6 +113,13 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, const struct timespec *now); void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now); -void tap_sock_init(struct ctx *c); +int tap_sock_unix_open(char *sock_path); +void tap_sock_reset(struct ctx *c); +void tap_sock_update_pool(void *base, size_t size); +void tap_backend_init(struct ctx *c); +void tap_flush_pools(void); +void tap_handler(struct ctx *c, const struct timespec *now); +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, + const struct timespec *now); #endif /* TAP_H */ @@ -274,11 +274,13 @@ #include <net/if.h> #include <netinet/in.h> #include <netinet/ip.h> +#include <netinet/tcp.h> #include <stdint.h> #include <stdbool.h> #include <stddef.h> #include <string.h> #include <sys/epoll.h> +#include <sys/ioctl.h> #include <sys/socket.h> #include <sys/timerfd.h> #include <sys/types.h> @@ -286,10 +288,11 @@ #include <time.h> #include <arpa/inet.h> -#include <linux/tcp.h> /* For struct tcp_info */ +#include 
<linux/sockios.h> #include "checksum.h" #include "util.h" +#include "iov.h" #include "ip.h" #include "passt.h" #include "tap.h" @@ -299,65 +302,32 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "repair.h" +#include "linux_dep.h" #include "flow_table.h" +#include "tcp_internal.h" +#include "tcp_buf.h" +#include "tcp_vu.h" + +#ifndef __USE_MISC +/* From Linux UAPI, missing in netinet/tcp.h provided by musl */ +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; -/* Sides of a flow as we use them in "tap" connections */ -#define SOCKSIDE 0 -#define TAPSIDE 1 - -#define TCP_FRAMES_MEM 128 -#define TCP_FRAMES \ - (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) - -#define TCP_HASH_TABLE_LOAD 70 /* % */ -#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) - -#define MAX_WS 8 -#define MAX_WINDOW (1 << (16 + (MAX_WS))) +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; +#endif /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 - -struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[26]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct iphdr iph; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[14]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct ipv6hdr ip6h; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) - #define WINDOW_DEFAULT 14600 /* RFC 6928 */ -#ifdef HAS_SND_WND -# define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd) -#else -# define 
KERNEL_REPORTS_SND_WND(c) (0 && (c)) -#endif #define ACK_INTERVAL 10 /* ms */ #define SYN_TIMEOUT 10 /* s */ @@ -368,40 +338,25 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ -/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of - * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP - */ -#define SOL_TCP IPPROTO_TCP - -#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) -#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) -#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) -#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) - -#define FIN (1 << 0) -#define SYN (1 << 1) -#define RST (1 << 2) -#define ACK (1 << 4) -/* Flags for internal usage */ -#define DUP_ACK (1 << 5) #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ -#define OPT_EOL 0 -#define OPT_NOP 1 -#define OPT_MSS 2 -#define OPT_MSS_LEN 4 -#define OPT_WS 3 -#define OPT_WS_LEN 3 -#define OPT_SACKP 4 -#define OPT_SACK 5 -#define OPT_TS 8 - -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) -#define CONN_V6(conn) (!CONN_V4(conn)) #define CONN_IS_CLOSING(conn) \ - ((conn->events & ESTABLISHED) && \ - (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) -#define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) + (((conn)->events & ESTABLISHED) && \ + ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) +#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set)) + +/* Buffers to migrate pending data from send and receive queues. No, they don't + * use memory if we don't use them. And we're going away after this, so splurge. 
+ */ +#define TCP_MIGRATE_SND_QUEUE_MAX (64 << 20) +#define TCP_MIGRATE_RCV_QUEUE_MAX (64 << 20) +uint8_t tcp_migrate_snd_queue [TCP_MIGRATE_SND_QUEUE_MAX]; +uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX]; + +#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */ + +/* "Extended" data (not stored in the flow table) for TCP flow migration */ +static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX]; static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -415,179 +370,93 @@ static const char *tcp_state_str[] __attribute((__unused__)) = { "SYN_RCVD", /* approximately maps to TAP_SYN_ACK_SENT */ /* Passive close: */ - "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK", + "CLOSE_WAIT", "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", /* Active close (+5): */ "CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT", }; static const char *tcp_flag_str[] __attribute((__unused__)) = { "STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE", - "ACK_FROM_TAP_DUE", + "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", }; /* Listening sockets, used for automatic port forwarding in pasta mode only */ static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; -/* Table of guest side forwarding addresses with very low RTT (assumed - * to be local to the host), LRU +/* Table of our guest side addresses with very low RTT (assumed to be local to + * the host), LRU */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; -/** - * tcp_buf_seq_update - Sequences to update with length of frames once sent - * @seq: Pointer to sequence number sent to tap-side, to be updated - * @len: TCP payload length - */ -struct tcp_buf_seq_update { - uint32_t *seq; - uint16_t len; -}; +char tcp_buf_discard [MAX_WINDOW]; -/* Static buffers */ +/* Does the kernel support TCP_PEEK_OFF? 
*/ +bool peek_offset_cap; -/** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload - */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_buf[TCP_FRAMES_MEM]; +/* Size of data returned by TCP_INFO getsockopt() */ +socklen_t tcp_info_size; -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; +#define tcp_info_cap(f_) \ + ((offsetof(struct tcp_info_linux, tcpi_##f_) + \ + sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size) -static unsigned int tcp4_l2_buf_used; - -/** - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @data: Storage for TCP payload - */ -struct tcp6_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th; /* 72 60 */ - uint8_t data[MSS6]; /* 92 80 */ - /* 65536 65532 */ -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif 
-tcp6_l2_buf[TCP_FRAMES_MEM]; - -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_buf_used; - -/* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; -static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; - -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; +/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */ +#define snd_wnd_cap tcp_info_cap(snd_wnd) +/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */ +#define bytes_acked_cap tcp_info_cap(bytes_acked) +/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */ +#define min_rtt_cap tcp_info_cap(min_rtt) /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; -/** - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp4_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_flags_buf_used; +/* Pools for pre-opened sockets (in init) */ +int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; +int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; /** - * tcp6_l2_flags_buf_t - IPv6 packet buffers for 
segments without data (flags) - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options + * conn_at_sidx() - Get TCP connection specific flow at given sidx + * @sidx: Flow and side to retrieve + * + * Return: TCP connection at @sidx, or NULL of @sidx is invalid. Asserts if the + * flow at @sidx is not FLOW_TCP. */ -static struct tcp6_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_flags_buf_used; +static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx) +{ + union flow *flow = flow_at_sidx(sidx); -#define CONN(idx) (&(FLOW(idx)->tcp)) + if (!flow) + return NULL; -/* Table for lookup from remote address, local port, remote port */ -static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE]; + ASSERT(flow->f.type == FLOW_TCP); + return &flow->tcp; +} -static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, - "Safe linear probing requires hash table larger than connection table"); +/** + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported + * @conn: Pointer to the TCP connection structure + * @offset: Offset in bytes + * + * Return: -1 when it fails, 0 otherwise. 
+ */ +int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset) +{ + if (!peek_offset_cap) + return 0; -/* Pools for pre-opened sockets (in init) */ -int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; -int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; + if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF, + &offset, sizeof(offset))) { + flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset); + return -1; + } + return 0; +} /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events - * @conn_flags Connection flags + * @conn_flags: Connection flags * * Return: epoll events mask corresponding to implied connection state */ @@ -600,8 +469,12 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) if (events & TAP_FIN_SENT) return EPOLLET; - if (conn_flags & STALLED) - return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET; + if (conn_flags & STALLED) { + if (conn_flags & ACK_FROM_TAP_BLOCKS) + return EPOLLRDHUP | EPOLLET; + + return EPOLLIN | EPOLLRDHUP | EPOLLET; + } return EPOLLIN | EPOLLRDHUP; } @@ -609,17 +482,9 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) if (events == TAP_SYN_RCVD) return EPOLLOUT | EPOLLET | EPOLLRDHUP; - return EPOLLRDHUP; + return EPOLLET | EPOLLRDHUP; } -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag); -#define conn_flag(c, conn, flag) \ - do { \ - flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ - conn_flag_do(c, conn, flag); \ - } while (0) - /** * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events * @c: Execution context @@ -631,14 +496,14 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { int m = conn->in_epoll ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .flowside = FLOW_SIDX(conn, SOCKSIDE) }; + .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { if (conn->in_epoll) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); + epoll_del(c, conn->sock); if (conn->timer != -1) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev); + epoll_del(c, conn->timer); return 0; } @@ -687,8 +552,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) fd = timerfd_create(CLOCK_MONOTONIC, 0); if (fd == -1 || fd > FD_REF_MAX) { - flow_dbg(conn, "failed to get timer: %s", - strerror(errno)); + flow_dbg_perror(conn, "failed to get timer"); if (fd > -1) close(fd); conn->timer = -1; @@ -697,8 +561,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) conn->timer = fd; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { - flow_dbg(conn, "failed to add timer: %s", - strerror(errno)); + flow_dbg_perror(conn, "failed to add timer"); close(conn->timer); conn->timer = -1; return; @@ -722,7 +585,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) (unsigned long long)it.it_value.tv_sec, (unsigned long long)it.it_value.tv_nsec / 1000 / 1000); - timerfd_settime(conn->timer, 0, &it, NULL); + if (timerfd_settime(conn->timer, 0, &it, NULL)) + flow_perror(conn, "failed to set timer"); } /** @@ -731,8 +595,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset */ -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag) +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag) { if (flag & (flag - 1)) { int flag_index = fls(~flag); @@ -773,17 +637,14 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, 
tcp_timer_ctl(c, conn); } -static void tcp_hash_remove(const struct ctx *c, - const struct tcp_tap_conn *conn); - /** * conn_event_do() - Set and log connection events, update epoll state * @c: Execution context * @conn: Connection pointer * @event: Connection event */ -static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long event) +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event) { int prev, new, num = fls(event); @@ -821,7 +682,7 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, num == -1 ? "CLOSED" : tcp_event_str[num]); if (event == CLOSED) - tcp_hash_remove(c, conn); + flow_hash_remove(c, TAP_SIDX(conn)); else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) conn_flag(c, conn, ACTIVE_CLOSE); else @@ -831,12 +692,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_timer_ctl(c, conn); } -#define conn_event(c, conn, event) \ - do { \ - flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ - conn_event_do(c, conn, event); \ - } while (0) - /** * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint * @conn: Connection pointer @@ -845,10 +700,11 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, */ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) { + const struct flowside *tapside = TAPFLOW(conn); int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (inany_equals(&conn->faddr, low_rtt_dst + i)) + if (inany_equals(&tapside->oaddr, low_rtt_dst + i)) return 1; return 0; @@ -860,17 +716,17 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) * @tinfo: Pointer to struct tcp_info for socket */ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, - const struct tcp_info *tinfo) + const struct tcp_info_linux *tinfo) { -#ifdef HAS_MIN_RTT + const struct flowside *tapside = TAPFLOW(conn); int i, hole = -1; - if (!tinfo->tcpi_min_rtt || + if (!min_rtt_cap 
|| (int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD) return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (inany_equals(&conn->faddr, low_rtt_dst + i)) + if (inany_equals(&tapside->oaddr, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -882,14 +738,10 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, if (hole == -1) return; - low_rtt_dst[hole++] = conn->faddr; + low_rtt_dst[hole++] = tapside->oaddr; if (hole == LOW_RTT_TABLE_SIZE) hole = 0; inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); -#else - (void)conn; - (void)tinfo; -#endif /* HAS_MIN_RTT */ } /** @@ -918,137 +770,30 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) } /** - * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values + * tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm) * @s: Socket, can be -1 to avoid check in the caller */ -static void tcp_sock_set_bufsize(const struct ctx *c, int s) +static void tcp_sock_set_nodelay(int s) { - int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */ - if (s == -1) return; - if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_RCVBUF to %i", v); - - if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v))) - trace("TCP: failed to set SO_SNDBUF to %i", v); -} - -/** - * tcp_update_check_tcp4() - Update TCP checksum from stored one - * @iph: IPv4 header - * @th: TCP header followed by TCP payload - */ -static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th) -{ - uint16_t tlen = ntohs(iph->tot_len) - sizeof(struct iphdr); - struct in_addr saddr = { .s_addr = iph->saddr }; - struct in_addr daddr = { .s_addr = iph->daddr }; - uint32_t sum = proto_ipv4_header_psum(tlen, IPPROTO_TCP, saddr, daddr); - - th->check = 0; - th->check = csum(th, tlen, sum); + if (setsockopt(s, SOL_TCP, TCP_NODELAY, &((int){ 1 }), sizeof(int))) + debug("TCP: failed to 
set TCP_NODELAY on socket %i", s); } /** - * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 - * @ip6h: IPv6 header - * @th: TCP header followed by TCP payload + * tcp_update_csum() - Calculate TCP checksum + * @psum: Unfolded partial checksum of the IPv4 or IPv6 pseudo-header + * @th: TCP header (updated) + * @payload: TCP payload */ -static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) +static void tcp_update_csum(uint32_t psum, struct tcphdr *th, + struct iov_tail *payload) { - uint16_t payload_len = ntohs(ip6h->payload_len); - uint32_t sum = proto_ipv6_header_psum(payload_len, IPPROTO_TCP, - &ip6h->saddr, &ip6h->daddr); - th->check = 0; - th->check = csum(th, payload_len, sum); -} - -/** - * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses - * @eth_d: Ethernet destination address, NULL if unchanged - * @eth_s: Ethernet source address, NULL if unchanged - */ -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) -{ - int i; - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; - struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; - struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; - struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; - - eth_update_mac(&b4->taph.eh, eth_d, eth_s); - eth_update_mac(&b6->taph.eh, eth_d, eth_s); - eth_update_mac(&b4f->taph.eh, eth_d, eth_s); - eth_update_mac(&b6f->taph.eh, eth_d, eth_s); - } -} - -/** - * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets - * @c: Execution context - */ -static void tcp_sock4_iov_init(const struct ctx *c) -{ - struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; - int i; - - for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { - tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = iph, - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } - - for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { - 
tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) - }; - } - - for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph); - - for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph); -} - -/** - * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets - * @c: Execution context - */ -static void tcp_sock6_iov_init(const struct ctx *c) -{ - struct iovec *iov; - int i; - - for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { - tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } - - for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { - tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) - }; - } - - for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph); - - for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph); + psum = csum_unfolded(th, sizeof(*th), psum); + th->check = csum_iov_tail(payload, psum); } /** @@ -1110,163 +855,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, } /** - * tcp_hash_match() - Check if a connection entry matches address and ports - * @conn: Connection entry to match against - * @faddr: Guest side forwarding address - * @eport: Guest side endpoint port - * @fport: Guest side forwarding port - * - * Return: 1 on match, 0 otherwise - */ -static int tcp_hash_match(const struct tcp_tap_conn *conn, - const union inany_addr *faddr, - in_port_t eport, in_port_t fport) -{ - if (inany_equals(&conn->faddr, faddr) && - conn->eport == eport && 
conn->fport == fport) - return 1; - - return 0; -} - -/** - * tcp_hash() - Calculate hash value for connection given address and ports - * @c: Execution context - * @faddr: Guest side forwarding address - * @eport: Guest side endpoint port - * @fport: Guest side forwarding port - * - * Return: hash value, needs to be adjusted for table size - */ -static uint64_t tcp_hash(const struct ctx *c, const union inany_addr *faddr, - in_port_t eport, in_port_t fport) -{ - struct siphash_state state = SIPHASH_INIT(c->hash_secret); - - inany_siphash_feed(&state, faddr); - return siphash_final(&state, 20, (uint64_t)eport << 16 | fport); -} - -/** - * tcp_conn_hash() - Calculate hash bucket of an existing connection - * @c: Execution context - * @conn: Connection - * - * Return: hash value, needs to be adjusted for table size - */ -static uint64_t tcp_conn_hash(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); -} - -/** - * tcp_hash_probe() - Find hash bucket for a connection - * @c: Execution context - * @conn: Connection to find bucket for - * - * Return: If @conn is in the table, its current bucket, otherwise a suitable - * free bucket for it. 
- */ -static inline unsigned tcp_hash_probe(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE); - unsigned b = tcp_conn_hash(c, conn) % TCP_HASH_TABLE_SIZE; - - /* Linear probing */ - while (!flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) && - !flow_sidx_eq(tc_hash[b], sidx)) - b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - - return b; -} - -/** - * tcp_hash_insert() - Insert connection into hash table, chain link - * @c: Execution context - * @conn: Connection pointer - */ -static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) -{ - unsigned b = tcp_hash_probe(c, conn); - - tc_hash[b] = FLOW_SIDX(conn, TAPSIDE); - flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); -} - -/** - * tcp_hash_remove() - Drop connection from hash table, chain unlink - * @c: Execution context - * @conn: Connection pointer - */ -static void tcp_hash_remove(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - unsigned b = tcp_hash_probe(c, conn), s; - union flow *flow = flow_at_sidx(tc_hash[b]); - - if (!flow) - return; /* Redundant remove */ - - flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); - - /* Scan the remainder of the cluster */ - for (s = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - (flow = flow_at_sidx(tc_hash[s])); - s = mod_sub(s, 1, TCP_HASH_TABLE_SIZE)) { - unsigned h = tcp_conn_hash(c, &flow->tcp) % TCP_HASH_TABLE_SIZE; - - if (!mod_between(h, s, b, TCP_HASH_TABLE_SIZE)) { - /* tc_hash[s] can live in tc_hash[b]'s slot */ - debug("hash table remove: shuffle %u -> %u", s, b); - tc_hash[b] = tc_hash[s]; - b = s; - } - } - - tc_hash[b] = FLOW_SIDX_NONE; -} - -/** - * tcp_hash_lookup() - Look up connection given remote address and ports - * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @faddr: Guest side forwarding address (guest remote address) - * @eport: Guest side endpoint port (guest local port) - * @fport: Guest side forwarding port (guest 
remote port) - * - * Return: connection pointer, if found, -ENOENT otherwise - */ -static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, - sa_family_t af, const void *faddr, - in_port_t eport, in_port_t fport) -{ - union inany_addr aany; - union flow *flow; - unsigned b; - - inany_from_af(&aany, af, faddr); - - b = tcp_hash(c, &aany, eport, fport) % TCP_HASH_TABLE_SIZE; - while ((flow = flow_at_sidx(tc_hash[b])) && - !tcp_hash_match(&flow->tcp, &aany, eport, fport)) - b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - - return &flow->tcp; -} - -/** * tcp_flow_defer() - Deferred per-flow handling (clean up closed connections) - * @flow: Flow table entry for this connection + * @conn: Connection to handle * - * Return: true if the flow is ready to free, false otherwise + * Return: true if the connection is ready to free, false otherwise */ -bool tcp_flow_defer(union flow *flow) +bool tcp_flow_defer(const struct tcp_tap_conn *conn) { - const struct tcp_tap_conn *conn = &flow->tcp; - - if (flow->tcp.events != CLOSED) + if (conn->events != CLOSED) return false; close(conn->sock); @@ -1276,46 +872,6 @@ bool tcp_flow_defer(union flow *flow) return true; } -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); -#define tcp_rst(c, conn) \ - do { \ - flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ - tcp_rst_do(c, conn); \ - } while (0) - -/** - * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags) - * @c: Execution context - */ -static void tcp_l2_flags_buf_flush(const struct ctx *c) -{ - tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used); - tcp6_l2_flags_buf_used = 0; - - tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used); - tcp4_l2_flags_buf_used = 0; -} - -/** - * tcp_l2_data_buf_flush() - Send out buffers for segments with data - * @c: Execution context - */ -static void tcp_l2_data_buf_flush(const struct ctx *c) -{ - unsigned i; - size_t m; - - m = tap_send_frames(c, tcp6_l2_iov, 1, 
tcp6_l2_buf_used); - for (i = 0; i < m; i++) - *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; - tcp6_l2_buf_used = 0; - - m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used); - for (i = 0; i < m; i++) - *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; - tcp4_l2_buf_used = 0; -} - /** * tcp_defer_handler() - Handler for TCP deferred tasks * @c: Execution context @@ -1323,8 +879,7 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ void tcp_defer_handler(struct ctx *c) { - tcp_l2_flags_buf_flush(c); - tcp_l2_data_buf_flush(c); + tcp_payload_flush(c); } /** @@ -1335,10 +890,12 @@ void tcp_defer_handler(struct ctx *c) * @seq: Sequence number */ static void tcp_fill_header(struct tcphdr *th, - const struct tcp_tap_conn *conn, uint32_t seq) + const struct tcp_tap_conn *conn, uint32_t seq) { - th->source = htons(conn->fport); - th->dest = htons(conn->eport); + const struct flowside *tapside = TAPFLOW(conn); + + th->source = htons(tapside->oport); + th->dest = htons(tapside->eport); th->seq = htonl(seq); th->ack_seq = htonl(conn->seq_ack_to_tap); if (conn->events & ESTABLISHED) { @@ -1351,120 +908,80 @@ static void tcp_fill_header(struct tcphdr *th, } /** - * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers - * @c: Execution context - * @conn: Connection pointer - * @iph: Pointer to IPv4 header - * @th: Pointer to TCP header - * @plen: Payload length (including TCP header options) - * @check: Checksum, if already known - * @seq: Sequence number for this segment - * - * Return: The total length of the IPv4 packet, host order + * tcp_fill_headers() - Fill 802.3, IP, TCP headers + * @conn: Connection pointer + * @taph: tap backend specific header + * @ip4h: Pointer to IPv4 header, or NULL + * @ip6h: Pointer to IPv6 header, or NULL + * @th: Pointer to TCP header + * @payload: TCP payload + * @ip4_check: IPv4 checksum, if already known + * 
@seq: Sequence number for this segment + * @no_tcp_csum: Do not set TCP checksum */ -static size_t tcp_fill_headers4(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct iphdr *iph, struct tcphdr *th, - size_t plen, const uint16_t *check, - uint32_t seq) +void tcp_fill_headers(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct iphdr *ip4h, struct ipv6hdr *ip6h, + struct tcphdr *th, struct iov_tail *payload, + const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum) { - size_t ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); - const struct in_addr *a4 = inany_v4(&conn->faddr); - - ASSERT(a4); - - iph->tot_len = htons(ip_len); - iph->saddr = a4->s_addr; - iph->daddr = c->ip4.addr_seen.s_addr; - - iph->check = check ? *check : - csum_ip4_header(iph->tot_len, IPPROTO_TCP, - *a4, c->ip4.addr_seen); - - tcp_fill_header(th, conn, seq); + const struct flowside *tapside = TAPFLOW(conn); + size_t l4len = iov_tail_size(payload) + sizeof(*th); + size_t l3len = l4len; + uint32_t psum = 0; - tcp_update_check_tcp4(iph, th); + if (ip4h) { + const struct in_addr *src4 = inany_v4(&tapside->oaddr); + const struct in_addr *dst4 = inany_v4(&tapside->eaddr); - return ip_len; -} + ASSERT(src4 && dst4); -/** - * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers - * @c: Execution context - * @conn: Connection pointer - * @ip6h: Pointer to IPv6 header - * @th: Pointer to TCP header - * @plen: Payload length (including TCP header options) - * @check: Checksum, if already known - * @seq: Sequence number for this segment - * - * Return: The total length of the IPv6 packet, host order - */ -static size_t tcp_fill_headers6(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct ipv6hdr *ip6h, struct tcphdr *th, - size_t plen, uint32_t seq) -{ - size_t ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + l3len += + sizeof(*ip4h); - ip6h->payload_len = htons(plen + sizeof(struct tcphdr)); - ip6h->saddr = 
conn->faddr.a6; - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) - ip6h->daddr = c->ip6.addr_ll_seen; - else - ip6h->daddr = c->ip6.addr_seen; + ip4h->tot_len = htons(l3len); + ip4h->saddr = src4->s_addr; + ip4h->daddr = dst4->s_addr; - ip6h->hop_limit = 255; - ip6h->version = 6; - ip6h->nexthdr = IPPROTO_TCP; - - ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf; - ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; - ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; - - tcp_fill_header(th, conn, seq); + if (ip4_check) + ip4h->check = *ip4_check; + else + ip4h->check = csum_ip4_header(l3len, IPPROTO_TCP, + *src4, *dst4); - tcp_update_check_tcp6(ip6h, th); + if (!no_tcp_csum) { + psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, + *src4, *dst4); + } + } - return ip_len; -} + if (ip6h) { + l3len += sizeof(*ip6h); -/** - * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers - * @c: Execution context - * @conn: Connection pointer - * @p: Pointer to any type of TCP pre-cooked buffer - * @plen: Payload length (including TCP header options) - * @check: Checksum, if already known - * @seq: Sequence number for this segment - * - * Return: frame length including L2 headers, host order - */ -static size_t tcp_l2_buf_fill_headers(const struct ctx *c, - const struct tcp_tap_conn *conn, - void *p, size_t plen, - const uint16_t *check, uint32_t seq) -{ - const struct in_addr *a4 = inany_v4(&conn->faddr); - size_t ip_len, tlen; + ip6h->payload_len = htons(l4len); + ip6h->saddr = tapside->oaddr.a6; + ip6h->daddr = tapside->eaddr.a6; - if (a4) { - struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; + ip6h->hop_limit = 255; + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_TCP; - ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen, - check, seq); + ip6_set_flow_lbl(ip6h, conn->sock); - tlen = tap_frame_len(c, &b->taph, ip_len); - } else { - struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; + if (!no_tcp_csum) { + psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, + 
&ip6h->saddr, + &ip6h->daddr); + } + } - ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen, - seq); + tcp_fill_header(th, conn, seq); - tlen = tap_frame_len(c, &b->taph, ip_len); - } + if (no_tcp_csum) + th->check = 0; + else + tcp_update_csum(psum, th, payload); - return tlen; + tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); } /** @@ -1476,43 +993,42 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, * * Return: 1 if sequence or window were updated, 0 otherwise */ -static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - int force_seq, struct tcp_info *tinfo) +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + bool force_seq, struct tcp_info_linux *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; /* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */ socklen_t sl = sizeof(*tinfo); - struct tcp_info tinfo_new; + struct tcp_info_linux tinfo_new; uint32_t new_wnd_to_tap = prev_wnd_to_tap; int s = conn->sock; -#ifndef HAS_BYTES_ACKED - (void)force_seq; - - conn->seq_ack_to_tap = conn->seq_from_tap; - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) - conn->seq_ack_to_tap = prev_ack_to_tap; -#else - if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn) - || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) { + if (!bytes_acked_cap) { conn->seq_ack_to_tap = conn->seq_from_tap; - } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { - if (!tinfo) { - tinfo = &tinfo_new; - if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) - return 0; - } - - conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + - conn->seq_init_from_tap; - if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) conn->seq_ack_to_tap = prev_ack_to_tap; + } else { + if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || + tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) || + (conn->flags & LOCAL) || force_seq) { + conn->seq_ack_to_tap = 
conn->seq_from_tap; + } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { + if (!tinfo) { + tinfo = &tinfo_new; + if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) + return 0; + } + + conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + + conn->seq_init_from_tap; + + if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) + conn->seq_ack_to_tap = prev_ack_to_tap; + } } -#endif /* !HAS_BYTES_ACKED */ - if (!KERNEL_REPORTS_SND_WND(c)) { + if (!snd_wnd_cap) { tcp_get_sndbuf(conn); new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, @@ -1523,14 +1039,13 @@ static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, if (!tinfo) { if (prev_wnd_to_tap > WINDOW_DEFAULT) { goto out; -} + } tinfo = &tinfo_new; if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) { goto out; -} + } } -#ifdef HAS_SND_WND if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { new_wnd_to_tap = tinfo->tcpi_snd_wnd; } else { @@ -1538,7 +1053,6 @@ static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, SNDBUF_GET(conn)); } -#endif new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); if (!(conn->events & ESTABLISHED)) @@ -1565,7 +1079,7 @@ out: * tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters * @c: Execution context * @conn: Connection pointer - * @seq Current ACK sequence, host order + * @seq: Current ACK sequence, host order */ static void tcp_update_seqack_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, uint32_t seq) @@ -1584,72 +1098,48 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, } /** - * tcp_send_flag() - Send segment with flags to tap (no payload) + * tcp_prepare_flags() - Prepare header for flags-only segment (no payload) * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due + * @th: TCP header to update + * @opts: TCP option buffer 
(output parameter) + * @optlen: size of the TCP option buffer (output parameter) * - * Return: negative error code on connection reset, 0 otherwise + * Return: < 0 error code on connection reset, + * 0 if there is no flag to send + * 1 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, + int flags, struct tcphdr *th, struct tcp_syn_opts *opts, + size_t *optlen) { - uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; - uint32_t prev_wnd_to_tap = conn->wnd_to_tap; - struct tcp4_l2_flags_buf_t *b4 = NULL; - struct tcp6_l2_flags_buf_t *b6 = NULL; - struct tcp_info tinfo = { 0 }; + struct tcp_info_linux tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; - size_t optlen = 0; - struct iovec *iov; - struct tcphdr *th; - char *data; - void *p; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && - !flags && conn->wnd_to_tap) + !flags && conn->wnd_to_tap) { + conn_flag(c, conn, ~ACK_TO_TAP_DUE); return 0; + } if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { conn_event(c, conn, CLOSED); return -ECONNRESET; } -#ifdef HAS_SND_WND - if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd) - c->tcp.kernel_snd_wnd = 1; -#endif - if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); - if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) + if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags) return 0; - if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; - p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; - th = &b4->th; - - /* gcc 11.2 would complain on data = (char *)(th + 1); */ - data = b4->opts; - } else { - iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; - p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; - th = &b6->th; - data = b6->opts; - } - + *optlen = 0; if (flags & SYN) { int mss; - /* Options: MSS, NOP and window scale (8 bytes) */ - optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; - - *data++ = 
OPT_MSS; - *data++ = OPT_MSS_LEN; - - if (c->mtu == -1) { + if (!c->mtu) { mss = tinfo.tcpi_snd_mss; } else { mss = c->mtu - sizeof(struct tcphdr); @@ -1664,32 +1154,22 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) else if (mss > PAGE_SIZE) mss = ROUND_DOWN(mss, PAGE_SIZE); } - *(uint16_t *)data = htons(MIN(USHRT_MAX, mss)); - - data += OPT_MSS_LEN - 2; conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale); - *data++ = OPT_NOP; - *data++ = OPT_WS; - *data++ = OPT_WS_LEN; - *data++ = conn->ws_to_tap; - } else if (!(flags & RST)) { - if (conn->seq_ack_to_tap != prev_ack_to_tap || - !prev_wnd_to_tap) - flags |= ACK; + *opts = TCP_SYN_OPTS(mss, conn->ws_to_tap); + *optlen = sizeof(*opts); + } else { + flags |= ACK; } - th->doff = (sizeof(*th) + optlen) / 4; + th->doff = (sizeof(*th) + *optlen) / 4; th->ack = !!(flags & ACK); th->rst = !!(flags & RST); th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, - NULL, conn->seq_to_tap); - if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) conn_flag(c, conn, ~ACK_TO_TAP_DUE); @@ -1704,27 +1184,24 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; - if (CONN_V4(conn)) { - if (flags & DUP_ACK) { - memcpy(b4 + 1, b4, sizeof(*b4)); - (iov + 1)->iov_len = iov->iov_len; - tcp4_l2_flags_buf_used++; - } - - if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); - } else { - if (flags & DUP_ACK) { - memcpy(b6 + 1, b6, sizeof(*b6)); - (iov + 1)->iov_len = iov->iov_len; - tcp6_l2_flags_buf_used++; - } + return 1; +} - if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); - } +/** + * tcp_send_flag() - Send segment with flags to tap (no payload) + * @c: Execution context + * @conn: Connection pointer + * @flags: TCP flags: if not set, send segment only if ACK is due + * + * 
Return: negative error code on connection reset, 0 otherwise + */ +static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, + int flags) +{ + if (c->mode == MODE_VU) + return tcp_vu_send_flag(c, conn, flags); - return 0; + return tcp_buf_send_flag(c, conn, flags); } /** @@ -1732,13 +1209,13 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @c: Execution context * @conn: Connection pointer */ -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) { if (conn->events == CLOSED) return; - if (!tcp_send_flag(c, conn, RST)) - conn_event(c, conn, CLOSED); + tcp_send_flag(c, conn, RST); + conn_event(c, conn, CLOSED); } /** @@ -1761,11 +1238,19 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, /** * tcp_tap_window_update() - Process an updated window from tap side * @conn: Connection pointer - * @window: Window value, host order, unscaled + * @wnd: Window value, host order, unscaled */ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) { wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); + + /* Work-around for bug introduced in peer kernel code, commit + * e2142825c120 ("net: tcp: send zero-window ACK when no memory"). + * We don't update if window shrank to zero. 
+ */ + if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) + return; + conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); /* FIXME: reflect the tap-side receiver's window back to the sock-side @@ -1773,33 +1258,18 @@ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) } /** - * tcp_seq_init() - Calculate initial sequence number according to RFC 6528 - * @c: Execution context - * @conn: TCP connection, with faddr, fport and eport populated + * tcp_init_seq() - Calculate initial sequence number according to RFC 6528 + * @hash: Hash of connection details * @now: Current timestamp + * + * Return: the calculated 32-bit initial sequence number. */ -static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, - const struct timespec *now) +static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now) { - struct siphash_state state = SIPHASH_INIT(c->hash_secret); - union inany_addr aany; - uint64_t hash; - uint32_t ns; - - if (CONN_V4(conn)) - inany_from_af(&aany, AF_INET, &c->ip4.addr); - else - inany_from_af(&aany, AF_INET6, &c->ip6.addr); - - inany_siphash_feed(&state, &conn->faddr); - inany_siphash_feed(&state, &aany); - hash = siphash_final(&state, 36, - (uint64_t)conn->fport << 16 | conn->eport); - /* 32ns ticks, overflows 32 bits every 137s */ - ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; + uint32_t ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; - conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + return ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; } /** @@ -1822,16 +1292,15 @@ int tcp_conn_pool_sock(int pool[]) /** * tcp_conn_new_sock() - Open and prepare new socket for connection - * @c: Execution context * @af: Address family * * Return: socket number on success, negative code if socket creation failed */ -static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) +static int tcp_conn_new_sock(sa_family_t af) { int s; - s = socket(af, SOCK_STREAM | 
SOCK_NONBLOCK, IPPROTO_TCP); + s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP); if (s > FD_REF_MAX) { close(s); @@ -1841,19 +1310,18 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) if (s < 0) return -errno; - tcp_sock_set_bufsize(c, s); + tcp_sock_set_nodelay(s); return s; } /** * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace - * @c: Execution context * @af: Address family (AF_INET or AF_INET6) * * Return: Socket fd on success, -errno on failure */ -int tcp_conn_sock(const struct ctx *c, sa_family_t af) +int tcp_conn_sock(sa_family_t af) { int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4; int s; @@ -1864,11 +1332,11 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af) /* If the pool is empty we just open a new one without refilling the * pool to keep latency down. */ - if ((s = tcp_conn_new_sock(c, af)) >= 0) + if ((s = tcp_conn_new_sock(af)) >= 0) return s; err("TCP: Unable to open socket for new connection: %s", - strerror(-s)); + strerror_(-s)); return -1; } @@ -1902,53 +1370,47 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn, /** * tcp_bind_outbound() - Bind socket to outbound address and interface if given * @c: Execution context + * @conn: Connection entry for socket to bind * @s: Outbound TCP socket - * @af: Address family */ -static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af) +static void tcp_bind_outbound(const struct ctx *c, + const struct tcp_tap_conn *conn, int s) { - if (af == AF_INET) { - if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.addr_out)) { - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = 0, - .sin_addr = c->ip4.addr_out, - }; - - if (bind(s, (struct sockaddr *)&addr4, sizeof(addr4))) { - debug("Can't bind IPv4 TCP socket address: %s", - strerror(errno)); - } + const struct flowside *tgt = &conn->f.side[TGTSIDE]; + union sockaddr_inany bind_sa; + socklen_t sl; + + + pif_sockaddr(c, &bind_sa, &sl, 
PIF_HOST, &tgt->oaddr, tgt->oport); + if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) { + if (bind(s, &bind_sa.sa, sl)) { + char sstr[INANY_ADDRSTRLEN]; + + flow_dbg_perror(conn, + "Can't bind TCP outbound socket to %s:%hu", + inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), + tgt->oport); } + } + if (bind_sa.sa_family == AF_INET) { if (*c->ip4.ifname_out) { if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip4.ifname_out, strlen(c->ip4.ifname_out))) { - debug("Can't bind IPv4 TCP socket to interface:" - " %s", strerror(errno)); - } - } - } else if (af == AF_INET6) { - if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_out)) { - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = 0, - .sin6_addr = c->ip6.addr_out, - }; - - if (bind(s, (struct sockaddr *)&addr6, sizeof(addr6))) { - debug("Can't bind IPv6 TCP socket address: %s", - strerror(errno)); + flow_dbg_perror(conn, + "Can't bind IPv4 TCP socket to interface %s", + c->ip4.ifname_out); } } - + } else if (bind_sa.sa_family == AF_INET6) { if (*c->ip6.ifname_out) { if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip6.ifname_out, strlen(c->ip6.ifname_out))) { - debug("Can't bind IPv6 TCP socket to interface:" - " %s", strerror(errno)); + flow_dbg_perror(conn, + "Can't bind IPv6 TCP socket to interface %s", + c->ip6.ifname_out); } } } @@ -1964,88 +1426,81 @@ static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af) * @opts: Pointer to start of options * @optlen: Bytes in options: caller MUST ensure available length * @now: Current timestamp + * + * #syscalls:vu getsockname */ -static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, +static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, const void *saddr, const void *daddr, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) { in_port_t srcport = ntohs(th->source); in_port_t dstport = ntohs(th->dest); - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(dstport), 
- .sin_addr = *(struct in_addr *)daddr, - }; - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(dstport), - .sin6_addr = *(struct in6_addr *)daddr, - }; - const struct sockaddr *sa; + const struct flowside *ini; struct tcp_tap_conn *conn; + union sockaddr_inany sa; + struct flowside *tgt; union flow *flow; int s = -1, mss; + uint64_t hash; socklen_t sl; if (!(flow = flow_alloc())) return; - if (af == AF_INET) { - if (IN4_IS_ADDR_UNSPECIFIED(saddr) || - IN4_IS_ADDR_BROADCAST(saddr) || - IN4_IS_ADDR_MULTICAST(saddr) || srcport == 0 || - IN4_IS_ADDR_UNSPECIFIED(daddr) || - IN4_IS_ADDR_BROADCAST(daddr) || - IN4_IS_ADDR_MULTICAST(daddr) || dstport == 0) { - char sstr[INET_ADDRSTRLEN], dstr[INET_ADDRSTRLEN]; - - debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu", - inet_ntop(AF_INET, saddr, sstr, sizeof(sstr)), - srcport, - inet_ntop(AF_INET, daddr, dstr, sizeof(dstr)), - dstport); - goto cancel; - } - } else if (af == AF_INET6) { - if (IN6_IS_ADDR_UNSPECIFIED(saddr) || - IN6_IS_ADDR_MULTICAST(saddr) || srcport == 0 || - IN6_IS_ADDR_UNSPECIFIED(daddr) || - IN6_IS_ADDR_MULTICAST(daddr) || dstport == 0) { - char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN]; - - debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu", - inet_ntop(AF_INET6, saddr, sstr, sizeof(sstr)), - srcport, - inet_ntop(AF_INET6, daddr, dstr, sizeof(dstr)), - dstport); - goto cancel; - } - } + ini = flow_initiate_af(flow, PIF_TAP, + af, saddr, srcport, daddr, dstport); + + if (!(tgt = flow_target(c, flow, IPPROTO_TCP))) + goto cancel; - if ((s = tcp_conn_sock(c, af)) < 0) + if (flow->f.pif[TGTSIDE] != PIF_HOST) { + flow_err(flow, "No support for forwarding TCP from %s to %s", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[TGTSIDE])); goto cancel; + } + + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); - if (!c->no_map_gw) { - if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw)) - addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - if (af == AF_INET6 && 
IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw)) - addr6.sin6_addr = in6addr_loopback; + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || + !inany_is_unicast(&ini->oaddr) || ini->oport == 0) { + char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN]; + + debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu", + inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport, + inany_ntop(&ini->oaddr, dstr, sizeof(dstr)), ini->oport); + goto cancel; } - if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) { - struct sockaddr_in6 addr6_ll = { - .sin6_family = AF_INET6, - .sin6_addr = c->ip6.addr_ll, - .sin6_scope_id = c->ifi6, - }; - if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) + if ((s = tcp_conn_sock(af)) < 0) + goto cancel; + + pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport); + + /* Use bind() to check if the target address is local (EADDRINUSE or + * similar) and already bound, and set the LOCAL flag in that case. + * + * If bind() succeeds, in general, we could infer that nobody (else) is + * listening on that address and port and reset the connection attempt + * early, but we can't rely on that if non-local binds are enabled, + * because bind() would succeed for any non-local address we can reach. + * + * So, if bind() succeeds, close the socket, get a new one, and proceed. 
+ */ + if (bind(s, &sa.sa, sl)) { + if (errno != EADDRNOTAVAIL && errno != EACCES) + conn_flag(c, conn, LOCAL); + } else { + /* Not a local, bound destination, inconclusive test */ + close(s); + if ((s = tcp_conn_sock(af)) < 0) goto cancel; } - conn = FLOW_START(flow, FLOW_TCP, tcp, TAPSIDE); conn->sock = s; conn->timer = -1; + conn->listening_sock = -1; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; @@ -2063,44 +1518,20 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) conn->wnd_from_tap = 1; - inany_from_af(&conn->faddr, af, daddr); - - if (af == AF_INET) { - sa = (struct sockaddr *)&addr4; - sl = sizeof(addr4); - } else { - sa = (struct sockaddr *)&addr6; - sl = sizeof(addr6); - } - - conn->fport = dstport; - conn->eport = srcport; - conn->seq_init_from_tap = ntohl(th->seq); conn->seq_from_tap = conn->seq_init_from_tap + 1; conn->seq_ack_to_tap = conn->seq_from_tap; - tcp_seq_init(c, conn, now); + hash = flow_hash_insert(c, TAP_SIDX(conn)); + conn->seq_to_tap = tcp_init_seq(hash, now); conn->seq_ack_from_tap = conn->seq_to_tap; - tcp_hash_insert(c, conn); - - if (!bind(s, sa, sl)) { - tcp_rst(c, conn); /* Nobody is listening then */ - return; - } - if (errno != EADDRNOTAVAIL && errno != EACCES) - conn_flag(c, conn, LOCAL); - - if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) || - (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) && - !IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr))) - tcp_bind_outbound(c, s, af); + tcp_bind_outbound(c, conn, s); - if (connect(s, sa, sl)) { + if (connect(s, &sa.sa, sl)) { if (errno != EINPROGRESS) { tcp_rst(c, conn); - return; + goto cancel; } tcp_get_sndbuf(conn); @@ -2108,12 +1539,21 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, tcp_get_sndbuf(conn); if (tcp_send_flag(c, conn, SYN | ACK)) - return; + goto cancel; conn_event(c, conn, TAP_SYN_ACK_SENT); } tcp_epoll_ctl(c, conn); + + if 
(c->mode == MODE_VU) { /* To rebind to same oport after migration */ + sl = sizeof(sa); + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0) + err_perror("Can't get local address for socket %i", s); + } + + FLOW_ACTIVATE(conn); return; cancel: @@ -2156,46 +1596,6 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) } /** - * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer - * @c: Execution context - * @conn: Connection pointer - * @plen: Payload length at L4 - * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer - * @seq: Sequence number to be sent - */ -static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, - ssize_t plen, int no_csum, uint32_t seq) -{ - uint32_t *seq_update = &conn->seq_to_tap; - struct iovec *iov; - - if (CONN_V4(conn)) { - struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; - const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; - - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; - - iov = tcp4_l2_iov + tcp4_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - check, seq); - if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) - tcp_l2_data_buf_flush(c); - } else if (CONN_V6(conn)) { - struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; - - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; - - iov = tcp6_l2_iov + tcp6_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - NULL, seq); - if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) - tcp_l2_data_buf_flush(c); - } -} - -/** * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window * @c: Execution context * @conn: Connection pointer @@ -2204,125 +1604,12 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, * * 
#syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; - int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; - int sendlen, len, plen, v4 = CONN_V4(conn); - int s = conn->sock, i, ret = 0; - struct msghdr mh_sock = { 0 }; - uint16_t mss = MSS_GET(conn); - uint32_t already_sent, seq; - struct iovec *iov; - - already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; - - if (SEQ_LT(already_sent, 0)) { - /* RFC 761, section 2.1. */ - flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", - conn->seq_ack_from_tap, conn->seq_to_tap); - conn->seq_to_tap = conn->seq_ack_from_tap; - already_sent = 0; - } - - if (!wnd_scaled || already_sent >= wnd_scaled) { - conn_flag(c, conn, STALLED); - conn_flag(c, conn, ACK_FROM_TAP_DUE); - return 0; - } - - /* Set up buffer descriptors we'll fill completely and partially. 
*/ - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); - if (fill_bufs > TCP_FRAMES) { - fill_bufs = TCP_FRAMES; - iov_rem = 0; - } else { - iov_rem = (wnd_scaled - already_sent) % mss; - } - - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; - - if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || - (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { - tcp_l2_data_buf_flush(c); - - /* Silence Coverity CWE-125 false positive */ - tcp4_l2_buf_used = tcp6_l2_buf_used = 0; - } - - for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { - if (v4) - iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; - else - iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; - iov->iov_len = mss; - } - if (iov_rem) - iov_sock[fill_bufs].iov_len = iov_rem; + if (c->mode == MODE_VU) + return tcp_vu_data_from_sock(c, conn); - /* Receive into buffers, don't dequeue until acknowledged by guest. */ - do - len = recvmsg(s, &mh_sock, MSG_PEEK); - while (len < 0 && errno == EINTR); - - if (len < 0) - goto err; - - if (!len) { - if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { - if ((ret = tcp_send_flag(c, conn, FIN | ACK))) { - tcp_rst(c, conn); - return ret; - } - - conn_event(c, conn, TAP_FIN_SENT); - } - - return 0; - } - - sendlen = len - already_sent; - if (sendlen <= 0) { - conn_flag(c, conn, STALLED); - return 0; - } - - conn_flag(c, conn, ~STALLED); - - send_bufs = DIV_ROUND_UP(sendlen, mss); - last_len = sendlen - (send_bufs - 1) * mss; - - /* Likely, some new data was acked too. 
*/ - tcp_update_seqack_wnd(c, conn, 0, NULL); - - /* Finally, queue to tap */ - plen = mss; - seq = conn->seq_to_tap; - for (i = 0; i < send_bufs; i++) { - int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; - - if (i == send_bufs - 1) - plen = last_len; - - tcp_data_to_tap(c, conn, plen, no_csum, seq); - seq += plen; - } - - conn_flag(c, conn, ACK_FROM_TAP_DUE); - - return 0; - -err: - if (errno != EAGAIN && errno != EWOULDBLOCK) { - ret = -errno; - tcp_rst(c, conn); - } - - return ret; + return tcp_buf_data_from_sock(c, conn); } /** @@ -2336,8 +1623,8 @@ err: * * Return: count of consumed packets */ -static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, - const struct pool *p, int idx) +static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + const struct pool *p, int idx) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; @@ -2378,6 +1665,22 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, continue; seq = ntohl(th->seq); + if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) { + flow_trace(conn, + "keep-alive sequence: %u, previous: %u", + seq, conn->seq_from_tap); + + tcp_send_flag(c, conn, ACK); + tcp_timer_ctl(c, conn); + + if (p->count == 1) { + tcp_tap_window_update(conn, ntohs(th->window)); + return 1; + } + + continue; + } + ack_seq = ntohl(th->ack_seq); if (th->ack) { @@ -2456,6 +1759,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); conn->seq_to_tap = max_ack_seq; + if (tcp_set_peek_offset(conn, 0)) { + tcp_rst(c, conn); + return -1; + } tcp_data_from_sock(c, conn); } @@ -2530,7 +1837,8 @@ out: * @opts: Pointer to start of options * @optlen: Bytes in options: caller MUST ensure available length */ -static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, +static void 
tcp_conn_from_sock_finish(const struct ctx *c, + struct tcp_tap_conn *conn, const struct tcphdr *th, const char *opts, size_t optlen) { @@ -2548,12 +1856,86 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); + if (tcp_set_peek_offset(conn, 0)) { + tcp_rst(c, conn); + return; + } + + tcp_send_flag(c, conn, ACK); /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ tcp_data_from_sock(c, conn); - tcp_send_flag(c, conn, ACK); +} + +/** + * tcp_rst_no_conn() - Send RST in response to a packet with no connection + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @saddr: Source address of the packet we're responding to + * @daddr: Destination address of the packet we're responding to + * @flow_lbl: IPv6 flow label (ignored for IPv4) + * @th: TCP header of the packet we're responding to + * @l4len: Packet length, including TCP header + */ +static void tcp_rst_no_conn(const struct ctx *c, int af, + const void *saddr, const void *daddr, + uint32_t flow_lbl, + const struct tcphdr *th, size_t l4len) +{ + struct iov_tail payload = IOV_TAIL(NULL, 0, 0); + struct tcphdr *rsth; + char buf[USHRT_MAX]; + uint32_t psum = 0; + size_t rst_l2len; + + /* Don't respond to RSTs without a connection */ + if (th->rst) + return; + + if (af == AF_INET) { + struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); + const struct in_addr *rst_src = daddr; + const struct in_addr *rst_dst = saddr; + + rsth = tap_push_ip4h(ip4h, *rst_src, *rst_dst, + sizeof(*rsth), IPPROTO_TCP); + psum = proto_ipv4_header_psum(sizeof(*rsth), IPPROTO_TCP, + *rst_src, *rst_dst); + + } else { + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); + const struct in6_addr *rst_src = daddr; + const struct in6_addr *rst_dst = saddr; + + rsth = tap_push_ip6h(ip6h, rst_src, rst_dst, + sizeof(*rsth), IPPROTO_TCP, flow_lbl); + psum = 
proto_ipv6_header_psum(sizeof(*rsth), IPPROTO_TCP, + rst_src, rst_dst); + } + + memset(rsth, 0, sizeof(*rsth)); + + rsth->source = th->dest; + rsth->dest = th->source; + rsth->rst = 1; + rsth->doff = sizeof(*rsth) / 4UL; + + /* Sequence matching logic from RFC 9293 section 3.10.7.1 */ + if (th->ack) { + rsth->seq = th->ack_seq; + } else { + size_t dlen = l4len - th->doff * 4UL; + uint32_t ack = ntohl(th->seq) + dlen; + + rsth->ack_seq = htonl(ack); + rsth->ack = 1; + } + + tcp_update_csum(psum, rsth, &payload); + rst_l2len = ((char *)rsth - buf) + sizeof(*rsth); + tap_send_single(c, buf, rst_l2len); } /** @@ -2563,20 +1945,23 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address + * @flow_lbl: IPv6 flow label (ignored for IPv4) * @p: Pool of TCP packets, with TCP headers * @idx: Index of first packet in pool to process * @now: Current timestamp * * Return: count of consumed packets */ -int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, - const void *saddr, const void *daddr, +int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now) { struct tcp_tap_conn *conn; const struct tcphdr *th; size_t optlen, len; const char *opts; + union flow *flow; + flow_sidx_t sidx; int ack_due = 0; int count; @@ -2592,16 +1977,24 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); opts = packet_get(p, idx, sizeof(*th), optlen, NULL); - conn = tcp_hash_lookup(c, af, daddr, ntohs(th->source), ntohs(th->dest)); + sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr, + ntohs(th->source), ntohs(th->dest)); + flow = flow_at_sidx(sidx); /* New connection from tap */ - if (!conn) { + if (!flow) { if (opts && th->syn && !th->ack) 
tcp_conn_from_tap(c, af, saddr, daddr, th, opts, optlen, now); + else + tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, len); return 1; } + ASSERT(flow->f.type == FLOW_TCP); + ASSERT(pif_at_sidx(sidx) == PIF_TAP); + conn = &flow->tcp; + flow_trace(conn, "packet length %zu from tap", len); if (th->rst) { @@ -2624,10 +2017,15 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, /* Establishing connection from tap */ if (conn->events & TAP_SYN_RCVD) { + if (th->syn && !th->ack && !th->fin) + return 1; /* SYN retry: ignore and keep waiting */ + if (!(conn->events & TAP_SYN_ACK_SENT)) goto reset; conn_event(c, conn, ESTABLISHED); + if (tcp_set_peek_offset(conn, 0)) + goto reset; if (th->fin) { conn->seq_from_tap++; @@ -2652,7 +2050,10 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { + tcp_sock_consume(conn, ntohl(th->ack_seq)); tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + tcp_tap_window_update(conn, ntohs(th->window)); + tcp_data_from_sock(c, conn); if (conn->events & SOCK_FIN_RCVD && conn->seq_ack_from_tap == conn->seq_to_tap) @@ -2672,10 +2073,27 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, ack_due = 1; if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) { + socklen_t sl; + struct tcp_info tinfo; + shutdown(conn->sock, SHUT_WR); conn_event(c, conn, SOCK_FIN_SENT); tcp_send_flag(c, conn, ACK); ack_due = 0; + + /* If we received a FIN, but the socket is in TCP_ESTABLISHED + * state, it must be a migrated socket. The kernel saw the FIN + * on the source socket, but not on the target socket. + * + * Approximate the effect of that FIN: as we're sending a FIN + * out ourselves, the socket is now in a state equivalent to + * LAST_ACK. Now that we sent the FIN out, close it with a RST. 
+ */ + sl = sizeof(tinfo); + getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl); + if (tinfo.tcpi_state == TCP_ESTABLISHED && + conn->events & SOCK_FIN_RCVD) + goto reset; } if (ack_due) @@ -2697,7 +2115,7 @@ reset: * @c: Execution context * @conn: Connection pointer */ -static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn) { socklen_t sl; int so; @@ -2716,61 +2134,26 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) } /** - * tcp_snat_inbound() - Translate source address for inbound data if needed - * @c: Execution context - * @addr: Source address of inbound packet/connection - */ -static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr) -{ - struct in_addr *addr4 = inany_v4(addr); - - if (addr4) { - if (IN4_IS_ADDR_LOOPBACK(addr4) || - IN4_IS_ADDR_UNSPECIFIED(addr4) || - IN4_ARE_ADDR_EQUAL(addr4, &c->ip4.addr_seen)) - *addr4 = c->ip4.gw; - } else { - struct in6_addr *addr6 = &addr->a6; - - if (IN6_IS_ADDR_LOOPBACK(addr6) || - IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr_seen) || - IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr)) { - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - *addr6 = c->ip6.gw; - else - *addr6 = c->ip6.addr_ll; - } - } -} - -/** * tcp_tap_conn_from_sock() - Initialize state for non-spliced connection * @c: Execution context - * @dstport: Destination port for connection (host side) * @flow: flow to initialise * @s: Accepted socket * @sa: Peer socket address (from accept()) * @now: Current timestamp */ -static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, - union flow *flow, int s, - const union sockaddr_inany *sa, - const struct timespec *now) +static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, + int s, const struct timespec *now) { - struct tcp_tap_conn *conn = FLOW_START(flow, FLOW_TCP, tcp, SOCKSIDE); + struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + uint64_t 
hash; conn->sock = s; conn->timer = -1; conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, SOCK_ACCEPTED); - inany_from_sockaddr(&conn->faddr, &conn->fport, sa); - conn->eport = dstport + c->tcp.fwd_in.delta[dstport]; - - tcp_snat_inbound(c, &conn->faddr); - - tcp_seq_init(c, conn, now); - tcp_hash_insert(c, conn); + hash = flow_hash_insert(c, TAP_SIDX(conn)); + conn->seq_to_tap = tcp_init_seq(hash, now); conn->seq_ack_from_tap = conn->seq_to_tap; @@ -2780,6 +2163,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, conn_flag(c, conn, ACK_FROM_TAP_DUE); tcp_get_sndbuf(conn); + + FLOW_ACTIVATE(conn); } /** @@ -2788,53 +2173,71 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, * @ref: epoll reference of listening socket * @now: Current timestamp */ -void tcp_listen_handler(struct ctx *c, union epoll_ref ref, +void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { + struct tcp_tap_conn *conn; union sockaddr_inany sa; socklen_t sl = sizeof(sa); + struct flowside *ini; union flow *flow; int s; - if (c->no_tcp || !(flow = flow_alloc())) + ASSERT(!c->no_tcp); + + if (!(flow = flow_alloc())) return; s = accept4(ref.fd, &sa.sa, &sl, SOCK_NONBLOCK); if (s < 0) goto cancel; - if (sa.sa_family == AF_INET) { - const struct in_addr *addr = &sa.sa4.sin_addr; - in_port_t port = sa.sa4.sin_port; + conn = (struct tcp_tap_conn *)flow; + conn->listening_sock = ref.fd; - if (IN4_IS_ADDR_UNSPECIFIED(addr) || - IN4_IS_ADDR_BROADCAST(addr) || - IN4_IS_ADDR_MULTICAST(addr) || port == 0) { - char str[INET_ADDRSTRLEN]; + tcp_sock_set_nodelay(s); - err("Invalid endpoint from TCP accept(): %s:%hu", - inet_ntop(AF_INET, addr, str, sizeof(str)), port); - goto cancel; - } - } else if (sa.sa_family == AF_INET6) { - const struct in6_addr *addr = &sa.sa6.sin6_addr; - in_port_t port = sa.sa6.sin6_port; + /* FIXME: If useful: when the listening port has a specific bound + * address, record that as our 
address, as implemented for vhost-user + * mode only, below. + */ + ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, + NULL, ref.tcp_listen.port); - if (IN6_IS_ADDR_UNSPECIFIED(addr) || - IN6_IS_ADDR_MULTICAST(addr) || port == 0) { - char str[INET6_ADDRSTRLEN]; + if (c->mode == MODE_VU) { /* Rebind to same address after migration */ + if (getsockname(s, &sa.sa, &sl) || + inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0) + err_perror("Can't get local address for socket %i", s); + } - err("Invalid endpoint from TCP accept(): %s:%hu", - inet_ntop(AF_INET6, addr, str, sizeof(str)), port); - goto cancel; - } + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { + char sastr[SOCKADDR_STRLEN]; + + err("Invalid endpoint from TCP accept(): %s", + sockaddr_ntop(&sa, sastr, sizeof(sastr))); + goto cancel; } - if (tcp_splice_conn_from_sock(c, ref.tcp_listen.pif, - ref.tcp_listen.port, flow, s, &sa)) - return; + if (!flow_target(c, flow, IPPROTO_TCP)) + goto cancel; + + switch (flow->f.pif[TGTSIDE]) { + case PIF_SPLICE: + case PIF_HOST: + tcp_splice_conn_from_sock(c, flow, s); + break; + + case PIF_TAP: + tcp_tap_conn_from_sock(c, flow, s, now); + break; + + default: + flow_err(flow, "No support for forwarding TCP from %s to %s", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[TGTSIDE])); + goto cancel; + } - tcp_tap_conn_from_sock(c, ref.tcp_listen.port, flow, s, &sa, now); return; cancel: @@ -2846,21 +2249,23 @@ cancel: * @c: Execution context * @ref: epoll reference of timer (not connection) * - * #syscalls timerfd_gettime + * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 */ -void tcp_timer_handler(struct ctx *c, union epoll_ref ref) +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) { struct itimerspec check_armed = { { 0 }, { 0 } }; - struct tcp_tap_conn *conn = CONN(ref.flow); + struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp; - if (c->no_tcp) - return; + ASSERT(!c->no_tcp); + ASSERT(conn->f.type == 
FLOW_TCP); /* We don't reset timers on ~ACK_FROM_TAP_DUE, ~ACK_TO_TAP_DUE. If the * timer is currently armed, this event came from a previous setting, * and we just set the timer to a new point in the future: discard it. */ - timerfd_gettime(conn->timer, &check_armed); + if (timerfd_gettime(conn->timer, &check_armed)) + flow_perror(conn, "failed to read timer"); + if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) return; @@ -2881,8 +2286,14 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn); - tcp_timer_ctl(c, conn); + if (!conn->wnd_from_tap) + conn->wnd_from_tap = 1; /* Zero-window probe */ + if (tcp_set_peek_offset(conn, 0)) { + tcp_rst(c, conn); + } else { + tcp_data_from_sock(c, conn); + tcp_timer_ctl(c, conn); + } } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -2894,7 +2305,9 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) * case. This avoids having to preemptively reset the timer on * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. 
*/ - timerfd_settime(conn->timer, 0, &new, &old); + if (timerfd_settime(conn->timer, 0, &new, &old)) + flow_perror(conn, "failed to set timer"); + if (old.it_value.tv_sec == ACT_TIMEOUT) { flow_dbg(conn, "activity timeout"); tcp_rst(c, conn); @@ -2908,12 +2321,13 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) * @ref: epoll reference * @events: epoll events bitmap */ -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) +void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events) { - struct tcp_tap_conn *conn = CONN(ref.flowside.flow); + struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside); - ASSERT(conn->f.type == FLOW_TCP); - ASSERT(ref.flowside.side == SOCKSIDE); + ASSERT(!c->no_tcp); + ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP); if (conn->events == CLOSED) return; @@ -2938,8 +2352,10 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) if (events & EPOLLIN) tcp_data_from_sock(c, conn); - if (events & EPOLLOUT) - tcp_update_seqack_wnd(c, conn, 0, NULL); + if (events & EPOLLOUT) { + if (tcp_update_seqack_wnd(c, conn, false, NULL)) + tcp_send_flag(c, conn, ACK); + } return; } @@ -2962,17 +2378,16 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) } /** - * tcp_sock_init_af() - Initialise listening socket for a given af and port + * tcp_sock_init_one() - Initialise listening socket for address and port * @c: Execution context - * @af: Address family to listen on - * @port: Port, host order - * @addr: Pointer to address for binding, NULL if not configured + * @addr: Pointer to address for binding, NULL for dual stack any * @ifname: Name of interface to bind to, NULL if not configured + * @port: Port, host order * * Return: fd for the new listening socket, negative error code on failure */ -static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, - const void *addr, const char *ifname) +static int tcp_sock_init_one(const struct 
ctx *c, const union inany_addr *addr, + const char *ifname, in_port_t port) { union tcp_listen_epoll_ref tref = { .port = port, @@ -2980,48 +2395,51 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, }; int s; - s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32); + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr, + ifname, port, tref.u32); if (c->tcp.fwd_in.mode == FWD_AUTO) { - if (af == AF_INET || af == AF_UNSPEC) + if (!addr || inany_v4(addr)) tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s; - if (af == AF_INET6 || af == AF_UNSPEC) + if (!addr || !inany_v4(addr)) tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s; } if (s < 0) return s; - tcp_sock_set_bufsize(c, s); return s; } /** * tcp_sock_init() - Create listening sockets for a given host ("inbound") port * @c: Execution context - * @af: Address family to select a specific IP version, or AF_UNSPEC * @addr: Pointer to address for binding, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured * @port: Port, host order * * Return: 0 on (partial) success, negative error code on (complete) failure */ -int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, +int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, const char *ifname, in_port_t port) { int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; - if (af == AF_UNSPEC && c->ifi4 && c->ifi6) + ASSERT(!c->no_tcp); + + if (!addr && c->ifi4 && c->ifi6) /* Attempt to get a dual stack socket */ - if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0) + if (tcp_sock_init_one(c, NULL, ifname, port) >= 0) return 0; /* Otherwise create a socket per IP version */ - if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) - r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname); + if ((!addr || inany_v4(addr)) && c->ifi4) + r4 = tcp_sock_init_one(c, addr ? 
addr : &inany_any4, + ifname, port); - if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) - r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname); + if ((!addr || !inany_v4(addr)) && c->ifi6) + r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6, + ifname, port); if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) return 0; @@ -3044,11 +2462,9 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) ASSERT(c->mode == MODE_PASTA); - s = sock_l4(c, AF_INET, IPPROTO_TCP, &in4addr_loopback, NULL, port, - tref.u32); - if (s >= 0) - tcp_sock_set_bufsize(c, s); - else + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4, + NULL, port, tref.u32); + if (s < 0) s = -1; if (c->tcp.fwd_out.mode == FWD_AUTO) @@ -3070,11 +2486,9 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) ASSERT(c->mode == MODE_PASTA); - s = sock_l4(c, AF_INET6, IPPROTO_TCP, &in6addr_loopback, NULL, port, - tref.u32); - if (s >= 0) - tcp_sock_set_bufsize(c, s); - else + s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6, + NULL, port, tref.u32); + if (s < 0) s = -1; if (c->tcp.fwd_out.mode == FWD_AUTO) @@ -3086,8 +2500,10 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) * @c: Execution context * @port: Port, host order */ -void tcp_ns_sock_init(const struct ctx *c, in_port_t port) +static void tcp_ns_sock_init(const struct ctx *c, in_port_t port) { + ASSERT(!c->no_tcp); + if (c->ifi4) tcp_ns_sock_init4(c, port); if (c->ifi6) @@ -3100,6 +2516,7 @@ void tcp_ns_sock_init(const struct ctx *c, in_port_t port) * * Return: 0 */ +/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ static int tcp_ns_socks_init(void *arg) { const struct ctx *c = (const struct ctx *)arg; @@ -3119,13 +2536,12 @@ static int tcp_ns_socks_init(void *arg) /** * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets - * @c: Execution context * @pool: Pool of sockets to refill * @af: Address family to 
use * * Return: 0 on success, negative error code if there was at least one error */ -int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) +int tcp_sock_refill_pool(int pool[], sa_family_t af) { int i; @@ -3135,7 +2551,7 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) if (pool[i] >= 0) continue; - if ((fd = tcp_conn_new_sock(c, af)) < 0) + if ((fd = tcp_conn_new_sock(af)) < 0) return fd; pool[i] = fd; @@ -3151,20 +2567,71 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) static void tcp_sock_refill_init(const struct ctx *c) { if (c->ifi4) { - int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET); + int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 host socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } if (c->ifi6) { - int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6); + int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 host socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } } /** + * tcp_probe_peek_offset_cap() - Check if SO_PEEK_OFF is supported by kernel + * @af: Address family, IPv4 or IPv6 + * + * Return: true if supported, false otherwise + */ +static bool tcp_probe_peek_offset_cap(sa_family_t af) +{ + bool ret = false; + int s, optv = 0; + + s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn_perror("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + ret = true; + close(s); + } + + return ret; +} + +/** + * tcp_probe_tcp_info() - Check what data TCP_INFO reports + * + * Return: Number of bytes returned by TCP_INFO getsockopt() + */ +static socklen_t tcp_probe_tcp_info(void) +{ + struct tcp_info_linux tinfo; + socklen_t sl = sizeof(tinfo); + int s; + + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn_perror("Temporary TCP 
socket creation failed"); + return false; + } + + if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { + warn_perror("Failed to get TCP_INFO on temporary socket"); + close(s); + return false; + } + + close(s); + + return sl; +} + +/** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context * @@ -3172,16 +2639,9 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned b; + ASSERT(!c->no_tcp); - for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) - tc_hash[b] = FLOW_SIDX_NONE; - - if (c->ifi4) - tcp_sock4_iov_init(c); - - if (c->ifi6) - tcp_sock6_iov_init(c); + tcp_sock_iov_init(c); memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); @@ -3196,6 +2656,19 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) && + (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); + debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); + + tcp_info_size = tcp_probe_tcp_info(); + +#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \ + STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ") + dbg_tcpi(snd_wnd); + dbg_tcpi(bytes_acked); + dbg_tcpi(min_rtt); +#undef dbg_tcpi + return 0; } @@ -3237,7 +2710,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound) if (outbound) tcp_ns_sock_init(c, port); else - tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port); + tcp_sock_init(c, NULL, NULL, port); } } } @@ -3285,3 +2758,980 @@ void tcp_timer(struct ctx *c, const struct timespec *now) if (c->mode == MODE_PASTA) tcp_splice_refill(c); } + +/** + * tcp_flow_is_established() - Was the connection established? 
Includes closing + * @conn: Pointer to the TCP connection structure + * + * Return: true if the connection was established, false otherwise + */ +bool tcp_flow_is_established(const struct tcp_tap_conn *conn) +{ + return conn->events & ESTABLISHED; +} + +/** + * tcp_flow_repair_on() - Enable repair mode for a single TCP flow + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn) +{ + int rc = 0; + + if (conn->sock < 0) + return 0; + + if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON))) + err("Failed to set TCP_REPAIR"); + + return rc; +} + +/** + * tcp_flow_repair_off() - Clear repair mode for a single TCP flow + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn) +{ + int rc = 0; + + if (conn->sock < 0) + return 0; + + if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF))) + err("Failed to clear TCP_REPAIR"); + + return rc; +} + +/** + * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) +{ + struct tcp_info tinfo; + socklen_t sl; + + sl = sizeof(tinfo); + if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) { + int rc = -errno; + flow_perror(conn, "Querying TCP_INFO"); + return rc; + } + + t->snd_ws = tinfo.tcpi_snd_wscale; + t->rcv_ws = tinfo.tcpi_rcv_wscale; + t->tcpi_state = tinfo.tcpi_state; + t->tcpi_options = tinfo.tcpi_options; + + return 0; +} + +/** + * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG + * @conn: Pointer to the TCP connection structure 
+ * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) +{ + socklen_t sl = sizeof(t->mss); + int val; + + if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) { + int rc = -errno; + flow_perror(conn, "Getting MSS"); + return rc; + } + + t->mss = (uint32_t)val; + + return 0; +} + + +/** + * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data (tcpi_options must be populated) + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) +{ + int val = 0; + + if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) { + socklen_t sl = sizeof(val); + + if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) { + int rc = -errno; + flow_perror(conn, "Getting RFC 7323 timestamp"); + return rc; + } + } + + t->timestamp = (uint32_t)val; + return 0; +} + +/** + * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn, + const struct tcp_tap_transfer_ext *t) +{ + int val = (int)t->timestamp; + + if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) { + if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, + &val, sizeof(val))) { + int rc = -errno; + flow_perror(conn, "Setting RFC 7323 timestamp"); + return rc; + } + } + + return 0; +} + +/** + * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn, 
+ struct tcp_tap_transfer_ext *t) +{ + struct tcp_repair_window wnd; + socklen_t sl = sizeof(wnd); + + if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) { + int rc = -errno; + flow_perror(conn, "Getting window repair data"); + return rc; + } + + t->snd_wl1 = wnd.snd_wl1; + t->snd_wnd = wnd.snd_wnd; + t->max_window = wnd.max_window; + t->rcv_wnd = wnd.rcv_wnd; + t->rcv_wup = wnd.rcv_wup; + + /* If we received a FIN, we also need to adjust window parameters. + * + * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state. + */ + if (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK) { + t->rcv_wup--; + t->rcv_wnd++; + } + + return 0; +} + +/** + * tcp_flow_repair_wnd() - Restore window parameters from extended data + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn, + const struct tcp_tap_transfer_ext *t) +{ + struct tcp_repair_window wnd; + + wnd.snd_wl1 = t->snd_wl1; + wnd.snd_wnd = t->snd_wnd; + wnd.max_window = t->max_window; + wnd.rcv_wnd = t->rcv_wnd; + wnd.rcv_wup = t->rcv_wup; + + if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, + &wnd, sizeof(wnd))) { + int rc = -errno; + flow_perror(conn, "Setting window data"); + return rc; + } + + return 0; +} + +/** + * tcp_flow_select_queue() - Select queue (receive or send) for next operation + * @conn: Connection to select queue for + * @queue: TCP_RECV_QUEUE or TCP_SEND_QUEUE + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue) +{ + if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE, + &queue, sizeof(queue))) { + int rc = -errno; + flow_perror(conn, "Selecting TCP_SEND_QUEUE"); + return rc; + } + + return 0; +} + +/** + * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data + * @conn: 
Connection to dump queue for + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + * + * #syscalls:vu ioctl + */ +static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) +{ + int s = conn->sock; + ssize_t rc; + + if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) { + rc = -errno; + flow_perror(conn, "Getting send queue size"); + return rc; + } + + if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) { + rc = -errno; + flow_perror(conn, "Getting not sent count"); + return rc; + } + + /* If we sent a FIN, SIOCOUTQ and SIOCOUTQNSD are one greater than the + * actual pending queue length, because they are based on the sequence + * numbers, not directly on the buffer contents. + * + * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state. + */ + if (t->tcpi_state == TCP_FIN_WAIT1 || t->tcpi_state == TCP_FIN_WAIT2 || + t->tcpi_state == TCP_LAST_ACK || t->tcpi_state == TCP_CLOSING) { + if (t->sndq) + t->sndq--; + if (t->notsent) + t->notsent--; + } + + if (t->notsent > t->sndq) { + flow_err(conn, + "Invalid notsent count socket %i, send: %u, not sent: %u", + s, t->sndq, t->notsent); + return -EINVAL; + } + + if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) { + flow_err(conn, + "Send queue too large to migrate socket %i: %u bytes", + s, t->sndq); + return -ENOBUFS; + } + + rc = recv(s, tcp_migrate_snd_queue, + MIN(t->sndq, TCP_MIGRATE_SND_QUEUE_MAX), MSG_PEEK); + if (rc < 0) { + if (errno == EAGAIN) { /* EAGAIN means empty */ + rc = 0; + } else { + rc = -errno; + flow_perror(conn, "Can't read send queue"); + return rc; + } + } + + if ((uint32_t)rc < t->sndq) { + flow_err(conn, "Short read migrating send queue"); + return -ENXIO; + } + + t->notsent = MIN(t->notsent, t->sndq); + + return 0; +} + +/** + * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue + * @conn: Connection to repair queue for + * @len: Length of data to be restored + * @buf: Buffer with content of pending data 
queue + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn, + size_t len, uint8_t *buf) +{ + size_t chunk = len; + uint8_t *p = buf; + + while (len > 0) { + ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0); + + if (rc < 0) { + if ((errno == ENOBUFS || errno == ENOMEM) && + chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) { + chunk /= 2; + continue; + } + + rc = -errno; + flow_perror(conn, "Can't write queue"); + return rc; + } + + len -= rc; + p += rc; + } + + return 0; +} + +/** + * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue + * @conn: Pointer to the TCP connection structure + * @v: Sequence value, set on return + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v) +{ + socklen_t sl = sizeof(*v); + + if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) { + int rc = -errno; + flow_perror(conn, "Dumping sequence"); + return rc; + } + + return 0; +} + +/** + * tcp_flow_repair_seq() - Restore sequence for pre-selected queue + * @conn: Connection to repair sequences for + * @v: Sequence value to be set + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn, + const uint32_t *v) +{ + if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) { + int rc = -errno; + flow_perror(conn, "Setting sequence"); + return rc; + } + + return 0; +} + +/** + * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + * + * #syscalls:vu ioctl + */ +static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn, + struct tcp_tap_transfer_ext *t) +{ + int s = conn->sock; + ssize_t rc; + + if (ioctl(s, SIOCINQ, &t->rcvq) < 0) { + rc = -errno; + 
err_perror("Get receive queue size, socket %i", s); + return rc; + } + + /* If we received a FIN, SIOCINQ is one greater than the actual number + * of bytes on the queue, because it's based on the sequence number + * rather than directly on the buffer contents. + * + * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state. + */ + if (t->rcvq && + (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK)) + t->rcvq--; + + if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) { + flow_err(conn, + "Receive queue too large to migrate socket: %u bytes", + t->rcvq); + return -ENOBUFS; + } + + rc = recv(s, tcp_migrate_rcv_queue, t->rcvq, MSG_PEEK); + if (rc < 0) { + if (errno == EAGAIN) { /* EAGAIN means empty */ + rc = 0; + } else { + rc = -errno; + flow_perror(conn, "Can't read receive queue"); + return rc; + } + } + + if ((uint32_t)rc < t->rcvq) { + flow_err(conn, "Short read migrating receive queue"); + return -ENXIO; + } + + return 0; +} + +/** + * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps) + * @conn: Pointer to the TCP connection structure + * @t: Extended migration data + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn, + const struct tcp_tap_transfer_ext *t) +{ + const struct tcp_repair_opt opts[] = { + { TCPOPT_WINDOW, t->snd_ws + (t->rcv_ws << 16) }, + { TCPOPT_MAXSEG, t->mss }, + { TCPOPT_SACK_PERMITTED, 0 }, + { TCPOPT_TIMESTAMP, 0 }, + }; + socklen_t sl; + + sl = sizeof(opts[0]) * (2 + + !!(t->tcpi_options & TCPI_OPT_SACK) + + !!(t->tcpi_options & TCPI_OPT_TIMESTAMPS)); + + if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) { + int rc = -errno; + flow_perror(conn, "Setting repair options"); + return rc; + } + + return 0; +} + +/** + * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening + * @fd: Descriptor for state migration + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on 
success, negative error code on failure + */ +int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) +{ + struct tcp_tap_transfer t = { + .retrans = conn->retrans, + .ws_from_tap = conn->ws_from_tap, + .ws_to_tap = conn->ws_to_tap, + .events = conn->events, + + .tap_mss = htonl(MSS_GET(conn)), + + .sndbuf = htonl(conn->sndbuf), + + .flags = conn->flags, + .seq_dup_ack_approx = conn->seq_dup_ack_approx, + + .wnd_from_tap = htons(conn->wnd_from_tap), + .wnd_to_tap = htons(conn->wnd_to_tap), + + .seq_to_tap = htonl(conn->seq_to_tap), + .seq_ack_from_tap = htonl(conn->seq_ack_from_tap), + .seq_from_tap = htonl(conn->seq_from_tap), + .seq_ack_to_tap = htonl(conn->seq_ack_to_tap), + .seq_init_from_tap = htonl(conn->seq_init_from_tap), + }; + + memcpy(&t.pif, conn->f.pif, sizeof(t.pif)); + memcpy(&t.side, conn->f.side, sizeof(t.side)); + + if (write_all_buf(fd, &t, sizeof(t))) { + int rc = -errno; + err_perror("Can't write migration data, socket %i", conn->sock); + return rc; + } + + if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD)) + close(conn->listening_sock); + + return 0; +} + +/** + * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data + * @fd: Descriptor for state migration + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure + */ +int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) +{ + uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; + struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)]; + int s = conn->sock; + int rc; + + /* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode + * weird. 
+ */ + if (tcp_set_peek_offset(conn, -1)) { + rc = -errno; + goto fail; + } + + if ((rc = tcp_flow_dump_tinfo(conn, t))) + goto fail; + + if ((rc = tcp_flow_dump_mss(conn, t))) + goto fail; + + if ((rc = tcp_flow_dump_timestamp(conn, t))) + goto fail; + + if ((rc = tcp_flow_dump_wnd(conn, t))) + goto fail; + + if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE))) + goto fail; + + if ((rc = tcp_flow_dump_sndqueue(conn, t))) + goto fail; + + if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd))) + goto fail; + + if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE))) + goto fail; + + if ((rc = tcp_flow_dump_rcvqueue(conn, t))) + goto fail; + + if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv))) + goto fail; + + close(s); + + /* Adjustments unrelated to FIN segments: sequence numbers we dumped are + * based on the end of the queues. + */ + t->seq_rcv -= t->rcvq; + t->seq_snd -= t->sndq; + + flow_dbg(conn, "Extended migration data, socket %i sequences send %u receive %u", + s, t->seq_snd, t->seq_rcv); + flow_dbg(conn, " pending queues: send %u not sent %u receive %u", + t->sndq, t->notsent, t->rcvq); + flow_dbg(conn, " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup); + flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? 
"enabled" : "disabled", peek_offset); + + /* Endianness fix-ups */ + t->seq_snd = htonl(t->seq_snd); + t->seq_rcv = htonl(t->seq_rcv); + t->sndq = htonl(t->sndq); + t->notsent = htonl(t->notsent); + t->rcvq = htonl(t->rcvq); + t->mss = htonl(t->mss); + t->timestamp = htonl(t->timestamp); + + t->snd_wl1 = htonl(t->snd_wl1); + t->snd_wnd = htonl(t->snd_wnd); + t->max_window = htonl(t->max_window); + t->rcv_wnd = htonl(t->rcv_wnd); + t->rcv_wup = htonl(t->rcv_wup); + + if (write_all_buf(fd, t, sizeof(*t))) { + flow_perror(conn, "Failed to write extended data"); + return -EIO; + } + + if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) { + flow_perror(conn, "Failed to write send queue data"); + return -EIO; + } + + if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) { + flow_perror(conn, "Failed to write receive queue data"); + return -EIO; + } + + return 0; + +fail: + /* For any type of failure dumping data, write an invalid extended data + * descriptor that allows us to keep the stream in sync, but tells the + * target to skip the flow. If we fail to transfer data, that's fatal: + * return -EIO in that case (and only in that case). + */ + t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */ + + if (write_all_buf(fd, t, sizeof(*t))) { + flow_perror(conn, "Failed to write extended data"); + return -EIO; + } + + if (rc == -EIO) /* but not a migration data transfer failure */ + return -ENODATA; + + return rc; +} + +/** + * tcp_flow_repair_socket() - Open and bind socket, request repair mode + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn) +{ + sa_family_t af = CONN_V4(conn) ? 
AF_INET : AF_INET6; + int s, rc; + + if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, + IPPROTO_TCP)) < 0) { + rc = -errno; + flow_perror(conn, "Failed to create socket for migrated flow"); + return rc; + } + s = conn->sock; + + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int))) + flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i", + s); + + tcp_sock_set_nodelay(s); + + if ((rc = tcp_flow_repair_on(c, conn))) + goto err; + + return 0; + +err: + close(s); + conn->sock = -1; + return rc; +} + +/** + * tcp_flow_repair_bind() - Bind socket in repair mode + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn) +{ + const struct flowside *sockside = HOSTFLOW(conn); + union sockaddr_inany a; + socklen_t sl; + + pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport); + + if (bind(conn->sock, &a.sa, sizeof(a))) { + int rc = -errno; + flow_perror(conn, "Failed to bind socket for migrated flow"); + return rc; + } + + return 0; +} + +/** + * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off + * @c: Execution context + * @conn: Pointer to the TCP connection structure + * + * Return: 0 on success, negative error code on failure + */ +static int tcp_flow_repair_connect(const struct ctx *c, + struct tcp_tap_conn *conn) +{ + const struct flowside *tgt = HOSTFLOW(conn); + int rc; + + rc = flowside_connect(c, conn->sock, PIF_HOST, tgt); + if (rc) { + rc = -errno; + flow_perror(conn, "Failed to connect migrated socket"); + return rc; + } + + conn->in_epoll = 0; + conn->timer = -1; + conn->listening_sock = -1; + + return 0; +} + +/** + * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert + * @c: Execution context + * @fd: Descriptor for state migration + * + * Return: 0 on success, negative on fatal 
failure, but 0 on single flow failure + */ +int tcp_flow_migrate_target(struct ctx *c, int fd) +{ + struct tcp_tap_transfer t; + struct tcp_tap_conn *conn; + union flow *flow; + int rc; + + if (!(flow = flow_alloc())) { + err("Flow table full on migration target"); + return 0; + } + + if (read_all_buf(fd, &t, sizeof(t))) { + flow_perror(flow, "Failed to receive migration data"); + flow_alloc_cancel(flow); + return -errno; + } + + flow->f.state = FLOW_STATE_TGT; + memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif)); + memcpy(&flow->f.side, &t.side, sizeof(flow->f.side)); + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + + conn->retrans = t.retrans; + conn->ws_from_tap = t.ws_from_tap; + conn->ws_to_tap = t.ws_to_tap; + conn->events = t.events; + + conn->sndbuf = htonl(t.sndbuf); + + conn->flags = t.flags; + conn->seq_dup_ack_approx = t.seq_dup_ack_approx; + + MSS_SET(conn, ntohl(t.tap_mss)); + + conn->wnd_from_tap = ntohs(t.wnd_from_tap); + conn->wnd_to_tap = ntohs(t.wnd_to_tap); + + conn->seq_to_tap = ntohl(t.seq_to_tap); + conn->seq_ack_from_tap = ntohl(t.seq_ack_from_tap); + conn->seq_from_tap = ntohl(t.seq_from_tap); + conn->seq_ack_to_tap = ntohl(t.seq_ack_to_tap); + conn->seq_init_from_tap = ntohl(t.seq_init_from_tap); + + if ((rc = tcp_flow_repair_socket(c, conn))) { + flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc)); + /* Can't leave the flow in an incomplete state */ + FLOW_ACTIVATE(conn); + return 0; + } + + flow_hash_insert(c, TAP_SIDX(conn)); + FLOW_ACTIVATE(conn); + + return 0; +} + +/** + * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect + * @c: Execution context + * @conn: Connection entry to complete with extra data + * @fd: Descriptor for state migration + * + * Return: 0 on success, negative on fatal failure, but 0 on single flow failure + */ +int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd) +{ + uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; + struct 
tcp_tap_transfer_ext t; + int s = conn->sock, rc; + + if (read_all_buf(fd, &t, sizeof(t))) { + rc = -errno; + flow_perror(conn, "Failed to read extended data"); + return rc; + } + + if (!t.tcpi_state) { /* Source wants us to skip this flow */ + flow_err(conn, "Dropping as requested by source"); + goto fail; + } + + /* Endianness fix-ups */ + t.seq_snd = ntohl(t.seq_snd); + t.seq_rcv = ntohl(t.seq_rcv); + t.sndq = ntohl(t.sndq); + t.notsent = ntohl(t.notsent); + t.rcvq = ntohl(t.rcvq); + t.mss = ntohl(t.mss); + t.timestamp = ntohl(t.timestamp); + + t.snd_wl1 = ntohl(t.snd_wl1); + t.snd_wnd = ntohl(t.snd_wnd); + t.max_window = ntohl(t.max_window); + t.rcv_wnd = ntohl(t.rcv_wnd); + t.rcv_wup = ntohl(t.rcv_wup); + + flow_dbg(conn, + "Extended migration data, socket %i sequences send %u receive %u", + s, t.seq_snd, t.seq_rcv); + flow_dbg(conn, " pending queues: send %u not sent %u receive %u", + t.sndq, t.notsent, t.rcvq); + flow_dbg(conn, + " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u", + t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup); + flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32, + peek_offset_cap ? 
"enabled" : "disabled", peek_offset); + + if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq || + t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) { + flow_err(conn, + "Bad queues socket %i, send: %u, not sent: %u, receive: %u", + s, t.sndq, t.notsent, t.rcvq); + return -EINVAL; + } + + if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) { + rc = -errno; + flow_perror(conn, "Failed to read send queue data"); + return rc; + } + + if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) { + rc = -errno; + flow_perror(conn, "Failed to read receive queue data"); + return rc; + } + + if (conn->sock < 0) + /* We weren't able to create the socket, discard flow */ + goto fail; + + if (tcp_flow_repair_bind(c, conn)) + goto fail; + + if (tcp_flow_repair_timestamp(conn, &t)) + goto fail; + + if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE)) + goto fail; + + if (tcp_flow_repair_seq(conn, &t.seq_snd)) + goto fail; + + if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE)) + goto fail; + + if (tcp_flow_repair_seq(conn, &t.seq_rcv)) + goto fail; + + if (tcp_flow_repair_connect(c, conn)) + goto fail; + + if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue)) + goto fail; + + if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE)) + goto fail; + + if (tcp_flow_repair_queue(conn, t.sndq - t.notsent, + tcp_migrate_snd_queue)) + goto fail; + + if (tcp_flow_repair_opt(conn, &t)) + goto fail; + + /* If we sent a FIN sent and it was acknowledged (TCP_FIN_WAIT2), don't + * send it out, because we already sent it for sure. + * + * Call shutdown(x, SHUT_WR) in repair mode, so that we move to + * FIN_WAIT_1 (tcp_shutdown()) without sending anything + * (goto in tcp_write_xmit()). 
+ */ + if (t.tcpi_state == TCP_FIN_WAIT2) { + int v; + + v = TCP_SEND_QUEUE; + if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) + flow_perror(conn, "Selecting repair queue"); + else + shutdown(s, SHUT_WR); + } + + if (tcp_flow_repair_wnd(conn, &t)) + goto fail; + + tcp_flow_repair_off(c, conn); + repair_flush(c); + + if (t.notsent) { + if (tcp_flow_repair_queue(conn, t.notsent, + tcp_migrate_snd_queue + + (t.sndq - t.notsent))) { + /* This sometimes seems to fail for unclear reasons. + * Don't fail the whole migration, just reset the flow + * and carry on to the next one. + */ + goto fail; + } + } + + /* If we sent a FIN but it wasn't acknowledged yet (TCP_FIN_WAIT1), send + * it out, because we don't know if we already sent it. + * + * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to + * TCP_FIN_WAIT1. + */ + if (t.tcpi_state == TCP_FIN_WAIT1) + shutdown(s, SHUT_WR); + + if (tcp_set_peek_offset(conn, peek_offset)) + goto fail; + + tcp_send_flag(c, conn, ACK); + tcp_data_from_sock(c, conn); + + if ((rc = tcp_epoll_ctl(c, conn))) { + flow_dbg(conn, + "Failed to subscribe to epoll for migrated socket: %s", + strerror_(-rc)); + goto fail; + } + + return 0; + +fail: + if (conn->sock >= 0) { + tcp_flow_repair_off(c, conn); + repair_flush(c); + } + + conn->flags = 0; /* Not waiting for ACK, don't schedule timer */ + tcp_rst(c, conn); + + return 0; +} @@ -10,14 +10,15 @@ struct ctx; -void tcp_timer_handler(struct ctx *c, union epoll_ref ref); -void tcp_listen_handler(struct ctx *c, union epoll_ref ref, +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref); +void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now); -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); -int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, - const void *saddr, const void *daddr, +void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events); +int 
tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now); -int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, +int tcp_sock_init(const struct ctx *c, const union inany_addr *addr, const char *ifname, in_port_t port); int tcp_init(struct ctx *c); void tcp_timer(struct ctx *c, const struct timespec *now); @@ -25,6 +26,8 @@ void tcp_defer_handler(struct ctx *c); void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); +extern bool peek_offset_cap; + /** * union tcp_epoll_ref - epoll reference portion for TCP connections * @index: Index of connection in table @@ -55,16 +58,12 @@ union tcp_listen_epoll_ref { * @fwd_in: Port forwarding configuration for inbound packets * @fwd_out: Port forwarding configuration for outbound packets * @timer_run: Timestamp of most recent timer run - * @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035) * @pipe_size: Size of pipes for spliced connections */ struct tcp_ctx { struct fwd_ports fwd_in; struct fwd_ports fwd_out; struct timespec timer_run; -#ifdef HAS_SND_WND - int kernel_snd_wnd; -#endif size_t pipe_size; }; diff --git a/tcp_buf.c b/tcp_buf.c new file mode 100644 index 0000000..d1fca67 --- /dev/null +++ b/tcp_buf.c @@ -0,0 +1,421 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * tcp_buf.c - TCP L2 buffer management functions + * + * Copyright Red Hat + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <stddef.h> +#include <stdint.h> +#include <limits.h> +#include <string.h> +#include <errno.h> + +#include <netinet/ip.h> + +#include <netinet/tcp.h> + +#include "util.h" +#include "ip.h" +#include "iov.h" +#include "passt.h" +#include "tap.h" +#include 
"siphash.h" +#include "inany.h" +#include "tcp_conn.h" +#include "tcp_internal.h" +#include "tcp_buf.h" + +#define TCP_FRAMES_MEM 128 +#define TCP_FRAMES \ + (c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM) + +/* Static buffers */ + +/* Ethernet header for IPv4 and IPv6 frames */ +static struct ethhdr tcp4_eth_src; +static struct ethhdr tcp6_eth_src; + +static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM]; + +/* IP headers for IPv4 and IPv6 */ +struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; +struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; + +/* TCP segments with payload for IPv4 and IPv6 frames */ +static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM]; + +static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516"); +static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516"); + +/* References tracking the owner connection of frames in the tap outqueue */ +static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM]; +static unsigned int tcp_payload_used; + +/* recvmsg()/sendmsg() data for tap */ +static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; + +static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS]; + +/** + * tcp_update_l2_buf() - Update Ethernet header buffers with addresses + * @eth_d: Ethernet destination address, NULL if unchanged + * @eth_s: Ethernet source address, NULL if unchanged + */ +void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) +{ + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); +} + +/** + * tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets + * @c: Execution context + */ +void tcp_sock_iov_init(const struct ctx *c) +{ + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); + struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); + int i; + + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + + for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) { + 
tcp6_payload_ip[i] = ip6; + tcp4_payload_ip[i] = iph; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov = tcp_l2_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]); + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i]; + } +} + +/** + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission + * @c: Execution context + * @conns: Array of connection pointers corresponding to queued frames + * @frames: Two-dimensional array containing queued frames with sub-iovs + * @num_frames: Number of entries in the two arrays to be compared + */ +static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, + struct iovec (*frames)[TCP_NUM_IOVS], int num_frames) +{ + int i; + + for (i = 0; i < num_frames; i++) { + const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base; + struct tcp_tap_conn *conn = conns[i]; + uint32_t seq = ntohl(th->seq); + uint32_t peek_offset; + + if (SEQ_LE(conn->seq_to_tap, seq)) + continue; + + conn->seq_to_tap = seq; + peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; + if (tcp_set_peek_offset(conn, peek_offset)) + tcp_rst(c, conn); + } +} + +/** + * tcp_payload_flush() - Send out buffers for segments with data or flags + * @c: Execution context + */ +void tcp_payload_flush(const struct ctx *c) +{ + size_t m; + + m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS, + tcp_payload_used); + if (m != tcp_payload_used) { + tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m], + tcp_payload_used - m); + } + tcp_payload_used = 0; +} + +/** + * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers + * @conn: Connection pointer + * @iov: Pointer to an array of iovec of TCP pre-cooked buffers + * @check: Checksum, if already known + * @seq: Sequence number for this segment + * @no_tcp_csum: Do not set TCP checksum + */ +static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, + struct iovec 
*iov, const uint16_t *check, + uint32_t seq, bool no_tcp_csum) +{ + struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0); + struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr); + struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base; + const struct flowside *tapside = TAPFLOW(conn); + const struct in_addr *a4 = inany_v4(&tapside->oaddr); + struct ipv6hdr *ip6h = NULL; + struct iphdr *ip4h = NULL; + + if (a4) + ip4h = iov[TCP_IOV_IP].iov_base; + else + ip6h = iov[TCP_IOV_IP].iov_base; + + tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail, + check, seq, no_tcp_csum); +} + +/** + * tcp_buf_send_flag() - Send segment with flags to tap (no payload) + * @c: Execution context + * @conn: Connection pointer + * @flags: TCP flags: if not set, send segment only if ACK is due + * + * Return: negative error code on connection reset, 0 otherwise + */ +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + struct tcp_payload_t *payload; + struct iovec *iov; + size_t optlen; + size_t l4len; + uint32_t seq; + int ret; + + iov = tcp_l2_iov[tcp_payload_used]; + if (CONN_V4(conn)) { + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]); + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + } else { + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]); + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + } + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, + (struct tcp_syn_opts *)&payload->data, &optlen); + if (ret <= 0) + return ret; + + tcp_payload_used++; + l4len = optlen + sizeof(struct tcphdr); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false); + + if (flags & DUP_ACK) { + struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++]; + + memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_TAP].iov_len); + dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base; + 
dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP]; + memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, l4len); + dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + } + + if (tcp_payload_used > TCP_FRAMES_MEM - 2) + tcp_payload_flush(c); + + return 0; +} + +/** + * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer + * @c: Execution context + * @conn: Connection pointer + * @dlen: TCP payload length + * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer + * @seq: Sequence number to be sent + * @push: Set PSH flag, last segment in a batch + */ +static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, + ssize_t dlen, int no_csum, uint32_t seq, bool push) +{ + struct tcp_payload_t *payload; + const uint16_t *check = NULL; + struct iovec *iov; + + conn->seq_to_tap = seq + dlen; + tcp_frame_conns[tcp_payload_used] = conn; + iov = tcp_l2_iov[tcp_payload_used]; + if (CONN_V4(conn)) { + if (no_csum) { + struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1]; + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + + check = &iph->check; + } + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]); + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + } else if (CONN_V6(conn)) { + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]); + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + } + payload = iov[TCP_IOV_PAYLOAD].iov_base; + payload->th.th_off = sizeof(struct tcphdr) / 4; + payload->th.th_x2 = 0; + payload->th.th_flags = 0; + payload->th.ack = 1; + payload->th.psh = push; + iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr); + tcp_l2_buf_fill_headers(conn, iov, check, seq, false); + if (++tcp_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); +} + +/** + * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window + * @c: Execution context + * @conn: Connection pointer + * + * Return: negative on connection reset, 0 otherwise + * + * #syscalls 
recvmsg + */ +int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) +{ + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; + int len, dlen, i, s = conn->sock; + struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); + uint32_t already_sent, seq; + struct iovec *iov; + + /* How much have we read/sent since last received ack ? */ + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + if (tcp_set_peek_offset(conn, 0)) { + tcp_rst(c, conn); + return -1; + } + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, ACK_FROM_TAP_BLOCKS); + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. */ + fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); + if (fill_bufs > TCP_FRAMES) { + fill_bufs = TCP_FRAMES; + iov_rem = 0; + } else { + iov_rem = (wnd_scaled - already_sent) % mss; + } + + /* Prepare iov according to kernel capability */ + if (!peek_offset_cap) { + mh_sock.msg_iov = iov_sock; + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + mh_sock.msg_iovlen = fill_bufs + 1; + } else { + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen = fill_bufs; + } + + if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) { + tcp_payload_flush(c); + + /* Silence Coverity CWE-125 false positive */ + tcp_payload_used = 0; + } + + for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { + iov->iov_base = &tcp_payload[tcp_payload_used + i].data; + iov->iov_len = mss; + } + if (iov_rem) + iov_sock[fill_bufs].iov_len = iov_rem; + + /* Receive into buffers, don't dequeue until acknowledged by guest. 
*/ + do + len = recvmsg(s, &mh_sock, MSG_PEEK); + while (len < 0 && errno == EINTR); + + if (len < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + tcp_rst(c, conn); + return -errno; + } + + if (already_sent) /* No new data and EAGAIN: set EPOLLET */ + conn_flag(c, conn, STALLED); + + return 0; + } + + if (!len) { + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + int ret = tcp_buf_send_flag(c, conn, FIN | ACK); + if (ret) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + if (!peek_offset_cap) + len -= already_sent; + + if (len <= 0) { + conn_flag(c, conn, STALLED); + return 0; + } + + conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS); + conn_flag(c, conn, ~STALLED); + + send_bufs = DIV_ROUND_UP(len, mss); + last_len = len - (send_bufs - 1) * mss; + + /* Likely, some new data was acked too. */ + tcp_update_seqack_wnd(c, conn, false, NULL); + + /* Finally, queue to tap */ + dlen = mss; + seq = conn->seq_to_tap; + for (i = 0; i < send_bufs; i++) { + int no_csum = i && i != send_bufs - 1 && tcp_payload_used; + bool push = false; + + if (i == send_bufs - 1) { + dlen = last_len; + push = true; + } + + tcp_data_to_tap(c, conn, dlen, no_csum, seq, push); + seq += dlen; + } + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; +} diff --git a/tcp_buf.h b/tcp_buf.h new file mode 100644 index 0000000..54f5e53 --- /dev/null +++ b/tcp_buf.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef TCP_BUF_H +#define TCP_BUF_H + +void tcp_sock_iov_init(const struct ctx *c); +void tcp_payload_flush(const struct ctx *c); +int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); + +#endif /*TCP_BUF_H */ @@ -13,19 +13,17 @@ * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) * @f: 
Generic flow information * @in_epoll: Is the connection in the epoll set? + * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT + * @ws_from_tap: Window scaling factor advertised from tap/guest + * @ws_to_tap: Window scaling factor advertised to tap/guest * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @sock: Socket descriptor number * @events: Connection events, implying connection states + * @listening_sock: Listening socket this socket was accept()ed from, or -1 * @timer: timerfd descriptor for timeout events * @flags: Connection flags representing internal attributes - * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT - * @ws_from_tap: Window scaling factor advertised from tap/guest - * @ws_to_tap: Window scaling factor advertised to tap/guest * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS * @seq_dup_ack_approx: Last duplicate ACK number sent to tap - * @faddr: Guest side forwarding address (guest's remote address) - * @eport: Guest side endpoint port (guest's local port) - * @fport: Guest side forwarding port (guest's remote port) * @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap @@ -49,6 +47,10 @@ struct tcp_tap_conn { unsigned int ws_from_tap :TCP_WS_BITS; unsigned int ws_to_tap :TCP_WS_BITS; +#define TCP_MSS_BITS 14 + unsigned int tap_mss :TCP_MSS_BITS; +#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS))) +#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS)) int sock :FD_REF_BITS; @@ -67,6 +69,7 @@ struct tcp_tap_conn { #define CONN_STATE_BITS /* Setting these clears other flags */ \ (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) + int listening_sock; int timer :FD_REF_BITS; @@ -76,13 +79,7 @@ struct tcp_tap_conn { #define ACTIVE_CLOSE BIT(2) #define ACK_TO_TAP_DUE BIT(3) #define ACK_FROM_TAP_DUE BIT(4) - - -#define TCP_MSS_BITS 
14 - unsigned int tap_mss :TCP_MSS_BITS; -#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS))) -#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS)) - +#define ACK_FROM_TAP_BLOCKS BIT(5) #define SNDBUF_BITS 24 unsigned int sndbuf :SNDBUF_BITS; @@ -91,10 +88,51 @@ struct tcp_tap_conn { uint8_t seq_dup_ack_approx; + uint16_t wnd_from_tap; + uint16_t wnd_to_tap; + + uint32_t seq_to_tap; + uint32_t seq_ack_from_tap; + uint32_t seq_from_tap; + uint32_t seq_ack_to_tap; + uint32_t seq_init_from_tap; +}; + +/** + * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order + * @pif: Interfaces for each side of the flow + * @side: Addresses and ports for each side of the flow + * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT + * @ws_from_tap: Window scaling factor advertised from tap/guest + * @ws_to_tap: Window scaling factor advertised to tap/guest + * @events: Connection events, implying connection states + * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS + * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS + * @flags: Connection flags representing internal attributes + * @seq_dup_ack_approx: Last duplicate ACK number sent to tap + * @wnd_from_tap: Last window size from tap, unscaled (as received) + * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) + * @seq_to_tap: Next sequence for packets to tap + * @seq_ack_from_tap: Last ACK number received from tap + * @seq_from_tap: Next sequence for packets from tap (not actually sent) + * @seq_ack_to_tap: Last ACK number sent to tap + * @seq_init_from_tap: Initial sequence number from tap +*/ +struct tcp_tap_transfer { + uint8_t pif[SIDES]; + struct flowside side[SIDES]; + + uint8_t retrans; + uint8_t ws_from_tap; + uint8_t ws_to_tap; + uint8_t events; + + uint32_t tap_mss; - union inany_addr faddr; - in_port_t eport; - in_port_t fport; + uint32_t sndbuf; + + uint8_t flags; + uint8_t seq_dup_ack_approx; uint16_t 
wnd_from_tap; uint16_t wnd_to_tap; @@ -104,49 +142,86 @@ struct tcp_tap_conn { uint32_t seq_from_tap; uint32_t seq_ack_to_tap; uint32_t seq_init_from_tap; -}; +} __attribute__((packed, aligned(__alignof__(uint32_t)))); + +/** + * struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order + * @seq_snd: Socket-side send sequence + * @seq_rcv: Socket-side receive sequence + * @sndq: Length of pending send queue (unacknowledged / not sent) + * @notsent: Part of pending send queue that wasn't sent out yet + * @rcvq: Length of pending receive queue + * @mss: Socket-side MSS clamp + * @timestamp: RFC 7323 timestamp + * @snd_wl1: Next sequence used in window probe (next sequence - 1) + * @snd_wnd: Socket-side sending window + * @max_window: Window clamp + * @rcv_wnd: Socket-side receive window + * @rcv_wup: rcv_nxt on last window update sent + * @snd_ws: Window scaling factor, send + * @rcv_ws: Window scaling factor, receive + * @tcpi_state: Connection state in TCP_INFO style (enum, tcp_states.h) + * @tcpi_options: TCPI_OPT_* constants (timestamps, selective ACK) + */ +struct tcp_tap_transfer_ext { + uint32_t seq_snd; + uint32_t seq_rcv; + + uint32_t sndq; + uint32_t notsent; + uint32_t rcvq; + + uint32_t mss; + uint32_t timestamp; + + /* We can't just use struct tcp_repair_window: we need network order */ + uint32_t snd_wl1; + uint32_t snd_wnd; + uint32_t max_window; + uint32_t rcv_wnd; + uint32_t rcv_wup; + + uint8_t snd_ws; + uint8_t rcv_ws; + uint8_t tcpi_state; + uint8_t tcpi_options; +} __attribute__((packed, aligned(__alignof__(uint32_t)))); -#define SIDES 2 /** * struct tcp_splice_conn - Descriptor for a spliced TCP connection * @f: Generic flow information - * @in_epoll: Is the connection in the epoll set? 
* @s: File descriptor for sockets * @pipe: File descriptors for pipes - * @events: Events observed/actions performed on connection - * @flags: Connection flags (attributes, not events) * @read: Bytes read (not fully written to other side in one shot) * @written: Bytes written (not fully written from one other side read) -*/ + * @events: Events observed/actions performed on connection + * @flags: Connection flags (attributes, not events) + * @in_epoll: Is the connection in the epoll set? + */ struct tcp_splice_conn { /* Must be first element */ struct flow_common f; - bool in_epoll :1; int s[SIDES]; int pipe[SIDES][2]; + uint32_t read[SIDES]; + uint32_t written[SIDES]; + uint8_t events; #define SPLICE_CLOSED 0 #define SPLICE_CONNECT BIT(0) #define SPLICE_ESTABLISHED BIT(1) -#define OUT_WAIT_0 BIT(2) -#define OUT_WAIT_1 BIT(3) -#define FIN_RCVD_0 BIT(4) -#define FIN_RCVD_1 BIT(5) -#define FIN_SENT_0 BIT(6) -#define FIN_SENT_1 BIT(7) +#define OUT_WAIT(sidei_) ((sidei_) ? BIT(3) : BIT(2)) +#define FIN_RCVD(sidei_) ((sidei_) ? BIT(5) : BIT(4)) +#define FIN_SENT(sidei_) ((sidei_) ? BIT(7) : BIT(6)) uint8_t flags; -#define SPLICE_V6 BIT(0) -#define RCVLOWAT_SET_0 BIT(1) -#define RCVLOWAT_SET_1 BIT(2) -#define RCVLOWAT_ACT_0 BIT(3) -#define RCVLOWAT_ACT_1 BIT(4) -#define CLOSING BIT(5) +#define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0)) +#define RCVLOWAT_ACT(sidei_) ((sidei_) ? 
BIT(3) : BIT(2)) +#define CLOSING BIT(4) - uint32_t read[SIDES]; - uint32_t written[SIDES]; + bool in_epoll :1; }; /* Socket pools */ @@ -155,12 +230,24 @@ struct tcp_splice_conn { extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; -bool tcp_flow_defer(union flow *flow); -bool tcp_splice_flow_defer(union flow *flow); -void tcp_splice_timer(const struct ctx *c, union flow *flow); +bool tcp_flow_defer(const struct tcp_tap_conn *conn); + +int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn); +int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn); + +int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn); +int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn); + +int tcp_flow_migrate_target(struct ctx *c, int fd); +int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd); + +bool tcp_flow_is_established(const struct tcp_tap_conn *conn); + +bool tcp_splice_flow_defer(struct tcp_splice_conn *conn); +void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn); int tcp_conn_pool_sock(int pool[]); -int tcp_conn_sock(const struct ctx *c, sa_family_t af); -int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); +int tcp_conn_sock(sa_family_t af); +int tcp_sock_refill_pool(int pool[], sa_family_t af); void tcp_splice_refill(const struct ctx *c); #endif /* TCP_CONN_H */ diff --git a/tcp_internal.h b/tcp_internal.h new file mode 100644 index 0000000..36c6533 --- /dev/null +++ b/tcp_internal.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef TCP_INTERNAL_H +#define TCP_INTERNAL_H + +#define MAX_WS 8 +#define MAX_WINDOW (1 << (16 + (MAX_WS))) + +#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ + sizeof(struct tcphdr) - \ + sizeof(struct iphdr), \ + sizeof(uint32_t)) +#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ + 
sizeof(struct tcphdr) - \ + sizeof(struct ipv6hdr), \ + sizeof(uint32_t)) + +#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) +#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) +#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) +#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) + +#define FIN (1 << 0) +#define SYN (1 << 1) +#define RST (1 << 2) +#define ACK (1 << 4) + +/* Flags for internal usage */ +#define DUP_ACK (1 << 5) +#define OPT_EOL 0 +#define OPT_NOP 1 +#define OPT_MSS 2 +#define OPT_WS 3 +#define OPT_SACKP 4 +#define OPT_SACK 5 +#define OPT_TS 8 + +#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP) +#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)])) +#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_))) + +#define HOSTSIDE(conn_) ((conn_)->f.pif[1] == PIF_HOST) +#define HOSTFLOW(conn_) (&((conn_)->f.side[HOSTSIDE(conn_)])) +#define HOST_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_))) + +#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +/* + * enum tcp_iov_parts - I/O vector parts for one TCP frame + * @TCP_IOV_TAP tap backend specific header + * @TCP_IOV_ETH Ethernet header + * @TCP_IOV_IP IP (v4/v6) header + * @TCP_IOV_PAYLOAD IP payload (TCP header + data) + * @TCP_NUM_IOVS the number of entries in the iovec array + */ +enum tcp_iov_parts { + TCP_IOV_TAP = 0, + TCP_IOV_ETH = 1, + TCP_IOV_IP = 2, + TCP_IOV_PAYLOAD = 3, + TCP_NUM_IOVS +}; + +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +/** struct tcp_opt_nop - TCP NOP option + * @kind: Option kind (OPT_NOP = 1) + */ +struct tcp_opt_nop { + uint8_t kind; +} __attribute__ ((packed)); +#define 
TCP_OPT_NOP ((struct tcp_opt_nop){ .kind = OPT_NOP, }) + +/** struct tcp_opt_mss - TCP MSS option + * @kind: Option kind (OPT_MSS == 2) + * @len: Option length (4) + * @mss: Maximum Segment Size + */ +struct tcp_opt_mss { + uint8_t kind; + uint8_t len; + uint16_t mss; +} __attribute__ ((packed)); +#define TCP_OPT_MSS(mss_) \ + ((struct tcp_opt_mss) { \ + .kind = OPT_MSS, \ + .len = sizeof(struct tcp_opt_mss), \ + .mss = htons(mss_), \ + }) + +/** struct tcp_opt_ws - TCP Window Scaling option + * @kind: Option kind (OPT_WS == 3) + * @len: Option length (3) + * @shift: Window scaling shift + */ +struct tcp_opt_ws { + uint8_t kind; + uint8_t len; + uint8_t shift; +} __attribute__ ((packed)); +#define TCP_OPT_WS(shift_) \ + ((struct tcp_opt_ws) { \ + .kind = OPT_WS, \ + .len = sizeof(struct tcp_opt_ws), \ + .shift = (shift_), \ + }) + +/** struct tcp_syn_opts - TCP options we apply to SYN packets + * @mss: Maximum Segment Size (MSS) option + * @nop: NOP opt (for alignment) + * @ws: Window Scaling (WS) option + */ +struct tcp_syn_opts { + struct tcp_opt_mss mss; + struct tcp_opt_nop nop; + struct tcp_opt_ws ws; +} __attribute__ ((packed)); +#define TCP_SYN_OPTS(mss_, ws_) \ + ((struct tcp_syn_opts){ \ + .mss = TCP_OPT_MSS(mss_), \ + .nop = TCP_OPT_NOP, \ + .ws = TCP_OPT_WS(ws_), \ + }) + +extern char tcp_buf_discard [MAX_WINDOW]; + +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag); +#define conn_flag(c, conn, flag) \ + do { \ + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ + conn_flag_do(c, conn, flag); \ + } while (0) + + +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event); +#define conn_event(c, conn, event) \ + do { \ + flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ + conn_event_do(c, conn, event); \ + } while (0) + +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); +#define tcp_rst(c, conn) \ + do { \ + flow_dbg((conn), "TCP reset at %s:%i", __func__, 
__LINE__); \ + tcp_rst_do(c, conn); \ + } while (0) + +struct tcp_info_linux; + +void tcp_fill_headers(const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct iphdr *ip4h, struct ipv6hdr *ip6h, + struct tcphdr *th, struct iov_tail *payload, + const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum); + +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + bool force_seq, struct tcp_info_linux *tinfo); +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, + int flags, struct tcphdr *th, struct tcp_syn_opts *opts, + size_t *optlen); +int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset); + +#endif /* TCP_INTERNAL_H */ diff --git a/tcp_splice.c b/tcp_splice.c index d066112..60455d6 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -28,7 +28,7 @@ * - FIN_SENT_0: FIN (write shutdown) sent to accepted socket * - FIN_SENT_1: FIN (write shutdown) sent to target socket * - * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64 + * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64 */ #include <sched.h> @@ -73,10 +73,7 @@ static int ns_sock_pool6 [TCP_SOCK_POOL_SIZE]; /* Pool of pre-opened pipes */ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2]; -#define CONN_V6(x) (x->flags & SPLICE_V6) -#define CONN_V4(x) (!CONN_V6(x)) -#define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) -#define CONN(idx) (&FLOW(idx)->tcp_splice) +#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set)) /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { @@ -95,6 +92,24 @@ static int tcp_sock_refill_ns(void *arg); static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af); /** + * conn_at_sidx() - Get spliced TCP connection specific flow at given sidx + * @sidx: Flow and side to retrieve + * + * Return: Spliced TCP connection at @sidx, or NULL of @sidx is invalid. 
+ * Asserts if the flow at @sidx is not FLOW_TCP_SPLICE. + */ +static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx) +{ + union flow *flow = flow_at_sidx(sidx); + + if (!flow) + return NULL; + + ASSERT(flow->f.type == FLOW_TCP_SPLICE); + return &flow->tcp_splice; +} + +/** * tcp_splice_conn_epoll_events() - epoll events masks for given state * @events: Connection event flags * @ev: Events to fill in, 0 is accepted socket, 1 is connecting socket @@ -102,19 +117,26 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af); static void tcp_splice_conn_epoll_events(uint16_t events, struct epoll_event ev[]) { - ev[0].events = ev[1].events = 0; + unsigned sidei; + + flow_foreach_sidei(sidei) + ev[sidei].events = 0; if (events & SPLICE_ESTABLISHED) { - if (!(events & FIN_SENT_1)) - ev[0].events = EPOLLIN | EPOLLRDHUP; - if (!(events & FIN_SENT_0)) - ev[1].events = EPOLLIN | EPOLLRDHUP; + flow_foreach_sidei(sidei) { + if (!(events & FIN_SENT(!sidei))) + ev[sidei].events = EPOLLIN | EPOLLRDHUP; + } } else if (events & SPLICE_CONNECT) { ev[1].events = EPOLLOUT; } - ev[0].events |= (events & OUT_WAIT_0) ? EPOLLOUT : 0; - ev[1].events |= (events & OUT_WAIT_1) ? 
EPOLLOUT : 0; + flow_foreach_sidei(sidei) { + if (events & OUT_WAIT(sidei)) { + ev[sidei].events |= EPOLLOUT; + ev[!sidei].events &= ~EPOLLIN; + } + } } /** @@ -142,7 +164,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) || epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) { int ret = -errno; - flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno)); + flow_perror(conn, "ERROR on epoll_ctl()"); return ret; } @@ -182,8 +204,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, } if (flag == CLOSING) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL); - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL); + epoll_del(c, conn->s[0]); + epoll_del(c, conn->s[1]); } } @@ -235,32 +257,31 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, /** * tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed) - * @flow: Flow table entry for this connection + * @conn: Connection entry to handle * * Return: true if the flow is ready to free, false otherwise */ -bool tcp_splice_flow_defer(union flow *flow) +bool tcp_splice_flow_defer(struct tcp_splice_conn *conn) { - struct tcp_splice_conn *conn = &flow->tcp_splice; - unsigned side; + unsigned sidei; - if (!(flow->tcp_splice.flags & CLOSING)) + if (!(conn->flags & CLOSING)) return false; - for (side = 0; side < SIDES; side++) { + flow_foreach_sidei(sidei) { /* Flushing might need to block: don't recycle them. 
*/ - if (conn->pipe[side][0] >= 0) { - close(conn->pipe[side][0]); - close(conn->pipe[side][1]); - conn->pipe[side][0] = conn->pipe[side][1] = -1; + if (conn->pipe[sidei][0] >= 0) { + close(conn->pipe[sidei][0]); + close(conn->pipe[sidei][1]); + conn->pipe[sidei][0] = conn->pipe[sidei][1] = -1; } - if (conn->s[side] >= 0) { - close(conn->s[side]); - conn->s[side] = -1; + if (conn->s[sidei] >= 0) { + close(conn->s[sidei]); + conn->s[sidei] = -1; } - conn->read[side] = conn->written[side] = 0; + conn->read[sidei] = conn->written[sidei] = 0; } conn->events = SPLICE_CLOSED; @@ -280,33 +301,33 @@ bool tcp_splice_flow_defer(union flow *flow) static int tcp_splice_connect_finish(const struct ctx *c, struct tcp_splice_conn *conn) { - unsigned side; + unsigned sidei; int i = 0; - for (side = 0; side < SIDES; side++) { + flow_foreach_sidei(sidei) { for (; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { if (splice_pipe_pool[i][0] >= 0) { - SWAP(conn->pipe[side][0], + SWAP(conn->pipe[sidei][0], splice_pipe_pool[i][0]); - SWAP(conn->pipe[side][1], + SWAP(conn->pipe[sidei][1], splice_pipe_pool[i][1]); break; } } - if (conn->pipe[side][0] < 0) { - if (pipe2(conn->pipe[side], O_NONBLOCK | O_CLOEXEC)) { - flow_err(conn, "cannot create %d->%d pipe: %s", - side, !side, strerror(errno)); + if (conn->pipe[sidei][0] < 0) { + if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) { + flow_perror(conn, "cannot create %d->%d pipe", + sidei, !sidei); conn_flag(c, conn, CLOSING); return -EIO; } - if (fcntl(conn->pipe[side][0], F_SETPIPE_SZ, - c->tcp.pipe_size)) { + if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ, + c->tcp.pipe_size) != (int)c->tcp.pipe_size) { flow_trace(conn, "cannot set %d->%d pipe size to %zu", - side, !side, c->tcp.pipe_size); + sidei, !sidei, c->tcp.pipe_size); } } } @@ -321,31 +342,21 @@ static int tcp_splice_connect_finish(const struct ctx *c, * tcp_splice_connect() - Create and connect socket for new spliced connection * @c: Execution context * @conn: Connection pointer - * @af: 
Address family - * @pif: pif on which to create socket - * @port: Destination port, host order * * Return: 0 for connect() succeeded or in progress, negative value on error */ -static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, - sa_family_t af, uint8_t pif, in_port_t port) +static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) { - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(port), - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - }; - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(port), - .sin_addr = IN4ADDR_LOOPBACK_INIT, - }; - const struct sockaddr *sa; + const struct flowside *tgt = &conn->f.side[TGTSIDE]; + sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6; + uint8_t tgtpif = conn->f.pif[TGTSIDE]; + union sockaddr_inany sa; socklen_t sl; + int one = 1; - if (pif == PIF_HOST) - conn->s[1] = tcp_conn_sock(c, af); - else if (pif == PIF_SPLICE) + if (tgtpif == PIF_HOST) + conn->s[1] = tcp_conn_sock(af); + else if (tgtpif == PIF_SPLICE) conn->s[1] = tcp_conn_sock_ns(c, af); else ASSERT(0); @@ -353,24 +364,27 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, if (conn->s[1] < 0) return -1; - if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, - &((int){ 1 }), sizeof(int))) { + if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) { flow_trace(conn, "failed to set TCP_QUICKACK on socket %i", conn->s[1]); } - if (CONN_V6(conn)) { - sa = (struct sockaddr *)&addr6; - sl = sizeof(addr6); - } else { - sa = (struct sockaddr *)&addr4; - sl = sizeof(addr4); + if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) { + flow_trace(conn, "failed to set TCP_NODELAY on socket %i", + conn->s[0]); + } + + if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) { + flow_trace(conn, "failed to set TCP_NODELAY on socket %i", + conn->s[1]); } - if (connect(conn->s[1], sa, sl)) { + pif_sockaddr(c, 
&sa, &sl, tgtpif, &tgt->eaddr, tgt->eport); + + if (connect(conn->s[1], &sa.sa, sl)) { if (errno != EINPROGRESS) { flow_trace(conn, "Couldn't connect socket for splice: %s", - strerror(errno)); + strerror_(errno)); return -errno; } @@ -414,67 +428,19 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af) /** * tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection * @c: Execution context - * @pif0: pif id of side 0 - * @dstport: Side 0 destination port of connection * @flow: flow to initialise * @s0: Accepted (side 0) socket * @sa: Peer address of connection * - * Return: true if able to create a spliced connection, false otherwise * #syscalls:pasta setsockopt */ -bool tcp_splice_conn_from_sock(const struct ctx *c, - uint8_t pif0, in_port_t dstport, - union flow *flow, int s0, - const union sockaddr_inany *sa) +void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0) { - struct tcp_splice_conn *conn; - union inany_addr src; - in_port_t srcport; - sa_family_t af; - uint8_t pif1; + struct tcp_splice_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, + tcp_splice); - if (c->mode != MODE_PASTA) - return false; - - inany_from_sockaddr(&src, &srcport, sa); - af = inany_v4(&src) ? AF_INET : AF_INET6; - - switch (pif0) { - case PIF_SPLICE: - if (!inany_is_loopback(&src)) { - char str[INANY_ADDRSTRLEN]; - - /* We can't use flow_err() etc. 
because we haven't set - * the flow type yet - */ - warn("Bad source address %s for splice, closing", - inany_ntop(&src, str, sizeof(str))); - - /* We *don't* want to fall back to tap */ - flow_alloc_cancel(flow); - return true; - } + ASSERT(c->mode == MODE_PASTA); - pif1 = PIF_HOST; - dstport += c->tcp.fwd_out.delta[dstport]; - break; - - case PIF_HOST: - if (!inany_is_loopback(&src)) - return false; - - pif1 = PIF_SPLICE; - dstport += c->tcp.fwd_in.delta[dstport]; - break; - - default: - return false; - } - - conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0); - - conn->flags = af == AF_INET ? 0 : SPLICE_V6; conn->s[0] = s0; conn->s[1] = -1; conn->pipe[0][0] = conn->pipe[0][1] = -1; @@ -483,10 +449,10 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); - if (tcp_splice_connect(c, conn, af, pif1, dstport)) + if (tcp_splice_connect(c, conn)) conn_flag(c, conn, CLOSING); - return true; + FLOW_ACTIVATE(conn); } /** @@ -500,8 +466,8 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) { - struct tcp_splice_conn *conn = CONN(ref.flowside.flow); - unsigned side = ref.flowside.side, fromside; + struct tcp_splice_conn *conn = conn_at_sidx(ref.flowside); + unsigned evsidei = ref.flowside.sidei, fromsidei; uint8_t lowat_set_flag, lowat_act_flag; int eof, never_read; @@ -516,11 +482,10 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl); if (rc) - flow_err(conn, "Error retrieving SO_ERROR: %s", - strerror(errno)); + flow_perror(conn, "Error retrieving SO_ERROR"); else flow_trace(conn, "Error event on socket: %s", - strerror(err)); + strerror_(err)); goto close; } @@ -533,46 +498,45 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, } if (events & EPOLLOUT) { - fromside 
= !side; - conn_event(c, conn, side == 0 ? ~OUT_WAIT_0 : ~OUT_WAIT_1); + fromsidei = !evsidei; + conn_event(c, conn, ~OUT_WAIT(evsidei)); } else { - fromside = side; + fromsidei = evsidei; } if (events & EPOLLRDHUP) /* For side 0 this is fake, but implied */ - conn_event(c, conn, side == 0 ? FIN_RCVD_0 : FIN_RCVD_1); + conn_event(c, conn, FIN_RCVD(evsidei)); swap: eof = 0; never_read = 1; - lowat_set_flag = fromside == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1; - lowat_act_flag = fromside == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1; + lowat_set_flag = RCVLOWAT_SET(fromsidei); + lowat_act_flag = RCVLOWAT_ACT(fromsidei); while (1) { - ssize_t readlen, to_write = 0, written; + ssize_t readlen, written, pending; int more = 0; retry: - readlen = splice(conn->s[fromside], NULL, - conn->pipe[fromside][1], NULL, c->tcp.pipe_size, - SPLICE_F_MOVE | SPLICE_F_NONBLOCK); - flow_trace(conn, "%zi from read-side call", readlen); - if (readlen < 0) { - if (errno == EINTR) - goto retry; + do + readlen = splice(conn->s[fromsidei], NULL, + conn->pipe[fromsidei][1], NULL, + c->tcp.pipe_size, + SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + while (readlen < 0 && errno == EINTR); + + if (readlen < 0 && errno != EAGAIN) + goto close; - if (errno != EAGAIN) - goto close; + flow_trace(conn, "%zi from read-side call", readlen); - to_write = c->tcp.pipe_size; - } else if (!readlen) { + if (!readlen) { eof = 1; - to_write = c->tcp.pipe_size; - } else { + } else if (readlen > 0) { never_read = 0; - to_write += readlen; + if (readlen >= (long)c->tcp.pipe_size * 90 / 100) more = SPLICE_F_MORE; @@ -580,83 +544,84 @@ retry: conn_flag(c, conn, lowat_act_flag); } -eintr: - written = splice(conn->pipe[fromside][0], NULL, - conn->s[!fromside], NULL, to_write, - SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + do + written = splice(conn->pipe[fromsidei][0], NULL, + conn->s[!fromsidei], NULL, + c->tcp.pipe_size, + SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + while (written < 0 && errno == EINTR); + + if (written < 0 && errno 
!= EAGAIN) + goto close; + flow_trace(conn, "%zi from write-side call (passed %zi)", - written, to_write); + written, c->tcp.pipe_size); /* Most common case: skip updating counters. */ if (readlen > 0 && readlen == written) { if (readlen >= (long)c->tcp.pipe_size * 10 / 100) continue; - if (conn->flags & lowat_set_flag && + if (!(conn->flags & lowat_set_flag) && readlen > (long)c->tcp.pipe_size / 10) { int lowat = c->tcp.pipe_size / 4; - setsockopt(conn->s[fromside], SOL_SOCKET, - SO_RCVLOWAT, &lowat, sizeof(lowat)); - - conn_flag(c, conn, lowat_set_flag); - conn_flag(c, conn, lowat_act_flag); + if (setsockopt(conn->s[fromsidei], SOL_SOCKET, + SO_RCVLOWAT, + &lowat, sizeof(lowat))) { + flow_trace(conn, + "Setting SO_RCVLOWAT %i: %s", + lowat, strerror_(errno)); + } else { + conn_flag(c, conn, lowat_set_flag); + conn_flag(c, conn, lowat_act_flag); + } } break; } - conn->read[fromside] += readlen > 0 ? readlen : 0; - conn->written[fromside] += written > 0 ? written : 0; + conn->read[fromsidei] += readlen > 0 ? readlen : 0; + conn->written[fromsidei] += written > 0 ? written : 0; if (written < 0) { - if (errno == EINTR) - goto eintr; - - if (errno != EAGAIN) - goto close; - - if (never_read) + if (conn->read[fromsidei] == conn->written[fromsidei]) break; - conn_event(c, conn, - fromside == 0 ? 
OUT_WAIT_1 : OUT_WAIT_0); + conn_event(c, conn, OUT_WAIT(!fromsidei)); break; } if (never_read && written == (long)(c->tcp.pipe_size)) goto retry; - if (!never_read && written < to_write) { - to_write -= written; + pending = conn->read[fromsidei] - conn->written[fromsidei]; + if (!never_read && written > 0 && written < pending) goto retry; - } if (eof) break; } - if ((conn->events & FIN_RCVD_0) && !(conn->events & FIN_SENT_1)) { - if (conn->read[fromside] == conn->written[fromside] && eof) { - shutdown(conn->s[1], SHUT_WR); - conn_event(c, conn, FIN_SENT_1); - } - } + if (conn->read[fromsidei] == conn->written[fromsidei] && eof) { + unsigned sidei; - if ((conn->events & FIN_RCVD_1) && !(conn->events & FIN_SENT_0)) { - if (conn->read[fromside] == conn->written[fromside] && eof) { - shutdown(conn->s[0], SHUT_WR); - conn_event(c, conn, FIN_SENT_0); + flow_foreach_sidei(sidei) { + if ((conn->events & FIN_RCVD(sidei)) && + !(conn->events & FIN_SENT(!sidei))) { + shutdown(conn->s[!sidei], SHUT_WR); + conn_event(c, conn, FIN_SENT(!sidei)); + } } } - if (CONN_HAS(conn, FIN_SENT_0 | FIN_SENT_1)) + if (CONN_HAS(conn, FIN_SENT(0) | FIN_SENT(1))) goto close; if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) { events = EPOLLIN; - fromside = !fromside; + fromsidei = !fromsidei; goto swap; } @@ -721,7 +686,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c) continue; if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ, - c->tcp.pipe_size)) { + c->tcp.pipe_size) != (int)c->tcp.pipe_size) { trace("TCP (spliced): cannot set pool pipe size to %zu", c->tcp.pipe_size); } @@ -734,6 +699,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c) * * Return: 0 */ +/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ static int tcp_sock_refill_ns(void *arg) { const struct ctx *c = (const struct ctx *)arg; @@ -741,16 +707,16 @@ static int tcp_sock_refill_ns(void *arg) ns_enter(c); if (c->ifi4) { - int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET); + 
int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 ns socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } if (c->ifi6) { - int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6); + int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 ns socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } return 0; @@ -786,29 +752,26 @@ void tcp_splice_init(struct ctx *c) /** * tcp_splice_timer() - Timer for spliced connections * @c: Execution context - * @flow: Flow table entry + * @conn: Connection to handle */ -void tcp_splice_timer(const struct ctx *c, union flow *flow) +void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn) { - struct tcp_splice_conn *conn = &flow->tcp_splice; - int side; + unsigned sidei; ASSERT(!(conn->flags & CLOSING)); - for (side = 0; side < SIDES; side++) { - uint8_t set = side == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1; - uint8_t act = side == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1; - - if ((conn->flags & set) && !(conn->flags & act)) { - if (setsockopt(conn->s[side], SOL_SOCKET, SO_RCVLOWAT, + flow_foreach_sidei(sidei) { + if ((conn->flags & RCVLOWAT_SET(sidei)) && + !(conn->flags & RCVLOWAT_ACT(sidei))) { + if (setsockopt(conn->s[sidei], SOL_SOCKET, SO_RCVLOWAT, &((int){ 1 }), sizeof(int))) { flow_trace(conn, "can't set SO_RCVLOWAT on %d", - conn->s[side]); + conn->s[sidei]); } - conn_flag(c, conn, ~set); + conn_flag(c, conn, ~RCVLOWAT_SET(sidei)); } } - conn_flag(c, conn, ~RCVLOWAT_ACT_0); - conn_flag(c, conn, ~RCVLOWAT_ACT_1); + flow_foreach_sidei(sidei) + conn_flag(c, conn, ~RCVLOWAT_ACT(sidei)); } diff --git a/tcp_splice.h b/tcp_splice.h index ed8f0c5..a20f3e2 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -11,10 +11,7 @@ union sockaddr_inany; void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); -bool tcp_splice_conn_from_sock(const struct ctx *c, - uint8_t pif0, in_port_t dstport, - union flow *flow, 
int s0, - const union sockaddr_inany *sa); +void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0); void tcp_splice_init(struct ctx *c); #endif /* TCP_SPLICE_H */ diff --git a/tcp_vu.c b/tcp_vu.c new file mode 100644 index 0000000..f3914c7 --- /dev/null +++ b/tcp_vu.c @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* tcp_vu.c - TCP L2 vhost-user management functions + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#include <errno.h> +#include <stddef.h> +#include <stdint.h> + +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <sys/socket.h> + +#include <netinet/if_ether.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "siphash.h" +#include "inany.h" +#include "vhost_user.h" +#include "tcp.h" +#include "pcap.h" +#include "flow.h" +#include "tcp_conn.h" +#include "flow_table.h" +#include "tcp_vu.h" +#include "tap.h" +#include "tcp_internal.h" +#include "checksum.h" +#include "vu_common.h" +#include <time.h> + +static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1]; +static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; +static int head[VIRTQUEUE_MAX_SIZE + 1]; + +/** + * tcp_vu_hdrlen() - return the size of the header in level 2 frame (TCP) + * @v6: Set for IPv6 packet + * + * Return: Return the size of the header + */ +static size_t tcp_vu_hdrlen(bool v6) +{ + size_t hdrlen; + + hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) + + sizeof(struct ethhdr) + sizeof(struct tcphdr); + + if (v6) + hdrlen += sizeof(struct ipv6hdr); + else + hdrlen += sizeof(struct iphdr); + + return hdrlen; +} + +/** + * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload) + * @c: Execution context + * @conn: Connection pointer + * @flags: TCP flags: if not set, send segment only if ACK is due + * + * Return: negative error code on connection reset, 0 otherwise + */ +int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, 
int flags) +{ + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + size_t optlen, hdrlen; + struct vu_virtq_element flags_elem[2]; + struct ipv6hdr *ip6h = NULL; + struct iphdr *ip4h = NULL; + struct iovec flags_iov[2]; + struct tcp_syn_opts *opts; + struct iov_tail payload; + struct tcphdr *th; + struct ethhdr *eh; + uint32_t seq; + int elem_cnt; + int nb_ack; + int ret; + + hdrlen = tcp_vu_hdrlen(CONN_V6(conn)); + + vu_set_element(&flags_elem[0], NULL, &flags_iov[0]); + + elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1, + hdrlen + sizeof(struct tcp_syn_opts), NULL); + if (elem_cnt != 1) + return -1; + + ASSERT(flags_elem[0].in_sg[0].iov_len >= + hdrlen + sizeof(struct tcp_syn_opts)); + + vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1); + + eh = vu_eth(flags_elem[0].in_sg[0].iov_base); + + memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); + + if (CONN_V4(conn)) { + eh->h_proto = htons(ETH_P_IP); + + ip4h = vu_ip(flags_elem[0].in_sg[0].iov_base); + *ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + + th = vu_payloadv4(flags_elem[0].in_sg[0].iov_base); + } else { + eh->h_proto = htons(ETH_P_IPV6); + + ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base); + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + th = vu_payloadv6(flags_elem[0].in_sg[0].iov_base); + } + + memset(th, 0, sizeof(*th)); + th->doff = sizeof(*th) / 4; + th->ack = 1; + + seq = conn->seq_to_tap; + opts = (struct tcp_syn_opts *)(th + 1); + ret = tcp_prepare_flags(c, conn, flags, th, opts, &optlen); + if (ret <= 0) { + vu_queue_rewind(vq, 1); + return ret; + } + + flags_elem[0].in_sg[0].iov_len = hdrlen + optlen; + payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen); + + tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload, + NULL, seq, !*c->pcap); + + if (*c->pcap) { + pcap_iov(&flags_elem[0].in_sg[0], 1, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + nb_ack = 1; + + if (flags & 
DUP_ACK) { + vu_set_element(&flags_elem[1], NULL, &flags_iov[1]); + + elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1, + flags_elem[0].in_sg[0].iov_len, NULL); + if (elem_cnt == 1 && + flags_elem[1].in_sg[0].iov_len >= + flags_elem[0].in_sg[0].iov_len) { + memcpy(flags_elem[1].in_sg[0].iov_base, + flags_elem[0].in_sg[0].iov_base, + flags_elem[0].in_sg[0].iov_len); + nb_ack++; + + if (*c->pcap) { + pcap_iov(&flags_elem[1].in_sg[0], 1, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + } + } + + vu_flush(vdev, vq, flags_elem, nb_ack); + + return 0; +} + +/** tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers + * @c: Execution context + * @conn: Connection pointer + * @v6: Set for IPv6 connections + * @already_sent: Number of bytes already sent + * @fillsize: Maximum bytes to fill in guest-side receiving window + * @iov_cnt: number of iov (output) + * @head_cnt: Pointer to store the count of head iov entries (output) + * + * Return: number of bytes received from the socket, or a negative error code + * on failure. 
+ */ +static ssize_t tcp_vu_sock_recv(const struct ctx *c, + const struct tcp_tap_conn *conn, bool v6, + uint32_t already_sent, size_t fillsize, + int *iov_cnt, int *head_cnt) +{ + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); + int s = conn->sock; + ssize_t ret, len; + size_t hdrlen; + int elem_cnt; + int i; + + *iov_cnt = 0; + + hdrlen = tcp_vu_hdrlen(v6); + + vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE); + + elem_cnt = 0; + *head_cnt = 0; + while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) { + struct iovec *iov; + size_t frame_size, dlen; + int cnt; + + cnt = vu_collect(vdev, vq, &elem[elem_cnt], + VIRTQUEUE_MAX_SIZE - elem_cnt, + MIN(mss, fillsize) + hdrlen, &frame_size); + if (cnt == 0) + break; + + dlen = frame_size - hdrlen; + + /* reserve space for headers in iov */ + iov = &elem[elem_cnt].in_sg[0]; + ASSERT(iov->iov_len >= hdrlen); + iov->iov_base = (char *)iov->iov_base + hdrlen; + iov->iov_len -= hdrlen; + head[(*head_cnt)++] = elem_cnt; + + fillsize -= dlen; + elem_cnt += cnt; + } + + if (peek_offset_cap) { + mh_sock.msg_iov = iov_vu + 1; + mh_sock.msg_iovlen = elem_cnt; + } else { + iov_vu[0].iov_base = tcp_buf_discard; + iov_vu[0].iov_len = already_sent; + + mh_sock.msg_iov = iov_vu; + mh_sock.msg_iovlen = elem_cnt + 1; + } + + do + ret = recvmsg(s, &mh_sock, MSG_PEEK); + while (ret < 0 && errno == EINTR); + + if (ret < 0) { + vu_queue_rewind(vq, elem_cnt); + return -errno; + } + + if (!peek_offset_cap) + ret -= already_sent; + + /* adjust iov number and length of the last iov */ + len = ret; + for (i = 0; len && i < elem_cnt; i++) { + struct iovec *iov = &elem[i].in_sg[0]; + + if (iov->iov_len > (size_t)len) + iov->iov_len = len; + + len -= iov->iov_len; + } + /* adjust head count */ + while (*head_cnt > 0 && head[*head_cnt - 1] >= i) + (*head_cnt)--; + + /* mark end of array */ + head[*head_cnt] = i; + *iov_cnt = i; + + /* release 
unused buffers */ + vu_queue_rewind(vq, elem_cnt - i); + + /* restore space for headers in iov */ + for (i = 0; i < *head_cnt; i++) { + struct iovec *iov = &elem[head[i]].in_sg[0]; + + iov->iov_base = (char *)iov->iov_base - hdrlen; + iov->iov_len += hdrlen; + } + + return ret; +} + +/** + * tcp_vu_prepare() - Prepare the frame header + * @c: Execution context + * @conn: Connection pointer + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Number of entries in @iov + * @check: Checksum, if already known + * @no_tcp_csum: Do not set TCP checksum + * @push: Set PSH flag, last segment in a batch + */ +static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, + struct iovec *iov, size_t iov_cnt, + const uint16_t **check, bool no_tcp_csum, bool push) +{ + const struct flowside *toside = TAPFLOW(conn); + bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); + size_t hdrlen = tcp_vu_hdrlen(v6); + struct iov_tail payload = IOV_TAIL(iov, iov_cnt, hdrlen); + char *base = iov[0].iov_base; + struct ipv6hdr *ip6h = NULL; + struct iphdr *ip4h = NULL; + struct tcphdr *th; + struct ethhdr *eh; + + /* we guess the first iovec provided by the guest can embed + * all the headers needed by L2 frame + */ + ASSERT(iov[0].iov_len >= hdrlen); + + eh = vu_eth(base); + + memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); + + /* initialize header */ + + if (!v6) { + eh->h_proto = htons(ETH_P_IP); + + ip4h = vu_ip(base); + *ip4h = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + th = vu_payloadv4(base); + } else { + eh->h_proto = htons(ETH_P_IPV6); + + ip6h = vu_ip(base); + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + + th = vu_payloadv6(base); + } + + memset(th, 0, sizeof(*th)); + th->doff = sizeof(*th) / 4; + th->ack = 1; + th->psh = push; + + tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload, + *check, conn->seq_to_tap, no_tcp_csum); + if (ip4h) + *check = &ip4h->check; +} + 
+/** + * tcp_vu_data_from_sock() - Handle new data from socket, queue to vhost-user, + * in window + * @c: Execution context + * @conn: Connection pointer + * + * Return: Negative on connection reset, 0 otherwise + */ +int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) +{ + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + ssize_t len, previous_dlen; + int i, iov_cnt, head_cnt; + size_t hdrlen, fillsize; + int v6 = CONN_V6(conn); + uint32_t already_sent; + const uint16_t *check; + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + debug("Got packet, but RX virtqueue not usable yet"); + return 0; + } + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + if (tcp_set_peek_offset(conn, 0)) { + tcp_rst(c, conn); + return -1; + } + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, ACK_FROM_TAP_BLOCKS); + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. 
*/ + + fillsize = wnd_scaled - already_sent; + + /* collect the buffers from vhost-user and fill them with the + * data from the socket + */ + len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, + &iov_cnt, &head_cnt); + if (len < 0) { + if (len != -EAGAIN && len != -EWOULDBLOCK) { + tcp_rst(c, conn); + return len; + } + + if (already_sent) /* No new data and EAGAIN: set EPOLLET */ + conn_flag(c, conn, STALLED); + + return 0; + } + + if (!len) { + if (already_sent) { + conn_flag(c, conn, STALLED); + } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == + SOCK_FIN_RCVD) { + int ret = tcp_vu_send_flag(c, conn, FIN | ACK); + if (ret) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS); + conn_flag(c, conn, ~STALLED); + + /* Likely, some new data was acked too. */ + tcp_update_seqack_wnd(c, conn, false, NULL); + + /* initialize headers */ + /* iov_vu is an array of buffers and the buffer size can be + * smaller than the frame size we want to use but with + * num_buffer we can merge several virtio iov buffers in one packet + * we need only to set the packet headers in the first iov and + * num_buffer to the number of iov entries + */ + + hdrlen = tcp_vu_hdrlen(v6); + for (i = 0, previous_dlen = -1, check = NULL; i < head_cnt; i++) { + struct iovec *iov = &elem[head[i]].in_sg[0]; + int buf_cnt = head[i + 1] - head[i]; + ssize_t dlen = iov_size(iov, buf_cnt) - hdrlen; + bool push = i == head_cnt - 1; + + vu_set_vnethdr(vdev, iov->iov_base, buf_cnt); + + /* The IPv4 header checksum varies only with dlen */ + if (previous_dlen != dlen) + check = NULL; + previous_dlen = dlen; + + tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push); + + if (*c->pcap) { + pcap_iov(iov, buf_cnt, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + + conn->seq_to_tap += dlen; + } + + /* send packets */ + vu_flush(vdev, vq, elem, iov_cnt); + + conn_flag(c, conn, ACK_FROM_TAP_DUE); 
+ + return 0; +} diff --git a/tcp_vu.h b/tcp_vu.h new file mode 100644 index 0000000..6ab6057 --- /dev/null +++ b/tcp_vu.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#ifndef TCP_VU_H +#define TCP_VU_H + +int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); +int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); + +#endif /*TCP_VU_H */ diff --git a/test/.gitignore b/test/.gitignore index 4837402..3573444 100644 --- a/test/.gitignore +++ b/test/.gitignore @@ -1,5 +1,6 @@ test_logs/ mbuto/ +podman/ *.img QEMU_EFI.fd *.qcow2 @@ -7,5 +8,6 @@ QEMU_EFI.fd *.raw.xz *.bin nstool +rampstream guest-key guest-key.pub diff --git a/test/Makefile b/test/Makefile index 7b00bef..bf63db8 100644 --- a/test/Makefile +++ b/test/Makefile @@ -8,7 +8,6 @@ WGET = wget -c DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \ - debian-9-nocloud-amd64-daily-20200210-166.qcow2 \ debian-10-nocloud-amd64.qcow2 \ debian-10-generic-arm64.qcow2 \ debian-10-generic-ppc64el-20220911-1135.qcow2 \ @@ -42,8 +41,7 @@ OPENSUSE_IMGS = openSUSE-Leap-15.1-JeOS.x86_64-kvm-and-xen.qcow2 \ openSUSE-Leap-15.2-JeOS.x86_64-kvm-and-xen.qcow2 \ openSUSE-Leap-15.3-JeOS.x86_64-kvm-and-xen.qcow2 \ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz \ - openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz \ - openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2 + openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz UBUNTU_OLD_IMGS = trusty-server-cloudimg-amd64-disk1.img \ trusty-server-cloudimg-i386-disk1.img \ @@ -52,10 +50,11 @@ UBUNTU_NEW_IMGS = xenial-server-cloudimg-powerpc-disk1.img \ jammy-server-cloudimg-s390x.img UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS) -DOWNLOAD_ASSETS = mbuto \ +DOWNLOAD_ASSETS = mbuto podman \ $(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS) -TESTDATA_ASSETS = small.bin big.bin medium.bin -LOCAL_ASSETS = mbuto.img mbuto.mem.img QEMU_EFI.fd \ 
+TESTDATA_ASSETS = small.bin big.bin medium.bin \ + rampstream +LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \ $(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \ $(UBUNTU_NEW_IMGS:%=prepared-%) \ nstool guest-key guest-key.pub \ @@ -67,13 +66,27 @@ CFLAGS = -Wall -Werror -Wextra -pedantic -std=c99 assets: $(ASSETS) +.PHONY: pull-% +pull-%: % + git -C $* pull + mbuto: git clone git://mbuto.sh/mbuto +mbuto/mbuto: pull-mbuto + +podman: + git clone https://github.com/containers/podman.git + +# To succesfully build podman, you will need gpgme and systemd +# development packages +podman/bin/podman: pull-podman + $(MAKE) -C podman + guest-key guest-key.pub: ssh-keygen -f guest-key -N '' -mbuto.img: passt.mbuto mbuto guest-key.pub $(TESTDATA_ASSETS) +mbuto.img: passt.mbuto mbuto/mbuto guest-key.pub rampstream-check.sh $(TESTDATA_ASSETS) ./mbuto/mbuto -p ./$< -c lz4 -f $@ mbuto.mem.img: passt.mem.mbuto mbuto ../passt.avx2 @@ -121,9 +134,6 @@ realclean: clean debian-8.11.0-openstack-%.qcow2: $(WGET) -O $@ https://cloud.debian.org/images/cloud/OpenStack/archive/8.11.0/debian-8.11.0-openstack-$*.qcow2 -debian-9-nocloud-%-daily-20200210-166.qcow2: - $(WGET) -O $@ https://cloud.debian.org/images/cloud/stretch/daily/20200210-166/debian-9-nocloud-$*-daily-20200210-166.qcow2 - debian-10-nocloud-%.qcow2: $(WGET) -O $@ https://cloud.debian.org/images/cloud/buster/latest/debian-10-nocloud-$*.qcow2 @@ -189,9 +199,6 @@ openSUSE-Tumbleweed-ARM-JeOS-efi.aarch64.raw.xz: openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz: $(WGET) -O $@ http://download.opensuse.org/ports/armv7hl/tumbleweed/appliances/openSUSE-Tumbleweed-ARM-JeOS-efi.armv7l.raw.xz -openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2: - $(WGET) -O $@ https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-kvm-and-xen.qcow2 - # Ubuntu downloads trusty-server-cloudimg-%-disk1.img: $(WGET) -O $@ https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-$*-disk1.img 
diff --git a/test/README.md b/test/README.md index 0936b04..91ca603 100644 --- a/test/README.md +++ b/test/README.md @@ -28,10 +28,11 @@ on a system, i.e. common utilities such as a shell are not included here. Example for Debian, and possibly most Debian-based distributions: - build-essential git jq strace iperf3 qemu-system-x86 tmux sipcalc bats bc - catatonit clang-tidy cppcheck go isc-dhcp-common psmisc linux-cpupower socat - netcat-openbsd fakeroot lz4 lm-sensors qemu-system-arm qemu-system-ppc - qemu-system-misc qemu-system-x86 valgrind + bats bc build-essential catatonit clang-tidy conmon cppcheck crun fakeroot + git go iperf3 isc-dhcp-common jq libgpgme-dev libseccomp-dev linux-cpupower + lm-sensors lz4 netavark netcat-openbsd psmisc qemu-efi-aarch64 + qemu-system-arm qemu-system-misc qemu-system-ppc qemu-system-x86 + qemu-system-x86 sipcalc socat strace tmux uidmap valgrind NOTE: the tests need a qemu version >= 7.2, or one that contains commit 13c6be96618c ("net: stream: add unix socket"): this change introduces support diff --git a/test/lib/layout b/test/lib/layout index f9a1cf1..fddcdc4 100644 --- a/test/lib/layout +++ b/test/lib/layout @@ -15,7 +15,7 @@ # layout_pasta() - Panes for host, pasta, and separate one for namespace layout_pasta() { - sleep 3 + sleep 1 tmux kill-pane -a -t 0 cmd_write 0 clear @@ -46,7 +46,7 @@ layout_pasta() { # layout_passt() - Panes for host, passt, and guest layout_passt() { - sleep 3 + sleep 1 tmux kill-pane -a -t 0 cmd_write 0 clear @@ -77,7 +77,7 @@ layout_passt() { # layout_passt_in_pasta() - Host, passt within pasta, namespace and guest layout_passt_in_pasta() { - sleep 3 + sleep 1 tmux kill-pane -a -t 0 cmd_write 0 clear @@ -113,7 +113,7 @@ layout_passt_in_pasta() { # layout_two_guests() - Two guest panes, two passt panes, plus host and log layout_two_guests() { - sleep 3 + sleep 1 tmux kill-pane -a -t 0 cmd_write 0 clear @@ -135,24 +135,77 @@ layout_two_guests() { get_info_cols pane_watch_contexts ${PANE_GUEST_1} 
"guest #1 in namespace #1" qemu_1 guest_1 + pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #1" qemu_2 guest_2 + + tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done' + tmux send-keys -t ${PANE_INFO} -N 100 C-m + tmux select-pane -t ${PANE_INFO} -T "test log" + + pane_watch_contexts ${PANE_HOST} host host + pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1 + pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #1" pasta_1 passt_2 + + info_layout "two guests, two passt instances, in namespaces" + + sleep 1 +} + +# layout_migrate() - Two guest panes, two passt panes, two passt-repair panes, +# plus host and log +layout_migrate() { + sleep 1 + + tmux kill-pane -a -t 0 + cmd_write 0 clear + + tmux split-window -v -t passt_test + tmux split-window -h -l '33%' + tmux split-window -h -t passt_test:1.1 + + tmux split-window -h -l '35%' -t passt_test:1.0 + tmux split-window -v -t passt_test:1.0 + + tmux split-window -v -t passt_test:1.4 + tmux split-window -v -t passt_test:1.6 + + tmux split-window -v -t passt_test:1.3 + + PANE_GUEST_1=0 + PANE_GUEST_2=1 + PANE_INFO=2 + PANE_MON=3 + PANE_HOST=4 + PANE_PASST_REPAIR_1=5 + PANE_PASST_1=6 + PANE_PASST_REPAIR_2=7 + PANE_PASST_2=8 + + get_info_cols + + pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1 pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2 tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done' tmux send-keys -t ${PANE_INFO} -N 100 C-m tmux select-pane -t ${PANE_INFO} -T "test log" + pane_watch_contexts ${PANE_MON} "QEMU monitor" mon mon + pane_watch_contexts ${PANE_HOST} host host + pane_watch_contexts ${PANE_PASST_REPAIR_1} "passt-repair #1 in namespace #1" repair_1 passt_repair_1 pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1 + + pane_watch_contexts ${PANE_PASST_REPAIR_2} "passt-repair #2 in namespace #2" repair_2 passt_repair_2 
pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2 - info_layout "two guests, two passt instances, in namespaces" + info_layout "two guests, two passt + passt-repair instances, in namespaces" sleep 1 } # layout_demo_pasta() - Four panes for pasta demo layout_demo_pasta() { - sleep 3 + sleep 1 cmd_write 0 cd ${BASEPATH} cmd_write 0 clear @@ -188,7 +241,7 @@ layout_demo_pasta() { # layout_demo_passt() - Four panes for passt demo layout_demo_passt() { - sleep 3 + sleep 1 cmd_write 0 cd ${BASEPATH} cmd_write 0 clear @@ -224,7 +277,7 @@ layout_demo_passt() { # layout_demo_podman() - Four panes for pasta demo with Podman layout_demo_podman() { - sleep 3 + sleep 1 cmd_write 0 cd ${BASEPATH} cmd_write 0 clear diff --git a/test/lib/perf_report b/test/lib/perf_report index 67f9f4e..c4ec817 100755 --- a/test/lib/perf_report +++ b/test/lib/perf_report @@ -18,7 +18,7 @@ PERF_LINK_COUNT=0 PERF_JS="${LOGDIR}/web/perf.js" PERF_TEMPLATE_HTML="document.write('"' -Throughput in Gbps, latency in µs. Threads are <span style="font-family: monospace;">iperf3</span> processes, <i>passt</i> and <i>pasta</i> are currently single-threaded.<br/> +Throughput in Gbps, latency in µs. Threads are <span style="font-family: monospace;">iperf3</span> threads, <i>passt</i> and <i>pasta</i> are currently single-threaded.<br/> Click on numbers to show test execution. Measured at head, commit <span style="font-family: monospace;">__commit__</span>. 
<style type="text/CSS"> @@ -49,6 +49,21 @@ td:empty { visibility: hidden; } __passt_tcp_LINE__ __passt_udp_LINE__ </table> +</li><li><p>passt with vhost-user support</p> +<table class="passt" width="70%"> + <tr> + <th/> + <th id="perf_passt_vu_tcp" colspan="__passt_vu_tcp_cols__">TCP, __passt_vu_tcp_threads__ at __passt_vu_tcp_freq__ GHz</th> + <th id="perf_passt_vu_udp" colspan="__passt_vu_udp_cols__">UDP, __passt_vu_udp_threads__ at __passt_vu_udp_freq__ GHz</th> + </tr> + <tr> + <td align="right">MTU:</td> + __passt_vu_tcp_header__ + __passt_vu_udp_header__ + </tr> + __passt_vu_tcp_LINE__ __passt_vu_udp_LINE__ +</table> + <style type="text/CSS"> table.pasta_local td { border: 0px solid; padding: 6px; line-height: 1; } table.pasta_local td { text-align: right; } @@ -56,7 +71,7 @@ table.pasta_local th { text-align: center; font-weight: bold; } table.pasta_local tr:not(:first-of-type) td:not(:first-of-type) { font-family: monospace; font-weight: bolder; } table.pasta_local tr:nth-child(3n+0) { background-color: #112315; } table.pasta_local tr:not(:nth-child(3n+0)) td { background-color: #101010; } -table.pasta_local td:nth-child(3n+2) { background-color: #603302; } +table.pasta_local td:nth-child(4n+2) { background-color: #603302; } table.pasta_local tr:nth-child(1) { background-color: #363e61; } table.pasta td { border: 0px solid; padding: 6px; line-height: 1; } table.pasta td { text-align: right; } diff --git a/test/lib/setup b/test/lib/setup index 9b39b9f..575bc21 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -15,8 +15,9 @@ INITRAMFS="${BASEPATH}/mbuto.img" VCPUS="$( [ $(nproc) -ge 8 ] && echo 6 || echo $(( $(nproc) / 2 + 1 )) )" -__mem_kib="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)" -VMEM="$((${__mem_kib} / 1024 / 4))" +MEM_KIB="$(sed -n 's/MemTotal:[ ]*\([0-9]*\) kB/\1/p' /proc/meminfo)" +QEMU_ARCH="$(uname -m)" +[ "${QEMU_ARCH}" = "i686" ] && QEMU_ARCH=i386 # setup_build() - Set up pane layout for build tests setup_build() { @@ -44,24 
+45,38 @@ setup_passt() { [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" context_run passt "make clean" context_run passt "make valgrind" - context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -P ${STATESETUP}/passt.pid" + context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt ${__opts} -s ${STATESETUP}/passt.socket -f -t 10001 -u 10001 -H hostname1 --fqdn fqdn1.passt.test -P ${STATESETUP}/passt.pid" # pidfile isn't created until passt is listening wait_for [ -f "${STATESETUP}/passt.pid" ] + __vmem="$((${MEM_KIB} / 1024 / 4))" + if [ ${VHOST_USER} -eq 1 ]; then + __vmem="$(((${__vmem} + 500) / 1000))G" + __qemu_netdev=" \ + -chardev socket,id=c,path=${STATESETUP}/passt.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + else + __qemu_netdev="-device virtio-net-pci,netdev=s \ + -netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket" + fi + GUEST_CID=94557 - context_run_bg qemu 'qemu-system-$(uname -m)' \ + context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \ ' -machine accel=kvm' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ - ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ - ' -device virtio-net-pci,netdev=s0 ' \ - " -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket 
" \ + " ${__qemu_netdev}" \ " -pidfile ${STATESETUP}/qemu.pid" \ " -device vhost-vsock-pci,guest-cid=$GUEST_CID" @@ -124,7 +139,12 @@ setup_passt_in_ns() { [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" - context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold" + __map_host4=192.0.2.1 + __map_host6=2001:db8:9a55::1 + __map_ns4=192.0.2.2 + __map_ns6=2001:db8:9a55::2 + + context_run_bg pasta "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${STATESETUP}/pasta.pid --map-host-loopback ${__map_host4} --map-host-loopback ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns.hold" wait_for [ -f "${STATESETUP}/pasta.pid" ] context_setup_nstool qemu ${STATESETUP}/ns.hold @@ -135,29 +155,43 @@ setup_passt_in_ns() { [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_in_pasta.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" if [ ${VALGRIND} -eq 1 ]; then context_run passt "make clean" context_run passt "make valgrind" - context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid" + context_run_bg passt "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" else context_run passt "make 
clean" context_run passt "make" - context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid" + context_run_bg passt "./passt -f ${__opts} -s ${STATESETUP}/passt.socket -H hostname1 --fqdn fqdn1.passt.test -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P ${STATESETUP}/passt.pid --map-host-loopback ${__map_ns4} --map-host-loopback ${__map_ns6}" fi wait_for [ -f "${STATESETUP}/passt.pid" ] + __vmem="$((${MEM_KIB} / 1024 / 4))" + if [ ${VHOST_USER} -eq 1 ]; then + __vmem="$(((${__vmem} + 500) / 1000))G" + __qemu_netdev=" \ + -chardev socket,id=c,path=${STATESETUP}/passt.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + else + __qemu_netdev="-device virtio-net-pci,netdev=s \ + -netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket" + fi + GUEST_CID=94557 - context_run_bg qemu 'qemu-system-$(uname -m)' \ + context_run_bg qemu 'qemu-system-'"${QEMU_ARCH}" \ ' -machine accel=kvm' \ ' -M accel=kvm:tcg' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ - ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ - ' -device virtio-net-pci,netdev=s0 ' \ - " -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt.socket " \ + " ${__qemu_netdev}" \ " -pidfile ${STATESETUP}/qemu.pid" \ " -device vhost-vsock-pci,guest-cid=$GUEST_CID" @@ -207,41 +241,63 @@ setup_two_guests() { [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" - context_run_bg passt_1 "./passt -s 
${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001" + context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} --fqdn fqdn1.passt.test -H hostname1 -t 10001 -u 10001" wait_for [ -f "${STATESETUP}/passt_1.pid" ] __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + [ ${VHOST_USER} -eq 1 ] && __opts="${__opts} --vhost-user" - context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004" + context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} --hostname hostname2 --fqdn fqdn2 -t 10004 -u 10004" wait_for [ -f "${STATESETUP}/passt_2.pid" ] + __vmem="$((${MEM_KIB} / 1024 / 4))" + if [ ${VHOST_USER} -eq 1 ]; then + __vmem="$(((${__vmem} + 500) / 1000))G" + __qemu_netdev1=" \ + -chardev socket,id=c,path=${STATESETUP}/passt_1.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + __qemu_netdev2=" \ + -chardev socket,id=c,path=${STATESETUP}/passt_2.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + else + __qemu_netdev1="-device virtio-net-pci,netdev=s \ + -netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket" + __qemu_netdev2="-device virtio-net-pci,netdev=s \ + -netdev stream,id=s,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket" + fi + GUEST_1_CID=94557 - context_run_bg qemu_1 'qemu-system-$(uname -m)' \ + context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \ ' -M accel=kvm:tcg' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ - ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ + ' -m '${__vmem}' -cpu 
host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ - ' -device virtio-net-pci,netdev=s0 ' \ - " -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_1.socket " \ + " ${__qemu_netdev1}" \ " -pidfile ${STATESETUP}/qemu_1.pid" \ " -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" GUEST_2_CID=94558 - context_run_bg qemu_2 'qemu-system-$(uname -m)' \ + context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \ ' -M accel=kvm:tcg' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ - ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ ' -nodefaults' \ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ - ' -device virtio-net-pci,netdev=s0 ' \ - " -netdev stream,id=s0,server=off,addr.type=unix,addr.path=${STATESETUP}/passt_2.socket " \ + " ${__qemu_netdev2}" \ " -pidfile ${STATESETUP}/qemu_2.pid" \ " -device vhost-vsock-pci,guest-cid=$GUEST_2_CID" @@ -249,6 +305,117 @@ setup_two_guests() { context_setup_guest guest_2 ${GUEST_2_CID} } +# setup_migrate() - Set up two namespace, run qemu, passt/passt-repair in both +setup_migrate() { + context_setup_host host + context_setup_host mon + context_setup_host pasta_1 + context_setup_host pasta_2 + + layout_migrate + + # Ports: + # + # guest #1 | guest #2 | ns #1 | host + # --------- |-----------|-----------|------------ + # 10001 as server | | to guest | to ns #1 + # 10002 | | as server | to ns #1 + # 10003 | | to init | as server + # 10004 | as server | to guest | to ns #1 + + __opts= + [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/pasta_1.pcap" + [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + + __map_host4=192.0.2.1 + __map_host6=2001:db8:9a55::1 + __map_ns4=192.0.2.2 + __map_ns6=2001:db8:9a55::2 + + # Option 1: send 
stuff via spliced path in pasta + # context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002 -T 10003 -u 10001,10002 -U 10003 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold" + # Option 2: send stuff via tap (--map-guest-addr) instead (useful to see capture of full migration) + context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002,10004 -T 10003 -u 10001,10002,10004 -U 10003 --map-guest-addr ${__map_host4} --map-guest-addr ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold" + context_setup_nstool passt_1 ${STATESETUP}/ns1.hold + context_setup_nstool passt_repair_1 ${STATESETUP}/ns1.hold + + context_setup_nstool passt_2 ${STATESETUP}/ns1.hold + context_setup_nstool passt_repair_2 ${STATESETUP}/ns1.hold + + context_setup_nstool qemu_1 ${STATESETUP}/ns1.hold + context_setup_nstool qemu_2 ${STATESETUP}/ns1.hold + + __ifname="$(context_run qemu_1 "ip -j link show | jq -rM '.[] | select(.link_type == \"ether\").ifname'")" + + sleep 1 + + __opts="--vhost-user" + [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap" + [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + + context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001" + wait_for [ -f "${STATESETUP}/passt_1.pid" ] + + context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair" + + __opts="--vhost-user" + [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap" + [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + + context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004" + wait_for [ -f "${STATESETUP}/passt_2.pid" ] + + context_run_bg passt_repair_2 "./passt-repair ${STATESETUP}/passt_2.socket.repair" + + __vmem="512M" # Keep migration fast + __qemu_netdev1=" \ + -chardev 
socket,id=c,path=${STATESETUP}/passt_1.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + __qemu_netdev2=" \ + -chardev socket,id=c,path=${STATESETUP}/passt_2.socket \ + -netdev vhost-user,id=v,chardev=c \ + -device virtio-net,netdev=v \ + -object memory-backend-memfd,id=m,share=on,size=${__vmem} \ + -numa node,memdev=m" + + GUEST_1_CID=94557 + context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \ + ' -M accel=kvm:tcg' \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ + ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ + ' -nodefaults' \ + ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ + " ${__qemu_netdev1}" \ + " -pidfile ${STATESETUP}/qemu_1.pid" \ + " -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" \ + " -monitor unix:${STATESETUP}/qemu_1_mon.sock,server,nowait" + + GUEST_2_CID=94558 + context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \ + ' -M accel=kvm:tcg' \ + ' -m '${__vmem}' -cpu host -smp '${VCPUS} \ + ' -kernel '"${KERNEL}" \ + ' -initrd '${INITRAMFS}' -nographic -serial stdio' \ + ' -nodefaults' \ + ' -append "console=ttyS0 mitigations=off apparmor=0" ' \ + " ${__qemu_netdev2}" \ + " -pidfile ${STATESETUP}/qemu_2.pid" \ + " -device vhost-vsock-pci,guest-cid=$GUEST_2_CID" \ + " -monitor unix:${STATESETUP}/qemu_2_mon.sock,server,nowait" \ + " -incoming tcp:0:20005" + + context_setup_guest guest_1 ${GUEST_1_CID} + # Only available after migration: + ( context_setup_guest guest_2 ${GUEST_2_CID} & ) +} + # teardown_context_watch() - Remove contexts and stop panes watching them # $1: Pane number watching # $@: Context names @@ -319,7 +486,8 @@ teardown_two_guests() { context_wait pasta_1 context_wait pasta_2 - rm -f "${STATESETUP}/passt__[12].pid" "${STATESETUP}/pasta_[12].pid" + rm "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid" + rm "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid" 
teardown_context_watch ${PANE_HOST} host teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1 @@ -328,6 +496,30 @@ teardown_two_guests() { teardown_context_watch ${PANE_PASST_2} pasta_2 passt_2 } +# teardown_migrate() - Exit namespaces, kill qemu processes, passt and pasta +teardown_migrate() { + ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_1.pid") + ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_2.pid") + context_wait qemu_1 + context_wait qemu_2 + + ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/passt_2.pid") + context_wait passt_1 + context_wait passt_2 + ${NSTOOL} stop "${STATESETUP}/ns1.hold" + context_wait pasta_1 + + rm -f "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid" + rm -f "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid" + + teardown_context_watch ${PANE_HOST} host + + teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1 + teardown_context_watch ${PANE_GUEST_2} qemu_2 guest_2 + teardown_context_watch ${PANE_PASST_1} pasta_1 passt_1 + teardown_context_watch ${PANE_PASST_2} pasta_1 passt_2 +} + # teardown_demo_passt() - Exit namespace, kill qemu, passt and pasta teardown_demo_passt() { tmux send-keys -t ${PANE_GUEST} "C-c" diff --git a/test/lib/setup_ugly b/test/lib/setup_ugly index 4b2a077..2802cc3 100755 --- a/test/lib/setup_ugly +++ b/test/lib/setup_ugly @@ -33,7 +33,7 @@ setup_memory() { pane_or_context_run guest 'qemu-system-$(uname -m)' \ ' -machine accel=kvm' \ - ' -m '${VMEM}' -cpu host -smp '${VCPUS} \ + ' -m '$((${MEM_KIB} / 1024 / 4))' -cpu host -smp '${VCPUS} \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ ' -initrd '${INITRAMFS_MEM}' -nographic -serial stdio' \ ' -nodefaults' \ diff --git a/test/lib/term b/test/lib/term index 262937e..089364c 100755 --- a/test/lib/term +++ b/test/lib/term @@ -19,6 +19,7 @@ STATUS_FILE_INDEX=0 STATUS_COLS= STATUS_PASS=0 STATUS_FAIL=0 +STATUS_SKIPPED=0 PR_RED='\033[1;31m' PR_GREEN='\033[1;32m' @@ -31,8 +32,8 @@ 
PR_DELAY_INIT=100 # ms # $@: Message to print info() { tmux select-pane -t ${PANE_INFO} - echo "${@}" >> $STATEBASE/log_pipe - echo "${@}" >> "${LOGFILE}" + printf "${@}\n" >> $STATEBASE/log_pipe + printf "${@}\n" >> "${LOGFILE}" } # info_n() - Highlight, print message to pane and to log file without newline @@ -47,13 +48,13 @@ info_n() { # $@: Message to print info_nolog() { tmux select-pane -t ${PANE_INFO} - echo "${@}" >> $STATEBASE/log_pipe + printf "${@}\n" >> $STATEBASE/log_pipe } # info_nolog() - Print message to log file # $@: Message to print log() { - echo "${@}" >> "${LOGFILE}" + printf "${@}\n" >> "${LOGFILE}" } # info_nolog_n() - Send message to pane without highlighting it, without newline @@ -97,7 +98,6 @@ display_delay() { switch_pane() { tmux select-pane -t ${1} PR_DELAY=${PR_DELAY_INIT} - display_delay "0.2" } # cmd_write() - Write a command to a pane, letter by letter, and execute it @@ -199,7 +199,7 @@ pane_run() { # $1: Pane name pane_wait() { __lc="$(echo "${1}" | tr [A-Z] [a-z])" - sleep 0.1 || sleep 1 + sleep 0.01 || sleep 1 __done=0 while @@ -207,7 +207,7 @@ pane_wait() { case ${__l} in *"$ " | *"# ") return ;; esac - do sleep 0.1 || sleep 1; done + do sleep 0.01 || sleep 1; done } # pane_parse() - Print last line, @EMPTY@ if command had no output @@ -231,7 +231,7 @@ pane_status() { __status="$(pane_parse "${1}")" while ! [ "${__status}" -eq "${__status}" ] 2>/dev/null; do - sleep 1 + sleep 0.01 || sleep 1 pane_run "${1}" 'echo $?' 
pane_wait "${1}" __status="$(pane_parse "${1}")" @@ -383,6 +383,16 @@ info_check_failed() { printf " < failed.\n" >> "${LOGFILE}" } +# status_bar_blink() - Make status bar blink +status_bar_blink() { + for i in `seq 1 3`; do + tmux set status-right-style 'bg=colour1 fg=colour196 bold' + sleep 0.1 || sleep 1 + tmux set status-right-style 'bg=colour1 fg=colour233 bold' + sleep 0.1 || sleep 1 + done +} + # info_passed() - Display, log, and make status bar blink when a test passes info_passed() { switch_pane ${PANE_INFO} @@ -391,12 +401,7 @@ info_passed() { log "...passed." log - for i in `seq 1 3`; do - tmux set status-right-style 'bg=colour1 fg=colour2 bold' - sleep "0.1" - tmux set status-right-style 'bg=colour1 fg=colour233 bold' - sleep "0.1" - done + [ ${FAST} -eq 1 ] || status_bar_blink } # info_failed() - Display, log, and make status bar blink when a test passes @@ -407,12 +412,7 @@ info_failed() { log "...failed." log - for i in `seq 1 3`; do - tmux set status-right-style 'bg=colour1 fg=colour196 bold' - sleep "0.1" - tmux set status-right-style 'bg=colour1 fg=colour233 bold' - sleep "0.1" - done + [ ${FAST} -eq 1 ] || status_bar_blink pause_continue \ "Press any key to pause test session" \ @@ -440,19 +440,21 @@ info_layout() { # status_test_ok() - Update counter of passed tests, log and display message status_test_ok() { STATUS_PASS=$((STATUS_PASS + 1)) - tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)" + tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)" info_passed } # status_test_fail() - Update counter of failed tests, log and display message status_test_fail() { STATUS_FAIL=$((STATUS_FAIL + 1)) - tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | #(TZ="UTC" date -Iseconds)" + tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)" info_failed } 
# status_test_fail() - Update counter of failed tests, log and display message status_test_skip() { + STATUS_SKIPPED=$((STATUS_SKIPPED + 1)) + tmux set status-right "PASS: ${STATUS_PASS} | FAIL: ${STATUS_FAIL} | SKIPPED: ${STATUS_SKIPPED} | #(TZ="UTC" date -Iseconds)" info_skipped } @@ -665,7 +667,7 @@ pause_continue() { # run_term() - Start tmux session, running entry point, with recording if needed run_term() { - TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG" + TMUX="tmux new-session -s passt_test -eSTATEBASE=$STATEBASE -ePCAP=$PCAP -eDEBUG=$DEBUG -eTRACE=$TRACE -eKERNEL=$KERNEL" if [ ${CI} -eq 1 ]; then printf '\e[8;50;240t' diff --git a/test/lib/test b/test/lib/test index 1d571c3..7349674 100755 --- a/test/lib/test +++ b/test/lib/test @@ -15,20 +15,12 @@ # test_iperf3s() - Start iperf3 server # $1: Destination/server context -# $2: Port number, ${i} is translated to process index -# $3: Number of processes to run in parallel +# $2: Port number test_iperf3s() { __sctx="${1}" __port="${2}" - __procs="$((${3} - 1))" - pane_or_context_run_bg "${__sctx}" \ - 'for i in $(seq 0 '${__procs}'); do' \ - ' iperf3 -s -p'${__port}' &' \ - ' echo $! 
> s${i}.pid; ' \ - 'done' \ - - sleep 1 # Wait for server to be ready + pane_or_context_run "${__sctx}" 'iperf3 -s -p'${__port}' -D -I s.pid' } # test_iperf3k() - Kill iperf3 server @@ -36,9 +28,9 @@ test_iperf3s() { test_iperf3k() { __sctx="${1}" - pane_or_context_run "${__sctx}" 'kill -INT $(cat s*.pid); rm s*.pid' + pane_or_context_run "${__sctx}" 'kill -INT $(cat s.pid)' - sleep 3 # Wait for kernel to free up ports + sleep 1 # Wait for kernel to free up ports } # test_iperf3() - Ugly helper for iperf3 directive @@ -46,37 +38,68 @@ test_iperf3k() { # $2: Source/client context # $3: Destination name or address for client # $4: Port number, ${i} is translated to process index -# $5: Number of processes to run in parallel -# $6: Run time, in seconds +# $5: Run time, in seconds # $@: Client options test_iperf3() { __var="${1}"; shift __cctx="${1}"; shift __dest="${1}"; shift __port="${1}"; shift - __procs="$((${1} - 1))"; shift __time="${1}"; shift - pane_or_context_run "${__cctx}" 'rm -f c*.json' + pane_or_context_run "${__cctx}" 'rm -f c.json' # A 1s wait for connection on what's basically a local link # indicates something is pretty wrong __timeout=1000 pane_or_context_run "${__cctx}" \ - '(' \ - ' for i in $(seq 0 '${__procs}'); do' \ - ' iperf3 -J -c '${__dest}' -p '${__port} \ - ' --connect-timeout '${__timeout} \ - ' -t'${__time}' -i0 -T c${i} '"${@}" \ - ' > c${i}.json &' \ - ' done;' \ - ' wait' \ - ')' + 'iperf3 -J -c '${__dest}' -p '${__port} \ + ' --connect-timeout '${__timeout} \ + ' -t'${__time}' -i0 '"${@}"' > c.json' \ __jval=".end.sum_received.bits_per_second" __bw=$(pane_or_context_output "${__cctx}" \ - 'cat c*.json | jq -rMs "map('${__jval}') | add"') + 'cat c.json | jq -rMs "map('${__jval}') | add"') + + TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )" +} + +# test_iperf3m() - Ugly helper for iperf3 directive, guest migration variant +# $1: Variable name: to put the measure bandwidth into +# $2: Initial source/client 
context +# $3: Second source/client context the guest is moving to +# $4: Destination name or address for client +# $5: Port number, ${i} is translated to process index +# $6: Run time, in seconds +# $7: Client options +test_iperf3m() { + __var="${1}"; shift + __cctx="${1}"; shift + __cctx2="${1}"; shift + __dest="${1}"; shift + __port="${1}"; shift + __time="${1}"; shift + + pane_or_context_run "${__cctx}" 'rm -f c.json' + + # A 1s wait for connection on what's basically a local link + # indicates something is pretty wrong + __timeout=1000 + pane_or_context_run_bg "${__cctx}" \ + 'iperf3 -J -c '${__dest}' -p '${__port} \ + ' --connect-timeout '${__timeout} \ + ' -t'${__time}' -i0 '"${@}"' > c.json' \ + + __jval=".end.sum_received.bits_per_second" + + sleep $((${__time} + 3)) + + pane_or_context_output "${__cctx2}" \ + 'cat c.json' + + __bw=$(pane_or_context_output "${__cctx2}" \ + 'cat c.json | jq -rMs "map('${__jval}') | add"') TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )" } @@ -190,6 +213,12 @@ test_one_line() { "guest2w") pane_or_context_wait guest_2 || TEST_ONE_nok=1 ;; + "mon") + pane_or_context_run mon "${__arg}" || TEST_ONE_nok=1 + ;; + "monb") + pane_or_context_run_bg mon "${__arg}" + ;; "ns") pane_or_context_run ns "${__arg}" || TEST_ONE_nok=1 ;; @@ -305,6 +334,9 @@ test_one_line() { "iperf3") test_iperf3 ${__arg} ;; + "iperf3m") + test_iperf3m ${__arg} + ;; "set") TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__arg%% *}__" "${__arg#* }")" ;; diff --git a/test/memory/passt b/test/memory/passt index 1193af8..7e45724 100644 --- a/test/memory/passt +++ b/test/memory/passt @@ -44,7 +44,7 @@ endef def start_stop_diff guest sed /proc/slabinfo -ne 's/^\([^ ]* *[^ ]* *[^ ]* *[^ ]*\).*/\\\1/p' > /tmp/slabinfo.before guest cat /proc/meminfo > /tmp/meminfo.before -guest /bin/passt.avx2 -l /tmp/log -s /tmp/sock -P /tmp/pid __OPTS__ --netns-only +guest /bin/passt.avx2 -l /tmp/log -s /tmp/sock -P /tmp/pid __OPTS__ sleep 2 guest 
cat /proc/meminfo > /tmp/meminfo.after guest sed /proc/slabinfo -ne 's/^\([^ ]* *[^ ]* *[^ ]* *[^ ]*\).*/\\\1/p' > /tmp/slabinfo.after @@ -78,9 +78,16 @@ guest mount -o bind /proc /test/proc guest mount -o bind /dev /test/dev guest cp -Lr /bin /lib /lib64 /usr /sbin /test/ +guest exec switch_root /test /bin/sh + guest ulimit -Hn 300000 -guest unshare -rUm -R /test -guest chroot . +guest unshare -rUn +guest ip link add eth0 type dummy +guest ip link set eth0 up +guest ip address add 192.0.2.2/24 dev eth0 +guest ip address add 2001:db8::2/64 dev eth0 +guest ip route add default via 192.0.2.1 +guest ip -6 route add default via 2001:db8::1 dev eth0 guest meminfo_size() { grep "^$2:" $1 | tr -s ' ' | cut -f2 -d ' '; } guest meminfo_diff() { echo $(( $(meminfo_size $2 $3) - $(meminfo_size $1 $3) )); } @@ -103,27 +110,17 @@ info th symbol MiB set WHAT tcp_buf_discard nm_row -set WHAT tcp6_l2_buf +set WHAT flowtab nm_row -set WHAT tcp4_l2_buf +set WHAT tcp6_payload nm_row -set WHAT tc +set WHAT tcp4_payload nm_row set WHAT pkt_buf nm_row -set WHAT udp_splice_map -nm_row -set WHAT udp6_l2_buf -nm_row -set WHAT udp4_l2_buf -nm_row -set WHAT udp_tap_map +set WHAT udp_payload nm_row -set WHAT icmp_id_map -nm_row -set WHAT udp_splice_buf -nm_row -set WHAT tc_hash +set WHAT flow_hashtab nm_row set WHAT pool_tap6_storage nm_row @@ -142,8 +139,6 @@ set WHAT pid slab_row set WHAT dentry slab_row -set WHAT Acpi-Parse -slab_row set WHAT kmalloc-64 slab_row set WHAT kmalloc-32 diff --git a/test/migrate/basic b/test/migrate/basic new file mode 100644 index 0000000..3f11f7d --- /dev/null +++ b/test/migrate/basic @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/basic - Check basic migration functionality +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + 
+g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: guest1/guest2 > host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc +sleep 1 +# Option 1: via spliced path in pasta, namespace to host +# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003 +# Option 2: via --map-guest-addr (tap) in pasta, namespace to host +guest1b { printf "Hello from guest 1"; sleep 3; 
printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006 +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] diff --git a/test/migrate/basic_fin b/test/migrate/basic_fin new file mode 100644 index 0000000..aa61ec5 --- /dev/null +++ b/test/migrate/basic_fin @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/basic_fin - Outbound traffic across migration, half-closed socket +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 
addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: guest1, half-close, guest2 > host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' + +hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg +#hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc + +#sleep 20 +# Option 1: via spliced path in pasta, namespace to host +# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003 +# Option 2: via --map-guest-addr (tap) in pasta, namespace to host +guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006 +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] diff --git a/test/migrate/bidirectional b/test/migrate/bidirectional new file mode 100644 index 0000000..4c04081 --- /dev/null +++ b/test/migrate/bidirectional @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/bidirectional - Check migration with messages in both directions +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | 
select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4: guest1/guest2 > host, host > guest1/guest2 +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' + +hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc +guest1b socat -u TCP4-LISTEN:10001 OPEN:msg,create,trunc +sleep 1 + +guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006 +hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001 +sleep 1 +guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock +host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +sleep 2 +guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null +host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null + +hostw +# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1, +# use sleep 1 for the moment +sleep 1 + +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] + +g2out MSG cat msg +check [ "__MSG__" = "Dear guest 1, you are now guest 2" ] diff --git a/test/migrate/bidirectional_fin b/test/migrate/bidirectional_fin new file mode 100644 index 0000000..1c13527 --- /dev/null +++ b/test/migrate/bidirectional_fin @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: 
GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/bidirectional_fin - Both directions, half-closed sockets +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4: guest1/guest2 <- (half closed) -> host +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' + +hostb echo FIN | socat TCP4-LISTEN:10006,shut-down STDIO,ignoreeof > __STATESETUP__/msg +guest1b echo FIN | socat TCP4-LISTEN:10001,shut-down STDIO,ignoreeof > msg +sleep 1 + +guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006 +hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001 +sleep 1 +guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock +host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +sleep 2 +guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null +host printf " 
you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null + +hostw +# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1, +# use sleep 1 for the moment +sleep 1 + +hout MSG cat __STATESETUP__/msg +check [ "__MSG__" = "Hello from guest 1 and from guest 2" ] + +g2out MSG cat msg +check [ "__MSG__" = "Dear guest 1, you are now guest 2" ] diff --git a/test/migrate/iperf3_bidir6 b/test/migrate/iperf3_bidir6 new file mode 100644 index 0000000..4bfefb5 --- /dev/null +++ b/test/migrate/iperf3_bidir6 @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_bidir6 - Migration behaviour with many bidirectional flows +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 128 +set TIME 3 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N --bidir + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e 
'.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 host <-> guest flood, many flows, during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_in4 b/test/migrate/iperf3_in4 new file mode 100644 index 0000000..c5f3916 --- /dev/null +++ b/test/migrate/iperf3_in4 @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_in4 - Migration behaviour under inbound IPv4 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +guest1 /sbin/sysctl -w net.core.rmem_max=33554432 +guest1 /sbin/sysctl -w net.core.wmem_max=33554432 + +set THREADS 1 +set TIME 4 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip 
link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4 host to guest throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_in6 b/test/migrate/iperf3_in6 new file mode 100644 index 0000000..16cf504 --- /dev/null +++ b/test/migrate/iperf3_in6 @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_in6 - Migration behaviour under inbound IPv6 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 4 +set TIME 3 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == 
"__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 host to guest throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_many_out6 b/test/migrate/iperf3_many_out6 new file mode 100644 index 0000000..88133f2 --- /dev/null +++ b/test/migrate/iperf3_many_out6 @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_many_out6 - Migration behaviour with many outbound flows +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 16 +set TIME 3 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N -l 1M + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route 
show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 guest to host flood, many flows, during migration + +test TCP/IPv6 host to guest throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_out4 b/test/migrate/iperf3_out4 new file mode 100644 index 0000000..968057b --- /dev/null +++ b/test/migrate/iperf3_out4 @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_out4 - Migration behaviour under outbound IPv4 
flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 6 +set TIME 2 +set OMIT 0.1 +set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test TCP/IPv4 guest to host throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/iperf3_out6 b/test/migrate/iperf3_out6 new file mode 100644 index 0000000..21fbfcd --- /dev/null +++ b/test/migrate/iperf3_out6 @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/iperf3_out6 - Migration behaviour under outbound IPv6 flood +# +# Copyright (c) 2025 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +set THREADS 6 +set TIME 2 +set OMIT 0.1 +set OPTS -P __THREADS__ -O__OMIT__ -Z -N -l 1M + +test 
Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv6 guest to host throughput during migration + +monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +iperf3s host 10006 +iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ +bw __BW__ 1 2 + +iperf3k host diff --git a/test/migrate/rampstream_in b/test/migrate/rampstream_in new file mode 100644 index 0000000..df333ba --- /dev/null +++ b/test/migrate/rampstream_in @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode 
+# +# test/migrate/rampstream_in - Check sequence correctness with inbound ramp +# +# Copyright (c) 2025 Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 +set RAMPS 6000000 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: sequence check, ramps, inbound +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +guest1b socat -u TCP4-LISTEN:10001 EXEC:"rampstream-check.sh __RAMPS__" +sleep 1 +hostb socat -u EXEC:"test/rampstream send __RAMPS__" TCP4:__ADDR1__:10001 + +sleep 1 + +monb 
echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw + +guest2 cat rampstream.err +guest2 [ $(cat rampstream.status) -eq 0 ] diff --git a/test/migrate/rampstream_out b/test/migrate/rampstream_out new file mode 100644 index 0000000..8ed3229 --- /dev/null +++ b/test/migrate/rampstream_out @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/migrate/rampstream_out - Check sequence correctness with outbound ramp +# +# Copyright (c) 2025 Red Hat +# Author: David Gibson <david@gibson.dropbear.id.au> + +g1tools ip jq dhclient socat cat +htools ip jq + +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 +set RAMPS 6000000 + +test Interface name +g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME1__" ] + +test DHCP: address +guest1 ip link set dev __IFNAME1__ up +guest1 /sbin/dhclient -4 __IFNAME1__ +g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR1__" = "__HOST_ADDR__" ] + +test DHCPv6: address +# Link is up now, wait for DAD to complete +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest1 /sbin/dhclient -6 __IFNAME1__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | 
select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] + +test TCP/IPv4: sequence check, ramps, outbound +g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +hostb socat -u TCP4-LISTEN:10006 EXEC:"test/rampstream check __RAMPS__" +sleep 1 +guest1b socat -u EXEC:"rampstream send __RAMPS__" TCP4:__MAP_HOST4__:10006 +sleep 1 + +mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock + +hostw diff --git a/test/nstool.c b/test/nstool.c index 1bdf44e..7ab5d2a 100644 --- a/test/nstool.c +++ b/test/nstool.c @@ -31,10 +31,15 @@ #define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0]))) -#define die(...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - exit(1); \ +#define die(...) \ + do { \ + fprintf(stderr, "nstool: " __VA_ARGS__); \ + exit(1); \ + } while (0) + +#define err(...) 
\ + do { \ + fprintf(stderr, "nstool: " __VA_ARGS__); \ } while (0) struct ns_type { @@ -156,6 +161,9 @@ static int connect_ctl(const char *sockpath, bool wait, static void cmd_hold(int argc, char *argv[]) { + struct sigaction sa = { + .sa_handler = SIG_IGN, + }; int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX); struct sockaddr_un addr; const char *sockpath = argv[1]; @@ -185,6 +193,10 @@ static void cmd_hold(int argc, char *argv[]) if (!getcwd(info.cwd, sizeof(info.cwd))) die("getcwd(): %s\n", strerror(errno)); + rc = sigaction(SIGPIPE, &sa, NULL); + if (rc) + die("sigaction(SIGPIPE): %s\n", strerror(errno)); + do { int afd = accept(fd, NULL, NULL); char buf; @@ -193,17 +205,21 @@ static void cmd_hold(int argc, char *argv[]) die("accept(): %s\n", strerror(errno)); rc = write(afd, &info, sizeof(info)); - if (rc < 0) - die("write(): %s\n", strerror(errno)); + if (rc < 0) { + err("holder write() to control socket: %s\n", + strerror(errno)); + } if ((size_t)rc < sizeof(info)) - die("short write() on control socket\n"); + err("holder short write() on control socket\n"); rc = read(afd, &buf, sizeof(buf)); - if (rc < 0) - die("read(): %s\n", strerror(errno)); + if (rc < 0) { + err("holder read() on control socket: %s\n", + strerror(errno)); + } close(afd); - } while (rc == 0); + } while (rc <= 0); unlink(sockpath); } @@ -345,21 +361,43 @@ static int openns(const char *fmt, ...) 
return fd; } +static pid_t sig_pid; +static void sig_propagate(int signum) +{ + int err; + + err = kill(sig_pid, signum); + if (err) + die("Propagating %s: %s\n", strsignal(signum), strerror(errno)); +} + static void wait_for_child(pid_t pid) { - int status; + struct sigaction sa = { + .sa_handler = sig_propagate, + .sa_flags = SA_RESETHAND, + }; + int status, err; + + sig_pid = pid; + err = sigaction(SIGTERM, &sa, NULL); + if (err) + die("sigaction(SIGTERM): %s\n", strerror(errno)); /* Match the child's exit status, if possible */ for (;;) { pid_t rc; rc = waitpid(pid, &status, WUNTRACED); - if (rc < 0) + if (rc < 0) { + if (errno == EINTR) + continue; die("waitpid() on %d: %s\n", pid, strerror(errno)); + } if (rc != pid) die("waitpid() on %d returned %d", pid, rc); if (WIFSTOPPED(status)) { - /* Stop the parent to patch */ + /* Stop the parent to match */ kill(getpid(), SIGSTOP); /* We must have resumed, resume the child */ kill(pid, SIGCONT); @@ -508,7 +546,7 @@ static void cmd_exec(int argc, char *argv[]) /* CHILD */ if (argc > optind + 1) { exe = argv[optind + 1]; - xargs = (const char * const*)(argv + optind + 1); + xargs = (const char *const *)(argv + optind + 1); } else { exe = getenv("SHELL"); if (!exe) diff --git a/test/passt.mbuto b/test/passt.mbuto index 6240d5c..5e00132 100755 --- a/test/passt.mbuto +++ b/test/passt.mbuto @@ -13,7 +13,16 @@ PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl - nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}" + nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump + env}" + +# OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and +# sshd-session the per-session program. We need the latter as well, and the path +# depends on the distribution. It doesn't exist on older versions. 
+for bin in /usr/lib/openssh/sshd-session /usr/lib/ssh/sshd-session \ + /usr/libexec/openssh/sshd-session; do + command -v "${bin}" >/dev/null && PROGS="${PROGS} ${bin}" +done KMODS="${KMODS:- virtio_net virtio_pci vmw_vsock_virtio_transport}" @@ -23,7 +32,7 @@ LINKS="${LINKS:- DIRS="${DIRS} /tmp /usr/sbin /usr/share /var/log /var/lib /etc/ssh /run/sshd /root/.ssh" -COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin" +COPIES="${COPIES} small.bin,/root/small.bin medium.bin,/root/medium.bin big.bin,/root/big.bin rampstream,/bin/rampstream rampstream-check.sh,/bin/rampstream-check.sh" FIXUP="${FIXUP}"' mv /sbin/* /usr/sbin || : @@ -33,6 +42,7 @@ FIXUP="${FIXUP}"' #!/bin/sh LOG=/var/log/dhclient-script.log echo \${reason} \${interface} >> \$LOG +env >> \$LOG set >> \$LOG [ -n "\${new_interface_mtu}" ] && ip link set dev \${interface} mtu \${new_interface_mtu} @@ -46,7 +56,8 @@ set >> \$LOG [ -n "\${new_ip6_address}" ] && ip addr add \${new_ip6_address}/\${new_ip6_prefixlen} dev \${interface} [ -n "\${new_dhcp6_name_servers}" ] && for d in \${new_dhcp6_name_servers}; do echo "nameserver \${d}%\${interface}" >> /etc/resolv.conf; done [ -n "\${new_dhcp6_domain_search}" ] && (printf "search"; for d in \${new_dhcp6_domain_search}; do printf " %s" "\${d}"; done; printf "\n") >> /etc/resolv.conf -[ -n "\${new_host_name}" ] && hostname "\${new_host_name}" +[ -n "\${new_host_name}" ] && echo "\${new_host_name}" > /tmp/new_host_name +[ -n "\${new_fqdn_fqdn}" ] && echo "\${new_fqdn_fqdn}" > /tmp/new_fqdn_fqdn exit 0 EOF chmod 755 /sbin/dhclient-script @@ -54,9 +65,10 @@ EOF ln -s /run /var/run :> /etc/fstab - # sshd(dropbear) via vsock + # sshd via vsock cat > /etc/passwd << EOF root:x:0:0:root:/root:/bin/sh +tcpdump:x:72:72:tcpdump:/:/sbin/nologin sshd:x:100:100:Privilege-separated SSH:/var/empty/sshd:/sbin/nologin EOF cat > /etc/shadow << EOF @@ -64,7 +76,9 @@ root:::0:99999:7::: EOF chmod 000 /etc/shadow - :> /etc/ssh/sshd_config + 
cat > /etc/ssh/sshd_config << EOF +Subsystem sftp internal-sftp +EOF ssh-keygen -A chmod 700 /root/.ssh chmod 700 /run/sshd @@ -76,7 +90,7 @@ EOF EOF chmod 600 /root/.ssh/authorized_keys chmod 700 /root - socat VSOCK-LISTEN:22,fork EXEC:"sshd -i -e" 2> /var/log/vsock-ssh.log & + socat VSOCK-LISTEN:22,fork EXEC:"/sbin/sshd -i -e" 2> /var/log/vsock-ssh.log & sh +m ' diff --git a/test/passt.mem.mbuto b/test/passt.mem.mbuto index 56f5139..532eae0 100755 --- a/test/passt.mem.mbuto +++ b/test/passt.mem.mbuto @@ -12,7 +12,7 @@ PROGS="${PROGS:-ash,dash,bash chmod ip mount insmod mkdir ln cat chmod modprobe grep mknod sed chown sleep bc ls ps mount unshare chroot cp kill diff - head tail sort tr tee cut nm which}" + head tail sort tr tee cut nm which switch_root}" KMODS="${KMODS:- dummy}" @@ -29,13 +29,6 @@ COPIES="${COPIES} ../passt.avx2,/bin/passt.avx2" FIXUP="${FIXUP}"' ln -s /bin /usr/bin chmod 777 /tmp -ip link add eth0 type dummy -ip link set eth0 up -ip address add 192.0.2.2/24 dev eth0 -ip address add 2001:db8::2/64 dev eth0 -ip route add default via 192.0.2.1 -ip -6 route add default via 2001:db8::1 dev eth0 -sleep 2 sh +m ' diff --git a/test/passt/dhcp b/test/passt/dhcp index 53ee641..145f1ba 100644 --- a/test/passt/dhcp +++ b/test/passt/dhcp @@ -11,7 +11,7 @@ # Copyright (c) 2021 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -gtools ip jq dhclient sed tr +gtools ip jq dhclient sed tr hostname htools ip jq sed tr head test Interface name @@ -38,7 +38,7 @@ check [ __MTU__ = 65520 ] test DHCP: DNS gout DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/' hout HOST_DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | head -n3 | tr '\n' ',' | sed 's/,$//;s/$/\n/' -check [ "__DNS__" = "__HOST_DNS__" ] || [ "__DNS__" = "__HOST_GW__" -a "__HOST_DNS__" = "127.0.0.1" ] +check [ "__DNS__" = "__HOST_DNS__" ] || ( [ "__DNS__" = "__HOST_GW__" ] && expr "__HOST_DNS__" : "127[.]" ) # FQDNs 
should be terminated by dots, but the guest DHCP client might omit them: # strip them first @@ -47,10 +47,21 @@ gout SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^searc hout HOST_SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' check [ "__SEARCH__" = "__HOST_SEARCH__" ] +test DHCP: Hostname +gout NEW_HOST_NAME cat /tmp/new_host_name +check [ "__NEW_HOST_NAME__" = "hostname1" ] + +test DHCP: Client FQDN +gout NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn +check [ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ] + test DHCPv6: address +guest rm /tmp/new_fqdn_fqdn guest /sbin/dhclient -6 __IFNAME__ +# Wait for DAD to complete +guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' -hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' check [ "__ADDR6__" = "__HOST_ADDR6__" ] test DHCPv6: route @@ -68,3 +79,7 @@ test DHCPv6: search list gout SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' hout HOST_SEARCH6 sed 's/\. 
/ /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' check [ "__SEARCH6__" = "__HOST_SEARCH6__" ] + +test DHCPv6: Hostname +gout NEW_FQDN_FQDN cat /tmp/new_fqdn_fqdn +check [ "__NEW_FQDN_FQDN__" = "fqdn1.passt.test" ] diff --git a/test/passt/ndp b/test/passt/ndp index 7b2dbfe..516cd6b 100644 --- a/test/passt/ndp +++ b/test/passt/ndp @@ -16,14 +16,16 @@ htools ip jq sipcalc grep cut test Interface name gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -guest ip link set dev __IFNAME__ up && sleep 2 +guest ip link set dev __IFNAME__ up +# Wait for SLAAC & DAD to complete +guest while ! ip -j -6 addr show dev __IFNAME__ | jq -e '.[].addr_info.[] | select(.protocol == "kernel_ra")'; do sleep 0.1; done hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' check [ -n "__IFNAME__" ] test SLAAC: prefix -gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]' -gout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4 -hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]' +gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' +gout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4 +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 check [ "__PREFIX6__" = "__HOST_PREFIX6__" ] diff --git a/test/passt_in_ns/dhcp b/test/passt_in_ns/dhcp new file mode 100644 index 0000000..a38a690 --- /dev/null +++ b/test/passt_in_ns/dhcp @@ -0,0 +1,75 @@ +# 
SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/passt/dhcp - Check DHCP and DHCPv6 functionality in passt mode +# +# Copyright (c) 2021 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +gtools ip jq dhclient sed tr +htools ip jq sed tr head + +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test Interface name +gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' +hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' +check [ -n "__IFNAME__" ] + +test DHCP: address +guest /sbin/dhclient -4 __IFNAME__ +gout ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local' +hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local' +check [ "__ADDR__" = "__HOST_ADDR__" ] + +test DHCP: route +gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' +hout HOST_GW ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]' +check [ "__GW__" = "__HOST_GW__" ] + +test DHCP: MTU +gout MTU ip -j link show | jq -rM '.[] | select(.ifname == "__IFNAME__").mtu' +check [ __MTU__ = 65520 ] + +test DHCP: DNS +gout DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/' +hout HOST_DNS sed -n 's/^nameserver \([0-9]*\.\)\(.*\)/\1\2/p' /etc/resolv.conf | head -n3 | tr '\n' ',' | sed 's/,$//;s/$/\n/' +check [ "__DNS__" = "__HOST_DNS__" ] || ( [ "__DNS__" = "__MAP_NS4__" ] && expr "__HOST_DNS__" : "127[.]" ) + +# FQDNs should be terminated by dots, but the guest DHCP client might omit them: +# strip them first +test DHCP: search list +gout SEARCH sed 's/\. 
/ /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' +hout HOST_SEARCH sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' +check [ "__SEARCH__" = "__HOST_SEARCH__" ] + +test DHCPv6: address +guest /sbin/dhclient -6 __IFNAME__ +# Wait for DAD to complete +guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +gout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' +check [ "__ADDR6__" = "__HOST_ADDR6__" ] + +test DHCPv6: route +gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' +hout HOST_GW6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").gateway] | .[0]' +check [ "__GW6__" = "__HOST_GW6__" ] + +# Strip interface specifier: interface names might differ between host and guest +test DHCPv6: DNS +gout DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/' +hout HOST_DNS6 sed -n 's/^nameserver \([^:]*:\)\([^%]*\).*/\1\2/p' /etc/resolv.conf | tr '\n' ',' | sed 's/,$//;s/$/\n/' +check [ "__DNS6__" = "__HOST_DNS6__" ] || [ "__DNS6__" = "__MAP_NS6__" -a "__HOST_DNS6__" = "::1" ] + +test DHCPv6: search list +gout SEARCH6 sed 's/\. / /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' +hout HOST_SEARCH6 sed 's/\. 
/ /g' /etc/resolv.conf | sed 's/\.$//g' | sed -n 's/^search \(.*\)/\1/p' | tr ' \n' ',' | sed 's/,$//;s/$/\n/' +check [ "__SEARCH6__" = "__HOST_SEARCH6__" ] diff --git a/test/passt_in_ns/tcp b/test/passt_in_ns/tcp index cdb7060..319880b 100644 --- a/test/passt_in_ns/tcp +++ b/test/passt_in_ns/tcp @@ -15,6 +15,11 @@ gtools socat ip jq htools socat ip jq nstools socat ip jq +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + set TEMP_BIG __STATEDIR__/test_big.bin set TEMP_SMALL __STATEDIR__/test_small.bin set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin @@ -27,7 +32,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10001 guestw guest cmp test_big.bin /root/big.bin -test TCP/IPv4: host to ns: big transfer +test TCP/IPv4: host to ns (spliced): big transfer nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002 @@ -36,16 +41,15 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin test TCP/IPv4: guest to host: big transfer hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' sleep 1 -guest socat -u OPEN:/root/big.bin TCP4:__GW__:10003 +guest socat -u OPEN:/root/big.bin TCP4:__MAP_HOST4__:10003 hostw check cmp __TEMP_BIG__ __BASEPATH__/big.bin test TCP/IPv4: guest to ns: big transfer nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc sleep 1 -guest socat -u OPEN:/root/big.bin TCP4:__GW__:10002 +guest socat -u OPEN:/root/big.bin TCP4:__MAP_NS4__:10002 nsw check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin @@ -59,7 +63,7 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin test TCP/IPv4: ns to host (via tap): big transfer hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc sleep 1 -ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003 +ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__MAP_HOST4__:10003 hostw check cmp __TEMP_BIG__ 
__BASEPATH__/big.bin @@ -86,7 +90,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10001 guestw guest cmp test_small.bin /root/small.bin -test TCP/IPv4: host to ns: small transfer +test TCP/IPv4: host to ns (spliced): small transfer nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002 @@ -95,16 +99,15 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin test TCP/IPv4: guest to host: small transfer hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' sleep 1 -guest socat -u OPEN:/root/small.bin TCP4:__GW__:10003 +guest socat -u OPEN:/root/small.bin TCP4:__MAP_HOST4__:10003 hostw check cmp __TEMP_SMALL__ __BASEPATH__/small.bin test TCP/IPv4: guest to ns: small transfer nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc sleep 1 -guest socat -u OPEN:/root/small.bin TCP4:__GW__:10002 +guest socat -u OPEN:/root/small.bin TCP4:__MAP_NS4__:10002 nsw check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin @@ -118,7 +121,7 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin test TCP/IPv4: ns to host (via tap): small transfer hostb socat -u TCP4-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc sleep 1 -ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003 +ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__MAP_HOST4__:10003 hostw check cmp __TEMP_SMALL__ __BASEPATH__/small.bin @@ -143,7 +146,7 @@ host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10001 guestw guest cmp test_big.bin /root/big.bin -test TCP/IPv6: host to ns: big transfer +test TCP/IPv6: host to ns (spliced): big transfer nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002 @@ -152,17 +155,15 @@ check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin test TCP/IPv6: guest to host: big transfer hostb socat -u TCP6-LISTEN:10003 
OPEN:__TEMP_BIG__,create,trunc -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10003 +guest socat -u OPEN:/root/big.bin TCP6:[__MAP_HOST6__]:10003 hostw check cmp __TEMP_BIG__ __BASEPATH__/big.bin test TCP/IPv6: guest to ns: big transfer nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc sleep 1 -guest socat -u OPEN:/root/big.bin TCP6:[__GW6__%__IFNAME__]:10002 +guest socat -u OPEN:/root/big.bin TCP6:[__MAP_NS6__]:10002 nsw check cmp __TEMP_NS_BIG__ __BASEPATH__/big.bin @@ -175,9 +176,8 @@ check cmp __TEMP_BIG__ __BASEPATH__/big.bin test TCP/IPv6: ns to host (via tap): big transfer hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_BIG__,create,trunc -nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003 +ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__MAP_HOST6__]:10003 hostw check cmp __TEMP_BIG__ __BASEPATH__/big.bin @@ -190,6 +190,7 @@ guest cmp test_big.bin /root/big.bin test TCP/IPv6: ns to guest (using namespace address): big transfer guestb socat -u TCP6-LISTEN:10001 OPEN:test_big.bin,create,trunc +nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local' sleep 1 ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__ADDR6__]:10001 @@ -203,7 +204,7 @@ host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10001 guestw guest cmp test_small.bin /root/small.bin -test TCP/IPv6: host to ns: small transfer +test TCP/IPv6: host to ns (spliced): small transfer nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002 @@ -212,17 +213,15 @@ check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin 
test TCP/IPv6: guest to host: small transfer hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10003 +guest socat -u OPEN:/root/small.bin TCP6:[__MAP_HOST6__]:10003 hostw check cmp __TEMP_SMALL__ __BASEPATH__/small.bin test TCP/IPv6: guest to ns: small transfer nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__ sleep 1 -guest socat -u OPEN:/root/small.bin TCP6:[__GW6__%__IFNAME__]:10002 +guest socat -u OPEN:/root/small.bin TCP6:[__MAP_NS6__]:10002 nsw check cmp __TEMP_NS_SMALL__ __BASEPATH__/small.bin @@ -235,9 +234,8 @@ check cmp __TEMP_SMALL__ __BASEPATH__/small.bin test TCP/IPv6: ns to host (via tap): small transfer hostb socat -u TCP6-LISTEN:10003 OPEN:__TEMP_SMALL__,create,trunc -nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__GW6__%__IFNAME__]:10003 +ns socat -u OPEN:__BASEPATH__/small.bin TCP6:[__MAP_HOST6__]:10003 hostw check cmp __TEMP_SMALL__ __BASEPATH__/small.bin diff --git a/test/passt_in_ns/udp b/test/passt_in_ns/udp index 8a02513..791511c 100644 --- a/test/passt_in_ns/udp +++ b/test/passt_in_ns/udp @@ -15,6 +15,11 @@ gtools socat ip jq nstools socat ip jq htools socat ip jq +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + set TEMP __STATEDIR__/test.bin set TEMP_NS __STATEDIR__/test_ns.bin @@ -25,7 +30,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10001,shut-null guestw guest cmp test.bin /root/medium.bin -test UDP/IPv4: host to ns +test UDP/IPv4: host to ns (recvmmsg/sendmmsg) nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null @@ 
-34,16 +39,15 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin test UDP/IPv4: guest to host hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' sleep 1 -guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10003,shut-null +guest socat -u OPEN:/root/medium.bin UDP4:__MAP_HOST4__:10003,shut-null hostw check cmp __TEMP__ __BASEPATH__/medium.bin test UDP/IPv4: guest to ns nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc sleep 1 -guest socat -u OPEN:/root/medium.bin UDP4:__GW__:10002,shut-null +guest socat -u OPEN:/root/medium.bin UDP4:__MAP_NS4__:10002,shut-null nsw check cmp __TEMP_NS__ __BASEPATH__/medium.bin @@ -57,7 +61,7 @@ check cmp __TEMP__ __BASEPATH__/medium.bin test UDP/IPv4: ns to host (via tap) hostb socat -u UDP4-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc sleep 1 -ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null +ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__MAP_HOST4__:10003,shut-null hostw check cmp __TEMP__ __BASEPATH__/medium.bin @@ -84,7 +88,7 @@ host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10001,shut-null guestw guest cmp test.bin /root/medium.bin -test UDP/IPv6: host to ns +test UDP/IPv6: host to ns (recvmmsg/sendmmsg) nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc sleep 1 host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null @@ -93,17 +97,15 @@ check cmp __TEMP_NS__ __BASEPATH__/medium.bin test UDP/IPv6: guest to host hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null +guest socat -u OPEN:/root/medium.bin UDP6:[__MAP_HOST6__]:10003,shut-null hostw check cmp __TEMP__ __BASEPATH__/medium.bin test 
UDP/IPv6: guest to ns nsb socat -u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc sleep 1 -guest socat -u OPEN:/root/medium.bin UDP6:[__GW6__%__IFNAME__]:10002,shut-null +guest socat -u OPEN:/root/medium.bin UDP6:[__MAP_NS6__]:10002,shut-null nsw check cmp __TEMP_NS__ __BASEPATH__/medium.bin @@ -116,9 +118,8 @@ check cmp __TEMP__ __BASEPATH__/medium.bin test UDP/IPv6: ns to host (via tap) hostb socat -u UDP6-LISTEN:10003,null-eof OPEN:__TEMP__,create,trunc -nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' sleep 1 -ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__GW6__%__IFNAME__]:10003,shut-null +ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__MAP_HOST6__]:10003,shut-null hostw check cmp __TEMP__ __BASEPATH__/medium.bin @@ -131,6 +132,7 @@ guest cmp test.bin /root/medium.bin test UDP/IPv6: ns to guest (using namespace address) guestb socat -u UDP6-LISTEN:10001,null-eof OPEN:test.bin,create,trunc +nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local' sleep 1 ns socat -u OPEN:__BASEPATH__/medium.bin UDP6:[__ADDR6__]:10001,shut-null diff --git a/test/passt_vu b/test/passt_vu new file mode 120000 index 0000000..22f1840 --- /dev/null +++ b/test/passt_vu @@ -0,0 +1 @@ +passt
\ No newline at end of file diff --git a/test/passt_vu_in_ns b/test/passt_vu_in_ns new file mode 120000 index 0000000..3ff479e --- /dev/null +++ b/test/passt_vu_in_ns @@ -0,0 +1 @@ +passt_in_ns
\ No newline at end of file diff --git a/test/pasta/dhcp b/test/pasta/dhcp index 112633a..d4f3ad5 100644 --- a/test/pasta/dhcp +++ b/test/pasta/dhcp @@ -35,9 +35,11 @@ check [ __MTU__ = 65520 ] test DHCPv6: address ns /sbin/dhclient -6 --no-pid __IFNAME__ +# Wait for DAD to complete +ns while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]' nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.prefixlen == 128).local] | .[0]' -hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' check [ __ADDR6__ = __HOST_ADDR6__ ] test DHCPv6: route diff --git a/test/pasta/ndp b/test/pasta/ndp index 2a8afe6..952c1ea 100644 --- a/test/pasta/ndp +++ b/test/pasta/ndp @@ -18,12 +18,13 @@ test Interface name nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' check [ -n "__IFNAME__" ] ns ip link set dev __IFNAME__ up -sleep 2 +# Wait for SLAAC & DAD to complete +ns while ! 
ip -j -6 addr show dev __IFNAME__ | jq -e '.[].addr_info.[] | select(.protocol == "kernel_ra")'; do sleep 0.1; done test SLAAC: prefix -nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local] | .[0]' -nsout PREFIX6 sipcalc __ADDR6__/64 | grep prefix | cut -d' ' -f4 -hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]' +nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.protocol == "kernel_ra") | .local + "/" + (.prefixlen | tostring)] | .[0]' +nsout PREFIX6 sipcalc __ADDR6__ | grep prefix | cut -d' ' -f4 +hout HOST_ADDR6 ip -j -6 addr show|jq -rM ['.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' hout HOST_PREFIX6 sipcalc __HOST_ADDR6__/64 | grep prefix | cut -d' ' -f4 check [ "__PREFIX6__" = "__HOST_PREFIX6__" ] diff --git a/test/pasta/tcp b/test/pasta/tcp index 6ab18c5..53b6f25 100644 --- a/test/pasta/tcp +++ b/test/pasta/tcp @@ -19,8 +19,8 @@ set TEMP_NS_BIG __STATEDIR__/test_ns_big.bin set TEMP_SMALL __STATEDIR__/test_small.bin set TEMP_NS_SMALL __STATEDIR__/test_ns_small.bin -test TCP/IPv4: host to ns: big transfer -nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_BIG__,create,trunc +test TCP/IPv4: host to ns (spliced): big transfer +nsb socat -u TCP4-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc host socat -u OPEN:__BASEPATH__/big.bin TCP4:127.0.0.1:10002 nsw check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__ @@ -38,8 +38,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP4:__GW__:10003 hostw check cmp __BASEPATH__/big.bin __TEMP_BIG__ -test TCP/IPv4: host to ns: small transfer -nsb socat -u TCP4-LISTEN:10002,bind=127.0.0.1 OPEN:__TEMP_NS_SMALL__,create,trunc +test TCP/IPv4: host to ns (spliced): small transfer +nsb socat -u TCP4-LISTEN:10002 
OPEN:__TEMP_NS_SMALL__,create,trunc host socat OPEN:__BASEPATH__/small.bin TCP4:127.0.0.1:10002 nsw check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__ @@ -57,8 +57,8 @@ ns socat -u OPEN:__BASEPATH__/small.bin TCP4:__GW__:10003 hostw check cmp __BASEPATH__/small.bin __TEMP_SMALL__ -test TCP/IPv6: host to ns: big transfer -nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_BIG__,create,trunc +test TCP/IPv6: host to ns (spliced): big transfer +nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_BIG__,create,trunc host socat -u OPEN:__BASEPATH__/big.bin TCP6:[::1]:10002 nsw check cmp __BASEPATH__/big.bin __TEMP_NS_BIG__ @@ -77,8 +77,8 @@ ns socat -u OPEN:__BASEPATH__/big.bin TCP6:[__GW6__%__IFNAME__]:10003 hostw check cmp __BASEPATH__/big.bin __TEMP_BIG__ -test TCP/IPv6: host to ns: small transfer -nsb socat -u TCP6-LISTEN:10002,bind=[::1] OPEN:__TEMP_NS_SMALL__,create,trunc +test TCP/IPv6: host to ns (spliced): small transfer +nsb socat -u TCP6-LISTEN:10002 OPEN:__TEMP_NS_SMALL__,create,trunc host socat -u OPEN:__BASEPATH__/small.bin TCP6:[::1]:10002 nsw check cmp __BASEPATH__/small.bin __TEMP_NS_SMALL__ diff --git a/test/pasta/udp b/test/pasta/udp index 30e3a85..7734d02 100644 --- a/test/pasta/udp +++ b/test/pasta/udp @@ -17,8 +17,8 @@ htools dd socat ip jq set TEMP __STATEDIR__/test.bin set TEMP_NS __STATEDIR__/test_ns.bin -test UDP/IPv4: host to ns -nsb socat -u UDP4-LISTEN:10002,bind=127.0.0.1,null-eof OPEN:__TEMP_NS__,create,trunc +test UDP/IPv4: host to ns (recvmmsg/sendmmsg) +nsb socat -u UDP4-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc host socat OPEN:__BASEPATH__/medium.bin UDP4:127.0.0.1:10002,shut-null nsw check cmp __BASEPATH__/medium.bin __TEMP_NS__ @@ -37,8 +37,8 @@ ns socat -u OPEN:__BASEPATH__/medium.bin UDP4:__GW__:10003,shut-null hostw check cmp __BASEPATH__/medium.bin __TEMP__ -test UDP/IPv6: host to ns -nsb socat -u UDP6-LISTEN:10002,bind=[::1],null-eof OPEN:__TEMP_NS__,create,trunc +test UDP/IPv6: host to ns (recvmmsg/sendmmsg) +nsb socat 
-u UDP6-LISTEN:10002,null-eof OPEN:__TEMP_NS__,create,trunc host socat -u OPEN:__BASEPATH__/medium.bin UDP6:[::1]:10002,shut-null nsw check cmp __BASEPATH__/medium.bin __TEMP_NS__ diff --git a/test/pasta_options/log_to_file b/test/pasta_options/log_to_file index fcdd553..3ead06c 100644 --- a/test/pasta_options/log_to_file +++ b/test/pasta_options/log_to_file @@ -19,7 +19,7 @@ sleep 1 endef def flood_log_client -host tcp_crr --nolog -P 10001 -C 10002 -6 -c -H ::1 +host tcp_crr --nolog -l1 -P 10001 -C 10002 -6 -c -H ::1 endef def check_log_size_mountns @@ -33,19 +33,16 @@ test Log creation set PORTS -t 10001,10002 -u 10001,10002 set LOG_FILE __STATEDIR__/pasta.log -passt ./pasta -l __LOG_FILE__ -passtb exit -sleep 1 +passt ./pasta -l __LOG_FILE__ -- /bin/true check [ -s __LOG_FILE__ ] test Log truncated on creation -passt ./pasta -l __LOG_FILE__ -passtb exit -sleep 1 -check [ $(cat __LOG_FILE__ | wc -l) -eq 1 ] +passt ./pasta -l __LOG_FILE__ -- /bin/true & wait +pout PID2 echo $! +check head -1 __LOG_FILE__ | grep '^pasta .* [(]__PID2__[)]$' test Maximum log size -passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -P 10001 -C 10002 -6; done' +passtb ./pasta --config-net -d -f -l __LOG_FILE__ --log-size $((100 * 1024)) -- sh -c 'while true; do tcp_crr --nolog -l1 -P 10001 -C 10002 -6; done' sleep 1 flood_log_client diff --git a/test/pasta_podman/bats b/test/pasta_podman/bats index 21446f0..2f07be8 100644 --- a/test/pasta_podman/bats +++ b/test/pasta_podman/bats @@ -11,11 +11,16 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio <sbrivio@redhat.com> -htools git make go bats catatonit ip jq socat +htools git make go bats ip jq socat ./test/podman/bin/podman + +set PODMAN test/podman/bin/podman +hout WD pwd + +test Podman pasta path + +hout PASTA_BIN CONTAINERS_HELPER_BINARY_DIR="__WD__" __PODMAN__ info --format "{{.Host.Pasta.Executable}}" +check [ "__PASTA_BIN__" = "__WD__/pasta" ] test 
Podman system test with bats -host git -C __STATEDIR__ clone https://github.com/containers/podman.git -host make -C __STATEDIR__/podman -hout WD pwd -host PODMAN="__STATEDIR__/podman/bin/podman" CONTAINERS_HELPER_BINARY_DIR="__WD__" bats __STATEDIR__/podman/test/system/505-networking-pasta.bats +host PODMAN="__PODMAN__" CONTAINERS_HELPER_BINARY_DIR="__WD__" taskset -c 1 bats test/podman/test/system/505-networking-pasta.bats diff --git a/test/perf/passt_tcp b/test/perf/passt_tcp index 631a407..5978c49 100644 --- a/test/perf/passt_tcp +++ b/test/perf/passt_tcp @@ -15,6 +15,9 @@ gtools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr # From neper nstools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr htools bc head sed seq +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + test passt: throughput and latency guest /sbin/sysctl -w net.core.rmem_max=536870912 @@ -29,42 +32,39 @@ ns /sbin/sysctl -w net.ipv4.tcp_rmem="4096 524288 134217728" ns /sbin/sysctl -w net.ipv4.tcp_wmem="4096 524288 134217728" ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0 -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ -set THREADS 1 -set STREAMS 8 -set TIME 10 +set THREADS 4 +set TIME 1 set OMIT 0.1 -set OPTS -Z -P __STREAMS__ -l 1M -O__OMIT__ +set OPTS -Z -P __THREADS__ -l 1M -O__OMIT__ -info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz, __STREAMS__ streams +info Throughput in Gbps, latency in µs, __THREADS__ threads at 
__FREQ__ GHz report passt tcp __THREADS__ __FREQ__ th MTU 256B 576B 1280B 1500B 9000B 65520B tr TCP throughput over IPv6: guest to host -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 bw - bw - guest ip link set dev __IFNAME__ mtu 1280 -iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 4M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 4M bw __BW__ 1.2 1.5 guest ip link set dev __IFNAME__ mtu 1500 -iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 4M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 4M bw __BW__ 1.6 1.8 guest ip link set dev __IFNAME__ mtu 9000 -iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 8M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 8M bw __BW__ 4.0 5.0 guest ip link set dev __IFNAME__ mtu 65520 -iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 16M +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M bw __BW__ 7.0 8.0 iperf3k ns @@ -76,7 +76,7 @@ lat - lat - lat - nsb tcp_rr --nolog -6 -gout LAT tcp_rr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT tcp_rr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tl TCP CRR latency over IPv6: guest to host @@ -86,33 +86,39 @@ lat - lat - lat - nsb tcp_crr --nolog -6 -gout LAT tcp_crr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT tcp_crr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 500 400 tr TCP throughput over IPv4: guest to host -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 guest ip link set dev __IFNAME__ mtu 256 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 1M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 1M bw __BW__ 0.2 0.3 guest ip link set dev __IFNAME__ mtu 576 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 1M +iperf3 BW guest __MAP_NS4__ 10002 
__TIME__ __OPTS__ -w 1M bw __BW__ 0.5 0.8 guest ip link set dev __IFNAME__ mtu 1280 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 4M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M bw __BW__ 1.2 1.5 guest ip link set dev __IFNAME__ mtu 1500 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 4M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M bw __BW__ 1.6 1.8 guest ip link set dev __IFNAME__ mtu 9000 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 8M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M bw __BW__ 4.0 5.0 guest ip link set dev __IFNAME__ mtu 65520 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -w 16M +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M bw __BW__ 7.0 8.0 iperf3k ns +# Reducing MTU below 1280 deconfigures IPv6, get our address back +guest dhclient -6 -x +guest dhclient -6 __IFNAME__ +# Wait for DAD to complete +guest while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done + tl TCP RR latency over IPv4: guest to host lat - lat - @@ -120,7 +126,7 @@ lat - lat - lat - nsb tcp_rr --nolog -4 -gout LAT tcp_rr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT tcp_rr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tl TCP CRR latency over IPv4: guest to host @@ -130,18 +136,18 @@ lat - lat - lat - nsb tcp_crr --nolog -4 -gout LAT tcp_crr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT tcp_crr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 500 400 tr TCP throughput over IPv6: host to guest -iperf3s guest 100${i}1 __THREADS__ +iperf3s guest 10001 bw - bw - bw - bw - bw - -iperf3 BW ns ::1 100${i}1 __THREADS__ __TIME__ __OPTS__ +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ bw __BW__ 6.0 6.8 iperf3k guest @@ -154,7 +160,7 @@ lat - lat - guestb tcp_rr --nolog -P 10001 -C 10011 -6 sleep 1 -nsout LAT tcp_rr 
--nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tl TCP CRR latency over IPv6: host to guest @@ -165,19 +171,19 @@ lat - lat - guestb tcp_crr --nolog -P 10001 -C 10011 -6 sleep 1 -nsout LAT tcp_crr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 500 350 tr TCP throughput over IPv4: host to guest -iperf3s guest 100${i}1 __THREADS__ +iperf3s guest 10001 bw - bw - bw - bw - bw - -iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ bw __BW__ 6.0 6.8 iperf3k guest @@ -190,7 +196,7 @@ lat - lat - guestb tcp_rr --nolog -P 10001 -C 10011 -4 sleep 1 -nsout LAT tcp_rr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tl TCP CRR latency over IPv6: host to guest @@ -201,7 +207,7 @@ lat - lat - guestb tcp_crr --nolog -P 10001 -C 10011 -4 sleep 1 -nsout LAT tcp_crr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 500 300 te diff --git a/test/perf/passt_udp b/test/perf/passt_udp index 10f638f..4c66c41 100644 --- a/test/perf/passt_udp +++ b/test/perf/passt_udp @@ -15,6 +15,9 @@ gtools /sbin/sysctl ip jq nproc sleep iperf3 udp_rr # From neper nstools ip jq sleep iperf3 udp_rr htools bc head sed +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + test passt: throughput and latency guest /sbin/sysctl -w net.core.rmem_max=16777216 @@ -22,38 +25,33 @@ guest /sbin/sysctl -w net.core.wmem_max=16777216 guest /sbin/sysctl -w 
net.core.rmem_default=16777216 guest /sbin/sysctl -w net.core.wmem_default=16777216 -gout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' -gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' - hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ -set THREADS 4 -set STREAMS 1 -set TIME 10 -set OPTS -u -P __STREAMS__ --pacing-timer 1000 +set THREADS 2 +set TIME 1 +set OPTS -u -P __THREADS__ --pacing-timer 1000 -info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz, one stream each +info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz report passt udp __THREADS__ __FREQ__ th pktlen 256B 576B 1280B 1500B 9000B 65520B tr UDP throughput over IPv6: guest to host -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 # (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header bw - bw - -iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1232 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 3G -l 1232 bw __BW__ 0.8 1.2 -iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1452 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 4G -l 1452 bw __BW__ 1.0 1.5 -iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 5G -l 8952 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 8G -l 8952 bw __BW__ 4.0 5.0 -iperf3 BW guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 7G -l 64372 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 15G -l 64372 bw __BW__ 4.0 5.0 iperf3k ns @@ 
-65,25 +63,25 @@ lat - lat - lat - nsb udp_rr --nolog -6 -gout LAT udp_rr --nolog -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT udp_rr --nolog -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tr UDP throughput over IPv4: guest to host -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 # (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 500M -l 228 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 1G -l 228 bw __BW__ 0.0 0.0 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 1G -l 548 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 2G -l 548 bw __BW__ 0.4 0.6 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1252 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 3G -l 1252 bw __BW__ 0.8 1.2 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1472 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 4G -l 1472 bw __BW__ 1.0 1.5 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 6G -l 8972 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 8G -l 8972 bw __BW__ 4.0 5.0 -iperf3 BW guest __GW__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 7G -l 65492 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 15G -l 65492 bw __BW__ 4.0 5.0 iperf3k ns @@ -95,23 +93,23 @@ lat - lat - lat - nsb udp_rr --nolog -4 -gout LAT udp_rr --nolog -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +gout LAT udp_rr --nolog -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' lat __LAT__ 200 150 tr UDP throughput over IPv6: host to guest -iperf3s guest 100${i}1 __THREADS__ +iperf3s guest 10001 # (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header bw - bw - -iperf3 BW ns ::1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1232 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 3G -l 1232 bw __BW__ 0.8 1.2 -iperf3 BW ns 
::1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1452 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 4G -l 1452 bw __BW__ 1.0 1.5 -iperf3 BW ns ::1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 8952 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 8G -l 8952 bw __BW__ 3.0 4.0 -iperf3 BW ns ::1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 64372 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 15G -l 64372 bw __BW__ 3.0 4.0 iperf3k guest @@ -129,20 +127,20 @@ lat __LAT__ 200 150 tr UDP throughput over IPv4: host to guest -iperf3s guest 100${i}1 __THREADS__ +iperf3s guest 10001 # (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header -iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 1G -l 228 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 1G -l 228 bw __BW__ 0.0 0.0 -iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 1G -l 548 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 2G -l 548 bw __BW__ 0.4 0.6 -iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1252 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 3G -l 1252 bw __BW__ 0.8 1.2 -iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1472 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 4G -l 1472 bw __BW__ 1.0 1.5 -iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 8972 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 8G -l 8972 bw __BW__ 3.0 4.0 -iperf3 BW ns 127.0.0.1 100${i}1 __THREADS__ __TIME__ __OPTS__ -b 3G -l 65492 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 15G -l 65492 bw __BW__ 3.0 4.0 iperf3k guest diff --git a/test/perf/passt_vu_tcp b/test/perf/passt_vu_tcp new file mode 100644 index 0000000..c4409b9 --- /dev/null +++ b/test/perf/passt_vu_tcp @@ -0,0 +1,211 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# 
test/perf/passt_vu_tcp - Check TCP performance in passt vhost-user mode +# +# Copyright (c) 2021 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +gtools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr # From neper +nstools /sbin/sysctl ip jq nproc seq sleep iperf3 tcp_rr tcp_crr +htools bc head sed seq + +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test passt: throughput and latency + +guest /sbin/sysctl -w net.core.rmem_max=536870912 +guest /sbin/sysctl -w net.core.wmem_max=536870912 +guest /sbin/sysctl -w net.core.rmem_default=33554432 +guest /sbin/sysctl -w net.core.wmem_default=33554432 +guest /sbin/sysctl -w net.ipv4.tcp_rmem="4096 131072 268435456" +guest /sbin/sysctl -w net.ipv4.tcp_wmem="4096 131072 268435456" +guest /sbin/sysctl -w net.ipv4.tcp_timestamps=0 + +ns /sbin/sysctl -w net.ipv4.tcp_rmem="4096 524288 134217728" +ns /sbin/sysctl -w net.ipv4.tcp_wmem="4096 524288 134217728" +ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0 + +gout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' + +hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 +hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l +hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ + +set THREADS 6 +set TIME 2 +set OMIT 0.1 +set OPTS -Z -P __THREADS__ -O__OMIT__ -N + +info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz +report passt_vu tcp __THREADS__ __FREQ__ + +th MTU 256B 576B 1280B 1500B 9000B 65520B + + +tr TCP throughput over IPv6: guest to host +iperf3s ns 10002 + +bw - +bw - +guest ip link set dev __IFNAME__ mtu 1280 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 16M -l 1M +bw __BW__ 1.2 1.5 +guest ip link set dev __IFNAME__ mtu 1500 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 32M -l 1M +bw 
__BW__ 1.6 1.8 +guest ip link set dev __IFNAME__ mtu 9000 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M -l 1M +bw __BW__ 4.0 5.0 +guest ip link set dev __IFNAME__ mtu 65520 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -w 64M -l 1M +bw __BW__ 7.0 8.0 + +iperf3k ns + +tl TCP RR latency over IPv6: guest to host +lat - +lat - +lat - +lat - +lat - +nsb tcp_rr --nolog -6 +gout LAT tcp_rr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +tl TCP CRR latency over IPv6: guest to host +lat - +lat - +lat - +lat - +lat - +nsb tcp_crr --nolog -6 +gout LAT tcp_crr --nolog -l1 -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 500 400 + +tr TCP throughput over IPv4: guest to host +iperf3s ns 10002 + +guest ip link set dev __IFNAME__ mtu 256 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 2M -l 1M +bw __BW__ 0.2 0.3 +guest ip link set dev __IFNAME__ mtu 576 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 4M -l 1M +bw __BW__ 0.5 0.8 +guest ip link set dev __IFNAME__ mtu 1280 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 8M -l 1M +bw __BW__ 1.2 1.5 +guest ip link set dev __IFNAME__ mtu 1500 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 16M -l 1M +bw __BW__ 1.6 1.8 +guest ip link set dev __IFNAME__ mtu 9000 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M -l 1M +bw __BW__ 4.0 5.0 +guest ip link set dev __IFNAME__ mtu 65520 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -w 64M -l 1M +bw __BW__ 7.0 8.0 + +iperf3k ns + +# Reducing MTU below 1280 deconfigures IPv6, get our address back +guest dhclient -6 -x +guest dhclient -6 __IFNAME__ + +tl TCP RR latency over IPv4: guest to host +lat - +lat - +lat - +lat - +lat - +nsb tcp_rr --nolog -4 +gout LAT tcp_rr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +tl TCP CRR latency over IPv4: guest to host +lat - +lat - +lat - +lat - +lat - +nsb tcp_crr --nolog -4 
+gout LAT tcp_crr --nolog -l1 -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 500 400 + +tr TCP throughput over IPv6: host to guest +iperf3s guest 10001 + +bw - +bw - +bw - +bw - +bw - +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -w 256M -l 16k +bw __BW__ 6.0 6.8 + +iperf3k guest + +tl TCP RR latency over IPv6: host to guest +lat - +lat - +lat - +lat - +lat - +guestb tcp_rr --nolog -P 10001 -C 10011 -6 +sleep 1 +nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +tl TCP CRR latency over IPv6: host to guest +lat - +lat - +lat - +lat - +lat - +guestb tcp_crr --nolog -P 10001 -C 10011 -6 +sleep 1 +nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 500 350 + + +tr TCP throughput over IPv4: host to guest +iperf3s guest 10001 + +bw - +bw - +bw - +bw - +bw - +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -w 256M -l 16k +bw __BW__ 6.0 6.8 + +iperf3k guest + +tl TCP RR latency over IPv4: host to guest +lat - +lat - +lat - +lat - +lat - +guestb tcp_rr --nolog -P 10001 -C 10011 -4 +sleep 1 +nsout LAT tcp_rr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +tl TCP CRR latency over IPv6: host to guest +lat - +lat - +lat - +lat - +lat - +guestb tcp_crr --nolog -P 10001 -C 10011 -4 +sleep 1 +nsout LAT tcp_crr --nolog -l1 -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 500 300 + +te diff --git a/test/perf/passt_vu_udp b/test/perf/passt_vu_udp new file mode 100644 index 0000000..943ac11 --- /dev/null +++ b/test/perf/passt_vu_udp @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/perf/passt_vu_udp - Check UDP performance in passt vhost-user mode +# +# 
Copyright (c) 2021 Red Hat GmbH +# Author: Stefano Brivio <sbrivio@redhat.com> + +gtools /sbin/sysctl ip jq nproc sleep iperf3 udp_rr # From neper +nstools ip jq sleep iperf3 udp_rr +htools bc head sed + +set MAP_NS4 192.0.2.2 +set MAP_NS6 2001:db8:9a55::2 + +test passt: throughput and latency + +guest /sbin/sysctl -w net.core.rmem_max=16777216 +guest /sbin/sysctl -w net.core.wmem_max=16777216 +guest /sbin/sysctl -w net.core.rmem_default=16777216 +guest /sbin/sysctl -w net.core.wmem_default=16777216 + +hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 +hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l +hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ + +set THREADS 2 +set TIME 1 +set OPTS -u -P __THREADS__ --pacing-timer 1000 + +info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz + +report passt_vu udp __THREADS__ __FREQ__ + +th pktlen 256B 576B 1280B 1500B 9000B 65520B + +tr UDP throughput over IPv6: guest to host +iperf3s ns 10002 +# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header + +bw - +bw - +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 3G -l 1232 +bw __BW__ 0.8 1.2 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 4G -l 1452 +bw __BW__ 1.0 1.5 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 10G -l 8952 +bw __BW__ 4.0 5.0 +iperf3 BW guest __MAP_NS6__ 10002 __TIME__ __OPTS__ -b 20G -l 64372 +bw __BW__ 4.0 5.0 + +iperf3k ns + +tl UDP RR latency over IPv6: guest to host +lat - +lat - +lat - +lat - +lat - +nsb udp_rr --nolog -6 +gout LAT udp_rr --nolog -6 -c -H __MAP_NS6__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + + +tr UDP throughput over IPv4: guest to host +iperf3s ns 10002 +# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header + +iperf3 
BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 1G -l 228 +bw __BW__ 0.0 0.0 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 2G -l 548 +bw __BW__ 0.4 0.6 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 3G -l 1252 +bw __BW__ 0.8 1.2 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 4G -l 1472 +bw __BW__ 1.0 1.5 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 10G -l 8972 +bw __BW__ 4.0 5.0 +iperf3 BW guest __MAP_NS4__ 10002 __TIME__ __OPTS__ -b 20G -l 65492 +bw __BW__ 4.0 5.0 + +iperf3k ns + +tl UDP RR latency over IPv4: guest to host +lat - +lat - +lat - +lat - +lat - +nsb udp_rr --nolog -4 +gout LAT udp_rr --nolog -4 -c -H __MAP_NS4__ | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + + +tr UDP throughput over IPv6: host to guest +iperf3s guest 10001 +# (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header + +bw - +bw - +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 3G -l 1232 +bw __BW__ 0.8 1.2 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 4G -l 1452 +bw __BW__ 1.0 1.5 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 10G -l 8952 +bw __BW__ 3.0 4.0 +iperf3 BW ns ::1 10001 __TIME__ __OPTS__ -b 20G -l 64372 +bw __BW__ 3.0 4.0 + +iperf3k guest + +tl UDP RR latency over IPv6: host to guest +lat - +lat - +lat - +lat - +lat - +guestb udp_rr --nolog -P 10001 -C 10011 -6 +sleep 1 +nsout LAT udp_rr --nolog -P 10001 -C 10011 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + + +tr UDP throughput over IPv4: host to guest +iperf3s guest 10001 +# (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header + +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 1G -l 228 +bw __BW__ 0.0 0.0 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 2G -l 548 +bw __BW__ 0.4 0.6 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 3G -l 1252 +bw __BW__ 0.8 1.2 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 4G -l 1472 +bw __BW__ 1.0 1.5 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 10G -l 
8972 +bw __BW__ 3.0 4.0 +iperf3 BW ns 127.0.0.1 10001 __TIME__ __OPTS__ -b 20G -l 65492 +bw __BW__ 3.0 4.0 + +iperf3k guest + +tl UDP RR latency over IPv4: host to guest +lat - +lat - +lat - +lat - +lat - +guestb udp_rr --nolog -P 10001 -C 10011 -4 +sleep 1 +nsout LAT udp_rr --nolog -P 10001 -C 10011 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +lat __LAT__ 200 150 + +te diff --git a/test/perf/pasta_tcp b/test/perf/pasta_tcp index 7777532..bc0de3c 100644 --- a/test/perf/pasta_tcp +++ b/test/perf/pasta_tcp @@ -14,6 +14,9 @@ htools head ip seq bc sleep iperf3 tcp_rr tcp_crr jq sed nstools /sbin/sysctl nproc ip seq sleep iperf3 tcp_rr tcp_crr jq sed +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 + test pasta: throughput and latency (local connections) ns /sbin/sysctl -w net.ipv4.tcp_rmem="131072 524288 134217728" @@ -21,101 +24,100 @@ ns /sbin/sysctl -w net.ipv4.tcp_wmem="131072 524288 134217728" ns /sbin/sysctl -w net.ipv4.tcp_timestamps=0 -set THREADS 2 -set STREAMS 2 -set TIME 10 +set THREADS 4 +set TIME 1 set OMIT 0.1 -set OPTS -Z -w 4M -l 1M -P __STREAMS__ -O__OMIT__ +set OPTS -Z -w 4M -l 1M -P __THREADS__ -O__OMIT__ hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq) ) | bc -l hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ -info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz, __STREAMS__ streams each +info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz report pasta lo_tcp __THREADS__ __FREQ__ th MTU 65535B tr TCP throughput over IPv6: ns to host -iperf3s host 100${i}3 __THREADS__ +iperf3s host 10003 -iperf3 BW ns ::1 100${i}3 __THREADS__ __TIME__ __OPTS__ +iperf3 BW ns ::1 10003 __THREADS__ __TIME__ __OPTS__ bw __BW__ 15.0 20.0 iperf3k host tl TCP RR 
latency over IPv6: ns to host hostb tcp_rr --nolog -P 10003 -C 10013 -6 -nsout LAT tcp_rr --nolog -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 150 100 tl TCP CRR latency over IPv6: ns to host hostb tcp_crr --nolog -P 10003 -C 10013 -6 -nsout LAT tcp_crr --nolog -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 500 350 tr TCP throughput over IPv4: ns to host -iperf3s host 100${i}3 __THREADS__ +iperf3s host 10003 -iperf3 BW ns 127.0.0.1 100${i}3 __THREADS__ __TIME__ __OPTS__ +iperf3 BW ns 127.0.0.1 10003 __THREADS__ __TIME__ __OPTS__ bw __BW__ 15.0 20.0 iperf3k host tl TCP RR latency over IPv4: ns to host hostb tcp_rr --nolog -P 10003 -C 10013 -4 -nsout LAT tcp_rr --nolog -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 150 100 tl TCP CRR latency over IPv4: ns to host hostb tcp_crr --nolog -P 10003 -C 10013 -4 -nsout LAT tcp_crr --nolog -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 500 350 tr TCP throughput over IPv6: host to ns -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 -iperf3 BW host ::1 100${i}2 __THREADS__ __TIME__ __OPTS__ +iperf3 BW host ::1 10002 __TIME__ __OPTS__ bw __BW__ 15.0 20.0 iperf3k ns tl TCP RR latency over IPv6: host to ns nsb tcp_rr --nolog -P 10002 -C 10012 -6 -hout LAT tcp_rr --nolog -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 150 100 tl TCP 
CRR latency over IPv6: host to ns nsb tcp_crr --nolog -P 10002 -C 10012 -6 -hout LAT tcp_crr --nolog -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -6 -c -H ::1 | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 1000 700 tr TCP throughput over IPv4: host to ns -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 -iperf3 BW host 127.0.0.1 100${i}2 __THREADS__ __TIME__ __OPTS__ +iperf3 BW host 127.0.0.1 10002 __TIME__ __OPTS__ bw __BW__ 15.0 20.0 iperf3k ns tl TCP RR latency over IPv4: host to ns nsb tcp_rr --nolog -P 10002 -C 10012 -4 -hout LAT tcp_rr --nolog -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 150 100 tl TCP CRR latency over IPv4: host to ns nsb tcp_crr --nolog -P 10002 -C 10012 -4 -hout LAT tcp_crr --nolog -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -4 -c -H 127.0.0.1 | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 1000 700 @@ -123,32 +125,29 @@ te test pasta: throughput and latency (connections via tap) -nsout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' -nsout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -set THREADS 1 -set STREAMS 2 -set OPTS -Z -P __STREAMS__ -i1 -O__OMIT__ +set THREADS 2 +set OPTS -Z -P __THREADS__ -i1 -O__OMIT__ -info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz, __STREAMS__ streams +info Throughput in Gbps, latency in µs, __THREADS__ threads at __FREQ__ GHz report pasta tap_tcp __THREADS__ __FREQ__ th MTU 1500B 4000B 16384B 65520B tr TCP throughput over IPv6: ns to host -iperf3s host 100${i}3 __THREADS__ +iperf3s host 10003 ns ip link set dev __IFNAME__ mtu 1500 -iperf3 BW ns 
__GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 512k +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 512k bw __BW__ 0.2 0.4 ns ip link set dev __IFNAME__ mtu 4000 -iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 1M +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 1M bw __BW__ 0.3 0.5 ns ip link set dev __IFNAME__ mtu 16384 -iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 8M +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 8M bw __BW__ 1.5 2.0 ns ip link set dev __IFNAME__ mtu 65520 -iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 8M +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -w 8M bw __BW__ 2.0 2.5 iperf3k host @@ -158,7 +157,7 @@ lat - lat - lat - hostb tcp_rr --nolog -P 10003 -C 10013 -6 -nsout LAT tcp_rr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 150 100 @@ -167,25 +166,25 @@ lat - lat - lat - hostb tcp_crr --nolog -P 10003 -C 10013 -6 -nsout LAT tcp_crr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 1500 500 tr TCP throughput over IPv4: ns to host -iperf3s host 100${i}3 __THREADS__ +iperf3s host 10003 ns ip link set dev __IFNAME__ mtu 1500 -iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 512k +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 512k bw __BW__ 0.2 0.4 ns ip link set dev __IFNAME__ mtu 4000 -iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 1M +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 1M bw __BW__ 0.3 0.5 ns ip link set dev __IFNAME__ mtu 16384 -iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 8M +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ 
__OPTS__ -w 8M bw __BW__ 1.5 2.0 ns ip link set dev __IFNAME__ mtu 65520 -iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -w 8M +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -w 8M bw __BW__ 2.0 2.5 iperf3k host @@ -195,7 +194,7 @@ lat - lat - lat - hostb tcp_rr --nolog -P 10003 -C 10013 -4 -nsout LAT tcp_rr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_rr --nolog -l1 -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 150 100 @@ -204,19 +203,19 @@ lat - lat - lat - hostb tcp_crr --nolog -P 10003 -C 10013 -4 -nsout LAT tcp_crr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT tcp_crr --nolog -l1 -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 1500 500 tr TCP throughput over IPv6: host to ns -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local' +nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]' bw - bw - bw - -iperf3 BW host __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ +iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ bw __BW__ 8.0 10.0 iperf3k ns @@ -226,7 +225,7 @@ lat - lat - lat - nsb tcp_rr --nolog -P 10002 -C 10012 -6 -hout LAT tcp_rr --nolog -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 150 100 @@ -236,19 +235,19 @@ lat - lat - sleep 1 nsb tcp_crr --nolog -P 10002 -C 10012 -6 -hout LAT tcp_crr --nolog -P 10002 -C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_crr --nolog -l1 -P 10002 
-C 10012 -6 -c -H __ADDR6__ | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 5000 10000 tr TCP throughput over IPv4: host to ns -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 nsout ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local' bw - bw - bw - -iperf3 BW host __ADDR__ 100${i}2 __THREADS__ __TIME__ __OPTS__ +iperf3 BW host __ADDR__ 10002 __TIME__ __OPTS__ bw __BW__ 8.0 10.0 iperf3k ns @@ -258,7 +257,7 @@ lat - lat - lat - nsb tcp_rr --nolog -P 10002 -C 10012 -4 -hout LAT tcp_rr --nolog -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_rr --nolog -l1 -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 150 100 @@ -268,7 +267,7 @@ lat - lat - sleep 1 nsb tcp_crr --nolog -P 10002 -C 10012 -4 -hout LAT tcp_crr --nolog -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p' +hout LAT tcp_crr --nolog -l1 -P 10002 -C 10012 -4 -c -H __ADDR__ | sed -n 's/^throughput=\(.*\)/\1/p' nsw lat __LAT__ 5000 10000 diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp index 5e3db1e..ab2f3e8 100644 --- a/test/perf/pasta_udp +++ b/test/perf/pasta_udp @@ -14,6 +14,9 @@ htools bc head ip sleep iperf3 udp_rr jq sed nstools ip sleep iperf3 udp_rr jq sed +set MAP_HOST4 192.0.2.1 +set MAP_HOST6 2001:db8:9a55::1 + test pasta: throughput and latency (local traffic) hout FREQ_PROCFS (echo "scale=1"; sed -n 's/cpu MHz.*: \([0-9]*\)\..*$/(\1+10^2\/2)\/10^3/p' /proc/cpuinfo) | bc -l | head -n1 @@ -21,11 +24,10 @@ hout FREQ_CPUFREQ (echo "scale=1"; printf '( %i + 10^5 / 2 ) / 10^6\n' $(cat /sy hout FREQ [ -n "__FREQ_CPUFREQ__" ] && echo __FREQ_CPUFREQ__ || echo __FREQ_PROCFS__ set THREADS 1 -set STREAMS 4 -set TIME 10 -set OPTS -u -P __STREAMS__ +set TIME 1 +set OPTS -u -P __THREADS__ -info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz, __STREAMS__ streams +info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz report pasta lo_udp 1 
__FREQ__ @@ -33,16 +35,16 @@ th pktlen 1500B 4000B 16384B 65535B tr UDP throughput over IPv6: ns to host -iperf3s host 100${i}3 __THREADS__ +iperf3s host 10003 # (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header -iperf3 BW ns ::1 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1452 +iperf3 BW ns ::1 10003 __TIME__ __OPTS__ -b 5G -l 1452 bw __BW__ 1.0 1.5 -iperf3 BW ns ::1 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 3G -l 3972 +iperf3 BW ns ::1 10003 __TIME__ __OPTS__ -b 10G -l 3972 bw __BW__ 1.2 1.8 -iperf3 BW ns ::1 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 10G -l 16336 +iperf3 BW ns ::1 10003 __TIME__ __OPTS__ -b 30G -l 16336 bw __BW__ 5.0 6.0 -iperf3 BW ns ::1 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 15G -l 65487 +iperf3 BW ns ::1 10003 __TIME__ __OPTS__ -b 40G -l 65487 bw __BW__ 7.0 9.0 iperf3k host @@ -58,16 +60,16 @@ lat __LAT__ 200 150 tr UDP throughput over IPv4: ns to host -iperf3s host 100${i}3 __THREADS__ +iperf3s host 10003 # (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header -iperf3 BW ns 127.0.0.1 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1372 +iperf3 BW ns 127.0.0.1 10003 __TIME__ __OPTS__ -b 5G -l 1372 bw __BW__ 1.0 1.5 -iperf3 BW ns 127.0.0.1 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 3G -l 3972 +iperf3 BW ns 127.0.0.1 10003 __TIME__ __OPTS__ -b 10G -l 3972 bw __BW__ 1.2 1.8 -iperf3 BW ns 127.0.0.1 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 10G -l 16356 +iperf3 BW ns 127.0.0.1 10003 __TIME__ __OPTS__ -b 30G -l 16356 bw __BW__ 5.0 6.0 -iperf3 BW ns 127.0.0.1 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 15G -l 65507 +iperf3 BW ns 127.0.0.1 10003 __TIME__ __OPTS__ -b 40G -l 65507 bw __BW__ 7.0 9.0 iperf3k host @@ -83,15 +85,15 @@ lat __LAT__ 200 150 tr UDP throughput over IPv6: host to ns -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 -iperf3 BW host ::1 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1452 +iperf3 BW host ::1 10002 __TIME__ __OPTS__ -b 5G -l 1452 bw __BW__ 1.0 1.5 -iperf3 BW 
host ::1 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 3972 +iperf3 BW host ::1 10002 __TIME__ __OPTS__ -b 10G -l 3972 bw __BW__ 1.2 1.8 -iperf3 BW host ::1 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 10G -l 16336 +iperf3 BW host ::1 10002 __TIME__ __OPTS__ -b 30G -l 16336 bw __BW__ 5.0 6.0 -iperf3 BW host ::1 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 15G -l 16336 +iperf3 BW host ::1 10002 __TIME__ __OPTS__ -b 40G -l 65487 bw __BW__ 7.0 9.0 iperf3k ns @@ -107,14 +109,14 @@ lat __LAT__ 200 150 tr UDP throughput over IPv4: host to ns -iperf3s ns 100${i}2 __THREADS__ -iperf3 BW host 127.0.0.1 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 1372 +iperf3s ns 10002 +iperf3 BW host 127.0.0.1 10002 __TIME__ __OPTS__ -b 5G -l 1372 bw __BW__ 1.0 1.5 -iperf3 BW host 127.0.0.1 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 3972 +iperf3 BW host 127.0.0.1 10002 __TIME__ __OPTS__ -b 10G -l 3972 bw __BW__ 1.2 1.8 -iperf3 BW host 127.0.0.1 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 10G -l 16356 +iperf3 BW host 127.0.0.1 10002 __TIME__ __OPTS__ -b 30G -l 16356 bw __BW__ 5.0 6.0 -iperf3 BW host 127.0.0.1 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 15G -l 65507 +iperf3 BW host 127.0.0.1 10002 __TIME__ __OPTS__ -b 40G -l 65507 bw __BW__ 7.0 9.0 iperf3k ns @@ -134,26 +136,24 @@ te test pasta: throughput and latency (traffic via tap) -nsout GW ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' -nsout GW6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz, __STREAMS__ streams +info Throughput in Gbps, latency in µs, one thread at __FREQ__ GHz report pasta tap_udp 1 __FREQ__ th pktlen 1500B 4000B 16384B 65520B tr UDP throughput over IPv6: ns to host -iperf3s host 100${i}3 __THREADS__ +iperf3s host 10003 # (datagram size) = (packet size) - 48: 40 bytes of IPv6 header, 8 of UDP header -iperf3 BW ns 
__GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1472 +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 8G -l 1472 bw __BW__ 0.3 0.5 -iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 3G -l 3972 +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 12G -l 3972 bw __BW__ 0.5 0.8 -iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 4G -l 16356 +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 20G -l 16356 bw __BW__ 3.0 4.0 -iperf3 BW ns __GW6__%__IFNAME__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 6G -l 65472 +iperf3 BW ns __MAP_HOST6__ 10003 __TIME__ __OPTS__ -b 30G -l 65472 bw __BW__ 6.0 7.0 iperf3k host @@ -163,22 +163,22 @@ lat - lat - lat - hostb udp_rr --nolog -P 10003 -C 10013 -6 -nsout LAT udp_rr --nolog -P 10003 -C 10013 -6 -c -H __GW6__%__IFNAME__ | sed -n 's/^throughput=\(.*\)/\1/p' +nsout LAT udp_rr --nolog -P 10003 -C 10013 -6 -c -H __MAP_HOST6__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 200 150 tr UDP throughput over IPv4: ns to host -iperf3s host 100${i}3 __THREADS__ +iperf3s host 10003 # (datagram size) = (packet size) - 28: 20 bytes of IPv4 header, 8 of UDP header -iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1472 +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 8G -l 1472 bw __BW__ 0.3 0.5 -iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 3G -l 3972 +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 12G -l 3972 bw __BW__ 0.5 0.8 -iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 4G -l 16356 +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 20G -l 16356 bw __BW__ 3.0 4.0 -iperf3 BW ns __GW__ 100${i}3 __THREADS__ __TIME__ __OPTS__ -b 6G -l 65492 +iperf3 BW ns __MAP_HOST4__ 10003 __TIME__ __OPTS__ -b 30G -l 65492 bw __BW__ 6.0 7.0 iperf3k host @@ -188,22 +188,22 @@ lat - lat - lat - hostb udp_rr --nolog -P 10003 -C 10013 -4 -nsout LAT udp_rr --nolog -P 10003 -C 10013 -4 -c -H __GW__ | sed -n 
's/^throughput=\(.*\)/\1/p' +nsout LAT udp_rr --nolog -P 10003 -C 10013 -4 -c -H __MAP_HOST4__ | sed -n 's/^throughput=\(.*\)/\1/p' hostw lat __LAT__ 200 150 tr UDP throughput over IPv6: host to ns -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 nsout IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname' -nsout ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local' -iperf3 BW host __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1472 +nsout ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global").local] | .[0]' +iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 8G -l 1472 bw __BW__ 0.3 0.5 -iperf3 BW host __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 3972 +iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 12G -l 3972 bw __BW__ 0.5 0.8 -iperf3 BW host __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 4G -l 16356 +iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 20G -l 16356 bw __BW__ 3.0 4.0 -iperf3 BW host __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 15G -l 65472 +iperf3 BW host __ADDR6__ 10002 __TIME__ __OPTS__ -b 30G -l 65472 bw __BW__ 7.0 9.0 iperf3k ns @@ -219,16 +219,16 @@ lat __LAT__ 200 150 tr UDP throughput over IPv4: host to ns -iperf3s ns 100${i}2 __THREADS__ +iperf3s ns 10002 nsout ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[0].local' -iperf3 BW host __ADDR__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 2G -l 1472 +iperf3 BW host __ADDR__ 10002 __TIME__ __OPTS__ -b 8G -l 1472 bw __BW__ 0.3 0.5 -iperf3 BW host __ADDR__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 3G -l 3972 +iperf3 BW host __ADDR__ 10002 __TIME__ __OPTS__ -b 12G -l 3972 bw __BW__ 0.5 0.8 -iperf3 BW host __ADDR__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 4G -l 16356 +iperf3 BW host __ADDR__ 10002 __TIME__ __OPTS__ -b 20G -l 16356 bw __BW__ 3.0 4.0 -iperf3 BW 
host __ADDR__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 15G -l 65492 +iperf3 BW host __ADDR__ 10002 __TIME__ __OPTS__ -b 30G -l 65492 bw __BW__ 7.0 9.0 iperf3k ns diff --git a/test/rampstream-check.sh b/test/rampstream-check.sh new file mode 100755 index 0000000..c27acdb --- /dev/null +++ b/test/rampstream-check.sh @@ -0,0 +1,3 @@ +#! /bin/sh + +(rampstream check "$@" 2>&1; echo $? > rampstream.status) | tee rampstream.err diff --git a/test/rampstream.c b/test/rampstream.c new file mode 100644 index 0000000..8d81296 --- /dev/null +++ b/test/rampstream.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* rampstream - Generate and check a stream of bytes in a ramp pattern + * + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <sys/types.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> + +/* Length of the repeating ramp. This is deliberately not a "round" number so + * that we're very likely to misalign with likely block or chunk sizes of the + * transport. That means we'll detect gaps in the stream, even if they occur + * neatly on block boundaries. Specifically this is the largest 8-bit prime. */ +#define RAMPLEN 251 + +#define INTERVAL 10000 + +#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0]))) + +#define die(...) 
\ + do { \ + fprintf(stderr, "rampstream: " __VA_ARGS__); \ + exit(1); \ + } while (0) + +static void usage(void) +{ + die("Usage:\n" + " rampstream send <number>\n" + " Generate a ramp pattern of bytes on stdout, repeated <number>\n" + " times\n" + " rampstream check <number>\n" + " Check a ramp pattern of bytes on stdin, repeated <number>\n" + " times\n"); +} + +static void ramp_send(unsigned long long num, const uint8_t *ramp) +{ + unsigned long long i; + + for (i = 0; i < num; i++) { + int off = 0; + ssize_t rc; + + if (i % INTERVAL == 0) + fprintf(stderr, "%llu...\r", i); + + while (off < RAMPLEN) { + rc = write(1, ramp + off, RAMPLEN - off); + if (rc < 0) { + if (errno == EINTR || + errno == EAGAIN || + errno == EWOULDBLOCK) + continue; + die("Error writing ramp: %s\n", + strerror(errno)); + } + if (rc == 0) + die("Zero length write\n"); + off += rc; + } + } +} + +static void ramp_check(unsigned long long num, const uint8_t *ramp) +{ + unsigned long long i; + + for (i = 0; i < num; i++) { + uint8_t buf[RAMPLEN]; + int off = 0; + ssize_t rc; + + if (i % INTERVAL == 0) + fprintf(stderr, "%llu...\r", i); + + while (off < RAMPLEN) { + rc = read(0, buf + off, RAMPLEN - off); + if (rc < 0) { + if (errno == EINTR || + errno == EAGAIN || + errno == EWOULDBLOCK) + continue; + die("Error reading ramp: %s\n", + strerror(errno)); + } + if (rc == 0) + die("Unexpected EOF, ramp %llu, byte %d\n", + i, off); + off += rc; + } + + if (memcmp(buf, ramp, sizeof(buf)) != 0) { + int j, k; + + for (j = 0; j < RAMPLEN; j++) + if (buf[j] != ramp[j]) + break; + for (k = j; k < RAMPLEN && k < j + 16; k++) + fprintf(stderr, + "Byte %d: expected 0x%02x, got 0x%02x\n", + k, ramp[k], buf[k]); + die("Data mismatch, ramp %llu, byte %d\n", i, j); + } + } +} + +int main(int argc, char *argv[]) +{ + const char *subcmd = argv[1]; + unsigned long long num; + uint8_t ramp[RAMPLEN]; + char *e; + int i; + + if (argc < 3) + usage(); + + errno = 0; + num = strtoull(argv[2], &e, 0); + if (e == argv[2] || *e || errno) 
+ usage(); + + /* Initialize the ramp block */ + for (i = 0; i < RAMPLEN; i++) + ramp[i] = i; + + if (strcmp(subcmd, "send") == 0) + ramp_send(num, ramp); + else if (strcmp(subcmd, "check") == 0) + ramp_check(num, ramp); + else + usage(); + + exit(0); +} @@ -38,6 +38,9 @@ TRACE=${TRACE:-0} # If set, tell passt and pasta to take packet captures PCAP=${PCAP:-0} +# Custom kernel to boot guests with, if given +KERNEL=${KERNEL:-"/boot/vmlinuz-$(uname -r)"} + COMMIT="$(git log --oneline --no-decorate -1)" . lib/util @@ -90,6 +93,7 @@ run() { test memory/passt teardown memory + VHOST_USER=0 setup passt test passt/ndp test passt/dhcp @@ -101,7 +105,7 @@ run() { VALGRIND=1 setup passt_in_ns test passt/ndp - test passt/dhcp + test passt_in_ns/dhcp test passt_in_ns/icmp test passt_in_ns/tcp test passt_in_ns/udp @@ -112,10 +116,62 @@ run() { test two_guests/basic teardown two_guests + VHOST_USER=1 + setup passt_in_ns + test passt_vu/ndp + test passt_vu_in_ns/dhcp + test passt_vu_in_ns/icmp + test passt_vu_in_ns/tcp + test passt_vu_in_ns/udp + test passt_vu_in_ns/shutdown + teardown passt_in_ns + + setup two_guests + test two_guests_vu/basic + teardown two_guests + + setup migrate + test migrate/basic + teardown migrate + setup migrate + test migrate/basic_fin + teardown migrate + setup migrate + test migrate/bidirectional + teardown migrate + setup migrate + test migrate/bidirectional_fin + teardown migrate + setup migrate + test migrate/iperf3_out4 + teardown migrate + setup migrate + test migrate/iperf3_out6 + teardown migrate + setup migrate + test migrate/iperf3_in4 + teardown migrate + setup migrate + test migrate/iperf3_in6 + teardown migrate + setup migrate + test migrate/iperf3_bidir6 + teardown migrate + setup migrate + test migrate/iperf3_many_out6 + teardown migrate + setup migrate + test migrate/rampstream_in + teardown migrate + setup migrate + test migrate/rampstream_out + teardown migrate + VALGRIND=0 + VHOST_USER=0 setup passt_in_ns test passt/ndp - test 
passt/dhcp + test passt_in_ns/dhcp test perf/passt_tcp test perf/passt_udp test perf/pasta_tcp @@ -123,6 +179,15 @@ run() { test passt_in_ns/shutdown teardown passt_in_ns + VHOST_USER=1 + setup passt_in_ns + test passt_vu/ndp + test passt_vu_in_ns/dhcp + test perf/passt_vu_tcp + test perf/passt_vu_udp + test passt_vu_in_ns/shutdown + teardown passt_in_ns + # TODO: Make those faster by at least pre-installing gcc and make on # non-x86 images, then re-enable. skip_distro() { @@ -137,7 +202,7 @@ skip_distro() { perf_finish [ ${CI} -eq 1 ] && video_stop - log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}" + log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}" pause_continue \ "Press any key to keep test session open" \ @@ -158,7 +223,10 @@ run_selected() { __setup= for __test; do - if [ "${__test%%/*}" != "${__setup}" ]; then + # HACK: the migrate tests need the setup repeated for + # each test + if [ "${__test%%/*}" != "${__setup}" -o \ + "${__test%%/*}" = "migrate" ]; then [ -n "${__setup}" ] && teardown "${__setup}" __setup="${__test%%/*}" setup "${__setup}" @@ -168,7 +236,7 @@ run_selected() { done teardown "${__setup}" - log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}" + log "PASS: ${STATUS_PASS}, FAIL: ${STATUS_FAIL}, SKIPPED: ${STATUS_SKIPPED}" pause_continue \ "Press any key to keep test session open" \ @@ -239,4 +307,4 @@ fi tail -n1 ${LOGFILE} echo "Log at ${LOGFILE}" -exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\)$/\1/p') +exit $(tail -n1 ${LOGFILE} | sed -n 's/.*FAIL: \(.*\),.*$/\1/p') diff --git a/test/two_guests/basic b/test/two_guests/basic index fa0608b..e2338ff 100644 --- a/test/two_guests/basic +++ b/test/two_guests/basic @@ -36,45 +36,49 @@ check [ "__ADDR2__" = "__HOST_ADDR__" ] test DHCPv6: addresses # Link is up now, wait for DAD to complete -sleep 2 +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; 
done guest1 /sbin/dhclient -6 __IFNAME1__ guest2 /sbin/dhclient -6 __IFNAME2__ +# Wait for DAD to complete on the DHCP address +guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done +guest2 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]' g2out ADDR2_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME2__").addr_info[] | select(.prefixlen == 128).local] | .[0]' -hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global").local] | .[0]' +hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]' check [ "__ADDR1_6__" = "__HOST_ADDR6__" ] check [ "__ADDR2_6__" = "__HOST_ADDR6__" ] test TCP/IPv4: guest 1 > guest 2 g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway' guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc +sleep 1 guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004 guest2w -sleep 1 g2out MSG2 cat msg check [ "__MSG2__" = "Hello_from_guest_1" ] test TCP/IPv6: guest 2 > guest 1 g2out GW2_6 ip -j -6 route show|jq -rM '.[] | select(.dst == "default").gateway' guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc +sleep 1 guest2 echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001 guest1w -sleep 1 g1out MSG1 cat msg check [ "__MSG1__" = "Hello_from_guest_2" ] test UDP/IPv4: guest 1 > guest 2 guest2b socat -u TCP4-LISTEN:10004 OPEN:msg,create,trunc +sleep 1 guest1 echo "Hello_from_guest_1" | socat -u STDIN TCP4:__GW1__:10004 guest2w -sleep 1 g2out MSG2 cat msg check [ "__MSG2__" = "Hello_from_guest_1" ] test UDP/IPv6: guest 2 > guest 1 guest1b socat -u TCP6-LISTEN:10001 OPEN:msg,create,trunc +sleep 1 guest2 
echo "Hello_from_guest_2" | socat -u STDIN TCP6:[__GW2_6__%__IFNAME2__]:10001 guest1w -sleep 1 g1out MSG1 cat msg check [ "__MSG1__" = "Hello_from_guest_2" ] diff --git a/test/two_guests_vu b/test/two_guests_vu new file mode 120000 index 0000000..a8648fc --- /dev/null +++ b/test/two_guests_vu @@ -0,0 +1 @@ +two_guests
\ No newline at end of file diff --git a/test/valgrind.supp b/test/valgrind.supp index a158394..735b5f6 100644 --- a/test/valgrind.supp +++ b/test/valgrind.supp @@ -6,3 +6,12 @@ ... fun:tcp_sock_consume } + +# same as above, for architectures with the recv() system call (at least i686): +{ + passt_recv_MSG_TRUNC_into_NULL_buffer + Memcheck:Param + socketcall.recv(buf) + ... + fun:tcp_sock_consume +} @@ -15,79 +15,66 @@ /** * DOC: Theory of Operation * + * UDP Flows + * ========= * - * For UDP, a reduced version of port-based connection tracking is implemented - * with two purposes: - * - binding ephemeral ports when they're used as source port by the guest, so - * that replies on those ports can be forwarded back to the guest, with a - * fixed timeout for this binding - * - packets received from the local host get their source changed to a local - * address (gateway address) so that they can be forwarded to the guest, and - * packets sent as replies by the guest need their destination address to - * be changed back to the address of the local host. This is dynamic to allow - * connections from the gateway as well, and uses the same fixed 180s timeout - * - * Sockets for bound ports are created at initialisation time, one set for IPv4 - * and one for IPv6. + * UDP doesn't have true connections, but many protocols use a connection-like + * format. The flow is initiated by a client sending a datagram from a port of + * its choosing (usually ephemeral) to a specific port (usually well known) on a + * server. Both client and server address must be unicast. The server sends + * replies using the same addresses & ports with src/dest swapped. * - * Packets are forwarded back and forth, by prepending and stripping UDP headers - * in the obvious way, with no port translation. + * We track pseudo-connections of this type as flow table entries of type + * FLOW_UDP. 
We store the time of the last traffic on the flow in uflow->ts, + * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds. * - * In PASTA mode, the L2-L4 translation is skipped for connections to ports - * bound between namespaces using the loopback interface, messages are directly - * transferred between L4 sockets instead. These are called spliced connections - * for consistency with the TCP implementation, but the splice() syscall isn't - * actually used as it wouldn't make sense for datagram-based connections: a - * pair of recvmmsg() and sendmmsg() deals with this case. + * NOTE: This won't handle multicast protocols, or some protocols with different + * port usage. We'll need specific logic if we want to handle those. + * + * "Listening" sockets + * =================== * - * The connection tracking for PASTA mode is slightly complicated by the absence - * of actual connections, see struct udp_splice_port, and these examples: + * UDP doesn't use listen(), but we consider long term sockets which are allowed + * to create new flows "listening" by analogy with TCP. This listening socket + * could receive packets from multiple flows, so we use a hash table match to + * find the specific flow for a datagram. 
* - * - from init to namespace: + * Flow sockets + * ============ * - * - forward direction: 127.0.0.1:5000 -> 127.0.0.1:80 in init from socket s, - * with epoll reference: index = 80, splice = 1, orig = 1, ns = 0 - * - if udp_splice_ns[V4][5000].sock: - * - send packet to udp_splice_ns[V4][5000].sock, with destination port - * 80 - * - otherwise: - * - create new socket udp_splice_ns[V4][5000].sock - * - bind in namespace to 127.0.0.1:5000 - * - add to epoll with reference: index = 5000, splice = 1, orig = 0, - * ns = 1 - * - update udp_splice_init[V4][80].ts and udp_splice_ns[V4][5000].ts with - * current time + * When a UDP flow targets a socket, we create a "flow" socket in + * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive + * replies on the target side. This socket is both bound and connected and has + * EPOLL_TYPE_UDP. The connect() means it will only receive datagrams + * associated with this flow, so the epoll reference directly points to the flow + * and we don't need a hash lookup. * - * - reverse direction: 127.0.0.1:80 -> 127.0.0.1:5000 in namespace socket s, - * having epoll reference: index = 5000, splice = 1, orig = 0, ns = 1 - * - if udp_splice_init[V4][80].sock: - * - send to udp_splice_init[V4][80].sock, with destination port 5000 - * - update udp_splice_init[V4][80].ts and udp_splice_ns[V4][5000].ts with - * current time - * - otherwise, discard + * When a flow is initiated from a listening socket, we create a "flow" socket + * with the same bound address as the listening socket, but also connect()ed to + * the flow's peer. This is stored in uflow->s[INISIDE] and will last for the + * lifetime of the flow, even if the original listening socket is closed due to + * port auto-probing. The duplicate is used to deliver replies back to the + * originating side. * - * - from namespace to init: + * NOTE: A flow socket can have a bound address overlapping with a listening + * socket. 
That will happen naturally for flows initiated from a socket, but is + * also possible (though unlikely) for tap initiated flows, depending on the + * source port. We assume datagrams for the flow will come to a connect()ed + * socket in preference to a listening socket. The sample program + * doc/platform-requirements/reuseaddr-priority.c documents and tests that + * assumption. * - * - forward direction: 127.0.0.1:2000 -> 127.0.0.1:22 in namespace from - * socket s, with epoll reference: index = 22, splice = 1, orig = 1, ns = 1 - * - if udp4_splice_init[V4][2000].sock: - * - send packet to udp_splice_init[V4][2000].sock, with destination - * port 22 - * - otherwise: - * - create new socket udp_splice_init[V4][2000].sock - * - bind in init to 127.0.0.1:2000 - * - add to epoll with reference: index = 2000, splice = 1, orig = 0, - * ns = 0 - * - update udp_splice_ns[V4][22].ts and udp_splice_init[V4][2000].ts with - * current time + * "Spliced" flows + * =============== + * + * In PASTA mode, L2-L4 translation is skipped for connections to ports bound + * between namespaces using the loopback interface, messages are directly + * transferred between L4 sockets instead. These are called spliced connections + * in analogy with the TCP implementation. The splice() syscall isn't + * actually used; it doesn't make sense for datagrams and instead a pair of + * recvmmsg() and sendmmsg() is used to forward the datagrams. * - * - reverse direction: 127.0.0.1:22 -> 127.0.0.1:2000 in init from socket s, - * having epoll reference: index = 2000, splice = 1, orig = 0, ns = 0 - * - if udp_splice_ns[V4][22].sock: - * - send to udp_splice_ns[V4][22].sock, with destination port 2000 - * - update udp_splice_ns[V4][22].ts and udp_splice_init[V4][2000].ts with - * current time - * - otherwise, discard + * Note that a spliced flow will have two flow sockets (see above).
*/ #include <sched.h> @@ -102,6 +89,8 @@ #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include <netinet/icmp6.h> #include <stdint.h> #include <stddef.h> #include <string.h> @@ -110,9 +99,12 @@ #include <sys/socket.h> #include <sys/uio.h> #include <time.h> +#include <arpa/inet.h> +#include <linux/errqueue.h> #include "checksum.h" #include "util.h" +#include "iov.h" #include "ip.h" #include "siphash.h" #include "inany.h" @@ -120,128 +112,87 @@ #include "tap.h" #include "pcap.h" #include "log.h" +#include "flow_table.h" +#include "udp_internal.h" +#include "udp_vu.h" -#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ -/** - * struct udp_tap_port - Port tracking based on tap-facing source port - * @sock: Socket bound to source port used as index - * @flags: Flags for recent activity type seen from/to port - * @ts: Activity timestamp from tap, used for socket aging - */ -struct udp_tap_port { - int sock; - uint8_t flags; -#define PORT_LOCAL BIT(0) /* Port was contacted from local address */ -#define PORT_LOOPBACK BIT(1) /* Port was contacted from loopback address */ -#define PORT_GUA BIT(2) /* Port was contacted from global unicast */ -#define PORT_DNS_FWD BIT(3) /* Port used as source for DNS remapped query */ - - time_t ts; -}; - -/** - * struct udp_splice_port - Bound socket for spliced communication - * @sock: Socket bound to index port - * @ts: Activity timestamp - */ -struct udp_splice_port { - int sock; - time_t ts; -}; - -/* Port tracking, arrays indexed by packet source port (host order) */ -static struct udp_tap_port udp_tap_map [IP_VERSIONS][NUM_PORTS]; +/* Maximum UDP data to be returned in ICMP messages */ +#define ICMP4_MAX_DLEN 8 +#define ICMP6_MAX_DLEN (IPV6_MIN_MTU \ + - sizeof(struct udphdr) \ + - sizeof(struct ipv6hdr)) /* "Spliced" sockets indexed by bound port (host order) */ -static struct 
udp_splice_port udp_splice_ns [IP_VERSIONS][NUM_PORTS]; -static struct udp_splice_port udp_splice_init[IP_VERSIONS][NUM_PORTS]; - -enum udp_act_type { - UDP_ACT_TAP, - UDP_ACT_SPLICE_NS, - UDP_ACT_SPLICE_INIT, - UDP_ACT_TYPE_MAX, -}; - -/* Activity-based aging for bindings */ -static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8)]; +static int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; +static int udp_splice_init[IP_VERSIONS][NUM_PORTS]; /* Static buffers */ -/** - * udp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @s_in: Source socket address, filled in by recvmmsg() - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for UDP header - * @data: Storage for UDP payload - */ -static struct udp4_l2_buf_t { - struct sockaddr_in s_in; +/* UDP header and data for inbound messages */ +static struct udp_payload_t udp_payload[UDP_MAX_FRAMES]; - struct tap_hdr taph; - struct iphdr iph; - struct udphdr uh; - uint8_t data[USHRT_MAX - - (sizeof(struct iphdr) + sizeof(struct udphdr))]; -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -udp4_l2_buf[UDP_MAX_FRAMES]; +/* Ethernet header for IPv4 frames */ +static struct ethhdr udp4_eth_hdr; + +/* Ethernet header for IPv6 frames */ +static struct ethhdr udp6_eth_hdr; /** - * udp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections - * @s_in6: Source socket address, filled in by recvmmsg() - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @uh: Headroom for UDP header - * @data: Storage for UDP payload + * struct udp_meta_t - Pre-cooked headers for UDP packets + * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) + * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) + * @taph: Tap backend specific header */ -struct udp6_l2_buf_t { - struct sockaddr_in6 s_in6; -#ifdef __AVX2__ - /* 
Align ip6h to 32-byte boundary. */ - uint8_t pad[64 - (sizeof(struct sockaddr_in6) + sizeof(struct ethhdr) + - sizeof(uint32_t))]; -#endif - - struct tap_hdr taph; +static struct udp_meta_t { struct ipv6hdr ip6h; - struct udphdr uh; - uint8_t data[USHRT_MAX - - (sizeof(struct ipv6hdr) + sizeof(struct udphdr))]; + struct iphdr ip4h; + struct tap_hdr taph; +} #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +__attribute__ ((aligned(32))) #endif -udp6_l2_buf[UDP_MAX_FRAMES]; +udp_meta[UDP_MAX_FRAMES]; -/* recvmmsg()/sendmmsg() data for tap */ -static struct iovec udp4_l2_iov_sock [UDP_MAX_FRAMES]; -static struct iovec udp6_l2_iov_sock [UDP_MAX_FRAMES]; +#define PKTINFO_SPACE \ + MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \ + CMSG_SPACE(sizeof(struct in6_pktinfo))) -static struct iovec udp4_l2_iov_tap [UDP_MAX_FRAMES]; -static struct iovec udp6_l2_iov_tap [UDP_MAX_FRAMES]; +#define RECVERR_SPACE \ + MAX(CMSG_SPACE(sizeof(struct sock_extended_err) + \ + sizeof(struct sockaddr_in)), \ + CMSG_SPACE(sizeof(struct sock_extended_err) + \ + sizeof(struct sockaddr_in6))) -static struct mmsghdr udp4_l2_mh_sock [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES]; +/** + * enum udp_iov_idx - Indices for the buffers making up a single UDP frame + * @UDP_IOV_TAP tap specific header + * @UDP_IOV_ETH Ethernet header + * @UDP_IOV_IP IP (v4/v6) header + * @UDP_IOV_PAYLOAD IP payload (UDP header + data) + * @UDP_NUM_IOVS the number of entries in the iovec array + */ +enum udp_iov_idx { + UDP_IOV_TAP, + UDP_IOV_ETH, + UDP_IOV_IP, + UDP_IOV_PAYLOAD, + UDP_NUM_IOVS, +}; -/* recvmmsg()/sendmmsg() data for "spliced" connections */ -static struct iovec udp4_iov_splice [UDP_MAX_FRAMES]; -static struct iovec udp6_iov_splice [UDP_MAX_FRAMES]; +/* IOVs and msghdr arrays for receiving datagrams from sockets */ +static struct iovec udp_iov_recv [UDP_MAX_FRAMES]; +static struct mmsghdr udp_mh_recv 
[UDP_MAX_FRAMES]; -static struct sockaddr_in udp4_localname = { - .sin_family = AF_INET, - .sin_addr = IN4ADDR_LOOPBACK_INIT, -}; -static struct sockaddr_in6 udp6_localname = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_LOOPBACK_INIT, -}; +/* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */ +static union sockaddr_inany udp_splice_to; + +static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; +static struct mmsghdr udp_mh_splice [UDP_MAX_FRAMES]; -static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES]; +/* IOVs for L2 frames */ +static struct iovec udp_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; /** * udp_portmap_clear() - Clear UDP port map before configuration @@ -251,28 +202,8 @@ void udp_portmap_clear(void) unsigned i; for (i = 0; i < NUM_PORTS; i++) { - udp_tap_map[V4][i].sock = udp_tap_map[V6][i].sock = -1; - udp_splice_ns[V4][i].sock = udp_splice_ns[V6][i].sock = -1; - udp_splice_init[V4][i].sock = udp_splice_init[V6][i].sock = -1; - } -} - -/** - * udp_invert_portmap() - Compute reverse port translations for return packets - * @fwd: Port forwarding configuration to compute reverse map for - */ -static void udp_invert_portmap(struct udp_fwd_ports *fwd) -{ - unsigned int i; - - static_assert(ARRAY_SIZE(fwd->f.delta) == ARRAY_SIZE(fwd->rdelta), - "Forward and reverse delta arrays must have same size"); - for (i = 0; i < ARRAY_SIZE(fwd->f.delta); i++) { - in_port_t delta = fwd->f.delta[i]; - in_port_t rport = i + delta; - - if (delta) - fwd->rdelta[rport] = NUM_PORTS - delta; + udp_splice_ns[V4][i] = udp_splice_ns[V6][i] = -1; + udp_splice_init[V4][i] = udp_splice_init[V6][i] = -1; } } @@ -283,509 +214,743 @@ static void udp_invert_portmap(struct udp_fwd_ports *fwd) */ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { - int i; - - for (i = 0; i < UDP_MAX_FRAMES; i++) { - struct udp4_l2_buf_t *b4 = &udp4_l2_buf[i]; - struct udp6_l2_buf_t *b6 = &udp6_l2_buf[i]; 
- - eth_update_mac(&b4->taph.eh, eth_d, eth_s); - eth_update_mac(&b6->taph.eh, eth_d, eth_s); - } + eth_update_mac(&udp4_eth_hdr, eth_d, eth_s); + eth_update_mac(&udp6_eth_hdr, eth_d, eth_s); } /** - * udp_sock4_iov_init_one() - Initialise a scatter-gather L2 buffer for IPv4 + * udp_iov_init_one() - Initialise scatter-gather lists for one buffer * @c: Execution context * @i: Index of buffer to initialize */ -static void udp_sock4_iov_init_one(const struct ctx *c, size_t i) +static void udp_iov_init_one(const struct ctx *c, size_t i) { - struct msghdr *mh = &udp4_l2_mh_sock[i].msg_hdr; - struct udp4_l2_buf_t *buf = &udp4_l2_buf[i]; - struct iovec *siov = &udp4_l2_iov_sock[i]; - struct iovec *tiov = &udp4_l2_iov_tap[i]; - - *buf = (struct udp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_UDP) + struct udp_payload_t *payload = &udp_payload[i]; + struct msghdr *mh = &udp_mh_recv[i].msg_hdr; + struct udp_meta_t *meta = &udp_meta[i]; + struct iovec *siov = &udp_iov_recv[i]; + struct iovec *tiov = udp_l2_iov[i]; + + *meta = (struct udp_meta_t) { + .ip4h = L2_BUF_IP4_INIT(IPPROTO_UDP), + .ip6h = L2_BUF_IP6_INIT(IPPROTO_UDP), }; - siov->iov_base = buf->data; - siov->iov_len = sizeof(buf->data); + *siov = IOV_OF_LVALUE(payload->data); + + tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); + tiov[UDP_IOV_PAYLOAD].iov_base = payload; - mh->msg_name = &buf->s_in; - mh->msg_namelen = sizeof(buf->s_in); mh->msg_iov = siov; mh->msg_iovlen = 1; - - tiov->iov_base = tap_frame_base(c, &buf->taph); } /** - * udp_sock6_iov_init_one() - Initialise a scatter-gather L2 buffer for IPv6 + * udp_iov_init() - Initialise scatter-gather L2 buffers * @c: Execution context - * @i: Index of buffer to initialize */ -static void udp_sock6_iov_init_one(const struct ctx *c, size_t i) +static void udp_iov_init(const struct ctx *c) { - struct msghdr *mh = &udp6_l2_mh_sock[i].msg_hdr; - struct udp6_l2_buf_t *buf = &udp6_l2_buf[i]; - struct iovec *siov = 
&udp6_l2_iov_sock[i]; - struct iovec *tiov = &udp6_l2_iov_tap[i]; - - *buf = (struct udp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_UDP) - }; + size_t i; - siov->iov_base = buf->data; - siov->iov_len = sizeof(buf->data); + udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP); + udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6); - mh->msg_name = &buf->s_in6; - mh->msg_namelen = sizeof(buf->s_in6); - mh->msg_iov = siov; - mh->msg_iovlen = 1; + for (i = 0; i < UDP_MAX_FRAMES; i++) + udp_iov_init_one(c, i); +} - tiov->iov_base = tap_frame_base(c, &buf->taph); +/** + * udp_update_hdr4() - Update headers for one IPv4 datagram + * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) + * @bp: Pointer to udp_payload_t to update + * @toside: Flowside for destination side + * @dlen: Length of UDP payload + * @no_udp_csum: Do not set UDP checksum + * + * Return: size of IPv4 payload (UDP header + data) + */ +size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + bool no_udp_csum) +{ + const struct in_addr *src = inany_v4(&toside->oaddr); + const struct in_addr *dst = inany_v4(&toside->eaddr); + size_t l4len = dlen + sizeof(bp->uh); + size_t l3len = l4len + sizeof(*ip4h); + + ASSERT(src && dst); + + ip4h->tot_len = htons(l3len); + ip4h->daddr = dst->s_addr; + ip4h->saddr = src->s_addr; + ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, *src, *dst); + + bp->uh.source = htons(toside->oport); + bp->uh.dest = htons(toside->eport); + bp->uh.len = htons(l4len); + if (no_udp_csum) { + bp->uh.check = 0; + } else { + const struct iovec iov = { + .iov_base = bp->data, + .iov_len = dlen + }; + struct iov_tail data = IOV_TAIL(&iov, 1, 0); + csum_udp4(&bp->uh, *src, *dst, &data); + } + + return l4len; } /** - * udp_sock_iov_init() - Initialise scatter-gather L2 buffers + * udp_update_hdr6() - Update headers for one IPv6 datagram + * @ip6h: Pre-filled IPv6 header (except for payload_len 
and + * addresses) + * @bp: Pointer to udp_payload_t to update + * @toside: Flowside for destination side + * @dlen: Length of UDP payload + * @no_udp_csum: Do not set UDP checksum + * + * Return: size of IPv6 payload (UDP header + data) + */ +size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + bool no_udp_csum) +{ + uint16_t l4len = dlen + sizeof(bp->uh); + + ip6h->payload_len = htons(l4len); + ip6h->daddr = toside->eaddr.a6; + ip6h->saddr = toside->oaddr.a6; + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 255; + + bp->uh.source = htons(toside->oport); + bp->uh.dest = htons(toside->eport); + bp->uh.len = ip6h->payload_len; + if (no_udp_csum) { + /* 0 is an invalid checksum for UDP IPv6 and dropped by + * the kernel stack, even if the checksum is disabled by virtio + * flags. We need to put any non-zero value here. + */ + bp->uh.check = 0xffff; + } else { + const struct iovec iov = { + .iov_base = bp->data, + .iov_len = dlen + }; + struct iov_tail data = IOV_TAIL(&iov, 1, 0); + csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data); + } + + return l4len; +} + +/** + * udp_tap_prepare() - Convert one datagram into a tap frame + * @mmh: Receiving mmsghdr array + * @idx: Index of the datagram to prepare + * @toside: Flowside for destination side + * @no_udp_csum: Do not set UDP checksum + */ +static void udp_tap_prepare(const struct mmsghdr *mmh, + unsigned idx, const struct flowside *toside, + bool no_udp_csum) +{ + struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx]; + struct udp_payload_t *bp = &udp_payload[idx]; + struct udp_meta_t *bm = &udp_meta[idx]; + size_t l4len; + + if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) { + l4len = udp_update_hdr6(&bm->ip6h, bp, toside, + mmh[idx].msg_len, no_udp_csum); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + + sizeof(udp6_eth_hdr)); + (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); + 
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); + } else { + l4len = udp_update_hdr4(&bm->ip4h, bp, toside, + mmh[idx].msg_len, no_udp_csum); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + + sizeof(udp4_eth_hdr)); + (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); + (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h); + } + (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; +} + +/** + * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer * @c: Execution context + * @ee: Extended error descriptor + * @toside: Destination side of flow + * @saddr: Address of ICMP generating node + * @in: First bytes (max 8) of original UDP message body + * @dlen: Length of the read part of original UDP message body */ -static void udp_sock_iov_init(const struct ctx *c) +static void udp_send_tap_icmp4(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + struct in_addr saddr, + const void *in, size_t dlen) { - size_t i; + struct in_addr oaddr = toside->oaddr.v4mapped.a4; + struct in_addr eaddr = toside->eaddr.v4mapped.a4; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + struct { + struct icmphdr icmp4h; + struct iphdr ip4h; + struct udphdr uh; + char data[ICMP4_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + size_t l4len = dlen + sizeof(struct udphdr); + + ASSERT(dlen <= ICMP4_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp4h.type = ee->ee_type; + msg.icmp4h.code = ee->ee_code; + if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED) + msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info); + + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, l4len, IPPROTO_UDP); + tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + tap_icmp4_send(c, saddr, eaddr, &msg, msglen); +} - for (i = 0; i < 
UDP_MAX_FRAMES; i++) { - if (c->ifi4) - udp_sock4_iov_init_one(c, i); - if (c->ifi6) - udp_sock6_iov_init_one(c, i); + +/** + * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer + * @c: Execution context + * @ee: Extended error descriptor + * @toside: Destination side of flow + * @saddr: Address of ICMP generating node + * @in: First bytes (max 1232) of original UDP message body + * @dlen: Length of the read part of original UDP message body + * @flow: IPv6 flow identifier + */ +static void udp_send_tap_icmp6(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + const struct in6_addr *saddr, + void *in, size_t dlen, uint32_t flow) +{ + const struct in6_addr *oaddr = &toside->oaddr.a6; + const struct in6_addr *eaddr = &toside->eaddr.a6; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + struct { + struct icmp6_hdr icmp6h; + struct ipv6hdr ip6h; + struct udphdr uh; + char data[ICMP6_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + size_t l4len = dlen + sizeof(struct udphdr); + + ASSERT(dlen <= ICMP6_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp6h.icmp6_type = ee->ee_type; + msg.icmp6h.icmp6_code = ee->ee_code; + if (ee->ee_type == ICMP6_PACKET_TOO_BIG) + msg.icmp6h.icmp6_dataun.icmp6_un_data32[0] = htonl(ee->ee_info); + + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip6h(&msg.ip6h, eaddr, oaddr, l4len, IPPROTO_UDP, flow); + tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + tap_icmp6_send(c, saddr, eaddr, &msg, msglen); +} + +/** + * udp_pktinfo() - Retrieve packet destination address from cmsg + * @msg: msghdr into which message has been received + * @dst: (Local) destination address of message in @msg (output) + * + * Return: 0 on success, -1 if the information was missing (@dst is set to + * inany_any6). 
+ */ +static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst) +{ + struct cmsghdr *hdr; + + for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) { + if (hdr->cmsg_level == IPPROTO_IP && + hdr->cmsg_type == IP_PKTINFO) { + const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr); + + *dst = inany_from_v4(i4->ipi_addr); + return 0; + } + + if (hdr->cmsg_level == IPPROTO_IPV6 && + hdr->cmsg_type == IPV6_PKTINFO) { + const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr); + + dst->a6 = i6->ipi6_addr; + return 0; + } } + + debug("Missing PKTINFO cmsg on datagram"); + *dst = inany_any6; + return -1; } /** - * udp_splice_new() - Create and prepare socket for "spliced" binding + * udp_sock_recverr() - Receive and clear an error from a socket * @c: Execution context - * @v6: Set for IPv6 sockets - * @src: Source port of original connection, host order - * @ns: Does the splice originate in the ns or not + * @s: Socket to receive errors from + * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown + * @pif: Interface on which the error occurred + * (only used if @sidx == FLOW_SIDX_NONE) + * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) * - * Return: prepared socket, negative error code on failure + * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 + * if there was an error reading the queue * - * #syscalls:pasta getsockname + * #syscalls recvmsg */ -int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns) +static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, + uint8_t pif, in_port_t port) { - struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP }; - union epoll_ref ref = { .type = EPOLL_TYPE_UDP, - .udp = { .splice = true, .v6 = v6, .port = src } - }; - struct udp_splice_port *sp; - int act, s; - - if (ns) { - ref.udp.pif = PIF_SPLICE; - sp = &udp_splice_ns[v6 ? 
V6 : V4][src]; - act = UDP_ACT_SPLICE_NS; - } else { - ref.udp.pif = PIF_HOST; - sp = &udp_splice_init[v6 ? V6 : V4][src]; - act = UDP_ACT_SPLICE_INIT; + char buf[PKTINFO_SPACE + RECVERR_SPACE]; + const struct sock_extended_err *ee; + char data[ICMP6_MAX_DLEN]; + struct cmsghdr *hdr; + struct iovec iov = { + .iov_base = data, + .iov_len = sizeof(data) + }; + union sockaddr_inany src; + struct msghdr mh = { + .msg_name = &src, + .msg_namelen = sizeof(src), + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = sizeof(buf), + }; + const struct flowside *fromside, *toside; + union inany_addr offender, otap; + char astr[INANY_ADDRSTRLEN]; + char sastr[SOCKADDR_STRLEN]; + const struct in_addr *o4; + in_port_t offender_port; + struct udp_flow *uflow; + uint8_t topif; + size_t dlen; + ssize_t rc; + + rc = recvmsg(s, &mh, MSG_ERRQUEUE); + if (rc < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + + err_perror("UDP: Failed to read error queue"); + return -1; } - s = socket(v6 ? 
AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, - IPPROTO_UDP); + if (!(mh.msg_flags & MSG_ERRQUEUE)) { + err("Missing MSG_ERRQUEUE flag reading error queue"); + return -1; + } - if (s > FD_REF_MAX) { - close(s); - return -EIO; + for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) { + if ((hdr->cmsg_level == IPPROTO_IP && + hdr->cmsg_type == IP_RECVERR) || + (hdr->cmsg_level == IPPROTO_IPV6 && + hdr->cmsg_type == IPV6_RECVERR)) + break; } - if (s < 0) - return s; + if (!hdr) { + err("Missing RECVERR cmsg in error queue"); + return -1; + } - ref.fd = s; + ee = (const struct sock_extended_err *)CMSG_DATA(hdr); - if (v6) { - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(src), - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - }; - if (bind(s, (struct sockaddr *)&addr6, sizeof(addr6))) - goto fail; + debug("%s error on UDP socket %i: %s", + str_ee_origin(ee), s, strerror_(ee->ee_errno)); + + if (!flow_sidx_valid(sidx)) { + /* No hint from the socket, determine flow from addresses */ + union inany_addr dst; + + if (udp_pktinfo(&mh, &dst) < 0) { + debug("Missing PKTINFO on UDP error"); + return 1; + } + + sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port); + if (!flow_sidx_valid(sidx)) { + debug("Ignoring UDP error without flow"); + return 1; + } } else { - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(src), - .sin_addr = IN4ADDR_LOOPBACK_INIT, - }; - if (bind(s, (struct sockaddr *)&addr4, sizeof(addr4))) - goto fail; + pif = pif_at_sidx(sidx); } - sp->sock = s; - bitmap_set(udp_act[v6 ? 
V6 : V4][act], src); + uflow = udp_at_sidx(sidx); + ASSERT(uflow); + fromside = &uflow->f.side[sidx.sidei]; + toside = &uflow->f.side[!sidx.sidei]; + topif = uflow->f.pif[!sidx.sidei]; + dlen = rc; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); - return s; + if (inany_from_sockaddr(&offender, &offender_port, + SO_EE_OFFENDER(ee)) < 0) + goto fail; + + if (pif != PIF_HOST || topif != PIF_TAP) + /* XXX Can we support any other cases? */ + goto fail; + + /* If the offender *is* the endpoint, make sure our translation is + * consistent with the flow's translation. This matters if the flow + * endpoint has a port specific translation (like --dns-match). + */ + if (inany_equals(&offender, &fromside->eaddr)) + otap = toside->oaddr; + else if (!nat_inbound(c, &offender, &otap)) + goto fail; + + if (hdr->cmsg_level == IPPROTO_IP && + (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) { + dlen = MIN(dlen, ICMP4_MAX_DLEN); + udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen); + return 1; + } + + if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) { + udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen, + FLOW_IDX(uflow)); + return 1; + } fail: - close(s); - return -1; + flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s", + str_ee_origin(ee), + pif_name(pif), + sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)), + pif_name(topif), + inany_ntop(&toside->eaddr, astr, sizeof(astr))); + return 1; } /** - * struct udp_splice_new_ns_arg - Arguments for udp_splice_new_ns() + * udp_sock_errs() - Process errors on a socket * @c: Execution context - * @v6: Set for IPv6 - * @src: Source port of originating datagram, host order - * @dst: Destination port of originating datagram, host order - * @s: Newly created socket or negative error code - */ -struct udp_splice_new_ns_arg { - const struct ctx *c; - int v6; - in_port_t src; - int s; -}; - -/** - * udp_splice_new_ns() - Enter namespace and call udp_splice_new() - * @arg: See 
struct udp_splice_new_ns_arg + * @s: Socket to receive errors from + * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown + * @pif: Interface on which the error occurred + * (only used if @sidx == FLOW_SIDX_NONE) + * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) * - * Return: 0 + * Return: Number of errors handled, or < 0 if we have an unrecoverable error */ -static int udp_splice_new_ns(void *arg) +static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx, + uint8_t pif, in_port_t port) { - struct udp_splice_new_ns_arg *a; + unsigned n_err = 0; + socklen_t errlen; + int rc, err; - a = (struct udp_splice_new_ns_arg *)arg; + ASSERT(!c->no_udp); - ns_enter(a->c); + /* Empty the error queue */ + while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0) + n_err += rc; - a->s = udp_splice_new(a->c, a->v6, a->src, true); + if (rc < 0) + return -1; /* error reading error, unrecoverable */ - return 0; + errlen = sizeof(err); + if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 || + errlen != sizeof(err)) { + err_perror("Error reading SO_ERROR"); + return -1; /* error reading error, unrecoverable */ + } + + if (err) { + debug("Unqueued error on UDP socket %i: %s", s, strerror_(err)); + n_err++; + } + + if (!n_err) { + /* EPOLLERR, but no errors to clear !? */ + err("EPOLLERR event without reported errors on socket %i", s); + return -1; /* no way to clear, unrecoverable */ + } + + return n_err; } /** - * udp_mmh_splice_port() - Is source address of message suitable for splicing? - * @v6: Is @sa a sockaddr_in6 (otherwise sockaddr_in)? - * @mmh: mmsghdr of incoming message + * udp_peek_addr() - Get source address for next packet + * @s: Socket to get information from + * @src: Socket address (output) + * @dst: (Local) destination address (output) * - * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from - * @sa in host order, otherwise -1. 
+ * Return: 0 if no more packets, 1 on success, -ve error code on error */ -static int udp_mmh_splice_port(bool v6, const struct mmsghdr *mmh) +static int udp_peek_addr(int s, union sockaddr_inany *src, + union inany_addr *dst) { - const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name; - const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name; + char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN]; + char cmsg[PKTINFO_SPACE]; + struct msghdr msg = { + .msg_name = src, + .msg_namelen = sizeof(*src), + .msg_control = cmsg, + .msg_controllen = sizeof(cmsg), + }; + int rc; + + rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); + if (rc < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + return -errno; + } - if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) - return ntohs(sa6->sin6_port); + udp_pktinfo(&msg, dst); - if (!v6 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr)) - return ntohs(sa4->sin_port); + trace("Peeked UDP datagram: %s -> %s", + sockaddr_ntop(src, sastr, sizeof(sastr)), + inany_ntop(dst, dstr, sizeof(dstr))); - return -1; + return 1; } /** - * udp_splice_sendfrom() - Send datagrams from given port to given port + * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf - * @n: Number of datagrams to send - * @src: Datagrams will be sent from this port (on origin side) - * @dst: Datagrams will be send to this port (on destination side) - * @from_pif: pif from which the packet originated - * @v6: Send as IPv6? 
- * @allow_new: If true create sending socket if needed, if false discard - * if no sending socket is available - * @now: Timestamp + * @s: Socket to receive from + * @mmh: mmsghdr array to receive into + * @n: Maximum number of datagrams to receive + * + * Return: Number of datagrams received + * + * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64 */ -static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, - in_port_t src, in_port_t dst, uint8_t from_pif, - bool v6, bool allow_new, - const struct timespec *now) +static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n) { - struct mmsghdr *mmh_recv, *mmh_send; - unsigned int i; - int s; + ASSERT(!c->no_udp); - if (v6) { - mmh_recv = udp6_l2_mh_sock; - mmh_send = udp6_mh_splice; - } else { - mmh_recv = udp4_l2_mh_sock; - mmh_send = udp4_mh_splice; + n = recvmmsg(s, mmh, n, 0, NULL); + if (n < 0) { + trace("Error receiving datagrams: %s", strerror_(errno)); + /* Bail out and let the EPOLLERR handler deal with it */ + return 0; } - if (from_pif == PIF_SPLICE) { - src += c->udp.fwd_in.rdelta[src]; - s = udp_splice_init[v6][src].sock; - if (s < 0 && allow_new) - s = udp_splice_new(c, v6, src, false); + return n; +} - if (s < 0) - return; +/** + * udp_sock_to_sock() - Forward datagrams from socket to socket + * @c: Execution context + * @from_s: Socket to receive datagrams from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward datagrams to + * + * #syscalls sendmmsg + */ +static void udp_sock_to_sock(const struct ctx *c, int from_s, int n, + flow_sidx_t tosidx) +{ + const struct flowside *toside = flowside_at_sidx(tosidx); + const struct udp_flow *uflow = udp_at_sidx(tosidx); + uint8_t topif = pif_at_sidx(tosidx); + int to_s = uflow->s[tosidx.sidei]; + socklen_t sl; + int i; - udp_splice_ns[v6][dst].ts = now->tv_sec; - udp_splice_init[v6][src].ts = now->tv_sec; - } else { - ASSERT(from_pif == PIF_HOST); - src += 
c->udp.fwd_out.rdelta[src]; - s = udp_splice_ns[v6][src].sock; - if (s < 0 && allow_new) { - struct udp_splice_new_ns_arg arg = { - c, v6, src, -1, - }; - - NS_CALL(udp_splice_new_ns, &arg); - s = arg.s; - } - if (s < 0) - return; + if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0) + return; - udp_splice_init[v6][dst].ts = now->tv_sec; - udp_splice_ns[v6][src].ts = now->tv_sec; + for (i = 0; i < n; i++) { + udp_mh_splice[i].msg_hdr.msg_iov->iov_len + = udp_mh_recv[i].msg_len; } - for (i = start; i < start + n; i++) - mmh_send[i].msg_hdr.msg_iov->iov_len = mmh_recv[i].msg_len; + pif_sockaddr(c, &udp_splice_to, &sl, topif, + &toside->eaddr, toside->eport); - sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL); + sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL); } /** - * udp_update_hdr4() - Update headers for one IPv4 datagram + * udp_buf_sock_to_tap() - Forward datagrams from socket to tap * @c: Execution context - * @b: Pointer to udp4_l2_buf to update - * @dstport: Destination port number - * @datalen: Length of UDP payload - * @now: Current timestamp - * - * Return: size of tap frame with headers + * @s: Socket to read data from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward data from @s to */ -static size_t udp_update_hdr4(const struct ctx *c, struct udp4_l2_buf_t *b, - in_port_t dstport, size_t datalen, - const struct timespec *now) +static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n, + flow_sidx_t tosidx) { - size_t ip_len = datalen + sizeof(b->iph) + sizeof(b->uh); - in_port_t srcport = ntohs(b->s_in.sin_port); - struct in_addr src = b->s_in.sin_addr; - - if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) && - IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 && - (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) { - src = c->ip4.dns_match; - } else if (IN4_IS_ADDR_LOOPBACK(&src) || - IN4_ARE_ADDR_EQUAL(&src, &c->ip4.addr_seen)) { - udp_tap_map[V4][srcport].ts = now->tv_sec; - 
udp_tap_map[V4][srcport].flags |= PORT_LOCAL; - - if (IN4_IS_ADDR_LOOPBACK(&src)) - udp_tap_map[V4][srcport].flags |= PORT_LOOPBACK; - else - udp_tap_map[V4][srcport].flags &= ~PORT_LOOPBACK; - - bitmap_set(udp_act[V4][UDP_ACT_TAP], srcport); - - src = c->ip4.gw; - } + const struct flowside *toside = flowside_at_sidx(tosidx); + int i; - b->iph.tot_len = htons(ip_len); - b->iph.daddr = c->ip4.addr_seen.s_addr; - b->iph.saddr = src.s_addr; - b->iph.check = csum_ip4_header(b->iph.tot_len, IPPROTO_UDP, - src, c->ip4.addr_seen); + if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0) + return; - b->uh.source = b->s_in.sin_port; - b->uh.dest = htons(dstport); - b->uh.len = htons(datalen + sizeof(b->uh)); + for (i = 0; i < n; i++) + udp_tap_prepare(udp_mh_recv, i, toside, false); - return tap_frame_len(c, &b->taph, ip_len); + tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n); } /** - * udp_update_hdr6() - Update headers for one IPv6 datagram + * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket * @c: Execution context - * @b: Pointer to udp6_l2_buf to update - * @dstport: Destination port number - * @datalen: Length of UDP payload + * @s: Socket to forward from + * @frompif: Interface to which @s belongs + * @port: Our (local) port number of @s * @now: Current timestamp - * - * Return: size of tap frame with headers */ -static size_t udp_update_hdr6(const struct ctx *c, struct udp6_l2_buf_t *b, - in_port_t dstport, size_t datalen, - const struct timespec *now) +void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, + in_port_t port, const struct timespec *now) { - const struct in6_addr *src = &b->s_in6.sin6_addr; - const struct in6_addr *dst = &c->ip6.addr_seen; - uint16_t payload_len = datalen + sizeof(b->uh); - in_port_t srcport = ntohs(b->s_in6.sin6_port); - - if (IN6_IS_ADDR_LINKLOCAL(src)) { - dst = &c->ip6.addr_ll_seen; - } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match) && - IN6_ARE_ADDR_EQUAL(src, &c->ip6.dns_host) && - srcport 
== 53 && - (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) { - src = &c->ip6.dns_match; - } else if (IN6_IS_ADDR_LOOPBACK(src) || - IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr_seen) || - IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) { - udp_tap_map[V6][srcport].ts = now->tv_sec; - udp_tap_map[V6][srcport].flags |= PORT_LOCAL; - - if (IN6_IS_ADDR_LOOPBACK(src)) - udp_tap_map[V6][srcport].flags |= PORT_LOOPBACK; - else - udp_tap_map[V6][srcport].flags &= ~PORT_LOOPBACK; - - if (IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) - udp_tap_map[V6][srcport].flags |= PORT_GUA; - else - udp_tap_map[V6][srcport].flags &= ~PORT_GUA; - - bitmap_set(udp_act[V6][UDP_ACT_TAP], srcport); - - dst = &c->ip6.addr_ll_seen; - - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - src = &c->ip6.gw; - else - src = &c->ip6.addr_ll; + union sockaddr_inany src; + union inany_addr dst; + int rc; + + while ((rc = udp_peek_addr(s, &src, &dst)) != 0) { + bool discard = false; + flow_sidx_t tosidx; + uint8_t topif; + + if (rc < 0) { + trace("Error peeking at socket address: %s", + strerror_(-rc)); + /* Clear errors & carry on */ + if (udp_sock_errs(c, s, FLOW_SIDX_NONE, + frompif, port) < 0) { + err( +"UDP: Unrecoverable error on listening socket: (%s port %hu)", + pif_name(frompif), port); + /* FIXME: what now? close/re-open socket? 
*/ + } + continue; + } - } + tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now); + topif = pif_at_sidx(tosidx); - b->ip6h.payload_len = htons(payload_len); - b->ip6h.daddr = *dst; - b->ip6h.saddr = *src; - b->ip6h.version = 6; - b->ip6h.nexthdr = IPPROTO_UDP; - b->ip6h.hop_limit = 255; + if (pif_is_socket(topif)) { + udp_sock_to_sock(c, s, 1, tosidx); + } else if (topif == PIF_TAP) { + if (c->mode == MODE_VU) + udp_vu_sock_to_tap(c, s, 1, tosidx); + else + udp_buf_sock_to_tap(c, s, 1, tosidx); + } else if (flow_sidx_valid(tosidx)) { + struct udp_flow *uflow = udp_at_sidx(tosidx); + + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(frompif), pif_name(topif)); + discard = true; + } else { + debug("Discarding datagram without flow"); + discard = true; + } - b->uh.source = b->s_in6.sin6_port; - b->uh.dest = htons(dstport); - b->uh.len = b->ip6h.payload_len; - csum_udp6(&b->uh, src, dst, b->data, datalen); + if (discard) { + struct msghdr msg = { 0 }; - return tap_frame_len(c, &b->taph, payload_len + sizeof(b->ip6h)); + if (recvmsg(s, &msg, MSG_DONTWAIT) < 0) + debug_perror("Failed to discard datagram"); + } + } } /** - * udp_tap_send() - Prepare UDP datagrams and send to tap interface + * udp_listen_sock_handler() - Handle new data from socket * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf pool - * @n: Number of datagrams to send - * @dstport: Destination port number - * @v6: True if using IPv6 + * @ref: epoll reference + * @events: epoll events bitmap * @now: Current timestamp - * - * Return: size of tap frame with headers */ -static void udp_tap_send(const struct ctx *c, - unsigned int start, unsigned int n, - in_port_t dstport, bool v6, const struct timespec *now) +void udp_listen_sock_handler(const struct ctx *c, + union epoll_ref ref, uint32_t events, + const struct timespec *now) { - struct iovec *tap_iov; - unsigned int i; - - if (v6) - tap_iov = udp6_l2_iov_tap; - else - tap_iov = 
udp4_l2_iov_tap; - - for (i = start; i < start + n; i++) { - size_t buf_len; - - if (v6) - buf_len = udp_update_hdr6(c, &udp6_l2_buf[i], dstport, - udp6_l2_mh_sock[i].msg_len, now); - else - buf_len = udp_update_hdr4(c, &udp4_l2_buf[i], dstport, - udp4_l2_mh_sock[i].msg_len, now); - - tap_iov[i].iov_len = buf_len; - } - - tap_send_frames(c, tap_iov + start, 1, n); + if (events & (EPOLLERR | EPOLLIN)) + udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now); } /** - * udp_sock_handler() - Handle new data from socket + * udp_sock_handler() - Handle new data from flow specific socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap * @now: Current timestamp - * - * #syscalls recvmmsg */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, - const struct timespec *now) +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) { - /* For not entirely clear reasons (data locality?) pasta gets - * better throughput if we receive tap datagrams one at a - * atime. For small splice datagrams throughput is slightly - * better if we do batch, but it's slightly worse for large - * splice datagrams. Since we don't know before we receive - * whether we'll use tap or splice, always go one at a time - * for pasta mode. - */ - ssize_t n = (c->mode == MODE_PASST ? 
UDP_MAX_FRAMES : 1); - in_port_t dstport = ref.udp.port; - bool v6 = ref.udp.v6; - struct mmsghdr *mmh_recv; - int i, m; - - if (c->no_udp || !(events & EPOLLIN)) - return; + struct udp_flow *uflow = udp_at_sidx(ref.flowside); - if (ref.udp.pif == PIF_SPLICE) - dstport += c->udp.fwd_out.f.delta[dstport]; - else if (ref.udp.pif == PIF_HOST) - dstport += c->udp.fwd_in.f.delta[dstport]; + ASSERT(!c->no_udp && uflow); - if (v6) { - mmh_recv = udp6_l2_mh_sock; - udp6_localname.sin6_port = htons(dstport); - } else { - mmh_recv = udp4_l2_mh_sock; - udp4_localname.sin_port = htons(dstport); + if (events & EPOLLERR) { + if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) { + flow_err(uflow, "Unrecoverable error on flow socket"); + goto fail; + } } - n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL); - if (n <= 0) - return; - - for (i = 0; i < n; i += m) { - int splicefrom = -1; - m = n; - - if (ref.udp.splice) { - splicefrom = udp_mmh_splice_port(v6, mmh_recv + i); - - for (m = 1; i + m < n; m++) { - int p; - - p = udp_mmh_splice_port(v6, mmh_recv + i + m); - if (p != splicefrom) - break; + if (events & EPOLLIN) { + /* For not entirely clear reasons (data locality?) pasta gets + * better throughput if we receive tap datagrams one at a + * time. For small splice datagrams throughput is slightly + * better if we do batch, but it's slightly worse for large + * splice datagrams. Since we don't know the size before we + * receive, always go one at a time for pasta mode. + */ + size_t n = (c->mode == MODE_PASTA ? 
1 : UDP_MAX_FRAMES); + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); + uint8_t topif = pif_at_sidx(tosidx); + int s = ref.fd; + + flow_trace(uflow, "Received data on reply socket"); + uflow->ts = now->tv_sec; + + if (pif_is_socket(topif)) { + udp_sock_to_sock(c, ref.fd, n, tosidx); + } else if (topif == PIF_TAP) { + if (c->mode == MODE_VU) { + udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES, + tosidx); + } else { + udp_buf_sock_to_tap(c, s, n, tosidx); } + } else { + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(pif_at_sidx(ref.flowside)), + pif_name(topif)); + goto fail; } - - if (splicefrom >= 0) - udp_splice_sendfrom(c, i, m, splicefrom, dstport, - ref.udp.pif, v6, ref.udp.orig, now); - else - udp_tap_send(c, i, m, dstport, v6, now); } + return; + +fail: + flow_err_details(uflow); + udp_flow_close(c, uflow); } /** @@ -795,6 +960,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address + * @ttl: TTL or hop limit for packets to be sent in this call * @p: Pool of UDP packets, with UDP headers * @idx: Index of first packet to process * @now: Current timestamp @@ -803,23 +969,24 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, * * #syscalls sendmmsg */ -int udp_tap_handler(struct ctx *c, uint8_t pif, +int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, - const struct pool *p, int idx, const struct timespec *now) + uint8_t ttl, const struct pool *p, int idx, + const struct timespec *now) { + const struct flowside *toside; struct mmsghdr mm[UIO_MAXIOV]; + union sockaddr_inany to_sa; struct iovec m[UIO_MAXIOV]; - struct sockaddr_in6 s_in6; - struct sockaddr_in s_in; const struct udphdr *uh; - struct sockaddr *sa; + struct udp_flow *uflow; int i, s, count = 0; + flow_sidx_t tosidx; in_port_t src, dst; + uint8_t topif; 
socklen_t sl; - (void)c; - (void)saddr; - (void)pif; + ASSERT(!c->no_udp); uh = packet_get(p, idx, 0, sizeof(*uh), NULL); if (!uh) @@ -831,113 +998,32 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, src = ntohs(uh->source); dst = ntohs(uh->dest); - if (af == AF_INET) { - s_in = (struct sockaddr_in) { - .sin_family = AF_INET, - .sin_port = uh->dest, - .sin_addr = *(struct in_addr *)daddr, - }; - - sa = (struct sockaddr *)&s_in; - sl = sizeof(s_in); - - if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.dns_match) && - ntohs(s_in.sin_port) == 53) { - s_in.sin_addr = c->ip4.dns_host; - udp_tap_map[V4][src].ts = now->tv_sec; - udp_tap_map[V4][src].flags |= PORT_DNS_FWD; - bitmap_set(udp_act[V4][UDP_ACT_TAP], src); - } else if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.gw) && - !c->no_map_gw) { - if (!(udp_tap_map[V4][dst].flags & PORT_LOCAL) || - (udp_tap_map[V4][dst].flags & PORT_LOOPBACK)) - s_in.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - else - s_in.sin_addr = c->ip4.addr_seen; - } - - debug("UDP from tap src=%hu dst=%hu, s=%d", - src, dst, udp_tap_map[V4][src].sock); - if ((s = udp_tap_map[V4][src].sock) < 0) { - struct in_addr bind_addr = IN4ADDR_ANY_INIT; - union udp_epoll_ref uref = { - .port = src, - .pif = PIF_HOST, - }; - const char *bind_if = NULL; - - if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr)) - bind_if = c->ip4.ifname_out; - - if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr)) - bind_addr = c->ip4.addr_out; - - s = sock_l4(c, AF_INET, IPPROTO_UDP, &bind_addr, - bind_if, src, uref.u32); - if (s < 0) - return p->count - idx; - - udp_tap_map[V4][src].sock = s; - bitmap_set(udp_act[V4][UDP_ACT_TAP], src); - } + tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst, now); + if (!(uflow = udp_at_sidx(tosidx))) { + char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN]; - udp_tap_map[V4][src].ts = now->tv_sec; - } else { - s_in6 = (struct sockaddr_in6) { - .sin6_family = AF_INET6, - .sin6_port = uh->dest, - .sin6_addr = *(struct in6_addr *)daddr, - }; - const 
struct in6_addr *bind_addr = &in6addr_any; - - sa = (struct sockaddr *)&s_in6; - sl = sizeof(s_in6); - - if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.dns_match) && - ntohs(s_in6.sin6_port) == 53) { - s_in6.sin6_addr = c->ip6.dns_host; - udp_tap_map[V6][src].ts = now->tv_sec; - udp_tap_map[V6][src].flags |= PORT_DNS_FWD; - bitmap_set(udp_act[V6][UDP_ACT_TAP], src); - } else if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw) && - !c->no_map_gw) { - if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) || - (udp_tap_map[V6][dst].flags & PORT_LOOPBACK)) - s_in6.sin6_addr = in6addr_loopback; - else if (udp_tap_map[V6][dst].flags & PORT_GUA) - s_in6.sin6_addr = c->ip6.addr; - else - s_in6.sin6_addr = c->ip6.addr_seen; - } else if (IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr)) { - bind_addr = &c->ip6.addr_ll; - } - - if ((s = udp_tap_map[V6][src].sock) < 0) { - union udp_epoll_ref uref = { - .v6 = 1, - .port = src, - .pif = PIF_HOST, - }; - const char *bind_if = NULL; - - if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr)) - bind_if = c->ip6.ifname_out; + debug("Dropping datagram with no flow %s %s:%hu -> %s:%hu", + pif_name(pif), + inet_ntop(af, saddr, sstr, sizeof(sstr)), src, + inet_ntop(af, daddr, dstr, sizeof(dstr)), dst); + return 1; + } - if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr) && - !IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr)) - bind_addr = &c->ip6.addr_out; + topif = pif_at_sidx(tosidx); + if (topif != PIF_HOST) { + flow_sidx_t fromsidx = flow_sidx_opposite(tosidx); + uint8_t frompif = pif_at_sidx(fromsidx); - s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, - bind_if, src, uref.u32); - if (s < 0) - return p->count - idx; + flow_err(uflow, "No support for forwarding UDP from %s to %s", + pif_name(frompif), pif_name(topif)); + return 1; + } + toside = flowside_at_sidx(tosidx); - udp_tap_map[V6][src].sock = s; - bitmap_set(udp_act[V6][UDP_ACT_TAP], src); - } + s = uflow->s[tosidx.sidei]; + ASSERT(s >= 0); - udp_tap_map[V6][src].ts = now->tv_sec; - } + pif_sockaddr(c, &to_sa, &sl, topif, 
&toside->eaddr, toside->eport); for (i = 0; i < (int)p->count - idx; i++) { struct udphdr *uh_send; @@ -947,7 +1033,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, if (!uh_send) return p->count - idx; - mm[i].msg_hdr.msg_name = sa; + mm[i].msg_hdr.msg_name = &to_sa; mm[i].msg_hdr.msg_namelen = sl; if (len) { @@ -965,6 +1051,24 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, mm[i].msg_hdr.msg_controllen = 0; mm[i].msg_hdr.msg_flags = 0; + if (ttl != uflow->ttl[tosidx.sidei]) { + uflow->ttl[tosidx.sidei] = ttl; + if (af == AF_INET) { + if (setsockopt(s, IPPROTO_IP, IP_TTL, + &ttl, sizeof(ttl)) < 0) + flow_perror(uflow, + "setsockopt IP_TTL"); + } else { + /* IPv6 hop_limit cannot be only 1 byte */ + int hop_limit = ttl; + + if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS, + &hop_limit, sizeof(hop_limit)) < 0) + flow_perror(uflow, + "setsockopt IPV6_UNICAST_HOPS"); + } + } + count++; } @@ -979,56 +1083,62 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, * udp_sock_init() - Initialise listening sockets for a given port * @c: Execution context * @ns: In pasta mode, if set, bind with loopback address in namespace - * @af: Address family to select a specific IP version, or AF_UNSPEC * @addr: Pointer to address for binding, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured * @port: Port, host order * * Return: 0 on (partial) success, negative error code on (complete) failure */ -int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, - const void *addr, const char *ifname, in_port_t port) +int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, + const char *ifname, in_port_t port) { - union udp_epoll_ref uref = { .splice = (c->mode == MODE_PASTA), - .orig = true, .port = port }; - int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; + union udp_listen_epoll_ref uref = { + .pif = ns ? 
PIF_SPLICE : PIF_HOST, + .port = port, + }; + int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; + + ASSERT(!c->no_udp); - if (ns) - uref.pif = PIF_SPLICE; - else - uref.pif = PIF_HOST; + if (!addr && c->ifi4 && c->ifi6 && !ns) { + int s; - if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { - uref.v6 = 0; + /* Attempt to get a dual stack socket */ + s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, + NULL, ifname, port, uref.u32); + udp_splice_init[V4][port] = s < 0 ? -1 : s; + udp_splice_init[V6][port] = s < 0 ? -1 : s; + if (IN_INTERVAL(0, FD_REF_MAX, s)) + return 0; + } + if ((!addr || inany_v4(addr)) && c->ifi4) { if (!ns) { - r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, - ifname, port, uref.u32); + r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, + addr ? addr : &inany_any4, ifname, + port, uref.u32); - udp_tap_map[V4][uref.port].sock = s < 0 ? -1 : s; - udp_splice_init[V4][port].sock = s < 0 ? -1 : s; + udp_splice_init[V4][port] = r4 < 0 ? -1 : r4; } else { - r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, - &in4addr_loopback, - ifname, port, uref.u32); - udp_splice_ns[V4][port].sock = s < 0 ? -1 : s; + r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE, + &inany_loopback4, ifname, + port, uref.u32); + udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4; } } - if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { - uref.v6 = 1; - + if ((!addr || !inany_v4(addr)) && c->ifi6) { if (!ns) { - r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, - ifname, port, uref.u32); + r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, + addr ? addr : &inany_any6, ifname, + port, uref.u32); - udp_tap_map[V6][uref.port].sock = s < 0 ? -1 : s; - udp_splice_init[V6][port].sock = s < 0 ? -1 : s; + udp_splice_init[V6][port] = r6 < 0 ? -1 : r6; } else { - r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, - &in6addr_loopback, - ifname, port, uref.u32); - udp_splice_ns[V6][port].sock = s < 0 ? 
-1 : s; + r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE, + &inany_loopback6, ifname, + port, uref.u32); + udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6; } } @@ -1046,73 +1156,15 @@ static void udp_splice_iov_init(void) int i; for (i = 0; i < UDP_MAX_FRAMES; i++) { - struct msghdr *mh4 = &udp4_mh_splice[i].msg_hdr; - struct msghdr *mh6 = &udp6_mh_splice[i].msg_hdr; + struct msghdr *mh = &udp_mh_splice[i].msg_hdr; - mh4->msg_name = &udp4_localname; - mh4->msg_namelen = sizeof(udp4_localname); + mh->msg_name = &udp_splice_to; + mh->msg_namelen = sizeof(udp_splice_to); - mh6->msg_name = &udp6_localname; - mh6->msg_namelen = sizeof(udp6_localname); + udp_iov_splice[i].iov_base = udp_payload[i].data; - udp4_iov_splice[i].iov_base = udp4_l2_buf[i].data; - udp6_iov_splice[i].iov_base = udp6_l2_buf[i].data; - - mh4->msg_iov = &udp4_iov_splice[i]; - mh6->msg_iov = &udp6_iov_splice[i]; - mh4->msg_iovlen = mh6->msg_iovlen = 1; - } -} - -/** - * udp_timer_one() - Handler for timed events on one port - * @c: Execution context - * @v6: Set for IPv6 connections - * @type: Socket type - * @port: Port number, host order - * @now: Current timestamp - */ -static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, - in_port_t port, const struct timespec *now) -{ - struct udp_splice_port *sp; - struct udp_tap_port *tp; - int *sockp = NULL; - - switch (type) { - case UDP_ACT_TAP: - tp = &udp_tap_map[v6 ? V6 : V4][port]; - - if (now->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { - sockp = &tp->sock; - tp->flags = 0; - } - - break; - case UDP_ACT_SPLICE_INIT: - sp = &udp_splice_init[v6 ? V6 : V4][port]; - - if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) - sockp = &sp->sock; - - break; - case UDP_ACT_SPLICE_NS: - sp = &udp_splice_ns[v6 ? 
V6 : V4][port]; - - if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) - sockp = &sp->sock; - - break; - default: - return; - } - - if (sockp && *sockp >= 0) { - int s = *sockp; - *sockp = -1; - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); - close(s); - bitmap_clear(udp_act[v6 ? V6 : V4][type], port); + mh->msg_iov = &udp_iov_splice[i]; + mh->msg_iovlen = 1; } } @@ -1125,24 +1177,23 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, */ static void udp_port_rebind(struct ctx *c, bool outbound) { + int (*socks)[NUM_PORTS] = outbound ? udp_splice_ns : udp_splice_init; const uint8_t *fmap - = outbound ? c->udp.fwd_out.f.map : c->udp.fwd_in.f.map; + = outbound ? c->udp.fwd_out.map : c->udp.fwd_in.map; const uint8_t *rmap - = outbound ? c->udp.fwd_in.f.map : c->udp.fwd_out.f.map; - struct udp_splice_port (*socks)[NUM_PORTS] - = outbound ? udp_splice_ns : udp_splice_init; + = outbound ? c->udp.fwd_in.map : c->udp.fwd_out.map; unsigned port; for (port = 0; port < NUM_PORTS; port++) { if (!bitmap_isset(fmap, port)) { - if (socks[V4][port].sock >= 0) { - close(socks[V4][port].sock); - socks[V4][port].sock = -1; + if (socks[V4][port] >= 0) { + close(socks[V4][port]); + socks[V4][port] = -1; } - if (socks[V6][port].sock >= 0) { - close(socks[V6][port].sock); - socks[V6][port].sock = -1; + if (socks[V6][port] >= 0) { + close(socks[V6][port]); + socks[V6][port] = -1; } continue; @@ -1152,9 +1203,9 @@ static void udp_port_rebind(struct ctx *c, bool outbound) if (bitmap_isset(rmap, port)) continue; - if ((c->ifi4 && socks[V4][port].sock == -1) || - (c->ifi6 && socks[V6][port].sock == -1)) - udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port); + if ((c->ifi4 && socks[V4][port] == -1) || + (c->ifi6 && socks[V6][port] == -1)) + udp_sock_init(c, outbound, NULL, NULL, port); } } @@ -1183,43 +1234,23 @@ static int udp_port_rebind_outbound(void *arg) */ void udp_timer(struct ctx *c, const struct timespec *now) { - int n, t, v6 = 0; - unsigned int i; - long *word, 
tmp; + (void)now; + + ASSERT(!c->no_udp); if (c->mode == MODE_PASTA) { - if (c->udp.fwd_out.f.mode == FWD_AUTO) { - fwd_scan_ports_udp(&c->udp.fwd_out.f, &c->udp.fwd_in.f, + if (c->udp.fwd_out.mode == FWD_AUTO) { + fwd_scan_ports_udp(&c->udp.fwd_out, &c->udp.fwd_in, &c->tcp.fwd_out, &c->tcp.fwd_in); NS_CALL(udp_port_rebind_outbound, c); } - if (c->udp.fwd_in.f.mode == FWD_AUTO) { - fwd_scan_ports_udp(&c->udp.fwd_in.f, &c->udp.fwd_out.f, + if (c->udp.fwd_in.mode == FWD_AUTO) { + fwd_scan_ports_udp(&c->udp.fwd_in, &c->udp.fwd_out, &c->tcp.fwd_in, &c->tcp.fwd_out); udp_port_rebind(c, false); } } - - if (!c->ifi4) - v6 = 1; -v6: - for (t = 0; t < UDP_ACT_TYPE_MAX; t++) { - word = (long *)udp_act[v6 ? V6 : V4][t]; - for (i = 0; i < ARRAY_SIZE(udp_act[0][0]); - i += sizeof(long), word++) { - tmp = *word; - while ((n = ffsl(tmp))) { - tmp &= ~(1UL << (n - 1)); - udp_timer_one(c, v6, t, i * 8 + n - 1, now); - } - } - } - - if (!v6 && c->ifi6) { - v6 = 1; - goto v6; - } } /** @@ -1230,10 +1261,9 @@ v6: */ int udp_init(struct ctx *c) { - udp_sock_iov_init(c); + ASSERT(!c->no_udp); - udp_invert_portmap(&c->udp.fwd_in); - udp_invert_portmap(&c->udp.fwd_out); + udp_iov_init(c); if (c->mode == MODE_PASTA) { udp_splice_iov_init(); @@ -9,58 +9,44 @@ #define UDP_TIMER_INTERVAL 1000 /* ms */ void udp_portmap_clear(void); -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, - const struct timespec *now); -int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, - const void *saddr, const void *daddr, - const struct pool *p, int idx, const struct timespec *now); -int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, - const void *addr, const char *ifname, in_port_t port); +void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); +int udp_tap_handler(const struct ctx 
*c, uint8_t pif, + sa_family_t af, const void *saddr, const void *daddr, + uint8_t ttl, const struct pool *p, int idx, + const struct timespec *now); +int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, + const char *ifname, in_port_t port); int udp_init(struct ctx *c); void udp_timer(struct ctx *c, const struct timespec *now); void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); /** - * union udp_epoll_ref - epoll reference portion for TCP connections + * union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets * @port: Source port for connected sockets, bound port otherwise * @pif: pif for this socket - * @bound: Set if this file descriptor is a bound socket - * @splice: Set if descriptor packets to be "spliced" - * @orig: Set if a spliced socket which can originate "connections" - * @v6: Set for IPv6 sockets or connections * @u32: Opaque u32 value of reference */ -union udp_epoll_ref { +union udp_listen_epoll_ref { struct { in_port_t port; uint8_t pif; - bool splice:1, - orig:1, - v6:1; }; uint32_t u32; }; /** - * udp_fwd_ports - UDP specific port forwarding configuration - * @f: Generic forwarding configuration - * @rdelta: Reversed delta map to translate source ports on return packets - */ -struct udp_fwd_ports { - struct fwd_ports f; - in_port_t rdelta[NUM_PORTS]; -}; - -/** * struct udp_ctx - Execution context for UDP * @fwd_in: Port forwarding configuration for inbound packets * @fwd_out: Port forwarding configuration for outbound packets * @timer_run: Timestamp of most recent timer run */ struct udp_ctx { - struct udp_fwd_ports fwd_in; - struct udp_fwd_ports fwd_out; + struct fwd_ports fwd_in; + struct fwd_ports fwd_out; struct timespec timer_run; }; diff --git a/udp_flow.c b/udp_flow.c new file mode 100644 index 0000000..cef3fb5 --- /dev/null +++ b/udp_flow.c @@ -0,0 +1,362 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson 
<david@gibson.dropbear.id.au> + * + * UDP flow tracking functions + */ + +#include <errno.h> +#include <fcntl.h> +#include <sys/uio.h> +#include <unistd.h> +#include <netinet/udp.h> + +#include "util.h" +#include "passt.h" +#include "flow_table.h" +#include "udp_internal.h" + +#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ + +/** + * udp_at_sidx() - Get UDP specific flow at given sidx + * @sidx: Flow and side to retrieve + * + * Return: UDP specific flow at @sidx, or NULL if @sidx is invalid. Asserts if + * the flow at @sidx is not FLOW_UDP. + */ +struct udp_flow *udp_at_sidx(flow_sidx_t sidx) +{ + union flow *flow = flow_at_sidx(sidx); + + if (!flow) + return NULL; + + ASSERT(flow->f.type == FLOW_UDP); + return &flow->udp; +} + +/** + * udp_flow_close() - Close and clean up UDP flow + * @c: Execution context + * @uflow: UDP flow + */ +void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) +{ + unsigned sidei; + + if (uflow->closed) + return; /* Nothing to do */ + + flow_foreach_sidei(sidei) { + flow_hash_remove(c, FLOW_SIDX(uflow, sidei)); + if (uflow->s[sidei] >= 0) { + epoll_del(c, uflow->s[sidei]); + close(uflow->s[sidei]); + uflow->s[sidei] = -1; + } + } + + uflow->closed = true; +} + +/** + * udp_flow_sock() - Create, bind and connect a flow specific UDP socket + * @c: Execution context + * @uflow: UDP flow to open socket for + * @sidei: Side of @uflow to open socket for + * + * Return: fd of new socket on success, -ve error code on failure + */ +static int udp_flow_sock(const struct ctx *c, + struct udp_flow *uflow, unsigned sidei) +{ + const struct flowside *side = &uflow->f.side[sidei]; + uint8_t pif = uflow->f.pif[sidei]; + union { + flow_sidx_t sidx; + uint32_t data; + } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; + int s; + + s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side, fref.data); + if (s < 0) { + flow_dbg_perror(uflow, "Couldn't open flow specific socket"); + return s; + } + + if (flowside_connect(c, s, pif, 
side) < 0) { + int rc = -errno; + + epoll_del(c, s); + close(s); + + flow_dbg_perror(uflow, "Couldn't connect flow socket"); + return rc; + } + + /* It's possible, if unlikely, that we could receive some packets in + * between the bind() and connect() which may or may not be for this + * flow. Being UDP we could just discard them, but it's not ideal. + * + * There's also a tricky case if a bunch of datagrams for a new flow + * arrive in rapid succession, the first going to the original listening + * socket and later ones going to this new socket. If we forwarded the + * datagrams from the new socket immediately here they would go before + * the datagram which established the flow. Again, not strictly wrong + * for UDP, but not ideal. + * + * So, we flag that the new socket is in a transient state where it + * might have datagrams for a different flow queued. Before the next + * epoll cycle, udp_flow_defer() will flush out any such datagrams, and + * thereafter everything on the new socket should be strictly for this + * flow. + */ + if (sidei) + uflow->flush1 = true; + else + uflow->flush0 = true; + + return s; +} + +/** + * udp_flow_new() - Common setup for a new UDP flow + * @c: Execution context + * @flow: Initiated flow + * @now: Timestamp + * + * Return: sidx for the target side of the new UDP flow, or FLOW_SIDX_NONE + * on failure. 
+ * + * #syscalls getsockname + */ +static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, + const struct timespec *now) +{ + struct udp_flow *uflow = NULL; + const struct flowside *tgt; + unsigned sidei; + + if (!(tgt = flow_target(c, flow, IPPROTO_UDP))) + goto cancel; + + uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); + uflow->ts = now->tv_sec; + uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1; + uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0; + + flow_foreach_sidei(sidei) { + if (pif_is_socket(uflow->f.pif[sidei])) + if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0) + goto cancel; + } + + if (uflow->s[TGTSIDE] >= 0 && inany_is_unspecified(&tgt->oaddr)) { + /* When we target a socket, we connect() it, but might not + * always bind(), leaving the kernel to pick our address. In + * that case connect() will implicitly bind() the socket, but we + * need to determine its local address so that we can match + * reply packets back to the correct flow. Update the flow with + * the information from getsockname() */ + union sockaddr_inany sa; + socklen_t sl = sizeof(sa); + in_port_t port; + + if (getsockname(uflow->s[TGTSIDE], &sa.sa, &sl) < 0 || + inany_from_sockaddr(&uflow->f.side[TGTSIDE].oaddr, + &port, &sa) < 0) { + flow_perror(uflow, "Unable to determine local address"); + goto cancel; + } + if (port != tgt->oport) { + flow_err(uflow, "Unexpected local port"); + goto cancel; + } + } + + /* Tap sides always need to be looked up by hash. Socket sides don't + * always, but sometimes do (receiving packets on a socket not specific + * to one flow). 
Unconditionally hash both sides so all our bases are + * covered + */ + flow_foreach_sidei(sidei) + flow_hash_insert(c, FLOW_SIDX(uflow, sidei)); + + FLOW_ACTIVATE(uflow); + + return FLOW_SIDX(uflow, TGTSIDE); + +cancel: + if (uflow) + udp_flow_close(c, uflow); + flow_alloc_cancel(flow); + return FLOW_SIDX_NONE; +} + +/** + * udp_flow_from_sock() - Find or create UDP flow for incoming datagram + * @c: Execution context + * @pif: Interface the datagram is arriving from + * @dst: Our (local) address to which the datagram is arriving + * @port: Our (local) port number to which the datagram is arriving + * @s_in: Source socket address, filled in by recvmmsg() + * @now: Timestamp + * + * #syscalls fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64 + * + * Return: sidx for the destination side of the flow for this packet, or + * FLOW_SIDX_NONE if we couldn't find or create a flow. + */ +flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, + const union inany_addr *dst, in_port_t port, + const union sockaddr_inany *s_in, + const struct timespec *now) +{ + const struct flowside *ini; + struct udp_flow *uflow; + union flow *flow; + flow_sidx_t sidx; + + sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port); + if ((uflow = udp_at_sidx(sidx))) { + uflow->ts = now->tv_sec; + return flow_sidx_opposite(sidx); + } + + if (!(flow = flow_alloc())) { + char sastr[SOCKADDR_STRLEN]; + + debug("Couldn't allocate flow for UDP datagram from %s %s", + pif_name(pif), sockaddr_ntop(s_in, sastr, sizeof(sastr))); + return FLOW_SIDX_NONE; + } + + ini = flow_initiate_sa(flow, pif, s_in, dst, port); + + if (!inany_is_unicast(&ini->eaddr) || + ini->eport == 0 || ini->oport == 0) { + /* In principle ini->oaddr also must be specified, but when we've + * been initiated from a socket bound to 0.0.0.0 or ::, we don't + * know our address, so we have to leave it unpopulated. 
+ */ + flow_err(flow, "Invalid endpoint on UDP recvfrom()"); + flow_alloc_cancel(flow); + return FLOW_SIDX_NONE; + } + + return udp_flow_new(c, flow, now); +} + +/** + * udp_flow_from_tap() - Find or create UDP flow for tap packets + * @c: Execution context + * @pif: pif on which the packet is arriving + * @af: Address family, AF_INET or AF_INET6 + * @saddr: Source address on guest side + * @daddr: Destination address guest side + * @srcport: Source port on guest side + * @dstport: Destination port on guest side + * + * Return: sidx for the destination side of the flow for this packet, or + * FLOW_SIDX_NONE if we couldn't find or create a flow. + */ +flow_sidx_t udp_flow_from_tap(const struct ctx *c, + uint8_t pif, sa_family_t af, + const void *saddr, const void *daddr, + in_port_t srcport, in_port_t dstport, + const struct timespec *now) +{ + const struct flowside *ini; + struct udp_flow *uflow; + union flow *flow; + flow_sidx_t sidx; + + ASSERT(pif == PIF_TAP); + + sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr, + srcport, dstport); + if ((uflow = udp_at_sidx(sidx))) { + uflow->ts = now->tv_sec; + return flow_sidx_opposite(sidx); + } + + if (!(flow = flow_alloc())) { + char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN]; + + debug("Couldn't allocate flow for UDP datagram from %s %s:%hu -> %s:%hu", + pif_name(pif), + inet_ntop(af, saddr, sstr, sizeof(sstr)), srcport, + inet_ntop(af, daddr, dstr, sizeof(dstr)), dstport); + return FLOW_SIDX_NONE; + } + + ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, + daddr, dstport); + + if (inany_is_unspecified(&ini->eaddr) || ini->eport == 0 || + inany_is_unspecified(&ini->oaddr) || ini->oport == 0) { + flow_dbg(flow, "Invalid endpoint on UDP packet"); + flow_alloc_cancel(flow); + return FLOW_SIDX_NONE; + } + + return udp_flow_new(c, flow, now); +} + +/** + * udp_flush_flow() - Flush datagrams that might not be for this flow + * @c: Execution context + * @uflow: Flow to handle + * @sidei: Side of the flow 
to flush + * @now: Current timestamp + */ +static void udp_flush_flow(const struct ctx *c, + const struct udp_flow *uflow, unsigned sidei, + const struct timespec *now) +{ + /* We don't know exactly where the datagrams will come from, but we know + * they'll have an interface and oport matching this flow */ + udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei], + uflow->f.side[sidei].oport, now); +} + +/** + * udp_flow_defer() - Deferred per-flow handling (clean up aborted flows) + * @c: Execution context + * @uflow: Flow to handle + * @now: Current timestamp + * + * Return: true if the connection is ready to free, false otherwise + */ +bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now) +{ + if (uflow->flush0) { + udp_flush_flow(c, uflow, INISIDE, now); + uflow->flush0 = false; + } + if (uflow->flush1) { + udp_flush_flow(c, uflow, TGTSIDE, now); + uflow->flush1 = false; + } + return uflow->closed; +} + +/** + * udp_flow_timer() - Handler for timed events related to a given flow + * @c: Execution context + * @uflow: UDP flow + * @now: Current timestamp + * + * Return: true if the flow is ready to free, false otherwise + */ +bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now) +{ + if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT) + return false; + + udp_flow_close(c, uflow); + return true; +} diff --git a/udp_flow.h b/udp_flow.h new file mode 100644 index 0000000..4c528e9 --- /dev/null +++ b/udp_flow.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson <david@gibson.dropbear.id.au> + * + * UDP flow tracking data structures + */ +#ifndef UDP_FLOW_H +#define UDP_FLOW_H + +/** + * struct udp_flow - Descriptor for a flow of UDP packets + * @f: Generic flow information + * @ttl: TTL or hop_limit for both sides + * @closed: Flow is already closed + * @flush0: @s[0] may have datagrams queued for other flows + * @flush1: @s[1] may 
have datagrams queued for other flows + * @ts: Activity timestamp + * @s: Socket fd (or -1) for each side of the flow + */ +struct udp_flow { + /* Must be first element */ + struct flow_common f; + + uint8_t ttl[SIDES]; + + bool closed :1, + flush0 :1, + flush1 :1; + + time_t ts; + int s[SIDES]; +}; + +struct udp_flow *udp_at_sidx(flow_sidx_t sidx); +flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, + const union inany_addr *dst, in_port_t port, + const union sockaddr_inany *s_in, + const struct timespec *now); +flow_sidx_t udp_flow_from_tap(const struct ctx *c, + uint8_t pif, sa_family_t af, + const void *saddr, const void *daddr, + in_port_t srcport, in_port_t dstport, + const struct timespec *now); +void udp_flow_close(const struct ctx *c, struct udp_flow *uflow); +bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now); +bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now); + +#endif /* UDP_FLOW_H */ diff --git a/udp_internal.h b/udp_internal.h new file mode 100644 index 0000000..96d11cf --- /dev/null +++ b/udp_internal.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef UDP_INTERNAL_H +#define UDP_INTERNAL_H + +#include "tap.h" /* needed by udp_meta_t */ + +/** + * struct udp_payload_t - UDP header and data for inbound messages + * @uh: UDP header + * @data: UDP data + */ +struct udp_payload_t { + struct udphdr uh; + char data[USHRT_MAX - sizeof(struct udphdr)]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + bool no_udp_csum); +size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + 
bool no_udp_csum); +void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, + in_port_t port, const struct timespec *now); + +#endif /* UDP_INTERNAL_H */ diff --git a/udp_vu.c b/udp_vu.c new file mode 100644 index 0000000..1f89509 --- /dev/null +++ b/udp_vu.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* udp_vu.c - UDP L2 vhost-user management functions + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#include <unistd.h> +#include <assert.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/udp.h> +#include <stdint.h> +#include <stddef.h> +#include <sys/uio.h> +#include <linux/virtio_net.h> + +#include "checksum.h" +#include "util.h" +#include "ip.h" +#include "siphash.h" +#include "inany.h" +#include "passt.h" +#include "pcap.h" +#include "log.h" +#include "vhost_user.h" +#include "udp_internal.h" +#include "flow.h" +#include "flow_table.h" +#include "udp_flow.h" +#include "udp_vu.h" +#include "vu_common.h" + +static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE]; +static struct vu_virtq_element elem [VIRTQUEUE_MAX_SIZE]; + +/** + * udp_vu_hdrlen() - return the size of the header in level 2 frame (UDP) + * @v6: Set for IPv6 packet + * + * Return: Return the size of the header + */ +static size_t udp_vu_hdrlen(bool v6) +{ + size_t hdrlen; + + hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) + + sizeof(struct ethhdr) + sizeof(struct udphdr); + + if (v6) + hdrlen += sizeof(struct ipv6hdr); + else + hdrlen += sizeof(struct iphdr); + + return hdrlen; +} + +/** + * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers + * @c: Execution context + * @s: Socket to receive from + * @v6: Set for IPv6 connections + * @dlen: Size of received data (output) + * + * Return: Number of iov entries used to store the datagram + */ +static int udp_vu_sock_recv(const struct ctx *c, int s, bool v6, ssize_t *dlen) +{ + struct vu_dev *vdev = 
c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + int iov_cnt, idx, iov_used; + struct msghdr msg = { 0 }; + size_t off, hdrlen; + + ASSERT(!c->no_udp); + + /* compute L2 header length */ + hdrlen = udp_vu_hdrlen(v6); + + vu_init_elem(elem, iov_vu, VIRTQUEUE_MAX_SIZE); + + iov_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, + IP_MAX_MTU + ETH_HLEN + + sizeof(struct virtio_net_hdr_mrg_rxbuf), + NULL); + if (iov_cnt == 0) + return 0; + + /* reserve space for the headers */ + ASSERT(iov_vu[0].iov_len >= hdrlen); + iov_vu[0].iov_base = (char *)iov_vu[0].iov_base + hdrlen; + iov_vu[0].iov_len -= hdrlen; + + /* read data from the socket */ + msg.msg_iov = iov_vu; + msg.msg_iovlen = iov_cnt; + + *dlen = recvmsg(s, &msg, 0); + if (*dlen < 0) { + vu_queue_rewind(vq, iov_cnt); + return 0; + } + + /* restore the pointer to the headers address */ + iov_vu[0].iov_base = (char *)iov_vu[0].iov_base - hdrlen; + iov_vu[0].iov_len += hdrlen; + + /* count the numbers of buffer filled by recvmsg() */ + idx = iov_skip_bytes(iov_vu, iov_cnt, *dlen + hdrlen, &off); + + /* adjust last iov length */ + if (idx < iov_cnt) + iov_vu[idx].iov_len = off; + iov_used = idx + !!off; + + vu_set_vnethdr(vdev, iov_vu[0].iov_base, iov_used); + + /* release unused buffers */ + vu_queue_rewind(vq, iov_cnt - iov_used); + + return iov_used; +} + +/** + * udp_vu_prepare() - Prepare the packet header + * @c: Execution context + * @toside: Address information for one side of the flow + * @dlen: Packet data length + * + * Return: Layer-4 length + */ +static size_t udp_vu_prepare(const struct ctx *c, + const struct flowside *toside, ssize_t dlen) +{ + struct ethhdr *eh; + size_t l4len; + + /* ethernet header */ + eh = vu_eth(iov_vu[0].iov_base); + + memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); + + /* initialize header */ + if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) { + struct iphdr *iph = 
vu_ip(iov_vu[0].iov_base); + struct udp_payload_t *bp = vu_payloadv4(iov_vu[0].iov_base); + + eh->h_proto = htons(ETH_P_IP); + + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr4(iph, bp, toside, dlen, true); + } else { + struct ipv6hdr *ip6h = vu_ip(iov_vu[0].iov_base); + struct udp_payload_t *bp = vu_payloadv6(iov_vu[0].iov_base); + + eh->h_proto = htons(ETH_P_IPV6); + + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr6(ip6h, bp, toside, dlen, true); + } + + return l4len; +} + +/** + * udp_vu_csum() - Calculate and set checksum for a UDP packet + * @toside: Address information for one side of the flow + * @iov_used: Number of used iov_vu items + */ +static void udp_vu_csum(const struct flowside *toside, int iov_used) +{ + const struct in_addr *src4 = inany_v4(&toside->oaddr); + const struct in_addr *dst4 = inany_v4(&toside->eaddr); + char *base = iov_vu[0].iov_base; + struct udp_payload_t *bp; + struct iov_tail data; + + if (src4 && dst4) { + bp = vu_payloadv4(base); + data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base); + csum_udp4(&bp->uh, *src4, *dst4, &data); + } else { + bp = vu_payloadv6(base); + data = IOV_TAIL(iov_vu, iov_used, (char *)&bp->data - base); + csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data); + } +} + +/** + * udp_vu_sock_to_tap() - Forward datagrams from socket to tap + * @c: Execution context + * @s: Socket to read data from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward data from @s to + */ +void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx) +{ + const struct flowside *toside = flowside_at_sidx(tosidx); + bool v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + int i; + + for (i = 0; i < n; i++) { + ssize_t dlen; + int iov_used; + + iov_used = udp_vu_sock_recv(c, s, v6, &dlen); + if (iov_used <= 
0) + break; + + udp_vu_prepare(c, toside, dlen); + if (*c->pcap) { + udp_vu_csum(toside, iov_used); + pcap_iov(iov_vu, iov_used, + sizeof(struct virtio_net_hdr_mrg_rxbuf)); + } + vu_flush(vdev, vq, elem, iov_used); + } +} diff --git a/udp_vu.h b/udp_vu.h new file mode 100644 index 0000000..576b0e7 --- /dev/null +++ b/udp_vu.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#ifndef UDP_VU_H +#define UDP_VU_H + +void udp_vu_listen_sock_data(const struct ctx *c, union epoll_ref ref, + const struct timespec *now); +void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx); + +#endif /* UDP_VU_H */ @@ -25,75 +25,72 @@ #include <time.h> #include <errno.h> #include <stdbool.h> +#include <linux/errqueue.h> +#include <getopt.h> +#include "linux_dep.h" #include "util.h" #include "iov.h" #include "passt.h" #include "packet.h" #include "log.h" +#ifdef HAS_GETRANDOM +#include <sys/random.h> +#endif /** - * sock_l4() - Create and bind socket for given L4, add to epoll list + * sock_l4_sa() - Create and bind socket to socket address, add to epoll list * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @proto: Protocol number - * @bind_addr: Address for binding, NULL for any + * @type: epoll type + * @sa: Socket address to bind to + * @sl: Length of @sa * @ifname: Interface for binding, NULL for any - * @port: Port, host order + * @v6only: Set IPV6_V6ONLY socket option * @data: epoll reference portion for protocol handlers * * Return: newly created socket, negative error code on failure */ -int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, - const void *bind_addr, const char *ifname, uint16_t port, - uint32_t data) +int sock_l4_sa(const struct ctx *c, enum epoll_type type, + const void *sa, socklen_t sl, + const char *ifname, bool v6only, uint32_t data) { - union epoll_ref ref = { .data = data }; - struct sockaddr_in addr4 = { - 
.sin_family = AF_INET, - .sin_port = htons(port), - { 0 }, { 0 }, - }; - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(port), - 0, IN6ADDR_ANY_INIT, 0, - }; - const struct sockaddr *sa; - bool dual_stack = false; - int fd, sl, y = 1, ret; + sa_family_t af = ((const struct sockaddr *)sa)->sa_family; + union epoll_ref ref = { .type = type, .data = data }; + bool freebind = false; struct epoll_event ev; - - switch (proto) { - case IPPROTO_TCP: - ref.type = EPOLL_TYPE_TCP_LISTEN; + int fd, y = 1, ret; + uint8_t proto; + int socktype; + + switch (type) { + case EPOLL_TYPE_TCP_LISTEN: + proto = IPPROTO_TCP; + socktype = SOCK_STREAM | SOCK_NONBLOCK; + freebind = c->freebind; break; - case IPPROTO_UDP: - ref.type = EPOLL_TYPE_UDP; + case EPOLL_TYPE_UDP_LISTEN: + freebind = c->freebind; + /* fallthrough */ + case EPOLL_TYPE_UDP: + proto = IPPROTO_UDP; + socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; - case IPPROTO_ICMP: - case IPPROTO_ICMPV6: - ref.type = EPOLL_TYPE_PING; + case EPOLL_TYPE_PING: + if (af == AF_INET) + proto = IPPROTO_ICMP; + else + proto = IPPROTO_ICMPV6; + socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; default: - return -EPFNOSUPPORT; /* Not implemented. 
*/ + ASSERT(0); } - if (af == AF_UNSPEC) { - if (!DUAL_STACK_SOCKETS || bind_addr) - return -EINVAL; - dual_stack = true; - af = AF_INET6; - } - - if (proto == IPPROTO_TCP) - fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto); - else - fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto); + fd = socket(af, socktype, proto); ret = -errno; if (fd < 0) { - warn("L4 socket: %s", strerror(-ret)); + warn("L4 socket: %s", strerror_(-ret)); return ret; } @@ -104,34 +101,25 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, ref.fd = fd; - if (af == AF_INET) { - if (bind_addr) - addr4.sin_addr = *(struct in_addr *)bind_addr; + if (v6only) + if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &y, sizeof(y))) + debug("Failed to set IPV6_V6ONLY on socket %i", fd); - sa = (const struct sockaddr *)&addr4; - sl = sizeof(addr4); - } else { - if (bind_addr) { - addr6.sin6_addr = *(struct in6_addr *)bind_addr; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y))) + debug("Failed to set SO_REUSEADDR on socket %i", fd); - if (!memcmp(bind_addr, &c->ip6.addr_ll, - sizeof(c->ip6.addr_ll))) - addr6.sin6_scope_id = c->ifi6; - } + if (proto == IPPROTO_UDP) { + int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO; + int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR; + int level = af == AF_INET ? 
IPPROTO_IP : IPPROTO_IPV6; - sa = (const struct sockaddr *)&addr6; - sl = sizeof(addr6); + if (setsockopt(fd, level, recverr, &y, sizeof(y))) + die_perror("Failed to set RECVERR on socket %i", fd); - if (!dual_stack) - if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, - &y, sizeof(y))) - debug("Failed to set IPV6_V6ONLY on socket %i", - fd); + if (setsockopt(fd, level, pktinfo, &y, sizeof(y))) + die_perror("Failed to set PKTINFO on socket %i", fd); } - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y))) - debug("Failed to set SO_REUSEADDR on socket %i", fd); - if (ifname && *ifname) { /* Supported since kernel version 5.7, commit c427bfec18f2 * ("net: core: enable SO_BINDTODEVICE for non-root users"). If @@ -140,30 +128,45 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, */ if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname))) { + char str[SOCKADDR_STRLEN]; + ret = -errno; - warn("Can't bind %s socket for port %u to %s, closing", - EPOLL_TYPE_STR(proto), port, ifname); + warn("Can't bind %s socket for %s to %s, closing", + EPOLL_TYPE_STR(proto), + sockaddr_ntop(sa, str, sizeof(str)), ifname); close(fd); return ret; } } + if (freebind) { + int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; + int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND; + + if (setsockopt(fd, level, opt, &y, sizeof(y))) { + err_perror("Failed to set %s on socket %i", + af == AF_INET ? "IP_FREEBIND" + : "IPV6_FREEBIND", + fd); + } + } + if (bind(fd, sa, sl) < 0) { /* We'll fail to bind to low ports if we don't have enough * capabilities, and we'll fail to bind on already bound ports, * this is fine. This might also fail for ICMP because of a * broken SELinux policy, see icmp_tap_handler(). 
*/ - if (proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6) { + if (type != EPOLL_TYPE_PING) { ret = -errno; close(fd); return ret; } } - if (proto == IPPROTO_TCP && listen(fd, 128) < 0) { + if (type == EPOLL_TYPE_TCP_LISTEN && listen(fd, 128) < 0) { ret = -errno; - warn("TCP socket listen: %s", strerror(-ret)); + warn("TCP socket listen: %s", strerror_(-ret)); close(fd); return ret; } @@ -172,7 +175,7 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, ev.data.u64 = ref.u64; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) { ret = -errno; - warn("L4 epoll_ctl: %s", strerror(-ret)); + warn("L4 epoll_ctl: %s", strerror_(-ret)); return ret; } @@ -180,6 +183,68 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, } /** + * sock_unix() - Create and bind AF_UNIX socket + * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix) + * + * Return: socket descriptor on success, won't return on failure + */ +int sock_unix(char *sock_path) +{ + int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + int i; + + if (fd < 0) + die_perror("Failed to open UNIX domain socket"); + + for (i = 1; i < UNIX_SOCK_MAX; i++) { + char *path = addr.sun_path; + int ex, ret; + + if (*sock_path) + memcpy(path, sock_path, UNIX_PATH_MAX); + else if (snprintf_check(path, UNIX_PATH_MAX - 1, + UNIX_SOCK_PATH, i)) + die_perror("Can't build UNIX domain socket path"); + + ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, + 0); + if (ex < 0) + die_perror("Failed to check for UNIX domain conflicts"); + + ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); + if (!ret || (errno != ENOENT && errno != ECONNREFUSED && + errno != EACCES)) { + if (*sock_path) + die("Socket path %s already in use", path); + + close(ex); + continue; + } + close(ex); + + unlink(path); + ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr)); + if (*sock_path && ret) + 
die_perror("Failed to bind UNIX domain socket"); + + if (!ret) + break; + } + + if (i == UNIX_SOCK_MAX) + die_perror("Failed to bind UNIX domain socket"); + + info("UNIX domain socket bound at %s", addr.sun_path); + if (!*sock_path) + memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX); + + return fd; +} + +/** * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed * @c: Execution context */ @@ -188,7 +253,8 @@ void sock_probe_mem(struct ctx *c) int v = INT_MAX / 2, s; socklen_t sl; - if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { c->low_wmem = c->low_rmem = 1; return; } @@ -208,23 +274,34 @@ void sock_probe_mem(struct ctx *c) close(s); } - /** - * timespec_diff_ms() - Report difference in milliseconds between two timestamps + * timespec_diff_us() - Report difference in microseconds between two timestamps * @a: Minuend timestamp * @b: Subtrahend timestamp * - * Return: difference in milliseconds + * Return: difference in microseconds (wraps after 2^63 / 10^6s ~= 292k years) */ -int timespec_diff_ms(const struct timespec *a, const struct timespec *b) +int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b) { if (a->tv_nsec < b->tv_nsec) { - return (b->tv_nsec - a->tv_nsec) / 1000000 + - (a->tv_sec - b->tv_sec - 1) * 1000; + return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 + + (a->tv_sec - b->tv_sec - 1) * 1000000; } - return (a->tv_nsec - b->tv_nsec) / 1000000 + - (a->tv_sec - b->tv_sec) * 1000; + return (a->tv_nsec - b->tv_nsec) / 1000 + + (a->tv_sec - b->tv_sec) * 1000000; +} + +/** + * timespec_diff_ms() - Report difference in milliseconds between two timestamps + * @a: Minuend timestamp + * @b: Subtrahend timestamp + * + * Return: difference in milliseconds + */ +long timespec_diff_ms(const struct timespec *a, const struct timespec *b) +{ + return timespec_diff_us(a, b) / 1000; } /** @@ -232,7 +309,7 @@ int timespec_diff_ms(const 
struct timespec *a, const struct timespec *b) * @map: Pointer to bitmap * @bit: Bit number to set */ -void bitmap_set(uint8_t *map, int bit) +void bitmap_set(uint8_t *map, unsigned bit) { unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit); @@ -244,7 +321,7 @@ void bitmap_set(uint8_t *map, int bit) * @map: Pointer to bitmap * @bit: Bit number to clear */ -void bitmap_clear(uint8_t *map, int bit) +void bitmap_clear(uint8_t *map, unsigned bit) { unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit); @@ -256,9 +333,9 @@ void bitmap_clear(uint8_t *map, int bit) * @map: Pointer to bitmap * @bit: Bit number to check * - * Return: one if given bit is set, zero if it's not + * Return: true if given bit is set, false if it's not */ -int bitmap_isset(const uint8_t *map, int bit) +bool bitmap_isset(const uint8_t *map, unsigned bit) { const unsigned long *word = (const unsigned long *)map + BITMAP_WORD(bit); @@ -287,7 +364,7 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b) dst[i] = a[i] | b[i]; } -/* +/** * ns_enter() - Enter configured user (unless already joined) and network ns * @c: Execution context * @@ -298,7 +375,7 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b) void ns_enter(const struct ctx *c) { if (setns(c->pasta_netns_fd, CLONE_NEWNET)) - die("setns() failed entering netns: %s", strerror(errno)); + die_perror("setns() failed entering netns"); } /** @@ -313,10 +390,8 @@ bool ns_is_init(void) bool ret = true; int fd; - if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) { - die("Can't determine if we're in init namespace: %s", - strerror(errno)); - } + if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) + die_perror("Can't determine if we're in init namespace"); if (read(fd, buf, sizeof(root_uid_map)) != sizeof(root_uid_map) - 1 || strncmp(buf, root_uid_map, sizeof(root_uid_map))) @@ -380,11 +455,11 @@ int open_in_ns(const struct ctx *c, const char *path, int 
flags) } /** - * pid_file() - Write PID to file, if requested to do so, and close it + * pidfile_write() - Write PID to file, if requested to do so, and close it * @fd: Open PID file descriptor, closed on exit, -1 to skip writing it * @pid: PID value to write */ -void write_pidfile(int fd, pid_t pid) +void pidfile_write(int fd, pid_t pid) { char pid_buf[12]; int n; @@ -396,18 +471,36 @@ void write_pidfile(int fd, pid_t pid) if (write(fd, pid_buf, n) < 0) { perror("PID file write"); - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); } close(fd); } /** + * output_file_open() - Open file for output, if needed + * @path: Path for output file + * @flags: Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC + * + * Return: file descriptor on success, -1 on failure with errno set by open() + */ +int output_file_open(const char *path, int flags) +{ + /* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for + * it in the 'mode' argument if we have one + */ + return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags, + /* NOLINTNEXTLINE(android-cloexec-open) */ + S_IRUSR | S_IWUSR); +} + +/** * __daemon() - daemon()-like function writing PID file before parent exits * @pidfile_fd: Open PID file descriptor * @devnull_fd: Open file descriptor for /dev/null * - * Return: child PID on success, won't return on failure + * Return: 0 in the child process on success. The parent process exits. + * Does not return in either process on failure (calls _exit). 
*/ int __daemon(int pidfile_fd, int devnull_fd) { @@ -415,25 +508,20 @@ int __daemon(int pidfile_fd, int devnull_fd) if (pid == -1) { perror("fork"); - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); } if (pid) { - write_pidfile(pidfile_fd, pid); - exit(EXIT_SUCCESS); + pidfile_write(pidfile_fd, pid); + _exit(EXIT_SUCCESS); } - errno = 0; - - setsid(); - - dup2(devnull_fd, STDIN_FILENO); - dup2(devnull_fd, STDOUT_FILENO); - dup2(devnull_fd, STDERR_FILENO); - close(devnull_fd); - - if (errno) - exit(EXIT_FAILURE); + if (setsid() < 0 || + dup2(devnull_fd, STDIN_FILENO) < 0 || + dup2(devnull_fd, STDOUT_FILENO) < 0 || + dup2(devnull_fd, STDERR_FILENO) < 0 || + close(devnull_fd)) + _exit(EXIT_FAILURE); return 0; } @@ -470,7 +558,7 @@ int write_file(const char *path, const char *buf) size_t len = strlen(buf); if (fd < 0) { - warn("Could not open %s: %s", path, strerror(errno)); + warn_perror("Could not open %s", path); return -1; } @@ -478,7 +566,7 @@ int write_file(const char *path, const char *buf) ssize_t rc = write(fd, buf, len); if (rc <= 0) { - warn("Couldn't write to %s: %s", path, strerror(errno)); + warn_perror("Couldn't write to %s", path); break; } @@ -520,7 +608,39 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, #endif } -/* write_remainder() - write the tail of an IO vector to an fd +/** + * write_all_buf() - write all of a buffer to an fd + * @fd: File descriptor + * @buf: Pointer to base of buffer + * @len: Length of buffer + * + * Return: 0 on success, -1 on error (with errno set) + * + * #syscalls write + */ +int write_all_buf(int fd, const void *buf, size_t len) +{ + const char *p = buf; + size_t left = len; + + while (left) { + ssize_t rc; + + do + rc = write(fd, p, left); + while ((rc < 0) && errno == EINTR); + + if (rc < 0) + return -1; + + p += rc; + left -= rc; + } + return 0; +} + +/** + * write_remainder() - write the tail of an IO vector to an fd * @fd: File descriptor * @iov: IO vector * @iovcnt: Number of entries 
in @iov @@ -528,28 +648,400 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, * * Return: 0 on success, -1 on error (with errno set) * - * #syscalls write writev + * #syscalls writev */ -int write_remainder(int fd, const struct iovec *iov, int iovcnt, size_t skip) +int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip) { - int i; - size_t offset; + size_t i = 0, offset; - while ((i = iov_skip_bytes(iov, iovcnt, skip, &offset)) < iovcnt) { + while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) { ssize_t rc; if (offset) { - rc = write(fd, (char *)iov[i].iov_base + offset, - iov[i].iov_len - offset); - } else { - rc = writev(fd, &iov[i], iovcnt - i); + /* Write the remainder of the partially written buffer */ + if (write_all_buf(fd, (char *)iov[i].iov_base + offset, + iov[i].iov_len - offset) < 0) + return -1; + i++; } + /* Write as much of the remaining whole buffers as we can */ + rc = writev(fd, &iov[i], iovcnt - i); if (rc < 0) return -1; - skip += rc; + skip = rc; } + return 0; +} +/** + * read_all_buf() - Fill a whole buffer from a file descriptor + * @fd: File descriptor + * @buf: Pointer to base of buffer + * @len: Length of buffer + * + * Return: 0 on success, -1 on error (with errno set) + * + * #syscalls read + */ +int read_all_buf(int fd, void *buf, size_t len) +{ + size_t left = len; + char *p = buf; + + while (left) { + ssize_t rc; + + ASSERT(left <= len); + + do + rc = read(fd, p, left); + while ((rc < 0) && errno == EINTR); + + if (rc < 0) + return -1; + + if (rc == 0) { + errno = ENODATA; + return -1; + } + + p += rc; + left -= rc; + } return 0; } + +/** + * read_remainder() - Read the tail of an IO vector from a file descriptor + * @fd: File descriptor + * @iov: IO vector + * @cnt: Number of entries in @iov + * @skip: Number of bytes of the vector to skip reading + * + * Return: 0 on success, -1 on error (with errno set) + * + * Note: mode-specific seccomp profiles need 
to enable readv() to use this. + */ +/* cppcheck-suppress unusedFunction */ +int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip) +{ + size_t i = 0, offset; + + while ((i += iov_skip_bytes(iov + i, cnt - i, skip, &offset)) < cnt) { + ssize_t rc; + + if (offset) { + ASSERT(offset < iov[i].iov_len); + /* Read the remainder of the partially read buffer */ + if (read_all_buf(fd, (char *)iov[i].iov_base + offset, + iov[i].iov_len - offset) < 0) + return -1; + i++; + } + + if (cnt == i) + break; + + /* Fill as many of the remaining buffers as we can */ + rc = readv(fd, &iov[i], cnt - i); + if (rc < 0) + return -1; + + if (rc == 0) { + errno = ENODATA; + return -1; + } + + skip = rc; + } + return 0; +} + +/** sockaddr_ntop() - Convert a socket address to text format + * @sa: Socket address + * @dst: output buffer, minimum SOCKADDR_STRLEN bytes + * @size: size of buffer at @dst + * + * Return: On success, a non-null pointer to @dst, NULL on failure + */ +const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size) +{ + sa_family_t family = ((const struct sockaddr *)sa)->sa_family; + socklen_t off = 0; + +#define IPRINTF(...) 
\ + do { \ + off += snprintf(dst + off, size - off, __VA_ARGS__); \ + if (off >= size) \ + return NULL; \ + } while (0) + +#define INTOP(af, addr) \ + do { \ + if (!inet_ntop((af), (addr), dst + off, size - off)) \ + return NULL; \ + off += strlen(dst + off); \ + } while (0) + + switch (family) { + case AF_UNSPEC: + IPRINTF("<unspecified>"); + break; + + case AF_INET: { + const struct sockaddr_in *sa4 = sa; + + INTOP(AF_INET, &sa4->sin_addr); + IPRINTF(":%hu", ntohs(sa4->sin_port)); + break; + } + + case AF_INET6: { + const struct sockaddr_in6 *sa6 = sa; + + IPRINTF("["); + INTOP(AF_INET6, &sa6->sin6_addr); + IPRINTF("]:%hu", ntohs(sa6->sin6_port)); + break; + } + + /* FIXME: Implement AF_UNIX */ + default: + errno = EAFNOSUPPORT; + return NULL; + } + +#undef IPRINTF +#undef INTOP + + return dst; +} + +/** eth_ntop() - Convert an Ethernet MAC address to text format + * @mac: MAC address + * @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes + * @size: Size of buffer at @dst + * + * Return: On success, a non-null pointer to @dst, NULL on failure + */ +const char *eth_ntop(const unsigned char *mac, char *dst, size_t size) +{ + int len; + + len = snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + if (len < 0 || (size_t)len >= size) + return NULL; + + return dst; +} + +/** str_ee_origin() - Convert socket extended error origin to a string + * @ee: Socket extended error structure + * + * Return: Static string describing error origin + */ +const char *str_ee_origin(const struct sock_extended_err *ee) +{ + const char *const desc[] = { + [SO_EE_ORIGIN_NONE] = "<no origin>", + [SO_EE_ORIGIN_LOCAL] = "Local", + [SO_EE_ORIGIN_ICMP] = "ICMP", + [SO_EE_ORIGIN_ICMP6] = "ICMPv6", + }; + + if (ee->ee_origin < ARRAY_SIZE(desc)) + return desc[ee->ee_origin]; + + return "<invalid>"; +} + +/** + * close_open_files() - Close leaked files, but not --fd, stdin, stdout, stderr + * @argc: Argument count + * @argv: Command line options, 
as we need to skip any file given via --fd + */ +void close_open_files(int argc, char **argv) +{ + const struct option optfd[] = { { "fd", required_argument, NULL, 'F' }, + { 0 }, + }; + long fd = -1; + int name, rc; + + do { + name = getopt_long(argc, argv, "-:F:", optfd, NULL); + + if (name == 'F') { + errno = 0; + fd = strtol(optarg, NULL, 0); + + if (errno || + (fd != STDIN_FILENO && fd <= STDERR_FILENO) || + fd > INT_MAX) + die("Invalid --fd: %s", optarg); + } + } while (name != -1); + + if (fd == -1) { + rc = close_range(STDERR_FILENO + 1, ~0U, CLOSE_RANGE_UNSHARE); + } else if (fd == STDERR_FILENO + 1) { /* Still a single range */ + rc = close_range(STDERR_FILENO + 2, ~0U, CLOSE_RANGE_UNSHARE); + } else { + rc = close_range(STDERR_FILENO + 1, fd - 1, + CLOSE_RANGE_UNSHARE); + if (!rc) + rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE); + } + + if (rc) { + if (errno == ENOSYS || errno == EINVAL) { + /* This probably means close_range() or the + * CLOSE_RANGE_UNSHARE flag is not supported by the + * kernel. Not much we can do here except carry on and + * hope for the best. + */ + warn( +"Can't use close_range() to ensure no files leaked by parent"); + } else { + die_perror("Failed to close files leaked by parent"); + } + } + +} + +/** + * snprintf_check() - snprintf() wrapper, checking for truncation and errors + * @str: Output buffer + * @size: Maximum size to write to @str + * @format: Message + * + * Return: false on success, true on truncation or error, sets errno on failure + */ +bool snprintf_check(char *str, size_t size, const char *format, ...) 
+{ + va_list ap; + int rc; + + va_start(ap, format); + rc = vsnprintf(str, size, format, ap); + va_end(ap); + + if (rc < 0) { + errno = EIO; + return true; + } + + if ((size_t)rc >= size) { + errno = ENOBUFS; + return true; + } + + return false; +} + +#define DEV_RANDOM "/dev/random" + +/** + * raw_random() - Get high quality random bytes + * @buf: Buffer to fill with random bytes + * @buflen: Number of bytes of random data to put in @buf + * + * Assumes that the random data is essential, and will die() if unable to obtain + * it. + */ +void raw_random(void *buf, size_t buflen) +{ + size_t random_read = 0; +#ifndef HAS_GETRANDOM + int fd = open(DEV_RANDOM, O_RDONLY); + + if (fd < 0) + die_perror("Couldn't open %s", DEV_RANDOM); +#endif + + while (random_read < buflen) { + ssize_t ret; + +#ifdef HAS_GETRANDOM + ret = getrandom((char *)buf + random_read, + buflen - random_read, GRND_RANDOM); +#else + ret = read(fd, (char *)buf + random_read, + buflen - random_read); +#endif + + if (ret == -1 && errno == EINTR) + continue; + + if (ret < 0) + die_perror("Error on random data source"); + + if (ret == 0) + break; + + random_read += ret; + } + +#ifndef HAS_GETRANDOM + close(fd); +#endif + + if (random_read < buflen) + die("Unexpected EOF on random data source"); +} + +/** + * epoll_del() - Remove a file descriptor from our passt epoll + * @c: Execution context + * @fd: File descriptor to remove + */ +void epoll_del(const struct ctx *c, int fd) +{ + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); + +} + +/** + * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1 + * @buf: Buffer to fill in with encoded domain name + * @domain_name: Input domain name string with terminator + * + * The buffer's 'buf' size has to be >= strlen(domain_name) + 2 + */ +void encode_domain_name(char *buf, const char *domain_name) +{ + size_t i; + char *p; + + buf[0] = strcspn(domain_name, "."); + p = buf + 1; + for (i = 0; domain_name[i]; i++) { + if
(domain_name[i] == '.') + p[i] = strcspn(domain_name + i + 1, "."); + else + p[i] = domain_name[i]; + } + p[i] = 0L; +} + +/** + * abort_with_msg() - Print error message and abort + * @fmt: Format string + * @...: Format parameters + */ +void abort_with_msg(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vlogmsg(true, false, LOG_CRIT, fmt, ap); + va_end(ap); + + /* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp, + * but that will still get the job done. + */ + abort(); +} @@ -9,8 +9,14 @@ #include <stdlib.h> #include <stdarg.h> #include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> #include <string.h> #include <signal.h> +#include <arpa/inet.h> +#include <unistd.h> +#include <sys/syscall.h> #include "log.h" @@ -25,11 +31,8 @@ #ifndef SECCOMP_RET_KILL_PROCESS #define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL #endif -#ifndef ETH_MAX_MTU -#define ETH_MAX_MTU USHRT_MAX -#endif -#ifndef ETH_MIN_MTU -#define ETH_MIN_MTU 68 +#ifndef IP_MAX_MTU +#define IP_MAX_MTU USHRT_MAX #endif #ifndef MIN @@ -58,17 +61,22 @@ #define STRINGIFY(x) #x #define STR(x) STRINGIFY(x) +void abort_with_msg(const char *fmt, ...) + __attribute__((format(printf, 1, 2), noreturn)); + +/* Some cppcheck versions get confused by aborts inside a loop, causing + * it to give false positive uninitialised variable warnings later in + * the function, because it doesn't realise the non-initialising path + * already exited. See https://trac.cppcheck.net/ticket/13227 + * + * Therefore, avoid using the usual do while wrapper we use to force the macro + * to act like a single statement requiring a ';'. + */ +#define ASSERT_WITH_MSG(expr, ...) \ + ((expr) ? 
(void)0 : abort_with_msg(__VA_ARGS__)) #define ASSERT(expr) \ - do { \ - if (!(expr)) { \ - err("ASSERTION FAILED in %s (%s:%d): %s", \ - __func__, __FILE__, __LINE__, STRINGIFY(expr)); \ - /* This may actually SIGSYS, due to seccomp, \ - * but that will still get the job done \ - */ \ - abort(); \ - } \ - } while (0) + ASSERT_WITH_MSG((expr), "ASSERTION FAILED in %s (%s:%d): %s", \ + __func__, __FILE__, __LINE__, STRINGIFY(expr)) #ifdef P_tmpdir #define TMPDIR P_tmpdir @@ -82,13 +90,13 @@ #define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0]))) +#define foreach(item, array) \ + for ((item) = (array); (item) - (array) < ARRAY_SIZE(array); (item)++) + #define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b)) #define FD_PROTO(x, proto) \ (IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x))) -#define PORT_EPHEMERAL_MIN ((1 << 15) + (1 << 14)) /* RFC 6335 */ -#define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN) - #define MAC_ZERO ((uint8_t [ETH_ALEN]){ 0 }) #define MAC_IS_ZERO(addr) (!memcmp((addr), MAC_ZERO, ETH_ALEN)) @@ -103,58 +111,159 @@ (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) #endif +#ifndef __bswap_constant_32 +#define __bswap_constant_32(x) \ + ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) +#endif + +#ifndef __bswap_constant_64 +#define __bswap_constant_64(x) \ + ((((x) & 0xff00000000000000ULL) >> 56) | \ + (((x) & 0x00ff000000000000ULL) >> 40) | \ + (((x) & 0x0000ff0000000000ULL) >> 24) | \ + (((x) & 0x000000ff00000000ULL) >> 8) | \ + (((x) & 0x00000000ff000000ULL) << 8) | \ + (((x) & 0x0000000000ff0000ULL) << 24) | \ + (((x) & 0x000000000000ff00ULL) << 40) | \ + (((x) & 0x00000000000000ffULL) << 56)) +#endif + #if __BYTE_ORDER == __BIG_ENDIAN #define htons_constant(x) (x) #define htonl_constant(x) (x) +#define htonll_constant(x) (x) +#define ntohs_constant(x) (x) +#define ntohl_constant(x) (x) +#define ntohll_constant(x) (x) #else #define htons_constant(x) 
(__bswap_constant_16(x)) #define htonl_constant(x) (__bswap_constant_32(x)) +#define htonll_constant(x) (__bswap_constant_64(x)) +#define ntohs_constant(x) (__bswap_constant_16(x)) +#define ntohl_constant(x) (__bswap_constant_32(x)) +#define ntohll_constant(x) (__bswap_constant_64(x)) #endif -#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) +#define ntohll(x) (be64toh((x))) +#define htonll(x) (htobe64((x))) + +/** + * ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address + * @p: Pointer to the BE value in memory + * + * Returns: Host-order value of 32-bit BE quantity at @p + */ +static inline uint32_t ntohl_unaligned(const void *p) +{ + uint32_t val; + + memcpy(&val, p, sizeof(val)); + return ntohl(val); +} + +static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } +#define smp_mb() do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0) +#define smp_mb_release() do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0) +#define smp_mb_acquire() do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0) + +#define smp_wmb() smp_mb_release() +#define smp_rmb() smp_mb_acquire() + +#define qatomic_or(ptr, n) \ + ((void) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST)) + +#define NS_FN_STACK_SIZE (1024 * 1024) /* 1MiB */ + int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); #define NS_CALL(fn, arg) \ do { \ - char ns_fn_stack[NS_FN_STACK_SIZE]; \ + char ns_fn_stack[NS_FN_STACK_SIZE] \ + __attribute__ ((aligned(__alignof__(max_align_t)))); \ \ do_clone((fn), ns_fn_stack, sizeof(ns_fn_stack), \ CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD,\ (void *)(arg)); \ } while (0) -#define RCVBUF_BIG (2UL * 1024 * 1024) -#define SNDBUF_BIG (4UL * 1024 * 1024) -#define SNDBUF_SMALL (128UL * 1024) +#define RCVBUF_BIG (2ULL * 1024 * 1024) +#define SNDBUF_BIG (4ULL * 1024 * 1024) +#define SNDBUF_SMALL (128ULL * 1024) #include <net/if.h> #include <limits.h> #include 
<stdint.h> +#include "epoll_type.h" #include "packet.h" struct ctx; -/* cppcheck-suppress funcArgNamesDifferent */ -__attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); } -int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, - const void *bind_addr, const char *ifname, uint16_t port, - uint32_t data); +int sock_l4_sa(const struct ctx *c, enum epoll_type type, + const void *sa, socklen_t sl, + const char *ifname, bool v6only, uint32_t data); +int sock_unix(char *sock_path); void sock_probe_mem(struct ctx *c); -int timespec_diff_ms(const struct timespec *a, const struct timespec *b); -void bitmap_set(uint8_t *map, int bit); -void bitmap_clear(uint8_t *map, int bit); -int bitmap_isset(const uint8_t *map, int bit); +long timespec_diff_ms(const struct timespec *a, const struct timespec *b); +int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b); +void bitmap_set(uint8_t *map, unsigned bit); +void bitmap_clear(uint8_t *map, unsigned bit); +bool bitmap_isset(const uint8_t *map, unsigned bit); void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b); char *line_read(char *buf, size_t len, int fd); void ns_enter(const struct ctx *c); bool ns_is_init(void); int open_in_ns(const struct ctx *c, const char *path, int flags); -void write_pidfile(int fd, pid_t pid); +int output_file_open(const char *path, int flags); +void pidfile_write(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); -int write_remainder(int fd, const struct iovec *iov, int iovcnt, size_t skip); +int write_all_buf(int fd, const void *buf, size_t len); +int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip); +int read_all_buf(int fd, void *buf, size_t len); +int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip); +void close_open_files(int argc, char **argv); +bool snprintf_check(char *str, size_t 
size, const char *format, ...); + +/** + * af_name() - Return name of an address family + * @af: Address/protocol family (AF_INET or AF_INET6) + * + * Returns: Name of the protocol family as a string + */ +static inline const char *af_name(sa_family_t af) +{ + switch (af) { + case AF_INET: + return "IPv4"; + case AF_INET6: + return "IPv6"; + default: + return "<unknown address family>"; + } +} + +#define UINT16_STRLEN (sizeof("65535")) + +/* inet address (- '\0') + port (u16) (- '\0') + ':' + '\0' */ +#define SOCKADDR_INET_STRLEN \ + (INET_ADDRSTRLEN-1 + UINT16_STRLEN-1 + sizeof(":")) + +/* inet6 address (- '\0') + port (u16) (- '\0') + '[' + ']' + ':' + '\0' */ +#define SOCKADDR_INET6_STRLEN \ + (INET6_ADDRSTRLEN-1 + UINT16_STRLEN-1 + sizeof("[]:")) + +#define SOCKADDR_STRLEN MAX(SOCKADDR_INET_STRLEN, SOCKADDR_INET6_STRLEN) + +#define ETH_ADDRSTRLEN (sizeof("00:11:22:33:44:55")) + +struct sock_extended_err; + +const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size); +const char *eth_ntop(const unsigned char *mac, char *dst, size_t size); +const char *str_ee_origin(const struct sock_extended_err *ee); /** * mod_sub() - Modular arithmetic subtraction @@ -184,6 +293,44 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m) return mod_sub(x, i, m) < mod_sub(j, i, m); } +/* FPRINTF() intentionally silences cert-err33-c clang-tidy warnings */ +#define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__) + +void raw_random(void *buf, size_t buflen); +void epoll_del(const struct ctx *c, int fd); + +/* + * Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror, + * strsignal cannot use buffer after dlmopen (bug 32026)"), strerror() needs + * getrandom(2) and brk(2) as it allocates memory for the locale-translated + * error description, but our seccomp profiles forbid both. + * + * Use the strerror_() wrapper instead, calling into strerrordesc_np() to get + * a static untranslated string. 
It's a GNU implementation, but also defined by + * bionic. + * + * If strerrordesc_np() is not defined (e.g. musl), call strerror(). C libraries + * not defining strerrordesc_np() are expected to provide strerror() + * implementations that are simple enough for us to call. + */ +__attribute__ ((weak)) const char *strerrordesc_np(int errnum); + +/** + * strerror_() - strerror() wrapper calling strerrordesc_np() if available + * @errnum: Error code + * + * Return: error description string + */ +static inline const char *strerror_(int errnum) +{ + if (strerrordesc_np) + return strerrordesc_np(errnum); + + return strerror(errnum); +} + +#define strerror(x) @ "Don't call strerror() directly, use strerror_() instead" + /* * Workarounds for https://github.com/llvm/llvm-project/issues/58992 * @@ -224,4 +371,17 @@ static inline int wrap_accept4(int sockfd, struct sockaddr *addr, #define accept4(s, addr, addrlen, flags) \ wrap_accept4((s), (addr), (addrlen), (flags)) +static inline int wrap_getsockname(int sockfd, struct sockaddr *addr, +/* cppcheck-suppress constParameterPointer */ + socklen_t *addrlen) +{ + sa_init(addr, addrlen); + return getsockname(sockfd, addr, addrlen); +} +#define getsockname(s, addr, addrlen) \ + wrap_getsockname((s), (addr), (addrlen)) + +#define PASST_MAXDNAME 254 /* 253 (RFC 1035) + 1 (the terminator) */ +void encode_domain_name(char *buf, const char *domain_name); + #endif /* UTIL_H */ diff --git a/vhost_user.c b/vhost_user.c new file mode 100644 index 0000000..e8377bb --- /dev/null +++ b/vhost_user.c @@ -0,0 +1,1213 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * vhost-user API, command management and virtio interface + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + * + * Some parts from QEMU subprojects/libvhost-user/libvhost-user.c + * licensed under the following terms: + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2016 Red Hat, Inc. 
+ * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Marc-André Lureau <mlureau@redhat.com> + * Victor Kaplansky <victork@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include <assert.h> +#include <stdbool.h> +#include <inttypes.h> +#include <time.h> +#include <net/ethernet.h> +#include <netinet/in.h> +#include <sys/epoll.h> +#include <sys/eventfd.h> +#include <sys/mman.h> +#include <linux/vhost_types.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "passt.h" +#include "tap.h" +#include "vhost_user.h" +#include "pcap.h" +#include "migrate.h" + +/* vhost-user version we are compatible with */ +#define VHOST_USER_VERSION 1 + +static struct vu_dev vdev_storage; + +/** + * vu_print_capabilities() - print vhost-user capabilities + * this is part of the vhost-user backend + * convention. 
+ */ +void vu_print_capabilities(void) +{ + info("{"); + info(" \"type\": \"net\""); + info("}"); + _exit(EXIT_SUCCESS); +} + +/** + * vu_request_to_string() - convert a vhost-user request number to its name + * @req: request number + * + * Return: the name of request number + */ +static const char *vu_request_to_string(unsigned int req) +{ + if (req < VHOST_USER_MAX) { +#define REQ(req) [req] = #req + static const char * const vu_request_str[VHOST_USER_MAX] = { + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_SET_OWNER), + REQ(VHOST_USER_RESET_OWNER), + REQ(VHOST_USER_SET_MEM_TABLE), + REQ(VHOST_USER_SET_LOG_BASE), + REQ(VHOST_USER_SET_LOG_FD), + REQ(VHOST_USER_SET_VRING_NUM), + REQ(VHOST_USER_SET_VRING_ADDR), + REQ(VHOST_USER_SET_VRING_BASE), + REQ(VHOST_USER_GET_VRING_BASE), + REQ(VHOST_USER_SET_VRING_KICK), + REQ(VHOST_USER_SET_VRING_CALL), + REQ(VHOST_USER_SET_VRING_ERR), + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), + REQ(VHOST_USER_GET_QUEUE_NUM), + REQ(VHOST_USER_SET_VRING_ENABLE), + REQ(VHOST_USER_SEND_RARP), + REQ(VHOST_USER_NET_SET_MTU), + REQ(VHOST_USER_SET_BACKEND_REQ_FD), + REQ(VHOST_USER_IOTLB_MSG), + REQ(VHOST_USER_SET_VRING_ENDIAN), + REQ(VHOST_USER_GET_CONFIG), + REQ(VHOST_USER_SET_CONFIG), + REQ(VHOST_USER_POSTCOPY_ADVISE), + REQ(VHOST_USER_POSTCOPY_LISTEN), + REQ(VHOST_USER_POSTCOPY_END), + REQ(VHOST_USER_GET_INFLIGHT_FD), + REQ(VHOST_USER_SET_INFLIGHT_FD), + REQ(VHOST_USER_GPU_SET_SOCKET), + REQ(VHOST_USER_VRING_KICK), + REQ(VHOST_USER_GET_MAX_MEM_SLOTS), + REQ(VHOST_USER_ADD_MEM_REG), + REQ(VHOST_USER_REM_MEM_REG), + REQ(VHOST_USER_SET_STATUS), + REQ(VHOST_USER_GET_STATUS), + REQ(VHOST_USER_GET_SHARED_OBJECT), + REQ(VHOST_USER_SET_DEVICE_STATE_FD), + REQ(VHOST_USER_CHECK_DEVICE_STATE), + }; +#undef REQ + return vu_request_str[req]; + } + + return "unknown"; +} + +/** + * qva_to_va() - Translate front-end (QEMU) virtual address to our virtual + * address 
+ * @dev: vhost-user device + * @qemu_addr: front-end userspace address + * + * Return: the memory address in our process virtual address space. + */ +static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr) +{ + unsigned int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + const struct vu_dev_region *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(uintptr_t)(qemu_addr - r->qva + + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +/** + * vmsg_close_fds() - Close all file descriptors of a given message + * @vmsg: vhost-user message with the list of the file descriptors + */ +static void vmsg_close_fds(const struct vhost_user_msg *vmsg) +{ + int i; + + for (i = 0; i < vmsg->fd_num; i++) + close(vmsg->fds[i]); +} + +/** + * vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags + * and fd_num + * @vmsg: vhost-user message + * @val: 64-bit value to reply + */ +static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val) +{ + vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */ + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->payload.u64 = val; + vmsg->fd_num = 0; +} + +/** + * vu_message_read_default() - Read incoming vhost-user message from the + * front-end + * @conn_fd: vhost-user command socket + * @vmsg: vhost-user message + * + * Return: 0 if recvmsg() has been interrupted or if there's no data to read, + * 1 if a message has been received + */ +static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * + sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + ssize_t ret, sz_payload; + 
struct cmsghdr *cmsg; + + ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + die_perror("vhost-user message receive (recvmsg)"); + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + size_t fd_size; + + ASSERT(cmsg->cmsg_len >= CMSG_LEN(0)); + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + ASSERT(fd_size <= sizeof(vmsg->fds)); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + sz_payload = vmsg->hdr.size; + if ((size_t)sz_payload > sizeof(vmsg->payload)) { + die("vhost-user message request too big: %d," + " size: vmsg->size: %zd, " + "while sizeof(vmsg->payload) = %zu", + vmsg->hdr.request, sz_payload, sizeof(vmsg->payload)); + } + + if (sz_payload) { + do + ret = recv(conn_fd, &vmsg->payload, sz_payload, 0); + while (ret < 0 && errno == EINTR); + + if (ret < 0) + die_perror("vhost-user message receive"); + + if (ret == 0) + die("EOF on vhost-user message receive"); + + if (ret < sz_payload) + die("Short-read on vhost-user message receive"); + } + + return 1; +} + +/** + * vu_message_write() - Send a message to the front-end + * @conn_fd: vhost-user command socket + * @vmsg: vhost-user message + * + * #syscalls:vu sendmsg + */ +static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE + vmsg->hdr.size, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + }; + int rc; + + ASSERT(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); + if (vmsg->fd_num > 0) { + size_t fdsize = vmsg->fd_num * sizeof(int); + struct cmsghdr *cmsg; + + msg.msg_controllen = CMSG_SPACE(fdsize); + cmsg = 
CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); + } + + do + rc = sendmsg(conn_fd, &msg, 0); + while (rc < 0 && errno == EINTR); + + if (rc < 0) + die_perror("vhost-user message send"); + + if ((uint32_t)rc < VHOST_USER_HDR_SIZE + vmsg->hdr.size) + die("EOF on vhost-user message send"); +} + +/** + * vu_send_reply() - Update message flags and send it to front-end + * @conn_fd: vhost-user command socket + * @vmsg: vhost-user message + */ +static void vu_send_reply(int conn_fd, struct vhost_user_msg *vmsg) +{ + vmsg->hdr.flags &= ~VHOST_USER_VERSION_MASK; + vmsg->hdr.flags |= VHOST_USER_VERSION; + vmsg->hdr.flags |= VHOST_USER_REPLY_MASK; + + vu_message_write(conn_fd, vmsg); +} + +/** + * vu_get_features_exec() - Provide back-end features bitmask to front-end + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as a reply is requested + */ +static bool vu_get_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + uint64_t features = + 1ULL << VIRTIO_F_VERSION_1 | + 1ULL << VIRTIO_NET_F_MRG_RXBUF | + 1ULL << VHOST_F_LOG_ALL | + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + (void)vdev; + + vmsg_set_reply_u64(vmsg, features); + + debug("Sending back to guest u64: 0x%016"PRIx64, vmsg->payload.u64); + + return true; +} + +/** + * vu_set_enable_all_rings() - Enable/disable all the virtqueues + * @vdev: vhost-user device + * @enable: New virtqueues state + */ +static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enable) +{ + uint16_t i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + vdev->vq[i].enable = enable; +} + +/** + * vu_set_features_exec() - Enable features of the back-end + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + debug("u64: 
0x%016"PRIx64, vmsg->payload.u64); + + vdev->features = vmsg->payload.u64; + /* We only support devices conforming to VIRTIO 1.0 or + * later + */ + if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1)) + die("virtio legacy devices aren't supported by passt"); + + if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) + vu_set_enable_all_rings(vdev, true); + + return false; +} + +/** + * vu_set_owner_exec() - Session start flag, do nothing in our case + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_owner_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + (void)vdev; + (void)vmsg; + + return false; +} + +/** + * map_ring() - Convert ring front-end (QEMU) addresses to our process + * virtual address space. + * @vdev: vhost-user device + * @vq: Virtqueue + * + * Return: True if ring cannot be mapped to our address space + */ +static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) +{ + vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(vdev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr); + + debug("Setting virtq addresses:"); + debug(" vring_desc at %p", (void *)vq->vring.desc); + debug(" vring_used at %p", (void *)vq->vring.used); + debug(" vring_avail at %p", (void *)vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + +/** + * vu_set_mem_table_exec() - Sets the memory map regions to be able to + * translate the vring addresses. 
+ * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + * + * #syscalls:vu mmap|mmap2 munmap + */ +static bool vu_set_mem_table_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + struct vhost_user_memory m = vmsg->payload.memory, *memory = &m; + unsigned int i; + + for (i = 0; i < vdev->nregions; i++) { + const struct vu_dev_region *r = &vdev->regions[i]; + + if (r->mmap_addr) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + munmap((void *)(uintptr_t)r->mmap_addr, + r->size + r->mmap_offset); + } + } + vdev->nregions = memory->nregions; + + debug("vhost-user nregions: %u", memory->nregions); + for (i = 0; i < vdev->nregions; i++) { + struct vhost_user_memory_region *msg_region = &memory->regions[i]; + struct vu_dev_region *dev_region = &vdev->regions[i]; + void *mmap_addr; + + debug("vhost-user region %d", i); + debug(" guest_phys_addr: 0x%016"PRIx64, + msg_region->guest_phys_addr); + debug(" memory_size: 0x%016"PRIx64, + msg_region->memory_size); + debug(" userspace_addr 0x%016"PRIx64, + msg_region->userspace_addr); + debug(" mmap_offset 0x%016"PRIx64, + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned. 
+ */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED | + MAP_NORESERVE, vmsg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) + die_perror("vhost-user region mmap error"); + + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + debug(" mmap_addr: 0x%016"PRIx64, + dev_region->mmap_addr); + + close(vmsg->fds[i]); + } + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + if (vdev->vq[i].vring.desc) { + if (map_ring(vdev, &vdev->vq[i])) + die("remapping queue %d during setmemtable", i); + } + } + + /* As vu_packet_check_range() has no access to the number of + * memory regions, mark the end of the array with mmap_addr = 0 + */ + ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1); + vdev->regions[vdev->nregions].mmap_addr = 0; + + tap_sock_update_pool(vdev->regions, 0); + + return false; +} + +/** + * vu_close_log() - Close the logging file descriptor + * @vdev: vhost-user device + */ +static void vu_close_log(struct vu_dev *vdev) +{ + if (vdev->log_table) { + if (munmap(vdev->log_table, vdev->log_size) != 0) + die_perror("close log munmap() error"); + vdev->log_table = NULL; + } + + if (vdev->log_call_fd != -1) { + close(vdev->log_call_fd); + vdev->log_call_fd = -1; + } +} + +/** + * vu_log_kick() - Inform the front-end that the log has been modified + * @vdev: vhost-user device + */ +static void vu_log_kick(const struct vu_dev *vdev) +{ + if (vdev->log_call_fd != -1) { + int rc; + + rc = eventfd_write(vdev->log_call_fd, 1); + if (rc == -1) + die_perror("vhost-user kick eventfd_write()"); + } +} + +/** + * vu_log_page() - Update logging table + * @log_table: Base address of the logging table + * @page: Page number that has been updated + */ +/* NOLINTNEXTLINE(readability-non-const-parameter) */ +static void vu_log_page(uint8_t *log_table, uint64_t page) +{ + qatomic_or(&log_table[page / 8], 1 << (page % 8)); +} + +/** + * vu_log_write() - Log memory write + * @vdev: vhost-user device + * @address: Memory 
address + * @length: Memory size + */ +void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length) +{ + uint64_t page; + + if (!vdev->log_table || !length || + !vu_has_feature(vdev, VHOST_F_LOG_ALL)) + return; + + page = address / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < address + length) { + vu_log_page(vdev->log_table, page); + page++; + } + vu_log_kick(vdev); +} + +/** + * vu_set_log_base_exec() - Set the memory log base + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as a reply is requested + * + * #syscalls:vu mmap|mmap2 munmap + */ +static bool vu_set_log_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + uint64_t log_mmap_size, log_mmap_offset; + void *base; + int fd; + + if (vmsg->fd_num != 1 || vmsg->hdr.size != sizeof(vmsg->payload.log)) + die("vhost-user: Invalid log_base message"); + + fd = vmsg->fds[0]; + log_mmap_offset = vmsg->payload.log.mmap_offset; + log_mmap_size = vmsg->payload.log.mmap_size; + + debug("vhost-user log mmap_offset: %"PRId64, log_mmap_offset); + debug("vhost-user log mmap_size: %"PRId64, log_mmap_size); + + base = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, + log_mmap_offset); + close(fd); + if (base == MAP_FAILED) + die("vhost-user log mmap error"); + + if (vdev->log_table) + munmap(vdev->log_table, vdev->log_size); + + vdev->log_table = base; + vdev->log_size = log_mmap_size; + + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->fd_num = 0; + + return true; +} + +/** + * vu_set_log_fd_exec() - Set the eventfd used to report logging update + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_log_fd_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + if (vmsg->fd_num != 1) + die("Invalid log_fd message"); + + if (vdev->log_call_fd != -1) + close(vdev->log_call_fd); + + vdev->log_call_fd = vmsg->fds[0]; + + debug("Got log_call_fd: %d", 
vdev->log_call_fd); + + return false; +} + +/** + * vu_set_vring_num_exec() - Set the size of the queue (vring size) + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_num_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + unsigned int idx = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + trace("State.index: %u", idx); + trace("State.num: %u", num); + vdev->vq[idx].vring.num = num; + + return false; +} + +/** + * vu_set_vring_addr_exec() - Set the addresses of the vring + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_addr_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + /* We need to copy the payload to vhost_vring_addr structure + * to access index because address of vmsg->payload.addr + * can be unaligned as it is packed. + */ + struct vhost_vring_addr addr = vmsg->payload.addr; + struct vu_virtq *vq = &vdev->vq[addr.index]; + + debug("vhost_vring_addr:"); + debug(" index: %d", addr.index); + debug(" flags: %d", addr.flags); + debug(" desc_user_addr: 0x%016" PRIx64, + (uint64_t)addr.desc_user_addr); + debug(" used_user_addr: 0x%016" PRIx64, + (uint64_t)addr.used_user_addr); + debug(" avail_user_addr: 0x%016" PRIx64, + (uint64_t)addr.avail_user_addr); + debug(" log_guest_addr: 0x%016" PRIx64, + (uint64_t)addr.log_guest_addr); + + vq->vra = vmsg->payload.addr; + vq->vring.flags = addr.flags; + vq->vring.log_guest_addr = addr.log_guest_addr; + + if (map_ring(vdev, vq)) + die("Invalid vring_addr message"); + + vq->used_idx = le16toh(vq->vring.used->idx); + + if (vq->last_avail_idx != vq->used_idx) { + debug("Last avail index != used index: %u != %u", + vq->last_avail_idx, vq->used_idx); + } + + return false; +} +/** + * vu_set_vring_base_exec() - Sets the next index to use for descriptors + * in this vring + * @vdev: vhost-user device + * 
@vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + unsigned int idx = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].shadow_avail_idx = vdev->vq[idx].last_avail_idx = num; + + return false; +} + +/** + * vu_get_vring_base_exec() - Stops the vring and returns the current + * descriptor index or indices + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as a reply is requested + */ +static bool vu_get_vring_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + unsigned int idx = vmsg->payload.state.index; + + debug("State.index: %u", idx); + vmsg->payload.state.num = vdev->vq[idx].last_avail_idx; + vmsg->hdr.size = sizeof(vmsg->payload.state); + + vdev->vq[idx].started = false; + vdev->vq[idx].vring.avail = 0; + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + if (vdev->vq[idx].kick_fd != -1) { + epoll_del(vdev->context, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + return true; +} + +/** + * vu_set_watch() - Add a file descriptor to the passt epoll file descriptor + * @vdev: vhost-user device + * @idx: queue index of the file descriptor to add + */ +static void vu_set_watch(const struct vu_dev *vdev, int idx) +{ + union epoll_ref ref = { + .type = EPOLL_TYPE_VHOST_KICK, + .fd = vdev->vq[idx].kick_fd, + .queue = idx + }; + struct epoll_event ev = { 0 }; + + ev.data.u64 = ref.u64; + ev.events = EPOLLIN; + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); +} + +/** + * vu_check_queue_msg_file() - Check if a message is valid, + * close fds if NOFD bit is set + * @vmsg: vhost-user message + */ +static void vu_check_queue_msg_file(struct vhost_user_msg *vmsg) +{ + bool nofd = 
vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + if (idx >= VHOST_USER_MAX_QUEUES) + die("Invalid vhost-user queue index: %u", idx); + + if (nofd) { + vmsg_close_fds(vmsg); + return; + } + + if (vmsg->fd_num != 1) + die("Invalid fds in vhost-user request: %d", vmsg->hdr.request); +} + +/** + * vu_set_vring_kick_exec() - Set the event file descriptor for adding buffers + * to the vring + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_kick_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); + + vu_check_queue_msg_file(vmsg); + + if (vdev->vq[idx].kick_fd != -1) { + epoll_del(vdev->context, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + if (!nofd) + vdev->vq[idx].kick_fd = vmsg->fds[0]; + + debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx); + + vdev->vq[idx].started = true; + + if (vdev->vq[idx].kick_fd != -1 && VHOST_USER_IS_QUEUE_TX(idx)) { + vu_set_watch(vdev, idx); + debug("Waiting for kicks on fd: %d for vq: %d", + vdev->vq[idx].kick_fd, idx); + } + + return false; +} + +/** + * vu_set_vring_call_exec() - Set the event file descriptor to signal when + * buffers are used + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_call_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); + + vu_check_queue_msg_file(vmsg); + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + + 
if (!nofd) + vdev->vq[idx].call_fd = vmsg->fds[0]; + + /* in case of I/O hang after reconnecting */ + if (vdev->vq[idx].call_fd != -1) + eventfd_write(vmsg->fds[0], 1); + + debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx); + + return false; +} + +/** + * vu_set_vring_err_exec() - Set the event file descriptor to signal when + * error occurs + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_err_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + int idx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + debug("u64: 0x%016"PRIx64, vmsg->payload.u64); + + vu_check_queue_msg_file(vmsg); + + if (vdev->vq[idx].err_fd != -1) { + close(vdev->vq[idx].err_fd); + vdev->vq[idx].err_fd = -1; + } + + if (!nofd) + vdev->vq[idx].err_fd = vmsg->fds[0]; + + return false; +} + +/** + * vu_get_protocol_features_exec() - Provide the protocol (vhost-user) features + * to the front-end + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as a reply is requested + */ +static bool vu_get_protocol_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | + 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | + 1ULL << VHOST_USER_PROTOCOL_F_DEVICE_STATE | + 1ULL << VHOST_USER_PROTOCOL_F_RARP; + + (void)vdev; + vmsg_set_reply_u64(vmsg, features); + + return true; +} + +/** + * vu_set_protocol_features_exec() - Enable protocol (vhost-user) features + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_protocol_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + uint64_t features = vmsg->payload.u64; + + debug("u64: 0x%016"PRIx64, features); + + vdev->protocol_features = vmsg->payload.u64; + + return false; +} + +/** + * 
vu_get_queue_num_exec() - Tell how many queues we support + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as a reply is requested + */ +static bool vu_get_queue_num_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + (void)vdev; + + vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_QUEUES); + + return true; +} + +/** + * vu_set_vring_enable_exec() - Enable or disable corresponding vring + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_set_vring_enable_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + unsigned int enable = vmsg->payload.state.num; + unsigned int idx = vmsg->payload.state.index; + + debug("State.index: %u", idx); + debug("State.enable: %u", enable); + + if (idx >= VHOST_USER_MAX_QUEUES) + die("Invalid vring_enable index: %u", idx); + + vdev->vq[idx].enable = enable; + return false; +} + +/** + * vu_send_rarp_exec() - vhost-user specification says: "Broadcast a fake + * RARP to notify the migration is terminated", + * but passt doesn't need to update any ARP table, + * so do nothing to silence QEMU bogus error message + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: False as no reply is requested + */ +static bool vu_send_rarp_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + char macstr[ETH_ADDRSTRLEN]; + + (void)vdev; + + /* ignore the command */ + + debug("Ignore command VHOST_USER_SEND_RARP for %s", + eth_ntop((unsigned char *)&vmsg->payload.u64, macstr, + sizeof(macstr))); + + return false; +} + +/** + * vu_set_device_state_fd_exec() - Set the device state migration channel + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as the reply contains 0 to indicate success + * and set bit 8 as we don't provide our own fd. 
+ */ +static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + unsigned int direction = vmsg->payload.transfer_state.direction; + unsigned int phase = vmsg->payload.transfer_state.phase; + + if (vmsg->fd_num != 1) + die("Invalid device_state_fd message"); + + if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED) + die("Invalid device_state_fd phase: %d", phase); + + if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE && + direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD) + die("Invalid device_state_fd direction: %d", direction); + + migrate_request(vdev->context, vmsg->fds[0], + direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD); + + /* We don't provide a new fd for the data transfer */ + vmsg_set_reply_u64(vmsg, VHOST_USER_VRING_NOFD_MASK); + + return true; +} + +/** + * vu_check_device_state_exec() - Return device state migration result + * @vdev: vhost-user device + * @vmsg: vhost-user message + * + * Return: True as the reply contains the migration result + */ +/* cppcheck-suppress constParameterCallback */ +static bool vu_check_device_state_exec(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) +{ + vmsg_set_reply_u64(vmsg, vdev->context->device_state_result); + + return true; +} + +/** + * vu_init() - Initialize vhost-user device structure + * @c: execution context + */ +void vu_init(struct ctx *c) +{ + int i; + + c->vdev = &vdev_storage; + c->vdev->context = c; + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + c->vdev->vq[i] = (struct vu_virtq){ + .call_fd = -1, + .kick_fd = -1, + .err_fd = -1, + .notification = true, + }; + } + c->vdev->log_table = NULL; + c->vdev->log_call_fd = -1; + + migrate_init(c); +} + + +/** + * vu_cleanup() - Reset vhost-user device + * @vdev: vhost-user device + */ +void vu_cleanup(struct vu_dev *vdev) +{ + unsigned int i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + struct vu_virtq *vq = &vdev->vq[i]; + + vq->started = false; + vq->notification = true; + + if 
(vq->call_fd != -1) { + close(vq->call_fd); + vq->call_fd = -1; + } + if (vq->err_fd != -1) { + close(vq->err_fd); + vq->err_fd = -1; + } + if (vq->kick_fd != -1) { + epoll_del(vdev->context, vq->kick_fd); + close(vq->kick_fd); + vq->kick_fd = -1; + } + + vq->vring.desc = 0; + vq->vring.used = 0; + vq->vring.avail = 0; + } + + for (i = 0; i < vdev->nregions; i++) { + const struct vu_dev_region *r = &vdev->regions[i]; + + if (r->mmap_addr) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + munmap((void *)(uintptr_t)r->mmap_addr, + r->size + r->mmap_offset); + } + } + vdev->nregions = 0; + + vu_close_log(vdev); + + /* If we lose the VU dev, we also lose our migration channel */ + migrate_close(vdev->context); +} + +/** + * vu_sock_reset() - Reset connection socket + * @vdev: vhost-user device + */ +static void vu_sock_reset(struct vu_dev *vdev) +{ + tap_sock_reset(vdev->context); +} + +static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, + struct vhost_user_msg *vmsg) = { + [VHOST_USER_GET_FEATURES] = vu_get_features_exec, + [VHOST_USER_SET_FEATURES] = vu_set_features_exec, + [VHOST_USER_GET_PROTOCOL_FEATURES] = vu_get_protocol_features_exec, + [VHOST_USER_SET_PROTOCOL_FEATURES] = vu_set_protocol_features_exec, + [VHOST_USER_GET_QUEUE_NUM] = vu_get_queue_num_exec, + [VHOST_USER_SET_OWNER] = vu_set_owner_exec, + [VHOST_USER_SET_MEM_TABLE] = vu_set_mem_table_exec, + [VHOST_USER_SET_LOG_BASE] = vu_set_log_base_exec, + [VHOST_USER_SET_LOG_FD] = vu_set_log_fd_exec, + [VHOST_USER_SET_VRING_NUM] = vu_set_vring_num_exec, + [VHOST_USER_SET_VRING_ADDR] = vu_set_vring_addr_exec, + [VHOST_USER_SET_VRING_BASE] = vu_set_vring_base_exec, + [VHOST_USER_GET_VRING_BASE] = vu_get_vring_base_exec, + [VHOST_USER_SET_VRING_KICK] = vu_set_vring_kick_exec, + [VHOST_USER_SET_VRING_CALL] = vu_set_vring_call_exec, + [VHOST_USER_SET_VRING_ERR] = vu_set_vring_err_exec, + [VHOST_USER_SET_VRING_ENABLE] = vu_set_vring_enable_exec, + [VHOST_USER_SEND_RARP] = vu_send_rarp_exec, + 
[VHOST_USER_SET_DEVICE_STATE_FD] = vu_set_device_state_fd_exec, + [VHOST_USER_CHECK_DEVICE_STATE] = vu_check_device_state_exec, +}; + +/** + * vu_control_handler() - Handle control commands for vhost-user + * @vdev: vhost-user device + * @fd: vhost-user message socket + * @events: epoll events + */ +void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) +{ + struct vhost_user_msg vmsg = { 0 }; + bool need_reply, reply_requested; + int ret; + + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { + vu_sock_reset(vdev); + return; + } + + ret = vu_message_read_default(fd, &vmsg); + if (ret == 0) { + vu_sock_reset(vdev); + return; + } + debug("================ Vhost user message ================"); + debug("Request: %s (%d)", vu_request_to_string(vmsg.hdr.request), + vmsg.hdr.request); + debug("Flags: 0x%x", vmsg.hdr.flags); + debug("Size: %u", vmsg.hdr.size); + + need_reply = vmsg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; + + if (vmsg.hdr.request >= 0 && vmsg.hdr.request < VHOST_USER_MAX && + vu_handle[vmsg.hdr.request]) + reply_requested = vu_handle[vmsg.hdr.request](vdev, &vmsg); + else + die("Unhandled request: %d", vmsg.hdr.request); + + /* cppcheck-suppress legacyUninitvar */ + if (!reply_requested && need_reply) { + vmsg.payload.u64 = 0; + vmsg.hdr.flags = 0; + vmsg.hdr.size = sizeof(vmsg.payload.u64); + vmsg.fd_num = 0; + reply_requested = true; + } + + if (reply_requested) + vu_send_reply(fd, &vmsg); + + if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE && + vdev->context->device_state_result == 0 && + !vdev->context->migrate_target) { + info("Migration complete, exiting"); + _exit(EXIT_SUCCESS); + } +} diff --git a/vhost_user.h b/vhost_user.h new file mode 100644 index 0000000..f2ae2da --- /dev/null +++ b/vhost_user.h @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * vhost-user API, command management and virtio interface + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +/* some parts from 
subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VHOST_USER_H +#define VHOST_USER_H + +#include "virtio.h" +#include "iov.h" + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 +#define VHOST_LOG_PAGE 4096 + +#define VHOST_MEMORY_BASELINE_NREGIONS 8 + +/** + * enum vhost_user_protocol_feature - List of available vhost-user features + */ +enum vhost_user_protocol_feature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, + VHOST_USER_PROTOCOL_F_NET_MTU = 4, + VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, + VHOST_USER_PROTOCOL_F_CONFIG = 9, + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + VHOST_USER_PROTOCOL_F_STATUS = 16, + /* Feature 17 reserved for VHOST_USER_PROTOCOL_F_XEN_MMAP. 
*/ + VHOST_USER_PROTOCOL_F_SHARED_OBJECT = 18, + VHOST_USER_PROTOCOL_F_DEVICE_STATE = 19, + + VHOST_USER_PROTOCOL_F_MAX +}; + +/** + * enum vhost_user_request - List of available vhost-user requests + */ +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_BACKEND_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_SET_VRING_ENDIAN = 23, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_CREATE_CRYPTO_SESSION = 26, + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, + VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_VRING_KICK = 35, + VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, + VHOST_USER_SET_STATUS = 39, + VHOST_USER_GET_STATUS = 40, + VHOST_USER_GET_SHARED_OBJECT = 41, + VHOST_USER_SET_DEVICE_STATE_FD = 42, + VHOST_USER_CHECK_DEVICE_STATE = 43, + VHOST_USER_MAX +}; + +/** + * struct vhost_user_header - vhost-user message header + * @request: Request type of the message + * @flags: Request flags + * @size: The following payload size + */ +struct vhost_user_header { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 
<< 2) +#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3) + uint32_t flags; + uint32_t size; +} __attribute__ ((__packed__)); + +/** + * struct vhost_user_memory_region - Front-end shared memory region information + * @guest_phys_addr: Guest physical address of the region + * @memory_size: Memory size + * @userspace_addr: front-end (QEMU) userspace address + * @mmap_offset: region offset in the shared memory area + */ +struct vhost_user_memory_region { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +}; + +/** + * struct vhost_user_memory - List of all the shared memory regions + * @nregions: Number of memory regions + * @padding: Padding + * @regions: Memory regions list + */ +struct vhost_user_memory { + uint32_t nregions; + uint32_t padding; + struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS]; +}; + +/** + * struct vhost_user_log - Address and size of the shared memory region used + * to log page update + * @mmap_size: Size of the shared memory region + * @mmap_offset: Offset of the shared memory region + */ +struct vhost_user_log { + uint64_t mmap_size; + uint64_t mmap_offset; +}; + +/** + * struct vhost_user_transfer_device_state - Set the direction and phase + * of the backend device state fd + * @direction: Device state transfer direction (save or load) + * @phase: Migration phase (only stopped is supported) + */ +struct vhost_user_transfer_device_state { + uint32_t direction; +#define VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE 0 +#define VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD 1 + uint32_t phase; +#define VHOST_USER_TRANSFER_STATE_PHASE_STOPPED 0 +}; + +/** + * union vhost_user_payload - vhost-user message payload + * @u64: 64-bit payload + * @state: vring state payload + * @addr: vring addresses payload + * @memory: Memory regions information payload + * @log: Memory logging payload + * @transfer_state: Device state payload + */ +union vhost_user_payload { +#define 
VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_user_memory memory; + struct vhost_user_log log; + struct vhost_user_transfer_device_state transfer_state; +}; + +/** + * struct vhost_user_msg - vhost-user message + * @hdr: Message header + * @payload: Message payload + * @fds: File descriptors associated with the message + * in the ancillary data. + * (shared memory or event file descriptors) + * @fd_num: Number of file descriptors + */ +struct vhost_user_msg { + struct vhost_user_header hdr; + union vhost_user_payload payload; + + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; + int fd_num; +} __attribute__ ((__packed__)); +#define VHOST_USER_HDR_SIZE sizeof(struct vhost_user_header) + +/* index of the RX virtqueue */ +#define VHOST_USER_RX_QUEUE 0 +/* index of the TX virtqueue */ +#define VHOST_USER_TX_QUEUE 1 + +/* in case of multiqueue, the RX and TX queues are interleaved */ +#define VHOST_USER_IS_QUEUE_TX(n) (n % 2) +#define VHOST_USER_IS_QUEUE_RX(n) (!(n % 2)) + +/* Default virtio-net header for passt */ +#define VU_HEADER ((struct virtio_net_hdr){ \ + .flags = VIRTIO_NET_HDR_F_DATA_VALID, \ + .gso_type = VIRTIO_NET_HDR_GSO_NONE, \ +}) + +/** + * vu_queue_enabled - Return state of a virtqueue + * @vq: virtqueue to check + * + * Return: true if the virqueue is enabled, false otherwise + */ +static inline bool vu_queue_enabled(const struct vu_virtq *vq) +{ + return vq->enable; +} + +/** + * vu_queue_started - Return state of a virtqueue + * @vq: virtqueue to check + * + * Return: true if the virqueue is started, false otherwise + */ +static inline bool vu_queue_started(const struct vu_virtq *vq) +{ + return vq->started; +} + +void vu_print_capabilities(void); +void vu_init(struct ctx *c); +void vu_cleanup(struct vu_dev *vdev); +void vu_log_write(const struct vu_dev *vdev, uint64_t address, + uint64_t length); +void vu_control_handler(struct 
vu_dev *vdev, int fd, uint32_t events); +#endif /* VHOST_USER_H */ diff --git a/virtio.c b/virtio.c new file mode 100644 index 0000000..83906aa --- /dev/null +++ b/virtio.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0-or-later AND BSD-3-Clause +/* + * virtio API, vring and virtqueue functions definition + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +/* Some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c + * originally licensed under the following terms: + * + * -- + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2016 Red Hat, Inc. + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Marc-André Lureau <mlureau@redhat.com> + * Victor Kaplansky <victork@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + * Some parts copied from QEMU hw/virtio/virtio.c + * licensed under the following terms: + * + * Copyright IBM, Corp. 2007 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * -- + * + * virtq_used_event() and virtq_avail_event() from + * https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-712000A + * licensed under the following terms: + * + * -- + * + * This header is BSD licensed so anyone can use the definitions + * to implement compatible drivers/servers. + * + * Copyright 2007, 2009, IBM Corporation + * Copyright 2011, Red Hat, Inc + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ‘‘AS IS’’ AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stddef.h> +#include <endian.h> +#include <string.h> +#include <errno.h> +#include <sys/eventfd.h> +#include <sys/socket.h> + +#include "util.h" +#include "virtio.h" +#include "vhost_user.h" + +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * vu_gpa_to_va() - Translate guest physical address to our virtual address. + * @dev: Vhost-user device + * @plen: Physical length to map (input), capped to region (output) + * @guest_addr: Guest physical address + * + * Return: virtual address in our address space of the guest physical address + */ +static void *vu_gpa_to_va(const struct vu_dev *dev, uint64_t *plen, + uint64_t guest_addr) +{ + unsigned int i; + + if (*plen == 0) + return NULL; + + /* Find matching memory region. 
*/ + for (i = 0; i < dev->nregions; i++) { + const struct vu_dev_region *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && + (guest_addr < (r->gpa + r->size))) { + if ((guest_addr + *plen) > (r->gpa + r->size)) + *plen = r->gpa + r->size - guest_addr; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(uintptr_t)(guest_addr - r->gpa + + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +/** + * vring_avail_flags() - Read the available ring flags + * @vq: Virtqueue + * + * Return: the available ring descriptor flags of the given virtqueue + */ +static inline uint16_t vring_avail_flags(const struct vu_virtq *vq) +{ + return le16toh(vq->vring.avail->flags); +} + +/** + * vring_avail_idx() - Read the available ring index + * @vq: Virtqueue + * + * Return: the available ring index of the given virtqueue + */ +static inline uint16_t vring_avail_idx(struct vu_virtq *vq) +{ + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); + + return vq->shadow_avail_idx; +} + +/** + * vring_avail_ring() - Read an available ring entry + * @vq: Virtqueue + * @i: Index of the entry to read + * + * Return: the ring entry content (head of the descriptor chain) + */ +static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i) +{ + return le16toh(vq->vring.avail->ring[i]); +} + +/** + * virtq_used_event() - Get location of used event indices + * (only with VIRTIO_F_EVENT_IDX) + * @vq: Virtqueue + * + * Return: return the location of the used event index + */ +static inline uint16_t *virtq_used_event(const struct vu_virtq *vq) +{ + /* For backwards compat, used event index is at *end* of avail ring. 
/**
 * vring_get_used_event() - Read the used event value from the available ring
 * @vq:		Virtqueue
 *
 * The value is only meaningful if VIRTIO_RING_F_EVENT_IDX is set: with it,
 * the driver specifies how far the device can progress before a notification
 * is required.
 *
 * Return: the used event, converted from little-endian to host byte order
 */
static inline uint16_t vring_get_used_event(const struct vu_virtq *vq)
{
	const uint16_t *used_event = virtq_used_event(vq);

	return le16toh(*used_event);
}
*/ + if (*head >= vq->vring.num) + die("vhost-user: Guest says index %u is available", *head); +} + +/** + * virtqueue_read_indirect_desc() - Copy virtio ring descriptors from guest + * memory + * @dev: Vhost-user device + * @desc: Destination address to copy the descriptors to + * @addr: Guest memory address to copy from + * @len: Length of memory to copy + * + * Return: -1 if there is an error, 0 otherwise + */ +static int virtqueue_read_indirect_desc(const struct vu_dev *dev, + struct vring_desc *desc, + uint64_t addr, size_t len) +{ + uint64_t read_len; + + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) + return -1; + + if (len == 0) + return -1; + + while (len) { + const struct vring_desc *orig_desc; + + read_len = len; + orig_desc = vu_gpa_to_va(dev, &read_len, addr); + if (!orig_desc) + return -1; + + memcpy(desc, orig_desc, read_len); + len -= read_len; + addr += read_len; + /* NOLINTNEXTLINE(bugprone-sizeof-expression,cert-arr39-c) */ + desc += read_len / sizeof(struct vring_desc); + } + + return 0; +} + +/** + * enum virtqueue_read_desc_state - State in the descriptor chain + * @VIRTQUEUE_READ_DESC_ERROR: Found an invalid descriptor + * @VIRTQUEUE_READ_DESC_DONE: No more descriptors in the chain + * @VIRTQUEUE_READ_DESC_MORE: there are more descriptors in the chain + */ +enum virtqueue_read_desc_state { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +/** + * virtqueue_read_next_desc() - Read the the next descriptor in the chain + * @desc: Virtio ring descriptors + * @i: Index of the current descriptor + * @max: Maximum value of the descriptor index + * @next: Index of the next descriptor in the chain (output value) + * + * Return: current chain descriptor state (error, next, done) + */ +static int virtqueue_read_next_desc(const struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't 
chain, we're done. */ + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) + return VIRTQUEUE_READ_DESC_DONE; + + /* Check they're not leading us off end of descriptors. */ + *next = le16toh(desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) + return VIRTQUEUE_READ_DESC_ERROR; + + return VIRTQUEUE_READ_DESC_MORE; +} + +/** + * vu_queue_empty() - Check if virtqueue is empty + * @vq: Virtqueue + * + * Return: true if the virtqueue is empty, false otherwise + */ +static bool vu_queue_empty(struct vu_virtq *vq) +{ + if (!vq->vring.avail) + return true; + + if (vq->shadow_avail_idx != vq->last_avail_idx) + return false; + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +/** + * vring_can_notify() - Check if a notification can be sent + * @dev: Vhost-user device + * @vq: Virtqueue + * + * Return: true if notification can be sent + */ +static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. 
*/ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(vq)) + return true; + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +/** + * vu_queue_notify() - Send a notification to the given virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + */ +void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + if (!vq->vring.avail) + return; + + if (!vring_can_notify(dev, vq)) { + debug("vhost-user: virtqueue can skip notify..."); + return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) + die_perror("Error writing vhost-user queue eventfd"); +} + +/** + * virtq_avail_event() - Get location of available event indices + * (only with VIRTIO_F_EVENT_IDX) + * @vq: Virtqueue + * + * Return: return the location of the available event index + */ +static inline uint16_t *virtq_avail_event(const struct vu_virtq *vq) +{ + /* For backwards compat, avail event index is at *end* of used ring. */ + return (uint16_t *)&vq->vring.used->ring[vq->vring.num]; +} + +/** + * vring_set_avail_event() - Set avail_event + * @vq: Virtqueue + * @val: Value to set to avail_event + * avail_event is used in the same way the used_event is in the + * avail_ring. + * avail_event is used to advise the driver that notifications + * are unnecessary until the driver writes entry with an index + * specified by avail_event into the available ring. 
/**
 * virtqueue_map_desc() - Translate descriptor ring physical address into our
 *			  virtual address space
 * @dev:	Vhost-user device
 * @p_num_sg:	First iov entry to use (input),
 *		first iov entry not used (output)
 * @iov:	Iov array to use to store buffer virtual addresses
 * @max_num_sg:	Maximum number of iov entries
 * @pa:		Guest physical address of the buffer to map into our virtual
 *		address
 * @sz:		Size of the buffer
 *
 * A buffer that crosses guest memory region boundaries takes one iov entry
 * per region. Unmappable addresses and a lack of iov entries are fatal.
 *
 * Return: false on error, true otherwise
 */
static bool virtqueue_map_desc(const struct vu_dev *dev,
			       unsigned int *p_num_sg, struct iovec *iov,
			       unsigned int max_num_sg,
			       uint64_t pa, size_t sz)
{
	unsigned int num_sg = *p_num_sg;

	ASSERT(num_sg < max_num_sg);
	ASSERT(sz);

	while (sz) {
		uint64_t len = sz;

		/* The check above only covers the first entry: each memory
		 * region crossed by the buffer takes one more, so make sure
		 * there is room before every write to the iov array.
		 */
		if (num_sg >= max_num_sg)
			die("vhost-user: too many iov entries for buffer");

		iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
		if (iov[num_sg].iov_base == NULL)
			die("vhost-user: invalid address for buffers");
		iov[num_sg].iov_len = len;
		num_sg++;
		sz -= len;
		pa += len;
	}

	*p_num_sg = num_sg;
	return true;
}
int desc_len; + uint64_t desc_addr; + + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) + die("vhost-user: Invalid size for indirect buffer table"); + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[i].addr); + desc_len = le32toh(desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) + desc = desc_buf; + } + if (!desc) + die("vhost-user: Invalid indirect buffer table"); + i = 0; + } + + /* Collect all the descriptors */ + do { + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { + if (!virtqueue_map_desc(dev, &in_num, elem->in_sg, + elem->in_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) + return -1; + } else { + if (in_num) + die("Incorrect order for descriptors"); + if (!virtqueue_map_desc(dev, &out_num, elem->out_sg, + elem->out_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } + + /* If we've got too many, that implies a descriptor loop. 
*/ + if ((in_num + out_num) > max) + die("vhost-user: Loop in queue descriptor list"); + rc = virtqueue_read_next_desc(desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) + die("vhost-user: Failed to read descriptor list"); + + elem->index = idx; + elem->in_num = in_num; + elem->out_num = out_num; + + return 0; +} + +/** + * vu_queue_pop() - Pop an entry from the virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Virtqueue element to fill with the entry information + * + * Return: -1 if there is an error, 0 otherwise + */ +int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq, + struct vu_virtq_element *elem) +{ + unsigned int head; + int ret; + + if (!vq->vring.avail) + return -1; + + if (vu_queue_empty(vq)) + return -1; + + /* Needed after vu_queue_empty(), see comment in + * virtqueue_num_heads(). + */ + smp_rmb(); + + if (vq->inuse >= vq->vring.num) + die("vhost-user queue size exceeded"); + + virtqueue_get_head(vq, vq->last_avail_idx++, &head); + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + vring_set_avail_event(vq, vq->last_avail_idx); + + ret = vu_queue_map_desc(dev, vq, head, elem); + + if (ret < 0) + return ret; + + vq->inuse++; + + return 0; +} + +/** + * vu_queue_detach_element() - Detach an element from the virtqueue + * @vq: Virtqueue + */ +void vu_queue_detach_element(struct vu_virtq *vq) +{ + vq->inuse--; + /* unmap, when DMA support is added */ +} + +/** + * vu_queue_unpop() - Push back the previously popped element from the virtqueue + * @vq: Virtqueue + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_unpop(struct vu_virtq *vq) +{ + vq->last_avail_idx--; + vu_queue_detach_element(vq); +} + +/** + * vu_queue_rewind() - Push back a given number of popped elements + * @vq: Virtqueue + * @num: Number of element to unpop + * + * Return: True on success, false if not + */ +bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num) +{ + if (num > vq->inuse) + 
return false; + + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +/** + * vring_used_write() - Write an entry in the used ring + * @dev: Vhost-user device + * @vq: Virtqueue + * @uelem: Entry to write + * @i: Index of the entry in the used ring + */ +static inline void vring_used_write(const struct vu_dev *vdev, + struct vu_virtq *vq, + const struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; + vu_log_write(vdev, vq->vring.log_guest_addr + + offsetof(struct vring_used, ring[i]), + sizeof(used->ring[i])); +} + +/** + * vu_log_queue_fill() - Log virtqueue memory update + * @dev: vhost-user device + * @vq: Virtqueue + * @index: Descriptor ring index + * @len: Size of the element + */ +static void vu_log_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq, + unsigned int index, unsigned int len) +{ + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + struct vring_desc *desc = vq->vring.desc; + unsigned int max, min; + unsigned num_bufs = 0; + uint64_t read_len; + + if (!vdev->log_table || !len || !vu_has_feature(vdev, VHOST_F_LOG_ALL)) + return; + + max = vq->vring.num; + + if (le16toh(desc[index].flags) & VRING_DESC_F_INDIRECT) { + unsigned int desc_len; + uint64_t desc_addr; + + if (le32toh(desc[index].len) % sizeof(struct vring_desc)) + die("Invalid size for indirect buffer table"); + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[index].addr); + desc_len = le32toh(desc[index].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(vdev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(vdev, desc_buf, + desc_addr, + desc_len)) + desc = desc_buf; + } + + if (!desc) + die("Invalid indirect buffer table"); + + index = 0; + } + + do { + if (++num_bufs > max) + die("Looped descriptor"); + + if (le16toh(desc[index].flags) & 
VRING_DESC_F_WRITE) { + min = MIN(le32toh(desc[index].len), len); + vu_log_write(vdev, le64toh(desc[index].addr), min); + len -= min; + } + } while (len > 0 && + (virtqueue_read_next_desc(desc, index, max, &index) == + VIRTQUEUE_READ_DESC_MORE)); +} + + +/** + * vu_queue_fill_by_index() - Update information of a descriptor ring entry + * in the used ring + * @dev: Vhost-user device + * @vq: Virtqueue + * @index: Descriptor ring index + * @len: Size of the element + * @idx: Used ring entry index + */ +static void vu_queue_fill_by_index(const struct vu_dev *vdev, + struct vu_virtq *vq, + unsigned int index, unsigned int len, + unsigned int idx) +{ + struct vring_used_elem uelem; + + if (!vq->vring.avail) + return; + + vu_log_queue_fill(vdev, vq, index, len); + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = htole32(index); + uelem.len = htole32(len); + vring_used_write(vdev, vq, &uelem, idx); +} + +/** + * vu_queue_fill() - Update information of a given element in the used ring + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Element information to fill + * @len: Size of the element + * @idx: Used ring entry index + */ +void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq, + const struct vu_virtq_element *elem, unsigned int len, + unsigned int idx) +{ + vu_queue_fill_by_index(vdev, vq, elem->index, len, idx); +} + +/** + * vring_used_idx_set() - Set the descriptor ring current index + * @dev: Vhost-user device + * @vq: Virtqueue + * @val: Value to set in the index + */ +static inline void vring_used_idx_set(const struct vu_dev *vdev, + struct vu_virtq *vq, uint16_t val) +{ + vq->vring.used->idx = htole16(val); + vu_log_write(vdev, vq->vring.log_guest_addr + + offsetof(struct vring_used, idx), + sizeof(vq->vring.used->idx)); + + vq->used_idx = val; +} + +/** + * vu_queue_flush() - Flush the virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @count: Number of entry to flush + */ +void vu_queue_flush(const struct vu_dev 
*vdev, struct vu_virtq *vq, + unsigned int count) +{ + uint16_t old, new; + + if (!vq->vring.avail) + return; + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(vdev, vq, new); + vq->inuse -= count; + if ((uint16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) + vq->signalled_used_valid = false; +} diff --git a/virtio.h b/virtio.h new file mode 100644 index 0000000..d8beb88 --- /dev/null +++ b/virtio.h @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * virtio API, vring and virtqueue functions definition + * + * Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include <stdbool.h> +#include <linux/vhost_types.h> + +/* Maximum size of a virtqueue */ +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * struct vu_ring - Virtqueue rings + * @num: Size of the queue + * @desc: Descriptor ring + * @avail: Available ring + * @used: Used ring + * @log_guest_addr: Guest address for logging + * @flags: Vring flags + * VHOST_VRING_F_LOG is set if log address is valid + */ +struct vu_ring { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +}; + +/** + * struct vu_virtq - Virtqueue definition + * @vring: Virtqueue rings + * @last_avail_idx: Next head to pop + * @shadow_avail_idx: Last avail_idx read from VQ. + * @used_idx: Descriptor ring current index + * @signalled_used: Last used index value we have signalled on + * @signalled_used_valid: True if signalled_used if valid + * @notification: True if the queues notify (via event + * index or interrupt) + * @inuse: Number of entries in use + * @call_fd: The event file descriptor to signal when + * buffers are used. 
/**
 * struct vu_dev_region - guest shared memory region
 * @gpa:		Guest physical address of the region
 * @size:		Memory size in bytes
 * @qva:		QEMU virtual address
 * @mmap_offset:	Offset where the region starts in the mapped memory
 * @mmap_addr:		Address of the mapped memory
 *
 * A guest physical address A with @gpa <= A < @gpa + @size is translated by
 * vu_gpa_to_va() to A - @gpa + @mmap_addr + @mmap_offset in our address
 * space.
 */
struct vu_dev_region {
	uint64_t gpa;
	uint64_t size;
	uint64_t qva;
	uint64_t mmap_offset;
	uint64_t mmap_addr;
};
/**
 * has_feature() - Check a feature bit in a features set
 * @features:	Features set
 * @fbit:	Feature bit to check
 *
 * Return: True if the feature bit is set, false if it is clear or if @fbit
 *	   is not a valid bit number for a 64-bit features set
 */
static inline bool has_feature(uint64_t features, unsigned int fbit)
{
	/* Shifting by the width of the type or more is undefined behaviour */
	if (fbit >= 64)
		return false;

	return !!(features & (1ULL << fbit));
}
inline bool vu_has_protocol_feature(const struct vu_dev *vdev, + unsigned int fbit) +{ + return has_feature(vdev->protocol_features, fbit); +} + +void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq); +int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq, + struct vu_virtq_element *elem); +void vu_queue_detach_element(struct vu_virtq *vq); +void vu_queue_unpop(struct vu_virtq *vq); +bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num); +void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq, + const struct vu_virtq_element *elem, unsigned int len, + unsigned int idx); +void vu_queue_flush(const struct vu_dev *vdev, struct vu_virtq *vq, + unsigned int count); +#endif /* VIRTIO_H */ diff --git a/vu_common.c b/vu_common.c new file mode 100644 index 0000000..5e6fd4a --- /dev/null +++ b/vu_common.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Red Hat + * Author: Laurent Vivier <lvivier@redhat.com> + * + * common_vu.c - vhost-user common UDP and TCP functions + */ + +#include <errno.h> +#include <unistd.h> +#include <sys/uio.h> +#include <sys/eventfd.h> +#include <netinet/if_ether.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "passt.h" +#include "tap.h" +#include "vhost_user.h" +#include "pcap.h" +#include "vu_common.h" +#include "migrate.h" + +#define VU_MAX_TX_BUFFER_NB 2 + +/** + * vu_packet_check_range() - Check if a given memory zone is contained in + * a mapped guest memory region + * @buf: Array of the available memory regions + * @ptr: Start of desired data range + * @size: Length of desired data range + * + * Return: 0 if the zone is in a mapped memory region, -1 otherwise + */ +int vu_packet_check_range(void *buf, const char *ptr, size_t len) +{ + struct vu_dev_region *dev_region; + + for (dev_region = buf; dev_region->mmap_addr; dev_region++) { + uintptr_t base_addr = dev_region->mmap_addr + + dev_region->mmap_offset; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ 
+		const char *base = (const char *)base_addr;
+
+		ASSERT(base_addr >= dev_region->mmap_addr);
+
+		if (len <= dev_region->size && base <= ptr &&
+		    (size_t)(ptr - base) <= dev_region->size - len)
+			return 0;
+	}
+
+	return -1;
+}
+
+/**
+ * vu_init_elem() - initialize an array of virtqueue elements with 1 iov in each
+ * @elem:	Array of virtqueue elements to initialize
+ * @iov:	Array of iovec, one entry is assigned to each virtqueue element
+ * @elem_cnt:	Number of virtqueue elements (and of entries in @iov)
+ *
+ * Each element gets no out buffer and exactly one in buffer, iov[i].
+ */
+void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt)
+{
+	int i;
+
+	for (i = 0; i < elem_cnt; i++)
+		vu_set_element(&elem[i], NULL, &iov[i]);
+}
+
+/**
+ * vu_collect() - collect virtio buffers from a given virtqueue
+ * @vdev:		vhost-user device
+ * @vq:			virtqueue to collect from
+ * @elem:		Array of virtqueue element
+ * 			each element must be initialized with one iovec entry
+ * 			in the in_sg array.
+ * @max_elem:		Number of virtqueue elements in the array
+ * @size:		Maximum size of the data in the frame
+ * @frame_size:		The total size of the buffers (output), can be NULL
+ *
+ * Buffers are popped until @size bytes are available, @max_elem elements are
+ * used, or the queue has nothing more to give. The length of the last buffer
+ * is trimmed so that the total never exceeds @size. Without
+ * VIRTIO_NET_F_MRG_RXBUF at most one element is collected.
+ *
+ * Return: number of elements used to contain the frame
+ */
+int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
+	       struct vu_virtq_element *elem, int max_elem,
+	       size_t size, size_t *frame_size)
+{
+	size_t current_size = 0;
+	int elem_cnt = 0;
+
+	while (current_size < size && elem_cnt < max_elem) {
+		struct iovec *iov;
+		int ret;
+
+		ret = vu_queue_pop(vdev, vq, &elem[elem_cnt]);
+		if (ret < 0)
+			break;
+
+		if (elem[elem_cnt].in_num < 1) {
+			warn("virtio-net receive queue contains no in buffers");
+			/* the element just popped is unusable: give it back */
+			vu_queue_detach_element(vq);
+			break;
+		}
+
+		iov = &elem[elem_cnt].in_sg[0];
+
+		/* don't account for more room than the frame needs */
+		if (iov->iov_len > size - current_size)
+			iov->iov_len = size - current_size;
+
+		current_size += iov->iov_len;
+		elem_cnt++;
+
+		/* a frame can span several elements only with mergeable buffers */
+		if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
+			break;
+	}
+
+	if (frame_size)
+		*frame_size = current_size;
+
+	return elem_cnt;
+}
+
+/**
+ * vu_set_vnethdr() - set virtio-net headers
+ * @vdev:		vhost-user device
+ * @vnethdr:		Address of the header to set
+ * @num_buffers:	Number of guest buffers of the frame
+ *
+ * @num_buffers is stored (little-endian) only if VIRTIO_NET_F_MRG_RXBUF has
+ * been negotiated.
+ */
+void vu_set_vnethdr(const struct vu_dev *vdev,
+		    struct virtio_net_hdr_mrg_rxbuf *vnethdr,
+		    int num_buffers)
+{
+	vnethdr->hdr = VU_HEADER;
+	if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
+		vnethdr->num_buffers = htole16(num_buffers);
+}
+
+/**
+ * vu_flush() - flush all the collected buffers to the vhost-user interface
+ * @vdev:	vhost-user device
+ * @vq:		vhost-user virtqueue
+ * @elem:	virtqueue elements array to send back to the virtqueue
+ * @elem_cnt:	Length of the array
+ *
+ * The length reported for each element is the length of its first in buffer.
+ */
+void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
+	      struct vu_virtq_element *elem, int elem_cnt)
+{
+	int i;
+
+	for (i = 0; i < elem_cnt; i++)
+		vu_queue_fill(vdev, vq, &elem[i], elem[i].in_sg[0].iov_len, i);
+
+	vu_queue_flush(vdev, vq, elem_cnt);
+	vu_queue_notify(vdev, vq);
+}
+
+/**
+ * vu_handle_tx() - Receive data from the TX virtqueue
+ * @vdev:	vhost-user device
+ * @index:	index of the virtqueue
+ * @now:	Current timestamp
+ *
+ * Frames are accepted either as a single buffer holding the virtio-net header
+ * followed by the frame, or as two buffers with the header alone in the first
+ * one. Any other layout is logged and the frame is dropped. Every popped
+ * element with at least one out buffer is given back to the guest, whether
+ * its frame was used or not.
+ */
+static void vu_handle_tx(struct vu_dev *vdev, int index,
+			 const struct timespec *now)
+{
+	struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
+	struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
+	struct vu_virtq *vq = &vdev->vq[index];
+	int hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	int out_sg_count;
+	int count;
+
+	ASSERT(VHOST_USER_IS_QUEUE_TX(index));
+
+	tap_flush_pools();
+
+	count = 0;
+	out_sg_count = 0;
+	/* stop while there is still room for a full set of out buffers */
+	while (count < VIRTQUEUE_MAX_SIZE &&
+	       out_sg_count + VU_MAX_TX_BUFFER_NB <= VIRTQUEUE_MAX_SIZE) {
+		int ret;
+
+		elem[count].out_num = VU_MAX_TX_BUFFER_NB;
+		elem[count].out_sg = &out_sg[out_sg_count];
+		elem[count].in_num = 0;
+		elem[count].in_sg = NULL;
+
+		ret = vu_queue_pop(vdev, vq, &elem[count]);
+		if (ret < 0)
+			break;
+		out_sg_count += elem[count].out_num;
+
+		if (elem[count].out_num < 1) {
+			warn("virtio-net transmit queue contains no out buffers");
+			break;
+		}
+		if (elem[count].out_num == 1) {
+			/* The buffer length comes from the guest: if it's
+			 * shorter than the header, the subtraction below would
+			 * wrap and the data pointer would be out of bounds.
+			 */
+			if (elem[count].out_sg[0].iov_len < (size_t)hdrlen) {
+				debug("virtio-net transmit queue entry shorter than hdrlen ([%d]: %zu < %d)",
+				      count, elem[count].out_sg[0].iov_len,
+				      hdrlen);
+			} else {
+				tap_add_packet(vdev->context,
+					       elem[count].out_sg[0].iov_len - hdrlen,
+					       (char *)elem[count].out_sg[0].iov_base +
+					       hdrlen, now);
+			}
+		} else {
+			/* vnet header can be in a separate iovec */
+			if (elem[count].out_num != 2) {
+				debug("virtio-net transmit queue contains more than one buffer ([%d]: %u)",
+				      count, elem[count].out_num);
+			} else if (elem[count].out_sg[0].iov_len != (size_t)hdrlen) {
+				debug("virtio-net transmit queue entry not aligned on hdrlen ([%d]: %d != %zu)",
+				      count, hdrlen, elem[count].out_sg[0].iov_len);
+			} else {
+				tap_add_packet(vdev->context,
+					       elem[count].out_sg[1].iov_len,
+					       (char *)elem[count].out_sg[1].iov_base,
+					       now);
+			}
+		}
+
+		count++;
+	}
+	tap_handler(vdev->context, now);
+
+	if (count) {
+		int i;
+
+		/* nothing was written to the out buffers: used length is 0 */
+		for (i = 0; i < count; i++)
+			vu_queue_fill(vdev, vq, &elem[i], 0, i);
+		vu_queue_flush(vdev, vq, count);
+		vu_queue_notify(vdev, vq);
+	}
+}
+
+/**
+ * vu_kick_cb() - Called on a kick event to start to receive data
+ * @vdev:	vhost-user device
+ * @ref:	epoll reference information
+ * @now:	Current timestamp
+ *
+ * The kick is consumed from the eventfd in @ref; only kicks on a TX queue
+ * trigger processing. A failure to read the eventfd is fatal.
+ */
+void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
+		const struct timespec *now)
+{
+	eventfd_t kick_data;
+	ssize_t rc;
+
+	rc = eventfd_read(ref.fd, &kick_data);
+	if (rc == -1)
+		die_perror("vhost-user kick eventfd_read()");
+
+	trace("vhost-user: got kick_data: %016"PRIx64" idx: %d",
+	      kick_data, ref.queue);
+	if (VHOST_USER_IS_QUEUE_TX(ref.queue))
+		vu_handle_tx(vdev, ref.queue, now);
+}
+
+/**
+ * vu_send_single() - Send a buffer to the front-end using the RX virtqueue
+ * @c:		execution context
+ * @buf:	address of the buffer
+ * @size:	size of the buffer
+ *
+ * Return: number of bytes sent (virtio-net header excluded), -1 if there is
+ *	   an error
+ */
+int vu_send_single(const struct ctx *c, const void *buf, size_t size)
+{
+	struct vu_dev *vdev = c->vdev;
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
+	struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
+	size_t total;
+	int elem_cnt;
+	int i;
+
+	trace("vu_send_single size %zu", size);
+
+	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
+		debug("Got packet, but RX virtqueue not usable yet");
+		return -1;
+	}
+
+	vu_init_elem(elem, in_sg, VIRTQUEUE_MAX_SIZE);
+
+	/* the guest buffers must hold the virtio-net header too */
+	size += sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	elem_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, size, &total);
+	if (total < size) {
+		debug("vu_send_single: no space to send the data "
+		      "elem_cnt %d size %zu", elem_cnt, total);
+		goto err;
+	}
+
+	vu_set_vnethdr(vdev, in_sg[0].iov_base, elem_cnt);
+
+	total -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
+
+	/* copy data from the buffer to the iovec */
+	iov_from_buf(in_sg, elem_cnt, sizeof(struct virtio_net_hdr_mrg_rxbuf),
+		     buf, total);
+
+	if (*c->pcap) {
+		pcap_iov(in_sg, elem_cnt,
+			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
+	}
+
+	vu_flush(vdev, vq, elem, elem_cnt);
+
+	trace("vhost-user sent %zu", total);
+
+	return total;
+err:
+	/* give back every element collected so far */
+	for (i = 0; i < elem_cnt; i++)
+		vu_queue_detach_element(vq);
+
+	return -1;
+}
diff --git a/vu_common.h b/vu_common.h
new file mode 100644
index 0000000..f538f23
--- /dev/null
+++ b/vu_common.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ * Author: Laurent Vivier <lvivier@redhat.com>
+ *
+ * vhost-user common UDP and TCP functions
+ */
+
+#ifndef VU_COMMON_H
+#define VU_COMMON_H
+#include <linux/virtio_net.h>
+
+static inline void *vu_eth(void *base)
+{
+	return ((char *)base + sizeof(struct virtio_net_hdr_mrg_rxbuf));
+}
+
+static inline void *vu_ip(void *base)
+{
+	return (struct ethhdr *)vu_eth(base) + 1;
+}
+
+static inline void *vu_payloadv4(void *base)
+{
+	return (struct iphdr *)vu_ip(base) + 1;
+}
+
+static inline void *vu_payloadv6(void *base)
+{
+	return (struct ipv6hdr *)vu_ip(base) + 1;
+}
+
+/**
+ * vu_set_element() - Initialize a vu_virtq_element
+ * @elem:	Element to initialize
+ * @out_sg:	One out iovec entry to set in elem
+ * @in_sg:	One in iovec entry to set in elem
+ */
+static inline void vu_set_element(struct vu_virtq_element *elem,
+				  struct iovec *out_sg, struct iovec *in_sg)
+{
+	elem->out_num = !!out_sg;
+	elem->out_sg = out_sg;
+	elem->in_num = !!in_sg;
+	elem->in_sg = in_sg;
+}
+
+void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov,
+		  int elem_cnt);
+int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
+	       struct vu_virtq_element *elem, int max_elem, size_t size,
+	       size_t *frame_size);
+void vu_set_vnethdr(const struct vu_dev *vdev,
+		    struct virtio_net_hdr_mrg_rxbuf *vnethdr,
+		    int num_buffers);
+void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
+	      struct vu_virtq_element *elem, int elem_cnt);
+void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
+		const struct timespec *now);
+int vu_send_single(const struct ctx *c, const void *buf, size_t size);
+
+#endif /* VU_COMMON_H */
|