diff options
author | Stefano Brivio <sbrivio@redhat.com> | 2021-07-17 08:34:53 +0200 |
---|---|---|
committer | Stefano Brivio <sbrivio@redhat.com> | 2021-07-17 11:04:22 +0200 |
commit | 33482d5bf29312464b208beb01a5302257e82fe6 (patch) | |
tree | 6fcb11961ecca0cbed42bccbba15b1d4fe73a62c /util.c | |
parent | 28fca04eb990f11608187252ca8949d7df22ce9d (diff) | |
download | passt-33482d5bf29312464b208beb01a5302257e82fe6.tar passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.gz passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.bz2 passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.lz passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.xz passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.zst passt-33482d5bf29312464b208beb01a5302257e82fe6.zip |
passt: Add PASTA mode, major rework
PASTA (Pack A Subtle Tap Abstraction) provides quasi-native host
connectivity to an otherwise disconnected, unprivileged network
and user namespace, similarly to slirp4netns. Given that the
implementation is largely overlapping with PASST, no separate binary
is built: 'pasta' (and 'passt4netns' for clarity) both link to
'passt', and the mode of operation is selected depending on how the
binary is invoked. Usage example:
$ unshare -rUn
# echo $$
1871759
$ ./pasta 1871759 # From another terminal
# udhcpc -i pasta0 2>/dev/null
# ping -c1 pasta.pizza
PING pasta.pizza (64.190.62.111) 56(84) bytes of data.
64 bytes from 64.190.62.111 (64.190.62.111): icmp_seq=1 ttl=255 time=34.6 ms
--- pasta.pizza ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 34.575/34.575/34.575/0.000 ms
# ping -c1 spaghetti.pizza
PING spaghetti.pizza(2606:4700:3034::6815:147a (2606:4700:3034::6815:147a)) 56 data bytes
64 bytes from 2606:4700:3034::6815:147a (2606:4700:3034::6815:147a): icmp_seq=1 ttl=255 time=29.0 ms
--- spaghetti.pizza ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 28.967/28.967/28.967/0.000 ms
This entails a major rework, especially with regard to the storage of
tracked connections and to the semantics of epoll(7) references.
Indexing TCP and UDP bindings merely by socket proved to be
inflexible and unsuitable to handle different connection flows: pasta
also provides Layer-4 to Layer-4 socket mapping between init and a
separate namespace for local connections, using a pair of splice()
system calls for TCP, and a recvmmsg()/sendmmsg() pair for UDP local
bindings. For instance, building on the previous example:
# ip link set dev lo up
# iperf3 -s
$ iperf3 -c ::1 -Z -w 32M -l 1024k -P2 | tail -n4
[SUM] 0.00-10.00 sec 52.3 GBytes 44.9 Gbits/sec 283 sender
[SUM] 0.00-10.43 sec 52.3 GBytes 43.1 Gbits/sec receiver
iperf Done.
epoll(7) references now include a generic part in order to
demultiplex data to the relevant protocol handler, using 24
bits for the socket number, and an opaque portion reserved for
usage by the single protocol handlers, in order to track sockets
back to corresponding connections and bindings.
A number of fixes pertaining to TCP state machine and congestion
window handling are also included here.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Diffstat (limited to 'util.c')
-rw-r--r-- | util.c | 162 |
1 files changed, 131 insertions, 31 deletions
@@ -1,14 +1,19 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * util.c - Convenience helpers * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio <sbrivio@redhat.com> - * */ +#define _GNU_SOURCE +#include <sched.h> #include <stdio.h> #include <stdint.h> #include <stddef.h> @@ -20,13 +25,16 @@ #include <netinet/tcp.h> #include <netinet/udp.h> #include <sys/epoll.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> #include <syslog.h> #include <stdarg.h> #include <string.h> #include <time.h> -#include "passt.h" #include "util.h" +#include "passt.h" #ifdef DEBUG #define logfn(name, level) \ @@ -183,73 +191,72 @@ char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto) * sock_l4() - Create and bind socket for given L4, add to epoll list * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @proto: Protocol number, host order + * @proto: Protocol number * @port: Port, host order + * @lo: Bind to loopback address only, if set + * @data: epoll reference portion for protocol handlers * * Return: newly created socket, -1 on error */ -int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port) +int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo, + uint32_t data) { + union epoll_ref ref = { .proto = proto, .data = data }; struct sockaddr_in addr4 = { .sin_family = AF_INET, .sin_port = htons(port), - .sin_addr = { .s_addr = INADDR_ANY }, }; struct sockaddr_in6 addr6 = { .sin6_family = AF_INET6, .sin6_port = htons(port), - .sin6_addr = IN6ADDR_ANY_INIT, }; - struct epoll_event ev = { 0 }; const struct sockaddr *sa; + struct epoll_event ev; int fd, sl, one = 1; if (proto != IPPROTO_TCP && proto != IPPROTO_UDP && proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6) return -1; /* Not implemented. */ - fd = socket(af, proto == IPPROTO_TCP ? 
SOCK_STREAM : SOCK_DGRAM, proto); + if (proto == IPPROTO_TCP) + fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto); + else + fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto); if (fd < 0) { perror("L4 socket"); return -1; } + ref.s = fd; if (af == AF_INET) { + if (lo) + addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + else + addr4.sin_addr.s_addr = htonl(INADDR_ANY); + sa = (const struct sockaddr *)&addr4; sl = sizeof(addr4); } else { + if (lo) + addr6.sin6_addr = in6addr_loopback; + else + addr6.sin6_addr = in6addr_any; + sa = (const struct sockaddr *)&addr6; sl = sizeof(addr6); setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)); } - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMP, icmp, fd); - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMPV6, icmp, fd); - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_TCP, tcp, fd); - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_UDP, udp, fd); - - if (proto == IPPROTO_UDP && PORT_IS_EPHEMERAL(port)) - goto epoll_add; - - if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) - goto epoll_add; + setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); if (bind(fd, sa, sl) < 0) { /* We'll fail to bind to low ports if we don't have enough * capabilities, and we'll fail to bind on already bound ports, - * this is fine. If this isn't the socket with the lowest number - * for a given protocol, leave it open, to avoid unnecessary - * holes in the numbering. + * this is fine. 
*/ - if ((proto == IPPROTO_TCP && fd == c->tcp.fd_min) || - (proto == IPPROTO_UDP && fd == c->udp.fd_min) || - ((proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) && - fd == c->icmp.fd_min)) { - close(fd); - return 0; - } - return fd; + close(fd); + return 0; } if (proto == IPPROTO_TCP && listen(fd, 128) < 0) { @@ -258,9 +265,8 @@ int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port) return -1; } -epoll_add: ev.events = EPOLLIN; - ev.data.fd = fd; + ev.data.u64 = ref.u64; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) { perror("L4 epoll_ctl"); return -1; @@ -286,3 +292,97 @@ int timespec_diff_ms(struct timespec *a, struct timespec *b) return (a->tv_nsec - b->tv_nsec) / 1000000 + (a->tv_sec - b->tv_sec) * 1000; } + +/** + * bitmap_set() - Set single bit in bitmap + * @map: Pointer to bitmap + * @bit: Bit number to set + */ +void bitmap_set(uint8_t *map, int bit) +{ + map[bit / 8] |= 1 << (bit % 8); +} + +/** + * bitmap_clear() - Clear single bit in bitmap + * @map: Pointer to bitmap + * @bit: Bit number to clear + */ +void bitmap_clear(uint8_t *map, int bit) +{ + map[bit / 8] &= ~(1 << (bit % 8)); +} + +/** + * bitmap_isset() - Check for set bit in bitmap + * @map: Pointer to bitmap + * @bit: Bit number to check + * + * Return: non-zero if given bit is set, zero if it's not + */ +int bitmap_isset(uint8_t *map, int bit) +{ + return map[bit / 8] & (1 << bit % 8); +} + +/** + * procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs + * @name: Corresponding name of file under /proc/net/ + * @map: Bitmap where numbers of ports in listening state will be set + */ +void procfs_scan_listen(char *name, uint8_t *map) +{ + char line[200], path[PATH_MAX]; + unsigned long port; + unsigned int state; + FILE *fp; + + snprintf(path, PATH_MAX, "/proc/net/%s", name); + if (!(fp = fopen(path, "r"))) + return; + + fgets(line, sizeof(line), fp); + while (fgets(line, sizeof(line), fp)) { + if (sscanf(line, "%*u: %*x:%lx %*x:%*x %x", &port, 
&state) != 2) + continue; + + /* See enum in kernel's include/net/tcp_states.h */ + if ((strstr(name, "tcp") && state != 0x0a) || + (strstr(name, "udp") && state != 0x07)) + continue; + + bitmap_set(map, port); + } + + fclose(fp); +} + +/** + * ns_enter() - Enter user and network namespaces of process with given PID + * @target_pid: Process PID + * + * Return: 0 on success, -1 on failure + */ +int ns_enter(int target_pid) +{ + char ns[PATH_MAX]; + int fd; + + snprintf(ns, PATH_MAX, "/proc/%i/ns/user", target_pid); + if ((fd = open(ns, O_RDONLY)) < 0 || setns(fd, 0)) + goto fail; + close(fd); + + snprintf(ns, PATH_MAX, "/proc/%i/ns/net", target_pid); + if ((fd = open(ns, O_RDONLY)) < 0 || setns(fd, 0)) + goto fail; + close(fd); + + return 0; + +fail: + if (fd != -1) + close(fd); + + return -1; +} |