aboutgitcodebugslistschat
path: root/util.c
diff options
context:
space:
mode:
authorStefano Brivio <sbrivio@redhat.com>2021-07-17 08:34:53 +0200
committerStefano Brivio <sbrivio@redhat.com>2021-07-17 11:04:22 +0200
commit33482d5bf29312464b208beb01a5302257e82fe6 (patch)
tree6fcb11961ecca0cbed42bccbba15b1d4fe73a62c /util.c
parent28fca04eb990f11608187252ca8949d7df22ce9d (diff)
downloadpasst-33482d5bf29312464b208beb01a5302257e82fe6.tar
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.gz
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.bz2
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.lz
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.xz
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.zst
passt-33482d5bf29312464b208beb01a5302257e82fe6.zip
passt: Add PASTA mode, major rework
PASTA (Pack A Subtle Tap Abstraction) provides quasi-native host connectivity to an otherwise disconnected, unprivileged network and user namespace, similarly to slirp4netns. Given that the implementation is largely overlapping with PASST, no separate binary is built: 'pasta' (and 'passt4netns' for clarity) both link to 'passt', and the mode of operation is selected depending on how the binary is invoked. Usage example: $ unshare -rUn # echo $$ 1871759 $ ./pasta 1871759 # From another terminal # udhcpc -i pasta0 2>/dev/null # ping -c1 pasta.pizza PING pasta.pizza (64.190.62.111) 56(84) bytes of data. 64 bytes from 64.190.62.111 (64.190.62.111): icmp_seq=1 ttl=255 time=34.6 ms --- pasta.pizza ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 34.575/34.575/34.575/0.000 ms # ping -c1 spaghetti.pizza PING spaghetti.pizza(2606:4700:3034::6815:147a (2606:4700:3034::6815:147a)) 56 data bytes 64 bytes from 2606:4700:3034::6815:147a (2606:4700:3034::6815:147a): icmp_seq=1 ttl=255 time=29.0 ms --- spaghetti.pizza ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 28.967/28.967/28.967/0.000 ms This entails a major rework, especially with regard to the storage of tracked connections and to the semantics of epoll(7) references. Indexing TCP and UDP bindings merely by socket proved to be inflexible and unsuitable to handle different connection flows: pasta also provides Layer-2 to Layer-2 socket mapping between init and a separate namespace for local connections, using a pair of splice() system calls for TCP, and a recvmmsg()/sendmmsg() pair for UDP local bindings. For instance, building on the previous example: # ip link set dev lo up # iperf3 -s $ iperf3 -c ::1 -Z -w 32M -l 1024k -P2 | tail -n4 [SUM] 0.00-10.00 sec 52.3 GBytes 44.9 Gbits/sec 283 sender [SUM] 0.00-10.43 sec 52.3 GBytes 43.1 Gbits/sec receiver iperf Done. 
epoll(7) references now include a generic part in order to demultiplex data to the relevant protocol handler, using 24 bits for the socket number, and an opaque portion reserved for usage by the individual protocol handlers, in order to track sockets back to corresponding connections and bindings. A number of fixes pertaining to TCP state machine and congestion window handling are also included here. Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Diffstat (limited to 'util.c')
-rw-r--r--util.c162
1 files changed, 131 insertions, 31 deletions
diff --git a/util.c b/util.c
index 59a0cb2..1372eec 100644
--- a/util.c
+++ b/util.c
@@ -1,14 +1,19 @@
// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
*
* util.c - Convenience helpers
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
*/
+#define _GNU_SOURCE
+#include <sched.h>
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
@@ -20,13 +25,16 @@
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
#include <syslog.h>
#include <stdarg.h>
#include <string.h>
#include <time.h>
-#include "passt.h"
#include "util.h"
+#include "passt.h"
#ifdef DEBUG
#define logfn(name, level) \
@@ -183,73 +191,72 @@ char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto)
* sock_l4() - Create and bind socket for given L4, add to epoll list
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
- * @proto: Protocol number, host order
+ * @proto: Protocol number
* @port: Port, host order
+ * @lo: Bind to loopback address only, if set
+ * @data: epoll reference portion for protocol handlers
*
* Return: newly created socket, -1 on error
*/
-int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port)
+int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo,
+ uint32_t data)
{
+ union epoll_ref ref = { .proto = proto, .data = data };
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
.sin_port = htons(port),
- .sin_addr = { .s_addr = INADDR_ANY },
};
struct sockaddr_in6 addr6 = {
.sin6_family = AF_INET6,
.sin6_port = htons(port),
- .sin6_addr = IN6ADDR_ANY_INIT,
};
- struct epoll_event ev = { 0 };
const struct sockaddr *sa;
+ struct epoll_event ev;
int fd, sl, one = 1;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6)
return -1; /* Not implemented. */
- fd = socket(af, proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM, proto);
+ if (proto == IPPROTO_TCP)
+ fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto);
+ else
+ fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto);
if (fd < 0) {
perror("L4 socket");
return -1;
}
+ ref.s = fd;
if (af == AF_INET) {
+ if (lo)
+ addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ else
+ addr4.sin_addr.s_addr = htonl(INADDR_ANY);
+
sa = (const struct sockaddr *)&addr4;
sl = sizeof(addr4);
} else {
+ if (lo)
+ addr6.sin6_addr = in6addr_loopback;
+ else
+ addr6.sin6_addr = in6addr_any;
+
sa = (const struct sockaddr *)&addr6;
sl = sizeof(addr6);
setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one));
}
- CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMP, icmp, fd);
- CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMPV6, icmp, fd);
- CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_TCP, tcp, fd);
- CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_UDP, udp, fd);
-
- if (proto == IPPROTO_UDP && PORT_IS_EPHEMERAL(port))
- goto epoll_add;
-
- if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
- goto epoll_add;
+ setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
if (bind(fd, sa, sl) < 0) {
/* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports,
- * this is fine. If this isn't the socket with the lowest number
- * for a given protocol, leave it open, to avoid unnecessary
- * holes in the numbering.
+ * this is fine.
*/
- if ((proto == IPPROTO_TCP && fd == c->tcp.fd_min) ||
- (proto == IPPROTO_UDP && fd == c->udp.fd_min) ||
- ((proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) &&
- fd == c->icmp.fd_min)) {
- close(fd);
- return 0;
- }
- return fd;
+ close(fd);
+ return 0;
}
if (proto == IPPROTO_TCP && listen(fd, 128) < 0) {
@@ -258,9 +265,8 @@ int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port)
return -1;
}
-epoll_add:
ev.events = EPOLLIN;
- ev.data.fd = fd;
+ ev.data.u64 = ref.u64;
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
perror("L4 epoll_ctl");
return -1;
@@ -286,3 +292,97 @@ int timespec_diff_ms(struct timespec *a, struct timespec *b)
return (a->tv_nsec - b->tv_nsec) / 1000000 +
(a->tv_sec - b->tv_sec) * 1000;
}
+
+/**
+ * bitmap_set() - Set single bit in bitmap
+ * @map: Pointer to bitmap
+ * @bit: Bit number to set
+ */
+void bitmap_set(uint8_t *map, int bit)
+{
+ map[bit / 8] |= 1 << (bit % 8);
+}
+
+/**
+ * bitmap_clear() - Clear single bit in bitmap
+ * @map: Pointer to bitmap
+ * @bit: Bit number to clear
+ */
+void bitmap_clear(uint8_t *map, int bit)
+{
+ map[bit / 8] &= ~(1 << (bit % 8));
+}
+
+/**
+ * bitmap_isset() - Check for set bit in bitmap
+ * @map: Pointer to bitmap
+ * @bit: Bit number to check
+ *
+ * Return: non-zero if given bit is set, zero if it's not
+ */
+int bitmap_isset(uint8_t *map, int bit)
+{
+ return map[bit / 8] & (1 << bit % 8);
+}
+
+/**
+ * procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs
+ * @name: Corresponding name of file under /proc/net/
+ * @map: Bitmap where numbers of ports in listening state will be set
+ */
+void procfs_scan_listen(char *name, uint8_t *map)
+{
+ char line[200], path[PATH_MAX];
+ unsigned long port;
+ unsigned int state;
+ FILE *fp;
+
+ snprintf(path, PATH_MAX, "/proc/net/%s", name);
+ if (!(fp = fopen(path, "r")))
+ return;
+
+ fgets(line, sizeof(line), fp);
+ while (fgets(line, sizeof(line), fp)) {
+ if (sscanf(line, "%*u: %*x:%lx %*x:%*x %x", &port, &state) != 2)
+ continue;
+
+ /* See enum in kernel's include/net/tcp_states.h */
+ if ((strstr(name, "tcp") && state != 0x0a) ||
+ (strstr(name, "udp") && state != 0x07))
+ continue;
+
+ bitmap_set(map, port);
+ }
+
+ fclose(fp);
+}
+
+/**
+ * ns_enter() - Enter user and network namespaces of process with given PID
+ * @target_pid: Process PID
+ *
+ * Return: 0 on success, -1 on failure
+ */
+int ns_enter(int target_pid)
+{
+ char ns[PATH_MAX];
+ int fd;
+
+ snprintf(ns, PATH_MAX, "/proc/%i/ns/user", target_pid);
+ if ((fd = open(ns, O_RDONLY)) < 0 || setns(fd, 0))
+ goto fail;
+ close(fd);
+
+ snprintf(ns, PATH_MAX, "/proc/%i/ns/net", target_pid);
+ if ((fd = open(ns, O_RDONLY)) < 0 || setns(fd, 0))
+ goto fail;
+ close(fd);
+
+ return 0;
+
+fail:
+ if (fd != -1)
+ close(fd);
+
+ return -1;
+}