about git code bugs lists chat
diff options
context:
space:
mode:
-rw-r--r-- README.md 5
-rw-r--r-- conf.c 45
-rw-r--r-- passt.1 15
-rw-r--r-- passt.c 126
-rw-r--r-- passt.h 7
-rw-r--r-- pasta.c 165
-rw-r--r-- pcap.c 5
-rw-r--r-- pcap.h 2
-rwxr-xr-x slirp4netns.sh 2
-rw-r--r-- tap.c 58
-rw-r--r-- tcp.c 13
-rw-r--r-- test/demo/passt 3
-rw-r--r-- test/demo/pasta 5
-rwxr-xr-x test/lib/setup 28
-rw-r--r-- udp.c 7
-rw-r--r-- util.c 129
-rw-r--r-- util.h 12
17 files changed, 365 insertions, 262 deletions
diff --git a/README.md b/README.md
index d16b705..1c8baf3 100644
--- a/README.md
+++ b/README.md
@@ -232,9 +232,10 @@ speeding up local connections, and usually requiring NAT. _pasta_:
`seccomp`](/passt/tree/seccomp.sh))
* ✅ root operation not allowed outside user namespaces
* ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted)
+* ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached
* ✅ no external dependencies (other than a standard C library)
-* ✅ restrictive seccomp profiles (50 syscalls allowed for _passt_, 62 for
- _pasta_)
+* ✅ restrictive seccomp profiles (22 syscalls allowed for _passt_, 34 for
+ _pasta_ on x86_64)
* ✅ static checkers in continuous integration (clang-tidy, cppcheck)
* 🛠️ rework of TCP state machine (flags instead of states), TCP timers, and code
de-duplication
diff --git a/conf.c b/conf.c
index abe63a1..732d918 100644
--- a/conf.c
+++ b/conf.c
@@ -10,8 +10,6 @@
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
- *
- * #syscalls stat|statx
*/
#include <arpa/inet.h>
@@ -46,31 +44,31 @@
*/
void get_bound_ports(struct ctx *c, int ns, uint8_t proto)
{
- uint8_t *udp_map, *udp_exclude, *tcp_map, *tcp_exclude;
+ uint8_t *udp_map, *udp_excl, *tcp_map, *tcp_excl;
if (ns) {
udp_map = c->udp.port_to_tap;
- udp_exclude = c->udp.port_to_init;
+ udp_excl = c->udp.port_to_init;
tcp_map = c->tcp.port_to_tap;
- tcp_exclude = c->tcp.port_to_init;
+ tcp_excl = c->tcp.port_to_init;
} else {
udp_map = c->udp.port_to_init;
- udp_exclude = c->udp.port_to_tap;
+ udp_excl = c->udp.port_to_tap;
tcp_map = c->tcp.port_to_init;
- tcp_exclude = c->tcp.port_to_tap;
+ tcp_excl = c->tcp.port_to_tap;
}
if (proto == IPPROTO_UDP) {
memset(udp_map, 0, USHRT_MAX / 8);
- procfs_scan_listen("udp", udp_map, udp_exclude);
- procfs_scan_listen("udp6", udp_map, udp_exclude);
+ procfs_scan_listen(c, IPPROTO_UDP, V4, ns, udp_map, udp_excl);
+ procfs_scan_listen(c, IPPROTO_UDP, V6, ns, udp_map, udp_excl);
- procfs_scan_listen("tcp", udp_map, udp_exclude);
- procfs_scan_listen("tcp6", udp_map, udp_exclude);
+ procfs_scan_listen(c, IPPROTO_TCP, V4, ns, udp_map, udp_excl);
+ procfs_scan_listen(c, IPPROTO_TCP, V6, ns, udp_map, udp_excl);
} else if (proto == IPPROTO_TCP) {
memset(tcp_map, 0, USHRT_MAX / 8);
- procfs_scan_listen("tcp", tcp_map, tcp_exclude);
- procfs_scan_listen("tcp6", tcp_map, tcp_exclude);
+ procfs_scan_listen(c, IPPROTO_TCP, V4, ns, tcp_map, tcp_excl);
+ procfs_scan_listen(c, IPPROTO_TCP, V6, ns, tcp_map, tcp_excl);
}
}
@@ -367,7 +365,7 @@ static int conf_ns_check(void *arg)
static int conf_ns_opt(struct ctx *c,
char *nsdir, char *conf_userns, const char *optarg)
{
- int ufd = 0, nfd = 0, try, ret, netns_only_reset = c->netns_only;
+ int ufd = -1, nfd = -1, try, ret, netns_only_reset = c->netns_only;
char userns[PATH_MAX] = { 0 }, netns[PATH_MAX];
char *endptr;
pid_t pid;
@@ -416,7 +414,7 @@ static int conf_ns_opt(struct ctx *c,
nfd = open(netns, O_RDONLY);
- if (nfd >= 0 && ufd >= 0) {
+ if (nfd >= 0 && (ufd >= 0 || c->netns_only)) {
c->pasta_netns_fd = nfd;
c->pasta_userns_fd = ufd;
@@ -425,10 +423,10 @@ static int conf_ns_opt(struct ctx *c,
return 0;
}
- if (nfd > 0)
+ if (nfd >= 0)
close(nfd);
- if (ufd > 0)
+ if (ufd >= 0)
close(ufd);
}
@@ -565,9 +563,9 @@ static void usage(const char *name)
info( " if FILE is not given, log to:");
if (strstr(name, "pasta") || strstr(name, "passt4netns"))
- info(" /tmp/pasta_ISO8601-TIMESTAMP_INSTANCE-NUMBER.pcap");
+ info(" /tmp/pasta_ISO8601-TIMESTAMP_PID.pcap");
else
- info(" /tmp/passt_ISO8601-TIMESTAMP_INSTANCE-NUMBER.pcap");
+ info(" /tmp/passt_ISO8601-TIMESTAMP_PID.pcap");
info( " -P, --pid FILE Write own PID to the given file");
info( " -m, --mtu MTU Assign MTU via DHCP/NDP");
@@ -664,7 +662,7 @@ pasta_opts:
info( " SPEC is as described above");
info( " default: auto");
info( " --userns NSPATH Target user namespace to join");
- info( " --netns-only Don't join or create user namespace");
+ info( " --netns-only Don't join existing user namespace");
info( " implied if PATH or NAME are given without --userns");
info( " --nsrun-dir Directory for nsfs mountpoints");
info( " default: " NETNS_RUN_DIR);
@@ -1170,7 +1168,7 @@ void conf(struct ctx *c, int argc, char **argv)
usage(argv[0]);
}
- if (c->mode == MODE_PASTA && c->pasta_netns_fd <= 0)
+ if (c->mode == MODE_PASTA && c->pasta_netns_fd == -1)
pasta_start_ns(c);
if (nl_sock_init(c)) {
@@ -1216,6 +1214,11 @@ void conf(struct ctx *c, int argc, char **argv)
c->tcp.init_detect_ports = c->udp.init_detect_ports = 0;
if (c->mode == MODE_PASTA) {
+ c->proc_net_tcp[V4][0] = c->proc_net_tcp[V4][1] = -1;
+ c->proc_net_tcp[V6][0] = c->proc_net_tcp[V6][1] = -1;
+ c->proc_net_udp[V4][0] = c->proc_net_udp[V4][1] = -1;
+ c->proc_net_udp[V6][0] = c->proc_net_udp[V6][1] = -1;
+
if (!tcp_tap || tcp_tap == PORT_AUTO) {
c->tcp.ns_detect_ports = 1;
ns_ports_arg.proto = IPPROTO_TCP;
diff --git a/passt.1 b/passt.1
index b0d7d87..92681f6 100644
--- a/passt.1
+++ b/passt.1
@@ -80,7 +80,8 @@ Don't print informational messages.
.TP
.BR \-f ", " \-\-foreground
-Don't run in background.
+Don't run in background. This implies that the process is not moved to a
+detached PID namespace after starting, because the PID itself cannot change.
Default is to fork into background.
.TP
@@ -100,14 +101,13 @@ Capture tap-facing (that is, guest-side or namespace-side) network packets to
If \fIfile\fR is not given, capture packets to
- \fB/tmp/passt_\fIISO8601-timestamp\fR_\fIinstance-number\fB.pcap\fR
+ \fB/tmp/passt_\fIISO8601-timestamp\fR_\fIPID\fB.pcap\fR
in \fBpasst\fR mode and to
- \fB/tmp/pasta_\fIISO8601-timestamp\fR_\fIinstance-number\fB.pcap\fR
+ \fB/tmp/pasta_\fIISO8601-timestamp\fR_\fIPID\fB.pcap\fR
-in \fBpasta\fR mode, where \fIinstance-number\fR is a progressive count of
-other detected instances running on the same host.
+in \fBpasta\fR mode, where \fIPID\fR is the ID of the running process.
.TP
.BR \-P ", " \-\-pid " " \fIfile
@@ -379,8 +379,9 @@ This option requires PID, PATH or NAME to be specified.
.TP
.BR \-\-netns-only
-Join or create only the network namespace, not a user namespace. This is implied
-if PATH or NAME are given without \-\-userns.
+Join only a target network namespace, not a user namespace, and don't create one
+for sandboxing purposes either. This is implied if PATH or NAME are given
+without \-\-userns.
.TP
.BR \-\-nsrun-dir " " \fIpath
diff --git a/passt.c b/passt.c
index a8bb88e..508d525 100644
--- a/passt.c
+++ b/passt.c
@@ -30,7 +30,9 @@
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/uio.h>
+#include <sys/syscall.h>
#include <sys/wait.h>
+#include <sys/mount.h>
#include <netinet/ip.h>
#include <net/ethernet.h>
#include <stdlib.h>
@@ -53,7 +55,6 @@
#include <linux/seccomp.h>
#include <linux/audit.h>
#include <linux/filter.h>
-#include <linux/capability.h>
#include <linux/icmpv6.h>
#include "util.h"
@@ -228,42 +229,61 @@ static void check_root(void)
}
/**
- * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE
+ * sandbox() - Unshare IPC, mount, PID, UTS, and user namespaces, "unmount" root
+ *
+ * Return: negative error code on failure, zero on success
*/
-static void drop_caps(void)
+static int sandbox(struct ctx *c)
{
- int i;
+ int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS;
- for (i = 0; i < 64; i++) {
- if (i == CAP_NET_BIND_SERVICE)
- continue;
+ errno = 0;
- prctl(PR_CAPBSET_DROP, i, 0, 0, 0);
+ if (!c->netns_only) {
+ if (c->pasta_userns_fd == -1)
+ flags |= CLONE_NEWUSER;
+ else
+ setns(c->pasta_userns_fd, CLONE_NEWUSER);
}
-}
-/**
- * pid_file() - Write own PID to file, if configured
- * @c: Execution context
- */
-static void pid_file(struct ctx *c) {
- char pid_buf[12];
- int pid_fd, n;
+ c->pasta_userns_fd = -1;
- if (!*c->pid_file)
- return;
+ /* If we run in foreground, we have no chance to actually move to a new
+ * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
+ * ever gets around seccomp profiles -- there's no harm in passing it.
+ */
+ if (!c->foreground || c->mode == MODE_PASST)
+ flags |= CLONE_NEWPID;
- pid_fd = open(c->pid_file, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
- if (pid_fd < 0)
- return;
+ unshare(flags);
- n = snprintf(pid_buf, sizeof(pid_buf), "%i\n", getpid());
+ mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL);
+ mount("", TMPDIR, "tmpfs", MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY,
+ "nr_inodes=2,nr_blocks=0");
+ chdir(TMPDIR);
+ syscall(SYS_pivot_root, ".", ".");
+ umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW);
- if (write(pid_fd, pid_buf, n) < 0) {
- perror("PID file write");
- exit(EXIT_FAILURE);
- }
- close(pid_fd);
+ if (errno)
+ return -errno;
+
+ drop_caps(); /* Relative to the new user namespace this time. */
+
+ return 0;
+}
+
+/**
+ * exit_handler() - Signal handler for SIGQUIT and SIGTERM
+ * @signal: Unused, handler deals with SIGQUIT and SIGTERM only
+ *
+ * TODO: After unsharing the PID namespace and forking, SIG_DFL for SIGTERM and
+ * SIGQUIT unexpectedly doesn't cause the process to terminate, figure out why.
+ */
+void exit_handler(int signal)
+{
+ (void)signal;
+
+ exit(EXIT_SUCCESS);
}
/**
@@ -273,36 +293,36 @@ static void pid_file(struct ctx *c) {
*
* Return: non-zero on failure
*
- * #syscalls read write open|openat close fork|clone dup2|dup3 ioctl writev
- * #syscalls socket bind connect getsockopt setsockopt recvfrom sendto shutdown
- * #syscalls accept4 accept listen set_robust_list getrlimit setrlimit
- * #syscalls openat fcntl lseek clone setsid exit exit_group getpid chdir
- * #syscalls epoll_ctl epoll_create1 epoll_wait|epoll_pwait epoll_pwait
- * #syscalls prlimit64 clock_gettime fstat|newfstat newfstatat syslog
- * #syscalls ppc64le:_llseek ppc64le:recv ppc64le:send ppc64le:getuid
- * #syscalls ppc64:_llseek ppc64:recv ppc64:send ppc64:getuid ppc64:ugetrlimit
- * #syscalls s390x:socketcall s390x:sigreturn
- * #syscalls:pasta rt_sigreturn|sigreturn ppc64:sigreturn ppc64:fcntl64
+ * #syscalls read write writev
+ * #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close
+ * #syscalls recvfrom sendto shutdown ppc64le:recv ppc64le:send
+ * #syscalls accept4|accept listen
+ * #syscalls epoll_ctl epoll_wait|epoll_pwait epoll_pwait clock_gettime
*/
int main(int argc, char **argv)
{
+ int nfds, i, devnull_fd = -1, pidfile_fd = -1;
struct epoll_event events[EPOLL_EVENTS];
struct ctx c = { 0 };
struct rlimit limit;
struct timespec now;
+ struct sigaction sa;
char *log_name;
- int nfds, i;
#ifndef PASST_LEGACY_NO_OPTIONS
check_root();
#endif
drop_caps();
- if (strstr(argv[0], "pasta") || strstr(argv[0], "passt4netns")) {
- struct sigaction sa;
+ c.pasta_userns_fd = c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1;
+
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ sa.sa_handler = exit_handler;
+ sigaction(SIGTERM, &sa, NULL);
+ sigaction(SIGQUIT, &sa, NULL);
- sigemptyset(&sa.sa_mask);
- sa.sa_flags = 0;
+ if (strstr(argv[0], "pasta") || strstr(argv[0], "passt4netns")) {
sa.sa_handler = pasta_child_handler;
sigaction(SIGCHLD, &sa, NULL);
signal(SIGPIPE, SIG_IGN);
@@ -323,8 +343,6 @@ int main(int argc, char **argv)
conf(&c, argc, argv);
- seccomp(&c);
-
if (!c.debug && (c.stderr || isatty(fileno(stdout))))
__openlog(log_name, LOG_PERROR, LOG_DAEMON);
@@ -369,12 +387,26 @@ int main(int argc, char **argv)
else
__setlogmask(LOG_UPTO(LOG_INFO));
- if (!c.foreground && daemon(0, 0)) {
- perror("daemon");
+ pcap_init(&c);
+
+ if (!c.foreground)
+ devnull_fd = open("/dev/null", O_RDWR);
+
+ if (*c.pid_file)
+ pidfile_fd = open(c.pid_file,
+ O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
+
+ if (sandbox(&c)) {
+ err("Failed to sandbox process, exiting\n");
exit(EXIT_FAILURE);
}
- pid_file(&c);
+ if (!c.foreground)
+ __daemon(pidfile_fd, devnull_fd);
+ else
+ write_pidfile(pidfile_fd, getpid());
+
+ seccomp(&c);
timer_init(&c, &now);
loop:
diff --git a/passt.h b/passt.h
index 0ef1897..d7011da 100644
--- a/passt.h
+++ b/passt.h
@@ -99,8 +99,10 @@ enum passt_modes {
* @pcap: Path for packet capture file
* @pid_file: Path to PID file, empty string if not configured
* @pasta_netns_fd: File descriptor for network namespace in pasta mode
- * @pasta_userns_fd: File descriptor for user namespace in pasta mode
+ * @pasta_userns_fd: Descriptor for user namespace to join, -1 once joined
* @netns_only: In pasta mode, don't join or create a user namespace
+ * @proc_net_tcp: Stored handles for /proc/net/tcp{,6} in init and ns
+ * @proc_net_udp: Stored handles for /proc/net/udp{,6} in init and ns
* @epollfd: File descriptor for epoll instance
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
* @fd_tap: File descriptor for AF_UNIX socket or tuntap device
@@ -155,6 +157,9 @@ struct ctx {
int pasta_userns_fd;
int netns_only;
+ int proc_net_tcp[IP_VERSIONS][2];
+ int proc_net_udp[IP_VERSIONS][2];
+
int epollfd;
int fd_tap_listen;
int fd_tap;
diff --git a/pasta.c b/pasta.c
index bce30d4..972cbcf 100644
--- a/pasta.c
+++ b/pasta.c
@@ -11,9 +11,8 @@
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
- * #syscalls:pasta clone unshare waitid kill execve exit_group rt_sigprocmask
- * #syscalls:pasta geteuid getdents64|getdents readlink|readlinkat setsid
- * #syscalls:pasta nanosleep clock_nanosleep
+ * #syscalls:pasta clone waitid exit exit_group rt_sigprocmask
+ * #syscalls:pasta rt_sigreturn|sigreturn ppc64:sigreturn s390x:sigreturn
*/
#include <sched.h>
@@ -40,75 +39,8 @@
#include "passt.h"
#include "netlink.h"
-/* PID of child, in case we created a namespace, and its procfs link */
+/* PID of child, in case we created a namespace */
static int pasta_child_pid;
-static char pasta_child_ns[PATH_MAX];
-
-/**
- * pasta_ns_cleanup() - Look for processes in namespace, terminate them
- */
-static void pasta_ns_cleanup(void)
-{
- char proc_path[PATH_MAX], ns_link[PATH_MAX], buf[BUFSIZ];
- int recheck = 0, found = 0, waited = 0;
- int dir_fd, n;
-
- if (!*pasta_child_ns)
- return;
-
-loop:
- if ((dir_fd = open("/proc", O_RDONLY | O_DIRECTORY)) < 0)
- return;
-
- while ((n = syscall(SYS_getdents64, dir_fd, buf, BUFSIZ)) > 0) {
- struct dirent *dp = (struct dirent *)buf;
- int pos = 0;
-
- while (dp->d_reclen && pos < n) {
- pid_t pid;
-
- errno = 0;
- pid = strtol(dp->d_name, NULL, 0);
- if (!pid || errno)
- goto next;
-
- snprintf(proc_path, PATH_MAX, "/proc/%i/ns/net", pid);
- if (readlink(proc_path, ns_link, PATH_MAX) < 0)
- goto next;
-
- if (!strncmp(ns_link, pasta_child_ns, PATH_MAX)) {
- found = 1;
- if (waited)
- kill(pid, SIGKILL);
- else
- kill(pid, SIGQUIT);
- }
-next:
- dp = (struct dirent *)(buf + (pos += dp->d_reclen));
- }
- }
-
- close(dir_fd);
-
- if (!found)
- return;
-
- if (waited) {
- if (recheck) {
- info("Some processes in namespace didn't quit");
- } else {
- found = 0;
- recheck = 1;
- goto loop;
- }
- return;
- }
-
- info("Waiting for all processes in namespace to terminate");
- sleep(1);
- waited = 1;
- goto loop;
-}
/**
* pasta_child_handler() - Exit once shell exits (if we started it), reap clones
@@ -120,12 +52,14 @@ void pasta_child_handler(int signal)
(void)signal;
+ if (signal != SIGCHLD)
+ return;
+
if (pasta_child_pid &&
!waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) {
- if (infop.si_pid == pasta_child_pid) {
- pasta_ns_cleanup();
+ if (infop.si_pid == pasta_child_pid)
exit(EXIT_SUCCESS);
- }
+ /* Nothing to do, detached PID namespace going away */
}
waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);
@@ -163,45 +97,31 @@ netns:
}
/**
- * pasta_start_ns() - Fork shell in new namespace if target ns is not given
+ * struct pasta_setup_ns_arg - Argument for pasta_setup_ns()
* @c: Execution context
+ * @euid: Effective UID of caller
*/
-void pasta_start_ns(struct ctx *c)
+struct pasta_setup_ns_arg {
+ struct ctx *c;
+ int euid;
+};
+
+/**
+ * pasta_setup_ns() - Map credentials, enable access to ping sockets, run shell
+ * @arg: See @pasta_setup_ns_arg
+ *
+ * Return: this function never returns
+ */
+static int pasta_setup_ns(void *arg)
{
- int euid = geteuid(), fd;
+ struct pasta_setup_ns_arg *a = (struct pasta_setup_ns_arg *)arg;
char *shell;
+ int fd;
- c->foreground = 1;
- if (!c->debug)
- c->quiet = 1;
-
- if ((pasta_child_pid = fork()) == -1) {
- perror("fork");
- exit(EXIT_FAILURE);
- }
-
- if (pasta_child_pid) {
- char proc_path[PATH_MAX];
-
- NS_CALL(pasta_wait_for_ns, c);
-
- snprintf(proc_path, PATH_MAX, "/proc/%i/ns/net",
- pasta_child_pid);
- if (readlink(proc_path, pasta_child_ns, PATH_MAX) < 0)
- warn("Cannot read link to ns, won't clean up on exit");
-
- return;
- }
-
- if (unshare(CLONE_NEWNET | (c->netns_only ? 0 : CLONE_NEWUSER))) {
- perror("unshare");
- exit(EXIT_FAILURE);
- }
-
- if (!c->netns_only) {
+ if (!a->c->netns_only) {
char buf[BUFSIZ];
- snprintf(buf, BUFSIZ, "%i %i %i", 0, euid, 1);
+ snprintf(buf, BUFSIZ, "%i %i %i", 0, a->euid, 1);
fd = open("/proc/self/uid_map", O_WRONLY);
if (write(fd, buf, strlen(buf)) < 0)
@@ -235,6 +155,39 @@ void pasta_start_ns(struct ctx *c)
}
/**
+ * pasta_start_ns() - Fork shell in new namespace if target ns is not given
+ * @c: Execution context
+ */
+void pasta_start_ns(struct ctx *c)
+{
+ struct pasta_setup_ns_arg arg = { .c = c, .euid = geteuid() };
+ char ns_fn_stack[NS_FN_STACK_SIZE];
+
+ c->foreground = 1;
+ if (!c->debug)
+ c->quiet = 1;
+
+ pasta_child_pid = clone(pasta_setup_ns,
+ ns_fn_stack + sizeof(ns_fn_stack) / 2,
+ (c->netns_only ? 0 : CLONE_NEWNET) |
+ CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWUSER |
+ CLONE_NEWUTS,
+ (void *)&arg);
+
+ if (pasta_child_pid == -1) {
+ perror("clone");
+ exit(EXIT_FAILURE);
+ }
+
+ drop_caps();
+
+ if (pasta_child_pid) {
+ NS_CALL(pasta_wait_for_ns, c);
+ return;
+ }
+}
+
+/**
* pasta_ns_conf() - Set up loopback and tap interfaces in namespace as needed
* @c: Execution context
*/
diff --git a/pcap.c b/pcap.c
index e00fc45..9c617ce 100644
--- a/pcap.c
+++ b/pcap.c
@@ -167,9 +167,8 @@ fail:
/**
* pcap_init() - Initialise pcap file
* @c: Execution context
- * @index: pcap name index: passt instance number or pasta netns socket
*/
-void pcap_init(struct ctx *c, int index)
+void pcap_init(struct ctx *c)
{
struct timeval tv;
@@ -196,7 +195,7 @@ void pcap_init(struct ctx *c, int index)
snprintf(name + strlen(PCAP_PREFIX) + strlen(PCAP_ISO8601_STR),
sizeof(name) - strlen(PCAP_PREFIX) -
strlen(PCAP_ISO8601_STR),
- "_%i.pcap", index);
+ "_%i.pcap", getpid());
strncpy(c->pcap, name, PATH_MAX);
}
diff --git a/pcap.h b/pcap.h
index 26f4f35..73b5ed8 100644
--- a/pcap.h
+++ b/pcap.h
@@ -6,4 +6,4 @@
void pcap(char *pkt, size_t len);
void pcapm(struct msghdr *mh);
void pcapmm(struct mmsghdr *mmh, unsigned int vlen);
-void pcap_init(struct ctx *c, int sock_index);
+void pcap_init(struct ctx *c);
diff --git a/slirp4netns.sh b/slirp4netns.sh
index 518f581..7c2188d 100755
--- a/slirp4netns.sh
+++ b/slirp4netns.sh
@@ -10,7 +10,7 @@
#
# slirp4netns.sh - Compatibility wrapper for pasta, behaving like slirp4netns(1)
#
-# WARNING: Draft quality, not really tested, --enable-sandbox not supported yet
+# WARNING: Draft quality, not really tested
#
# Copyright (c) 2021 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
diff --git a/tap.c b/tap.c
index 22db9c5..38004a5 100644
--- a/tap.c
+++ b/tap.c
@@ -11,7 +11,6 @@
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
- * #syscalls recvfrom sendto
*/
#include <sched.h>
@@ -769,12 +768,10 @@ restart:
}
/**
- * tap_sock_init_unix() - Create and bind AF_UNIX socket, listen for connection
+ * tap_sock_unix_init() - Create and bind AF_UNIX socket, listen for connection
* @c: Execution context
- *
- * #syscalls:passt unlink|unlinkat
*/
-static void tap_sock_init_unix(struct ctx *c)
+static void tap_sock_unix_init(struct ctx *c)
{
int fd = socket(AF_UNIX, SOCK_STREAM, 0), ex;
struct epoll_event ev = { 0 };
@@ -783,11 +780,6 @@ static void tap_sock_init_unix(struct ctx *c)
};
int i, ret;
- if (c->fd_tap_listen != -1) {
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap_listen, &ev);
- close(c->fd_tap_listen);
- }
-
if (fd < 0) {
perror("UNIX socket");
exit(EXIT_FAILURE);
@@ -834,8 +826,6 @@ static void tap_sock_init_unix(struct ctx *c)
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
#endif
- pcap_init(c, i);
-
listen(fd, 0);
ev.data.fd = c->fd_tap_listen = fd;
@@ -852,19 +842,26 @@ static void tap_sock_init_unix(struct ctx *c)
}
/**
- * tap_sock_accept_unix() - Accept connection on listening socket
+ * tap_sock_unix_new() - Handle new connection on listening socket
* @c: Execution context
*/
-static void tap_sock_accept_unix(struct ctx *c)
+static void tap_sock_unix_new(struct ctx *c)
{
struct epoll_event ev = { 0 };
int v = INT_MAX / 2;
- c->fd_tap = accept(c->fd_tap_listen, NULL, NULL);
+ /* Another client is already connected: accept and close right away. */
+ if (c->fd_tap != -1) {
+ int discard = accept4(c->fd_tap_listen, NULL, NULL,
+ SOCK_NONBLOCK);
+
+ if (discard != -1)
+ close(discard);
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap_listen, &ev);
- close(c->fd_tap_listen);
- c->fd_tap_listen = -1;
+ return;
+ }
+
+ c->fd_tap = accept4(c->fd_tap_listen, NULL, NULL, 0);
if (!c->low_rmem)
setsockopt(c->fd_tap, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v));
@@ -884,8 +881,6 @@ static int tun_ns_fd = -1;
* @c: Execution context
*
* Return: 0
- *
- * #syscalls:pasta ioctl
*/
static int tap_ns_tun(void *arg)
{
@@ -907,7 +902,7 @@ static int tap_ns_tun(void *arg)
* tap_sock_init_tun() - Set up tuntap file descriptor
* @c: Execution context
*/
-static void tap_sock_init_tun(struct ctx *c)
+static void tap_sock_tun_init(struct ctx *c)
{
struct epoll_event ev = { 0 };
@@ -919,8 +914,6 @@ static void tap_sock_init_tun(struct ctx *c)
pasta_ns_conf(c);
- pcap_init(c, c->pasta_netns_fd);
-
c->fd_tap = tun_ns_fd;
ev.data.fd = c->fd_tap;
@@ -937,12 +930,15 @@ void tap_sock_init(struct ctx *c)
if (c->fd_tap != -1) {
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
close(c->fd_tap);
+ c->fd_tap = -1;
}
- if (c->mode == MODE_PASST)
- tap_sock_init_unix(c);
- else
- tap_sock_init_tun(c);
+ if (c->mode == MODE_PASST) {
+ if (c->fd_tap_listen == -1)
+ tap_sock_unix_init(c);
+ } else {
+ tap_sock_tun_init(c);
+ }
}
/**
@@ -955,18 +951,18 @@ void tap_sock_init(struct ctx *c)
void tap_handler(struct ctx *c, int fd, uint32_t events, struct timespec *now)
{
if (fd == c->fd_tap_listen && events == EPOLLIN) {
- tap_sock_accept_unix(c);
+ tap_sock_unix_new(c);
return;
}
if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR))
- goto fail;
+ goto reinit;
if ((c->mode == MODE_PASST && tap_handler_passt(c, now)) ||
(c->mode == MODE_PASTA && tap_handler_pasta(c, now)))
- goto fail;
+ goto reinit;
return;
-fail:
+reinit:
tap_sock_init(c);
}
diff --git a/tcp.c b/tcp.c
index 723b18e..e4fac22 100644
--- a/tcp.c
+++ b/tcp.c
@@ -304,7 +304,7 @@
* - SPLICE_FIN_TO: FIN (EPOLLRDHUP) seen from connected socket
* - SPLICE_FIN_BOTH: FIN (EPOLLRDHUP) seen from both sides
*
- * #syscalls pipe|pipe2 pipe2
+ * #syscalls:pasta pipe2|pipe fcntl ppc64:fcntl64
*/
#include <sched.h>
@@ -3028,7 +3028,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
* @ref: epoll reference
* @events: epoll events bitmap
*
- * #syscalls splice
+ * #syscalls:pasta splice
*/
void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
uint32_t events)
@@ -3374,7 +3374,7 @@ static void tcp_set_pipe_size(struct ctx *c)
smaller:
for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE * 2; i++) {
- if (pipe(probe_pipe[i])) {
+ if (pipe2(probe_pipe[i], 0)) {
i++;
break;
}
@@ -3493,7 +3493,7 @@ static void tcp_sock_init_one(struct ctx *c, int ns, in_port_t port)
* tcp_sock_init_ns() - Bind sockets in namespace for inbound connections
* @arg: Execution context
*
- * Return: 0 on success, -1 on failure
+ * Return: 0
*/
static int tcp_sock_init_ns(void *arg)
{
@@ -3560,8 +3560,7 @@ static int tcp_sock_refill(void *arg)
int i, *p4, *p6;
if (a->ns) {
- if (ns_enter(a->c))
- return 0;
+ ns_enter(a->c);
p4 = ns_sock_pool4;
p6 = ns_sock_pool6;
} else {
@@ -3594,8 +3593,6 @@ static int tcp_sock_refill(void *arg)
* @c: Execution context
*
* Return: 0 on success, -1 on failure
- *
- * #syscalls getrandom
*/
int tcp_sock_init(struct ctx *c, struct timespec *now)
{
diff --git a/test/demo/passt b/test/demo/passt
index b5762aa..76aac86 100644
--- a/test/demo/passt
+++ b/test/demo/passt
@@ -84,7 +84,8 @@ say Now let's run 'passt' in the new namespace, and
nl
say enter this namespace from the guest terminal too.
sleep 3
-pout TARGET_PID echo $$
+guest pstree -p | grep pasta
+gout TARGET_PID pstree -p | grep pasta | sed -n 's/.*(\([0-9].*\))$/\1/p'
sleep 1
passtb ./passt -f -t 5201,5203
diff --git a/test/demo/pasta b/test/demo/pasta
index f8f0cd0..c136965 100644
--- a/test/demo/pasta
+++ b/test/demo/pasta
@@ -58,7 +58,8 @@ say For convenience, let's enter this namespace
nl
say from another terminal.
sleep 3
-pout TARGET_PID echo $$
+ns pstree -p | grep pasta
+nsout TARGET_PID pstree -p | grep pasta | sed -n 's/.*(\([0-9].*\))$/\1/p'
sleep 1
ns nsenter -t __TARGET_PID__ -U -n --preserve-credentials
@@ -172,7 +173,7 @@ sleep 2
passtb perf record -g ./pasta
sleep 2
-pout TARGET_PID echo $$
+nsout TARGET_PID pstree -p | grep pasta | sed -n 's/.*(\([0-9].*\))$/\1/p'
sleep 1
ns nsenter -t __TARGET_PID__ -U -n --preserve-credentials
sleep 5
diff --git a/test/lib/setup b/test/lib/setup
index ab51787..df21655 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -115,13 +115,14 @@ setup_passt_in_ns() {
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/pasta_with_passt.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
- pane_run PASST "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013"
+ __pid_file="$(mktemp)"
+ pane_run PASST "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${__pid_file}"
sleep 1
pane_run PASST ''
pane_wait PASST
- pane_run PASST 'echo $$'
- pane_wait PASST
- __ns_pid="$(pane_parse PASST)"
+ __pasta_pid="$(cat "${__pid_file}")"
+ __ns_pid="$(cat /proc/${__pasta_pid}/task/${__pasta_pid}/children | cut -f1 -d' ')"
+ rm "${__pid_file}"
pane_run GUEST "nsenter -t ${__ns_pid} -U -n --preserve-credentials"
pane_run NS "nsenter -t ${__ns_pid} -U -n --preserve-credentials"
@@ -172,15 +173,18 @@ setup_two_guests() {
# 10004 | as server | to init | to guest | to ns #2
# 10005 | | | as server | to ns #2
+ __pid1_file="$(mktemp)"
+ __pid2_file="$(mktemp)"
+
__opts=
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/pasta_1.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
- pane_run PASST_1 "./pasta ${__opts} -t 10001,10002 -T 10003,10004 -u 10001,10002 -U 10003,10004"
+ pane_run PASST_1 "./pasta ${__opts} -P ${__pid1_file} -t 10001,10002 -T 10003,10004 -u 10001,10002 -U 10003,10004"
__opts=
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/pasta_2.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
- pane_run PASST_2 "./pasta ${__opts} -t 10004,10005 -T 10003,10001 -u 10004,10005 -U 10003,10001"
+ pane_run PASST_2 "./pasta ${__opts} -P ${__pid2_file} -t 10004,10005 -T 10003,10001 -u 10004,10005 -U 10003,10001"
sleep 1
pane_run PASST_1 ''
@@ -188,12 +192,12 @@ setup_two_guests() {
pane_wait PASST_1
pane_wait PASST_2
- pane_run PASST_1 'echo $$'
- pane_run PASST_2 'echo $$'
- pane_wait PASST_1
- pane_wait PASST_2
- __ns1_pid="$(pane_parse PASST_1)"
- __ns2_pid="$(pane_parse PASST_2)"
+ __pasta1_pid="$(cat "${__pid1_file}")"
+ __ns1_pid="$(cat /proc/${__pasta1_pid}/task/${__pasta1_pid}/children | cut -f1 -d' ')"
+ rm "${__pid1_file}"
+ __pasta2_pid="$(cat "${__pid2_file}")"
+ __ns2_pid="$(cat /proc/${__pasta2_pid}/task/${__pasta2_pid}/children | cut -f1 -d' ')"
+ rm "${__pid2_file}"
pane_run GUEST_1 "nsenter -t ${__ns1_pid} -U -n --preserve-credentials"
pane_run GUEST_2 "nsenter -t ${__ns2_pid} -U -n --preserve-credentials"
diff --git a/udp.c b/udp.c
index e1a9ecb..348f695 100644
--- a/udp.c
+++ b/udp.c
@@ -529,7 +529,9 @@ static int udp_splice_connect_ns(void *arg)
a = (struct udp_splice_connect_ns_arg *)arg;
- ns_enter(a->c);
+ if (ns_enter(a->c))
+ return 0;
+
a->s = udp_splice_connect(a->c, a->v6, a->bound_sock, a->src, a->dst,
UDP_BACK_TO_INIT);
@@ -1029,7 +1031,8 @@ int udp_sock_init_ns(void *arg)
struct ctx *c = (struct ctx *)arg;
int dst;
- ns_enter(c);
+ if (ns_enter(c))
+ return 0;
for (dst = 0; dst < USHRT_MAX; dst++) {
if (!bitmap_isset(c->udp.port_to_init, dst))
diff --git a/util.c b/util.c
index 94d49a6..e9fca3b 100644
--- a/util.c
+++ b/util.c
@@ -16,6 +16,7 @@
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
+#include <stdlib.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <net/ethernet.h>
@@ -23,6 +24,7 @@
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/epoll.h>
+#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -32,6 +34,8 @@
#include <time.h>
#include <errno.h>
+#include <linux/capability.h>
+
#include "util.h"
#include "passt.h"
@@ -431,31 +435,51 @@ char *line_read(char *buf, size_t len, int fd)
/**
* procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs
- * @name: Corresponding name of file under /proc/net/
+ * @proto: IPPROTO_TCP or IPPROTO_UDP
+ * @ip_version: IP version, V4 or V6
+ * @ns: Use saved file descriptors for namespace if set
* @map: Bitmap where numbers of ports in listening state will be set
* @exclude: Bitmap of ports to exclude from setting (and clear)
+ *
+ * #syscalls:pasta lseek ppc64le:_llseek ppc64:_llseek
*/
-void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude)
+void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns,
+ uint8_t *map, uint8_t *exclude)
{
- char line[BUFSIZ], path[PATH_MAX];
+ char line[BUFSIZ], *path;
unsigned long port;
unsigned int state;
- int fd;
+ int *fd;
- snprintf(path, PATH_MAX, "/proc/net/%s", name);
- if ((fd = open(path, O_RDONLY)) < 0)
+ if (proto == IPPROTO_TCP) {
+ fd = &c->proc_net_tcp[ip_version][ns];
+ if (ip_version == V4)
+ path = "/proc/net/tcp";
+ else
+ path = "/proc/net/tcp6";
+ } else {
+ fd = &c->proc_net_udp[ip_version][ns];
+ if (ip_version == V4)
+ path = "/proc/net/udp";
+ else
+ path = "/proc/net/udp6";
+ }
+
+ if (*fd != -1)
+ lseek(*fd, 0, SEEK_SET);
+ else if ((*fd = open(path, O_RDONLY)) < 0)
return;
*line = 0;
- line_read(line, sizeof(line), fd);
- while (line_read(line, sizeof(line), fd)) {
+ line_read(line, sizeof(line), *fd);
+ while (line_read(line, sizeof(line), *fd)) {
/* NOLINTNEXTLINE(cert-err34-c): != 2 if conversion fails */
if (sscanf(line, "%*u: %*x:%lx %*x:%*x %x", &port, &state) != 2)
continue;
/* See enum in kernel's include/net/tcp_states.h */
- if ((strstr(name, "tcp") && state != 0x0a) ||
- (strstr(name, "udp") && state != 0x07))
+ if ((proto == IPPROTO_TCP && state != 0x0a) ||
+ (proto == IPPROTO_UDP && state != 0x07))
continue;
if (bitmap_isset(exclude, port))
@@ -463,25 +487,98 @@ void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude)
else
bitmap_set(map, port);
}
+}
- close(fd);
+/**
+ * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE
+ */
+void drop_caps(void)
+{
+ int i;
+
+ for (i = 0; i < 64; i++) {
+ if (i == CAP_NET_BIND_SERVICE)
+ continue;
+
+ prctl(PR_CAPBSET_DROP, i, 0, 0, 0);
+ }
}
/**
- * ns_enter() - Enter configured network and user namespaces
+ * ns_enter() - Enter configured user (unless already joined) and network ns
* @c: Execution context
*
- * Return: 0 on success, -1 on failure
+ * Return: 0, won't return on failure
*
* #syscalls:pasta setns
*/
int ns_enter(struct ctx *c)
{
- if (!c->netns_only && setns(c->pasta_userns_fd, CLONE_NEWUSER))
- return -errno;
+ if (!c->netns_only &&
+ c->pasta_userns_fd != -1 &&
+ setns(c->pasta_userns_fd, CLONE_NEWUSER))
+ exit(EXIT_FAILURE);
if (setns(c->pasta_netns_fd, CLONE_NEWNET))
- return -errno;
+ exit(EXIT_FAILURE);
+
+ return 0;
+}
+
+/**
+ * write_pidfile() - Write PID to file, if requested to do so, and close it
+ * @fd: Open PID file descriptor, closed on exit, -1 to skip writing it
+ * @pid: PID value to write
+ */
+void write_pidfile(int fd, pid_t pid) {
+ char pid_buf[12];
+ int n;
+
+ if (fd == -1)
+ return;
+
+ n = snprintf(pid_buf, sizeof(pid_buf), "%i\n", pid);
+
+ if (write(fd, pid_buf, n) < 0) {
+ perror("PID file write");
+ exit(EXIT_FAILURE);
+ }
+
+ close(fd);
+}
+
+/**
+ * __daemon() - daemon()-like function writing PID file before parent exits
+ * @pidfile_fd: Open PID file descriptor
+ * @devnull_fd: Open file descriptor for /dev/null
+ *
+ * Return: 0 in child on success, parent exits, won't return on failure
+ */
+int __daemon(int pidfile_fd, int devnull_fd)
+{
+ pid_t pid = fork();
+
+ if (pid == -1) {
+ perror("fork");
+ exit(EXIT_FAILURE);
+ }
+
+ if (pid) {
+ write_pidfile(pidfile_fd, pid);
+ exit(EXIT_SUCCESS);
+ }
+
+ errno = 0;
+
+ setsid();
+
+ dup2(devnull_fd, STDIN_FILENO);
+ dup2(devnull_fd, STDOUT_FILENO);
+ dup2(devnull_fd, STDERR_FILENO);
+ close(devnull_fd);
+
+ if (errno)
+ exit(EXIT_FAILURE);
return 0;
}
diff --git a/util.h b/util.h
index add4c1e..b7852e9 100644
--- a/util.h
+++ b/util.h
@@ -54,6 +54,12 @@ void debug(const char *format, ...);
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
+#ifdef P_tmpdir
+#define TMPDIR P_tmpdir
+#else
+#define TMPDIR "/tmp"
+#endif
+
#define V4 0
#define V6 1
#define IP_VERSIONS 2
@@ -202,5 +208,9 @@ void bitmap_set(uint8_t *map, int bit);
void bitmap_clear(uint8_t *map, int bit);
int bitmap_isset(const uint8_t *map, int bit);
char *line_read(char *buf, size_t len, int fd);
-void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude);
+void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns,
+ uint8_t *map, uint8_t *exclude);
+void drop_caps(void);
int ns_enter(struct ctx *c);
+void write_pidfile(int fd, pid_t pid);
+int __daemon(int pidfile_fd, int devnull_fd);