// SPDX-License-Identifier: GPL-2.0-or-later /* PASST - Plug A Simple Socket Transport * for qemu/UNIX domain socket mode * * PASTA - Pack A Subtle Tap Abstraction * for network namespace/tap device mode * * pasta.c - pasta (namespace) specific implementations * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio * * #syscalls:pasta clone waitid exit exit_group rt_sigprocmask * #syscalls:pasta rt_sigreturn|sigreturn * #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" #include "passt.h" #include "isolation.h" #include "netlink.h" #include "log.h" /* PID of child, in case we created a namespace */ int pasta_child_pid; /** * pasta_child_handler() - Exit once shell exits (if we started it), reap clones * @signal: Unused, handler deals with SIGCHLD only */ void pasta_child_handler(int signal) { siginfo_t infop; (void)signal; if (signal != SIGCHLD) return; if (pasta_child_pid && !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) { if (infop.si_pid == pasta_child_pid) { if (infop.si_code == CLD_EXITED) exit(infop.si_status); /* If killed by a signal, si_status is the number. * Follow common shell convention of returning it + 128. */ exit(infop.si_status + 128); /* Nothing to do, detached PID namespace going away */ } } waitid(P_ALL, 0, NULL, WEXITED | WNOHANG); waitid(P_ALL, 0, NULL, WEXITED | WNOHANG); } /** * pasta_wait_for_ns() - Busy loop until we can enter the target namespace * @arg: Execution context * * Return: 0 */ static int pasta_wait_for_ns(void *arg) { struct ctx *c = (struct ctx *)arg; int flags = O_RDONLY | O_CLOEXEC; char ns[PATH_MAX]; snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid); do { while ((c->pasta_netns_fd = open(ns, flags)) < 0) { if (errno != ENOENT) return 0; } } while (setns(c->pasta_netns_fd, CLONE_NEWNET) && !close(c->pasta_netns_fd)); return 0; } /** * ns_check() - Check if we can enter configured namespaces * @arg: Execution context * * Return: 0 */ static int ns_check(void *arg) { struct ctx *c = (struct ctx *)arg; if (setns(c->pasta_netns_fd, CLONE_NEWNET)) c->pasta_netns_fd = -1; return 0; } /** * pasta_open_ns() - Open network namespace descriptors * @c: Execution context * @netns: network namespace path * * Return: 0 on success, negative error code otherwise */ void pasta_open_ns(struct ctx *c, const char *netns) { int nfd = -1; nfd = open(netns, O_RDONLY | O_CLOEXEC); if (nfd < 0) { die("Couldn't open network namespace %s: %s", netns, strerror(errno)); } c->pasta_netns_fd = nfd; NS_CALL(ns_check, c); if (c->pasta_netns_fd < 0) die("Couldn't switch to pasta namespaces: %s", strerror(errno)); if (!c->no_netns_quit) { char buf[PATH_MAX] = { 0 }; strncpy(buf, netns, PATH_MAX - 1); strncpy(c->netns_base, basename(buf), PATH_MAX - 1); strncpy(buf, netns, PATH_MAX - 1); strncpy(c->netns_dir, dirname(buf), PATH_MAX - 1); } } /** * struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd() * @exe: Executable to run * @argv: Command and arguments to run */ struct pasta_spawn_cmd_arg { const char *exe; char *const *argv; }; /** * pasta_spawn_cmd() - Prepare new netns, start command or shell * @arg: See @pasta_spawn_cmd_arg * * Return: this function never returns */ static int pasta_spawn_cmd(void *arg) { const struct pasta_spawn_cmd_arg *a; sigset_t set; /* We run in a detached PID and mount namespace: mount /proc over */ if (mount("", "/proc", "proc", 0, NULL)) warn("Couldn't mount /proc: %s", strerror(errno)); if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) warn("Cannot set ping_group_range, ICMP requests might fail"); /* Wait for the parent to be ready: see main() */ sigemptyset(&set); sigaddset(&set, SIGUSR1); sigwaitinfo(&set, NULL); a = (const struct pasta_spawn_cmd_arg *)arg; execvp(a->exe, a->argv); perror("execvp"); exit(EXIT_FAILURE); } /** * pasta_start_ns() - Fork command in new namespace if target ns is not given * @c: Execution context * @uid: UID we're running as in the init namespace * @gid: GID we're running as in the init namespace * @argc: Number of arguments for spawned command * @argv: Command to spawn and arguments */ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, int argc, char *argv[]) { char ns_fn_stack[NS_FN_STACK_SIZE] __attribute__ ((aligned(__alignof__(max_align_t)))); struct pasta_spawn_cmd_arg arg = { .exe = argv[0], .argv = argv, }; char uidmap[BUFSIZ], gidmap[BUFSIZ]; char *sh_argv[] = { NULL, NULL }; char sh_arg0[PATH_MAX + 1]; sigset_t set; c->foreground = 1; if (!c->debug) c->quiet = 1; /* Configure user and group mappings */ snprintf(uidmap, BUFSIZ, "0 %u 1", uid); snprintf(gidmap, BUFSIZ, "0 %u 1", gid); if (write_file("/proc/self/uid_map", uidmap) || write_file("/proc/self/setgroups", "deny") || write_file("/proc/self/gid_map", gidmap)) { warn("Couldn't configure user mappings"); } if (argc == 0) { arg.exe = getenv("SHELL"); if (!arg.exe) arg.exe = "/bin/sh"; if ((size_t)snprintf(sh_arg0, sizeof(sh_arg0), "-%s", arg.exe) >= sizeof(sh_arg0)) die("$SHELL is too long (%zu bytes)", strlen(arg.exe)); sh_argv[0] = sh_arg0; arg.argv = sh_argv; } /* Block SIGUSR1 in child, we queue it in main() when we're ready */ sigemptyset(&set); sigaddset(&set, SIGUSR1); sigprocmask(SIG_BLOCK, &set, NULL); pasta_child_pid = do_clone(pasta_spawn_cmd, ns_fn_stack, sizeof(ns_fn_stack), CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWUTS | CLONE_NEWNS | SIGCHLD, (void *)&arg); if (pasta_child_pid == -1) { perror("clone"); exit(EXIT_FAILURE); } NS_CALL(pasta_wait_for_ns, c); if (c->pasta_netns_fd < 0) die("Failed to join network namespace: %s", strerror(errno)); } /** * pasta_ns_conf() - Set up loopback and tap interfaces in namespace as needed * @c: Execution context */ void pasta_ns_conf(struct ctx *c) { int rc = 0; rc = nl_link_up(nl_sock_ns, 1 /* lo */, 0); if (rc < 0) die("Couldn't bring up loopback interface in namespace: %s", strerror(-rc)); /* Get or set MAC in target namespace */ if (MAC_IS_ZERO(c->mac_guest)) nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest); else rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest); if (rc < 0) die("Couldn't set MAC address in namespace: %s", strerror(-rc)); if (c->pasta_conf_ns) { nl_link_up(nl_sock_ns, c->pasta_ifi, c->mtu); if (c->ifi4) { if (c->no_copy_addrs) { rc = nl_addr_set(nl_sock_ns, c->pasta_ifi, AF_INET, &c->ip4.addr, c->ip4.prefix_len); } else { rc = nl_addr_dup(nl_sock, c->ifi4, nl_sock_ns, c->pasta_ifi, AF_INET); } if (rc < 0) { die("Couldn't set IPv4 address(es) in namespace: %s", strerror(-rc)); } if (c->no_copy_routes) { rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi, AF_INET, &c->ip4.gw); } else { rc = nl_route_dup(nl_sock, c->ifi4, nl_sock_ns, c->pasta_ifi, AF_INET); } if (rc < 0) { die("Couldn't set IPv4 route(s) in guest: %s", strerror(-rc)); } } if (c->ifi6) { if (c->no_copy_addrs) { rc = nl_addr_set(nl_sock_ns, c->pasta_ifi, AF_INET6, &c->ip6.addr, 64); } else { rc = nl_addr_dup(nl_sock, c->ifi6, nl_sock_ns, c->pasta_ifi, AF_INET6); } if (rc < 0) { die("Couldn't set IPv6 address(es) in namespace: %s", strerror(-rc)); } if (c->no_copy_routes) { rc = nl_route_set_def(nl_sock_ns, c->pasta_ifi, AF_INET6, &c->ip6.gw); } else { rc = nl_route_dup(nl_sock, c->ifi6, nl_sock_ns, c->pasta_ifi, AF_INET6); } if (rc < 0) { die("Couldn't set IPv6 route(s) in guest: %s", strerror(-rc)); } } } proto_update_l2_buf(c->mac_guest, NULL); } /** * pasta_netns_quit_timer() - Set up fallback timer to monitor namespace * * Return: timerfd file descriptor, negative error code on failure */ static int pasta_netns_quit_timer(void) { int fd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC); struct itimerspec it = { { 1, 0 }, { 1, 0 } }; /* one-second interval */ if (fd == -1) { err("timerfd_create(): %s", strerror(errno)); return -errno; } if (timerfd_settime(fd, 0, &it, NULL) < 0) { err("timerfd_settime(): %s", strerror(errno)); close(fd); return -errno; } return fd; } /** * pasta_netns_quit_init() - Watch network namespace to quit once it's gone * @c: Execution context */ void pasta_netns_quit_init(const struct ctx *c) { union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY }; struct epoll_event ev = { .events = EPOLLIN }; int flags = O_NONBLOCK | O_CLOEXEC; struct statfs s = { 0 }; bool try_inotify = true; int fd = -1, dir_fd; if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base) return; if ((dir_fd = open(c->netns_dir, O_CLOEXEC | O_RDONLY)) < 0) die("netns dir open: %s, exiting", strerror(errno)); if (fstatfs(dir_fd, &s) || s.f_type == DEVPTS_SUPER_MAGIC || s.f_type == PROC_SUPER_MAGIC || s.f_type == SYSFS_MAGIC) try_inotify = false; if (try_inotify && (fd = inotify_init1(flags)) < 0) warn("inotify_init1(): %s, use a timer", strerror(errno)); if (fd >= 0 && inotify_add_watch(fd, c->netns_dir, IN_DELETE) < 0) { warn("inotify_add_watch(): %s, use a timer", strerror(errno)); close(fd); fd = -1; } if (fd < 0) { if ((fd = pasta_netns_quit_timer()) < 0) die("Failed to set up fallback netns timer, exiting"); ref.nsdir_fd = dir_fd; ref.type = EPOLL_TYPE_NSQUIT_TIMER; } else { close(dir_fd); } if (fd > FD_REF_MAX) die("netns monitor file number %i too big, exiting", fd); ref.fd = fd; ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev); } /** * pasta_netns_quit_inotify_handler() - Handle inotify watch, exit if ns is gone * @c: Execution context * @inotify_fd: inotify file descriptor with watch on namespace directory */ void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd) { char buf[sizeof(struct inotify_event) + NAME_MAX + 1]; const struct inotify_event *in_ev = (struct inotify_event *)buf; if (read(inotify_fd, buf, sizeof(buf)) < (ssize_t)sizeof(*in_ev)) return; if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base))) return; info("Namespace %s is gone, exiting", c->netns_base); exit(EXIT_SUCCESS); } /** * pasta_netns_quit_timer_handler() - Handle timer, exit if ns is gone * @c: Execution context * @ref: epoll reference for timer descriptor */ void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref) { uint64_t expirations; ssize_t n; int fd; n = read(ref.fd, &expirations, sizeof(expirations)); if (n < 0) die("Namespace watch timer read() error: %s", strerror(errno)); if ((size_t)n < sizeof(expirations)) warn("Namespace watch timer: short read(): %zi", n); fd = openat(ref.nsdir_fd, c->netns_base, O_PATH | O_CLOEXEC); if (fd < 0) { if (errno == EACCES) /* Expected for existing procfs entry */ return; info("Namespace %s is gone, exiting", c->netns_base); exit(EXIT_SUCCESS); } close(fd); }