From 32d07f5e59f2372939a7c99c4c4bcbb5f60b0e05 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 14 Oct 2021 01:21:29 +0200 Subject: passt, pasta: Completely avoid dynamic memory allocation Replace libc functions that might dynamically allocate memory with own implementations or wrappers. Drop brk(2) from list of allowed syscalls in seccomp profile. Signed-off-by: Stefano Brivio --- conf.c | 10 +-- dhcpv6.c | 4 +- passt.c | 16 ++-- pasta.c | 253 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ util.c | 129 +++++++++++++++++++++++++++++--- util.h | 5 ++ 6 files changed, 389 insertions(+), 28 deletions(-) create mode 100644 pasta.c diff --git a/conf.c b/conf.c index e3244aa..d0394a4 100644 --- a/conf.c +++ b/conf.c @@ -275,12 +275,11 @@ overlap: */ static void get_dns(struct ctx *c) { - int dns4_set, dns6_set, dnss_set, dns_set; + int dns4_set, dns6_set, dnss_set, dns_set, fd; struct in6_addr *dns6 = &c->dns6[0]; struct fqdn *s = c->dns_search; uint32_t *dns4 = &c->dns4[0]; char buf[BUFSIZ], *p, *end; - FILE *r; dns4_set = !c->v4 || !!*dns4; dns6_set = !c->v6 || !IN6_IS_ADDR_UNSPECIFIED(dns6); @@ -290,11 +289,10 @@ static void get_dns(struct ctx *c) if (dns_set && dnss_set) return; - r = fopen("/etc/resolv.conf", "r"); - if (!r) + if ((fd = open("/etc/resolv.conf", O_RDONLY)) < 0) goto out; - while (fgets(buf, BUFSIZ, r)) { + while (line_read(buf, BUFSIZ, fd)) { if (!dns_set && strstr(buf, "nameserver ") == buf) { p = strrchr(buf, ' '); if (!p) @@ -333,7 +331,7 @@ static void get_dns(struct ctx *c) } } - fclose(r); + close(fd); out: if (!dns_set && dns4 == c->dns4 && dns6 == c->dns6) diff --git a/dhcpv6.c b/dhcpv6.c index 19c7a62..d514e8d 100644 --- a/dhcpv6.c +++ b/dhcpv6.c @@ -588,10 +588,10 @@ int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len) */ void dhcpv6_init(struct ctx *c) { - struct tm y2k = { 0, 0, 0, 1, 0, 100, 0, 0, 0, 0, NULL }; + time_t y2k = 946684800; /* Epoch to 2000-01-01T00:00:00Z, no mktime() */ uint32_t duid_time; - duid_time = htonl(difftime(time(NULL), mktime(&y2k))); + duid_time = htonl(difftime(time(NULL), y2k)); resp.server_id.duid_time = duid_time; resp_not_on_link.server_id.duid_time = duid_time; diff --git a/passt.c b/passt.c index 0628d8c..e8f4e62 100644 --- a/passt.c +++ b/passt.c @@ -192,10 +192,10 @@ static void seccomp(struct ctx *c) * * Return: 0 once interrupted, non-zero on failure * - * #syscalls read write open close fork dup2 exit chdir brk ioctl writev syslog + * #syscalls read write open close fork dup2 exit chdir ioctl writev syslog * #syscalls prlimit64 epoll_ctl epoll_create1 epoll_wait accept4 accept listen * #syscalls socket bind connect getsockopt setsockopt recvfrom sendto shutdown - * #syscalls openat fstat fcntl lseek + * #syscalls openat fstat fcntl lseek clone setsid exit_group * #syscalls:pasta rt_sigreturn */ int main(int argc, char **argv) @@ -226,16 +226,16 @@ int main(int argc, char **argv) if (madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE)) perror("madvise"); - openlog(log_name, 0, LOG_DAEMON); + __openlog(log_name, 0, LOG_DAEMON); - setlogmask(LOG_MASK(LOG_EMERG)); + __setlogmask(LOG_MASK(LOG_EMERG)); conf(&c, argc, argv); seccomp(&c); if (!c.debug && (c.stderr || isatty(fileno(stdout)))) - openlog(log_name, LOG_PERROR, LOG_DAEMON); + __openlog(log_name, LOG_PERROR, LOG_DAEMON); c.epollfd = epoll_create1(0); if (c.epollfd == -1) { @@ -271,11 +271,11 @@ int main(int argc, char **argv) dhcpv6_init(&c); if (c.debug) - setlogmask(LOG_UPTO(LOG_DEBUG)); + __setlogmask(LOG_UPTO(LOG_DEBUG)); else if (c.quiet) - setlogmask(LOG_UPTO(LOG_ERR)); + __setlogmask(LOG_UPTO(LOG_ERR)); else - setlogmask(LOG_UPTO(LOG_INFO)); + __setlogmask(LOG_UPTO(LOG_INFO)); if (isatty(fileno(stdout)) && !c.foreground) daemon(0, 0); diff --git a/pasta.c b/pasta.c new file mode 100644 index 0000000..7c53c13 --- /dev/null +++ b/pasta.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * pasta.c - pasta (namespace) specific implementations + * + * Copyright (c) 2020-2021 Red Hat GmbH + * Author: Stefano Brivio + * + * #syscalls:pasta clone unshare waitid kill execve exit_group rt_sigprocmask + * #syscalls:pasta geteuid getdents64 readlink setsid + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "passt.h" +#include "netlink.h" + +/* PID of child, in case we created a namespace, and its procfs link */ +static int pasta_child_pid; +static char pasta_child_ns[PATH_MAX]; + +/** + * pasta_ns_cleanup() - Look for processes in namespace, terminate them + */ +static void pasta_ns_cleanup(void) +{ + char proc_path[PATH_MAX], ns_link[PATH_MAX], buf[BUFSIZ]; + int recheck = 0, found = 0, waited = 0; + int dir_fd, n; + + if (!*pasta_child_ns) + return; + +loop: + if ((dir_fd = open("/proc", O_RDONLY | O_DIRECTORY)) < 0) + return; + + while ((n = syscall(SYS_getdents64, dir_fd, buf, BUFSIZ)) > 0) { + struct dirent *dp = (struct dirent *)buf; + int pos = 0; + + while (pos < n) { + pid_t pid; + + errno = 0; + pid = strtol(dp->d_name, NULL, 0); + if (!pid || errno) + goto next; + + snprintf(proc_path, PATH_MAX, "/proc/%i/ns/net", pid); + if (readlink(proc_path, ns_link, PATH_MAX) < 0) + goto next; + + if (!strncmp(ns_link, pasta_child_ns, PATH_MAX)) { + found = 1; + if (waited) + kill(pid, SIGKILL); + else + kill(pid, SIGQUIT); + } +next: + dp = (struct dirent *)(buf + (pos += dp->d_reclen)); + } + } + + close(dir_fd); + + if (!found) + return; + + if (waited) { + if (recheck) { + info("Some processes in namespace didn't quit"); + } else { + found = 0; + recheck = 1; + goto loop; + } + return; + } + + info("Waiting for all processes in namespace to terminate"); + sleep(1); + waited = 1; + goto loop; +} + +/** + * pasta_child_handler() - Exit once shell exits (if we started it), reap clones + * @signal: Unused, handler deals with SIGCHLD only + */ +void pasta_child_handler(int signal) +{ + siginfo_t infop; + + (void)signal; + + if (pasta_child_pid && + !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) { + if (infop.si_pid == pasta_child_pid) { + pasta_ns_cleanup(); + exit(EXIT_SUCCESS); + } + } + + waitid(P_ALL, 0, NULL, WEXITED | WNOHANG); + waitid(P_ALL, 0, NULL, WEXITED | WNOHANG); +} + +/** + * pasta_wait_for_ns() - Busy loop until we can enter the target namespace + * @arg: Execution context + * + * Return: 0 + */ +static int pasta_wait_for_ns(void *arg) +{ + struct ctx *c = (struct ctx *)arg; + char ns[PATH_MAX]; + + if (c->netns_only) + goto netns; + + snprintf(ns, PATH_MAX, "/proc/%i/ns/user", pasta_child_pid); + do + while ((c->pasta_userns_fd = open(ns, O_RDONLY)) < 0); + while (setns(c->pasta_userns_fd, 0) && !close(c->pasta_userns_fd)); + +netns: + snprintf(ns, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid); + do + while ((c->pasta_netns_fd = open(ns, O_RDONLY)) < 0); + while (setns(c->pasta_netns_fd, 0) && !close(c->pasta_netns_fd)); + + return 0; +} + +/** + * pasta_start_ns() - Fork shell in new namespace if target ns is not given + * @c: Execution context + */ +void pasta_start_ns(struct ctx *c) +{ + char buf[BUFSIZ], *shell, proc_path[PATH_MAX]; + int euid = geteuid(); + int fd; + + c->foreground = 1; + if (!c->debug) + c->quiet = 1; + + if ((pasta_child_pid = fork()) == -1) { + perror("fork"); + exit(EXIT_FAILURE); + } + + if (pasta_child_pid) { + NS_CALL(pasta_wait_for_ns, c); + + snprintf(proc_path, PATH_MAX, "/proc/%i/ns/net", + pasta_child_pid); + readlink(proc_path, pasta_child_ns, PATH_MAX); + + return; + } + + if (unshare(CLONE_NEWNET | (c->netns_only ? 0 : CLONE_NEWUSER))) { + perror("unshare"); + exit(EXIT_FAILURE); + } + + if (!c->netns_only) { + snprintf(buf, BUFSIZ, "%u %u %u", 0, euid, 1); + + fd = open("/proc/self/uid_map", O_WRONLY); + write(fd, buf, strlen(buf)); + close(fd); + + fd = open("/proc/self/setgroups", O_WRONLY); + write(fd, "deny", sizeof("deny")); + close(fd); + + fd = open("/proc/self/gid_map", O_WRONLY); + write(fd, buf, strlen(buf)); + close(fd); + } + + fd = open("/proc/sys/net/ipv4/ping_group_range", O_WRONLY); + write(fd, "0 0", strlen("0 0")); + close(fd); + + shell = getenv("SHELL") ? getenv("SHELL") : "/bin/sh"; + if (strstr(shell, "/bash")) + execve(shell, ((char *[]) { shell, "-l", NULL }), environ); + else + execve(shell, ((char *[]) { shell, NULL }), environ); + + perror("execve"); + exit(EXIT_FAILURE); +} + +/** + * pasta_ns_conf() - Set up loopback and tap interfaces in namespace as needed + * @c: Execution context + */ +void pasta_ns_conf(struct ctx *c) +{ + nl_link(1, 1 /* lo */, MAC_ZERO, 1); + + if (c->pasta_conf_ns) { + nl_link(1, c->pasta_ifi, c->mac_guest, 1); + + if (c->v4) { + nl_addr(1, c->pasta_ifi, AF_INET, &c->addr4, + __builtin_popcount(c->mask4), NULL); + nl_route(1, c->pasta_ifi, AF_INET, &c->gw4); + } + + if (c->v6) { + nl_addr(1, c->pasta_ifi, AF_INET6, &c->addr6, 64, NULL); + nl_route(1, c->pasta_ifi, AF_INET6, &c->gw6); + } + } else { + nl_link(1, c->pasta_ifi, c->mac_guest, 0); + } + + proto_update_l2_buf(c->mac_guest, NULL, NULL); +} diff --git a/util.c b/util.c index e0fa5ff..a3b3eb8 100644 --- a/util.c +++ b/util.c @@ -37,24 +37,27 @@ #include "util.h" #include "passt.h" +/* For __openlog() and __setlogmask() wrappers, and __vsyslog() (replacement) */ +static int log_mask; +static int log_sock = -1; +static char log_ident[BUFSIZ]; +static int log_opt; +static time_t log_debug_start; + #define logfn(name, level) \ void name(const char *format, ...) { \ - char ts[sizeof("Mmm dd hh:mm:ss.")]; \ struct timespec tp; \ - struct tm *tm; \ va_list args; \ \ if (setlogmask(0) & LOG_MASK(LOG_DEBUG)) { \ clock_gettime(CLOCK_REALTIME, &tp); \ - tm = gmtime(&tp.tv_sec); \ - strftime(ts, sizeof(ts), "%b %d %T.", tm); \ - \ - fprintf(stderr, "%s%04lu: ", ts, \ + fprintf(stderr, "%lu.%04lu: ", \ + tp.tv_sec - log_debug_start, \ tp.tv_nsec / (100 * 1000)); \ } \ \ va_start(args, format); \ - vsyslog(level, format, args); \ + __vsyslog(level, format, args); \ va_end(args); \ \ if (setlogmask(0) & LOG_MASK(LOG_DEBUG) || \ @@ -72,6 +75,79 @@ logfn(warn, LOG_WARNING) logfn(info, LOG_INFO) logfn(debug, LOG_DEBUG) +/** + * __openlog() - Non-optional openlog() wrapper, to allow custom vsyslog() + * @ident: openlog() identity (program name) + * @option: openlog() options + * @facility: openlog() facility (LOG_DAEMON) + */ +void __openlog(const char *ident, int option, int facility) +{ + struct timespec tp; + + clock_gettime(CLOCK_REALTIME, &tp); + log_debug_start = tp.tv_sec; + + if (log_sock < 0) { + struct sockaddr_un a = { .sun_family = AF_UNIX, }; + + log_sock = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0); + if (log_sock < 0) + return; + + strncpy(a.sun_path, _PATH_LOG, sizeof(a.sun_path)); + if (connect(log_sock, (const struct sockaddr *)&a, sizeof(a))) { + close(log_sock); + log_sock = -1; + return; + } + } + + log_mask |= facility; + strncpy(log_ident, ident, sizeof(log_ident) - 1); + log_opt = option; + + openlog(ident, option, facility); +} + +/** + * __setlogmask() - setlogmask() wrapper, to allow custom vsyslog() + * @mask: Same as setlogmask() mask + */ +void __setlogmask(int mask) +{ + log_mask = mask; + setlogmask(mask); +} + +/** + * __vsyslog() - vsyslog() implementation not using heap memory + * @pri: Facility and level map, same as priority for vsyslog() + * @format: Same as vsyslog() format + * @ap: Same as vsyslog() ap + */ +void __vsyslog(int pri, const char *format, va_list ap) +{ + char buf[BUFSIZ]; + int n; + + if (!(LOG_MASK(LOG_PRI(pri)) & log_mask)) + return; + + /* Send without name and timestamp, the system logger should add them */ + n = snprintf(buf, BUFSIZ, "<%i> ", pri); + + n += vsnprintf(buf + n, BUFSIZ - n, format, ap); + + if (format[strlen(format)] != '\n') + n += snprintf(buf + n, BUFSIZ - n, "\n"); + + if (log_opt | LOG_PERROR) + fprintf(stderr, buf + sizeof("<0>")); + + send(log_sock, buf, n, 0); +} + /** * ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol * @ip6h: IPv6 header @@ -291,6 +367,35 @@ int bitmap_isset(uint8_t *map, int bit) return map[bit / 8] & (1 << bit % 8); } +/** + * line_read() - Same as fgets(), without using heap, a file instead of a stream + * @buf: Read buffer + * @len: Maximum line length + * @fd: File descriptor for reading + * + * Return: @buf if a line is found, NULL on EOF or error + */ +char *line_read(char *buf, size_t len, int fd) +{ + char *p; + int n; + + n = read(fd, buf, --len); + if (n <= 0) + return NULL; + + buf[len] = 0; + if (!(p = strchr(buf, '\n'))) + return buf; + + *p = 0; + if (p == buf) + return buf; + + lseek(fd, (p - buf) - n + 1, SEEK_CUR); + return buf; +} + /** * procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs * @name: Corresponding name of file under /proc/net/ @@ -302,14 +407,14 @@ void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude) char line[200], path[PATH_MAX]; unsigned long port; unsigned int state; - FILE *fp; + int fd; snprintf(path, PATH_MAX, "/proc/net/%s", name); - if (!(fp = fopen(path, "r"))) + if ((fd = open(path, O_RDONLY)) < 0) return; - fgets(line, sizeof(line), fp); - while (fgets(line, sizeof(line), fp)) { + line_read(line, sizeof(line), fd); + while (line_read(line, sizeof(line), fd)) { if (sscanf(line, "%*u: %*x:%lx %*x:%*x %x", &port, &state) != 2) continue; @@ -324,7 +429,7 @@ void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude) bitmap_set(map, port); } - fclose(fp); + close(fd); } /** diff --git a/util.h b/util.h index 605b708..9edf041 100644 --- a/util.h +++ b/util.h @@ -133,6 +133,7 @@ enum { #include #include #include +#include enum bind_type { BIND_ANY = 0, @@ -143,6 +144,9 @@ enum bind_type { struct ctx; +void __openlog(const char *ident, int option, int facility); +void __vsyslog(int pri, const char *fmt, va_list ap); +void __setlogmask(int mask); char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto); int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, enum bind_type bind_addr, uint32_t data); @@ -151,5 +155,6 @@ int timespec_diff_ms(struct timespec *a, struct timespec *b); void bitmap_set(uint8_t *map, int bit); void bitmap_clear(uint8_t *map, int bit); int bitmap_isset(uint8_t *map, int bit); +char *line_read(char *buf, size_t len, int fd); void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude); int ns_enter(struct ctx *c); -- cgit v1.2.3