diff options
Diffstat (limited to 'util.c')
-rw-r--r-- | util.c | 508 |
1 files changed, 409 insertions, 99 deletions
@@ -28,11 +28,15 @@ #include <linux/errqueue.h> #include <getopt.h> +#include "linux_dep.h" #include "util.h" #include "iov.h" #include "passt.h" #include "packet.h" #include "log.h" +#ifdef HAS_GETRANDOM +#include <sys/random.h> +#endif /** * sock_l4_sa() - Create and bind socket to socket address, add to epoll list @@ -52,6 +56,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, { sa_family_t af = ((const struct sockaddr *)sa)->sa_family; union epoll_ref ref = { .type = type, .data = data }; + bool freebind = false; struct epoll_event ev; int fd, y = 1, ret; uint8_t proto; @@ -61,9 +66,12 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, case EPOLL_TYPE_TCP_LISTEN: proto = IPPROTO_TCP; socktype = SOCK_STREAM | SOCK_NONBLOCK; + freebind = c->freebind; break; case EPOLL_TYPE_UDP_LISTEN: - case EPOLL_TYPE_UDP_REPLY: + freebind = c->freebind; + /* fallthrough */ + case EPOLL_TYPE_UDP: proto = IPPROTO_UDP; socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; @@ -82,7 +90,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, ret = -errno; if (fd < 0) { - warn("L4 socket: %s", strerror(-ret)); + warn("L4 socket: %s", strerror_(-ret)); return ret; } @@ -101,11 +109,15 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, debug("Failed to set SO_REUSEADDR on socket %i", fd); if (proto == IPPROTO_UDP) { + int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO; + int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR; int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; - int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR; - if (setsockopt(fd, level, opt, &y, sizeof(y))) + if (setsockopt(fd, level, recverr, &y, sizeof(y))) die_perror("Failed to set RECVERR on socket %i", fd); + + if (setsockopt(fd, level, pktinfo, &y, sizeof(y))) + die_perror("Failed to set PKTINFO on socket %i", fd); } if (ifname && *ifname) { @@ -127,6 +139,18 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, } } + if (freebind) { + int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; + int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND; + + if (setsockopt(fd, level, opt, &y, sizeof(y))) { + err_perror("Failed to set %s on socket %i", + af == AF_INET ? "IP_FREEBIND" + : "IPV6_FREEBIND", + fd); + } + } + if (bind(fd, sa, sl) < 0) { /* We'll fail to bind to low ports if we don't have enough * capabilities, and we'll fail to bind on already bound ports, @@ -142,7 +166,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, if (type == EPOLL_TYPE_TCP_LISTEN && listen(fd, 128) < 0) { ret = -errno; - warn("TCP socket listen: %s", strerror(-ret)); + warn("TCP socket listen: %s", strerror_(-ret)); close(fd); return ret; } @@ -151,64 +175,73 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, ev.data.u64 = ref.u64; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) { ret = -errno; - warn("L4 epoll_ctl: %s", strerror(-ret)); + warn("L4 epoll_ctl: %s", strerror_(-ret)); return ret; } return fd; } + /** - * sock_l4() - Create and bind socket for given L4, add to epoll list - * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @type: epoll type - * @bind_addr: Address for binding, NULL for any - * @ifname: Interface for binding, NULL for any - * @port: Port, host order - * @data: epoll reference portion for protocol handlers + * sock_unix() - Create and bind AF_UNIX socket + * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix) * - * Return: newly created socket, negative error code on failure + * Return: socket descriptor on success, won't return on failure */ -int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type, - const void *bind_addr, const char *ifname, uint16_t port, - uint32_t data) +int sock_unix(char *sock_path) { - switch (af) { - case AF_INET: { - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(port), - { 0 }, { 0 }, - }; - if (bind_addr) - addr4.sin_addr = *(struct in_addr *)bind_addr; - return sock_l4_sa(c, type, &addr4, sizeof(addr4), ifname, - false, data); - } - - case AF_UNSPEC: - if (!DUAL_STACK_SOCKETS || bind_addr) - return -EINVAL; - /* fallthrough */ - case AF_INET6: { - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(port), - 0, IN6ADDR_ANY_INIT, 0, - }; - if (bind_addr) { - addr6.sin6_addr = *(struct in6_addr *)bind_addr; - - if (!memcmp(bind_addr, &c->ip6.addr_ll, - sizeof(c->ip6.addr_ll))) - addr6.sin6_scope_id = c->ifi6; + int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + int i; + + if (fd < 0) + die_perror("Failed to open UNIX domain socket"); + + for (i = 1; i < UNIX_SOCK_MAX; i++) { + char *path = addr.sun_path; + int ex, ret; + + if (*sock_path) + memcpy(path, sock_path, UNIX_PATH_MAX); + else if (snprintf_check(path, UNIX_PATH_MAX - 1, + UNIX_SOCK_PATH, i)) + die_perror("Can't build UNIX domain socket path"); + + ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, + 0); + if (ex < 0) + die_perror("Failed to check for UNIX domain conflicts"); + + ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); + if (!ret || (errno != ENOENT && errno != ECONNREFUSED && + errno != EACCES)) { + if (*sock_path) + die("Socket path %s already in use", path); + + close(ex); + continue; } - return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname, - af == AF_INET6, data); - } - default: - return -EINVAL; + close(ex); + + unlink(path); + ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr)); + if (*sock_path && ret) + die_perror("Failed to bind UNIX domain socket"); + + if (!ret) + break; } + + if (i == UNIX_SOCK_MAX) + die_perror("Failed to bind UNIX domain socket"); + + info("UNIX domain socket bound at %s", addr.sun_path); + if (!*sock_path) + memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX); + + return fd; } /** @@ -220,7 +253,8 @@ void sock_probe_mem(struct ctx *c) int v = INT_MAX / 2, s; socklen_t sl; - if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { c->low_wmem = c->low_rmem = 1; return; } @@ -250,7 +284,7 @@ void sock_probe_mem(struct ctx *c) int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b) { if (a->tv_nsec < b->tv_nsec) { - return (b->tv_nsec - a->tv_nsec) / 1000 + + return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 + (a->tv_sec - b->tv_sec - 1) * 1000000; } @@ -330,7 +364,7 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b) dst[i] = a[i] | b[i]; } -/* +/** * ns_enter() - Enter configured user (unless already joined) and network ns * @c: Execution context * @@ -437,32 +471,27 @@ void pidfile_write(int fd, pid_t pid) if (write(fd, pid_buf, n) < 0) { perror("PID file write"); - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); } close(fd); } /** - * pidfile_open() - Open PID file if needed - * @path: Path for PID file, empty string if no PID file is requested + * output_file_open() - Open file for output, if needed + * @path: Path for output file + * @flags: Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC * - * Return: descriptor for PID file, -1 if path is NULL, won't return on failure + * Return: file descriptor on success, -1 on failure with errno set by open() */ -int pidfile_open(const char *path) +int output_file_open(const char *path, int flags) { - int fd; - - if (!*path) - return -1; - - if ((fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC, - S_IRUSR | S_IWUSR)) < 0) { - perror("PID file open"); - exit(EXIT_FAILURE); - } - - return fd; + /* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for + * it in the 'mode' argument if we have one + */ + return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags, + /* NOLINTNEXTLINE(android-cloexec-open) */ + S_IRUSR | S_IWUSR); } /** @@ -470,7 +499,8 @@ int pidfile_open(const char *path) * @pidfile_fd: Open PID file descriptor * @devnull_fd: Open file descriptor for /dev/null * - * Return: child PID on success, won't return on failure + * Return: 0 in the child process on success. The parent process exits. + * Does not return in either process on failure (calls _exit). */ int __daemon(int pidfile_fd, int devnull_fd) { @@ -478,25 +508,20 @@ int __daemon(int pidfile_fd, int devnull_fd) if (pid == -1) { perror("fork"); - exit(EXIT_FAILURE); + _exit(EXIT_FAILURE); } if (pid) { pidfile_write(pidfile_fd, pid); - exit(EXIT_SUCCESS); + _exit(EXIT_SUCCESS); } - errno = 0; - - setsid(); - - dup2(devnull_fd, STDIN_FILENO); - dup2(devnull_fd, STDOUT_FILENO); - dup2(devnull_fd, STDERR_FILENO); - close(devnull_fd); - - if (errno) - exit(EXIT_FAILURE); + if (setsid() < 0 || + dup2(devnull_fd, STDIN_FILENO) < 0 || + dup2(devnull_fd, STDOUT_FILENO) < 0 || + dup2(devnull_fd, STDERR_FILENO) < 0 || + close(devnull_fd)) + _exit(EXIT_FAILURE); return 0; } @@ -583,7 +608,39 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, #endif } -/* write_remainder() - write the tail of an IO vector to an fd +/** + * write_all_buf() - write all of a buffer to an fd + * @fd: File descriptor + * @buf: Pointer to base of buffer + * @len: Length of buffer + * + * Return: 0 on success, -1 on error (with errno set) + * + * #syscalls write + */ +int write_all_buf(int fd, const void *buf, size_t len) +{ + const char *p = buf; + size_t left = len; + + while (left) { + ssize_t rc; + + do + rc = write(fd, p, left); + while ((rc < 0) && errno == EINTR); + + if (rc < 0) + return -1; + + p += rc; + left -= rc; + } + return 0; +} + +/** + * write_remainder() - write the tail of an IO vector to an fd * @fd: File descriptor * @iov: IO vector * @iovcnt: Number of entries in @iov @@ -591,28 +648,114 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, * * Return: 0 on success, -1 on error (with errno set) * - * #syscalls write writev + * #syscalls writev */ int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip) { - size_t offset, i; + size_t i = 0, offset; - while ((i = iov_skip_bytes(iov, iovcnt, skip, &offset)) < iovcnt) { + while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) { ssize_t rc; if (offset) { - rc = write(fd, (char *)iov[i].iov_base + offset, - iov[i].iov_len - offset); - } else { - rc = writev(fd, &iov[i], iovcnt - i); + /* Write the remainder of the partially written buffer */ + if (write_all_buf(fd, (char *)iov[i].iov_base + offset, + iov[i].iov_len - offset) < 0) + return -1; + i++; } + /* Write as much of the remaining whole buffers as we can */ + rc = writev(fd, &iov[i], iovcnt - i); if (rc < 0) return -1; - skip += rc; + skip = rc; } + return 0; +} + +/** + * read_all_buf() - Fill a whole buffer from a file descriptor + * @fd: File descriptor + * @buf: Pointer to base of buffer + * @len: Length of buffer + * + * Return: 0 on success, -1 on error (with errno set) + * + * #syscalls read + */ +int read_all_buf(int fd, void *buf, size_t len) +{ + size_t left = len; + char *p = buf; + + while (left) { + ssize_t rc; + + ASSERT(left <= len); + + do + rc = read(fd, p, left); + while ((rc < 0) && errno == EINTR); + + if (rc < 0) + return -1; + + if (rc == 0) { + errno = ENODATA; + return -1; + } + + p += rc; + left -= rc; + } + return 0; +} + +/** + * read_remainder() - Read the tail of an IO vector from a file descriptor + * @fd: File descriptor + * @iov: IO vector + * @cnt: Number of entries in @iov + * @skip: Number of bytes of the vector to skip reading + * + * Return: 0 on success, -1 on error (with errno set) + * + * Note: mode-specific seccomp profiles need to enable readv() to use this. + */ +/* cppcheck-suppress unusedFunction */ +int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip) +{ + size_t i = 0, offset; + + while ((i += iov_skip_bytes(iov + i, cnt - i, skip, &offset)) < cnt) { + ssize_t rc; + if (offset) { + ASSERT(offset < iov[i].iov_len); + /* Read the remainder of the partially read buffer */ + if (read_all_buf(fd, (char *)iov[i].iov_base + offset, + iov[i].iov_len - offset) < 0) + return -1; + i++; + } + + if (cnt == i) + break; + + /* Fill as many of the remaining buffers as we can */ + rc = readv(fd, &iov[i], cnt - i); + if (rc < 0) + return -1; + + if (rc == 0) { + errno = ENODATA; + return -1; + } + + skip = rc; + } return 0; } @@ -676,6 +819,25 @@ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size) return dst; } +/** eth_ntop() - Convert an Ethernet MAC address to text format + * @mac: MAC address + * @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes + * @size: Size of buffer at @dst + * + * Return: On success, a non-null pointer to @dst, NULL on failure + */ +const char *eth_ntop(const unsigned char *mac, char *dst, size_t size) +{ + int len; + + len = snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + if (len < 0 || (size_t)len >= size) + return NULL; + + return dst; +} + /** str_ee_origin() - Convert socket extended error origin to a string * @ee: Socket extended error structure * @@ -710,13 +872,15 @@ void close_open_files(int argc, char **argv) int name, rc; do { - name = getopt_long(argc, argv, "+:F", optfd, NULL); + name = getopt_long(argc, argv, "-:F:", optfd, NULL); if (name == 'F') { errno = 0; fd = strtol(optarg, NULL, 0); - if (errno || fd <= STDERR_FILENO || fd > INT_MAX) + if (errno || + (fd != STDIN_FILENO && fd <= STDERR_FILENO) || + fd > INT_MAX) die("Invalid --fd: %s", optarg); } } while (name != -1); @@ -732,6 +896,152 @@ void close_open_files(int argc, char **argv) rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE); } - if (rc) - die_perror("Failed to close files leaked by parent"); + if (rc) { + if (errno == ENOSYS || errno == EINVAL) { + /* This probably means close_range() or the + * CLOSE_RANGE_UNSHARE flag is not supported by the + * kernel. Not much we can do here except carry on and + * hope for the best. + */ + warn( +"Can't use close_range() to ensure no files leaked by parent"); + } else { + die_perror("Failed to close files leaked by parent"); + } + } + +} + +/** + * snprintf_check() - snprintf() wrapper, checking for truncation and errors + * @str: Output buffer + * @size: Maximum size to write to @str + * @format: Message + * + * Return: false on success, true on truncation or error, sets errno on failure + */ +bool snprintf_check(char *str, size_t size, const char *format, ...) +{ + va_list ap; + int rc; + + va_start(ap, format); + rc = vsnprintf(str, size, format, ap); + va_end(ap); + + if (rc < 0) { + errno = EIO; + return true; + } + + if ((size_t)rc >= size) { + errno = ENOBUFS; + return true; + } + + return false; +} + +#define DEV_RANDOM "/dev/random" + +/** + * raw_random() - Get high quality random bytes + * @buf: Buffer to fill with random bytes + * @buflen: Number of bytes of random data to put in @buf + * + * Assumes that the random data is essential, and will die() if unable to obtain + * it. + */ +void raw_random(void *buf, size_t buflen) +{ + size_t random_read = 0; +#ifndef HAS_GETRANDOM + int fd = open(DEV_RANDOM, O_RDONLY); + + if (fd < 0) + die_perror("Couldn't open %s", DEV_RANDOM); +#endif + + while (random_read < buflen) { + ssize_t ret; + +#ifdef HAS_GETRANDOM + ret = getrandom((char *)buf + random_read, + buflen - random_read, GRND_RANDOM); +#else + ret = read(dev_random, (char *)buf + random_read, + buflen - random_read); +#endif + + if (ret == -1 && errno == EINTR) + continue; + + if (ret < 0) + die_perror("Error on random data source"); + + if (ret == 0) + break; + + random_read += ret; + } + +#ifndef HAS_GETRANDOM + close(dev_random); +#endif + + if (random_read < buflen) + die("Unexpected EOF on random data source"); +} + +/** + * epoll_del() - Remove a file descriptor from our passt epoll + * @c: Execution context + * @fd: File descriptor to remove + */ +void epoll_del(const struct ctx *c, int fd) +{ + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); + +} + +/** + * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1 + * @buf: Buffer to fill in with encoded domain name + * @domain_name: Input domain name string with terminator + * + * The buffer's 'buf' size has to be >= strlen(domain_name) + 2 + */ +void encode_domain_name(char *buf, const char *domain_name) +{ + size_t i; + char *p; + + buf[0] = strcspn(domain_name, "."); + p = buf + 1; + for (i = 0; domain_name[i]; i++) { + if (domain_name[i] == '.') + p[i] = strcspn(domain_name + i + 1, "."); + else + p[i] = domain_name[i]; + } + p[i] = 0L; +} + +/** + * abort_with_msg() - Print error message and abort + * @fmt: Format string + * @...: Format parameters + */ +void abort_with_msg(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vlogmsg(true, false, LOG_CRIT, fmt, ap); + va_end(ap); + + /* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp, + * but that will still get the job done. + */ + abort(); } |