diff options
Diffstat (limited to 'util.c')
-rw-r--r-- | util.c | 515 |
1 files changed, 399 insertions, 116 deletions
@@ -25,73 +25,68 @@ #include <time.h> #include <errno.h> #include <stdbool.h> +#include <linux/errqueue.h> +#include <getopt.h> +#include "linux_dep.h" #include "util.h" #include "iov.h" #include "passt.h" #include "packet.h" #include "log.h" +#ifdef HAS_GETRANDOM +#include <sys/random.h> +#endif /** - * sock_l4() - Create and bind socket for given L4, add to epoll list + * sock_l4_sa() - Create and bind socket to socket address, add to epoll list * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @proto: Protocol number - * @bind_addr: Address for binding, NULL for any + * @type: epoll type + * @sa: Socket address to bind to + * @sl: Length of @sa * @ifname: Interface for binding, NULL for any - * @port: Port, host order + * @v6only: Set IPV6_V6ONLY socket option * @data: epoll reference portion for protocol handlers * * Return: newly created socket, negative error code on failure */ -int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, - const void *bind_addr, const char *ifname, uint16_t port, - uint32_t data) +int sock_l4_sa(const struct ctx *c, enum epoll_type type, + const void *sa, socklen_t sl, + const char *ifname, bool v6only, uint32_t data) { - union epoll_ref ref = { .data = data }; - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(port), - { 0 }, { 0 }, - }; - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(port), - 0, IN6ADDR_ANY_INIT, 0, - }; - const struct sockaddr *sa; - bool dual_stack = false; - int fd, sl, y = 1, ret; + sa_family_t af = ((const struct sockaddr *)sa)->sa_family; + union epoll_ref ref = { .type = type, .data = data }; + bool freebind = false; struct epoll_event ev; - - switch (proto) { - case IPPROTO_TCP: - ref.type = EPOLL_TYPE_TCP_LISTEN; - break; - case IPPROTO_UDP: - ref.type = EPOLL_TYPE_UDP; + int fd, y = 1, ret; + uint8_t proto; + int socktype; + + switch (type) { + case EPOLL_TYPE_TCP_LISTEN: + proto = IPPROTO_TCP; + socktype = 
SOCK_STREAM | SOCK_NONBLOCK; + freebind = c->freebind; break; - case IPPROTO_ICMP: - ref.type = EPOLL_TYPE_ICMP; + case EPOLL_TYPE_UDP_LISTEN: + freebind = c->freebind; + /* fallthrough */ + case EPOLL_TYPE_UDP_REPLY: + proto = IPPROTO_UDP; + socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; - case IPPROTO_ICMPV6: - ref.type = EPOLL_TYPE_ICMPV6; + case EPOLL_TYPE_PING: + if (af == AF_INET) + proto = IPPROTO_ICMP; + else + proto = IPPROTO_ICMPV6; + socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; default: - return -EPFNOSUPPORT; /* Not implemented. */ + ASSERT(0); } - if (af == AF_UNSPEC) { - if (!DUAL_STACK_SOCKETS || bind_addr) - return -EINVAL; - dual_stack = true; - af = AF_INET6; - } - - if (proto == IPPROTO_TCP) - fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto); - else - fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto); + fd = socket(af, socktype, proto); ret = -errno; if (fd < 0) { @@ -106,34 +101,21 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, ref.fd = fd; - if (af == AF_INET) { - if (bind_addr) - addr4.sin_addr = *(struct in_addr *)bind_addr; + if (v6only) + if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &y, sizeof(y))) + debug("Failed to set IPV6_V6ONLY on socket %i", fd); - sa = (const struct sockaddr *)&addr4; - sl = sizeof(addr4); - } else { - if (bind_addr) { - addr6.sin6_addr = *(struct in6_addr *)bind_addr; - - if (!memcmp(bind_addr, &c->ip6.addr_ll, - sizeof(c->ip6.addr_ll))) - addr6.sin6_scope_id = c->ifi6; - } + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y))) + debug("Failed to set SO_REUSEADDR on socket %i", fd); - sa = (const struct sockaddr *)&addr6; - sl = sizeof(addr6); + if (proto == IPPROTO_UDP) { + int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; + int opt = af == AF_INET ? 
IP_RECVERR : IPV6_RECVERR; - if (!dual_stack) - if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, - &y, sizeof(y))) - debug("Failed to set IPV6_V6ONLY on socket %i", - fd); + if (setsockopt(fd, level, opt, &y, sizeof(y))) + die_perror("Failed to set RECVERR on socket %i", fd); } - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y))) - debug("Failed to set SO_REUSEADDR on socket %i", fd); - if (ifname && *ifname) { /* Supported since kernel version 5.7, commit c427bfec18f2 * ("net: core: enable SO_BINDTODEVICE for non-root users"). If @@ -142,28 +124,43 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, */ if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname))) { + char str[SOCKADDR_STRLEN]; + ret = -errno; - warn("Can't bind %s socket for port %u to %s, closing", - EPOLL_TYPE_STR(proto), port, ifname); + warn("Can't bind %s socket for %s to %s, closing", + EPOLL_TYPE_STR(type), + sockaddr_ntop(sa, str, sizeof(str)), ifname); close(fd); return ret; } } + if (freebind) { + int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; + int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND; + + if (setsockopt(fd, level, opt, &y, sizeof(y))) { + err_perror("Failed to set %s on socket %i", + af == AF_INET ? "IP_FREEBIND" + : "IPV6_FREEBIND", + fd); + } + } + if (bind(fd, sa, sl) < 0) { /* We'll fail to bind to low ports if we don't have enough * capabilities, and we'll fail to bind on already bound ports, * this is fine. This might also fail for ICMP because of a * broken SELinux policy, see icmp_tap_handler(). 
*/ - if (proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6) { + if (type != EPOLL_TYPE_PING) { ret = -errno; close(fd); return ret; } } - if (proto == IPPROTO_TCP && listen(fd, 128) < 0) { + if (type == EPOLL_TYPE_TCP_LISTEN && listen(fd, 128) < 0) { ret = -errno; warn("TCP socket listen: %s", strerror(-ret)); close(fd); @@ -190,7 +187,8 @@ void sock_probe_mem(struct ctx *c) int v = INT_MAX / 2, s; socklen_t sl; - if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { c->low_wmem = c->low_rmem = 1; return; } @@ -210,23 +208,34 @@ void sock_probe_mem(struct ctx *c) close(s); } - /** - * timespec_diff_ms() - Report difference in milliseconds between two timestamps + * timespec_diff_us() - Report difference in microseconds between two timestamps * @a: Minuend timestamp * @b: Subtrahend timestamp * - * Return: difference in milliseconds + * Return: difference in microseconds (wraps after 2^63 / 10^6s ~= 292k years) */ -int timespec_diff_ms(const struct timespec *a, const struct timespec *b) +int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b) { if (a->tv_nsec < b->tv_nsec) { - return (b->tv_nsec - a->tv_nsec) / 1000000 + - (a->tv_sec - b->tv_sec - 1) * 1000; + return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 + + ((int64_t)a->tv_sec - b->tv_sec - 1) * 1000000; } - return (a->tv_nsec - b->tv_nsec) / 1000000 + - (a->tv_sec - b->tv_sec) * 1000; + return (a->tv_nsec - b->tv_nsec) / 1000 + + ((int64_t)a->tv_sec - b->tv_sec) * 1000000; +} + +/** + * timespec_diff_ms() - Report difference in milliseconds between two timestamps + * @a: Minuend timestamp + * @b: Subtrahend timestamp + * + * Return: difference in milliseconds + */ +long timespec_diff_ms(const struct timespec *a, const struct timespec *b) +{ + return timespec_diff_us(a, b) / 1000; } /** @@ -234,7 +243,7 @@ int timespec_diff_ms(const struct timespec *a, const struct timespec *b) * @map: Pointer to bitmap * @bit: Bit number 
to set */ -void bitmap_set(uint8_t *map, int bit) +void bitmap_set(uint8_t *map, unsigned bit) { unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit); @@ -246,7 +255,7 @@ void bitmap_set(uint8_t *map, int bit) * @map: Pointer to bitmap * @bit: Bit number to clear */ -void bitmap_clear(uint8_t *map, int bit) +void bitmap_clear(uint8_t *map, unsigned bit) { unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit); @@ -258,9 +267,9 @@ void bitmap_clear(uint8_t *map, int bit) * @map: Pointer to bitmap * @bit: Bit number to check * - * Return: one if given bit is set, zero if it's not + * Return: true if given bit is set, false if it's not */ -int bitmap_isset(const uint8_t *map, int bit) +bool bitmap_isset(const uint8_t *map, unsigned bit) { const unsigned long *word = (const unsigned long *)map + BITMAP_WORD(bit); @@ -300,7 +309,7 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b) void ns_enter(const struct ctx *c) { if (setns(c->pasta_netns_fd, CLONE_NEWNET)) - die("setns() failed entering netns: %s", strerror(errno)); + die_perror("setns() failed entering netns"); } /** @@ -315,10 +324,8 @@ bool ns_is_init(void) bool ret = true; int fd; - if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) { - die("Can't determine if we're in init namespace: %s", - strerror(errno)); - } + if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) + die_perror("Can't determine if we're in init namespace"); if (read(fd, buf, sizeof(root_uid_map)) != sizeof(root_uid_map) - 1 || strncmp(buf, root_uid_map, sizeof(root_uid_map))) @@ -382,11 +389,11 @@ int open_in_ns(const struct ctx *c, const char *path, int flags) } /** - * pid_file() - Write PID to file, if requested to do so, and close it + * pidfile_write() - Write PID to file, if requested to do so, and close it * @fd: Open PID file descriptor, closed on exit, -1 to skip writing it * @pid: PID value to write */ -void write_pidfile(int fd, pid_t pid) +void 
pidfile_write(int fd, pid_t pid) { char pid_buf[12]; int n; @@ -405,6 +412,23 @@ void write_pidfile(int fd, pid_t pid) } /** + * output_file_open() - Open file for output, if needed + * @path: Path for output file + * @flags: Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC + * + * Return: file descriptor on success, -1 on failure with errno set by open() + */ +int output_file_open(const char *path, int flags) +{ + /* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for + * it in the 'mode' argument if we have one + */ + return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags, + /* NOLINTNEXTLINE(android-cloexec-open) */ + S_IRUSR | S_IWUSR); +} + +/** * __daemon() - daemon()-like function writing PID file before parent exits * @pidfile_fd: Open PID file descriptor * @devnull_fd: Open file descriptor for /dev/null @@ -421,20 +445,15 @@ int __daemon(int pidfile_fd, int devnull_fd) } if (pid) { - write_pidfile(pidfile_fd, pid); + pidfile_write(pidfile_fd, pid); exit(EXIT_SUCCESS); } - errno = 0; - - setsid(); - - dup2(devnull_fd, STDIN_FILENO); - dup2(devnull_fd, STDOUT_FILENO); - dup2(devnull_fd, STDERR_FILENO); - close(devnull_fd); - - if (errno) + if (setsid() < 0 || + dup2(devnull_fd, STDIN_FILENO) < 0 || + dup2(devnull_fd, STDOUT_FILENO) < 0 || + dup2(devnull_fd, STDERR_FILENO) < 0 || + close(devnull_fd)) exit(EXIT_FAILURE); return 0; @@ -472,7 +491,7 @@ int write_file(const char *path, const char *buf) size_t len = strlen(buf); if (fd < 0) { - warn("Could not open %s: %s", path, strerror(errno)); + warn_perror("Could not open %s", path); return -1; } @@ -480,7 +499,7 @@ int write_file(const char *path, const char *buf) ssize_t rc = write(fd, buf, len); if (rc <= 0) { - warn("Couldn't write to %s: %s", path, strerror(errno)); + warn_perror("Couldn't write to %s", path); break; } @@ -522,6 +541,36 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, #endif } +/* write_all_buf() - write all of a buffer to an fd + 
* @fd: File descriptor + * @buf: Pointer to base of buffer + * @len: Length of buffer + * + * Return: 0 on success, -1 on error (with errno set) + * + * #syscalls write + */ +int write_all_buf(int fd, const void *buf, size_t len) +{ + const char *p = buf; + size_t left = len; + + while (left) { + ssize_t rc; + + do + rc = write(fd, p, left); + while ((rc < 0) && errno == EINTR); + + if (rc < 0) + return -1; + + p += rc; + left -= rc; + } + return 0; +} + /* write_remainder() - write the tail of an IO vector to an fd * @fd: File descriptor * @iov: IO vector @@ -530,27 +579,261 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, * * Return: 0 on success, -1 on error (with errno set) * - * #syscalls write writev + * #syscalls writev */ -int write_remainder(int fd, const struct iovec *iov, int iovcnt, size_t skip) +int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip) { - int i; + size_t i = 0, offset; - while ((i = iov_skip_bytes(iov, iovcnt, skip, &skip)) < iovcnt) { + while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) { ssize_t rc; - if (skip) { - rc = write(fd, (char *)iov[i].iov_base + skip, - iov[i].iov_len - skip); - } else { - rc = writev(fd, &iov[i], iovcnt - i); + if (offset) { + /* Write the remainder of the partially written buffer */ + if (write_all_buf(fd, (char *)iov[i].iov_base + offset, + iov[i].iov_len - offset) < 0) + return -1; + i++; } + /* Write as much of the remaining whole buffers as we can */ + rc = writev(fd, &iov[i], iovcnt - i); if (rc < 0) return -1; - skip += rc; + skip = rc; } - return 0; } + +/** sockaddr_ntop() - Convert a socket address to text format + * @sa: Socket address + * @dst: output buffer, minimum SOCKADDR_STRLEN bytes + * @size: size of buffer at @dst + * + * Return: On success, a non-null pointer to @dst, NULL on failure + */ +const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size) +{ + sa_family_t family = ((const struct 
sockaddr *)sa)->sa_family; + socklen_t off = 0; + +#define IPRINTF(...) \ + do { \ + off += snprintf(dst + off, size - off, __VA_ARGS__); \ + if (off >= size) \ + return NULL; \ + } while (0) + +#define INTOP(af, addr) \ + do { \ + if (!inet_ntop((af), (addr), dst + off, size - off)) \ + return NULL; \ + off += strlen(dst + off); \ + } while (0) + + switch (family) { + case AF_UNSPEC: + IPRINTF("<unspecified>"); + break; + + case AF_INET: { + const struct sockaddr_in *sa4 = sa; + + INTOP(AF_INET, &sa4->sin_addr); + IPRINTF(":%hu", ntohs(sa4->sin_port)); + break; + } + + case AF_INET6: { + const struct sockaddr_in6 *sa6 = sa; + + IPRINTF("["); + INTOP(AF_INET6, &sa6->sin6_addr); + IPRINTF("]:%hu", ntohs(sa6->sin6_port)); + break; + } + + /* FIXME: Implement AF_UNIX */ + default: + errno = EAFNOSUPPORT; + return NULL; + } + +#undef IPRINTF +#undef INTOP + + return dst; +} + +/** eth_ntop() - Convert an Ethernet MAC address to text format + * @mac: MAC address + * @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes + * @size: Size of buffer at @dst + * + * Return: On success, a non-null pointer to @dst, NULL on failure + */ +const char *eth_ntop(const unsigned char *mac, char *dst, size_t size) +{ + int len; + + len = snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + if (len < 0 || (size_t)len >= size) + return NULL; + + return dst; +} + +/** str_ee_origin() - Convert socket extended error origin to a string + * @ee: Socket extended error structure + * + * Return: Static string describing error origin + */ +const char *str_ee_origin(const struct sock_extended_err *ee) +{ + const char *const desc[] = { + [SO_EE_ORIGIN_NONE] = "<no origin>", + [SO_EE_ORIGIN_LOCAL] = "Local", + [SO_EE_ORIGIN_ICMP] = "ICMP", + [SO_EE_ORIGIN_ICMP6] = "ICMPv6", + }; + + if (ee->ee_origin < ARRAY_SIZE(desc)) + return desc[ee->ee_origin]; + + return "<invalid>"; +} + +/** + * close_open_files() - Close leaked files, but not --fd, stdin, 
stdout, stderr + * @argc: Argument count + * @argv: Command line options, as we need to skip any file given via --fd + */ +void close_open_files(int argc, char **argv) +{ + const struct option optfd[] = { { "fd", required_argument, NULL, 'F' }, + { 0 }, + }; + long fd = -1; + int name, rc; + + do { + name = getopt_long(argc, argv, "-:F:", optfd, NULL); + + if (name == 'F') { + errno = 0; + fd = strtol(optarg, NULL, 0); + + if (errno || fd <= STDERR_FILENO || fd > INT_MAX) + die("Invalid --fd: %s", optarg); + } + } while (name != -1); + + if (fd == -1) { + rc = close_range(STDERR_FILENO + 1, ~0U, CLOSE_RANGE_UNSHARE); + } else if (fd == STDERR_FILENO + 1) { /* Still a single range */ + rc = close_range(STDERR_FILENO + 2, ~0U, CLOSE_RANGE_UNSHARE); + } else { + rc = close_range(STDERR_FILENO + 1, fd - 1, + CLOSE_RANGE_UNSHARE); + if (!rc) + rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE); + } + + if (rc) { + if (errno == ENOSYS || errno == EINVAL) { + /* This probably means close_range() or the + * CLOSE_RANGE_UNSHARE flag is not supported by the + * kernel. Not much we can do here except carry on and + * hope for the best. + */ + warn( +"Can't use close_range() to ensure no files leaked by parent"); + } else { + die_perror("Failed to close files leaked by parent"); + } + } + +} + +/** + * snprintf_check() - snprintf() wrapper, checking for truncation and errors + * @str: Output buffer + * @size: Maximum size to write to @str + * @format: Message + * + * Return: false on success, true on truncation or error, sets errno on failure + */ +bool snprintf_check(char *str, size_t size, const char *format, ...) 
+{ + va_list ap; + int rc; + + va_start(ap, format); + rc = vsnprintf(str, size, format, ap); + va_end(ap); + + if (rc < 0) { + errno = EIO; + return true; + } + + if ((size_t)rc >= size) { + errno = ENOBUFS; + return true; + } + + return false; +} + +#define DEV_RANDOM "/dev/random" + +/** + * raw_random() - Get high quality random bytes + * @buf: Buffer to fill with random bytes + * @buflen: Number of bytes of random data to put in @buf + * + * Assumes that the random data is essential, and will die() if unable to obtain + * it. + */ +void raw_random(void *buf, size_t buflen) +{ + size_t random_read = 0; +#ifndef HAS_GETRANDOM + int fd = open(DEV_RANDOM, O_RDONLY); + + if (fd < 0) + die_perror("Couldn't open %s", DEV_RANDOM); +#endif + + while (random_read < buflen) { + ssize_t ret; + +#ifdef HAS_GETRANDOM + ret = getrandom((char *)buf + random_read, + buflen - random_read, GRND_RANDOM); +#else + ret = read(fd, (char *)buf + random_read, + buflen - random_read); +#endif + + if (ret == -1 && errno == EINTR) + continue; + + if (ret < 0) + die_perror("Error on random data source"); + + if (ret == 0) + break; + + random_read += ret; + } + +#ifndef HAS_GETRANDOM + close(fd); +#endif + + if (random_read < buflen) + die("Unexpected EOF on random data source"); +} |