diff options
Diffstat (limited to 'util.c')
| -rw-r--r-- | util.c | 455 |
1 files changed, 406 insertions, 49 deletions
@@ -18,7 +18,6 @@ #include <unistd.h> #include <arpa/inet.h> #include <net/ethernet.h> -#include <sys/epoll.h> #include <sys/uio.h> #include <fcntl.h> #include <string.h> @@ -34,30 +33,32 @@ #include "passt.h" #include "packet.h" #include "log.h" +#include "pcap.h" +#include "epoll_ctl.h" +#include "pasta.h" #ifdef HAS_GETRANDOM #include <sys/random.h> #endif +/* Zero-filled buffer to pad 802.3 frames, up to 60 (ETH_ZLEN) bytes */ +uint8_t eth_pad[ETH_ZLEN] = { 0 }; + /** - * sock_l4_sa() - Create and bind socket to socket address, add to epoll list + * sock_l4_() - Create and bind socket to socket address * @c: Execution context * @type: epoll type * @sa: Socket address to bind to - * @sl: Length of @sa * @ifname: Interface for binding, NULL for any - * @v6only: Set IPV6_V6ONLY socket option - * @data: epoll reference portion for protocol handlers + * @v6only: If >= 0, set IPV6_V6ONLY socket option to this value * * Return: newly created socket, negative error code on failure */ -int sock_l4_sa(const struct ctx *c, enum epoll_type type, - const void *sa, socklen_t sl, - const char *ifname, bool v6only, uint32_t data) +static int sock_l4_(const struct ctx *c, enum epoll_type type, + const union sockaddr_inany *sa, const char *ifname, + int v6only) { - sa_family_t af = ((const struct sockaddr *)sa)->sa_family; - union epoll_ref ref = { .type = type, .data = data }; + sa_family_t af = sa->sa_family; bool freebind = false; - struct epoll_event ev; int fd, y = 1, ret; uint8_t proto; int socktype; @@ -69,9 +70,8 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, freebind = c->freebind; break; case EPOLL_TYPE_UDP_LISTEN: + case EPOLL_TYPE_UDP: freebind = c->freebind; - /* fallthrough */ - case EPOLL_TYPE_UDP_REPLY: proto = IPPROTO_UDP; socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; @@ -99,21 +99,27 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, return -EBADF; } - ref.fd = fd; - - if (v6only) - if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &y, sizeof(y))) - debug("Failed to set IPV6_V6ONLY on socket %i", fd); + if (v6only >= 0) { + if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, + &v6only, sizeof(v6only))) { + debug("Failed to set IPV6_V6ONLY to %d on socket %i", + v6only, fd); + } + } if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y))) debug("Failed to set SO_REUSEADDR on socket %i", fd); if (proto == IPPROTO_UDP) { + int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO; + int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR; int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; - int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR; - if (setsockopt(fd, level, opt, &y, sizeof(y))) + if (setsockopt(fd, level, recverr, &y, sizeof(y))) die_perror("Failed to set RECVERR on socket %i", fd); + + if (setsockopt(fd, level, pktinfo, &y, sizeof(y))) + die_perror("Failed to set PKTINFO on socket %i", fd); } if (ifname && *ifname) { @@ -127,9 +133,10 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, char str[SOCKADDR_STRLEN]; ret = -errno; - warn("Can't bind %s socket for %s to %s, closing", - EPOLL_TYPE_STR(proto), - sockaddr_ntop(sa, str, sizeof(str)), ifname); + warn("SO_BINDTODEVICE %s failed for %s on %s: %s", + ifname, EPOLL_TYPE_STR(type), + sockaddr_ntop(sa, str, sizeof(str)), + strerror_(-ret)); close(fd); return ret; } @@ -147,7 +154,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, } } - if (bind(fd, sa, sl) < 0) { + if (bind(fd, &sa->sa, socklen_inany(sa)) < 0) { /* We'll fail to bind to low ports if we don't have enough * capabilities, and we'll fail to bind on already bound ports, * this is fine. This might also fail for ICMP because of a @@ -167,24 +174,130 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, return ret; } - ev.events = EPOLLIN; - ev.data.u64 = ref.u64; - if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) { - ret = -errno; - warn("L4 epoll_ctl: %s", strerror_(-ret)); - return ret; + return fd; +} + +/** + * sock_l4() - Create and bind socket to given address + * @c: Execution context + * @type: epoll type + * @sa: Socket address to bind to + * @ifname: Interface for binding, NULL for any + * + * Return: newly created socket, negative error code on failure + */ +int sock_l4(const struct ctx *c, enum epoll_type type, + const union sockaddr_inany *sa, const char *ifname) +{ + int v6only = -1; + + /* The option doesn't exist for IPv4 sockets, and we don't care about it + * for IPv6 sockets with a non-wildcard address. + */ + if (sa->sa_family == AF_INET6 && + IN6_IS_ADDR_UNSPECIFIED(&sa->sa6.sin6_addr)) + v6only = 1; + + return sock_l4_(c, type, sa, ifname, v6only); +} + +/** + * sock_l4_dualstack_any() - Create dualstack socket bound to :: and 0.0.0.0 + * @c: Execution context + * @type: epoll type + * @port Port to bind to (:: and 0.0.0.0) + * @ifname: Interface for binding, NULL for any + * + * Return: newly created socket, negative error code on failure + * + * A dual stack socket is effectively bound to both :: and 0.0.0.0. + */ +int sock_l4_dualstack_any(const struct ctx *c, enum epoll_type type, + in_port_t port, const char *ifname) +{ + union sockaddr_inany sa = { + .sa6.sin6_family = AF_INET6, + .sa6.sin6_addr = in6addr_any, + .sa6.sin6_port = htons(port), + }; + + /* Dual stack sockets require IPV6_V6ONLY == 0. Usually that's the + * default, but sysctl net.ipv6.bindv6only can change that, so set the + * sockopt explicitly. + */ + return sock_l4_(c, type, &sa, ifname, 0); +} + +/** + * sock_unix() - Create and bind AF_UNIX socket + * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix) + * + * Return: socket descriptor on success, won't return on failure + */ +int sock_unix(char *sock_path) +{ + int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + int i; + + if (fd < 0) + die_perror("Failed to open UNIX domain socket"); + + for (i = 1; i < UNIX_SOCK_MAX; i++) { + char *path = addr.sun_path; + int ex, ret; + + if (*sock_path) + memcpy(path, sock_path, UNIX_PATH_MAX); + else if (snprintf_check(path, UNIX_PATH_MAX - 1, + UNIX_SOCK_PATH, i)) + die_perror("Can't build UNIX domain socket path"); + + ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, + 0); + if (ex < 0) + die_perror("Failed to check for UNIX domain conflicts"); + + ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); + if (!ret || (errno != ENOENT && errno != ECONNREFUSED && + errno != EACCES)) { + if (*sock_path) + die("Socket path %s already in use", path); + + close(ex); + continue; + } + close(ex); + + unlink(path); + ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr)); + if (*sock_path && ret) + die_perror("Failed to bind UNIX domain socket"); + + if (!ret) + break; } + if (i == UNIX_SOCK_MAX) + die_perror("Failed to bind UNIX domain socket"); + + info("UNIX domain socket bound at %s", addr.sun_path); + if (!*sock_path) + memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX); + return fd; } /** - * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed + * sock_probe_features() - Probe for socket features we might use * @c: Execution context */ -void sock_probe_mem(struct ctx *c) +void sock_probe_features(struct ctx *c) { int v = INT_MAX / 2, s; + const char lo[] = "lo"; socklen_t sl; s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); @@ -193,6 +306,7 @@ void sock_probe_mem(struct ctx *c) return; } + /* Check if setting high SO_SNDBUF and SO_RCVBUF is allowed */ sl = sizeof(v); if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)) || getsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, &sl) || @@ -205,6 +319,19 @@ void sock_probe_mem(struct ctx *c) (size_t)v < RCVBUF_BIG) c->low_rmem = 1; + /* Check if SO_BINDTODEVICE is available + * + * Supported since kernel version 5.7, commit c427bfec18f2 ("net: core: + * enable SO_BINDTODEVICE for non-root users"). Some distro kernels may + * have backports, of course. Record whether we can use it so that we + * can give more useful diagnostics. + */ + if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, lo, sizeof(lo) - 1)) { + if (errno != EPERM) + warn_perror("Unexpected error probing SO_BINDTODEVICE"); + c->no_bindtodevice = 1; + } + close(s); } @@ -255,6 +382,7 @@ void bitmap_set(uint8_t *map, unsigned bit) * @map: Pointer to bitmap * @bit: Bit number to clear */ +/* cppcheck-suppress unusedFunction */ void bitmap_clear(uint8_t *map, unsigned bit) { unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit); @@ -284,6 +412,7 @@ bool bitmap_isset(const uint8_t *map, unsigned bit) * @a: First operand * @b: Second operand */ +/* cppcheck-suppress unusedFunction */ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b) { unsigned long *dw = (unsigned long *)dst; @@ -298,7 +427,29 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b) dst[i] = a[i] | b[i]; } -/* +/** + * bitmap_and_not() - Logical conjunction with complement (AND NOT) of bitmap + * @dst: Pointer to result bitmap + * @size: Size of bitmaps, in bytes + * @a: First operand + * @b: Second operand + */ +void bitmap_and_not(uint8_t *dst, size_t size, + const uint8_t *a, const uint8_t *b) +{ + unsigned long *dw = (unsigned long *)dst; + unsigned long *aw = (unsigned long *)a; + unsigned long *bw = (unsigned long *)b; + size_t i; + + for (i = 0; i < size / sizeof(long); i++, dw++, aw++, bw++) + *dw = *aw & ~*bw; + + for (i = size / sizeof(long) * sizeof(long); i < size; i++) + dst[i] = a[i] & ~b[i]; +} + +/** * ns_enter() - Enter configured user (unless already joined) and network ns * @c: Execution context * @@ -405,7 +556,7 @@ void pidfile_write(int fd, pid_t pid) if (write(fd, pid_buf, n) < 0) { perror("PID file write"); - _exit(EXIT_FAILURE); + passt_exit(EXIT_FAILURE); } close(fd); @@ -433,7 +584,8 @@ int output_file_open(const char *path, int flags) * @pidfile_fd: Open PID file descriptor * @devnull_fd: Open file descriptor for /dev/null * - * Return: child PID on success, won't return on failure + * Return: 0 in the child process on success. The parent process exits. + * Does not return in either process on failure (calls _exit). */ int __daemon(int pidfile_fd, int devnull_fd) { @@ -441,12 +593,12 @@ int __daemon(int pidfile_fd, int devnull_fd) if (pid == -1) { perror("fork"); - _exit(EXIT_FAILURE); + passt_exit(EXIT_FAILURE); } if (pid) { pidfile_write(pidfile_fd, pid); - _exit(EXIT_SUCCESS); + passt_exit(EXIT_SUCCESS); } if (setsid() < 0 || @@ -454,7 +606,7 @@ int __daemon(int pidfile_fd, int devnull_fd) dup2(devnull_fd, STDOUT_FILENO) < 0 || dup2(devnull_fd, STDERR_FILENO) < 0 || close(devnull_fd)) - _exit(EXIT_FAILURE); + passt_exit(EXIT_FAILURE); return 0; } @@ -463,6 +615,9 @@ int __daemon(int pidfile_fd, int devnull_fd) * fls() - Find last (most significant) bit set in word * @x: Word * + * Note: unlike ffs() and other implementations of fls(), notably the one from + * the Linux kernel, the starting position is 0 and not 1, that is, fls(1) = 0. + * * Return: position of most significant bit set, starting from 0, -1 if none */ int fls(unsigned long x) @@ -479,6 +634,17 @@ int fls(unsigned long x) } /** + * ilog2() - Integral part (floor) of binary logarithm (logarithm to the base 2) + * @x: Argument + * + * Return: integral part of binary logarithm of @x, -1 if undefined (if @x is 0) + */ +int ilog2(unsigned long x) +{ + return fls(x); +} + +/** * write_file() - Replace contents of file with a string * @path: File to write * @buf: String to write @@ -511,6 +677,97 @@ int write_file(const char *path, const char *buf) return len == 0 ? 0 : -1; } +/** + * read_file() - Read contents of file into a NULL-terminated buffer + * @path: Path to file to read + * @buf: Buffer to store file contents + * @buf_size: Size of buffer + * + * Return: number of bytes read on success, negative error code on failure + */ +static ssize_t read_file(const char *path, char *buf, size_t buf_size) +{ + size_t total_read = 0; + int fd; + + if (!buf_size) + return -EINVAL; + + fd = open(path, O_RDONLY | O_CLOEXEC); + + if (fd < 0) + return -errno; + + while (total_read < buf_size) { + ssize_t rc = read(fd, buf + total_read, buf_size - total_read); + + if (rc < 0) { + int errno_save = errno; + close(fd); + return -errno_save; + } + + if (rc == 0) + break; + + total_read += rc; + } + + close(fd); + + if (total_read == buf_size) { + buf[buf_size - 1] = '\0'; + return -ENOBUFS; + } + + buf[total_read] = '\0'; + + return total_read; +} + +/** + * read_file_integer() - Read an integer value from a file + * @path: Path to file to read + * @fallback: Default value if file can't be read + * + * Return: integer value, @fallback on failure + */ +intmax_t read_file_integer(const char *path, intmax_t fallback) +{ + ssize_t bytes_read; + char buf[BUFSIZ]; + intmax_t value; + char *end; + + bytes_read = read_file(path, buf, sizeof(buf)); + + if (bytes_read < 0) + goto error; + + if (bytes_read == 0) { + debug("Empty file %s", path); + goto error; + } + + errno = 0; + value = strtoimax(buf, &end, 10); + if (*end && *end != '\n') { + debug("Non-numeric content in %s", path); + goto error; + } + if (errno) { + debug("Out of range value in %s: %s", path, buf); + goto error; + } + + return value; + +error: + debug("Couldn't read %s, using %"PRIdMAX" as default value", + path, fallback); + return fallback; +} + #ifdef __ia64__ /* Needed by do_clone() below: glibc doesn't export the prototype of __clone2(), * use the description from clone(2). @@ -541,7 +798,8 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, #endif } -/* write_all_buf() - write all of a buffer to an fd +/** + * write_all_buf() - write all of a buffer to an fd * @fd: File descriptor * @buf: Pointer to base of buffer * @len: Length of buffer @@ -571,7 +829,8 @@ int write_all_buf(int fd, const void *buf, size_t len) return 0; } -/* write_remainder() - write the tail of an IO vector to an fd +/** + * write_remainder() - write the tail of an IO vector to an fd * @fd: File descriptor * @iov: IO vector * @iovcnt: Number of entries in @iov @@ -695,7 +954,7 @@ int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip) * @dst: output buffer, minimum SOCKADDR_STRLEN bytes * @size: size of buffer at @dst * - * Return: On success, a non-null pointer to @dst, NULL on failure + * Return: on success, a non-null pointer to @dst, NULL on failure */ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size) { @@ -755,7 +1014,7 @@ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size) * @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes * @size: Size of buffer at @dst * - * Return: On success, a non-null pointer to @dst, NULL on failure + * Return: on success, a non-null pointer to @dst, NULL on failure */ const char *eth_ntop(const unsigned char *mac, char *dst, size_t size) { @@ -772,7 +1031,7 @@ const char *eth_ntop(const unsigned char *mac, char *dst, size_t size) /** str_ee_origin() - Convert socket extended error origin to a string * @ee: Socket extended error structure * - * Return: Static string describing error origin + * Return: static string describing error origin */ const char *str_ee_origin(const struct sock_extended_err *ee) { @@ -809,7 +1068,9 @@ void close_open_files(int argc, char **argv) errno = 0; fd = strtol(optarg, NULL, 0); - if (errno || fd <= STDERR_FILENO || fd > INT_MAX) + if (errno || + (fd != STDIN_FILENO && fd <= STDERR_FILENO) || + fd > INT_MAX) die("Invalid --fd: %s", optarg); } } while (name != -1); @@ -923,11 +1184,107 @@ void raw_random(void *buf, size_t buflen) } /** - * epoll_del() - Remove a file descriptor from our passt epoll - * @c: Execution context - * @fd: File descriptor to remove + * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1 + * @buf: Buffer to fill in with encoded domain name + * @domain_name: Input domain name string with terminator + * + * The buffer's 'buf' size has to be >= strlen(domain_name) + 2 + */ +void encode_domain_name(char *buf, const char *domain_name) +{ + size_t i; + char *p; + + buf[0] = strcspn(domain_name, "."); + p = buf + 1; + for (i = 0; domain_name[i]; i++) { + if (domain_name[i] == '.') + p[i] = strcspn(domain_name + i + 1, "."); + else + p[i] = domain_name[i]; + } + p[i] = 0L; +} + +/** + * abort_with_msg() - Print error message and abort + * @fmt: Format string + * @...: Format parameters */ -void epoll_del(const struct ctx *c, int fd) +void abort_with_msg(const char *fmt, ...) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); + va_list ap; + + va_start(ap, fmt); + vlogmsg(true, false, LOG_CRIT, fmt, ap); + va_end(ap); + + /* This may actually cause a SIGSYS instead of SIGABRT, due to seccomp, + * but that will still get the job done. + */ + abort(); +} + +/** + * passt_exit() - Perform vital cleanup and exit + * + * We don't use exit(3) because on some C library versions it can do unexpected + * things that hit our seccomp profile (e.g. futex() calls). This is a bespoke + * wrapper around _exit(2) performing just the cleanup that we need. + * + * #syscalls fsync + */ +void passt_exit(int status) +{ + /* Make sure we don't leave the pcap file truncated */ + if (pcap_fd != -1 && fsync(pcap_fd)) + warn_perror("Failed to flush pcap file, it might be truncated"); + + /* Make sure we don't leave an incomplete log */ + if (log_file != -1) + (void)fsync(log_file); + + /* Make sure we don't leave any messages incomplete */ + (void)fflush(stderr); + (void)fflush(stdout); + + _exit(status); +} + +/** + * clamped_scale() - Scale @x from 100% to f% depending on @y's value + * @x: Value to scale + * @y: Value determining scaling + * @lo: Lower bound for @y (start of y-axis slope) + * @hi: Upper bound for @y (end of y-axis slope) + * @f: Scaling factor, percent (might be less or more than 100) + * + * Return: @x scaled by @f * linear interpolation of @y between @lo and @hi + * + * In pictures: + * + * f % -> ,---- * If @y < lo (for example, @y is y0), return @x + * /| | + * / | | * If @lo < @y < @hi (for example, @y is y1), + * / | | return @x scaled by a factor linearly + * (100 + f) / 2 % ->/ | | interpolated between 100% and f% depending on + * /| | | @y's position between @lo (100%) and @hi (f%) + * / | | | + * / | | | * If @y > @hi (for example, @y is y2), return + * 100 % -> -----' | | | @x * @f / 100 + * | | | | | + * y0 lo y1 hi y2 Example: @f = 150, @lo = 10, @hi = 20, @y = 15, + * @x = 1000 + * -> interpolated factor is 125% + * -> return 1250 + */ +long clamped_scale(long x, long y, long lo, long hi, long f) +{ + if (y < lo) + return x; + + if (y > hi) + return x * f / 100; + + return x - (x * (y - lo) / (hi - lo)) * (100 - f) / 100; } |
