about | git | code | bugs | lists | chat
path: root/util.c
diff options
context:
space:
mode:
Diffstat (limited to 'util.c')
-rw-r--r-- util.c 455
1 files changed, 406 insertions, 49 deletions
diff --git a/util.c b/util.c
index 4d51e04..2730395 100644
--- a/util.c
+++ b/util.c
@@ -18,7 +18,6 @@
#include <unistd.h>
#include <arpa/inet.h>
#include <net/ethernet.h>
-#include <sys/epoll.h>
#include <sys/uio.h>
#include <fcntl.h>
#include <string.h>
@@ -34,30 +33,32 @@
#include "passt.h"
#include "packet.h"
#include "log.h"
+#include "pcap.h"
+#include "epoll_ctl.h"
+#include "pasta.h"
#ifdef HAS_GETRANDOM
#include <sys/random.h>
#endif
+/* Zero-filled buffer to pad 802.3 frames, up to 60 (ETH_ZLEN) bytes */
+uint8_t eth_pad[ETH_ZLEN] = { 0 };
+
/**
- * sock_l4_sa() - Create and bind socket to socket address, add to epoll list
+ * sock_l4_() - Create and bind socket to socket address
* @c: Execution context
* @type: epoll type
* @sa: Socket address to bind to
- * @sl: Length of @sa
* @ifname: Interface for binding, NULL for any
- * @v6only: Set IPV6_V6ONLY socket option
- * @data: epoll reference portion for protocol handlers
+ * @v6only: If >= 0, set IPV6_V6ONLY socket option to this value
*
* Return: newly created socket, negative error code on failure
*/
-int sock_l4_sa(const struct ctx *c, enum epoll_type type,
- const void *sa, socklen_t sl,
- const char *ifname, bool v6only, uint32_t data)
+static int sock_l4_(const struct ctx *c, enum epoll_type type,
+ const union sockaddr_inany *sa, const char *ifname,
+ int v6only)
{
- sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
- union epoll_ref ref = { .type = type, .data = data };
+ sa_family_t af = sa->sa_family;
bool freebind = false;
- struct epoll_event ev;
int fd, y = 1, ret;
uint8_t proto;
int socktype;
@@ -69,9 +70,8 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
freebind = c->freebind;
break;
case EPOLL_TYPE_UDP_LISTEN:
+ case EPOLL_TYPE_UDP:
freebind = c->freebind;
- /* fallthrough */
- case EPOLL_TYPE_UDP_REPLY:
proto = IPPROTO_UDP;
socktype = SOCK_DGRAM | SOCK_NONBLOCK;
break;
@@ -99,21 +99,27 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return -EBADF;
}
- ref.fd = fd;
-
- if (v6only)
- if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &y, sizeof(y)))
- debug("Failed to set IPV6_V6ONLY on socket %i", fd);
+ if (v6only >= 0) {
+ if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY,
+ &v6only, sizeof(v6only))) {
+ debug("Failed to set IPV6_V6ONLY to %d on socket %i",
+ v6only, fd);
+ }
+ }
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)))
debug("Failed to set SO_REUSEADDR on socket %i", fd);
if (proto == IPPROTO_UDP) {
+ int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
+ int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
- int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
- if (setsockopt(fd, level, opt, &y, sizeof(y)))
+ if (setsockopt(fd, level, recverr, &y, sizeof(y)))
die_perror("Failed to set RECVERR on socket %i", fd);
+
+ if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
+ die_perror("Failed to set PKTINFO on socket %i", fd);
}
if (ifname && *ifname) {
@@ -127,9 +133,10 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
char str[SOCKADDR_STRLEN];
ret = -errno;
- warn("Can't bind %s socket for %s to %s, closing",
- EPOLL_TYPE_STR(proto),
- sockaddr_ntop(sa, str, sizeof(str)), ifname);
+ warn("SO_BINDTODEVICE %s failed for %s on %s: %s",
+ ifname, EPOLL_TYPE_STR(type),
+ sockaddr_ntop(sa, str, sizeof(str)),
+ strerror_(-ret));
close(fd);
return ret;
}
@@ -147,7 +154,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
}
}
- if (bind(fd, sa, sl) < 0) {
+ if (bind(fd, &sa->sa, socklen_inany(sa)) < 0) {
/* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports,
* this is fine. This might also fail for ICMP because of a
@@ -167,24 +174,130 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return ret;
}
- ev.events = EPOLLIN;
- ev.data.u64 = ref.u64;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
- ret = -errno;
- warn("L4 epoll_ctl: %s", strerror_(-ret));
- return ret;
+ return fd;
+}
+
+/**
+ * sock_l4() - Create and bind socket to given address
+ * @c: Execution context
+ * @type: epoll type
+ * @sa: Socket address to bind to
+ * @ifname: Interface for binding, NULL for any
+ *
+ * Return: newly created socket, negative error code on failure
+ */
+int sock_l4(const struct ctx *c, enum epoll_type type,
+ const union sockaddr_inany *sa, const char *ifname)
+{
+ int v6only = -1;
+
+ /* The option doesn't exist for IPv4 sockets, and we don't care about it
+ * for IPv6 sockets with a non-wildcard address.
+ */
+ if (sa->sa_family == AF_INET6 &&
+ IN6_IS_ADDR_UNSPECIFIED(&sa->sa6.sin6_addr))
+ v6only = 1;
+
+ return sock_l4_(c, type, sa, ifname, v6only);
+}
+
+/**
+ * sock_l4_dualstack_any() - Create dualstack socket bound to :: and 0.0.0.0
+ * @c: Execution context
+ * @type: epoll type
+ * @port Port to bind to (:: and 0.0.0.0)
+ * @ifname: Interface for binding, NULL for any
+ *
+ * Return: newly created socket, negative error code on failure
+ *
+ * A dual stack socket is effectively bound to both :: and 0.0.0.0.
+ */
+int sock_l4_dualstack_any(const struct ctx *c, enum epoll_type type,
+ in_port_t port, const char *ifname)
+{
+ union sockaddr_inany sa = {
+ .sa6.sin6_family = AF_INET6,
+ .sa6.sin6_addr = in6addr_any,
+ .sa6.sin6_port = htons(port),
+ };
+
+ /* Dual stack sockets require IPV6_V6ONLY == 0. Usually that's the
+ * default, but sysctl net.ipv6.bindv6only can change that, so set the
+ * sockopt explicitly.
+ */
+ return sock_l4_(c, type, &sa, ifname, 0);
+}
+
+/**
+ * sock_unix() - Create and bind AF_UNIX socket
+ * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
+ *
+ * Return: socket descriptor on success, won't return on failure
+ */
+int sock_unix(char *sock_path)
+{
+ int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ int i;
+
+ if (fd < 0)
+ die_perror("Failed to open UNIX domain socket");
+
+ for (i = 1; i < UNIX_SOCK_MAX; i++) {
+ char *path = addr.sun_path;
+ int ex, ret;
+
+ if (*sock_path)
+ memcpy(path, sock_path, UNIX_PATH_MAX);
+ else if (snprintf_check(path, UNIX_PATH_MAX - 1,
+ UNIX_SOCK_PATH, i))
+ die_perror("Can't build UNIX domain socket path");
+
+ ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+ 0);
+ if (ex < 0)
+ die_perror("Failed to check for UNIX domain conflicts");
+
+ ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
+ if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
+ errno != EACCES)) {
+ if (*sock_path)
+ die("Socket path %s already in use", path);
+
+ close(ex);
+ continue;
+ }
+ close(ex);
+
+ unlink(path);
+ ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
+ if (*sock_path && ret)
+ die_perror("Failed to bind UNIX domain socket");
+
+ if (!ret)
+ break;
}
+ if (i == UNIX_SOCK_MAX)
+ die_perror("Failed to bind UNIX domain socket");
+
+ info("UNIX domain socket bound at %s", addr.sun_path);
+ if (!*sock_path)
+ memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
+
return fd;
}
/**
- * sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
+ * sock_probe_features() - Probe for socket features we might use
* @c: Execution context
*/
-void sock_probe_mem(struct ctx *c)
+void sock_probe_features(struct ctx *c)
{
int v = INT_MAX / 2, s;
+ const char lo[] = "lo";
socklen_t sl;
s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
@@ -193,6 +306,7 @@ void sock_probe_mem(struct ctx *c)
return;
}
+ /* Check if setting high SO_SNDBUF and SO_RCVBUF is allowed */
sl = sizeof(v);
if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)) ||
getsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, &sl) ||
@@ -205,6 +319,19 @@ void sock_probe_mem(struct ctx *c)
(size_t)v < RCVBUF_BIG)
c->low_rmem = 1;
+ /* Check if SO_BINDTODEVICE is available
+ *
+ * Supported since kernel version 5.7, commit c427bfec18f2 ("net: core:
+ * enable SO_BINDTODEVICE for non-root users"). Some distro kernels may
+ * have backports, of course. Record whether we can use it so that we
+ * can give more useful diagnostics.
+ */
+ if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, lo, sizeof(lo) - 1)) {
+ if (errno != EPERM)
+ warn_perror("Unexpected error probing SO_BINDTODEVICE");
+ c->no_bindtodevice = 1;
+ }
+
close(s);
}
@@ -255,6 +382,7 @@ void bitmap_set(uint8_t *map, unsigned bit)
* @map: Pointer to bitmap
* @bit: Bit number to clear
*/
+/* cppcheck-suppress unusedFunction */
void bitmap_clear(uint8_t *map, unsigned bit)
{
unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit);
@@ -284,6 +412,7 @@ bool bitmap_isset(const uint8_t *map, unsigned bit)
* @a: First operand
* @b: Second operand
*/
+/* cppcheck-suppress unusedFunction */
void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b)
{
unsigned long *dw = (unsigned long *)dst;
@@ -298,7 +427,29 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b)
dst[i] = a[i] | b[i];
}
-/*
/**
 * bitmap_and_not() - Logical conjunction with complement (AND NOT) of bitmap
 * @dst:	Pointer to result bitmap
 * @size:	Size of bitmaps, in bytes
 * @a:		First operand
 * @b:		Second operand
 *
 * Every bit of @dst is set if the corresponding bit is set in @a and clear in
 * @b. The bitmaps don't need any particular alignment.
 */
void bitmap_and_not(uint8_t *dst, size_t size,
		    const uint8_t *a, const uint8_t *b)
{
	size_t i;

	/* Process one long at a time, but go through memcpy() instead of
	 * casting the byte pointers to unsigned long *: that would violate
	 * strict aliasing rules and might result in misaligned accesses.
	 * Compilers turn these fixed-size copies into plain loads and stores.
	 */
	for (i = 0; size - i >= sizeof(unsigned long);
	     i += sizeof(unsigned long)) {
		unsigned long aw, bw;

		memcpy(&aw, a + i, sizeof(aw));
		memcpy(&bw, b + i, sizeof(bw));
		aw &= ~bw;
		memcpy(dst + i, &aw, sizeof(aw));
	}

	/* Trailing bytes, if @size isn't a multiple of sizeof(long) */
	for (; i < size; i++)
		dst[i] = a[i] & ~b[i];
}
+
+/**
* ns_enter() - Enter configured user (unless already joined) and network ns
* @c: Execution context
*
@@ -405,7 +556,7 @@ void pidfile_write(int fd, pid_t pid)
if (write(fd, pid_buf, n) < 0) {
perror("PID file write");
- _exit(EXIT_FAILURE);
+ passt_exit(EXIT_FAILURE);
}
close(fd);
@@ -433,7 +584,8 @@ int output_file_open(const char *path, int flags)
* @pidfile_fd: Open PID file descriptor
* @devnull_fd: Open file descriptor for /dev/null
*
- * Return: child PID on success, won't return on failure
+ * Return: 0 in the child process on success. The parent process exits.
+ * Does not return in either process on failure (calls _exit).
*/
int __daemon(int pidfile_fd, int devnull_fd)
{
@@ -441,12 +593,12 @@ int __daemon(int pidfile_fd, int devnull_fd)
if (pid == -1) {
perror("fork");
- _exit(EXIT_FAILURE);
+ passt_exit(EXIT_FAILURE);
}
if (pid) {
pidfile_write(pidfile_fd, pid);
- _exit(EXIT_SUCCESS);
+ passt_exit(EXIT_SUCCESS);
}
if (setsid() < 0 ||
@@ -454,7 +606,7 @@ int __daemon(int pidfile_fd, int devnull_fd)
dup2(devnull_fd, STDOUT_FILENO) < 0 ||
dup2(devnull_fd, STDERR_FILENO) < 0 ||
close(devnull_fd))
- _exit(EXIT_FAILURE);
+ passt_exit(EXIT_FAILURE);
return 0;
}
@@ -463,6 +615,9 @@ int __daemon(int pidfile_fd, int devnull_fd)
* fls() - Find last (most significant) bit set in word
* @x: Word
*
+ * Note: unlike ffs() and other implementations of fls(), notably the one from
+ * the Linux kernel, the starting position is 0 and not 1, that is, fls(1) = 0.
+ *
* Return: position of most significant bit set, starting from 0, -1 if none
*/
int fls(unsigned long x)
@@ -479,6 +634,17 @@ int fls(unsigned long x)
}
/**
 * ilog2() - Floor of the base-2 logarithm of an unsigned integer
 * @x:		Argument
 *
 * Return: integral part of binary logarithm of @x, -1 if undefined (if @x is 0)
 */
int ilog2(unsigned long x)
{
	/* fls() numbers bits from 0 and returns -1 if no bit is set, so the
	 * position of the most significant bit set is the value we want
	 */
	int msb = fls(x);

	return msb;
}
+
+/**
* write_file() - Replace contents of file with a string
* @path: File to write
* @buf: String to write
@@ -511,6 +677,97 @@ int write_file(const char *path, const char *buf)
return len == 0 ? 0 : -1;
}
/**
 * read_file() - Read contents of file into a NUL-terminated buffer
 * @path:	Path to file to read
 * @buf:	Buffer to store file contents
 * @buf_size:	Size of buffer
 *
 * The contents need to fit in @buf together with the terminating NUL byte: if
 * the file fills the whole buffer, @buf is still terminated, but -ENOBUFS is
 * returned.
 *
 * Return: number of bytes read on success, negative error code on failure
 */
static ssize_t read_file(const char *path, char *buf, size_t buf_size)
{
	size_t filled;
	int fd;

	if (buf_size == 0)
		return -EINVAL;

	fd = open(path, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		return -errno;

	for (filled = 0; filled < buf_size; ) {
		ssize_t n = read(fd, buf + filled, buf_size - filled);

		if (n < 0) {
			/* close() might change errno, save it first */
			int err = errno;

			close(fd);
			return -err;
		}

		if (!n)		/* End of file */
			break;

		filled += n;
	}

	close(fd);

	/* No room left for the terminator: terminate anyway, report error */
	if (filled == buf_size) {
		buf[buf_size - 1] = '\0';
		return -ENOBUFS;
	}

	buf[filled] = '\0';

	return filled;
}
+
/**
 * read_file_integer() - Read an integer value from a file
 * @path:	Path to file to read
 * @fallback:	Default value if file can't be read
 *
 * The file needs to start with a base-10 integer, directly followed by the end
 * of the file or by a newline. Unreadable, empty, non-numeric or out-of-range
 * contents all result in @fallback being returned.
 *
 * Return: integer value, @fallback on failure
 */
intmax_t read_file_integer(const char *path, intmax_t fallback)
{
	ssize_t bytes_read;
	char buf[BUFSIZ];
	intmax_t value;
	char *end;

	bytes_read = read_file(path, buf, sizeof(buf));

	if (bytes_read < 0)
		goto error;

	if (bytes_read == 0) {
		debug("Empty file %s", path);
		goto error;
	}

	/* strtoimax() reports range errors via errno only, reset it first */
	errno = 0;
	value = strtoimax(buf, &end, 10);

	/* If no conversion was performed at all, strtoimax() returns 0 and
	 * leaves @end at the start of the buffer: without the explicit check,
	 * a file containing just a newline would be parsed as 0.
	 */
	if (end == buf || (*end && *end != '\n')) {
		debug("Non-numeric content in %s", path);
		goto error;
	}
	if (errno) {
		debug("Out of range value in %s: %s", path, buf);
		goto error;
	}

	return value;

error:
	debug("Couldn't read %s, using %"PRIdMAX" as default value",
	      path, fallback);
	return fallback;
}
+
#ifdef __ia64__
/* Needed by do_clone() below: glibc doesn't export the prototype of __clone2(),
* use the description from clone(2).
@@ -541,7 +798,8 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
#endif
}
-/* write_all_buf() - write all of a buffer to an fd
+/**
+ * write_all_buf() - write all of a buffer to an fd
* @fd: File descriptor
* @buf: Pointer to base of buffer
* @len: Length of buffer
@@ -571,7 +829,8 @@ int write_all_buf(int fd, const void *buf, size_t len)
return 0;
}
-/* write_remainder() - write the tail of an IO vector to an fd
+/**
+ * write_remainder() - write the tail of an IO vector to an fd
* @fd: File descriptor
* @iov: IO vector
* @iovcnt: Number of entries in @iov
@@ -695,7 +954,7 @@ int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip)
* @dst: output buffer, minimum SOCKADDR_STRLEN bytes
* @size: size of buffer at @dst
*
- * Return: On success, a non-null pointer to @dst, NULL on failure
+ * Return: on success, a non-null pointer to @dst, NULL on failure
*/
const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size)
{
@@ -755,7 +1014,7 @@ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size)
* @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes
* @size: Size of buffer at @dst
*
- * Return: On success, a non-null pointer to @dst, NULL on failure
+ * Return: on success, a non-null pointer to @dst, NULL on failure
*/
const char *eth_ntop(const unsigned char *mac, char *dst, size_t size)
{
@@ -772,7 +1031,7 @@ const char *eth_ntop(const unsigned char *mac, char *dst, size_t size)
/** str_ee_origin() - Convert socket extended error origin to a string
* @ee: Socket extended error structure
*
- * Return: Static string describing error origin
+ * Return: static string describing error origin
*/
const char *str_ee_origin(const struct sock_extended_err *ee)
{
@@ -809,7 +1068,9 @@ void close_open_files(int argc, char **argv)
errno = 0;
fd = strtol(optarg, NULL, 0);
- if (errno || fd <= STDERR_FILENO || fd > INT_MAX)
+ if (errno ||
+ (fd != STDIN_FILENO && fd <= STDERR_FILENO) ||
+ fd > INT_MAX)
die("Invalid --fd: %s", optarg);
}
} while (name != -1);
@@ -923,11 +1184,107 @@ void raw_random(void *buf, size_t buflen)
}
/**
 * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1
 * @buf:		Buffer to fill in with encoded domain name
 * @domain_name:	Input domain name string with terminator
 *
 * Each dot-separated label is written as a length byte followed by the label
 * itself, and the result is terminated by a zero byte.
 *
 * The buffer's 'buf' size has to be >= strlen(domain_name) + 2
 */
void encode_domain_name(char *buf, const char *domain_name)
{
	const char *label = domain_name;
	char *out = buf;

	for (;;) {
		size_t len = strcspn(label, ".");

		/* Length byte, then the label it describes */
		*out++ = len;
		memcpy(out, label, len);
		out += len;
		label += len;

		if (!*label)
			break;

		label++;	/* Skip the dot, on to the next label */
	}

	*out = 0;
}
+
/**
 * abort_with_msg() - Log a critical message, then abort
 * @fmt:	Format string
 * @...:	Format parameters
 */
void abort_with_msg(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vlogmsg(true, false, LOG_CRIT, fmt, args);
	va_end(args);

	/* Because of seccomp, this might actually result in a SIGSYS instead
	 * of a SIGABRT, but that still gets the job done.
	 */
	abort();
}
+
+/**
+ * passt_exit() - Perform vital cleanup and exit
+ *
+ * We don't use exit(3) because on some C library versions it can do unexpected
+ * things that hit our seccomp profile (e.g. futex() calls). This is a bespoke
+ * wrapper around _exit(2) performing just the cleanup that we need.
+ *
+ * #syscalls fsync
+ */
+void passt_exit(int status)
+{
+ /* Make sure we don't leave the pcap file truncated */
+ if (pcap_fd != -1 && fsync(pcap_fd))
+ warn_perror("Failed to flush pcap file, it might be truncated");
+
+ /* Make sure we don't leave an incomplete log */
+ if (log_file != -1)
+ (void)fsync(log_file);
+
+ /* Make sure we don't leave any messages incomplete */
+ (void)fflush(stderr);
+ (void)fflush(stdout);
+
+ _exit(status);
+}
+
/**
 * clamped_scale() - Scale @x from 100% to f% depending on @y's value
 * @x:		Value to scale
 * @y:		Value determining scaling
 * @lo:		Lower bound for @y (start of y-axis slope)
 * @hi:		Upper bound for @y (end of y-axis slope)
 * @f:		Scaling factor, percent (might be less or more than 100)
 *
 * Return: @x scaled by @f * linear interpolation of @y between @lo and @hi
 *
 * In pictures:
 *
 *             f % ->               ,----
 *                                 /|   |
 *                                / |   |
 *                               /  |   |
 * (100 + f) / 2 % ->           /   |   |
 *                             /|   |   |
 *                            / |   |   |
 *                           /  |   |   |
 *           100 % -> ------'   |   |   |
 *                     |    |   |   |   |
 *                    y0   lo  y1  hi  y2
 *
 * - If @y <= @lo (for example, @y is y0), return @x
 *
 * - If @lo < @y <= @hi (for example, @y is y1), return @x scaled by a factor
 *   linearly interpolated between 100% and f% depending on @y's position
 *   between @lo (100%) and @hi (f%)
 *
 * - If @y > @hi (for example, @y is y2), return @x * @f / 100
 *
 * Example: @f = 150, @lo = 10, @hi = 20, @y = 15, @x = 1000
 *          -> interpolated factor is 125%
 *          -> return 1250
 *
 * If @lo equals @hi, the slope degenerates into a step: @x is returned up to
 * and including @lo, @x * @f / 100 above it.
 */
long clamped_scale(long x, long y, long lo, long hi, long f)
{
	/* At @y == @lo the interpolation below would yield @x anyway, so
	 * include that case here: this way, @lo == @hi can never lead to a
	 * division by zero.
	 */
	if (y <= lo)
		return x;

	if (y > hi)
		return x * f / 100;

	/* Here lo < y <= hi, hence hi - lo is strictly positive */
	return x - (x * (y - lo) / (hi - lo)) * (100 - f) / 100;
}