aboutgitcodebugslistschat
path: root/util.c
diff options
context:
space:
mode:
Diffstat (limited to 'util.c')
-rw-r--r--util.c508
1 files changed, 409 insertions, 99 deletions
diff --git a/util.c b/util.c
index 0b41404..7b245cc 100644
--- a/util.c
+++ b/util.c
@@ -28,11 +28,15 @@
#include <linux/errqueue.h>
#include <getopt.h>
+#include "linux_dep.h"
#include "util.h"
#include "iov.h"
#include "passt.h"
#include "packet.h"
#include "log.h"
+#ifdef HAS_GETRANDOM
+#include <sys/random.h>
+#endif
/**
* sock_l4_sa() - Create and bind socket to socket address, add to epoll list
@@ -52,6 +56,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
{
sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
union epoll_ref ref = { .type = type, .data = data };
+ bool freebind = false;
struct epoll_event ev;
int fd, y = 1, ret;
uint8_t proto;
@@ -61,9 +66,12 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
case EPOLL_TYPE_TCP_LISTEN:
proto = IPPROTO_TCP;
socktype = SOCK_STREAM | SOCK_NONBLOCK;
+ freebind = c->freebind;
break;
case EPOLL_TYPE_UDP_LISTEN:
- case EPOLL_TYPE_UDP_REPLY:
+ freebind = c->freebind;
+ /* fallthrough */
+ case EPOLL_TYPE_UDP:
proto = IPPROTO_UDP;
socktype = SOCK_DGRAM | SOCK_NONBLOCK;
break;
@@ -82,7 +90,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
ret = -errno;
if (fd < 0) {
- warn("L4 socket: %s", strerror(-ret));
+ warn("L4 socket: %s", strerror_(-ret));
return ret;
}
@@ -101,11 +109,15 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
debug("Failed to set SO_REUSEADDR on socket %i", fd);
if (proto == IPPROTO_UDP) {
+ int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
+ int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
- int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
- if (setsockopt(fd, level, opt, &y, sizeof(y)))
+ if (setsockopt(fd, level, recverr, &y, sizeof(y)))
die_perror("Failed to set RECVERR on socket %i", fd);
+
+ if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
+ die_perror("Failed to set PKTINFO on socket %i", fd);
}
if (ifname && *ifname) {
@@ -127,6 +139,18 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
}
}
+ if (freebind) {
+ int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
+ int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND;
+
+ if (setsockopt(fd, level, opt, &y, sizeof(y))) {
+ err_perror("Failed to set %s on socket %i",
+ af == AF_INET ? "IP_FREEBIND"
+ : "IPV6_FREEBIND",
+ fd);
+ }
+ }
+
if (bind(fd, sa, sl) < 0) {
/* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports,
@@ -142,7 +166,7 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
if (type == EPOLL_TYPE_TCP_LISTEN && listen(fd, 128) < 0) {
ret = -errno;
- warn("TCP socket listen: %s", strerror(-ret));
+ warn("TCP socket listen: %s", strerror_(-ret));
close(fd);
return ret;
}
@@ -151,64 +175,73 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
ev.data.u64 = ref.u64;
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
ret = -errno;
- warn("L4 epoll_ctl: %s", strerror(-ret));
+ warn("L4 epoll_ctl: %s", strerror_(-ret));
return ret;
}
return fd;
}
+
/**
- * sock_l4() - Create and bind socket for given L4, add to epoll list
- * @c: Execution context
- * @af: Address family, AF_INET or AF_INET6
- * @type: epoll type
- * @bind_addr: Address for binding, NULL for any
- * @ifname: Interface for binding, NULL for any
- * @port: Port, host order
- * @data: epoll reference portion for protocol handlers
+ * sock_unix() - Create and bind AF_UNIX socket
+ * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
*
- * Return: newly created socket, negative error code on failure
+ * Return: socket descriptor on success, won't return on failure
*/
-int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type,
- const void *bind_addr, const char *ifname, uint16_t port,
- uint32_t data)
+int sock_unix(char *sock_path)
{
- switch (af) {
- case AF_INET: {
- struct sockaddr_in addr4 = {
- .sin_family = AF_INET,
- .sin_port = htons(port),
- { 0 }, { 0 },
- };
- if (bind_addr)
- addr4.sin_addr = *(struct in_addr *)bind_addr;
- return sock_l4_sa(c, type, &addr4, sizeof(addr4), ifname,
- false, data);
- }
-
- case AF_UNSPEC:
- if (!DUAL_STACK_SOCKETS || bind_addr)
- return -EINVAL;
- /* fallthrough */
- case AF_INET6: {
- struct sockaddr_in6 addr6 = {
- .sin6_family = AF_INET6,
- .sin6_port = htons(port),
- 0, IN6ADDR_ANY_INIT, 0,
- };
- if (bind_addr) {
- addr6.sin6_addr = *(struct in6_addr *)bind_addr;
-
- if (!memcmp(bind_addr, &c->ip6.addr_ll,
- sizeof(c->ip6.addr_ll)))
- addr6.sin6_scope_id = c->ifi6;
+ int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ int i;
+
+ if (fd < 0)
+ die_perror("Failed to open UNIX domain socket");
+
+ for (i = 1; i < UNIX_SOCK_MAX; i++) {
+ char *path = addr.sun_path;
+ int ex, ret;
+
+ if (*sock_path)
+ memcpy(path, sock_path, UNIX_PATH_MAX);
+ else if (snprintf_check(path, UNIX_PATH_MAX - 1,
+ UNIX_SOCK_PATH, i))
+ die_perror("Can't build UNIX domain socket path");
+
+ ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+ 0);
+ if (ex < 0)
+ die_perror("Failed to check for UNIX domain conflicts");
+
+ ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
+ if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
+ errno != EACCES)) {
+ if (*sock_path)
+ die("Socket path %s already in use", path);
+
+ close(ex);
+ continue;
}
- return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname,
- af == AF_INET6, data);
- }
- default:
- return -EINVAL;
+ close(ex);
+
+ unlink(path);
+ ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
+ if (*sock_path && ret)
+ die_perror("Failed to bind UNIX domain socket");
+
+ if (!ret)
+ break;
}
+
+ if (i == UNIX_SOCK_MAX)
+ die_perror("Failed to bind UNIX domain socket");
+
+ info("UNIX domain socket bound at %s", addr.sun_path);
+ if (!*sock_path)
+ memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
+
+ return fd;
}
/**
@@ -220,7 +253,8 @@ void sock_probe_mem(struct ctx *c)
int v = INT_MAX / 2, s;
socklen_t sl;
- if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
+ s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ if (s < 0) {
c->low_wmem = c->low_rmem = 1;
return;
}
@@ -250,7 +284,7 @@ void sock_probe_mem(struct ctx *c)
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b)
{
if (a->tv_nsec < b->tv_nsec) {
- return (b->tv_nsec - a->tv_nsec) / 1000 +
+ return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 +
(a->tv_sec - b->tv_sec - 1) * 1000000;
}
@@ -330,7 +364,7 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b)
dst[i] = a[i] | b[i];
}
-/*
+/**
* ns_enter() - Enter configured user (unless already joined) and network ns
* @c: Execution context
*
@@ -437,32 +471,27 @@ void pidfile_write(int fd, pid_t pid)
if (write(fd, pid_buf, n) < 0) {
perror("PID file write");
- exit(EXIT_FAILURE);
+ _exit(EXIT_FAILURE);
}
close(fd);
}
/**
- * pidfile_open() - Open PID file if needed
- * @path: Path for PID file, empty string if no PID file is requested
+ * output_file_open() - Open file for output, if needed
+ * @path: Path for output file
+ * @flags: Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC
*
- * Return: descriptor for PID file, -1 if path is NULL, won't return on failure
+ * Return: file descriptor on success, -1 on failure with errno set by open()
*/
-int pidfile_open(const char *path)
+int output_file_open(const char *path, int flags)
{
- int fd;
-
- if (!*path)
- return -1;
-
- if ((fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_CLOEXEC,
- S_IRUSR | S_IWUSR)) < 0) {
- perror("PID file open");
- exit(EXIT_FAILURE);
- }
-
- return fd;
+ /* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for
+ * it in the 'mode' argument if we have one
+ */
+ return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags,
+ /* NOLINTNEXTLINE(android-cloexec-open) */
+ S_IRUSR | S_IWUSR);
}
/**
@@ -470,7 +499,8 @@ int pidfile_open(const char *path)
* @pidfile_fd: Open PID file descriptor
* @devnull_fd: Open file descriptor for /dev/null
*
- * Return: child PID on success, won't return on failure
+ * Return: 0 in the child process on success. The parent process exits.
+ * Does not return in either process on failure (calls _exit).
*/
int __daemon(int pidfile_fd, int devnull_fd)
{
@@ -478,25 +508,20 @@ int __daemon(int pidfile_fd, int devnull_fd)
if (pid == -1) {
perror("fork");
- exit(EXIT_FAILURE);
+ _exit(EXIT_FAILURE);
}
if (pid) {
pidfile_write(pidfile_fd, pid);
- exit(EXIT_SUCCESS);
+ _exit(EXIT_SUCCESS);
}
- errno = 0;
-
- setsid();
-
- dup2(devnull_fd, STDIN_FILENO);
- dup2(devnull_fd, STDOUT_FILENO);
- dup2(devnull_fd, STDERR_FILENO);
- close(devnull_fd);
-
- if (errno)
- exit(EXIT_FAILURE);
+ if (setsid() < 0 ||
+ dup2(devnull_fd, STDIN_FILENO) < 0 ||
+ dup2(devnull_fd, STDOUT_FILENO) < 0 ||
+ dup2(devnull_fd, STDERR_FILENO) < 0 ||
+ close(devnull_fd))
+ _exit(EXIT_FAILURE);
return 0;
}
@@ -583,7 +608,39 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
#endif
}
-/* write_remainder() - write the tail of an IO vector to an fd
+/**
+ * write_all_buf() - Write an entire buffer to a file descriptor
+ * @fd:		File descriptor
+ * @buf:	Pointer to base of buffer
+ * @len:	Length of buffer
+ *
+ * Calls write() repeatedly until all @len bytes have been written, so that
+ * short writes are completed. Calls interrupted by a signal (EINTR) are
+ * retried.
+ *
+ * Return: 0 on success, -1 on error (with errno set)
+ *
+ * #syscalls write
+ */
+int write_all_buf(int fd, const void *buf, size_t len)
+{
+	const char *base = buf;
+	size_t done = 0;
+
+	while (done < len) {
+		ssize_t n = write(fd, base + done, len - done);
+
+		if (n < 0) {
+			if (errno == EINTR)
+				continue;	/* Interrupted: same call again */
+
+			return -1;
+		}
+
+		done += n;
+	}
+
+	return 0;
+}
+
+/**
+ * write_remainder() - write the tail of an IO vector to an fd
* @fd: File descriptor
* @iov: IO vector
* @iovcnt: Number of entries in @iov
@@ -591,28 +648,114 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
*
* Return: 0 on success, -1 on error (with errno set)
*
- * #syscalls write writev
+ * #syscalls writev
*/
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
{
- size_t offset, i;
+ size_t i = 0, offset;
- while ((i = iov_skip_bytes(iov, iovcnt, skip, &offset)) < iovcnt) {
+ while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) {
ssize_t rc;
if (offset) {
- rc = write(fd, (char *)iov[i].iov_base + offset,
- iov[i].iov_len - offset);
- } else {
- rc = writev(fd, &iov[i], iovcnt - i);
+ /* Write the remainder of the partially written buffer */
+ if (write_all_buf(fd, (char *)iov[i].iov_base + offset,
+ iov[i].iov_len - offset) < 0)
+ return -1;
+ i++;
}
+ /* Write as much of the remaining whole buffers as we can */
+ rc = writev(fd, &iov[i], iovcnt - i);
if (rc < 0)
return -1;
- skip += rc;
+ skip = rc;
}
+ return 0;
+}
+
+/**
+ * read_all_buf() - Fill a whole buffer from a file descriptor
+ * @fd:		File descriptor
+ * @buf:	Pointer to base of buffer
+ * @len:	Length of buffer
+ *
+ * Calls read() repeatedly until @len bytes have been stored in @buf. Calls
+ * interrupted by a signal (EINTR) are retried. Reaching end-of-file before the
+ * buffer is full is reported as an error, with errno set to ENODATA.
+ *
+ * Return: 0 on success, -1 on error (with errno set)
+ *
+ * #syscalls read
+ */
+int read_all_buf(int fd, void *buf, size_t len)
+{
+	char *cursor = buf;
+	size_t left;
+
+	for (left = len; left; ) {
+		ssize_t n;
+
+		ASSERT(left <= len);
+
+		n = read(fd, cursor, left);
+		if (n < 0) {
+			if (errno == EINTR)
+				continue;	/* Interrupted: same call again */
+
+			return -1;
+		}
+
+		if (!n) {
+			/* End-of-file with data still missing */
+			errno = ENODATA;
+			return -1;
+		}
+
+		cursor += n;
+		left -= n;
+	}
+
+	return 0;
+}
+
+/**
+ * read_remainder() - Read the tail of an IO vector from a file descriptor
+ * @fd: File descriptor
+ * @iov: IO vector
+ * @cnt: Number of entries in @iov
+ * @skip: Number of bytes of the vector to skip reading
+ *
+ * Loops until every buffer of @iov past the first @skip bytes has been filled.
+ *
+ * Return: 0 on success, -1 on error (with errno set). If end-of-file is reached
+ * before the vector is filled, errno is set to ENODATA.
+ *
+ * Note: mode-specific seccomp profiles need to enable readv() to use this.
+ */
+/* cppcheck-suppress unusedFunction */
+int read_remainder(int fd, const struct iovec *iov, size_t cnt, size_t skip)
+{
+ size_t i = 0, offset;
+
+ /* iov_skip_bytes() is handed the part of the vector starting at entry i,
+ * and its result is added to i. Presumably it returns the number of
+ * entries wholly covered by @skip and sets offset to the position
+ * reached inside the next one (to be confirmed against iov.c).
+ */
+ while ((i += iov_skip_bytes(iov + i, cnt - i, skip, &offset)) < cnt) {
+ ssize_t rc;
+ if (offset) {
+ ASSERT(offset < iov[i].iov_len);
+ /* Read the remainder of the partially read buffer */
+ if (read_all_buf(fd, (char *)iov[i].iov_base + offset,
+ iov[i].iov_len - offset) < 0)
+ return -1;
+ i++;
+ }
+
+ /* The buffer we just completed was the last one: don't call readv()
+ * with an empty vector
+ */
+ if (cnt == i)
+ break;
+
+ /* Fill as many of the remaining buffers as we can */
+ rc = readv(fd, &iov[i], cnt - i);
+ if (rc < 0)
+ return -1;
+
+ /* readv() returned no data while buffers are still unfilled */
+ if (rc == 0) {
+ errno = ENODATA;
+ return -1;
+ }
+
+ /* On the next pass, skip what readv() just filled, counting from
+ * entry i
+ */
+ skip = rc;
+ }
return 0;
}
@@ -676,6 +819,25 @@ const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size)
return dst;
}
+/** eth_ntop() - Format an Ethernet MAC address as text
+ * @mac:	MAC address (six bytes are read)
+ * @dst:	Output buffer, minimum ETH_ADDRSTRLEN bytes
+ * @size:	Size of buffer at @dst
+ *
+ * The address is printed as six lowercase, zero-padded hexadecimal bytes
+ * separated by colons.
+ *
+ * Return: @dst on success, NULL if snprintf() fails or the text doesn't fit
+ */
+const char *eth_ntop(const unsigned char *mac, char *dst, size_t size)
+{
+	int n = snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x",
+			 mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+
+	if (n >= 0 && (size_t)n < size)
+		return dst;
+
+	return NULL;
+}
+
/** str_ee_origin() - Convert socket extended error origin to a string
* @ee: Socket extended error structure
*
@@ -710,13 +872,15 @@ void close_open_files(int argc, char **argv)
int name, rc;
do {
- name = getopt_long(argc, argv, "+:F", optfd, NULL);
+ name = getopt_long(argc, argv, "-:F:", optfd, NULL);
if (name == 'F') {
errno = 0;
fd = strtol(optarg, NULL, 0);
- if (errno || fd <= STDERR_FILENO || fd > INT_MAX)
+ if (errno ||
+ (fd != STDIN_FILENO && fd <= STDERR_FILENO) ||
+ fd > INT_MAX)
die("Invalid --fd: %s", optarg);
}
} while (name != -1);
@@ -732,6 +896,152 @@ void close_open_files(int argc, char **argv)
rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE);
}
- if (rc)
- die_perror("Failed to close files leaked by parent");
+ if (rc) {
+ if (errno == ENOSYS || errno == EINVAL) {
+ /* This probably means close_range() or the
+ * CLOSE_RANGE_UNSHARE flag is not supported by the
+ * kernel. Not much we can do here except carry on and
+ * hope for the best.
+ */
+ warn(
+"Can't use close_range() to ensure no files leaked by parent");
+ } else {
+ die_perror("Failed to close files leaked by parent");
+ }
+ }
+
+}
+
+/**
+ * snprintf_check() - snprintf() wrapper, checking for truncation and errors
+ * @str:	Output buffer
+ * @size:	Maximum size to write to @str
+ * @format:	Message
+ *
+ * On failure, errno is set to EIO if vsnprintf() reported an error, or to
+ * ENOBUFS if the formatted output didn't fit in @size bytes.
+ *
+ * Return: false on success, true on truncation or error, sets errno on failure
+ */
+bool snprintf_check(char *str, size_t size, const char *format, ...)
+{
+	va_list args;
+	int written;
+
+	va_start(args, format);
+	written = vsnprintf(str, size, format, args);
+	va_end(args);
+
+	if (written >= 0 && (size_t)written < size)
+		return false;
+
+	/* Negative: formatting error. Otherwise: output was truncated */
+	errno = written < 0 ? EIO : ENOBUFS;
+	return true;
+}
+
+#define DEV_RANDOM "/dev/random"
+
+/**
+ * raw_random() - Get high quality random bytes
+ * @buf:	Buffer to fill with random bytes
+ * @buflen:	Number of bytes of random data to put in @buf
+ *
+ * Uses getrandom() with GRND_RANDOM if HAS_GETRANDOM is defined, otherwise
+ * reads from DEV_RANDOM. Partial results and calls interrupted by a signal
+ * (EINTR) are handled by asking again for the bytes still missing.
+ *
+ * Assumes that the random data is essential, and will die() if unable to obtain
+ * it.
+ */
+void raw_random(void *buf, size_t buflen)
+{
+	size_t random_read = 0;
+#ifndef HAS_GETRANDOM
+	int fd = open(DEV_RANDOM, O_RDONLY | O_CLOEXEC);
+
+	if (fd < 0)
+		die_perror("Couldn't open %s", DEV_RANDOM);
+#endif
+
+	while (random_read < buflen) {
+		ssize_t ret;
+
+#ifdef HAS_GETRANDOM
+		ret = getrandom((char *)buf + random_read,
+				buflen - random_read, GRND_RANDOM);
+#else
+		/* Fallback: read from the descriptor opened above */
+		ret = read(fd, (char *)buf + random_read,
+			   buflen - random_read);
+#endif
+
+		if (ret == -1 && errno == EINTR)
+			continue;
+
+		if (ret < 0)
+			die_perror("Error on random data source");
+
+		/* No more data: reported as unexpected EOF below */
+		if (ret == 0)
+			break;
+
+		random_read += ret;
+	}
+
+#ifndef HAS_GETRANDOM
+	close(fd);
+#endif
+
+	if (random_read < buflen)
+		die("Unexpected EOF on random data source");
+}
+
+/**
+ * epoll_del() - Remove a file descriptor from our passt epoll
+ * @c:		Execution context
+ * @fd:		File descriptor to remove
+ *
+ * The result of epoll_ctl() is not checked.
+ */
+void epoll_del(const struct ctx *c, int fd)
+{
+	const int epfd = c->epollfd;
+
+	epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
+}
+
+/**
+ * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1
+ * @buf:		Buffer to fill in with encoded domain name
+ * @domain_name:	Input domain name string with terminator
+ *
+ * Each dot-separated label of @domain_name is written as one length byte
+ * followed by the label's characters, and the result ends with a zero byte.
+ * Label lengths are not validated.
+ *
+ * The buffer's 'buf' size has to be >= strlen(domain_name) + 2
+ */
+void encode_domain_name(char *buf, const char *domain_name)
+{
+	const char *label = domain_name;
+	char *out = buf;
+
+	for (;;) {
+		size_t n = strcspn(label, ".");
+
+		/* Length byte, then the label itself */
+		*out++ = (char)n;
+		memcpy(out, label, n);
+		out += n;
+
+		if (!label[n])
+			break;
+
+		/* Step over the dot to the start of the next label */
+		label += n + 1;
+	}
+
+	*out = 0;
+}
+
+/**
+ * abort_with_msg() - Log a critical message, then abort
+ * @fmt:	Format string
+ * @...:	Format parameters
+ *
+ * The message is passed to vlogmsg() with priority LOG_CRIT before abort() is
+ * called.
+ */
+void abort_with_msg(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vlogmsg(true, false, LOG_CRIT, fmt, args);
+	va_end(args);
+
+	/* Due to seccomp, this may actually raise SIGSYS rather than SIGABRT:
+	 * either way, the process terminates, which is what we want.
+	 */
+	abort();
}