aboutgitcodebugslistschat
path: root/util.c
diff options
context:
space:
mode:
Diffstat (limited to 'util.c')
-rw-r--r--util.c515
1 files changed, 399 insertions, 116 deletions
diff --git a/util.c b/util.c
index bac5a53..55cae3f 100644
--- a/util.c
+++ b/util.c
@@ -25,73 +25,68 @@
#include <time.h>
#include <errno.h>
#include <stdbool.h>
+#include <linux/errqueue.h>
+#include <getopt.h>
+#include "linux_dep.h"
#include "util.h"
#include "iov.h"
#include "passt.h"
#include "packet.h"
#include "log.h"
+#ifdef HAS_GETRANDOM
+#include <sys/random.h>
+#endif
/**
- * sock_l4() - Create and bind socket for given L4, add to epoll list
+ * sock_l4_sa() - Create and bind socket to socket address, add to epoll list
* @c: Execution context
- * @af: Address family, AF_INET or AF_INET6
- * @proto: Protocol number
- * @bind_addr: Address for binding, NULL for any
+ * @type: epoll type
+ * @sa: Socket address to bind to
+ * @sl: Length of @sa
* @ifname: Interface for binding, NULL for any
- * @port: Port, host order
+ * @v6only: Set IPV6_V6ONLY socket option
* @data: epoll reference portion for protocol handlers
*
* Return: newly created socket, negative error code on failure
*/
-int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto,
- const void *bind_addr, const char *ifname, uint16_t port,
- uint32_t data)
+int sock_l4_sa(const struct ctx *c, enum epoll_type type,
+ const void *sa, socklen_t sl,
+ const char *ifname, bool v6only, uint32_t data)
{
- union epoll_ref ref = { .data = data };
- struct sockaddr_in addr4 = {
- .sin_family = AF_INET,
- .sin_port = htons(port),
- { 0 }, { 0 },
- };
- struct sockaddr_in6 addr6 = {
- .sin6_family = AF_INET6,
- .sin6_port = htons(port),
- 0, IN6ADDR_ANY_INIT, 0,
- };
- const struct sockaddr *sa;
- bool dual_stack = false;
- int fd, sl, y = 1, ret;
+ sa_family_t af = ((const struct sockaddr *)sa)->sa_family;
+ union epoll_ref ref = { .type = type, .data = data };
+ bool freebind = false;
struct epoll_event ev;
-
- switch (proto) {
- case IPPROTO_TCP:
- ref.type = EPOLL_TYPE_TCP_LISTEN;
- break;
- case IPPROTO_UDP:
- ref.type = EPOLL_TYPE_UDP;
+ int fd, y = 1, ret;
+ uint8_t proto;
+ int socktype;
+
+ switch (type) {
+ case EPOLL_TYPE_TCP_LISTEN:
+ proto = IPPROTO_TCP;
+ socktype = SOCK_STREAM | SOCK_NONBLOCK;
+ freebind = c->freebind;
break;
- case IPPROTO_ICMP:
- ref.type = EPOLL_TYPE_ICMP;
+ case EPOLL_TYPE_UDP_LISTEN:
+ freebind = c->freebind;
+ /* fallthrough */
+ case EPOLL_TYPE_UDP_REPLY:
+ proto = IPPROTO_UDP;
+ socktype = SOCK_DGRAM | SOCK_NONBLOCK;
break;
- case IPPROTO_ICMPV6:
- ref.type = EPOLL_TYPE_ICMPV6;
+ case EPOLL_TYPE_PING:
+ if (af == AF_INET)
+ proto = IPPROTO_ICMP;
+ else
+ proto = IPPROTO_ICMPV6;
+ socktype = SOCK_DGRAM | SOCK_NONBLOCK;
break;
default:
- return -EPFNOSUPPORT; /* Not implemented. */
+ ASSERT(0);
}
- if (af == AF_UNSPEC) {
- if (!DUAL_STACK_SOCKETS || bind_addr)
- return -EINVAL;
- dual_stack = true;
- af = AF_INET6;
- }
-
- if (proto == IPPROTO_TCP)
- fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto);
- else
- fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto);
+ fd = socket(af, socktype, proto);
ret = -errno;
if (fd < 0) {
@@ -106,34 +101,21 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto,
ref.fd = fd;
- if (af == AF_INET) {
- if (bind_addr)
- addr4.sin_addr = *(struct in_addr *)bind_addr;
+ if (v6only)
+ if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &y, sizeof(y)))
+ debug("Failed to set IPV6_V6ONLY on socket %i", fd);
- sa = (const struct sockaddr *)&addr4;
- sl = sizeof(addr4);
- } else {
- if (bind_addr) {
- addr6.sin6_addr = *(struct in6_addr *)bind_addr;
-
- if (!memcmp(bind_addr, &c->ip6.addr_ll,
- sizeof(c->ip6.addr_ll)))
- addr6.sin6_scope_id = c->ifi6;
- }
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)))
+ debug("Failed to set SO_REUSEADDR on socket %i", fd);
- sa = (const struct sockaddr *)&addr6;
- sl = sizeof(addr6);
+ if (proto == IPPROTO_UDP) {
+ int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
+ int opt = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
- if (!dual_stack)
- if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY,
- &y, sizeof(y)))
- debug("Failed to set IPV6_V6ONLY on socket %i",
- fd);
+ if (setsockopt(fd, level, opt, &y, sizeof(y)))
+ die_perror("Failed to set RECVERR on socket %i", fd);
}
- if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)))
- debug("Failed to set SO_REUSEADDR on socket %i", fd);
-
if (ifname && *ifname) {
/* Supported since kernel version 5.7, commit c427bfec18f2
* ("net: core: enable SO_BINDTODEVICE for non-root users"). If
@@ -142,28 +124,43 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto,
*/
if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
ifname, strlen(ifname))) {
+ char str[SOCKADDR_STRLEN];
+
ret = -errno;
- warn("Can't bind %s socket for port %u to %s, closing",
- EPOLL_TYPE_STR(proto), port, ifname);
+ warn("Can't bind %s socket for %s to %s, closing",
+ EPOLL_TYPE_STR(proto),
+ sockaddr_ntop(sa, str, sizeof(str)), ifname);
close(fd);
return ret;
}
}
+ if (freebind) {
+ int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
+ int opt = af == AF_INET ? IP_FREEBIND : IPV6_FREEBIND;
+
+ if (setsockopt(fd, level, opt, &y, sizeof(y))) {
+ err_perror("Failed to set %s on socket %i",
+ af == AF_INET ? "IP_FREEBIND"
+ : "IPV6_FREEBIND",
+ fd);
+ }
+ }
+
if (bind(fd, sa, sl) < 0) {
/* We'll fail to bind to low ports if we don't have enough
* capabilities, and we'll fail to bind on already bound ports,
* this is fine. This might also fail for ICMP because of a
* broken SELinux policy, see icmp_tap_handler().
*/
- if (proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6) {
+ if (type != EPOLL_TYPE_PING) {
ret = -errno;
close(fd);
return ret;
}
}
- if (proto == IPPROTO_TCP && listen(fd, 128) < 0) {
+ if (type == EPOLL_TYPE_TCP_LISTEN && listen(fd, 128) < 0) {
ret = -errno;
warn("TCP socket listen: %s", strerror(-ret));
close(fd);
@@ -190,7 +187,8 @@ void sock_probe_mem(struct ctx *c)
int v = INT_MAX / 2, s;
socklen_t sl;
- if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) {
+ s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ if (s < 0) {
c->low_wmem = c->low_rmem = 1;
return;
}
@@ -210,23 +208,34 @@ void sock_probe_mem(struct ctx *c)
close(s);
}
-
/**
- * timespec_diff_ms() - Report difference in milliseconds between two timestamps
+ * timespec_diff_us() - Report difference in microseconds between two timestamps
* @a: Minuend timestamp
* @b: Subtrahend timestamp
*
- * Return: difference in milliseconds
+ * Return: difference in microseconds (wraps after 2^63 / 10^6s ~= 292k years)
*/
-int timespec_diff_ms(const struct timespec *a, const struct timespec *b)
+int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b)
{
if (a->tv_nsec < b->tv_nsec) {
- return (b->tv_nsec - a->tv_nsec) / 1000000 +
- (a->tv_sec - b->tv_sec - 1) * 1000;
+ return (a->tv_nsec + 1000000000 - b->tv_nsec) / 1000 +
+ (a->tv_sec - b->tv_sec - 1) * 1000000;
}
- return (a->tv_nsec - b->tv_nsec) / 1000000 +
- (a->tv_sec - b->tv_sec) * 1000;
+ return (a->tv_nsec - b->tv_nsec) / 1000 +
+ (a->tv_sec - b->tv_sec) * 1000000;
+}
+
+/**
+ * timespec_diff_ms() - Report difference in milliseconds between two timestamps
+ * @a: Minuend timestamp
+ * @b: Subtrahend timestamp
+ *
+ * Thin wrapper dividing the result of timespec_diff_us() by 1000, so any
+ * sub-millisecond remainder is discarded by the integer division.
+ *
+ * Return: difference in milliseconds
+ */
+long timespec_diff_ms(const struct timespec *a, const struct timespec *b)
+{
+return timespec_diff_us(a, b) / 1000;
}
/**
@@ -234,7 +243,7 @@ int timespec_diff_ms(const struct timespec *a, const struct timespec *b)
* @map: Pointer to bitmap
* @bit: Bit number to set
*/
-void bitmap_set(uint8_t *map, int bit)
+void bitmap_set(uint8_t *map, unsigned bit)
{
unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit);
@@ -246,7 +255,7 @@ void bitmap_set(uint8_t *map, int bit)
* @map: Pointer to bitmap
* @bit: Bit number to clear
*/
-void bitmap_clear(uint8_t *map, int bit)
+void bitmap_clear(uint8_t *map, unsigned bit)
{
unsigned long *word = (unsigned long *)map + BITMAP_WORD(bit);
@@ -258,9 +267,9 @@ void bitmap_clear(uint8_t *map, int bit)
* @map: Pointer to bitmap
* @bit: Bit number to check
*
- * Return: one if given bit is set, zero if it's not
+ * Return: true if given bit is set, false if it's not
*/
-int bitmap_isset(const uint8_t *map, int bit)
+bool bitmap_isset(const uint8_t *map, unsigned bit)
{
const unsigned long *word
= (const unsigned long *)map + BITMAP_WORD(bit);
@@ -300,7 +309,7 @@ void bitmap_or(uint8_t *dst, size_t size, const uint8_t *a, const uint8_t *b)
void ns_enter(const struct ctx *c)
{
if (setns(c->pasta_netns_fd, CLONE_NEWNET))
- die("setns() failed entering netns: %s", strerror(errno));
+ die_perror("setns() failed entering netns");
}
/**
@@ -315,10 +324,8 @@ bool ns_is_init(void)
bool ret = true;
int fd;
- if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0) {
- die("Can't determine if we're in init namespace: %s",
- strerror(errno));
- }
+ if ((fd = open("/proc/self/uid_map", O_RDONLY | O_CLOEXEC)) < 0)
+ die_perror("Can't determine if we're in init namespace");
if (read(fd, buf, sizeof(root_uid_map)) != sizeof(root_uid_map) - 1 ||
strncmp(buf, root_uid_map, sizeof(root_uid_map)))
@@ -382,11 +389,11 @@ int open_in_ns(const struct ctx *c, const char *path, int flags)
}
/**
- * pid_file() - Write PID to file, if requested to do so, and close it
+ * pidfile_write() - Write PID to file, if requested to do so, and close it
* @fd: Open PID file descriptor, closed on exit, -1 to skip writing it
* @pid: PID value to write
*/
-void write_pidfile(int fd, pid_t pid)
+void pidfile_write(int fd, pid_t pid)
{
char pid_buf[12];
int n;
@@ -405,6 +412,23 @@ void write_pidfile(int fd, pid_t pid)
}
/**
+ * output_file_open() - Open file for output, creating or truncating it
+ * @path: Path for output file
+ * @flags: Flags for open() other than O_CREAT, O_TRUNC, O_CLOEXEC
+ *
+ * O_CREAT, O_TRUNC and O_CLOEXEC are always added to @flags. The mode passed
+ * for a newly created file is S_IRUSR | S_IWUSR (read and write for the owner
+ * only).
+ *
+ * Return: file descriptor on success, -1 on failure with errno set by open()
+ */
+int output_file_open(const char *path, int flags)
+{
+/* We use O_CLOEXEC here, but clang-tidy as of LLVM 16 to 19 looks for
+ * it in the 'mode' argument if we have one
+ */
+return open(path, O_CREAT | O_TRUNC | O_CLOEXEC | flags,
+/* NOLINTNEXTLINE(android-cloexec-open) */
+S_IRUSR | S_IWUSR);
+}
+
+/**
* __daemon() - daemon()-like function writing PID file before parent exits
* @pidfile_fd: Open PID file descriptor
* @devnull_fd: Open file descriptor for /dev/null
@@ -421,20 +445,15 @@ int __daemon(int pidfile_fd, int devnull_fd)
}
if (pid) {
- write_pidfile(pidfile_fd, pid);
+ pidfile_write(pidfile_fd, pid);
exit(EXIT_SUCCESS);
}
- errno = 0;
-
- setsid();
-
- dup2(devnull_fd, STDIN_FILENO);
- dup2(devnull_fd, STDOUT_FILENO);
- dup2(devnull_fd, STDERR_FILENO);
- close(devnull_fd);
-
- if (errno)
+ if (setsid() < 0 ||
+ dup2(devnull_fd, STDIN_FILENO) < 0 ||
+ dup2(devnull_fd, STDOUT_FILENO) < 0 ||
+ dup2(devnull_fd, STDERR_FILENO) < 0 ||
+ close(devnull_fd))
exit(EXIT_FAILURE);
return 0;
@@ -472,7 +491,7 @@ int write_file(const char *path, const char *buf)
size_t len = strlen(buf);
if (fd < 0) {
- warn("Could not open %s: %s", path, strerror(errno));
+ warn_perror("Could not open %s", path);
return -1;
}
@@ -480,7 +499,7 @@ int write_file(const char *path, const char *buf)
ssize_t rc = write(fd, buf, len);
if (rc <= 0) {
- warn("Couldn't write to %s: %s", path, strerror(errno));
+ warn_perror("Couldn't write to %s", path);
break;
}
@@ -522,6 +541,36 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
#endif
}
+/**
+ * write_all_buf() - write all of a buffer to an fd
+ * @fd: File descriptor
+ * @buf: Pointer to base of buffer
+ * @len: Length of buffer
+ *
+ * Calls write() repeatedly, advancing past whatever each call managed to
+ * write, until @len bytes have been written. A write() interrupted by a signal
+ * (EINTR) is retried; any other write() error aborts immediately, at which
+ * point part of the buffer might already have been written.
+ *
+ * NOTE(review): a write() returning 0 is not treated as an error, the loop
+ * simply tries again with the same arguments.
+ *
+ * Return: 0 on success, -1 on error (with errno set)
+ *
+ * #syscalls write
+ */
+int write_all_buf(int fd, const void *buf, size_t len)
+{
+const char *p = buf;
+size_t left = len;
+
+while (left) {
+ssize_t rc;
+
+do
+rc = write(fd, p, left);
+while ((rc < 0) && errno == EINTR);
+
+if (rc < 0)
+return -1;
+
+p += rc;
+left -= rc;
+}
+return 0;
+}
+
/* write_remainder() - write the tail of an IO vector to an fd
* @fd: File descriptor
* @iov: IO vector
@@ -530,27 +579,261 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
*
* Return: 0 on success, -1 on error (with errno set)
*
- * #syscalls write writev
+ * #syscalls writev
*/
-int write_remainder(int fd, const struct iovec *iov, int iovcnt, size_t skip)
+int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
{
- int i;
+ size_t i = 0, offset;
- while ((i = iov_skip_bytes(iov, iovcnt, skip, &skip)) < iovcnt) {
+ while ((i += iov_skip_bytes(iov + i, iovcnt - i, skip, &offset)) < iovcnt) {
ssize_t rc;
- if (skip) {
- rc = write(fd, (char *)iov[i].iov_base + skip,
- iov[i].iov_len - skip);
- } else {
- rc = writev(fd, &iov[i], iovcnt - i);
+ if (offset) {
+ /* Write the remainder of the partially written buffer */
+ if (write_all_buf(fd, (char *)iov[i].iov_base + offset,
+ iov[i].iov_len - offset) < 0)
+ return -1;
+ i++;
}
+ /* Write as much of the remaining whole buffers as we can */
+ rc = writev(fd, &iov[i], iovcnt - i);
if (rc < 0)
return -1;
- skip += rc;
+ skip = rc;
}
-
return 0;
}
+
+/**
+ * sockaddr_ntop() - Convert a socket address to text format
+ * @sa: Socket address
+ * @dst: output buffer, minimum SOCKADDR_STRLEN bytes
+ * @size: size of buffer at @dst
+ *
+ * The address family is read from the sa_family field of @sa and selects the
+ * output format:
+ *   AF_UNSPEC: the literal string "<unspecified>"
+ *   AF_INET:   "ADDRESS:PORT"
+ *   AF_INET6:  "[ADDRESS]:PORT"
+ * where ADDRESS is produced by inet_ntop() and PORT is the port number
+ * converted to host byte order, in decimal.
+ *
+ * Any other family fails with errno set to EAFNOSUPPORT. The function also
+ * fails if inet_ntop() fails, or if the text (including the terminating NUL)
+ * does not fit in @size bytes; in those cases @dst might already contain
+ * partial output.
+ *
+ * Return: On success, a non-null pointer to @dst, NULL on failure
+ */
+const char *sockaddr_ntop(const void *sa, char *dst, socklen_t size)
+{
+sa_family_t family = ((const struct sockaddr *)sa)->sa_family;
+socklen_t off = 0;
+
+#define IPRINTF(...) \
+ do { \
+ off += snprintf(dst + off, size - off, __VA_ARGS__); \
+ if (off >= size) \
+ return NULL; \
+ } while (0)
+
+#define INTOP(af, addr) \
+ do { \
+ if (!inet_ntop((af), (addr), dst + off, size - off)) \
+ return NULL; \
+ off += strlen(dst + off); \
+ } while (0)
+
+switch (family) {
+case AF_UNSPEC:
+IPRINTF("<unspecified>");
+break;
+
+case AF_INET: {
+const struct sockaddr_in *sa4 = sa;
+
+INTOP(AF_INET, &sa4->sin_addr);
+IPRINTF(":%hu", ntohs(sa4->sin_port));
+break;
+}
+
+case AF_INET6: {
+const struct sockaddr_in6 *sa6 = sa;
+
+IPRINTF("[");
+INTOP(AF_INET6, &sa6->sin6_addr);
+IPRINTF("]:%hu", ntohs(sa6->sin6_port));
+break;
+}
+
+/* FIXME: Implement AF_UNIX */
+default:
+errno = EAFNOSUPPORT;
+return NULL;
+}
+
+#undef IPRINTF
+#undef INTOP
+
+return dst;
+}
+
+/**
+ * eth_ntop() - Convert an Ethernet MAC address to text format
+ * @mac: MAC address, the first six bytes are read
+ * @dst: Output buffer, minimum ETH_ADDRSTRLEN bytes
+ * @size: Size of buffer at @dst
+ *
+ * The output is six colon-separated, two-digit, lower-case hexadecimal bytes,
+ * in the form "xx:xx:xx:xx:xx:xx".
+ *
+ * Fails if snprintf() reports an error or if the formatted text, including the
+ * terminating NUL, does not fit in @size bytes.
+ *
+ * Return: On success, a non-null pointer to @dst, NULL on failure
+ */
+const char *eth_ntop(const unsigned char *mac, char *dst, size_t size)
+{
+int len;
+
+len = snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x",
+mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+if (len < 0 || (size_t)len >= size)
+return NULL;
+
+return dst;
+}
+
+/**
+ * str_ee_origin() - Convert socket extended error origin to a string
+ * @ee: Socket extended error structure, only the ee_origin field is read
+ *
+ * Descriptions exist for SO_EE_ORIGIN_NONE, SO_EE_ORIGIN_LOCAL,
+ * SO_EE_ORIGIN_ICMP and SO_EE_ORIGIN_ICMP6. An origin value at or beyond the
+ * end of the description table yields "<invalid>".
+ *
+ * Return: Static string describing error origin
+ */
+const char *str_ee_origin(const struct sock_extended_err *ee)
+{
+const char *const desc[] = {
+[SO_EE_ORIGIN_NONE] = "<no origin>",
+[SO_EE_ORIGIN_LOCAL] = "Local",
+[SO_EE_ORIGIN_ICMP] = "ICMP",
+[SO_EE_ORIGIN_ICMP6] = "ICMPv6",
+};
+
+if (ee->ee_origin < ARRAY_SIZE(desc))
+return desc[ee->ee_origin];
+
+return "<invalid>";
+}
+
+/**
+ * close_open_files() - Close leaked files, but not --fd, stdin, stdout, stderr
+ * @argc: Argument count
+ * @argv: Command line options, as we need to skip any file given via --fd
+ *
+ * The command line is scanned with getopt_long() for --fd (short form -F) only;
+ * everything else is ignored here. If the option appears more than once, the
+ * last value is the one that is kept. A value that strtol() reports an error
+ * for, that is not greater than STDERR_FILENO, or that exceeds INT_MAX makes
+ * the function die().
+ *
+ * All descriptors above STDERR_FILENO are then closed with close_range(), using
+ * CLOSE_RANGE_UNSHARE, as a single range if no --fd was given (or if it is the
+ * first descriptor of that range), or as two ranges around the --fd descriptor
+ * otherwise.
+ *
+ * If close_range() fails with ENOSYS or EINVAL, a warning is printed and
+ * execution continues; any other failure is fatal.
+ *
+ * NOTE(review): strtol() is called without an end pointer, so trailing
+ * characters after the number are not rejected here.
+ * NOTE(review): getopt state is left as it is after the scan; presumably the
+ * caller resets it before the real option parsing, confirm.
+ */
+void close_open_files(int argc, char **argv)
+{
+const struct option optfd[] = { { "fd", required_argument, NULL, 'F' },
+{ 0 },
+};
+long fd = -1;
+int name, rc;
+
+do {
+name = getopt_long(argc, argv, "-:F:", optfd, NULL);
+
+if (name == 'F') {
+errno = 0;
+fd = strtol(optarg, NULL, 0);
+
+if (errno || fd <= STDERR_FILENO || fd > INT_MAX)
+die("Invalid --fd: %s", optarg);
+}
+} while (name != -1);
+
+if (fd == -1) {
+rc = close_range(STDERR_FILENO + 1, ~0U, CLOSE_RANGE_UNSHARE);
+} else if (fd == STDERR_FILENO + 1) { /* Still a single range */
+rc = close_range(STDERR_FILENO + 2, ~0U, CLOSE_RANGE_UNSHARE);
+} else {
+rc = close_range(STDERR_FILENO + 1, fd - 1,
+CLOSE_RANGE_UNSHARE);
+if (!rc)
+rc = close_range(fd + 1, ~0U, CLOSE_RANGE_UNSHARE);
+}
+
+if (rc) {
+if (errno == ENOSYS || errno == EINVAL) {
+/* This probably means close_range() or the
+ * CLOSE_RANGE_UNSHARE flag is not supported by the
+ * kernel. Not much we can do here except carry on and
+ * hope for the best.
+ */
+warn(
+"Can't use close_range() to ensure no files leaked by parent");
+} else {
+die_perror("Failed to close files leaked by parent");
+}
+}
+
+}
+
+/**
+ * snprintf_check() - snprintf() wrapper, checking for truncation and errors
+ * @str: Output buffer
+ * @size: Maximum size to write to @str
+ * @format: Message
+ *
+ * The variable arguments are forwarded to vsnprintf(). If vsnprintf() returns a
+ * negative value, errno is set to EIO. If the formatted text, including the
+ * terminating NUL, would not fit in @size bytes, errno is set to ENOBUFS.
+ *
+ * Note the inverted sense of the return value: callers can write
+ * "if (snprintf_check(...))" to handle the failure case.
+ *
+ * Return: false on success, true on truncation or error, sets errno on failure
+ */
+bool snprintf_check(char *str, size_t size, const char *format, ...)
+{
+va_list ap;
+int rc;
+
+va_start(ap, format);
+rc = vsnprintf(str, size, format, ap);
+va_end(ap);
+
+if (rc < 0) {
+errno = EIO;
+return true;
+}
+
+if ((size_t)rc >= size) {
+errno = ENOBUFS;
+return true;
+}
+
+return false;
+}
+
+#define DEV_RANDOM "/dev/random"
+
+/**
+ * raw_random() - Get high quality random bytes
+ * @buf:	Buffer to fill with random bytes
+ * @buflen:	Number of bytes of random data to put in @buf
+ *
+ * Uses getrandom() with GRND_RANDOM if HAS_GETRANDOM is defined, otherwise
+ * reads from DEV_RANDOM. Short reads are accumulated until @buflen bytes are
+ * available, and calls interrupted by a signal (EINTR) are retried.
+ *
+ * Assumes that the random data is essential, and will die() if unable to obtain
+ * it.
+ */
+void raw_random(void *buf, size_t buflen)
+{
+	size_t random_read = 0;
+#ifndef HAS_GETRANDOM
+	int fd = open(DEV_RANDOM, O_RDONLY);
+
+	if (fd < 0)
+		die_perror("Couldn't open %s", DEV_RANDOM);
+#endif
+
+	while (random_read < buflen) {
+		ssize_t ret;
+
+#ifdef HAS_GETRANDOM
+		ret = getrandom((char *)buf + random_read,
+				buflen - random_read, GRND_RANDOM);
+#else
+		/* Same descriptor as opened above: it's called 'fd' */
+		ret = read(fd, (char *)buf + random_read,
+			   buflen - random_read);
+#endif
+
+		if (ret == -1 && errno == EINTR)
+			continue;
+
+		if (ret < 0)
+			die_perror("Error on random data source");
+
+		/* End of file: report it below, once, instead of spinning */
+		if (ret == 0)
+			break;
+
+		random_read += ret;
+	}
+
+#ifndef HAS_GETRANDOM
+	close(fd);
+#endif
+
+	if (random_read < buflen)
+		die("Unexpected EOF on random data source");
+}