aboutgitcodebugslistschat
path: root/passt.c
diff options
context:
space:
mode:
Diffstat (limited to 'passt.c')
-rw-r--r--passt.c336
1 files changed, 208 insertions, 128 deletions
diff --git a/passt.c b/passt.c
index 4b3c306..7488a84 100644
--- a/passt.c
+++ b/passt.c
@@ -19,7 +19,6 @@
* created in a separate network namespace).
*/
-#include <sys/epoll.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/resource.h>
@@ -36,9 +35,6 @@
#include <sys/prctl.h>
#include <netinet/if_ether.h>
#include <libgen.h>
-#ifdef HAS_GETRANDOM
-#include <sys/random.h>
-#endif
#include "util.h"
#include "passt.h"
@@ -52,11 +48,16 @@
#include "arch.h"
#include "log.h"
#include "tcp_splice.h"
+#include "ndp.h"
+#include "vu_common.h"
+#include "migrate.h"
+#include "repair.h"
+#include "netlink.h"
+#include "epoll_ctl.h"
-#define EPOLL_EVENTS 8
+#define NUM_EPOLL_EVENTS 8
-#define TIMER_INTERVAL__ MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL)
-#define TIMER_INTERVAL_ MIN(TIMER_INTERVAL__, ICMP_TIMER_INTERVAL)
+#define TIMER_INTERVAL_ MIN(TCP_TIMER_INTERVAL, FWD_PORT_SCAN_INTERVAL)
#define TIMER_INTERVAL MIN(TIMER_INTERVAL_, FLOW_TIMER_INTERVAL)
char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE)));
@@ -67,18 +68,31 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket",
[EPOLL_TYPE_TCP_TIMER] = "TCP timer",
[EPOLL_TYPE_UDP_LISTEN] = "listening UDP socket",
- [EPOLL_TYPE_UDP_REPLY] = "UDP reply socket",
+ [EPOLL_TYPE_UDP] = "UDP flow socket",
[EPOLL_TYPE_PING] = "ICMP/ICMPv6 ping socket",
[EPOLL_TYPE_NSQUIT_INOTIFY] = "namespace inotify watch",
[EPOLL_TYPE_NSQUIT_TIMER] = "namespace timer watch",
[EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device",
[EPOLL_TYPE_TAP_PASST] = "connected qemu socket",
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
+ [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
+ [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
+ [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
+ [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
+ [EPOLL_TYPE_NL_NEIGH] = "netlink neighbour notifier socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
/**
+ * struct passt_stats - Statistics
+ * @events: Event counters for epoll type events
+ */
+struct passt_stats {
+ unsigned long events[EPOLL_NUM_TYPES];
+};
+
+/**
* post_handler() - Run periodic and deferred tasks for L4 protocol handlers
* @c: Execution context
* @now: Current timestamp
@@ -105,45 +119,29 @@ static void post_handler(struct ctx *c, const struct timespec *now)
/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
CALL_PROTO_HANDLER(tcp, TCP);
- /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
- CALL_PROTO_HANDLER(udp, UDP);
+#undef CALL_PROTO_HANDLER
flow_defer_handler(c, now);
-#undef CALL_PROTO_HANDLER
+ fwd_scan_ports_timer(c, now);
+
+ if (!c->no_ndp)
+ ndp_timer(c, now);
}
/**
- * secret_init() - Create secret value for SipHash calculations
+ * random_init() - Initialise things based on random data
* @c: Execution context
*/
-static void secret_init(struct ctx *c)
+static void random_init(struct ctx *c)
{
-#ifndef HAS_GETRANDOM
- int dev_random = open("/dev/random", O_RDONLY);
- unsigned int random_read = 0;
+ unsigned int seed;
- while (dev_random && random_read < sizeof(c->hash_secret)) {
- int ret = read(dev_random,
- (uint8_t *)&c->hash_secret + random_read,
- sizeof(c->hash_secret) - random_read);
+ /* Create secret value for SipHash calculations */
+ raw_random(&c->hash_secret, sizeof(c->hash_secret));
- if (ret == -1 && errno == EINTR)
- continue;
-
- if (ret <= 0)
- break;
-
- random_read += ret;
- }
- if (dev_random >= 0)
- close(dev_random);
-
- if (random_read < sizeof(c->hash_secret))
-#else
- if (getrandom(&c->hash_secret, sizeof(c->hash_secret),
- GRND_RANDOM) < 0)
-#endif /* !HAS_GETRANDOM */
- die_perror("Failed to get random bytes for hash table and TCP");
+ /* Seed pseudo-RNG for things that need non-cryptographic random */
+ raw_random(&seed, sizeof(seed));
+ srandom(seed);
}
/**
@@ -159,12 +157,11 @@ static void timer_init(struct ctx *c, const struct timespec *now)
/**
* proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void proto_update_l2_buf(const unsigned char *eth_d)
{
- tcp_update_l2_buf(eth_d, eth_s);
- udp_update_l2_buf(eth_d, eth_s);
+ tcp_update_l2_buf(eth_d);
+ udp_update_l2_buf(eth_d);
}
/**
@@ -176,11 +173,147 @@ void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
*
* #syscalls exit_group
*/
-void exit_handler(int signal)
+static void exit_handler(int signal)
{
(void)signal;
- exit(EXIT_SUCCESS);
+ passt_exit(EXIT_SUCCESS);
+}
+
+/**
+ * print_stats() - Print event statistics table to stderr
+ * @c: Execution context
+ * @stats: Event counters
+ * @now: Current timestamp
+ */
+static void print_stats(const struct ctx *c, const struct passt_stats *stats,
+ const struct timespec *now)
+{
+ static struct timespec before;
+ static int lines_printed;
+ long long elapsed_ns;
+ int i;
+
+ if (!c->stats)
+ return;
+
+ elapsed_ns = (now->tv_sec - before.tv_sec) * 1000000000LL +
+ (now->tv_nsec - before.tv_nsec);
+
+ if (elapsed_ns < c->stats * 1000000000LL)
+ return;
+
+ before = *now;
+
+ if (!(lines_printed % 20)) {
+ /* Table header */
+ for (i = 1; i < EPOLL_NUM_TYPES; i++) {
+ int j;
+
+ for (j = 0; j < i * (6 + 1); j++) {
+ if (j && !(j % (6 + 1)))
+ FPRINTF(stderr, "|");
+ else
+ FPRINTF(stderr, " ");
+ }
+ FPRINTF(stderr, "%s\n", epoll_type_str[i]);
+ }
+ }
+
+ FPRINTF(stderr, " ");
+ for (i = 1; i < EPOLL_NUM_TYPES; i++)
+ FPRINTF(stderr, " %6lu", stats->events[i]);
+ FPRINTF(stderr, "\n");
+ lines_printed++;
+}
+
+/**
+ * passt_worker() - Process epoll events and handle protocol operations
+ * @opaque: Pointer to execution context (struct ctx)
+ * @nfds: Number of file descriptors ready (epoll_wait return value)
+ * @events: epoll_event array of ready file descriptors
+ */
+static void passt_worker(void *opaque, int nfds, struct epoll_event *events)
+{
+ static struct passt_stats stats = { 0 };
+ struct ctx *c = opaque;
+ struct timespec now;
+ int i;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &now))
+ err_perror("Failed to get CLOCK_MONOTONIC time");
+
+ for (i = 0; i < nfds; i++) {
+ union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
+ uint32_t eventmask = events[i].events;
+
+ trace("%s: epoll event on %s %i (events: 0x%08x)",
+ c->mode == MODE_PASTA ? "pasta" : "passt",
+ EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
+
+ switch (ref.type) {
+ case EPOLL_TYPE_TAP_PASTA:
+ tap_handler_pasta(c, eventmask, &now);
+ break;
+ case EPOLL_TYPE_TAP_PASST:
+ tap_handler_passt(c, eventmask, &now);
+ break;
+ case EPOLL_TYPE_TAP_LISTEN:
+ tap_listen_handler(c, eventmask);
+ break;
+ case EPOLL_TYPE_NSQUIT_INOTIFY:
+ pasta_netns_quit_inotify_handler(c, ref.fd);
+ break;
+ case EPOLL_TYPE_NSQUIT_TIMER:
+ pasta_netns_quit_timer_handler(c, ref);
+ break;
+ case EPOLL_TYPE_TCP:
+ tcp_sock_handler(c, ref, eventmask);
+ break;
+ case EPOLL_TYPE_TCP_SPLICE:
+ tcp_splice_sock_handler(c, ref, eventmask);
+ break;
+ case EPOLL_TYPE_TCP_LISTEN:
+ tcp_listen_handler(c, ref, &now);
+ break;
+ case EPOLL_TYPE_TCP_TIMER:
+ tcp_timer_handler(c, ref);
+ break;
+ case EPOLL_TYPE_UDP_LISTEN:
+ udp_listen_sock_handler(c, ref, eventmask, &now);
+ break;
+ case EPOLL_TYPE_UDP:
+ udp_sock_handler(c, ref, eventmask, &now);
+ break;
+ case EPOLL_TYPE_PING:
+ icmp_sock_handler(c, ref);
+ break;
+ case EPOLL_TYPE_VHOST_CMD:
+ vu_control_handler(c->vdev, c->fd_tap, eventmask);
+ break;
+ case EPOLL_TYPE_VHOST_KICK:
+ vu_kick_cb(c->vdev, ref, &now);
+ break;
+ case EPOLL_TYPE_REPAIR_LISTEN:
+ repair_listen_handler(c, eventmask);
+ break;
+ case EPOLL_TYPE_REPAIR:
+ repair_handler(c, eventmask);
+ break;
+ case EPOLL_TYPE_NL_NEIGH:
+ nl_neigh_notify_handler(c);
+ break;
+ default:
+ /* Can't happen */
+ ASSERT(0);
+ }
+ stats.events[ref.type]++;
+ print_stats(c, &stats, &now);
+ }
+
+ post_handler(c, &now);
+
+ migrate_handler(c);
}
/**
@@ -191,29 +324,31 @@ void exit_handler(int signal)
* Return: non-zero on failure
*
* #syscalls read write writev
- * #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close
- * #syscalls recvfrom sendto shutdown
+ * #syscalls socket getsockopt setsockopt s390x:socketcall i686:socketcall close
+ * #syscalls bind connect recvfrom sendto shutdown
* #syscalls arm:recv ppc64le:recv arm:send ppc64le:send
- * #syscalls accept4|accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
- * #syscalls clock_gettime arm:clock_gettime64
+ * #syscalls accept4 accept listen epoll_ctl epoll_wait|epoll_pwait epoll_pwait
+ * #syscalls clock_gettime|clock_gettime64
+ * #syscalls arm:clock_gettime64 i686:clock_gettime64
*/
int main(int argc, char **argv)
{
- struct epoll_event events[EPOLL_EVENTS];
- int nfds, i, devnull_fd = -1;
- char argv0[PATH_MAX], *name;
+ struct epoll_event events[NUM_EPOLL_EVENTS];
+ int nfds, devnull_fd = -1;
struct ctx c = { 0 };
struct rlimit limit;
struct timespec now;
struct sigaction sa;
- clock_gettime(CLOCK_MONOTONIC, &log_start);
+ if (clock_gettime(CLOCK_MONOTONIC, &log_start))
+ die_perror("Failed to get CLOCK_MONOTONIC time");
arch_avx2_exec(argv);
isolate_initial(argc, argv);
c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1;
+ c.device_state_fd = -1;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
@@ -221,31 +356,23 @@ int main(int argc, char **argv)
sigaction(SIGTERM, &sa, NULL);
sigaction(SIGQUIT, &sa, NULL);
- if (argc < 1)
- exit(EXIT_FAILURE);
+ c.mode = conf_mode(argc, argv);
- strncpy(argv0, argv[0], PATH_MAX - 1);
- name = basename(argv0);
- if (strstr(name, "pasta")) {
+ if (c.mode == MODE_PASTA) {
sa.sa_handler = pasta_child_handler;
if (sigaction(SIGCHLD, &sa, NULL))
die_perror("Couldn't install signal handlers");
-
- if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
- die_perror("Couldn't set disposition for SIGPIPE");
-
- c.mode = MODE_PASTA;
- } else if (strstr(name, "passt")) {
- c.mode = MODE_PASST;
- } else {
- exit(EXIT_FAILURE);
}
- madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
+ if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
+ die_perror("Couldn't set disposition for SIGPIPE");
+
+ madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE);
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
if (c.epollfd == -1)
die_perror("Failed to create epoll file descriptor");
+ flow_epollid_register(EPOLLFD_ID_DEFAULT, c.epollfd);
if (getrlimit(RLIMIT_NOFILE, &limit))
die_perror("Failed to get maximum value of open files limit");
@@ -254,25 +381,27 @@ int main(int argc, char **argv)
if (setrlimit(RLIMIT_NOFILE, &limit))
die_perror("Failed to set current limit for open files");
- sock_probe_mem(&c);
+ sock_probe_features(&c);
conf(&c, argc, argv);
trace_init(c.trace);
pasta_netns_quit_init(&c);
- tap_sock_init(&c);
+ tap_backend_init(&c);
- secret_init(&c);
+ random_init(&c);
- clock_gettime(CLOCK_MONOTONIC, &now);
+ if (clock_gettime(CLOCK_MONOTONIC, &now))
+ die_perror("Failed to get CLOCK_MONOTONIC time");
flow_init();
+ fwd_scan_ports_init(&c);
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
- exit(EXIT_FAILURE);
+ passt_exit(EXIT_FAILURE);
- proto_update_l2_buf(c.mac_guest, c.mac);
+ proto_update_l2_buf(c.guest_mac);
if (c.ifi4 && !c.no_dhcp)
dhcp_init();
@@ -282,6 +411,9 @@ int main(int argc, char **argv)
pcap_init(&c);
+ fwd_neigh_table_init(&c);
+ nl_neigh_notify_init(&c);
+
if (!c.foreground) {
if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0)
die_perror("Failed to open /dev/null");
@@ -307,66 +439,14 @@ int main(int argc, char **argv)
timer_init(&c, &now);
loop:
- /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
+ /* NOLINTBEGIN(bugprone-branch-clone): intervals can be the same */
/* cppcheck-suppress [duplicateValueTernary, unmatchedSuppression] */
- nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
+ nfds = epoll_wait(c.epollfd, events, NUM_EPOLL_EVENTS, TIMER_INTERVAL);
+ /* NOLINTEND(bugprone-branch-clone) */
if (nfds == -1 && errno != EINTR)
die_perror("epoll_wait() failed in main loop");
- clock_gettime(CLOCK_MONOTONIC, &now);
-
- for (i = 0; i < nfds; i++) {
- union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64);
- uint32_t eventmask = events[i].events;
-
- trace("%s: epoll event on %s %i (events: 0x%08x)",
- c.mode == MODE_PASTA ? "pasta" : "passt",
- EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
-
- switch (ref.type) {
- case EPOLL_TYPE_TAP_PASTA:
- tap_handler_pasta(&c, eventmask, &now);
- break;
- case EPOLL_TYPE_TAP_PASST:
- tap_handler_passt(&c, eventmask, &now);
- break;
- case EPOLL_TYPE_TAP_LISTEN:
- tap_listen_handler(&c, eventmask);
- break;
- case EPOLL_TYPE_NSQUIT_INOTIFY:
- pasta_netns_quit_inotify_handler(&c, ref.fd);
- break;
- case EPOLL_TYPE_NSQUIT_TIMER:
- pasta_netns_quit_timer_handler(&c, ref);
- break;
- case EPOLL_TYPE_TCP:
- tcp_sock_handler(&c, ref, eventmask);
- break;
- case EPOLL_TYPE_TCP_SPLICE:
- tcp_splice_sock_handler(&c, ref, eventmask);
- break;
- case EPOLL_TYPE_TCP_LISTEN:
- tcp_listen_handler(&c, ref, &now);
- break;
- case EPOLL_TYPE_TCP_TIMER:
- tcp_timer_handler(&c, ref);
- break;
- case EPOLL_TYPE_UDP_LISTEN:
- udp_listen_sock_handler(&c, ref, eventmask, &now);
- break;
- case EPOLL_TYPE_UDP_REPLY:
- udp_reply_sock_handler(&c, ref, eventmask, &now);
- break;
- case EPOLL_TYPE_PING:
- icmp_sock_handler(&c, ref);
- break;
- default:
- /* Can't happen */
- ASSERT(0);
- }
- }
-
- post_handler(&c, &now);
+ passt_worker(&c, nfds, events);
goto loop;
}