aboutgitcodebugslistschat
diff options
context:
space:
mode:
author Stefano Brivio <sbrivio@redhat.com> 2021-04-23 22:22:37 +0200
committer Stefano Brivio <sbrivio@redhat.com> 2021-04-23 22:22:37 +0200
commit 38b50dba4704856194ac02b98e492d2349d64058 (patch)
tree b7f7a12479aa53f9e7ac2cae7a4e709ce78d99a5
parent 962bc97cf116519bd11b7e8beeda6dcce033d537 (diff)
download passt-38b50dba4704856194ac02b98e492d2349d64058.tar
passt-38b50dba4704856194ac02b98e492d2349d64058.tar.gz
passt-38b50dba4704856194ac02b98e492d2349d64058.tar.bz2
passt-38b50dba4704856194ac02b98e492d2349d64058.tar.lz
passt-38b50dba4704856194ac02b98e492d2349d64058.tar.xz
passt-38b50dba4704856194ac02b98e492d2349d64058.tar.zst
passt-38b50dba4704856194ac02b98e492d2349d64058.zip
passt: Spare some syscalls, add some optimisations from profiling
Avoid a bunch of syscalls on forwarding paths by:

- storing minimum and maximum file descriptor numbers for each
  protocol, fall back to SO_PROTOCOL query only on overlaps

- allocating a larger receive buffer -- this can result in more
  coalesced packets than sendmmsg() can take (UIO_MAXIOV, i.e. 1024),
  so make sure we don't exceed that within a single call to protocol
  tap handlers

- nesting the handling loop in tap_handler() in the receive loop,
  so that we have better chances of filling our receive buffer in
  fewer calls

- skipping the recvfrom() in the UDP handler on EPOLLERR -- there's
  nothing to be done in that case

and while at it:

- restore the 20ms timer interval for periodic (TCP) events, I
  accidentally changed that to 100ms in an earlier commit

- attempt using SO_ZEROCOPY for UDP -- if it's not available,
  sendmmsg() will succeed anyway

- fix the handling of the status code from sendmmsg(), if it fails,
  we'll try to discard the first message, hence return 1 from the
  UDP handler

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
-rw-r--r-- icmp.c 3
-rw-r--r-- icmp.h 4
-rw-r--r-- passt.c 113
-rw-r--r-- passt.h 2
-rw-r--r-- tap.c 4
-rw-r--r-- tcp.c 8
-rw-r--r-- tcp.h 4
-rw-r--r-- udp.c 20
-rw-r--r-- udp.h 15
-rw-r--r-- util.c 15
10 files changed, 139 insertions, 49 deletions
diff --git a/icmp.c b/icmp.c
index 9a3c740..dd4e3a4 100644
--- a/icmp.c
+++ b/icmp.c
@@ -135,6 +135,9 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
*/
int icmp_sock_init(struct ctx *c)
{
+ c->icmp.fd_min = INT_MAX;
+ c->icmp.fd_max = 0;
+
if (c->v4 && (c->icmp.s4 = sock_l4_add(c, 4, IPPROTO_ICMP, 0)) < 0)
return -1;
diff --git a/icmp.h b/icmp.h
index 9d26050..1941028 100644
--- a/icmp.h
+++ b/icmp.h
@@ -12,10 +12,14 @@ int icmp_sock_init(struct ctx *c);
* struct icmp_ctx - Execution context for ICMP routines
* @s4: ICMP socket number
* @s6: ICMPv6 socket number
+ * @fd_min: Lowest file descriptor number for ICMP/ICMPv6 ever used
+ * @fd_max: Highest file descriptor number for ICMP/ICMPv6 ever used
*/
struct icmp_ctx {
int s4;
int s6;
+ int fd_min;
+ int fd_max;
};
#endif /* ICMP_H */
diff --git a/passt.c b/passt.c
index 9550c68..2fc88cf 100644
--- a/passt.c
+++ b/passt.c
@@ -57,9 +57,11 @@
#define EPOLL_EVENTS 10
-#define TAP_NMSG 32 /* maximum messages to buffer from tap */
+#define TAP_BUF_BYTES (ETH_MAX_MTU * 8)
+#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t))
+#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1)
-#define TIMER_INTERVAL 100 /* ms, for protocol periodic handlers */
+#define TIMER_INTERVAL 20 /* ms, for protocol periodic handlers */
/**
* sock_unix() - Create and bind AF_UNIX socket, add to epoll list
@@ -515,7 +517,7 @@ static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count)
return 1;
}
-static char tap_buf[ETH_MAX_MTU * TAP_NMSG];
+static char tap_buf[TAP_BUF_BYTES];
/**
* tap_handler() - Packet handler for tap file descriptor
@@ -525,32 +527,30 @@ static char tap_buf[ETH_MAX_MTU * TAP_NMSG];
*/
static int tap_handler(struct ctx *c)
{
- int msg_count = 0, same, rcv = 0, i = 0;
- struct tap_msg msg[UIO_MAXIOV];
- ssize_t n, rem, fill;
+ struct tap_msg msg[TAP_MSGS];
+ int msg_count, same, i;
struct ethhdr *eh;
char *p = tap_buf;
+ ssize_t n, rem;
- fill = ETH_MAX_MTU * (TAP_NMSG - 1);
+ while ((n = recv(c->fd_unix, p, TAP_BUF_FILL, MSG_DONTWAIT)) > 0) {
+ msg_count = 0;
- while ((n = recv(c->fd_unix, p, fill, MSG_DONTWAIT)) > 0) {
- fill -= n;
- while (n > 0) {
+ while (n > (ssize_t)sizeof(uint32_t)) {
ssize_t len = ntohl(*(uint32_t *)p);
p += sizeof(uint32_t);
n -= sizeof(uint32_t);
if (len < (ssize_t)sizeof(*eh))
- break;
+ return 0;
/* At most one packet might not fit in a single read */
if (len > n) {
- rem = recv(c->fd_unix, p + n, fill,
+ rem = recv(c->fd_unix, p + n, len - n,
MSG_DONTWAIT);
- rcv = errno;
- if (rem <= 0 || rem + n != len)
- break;
+ if ((n += rem) != len)
+ return 0;
}
msg[msg_count].start = p;
@@ -559,40 +559,49 @@ static int tap_handler(struct ctx *c)
n -= len;
p += len;
}
- }
-
- rcv = errno;
- while (i < msg_count) {
- eh = (struct ethhdr *)msg[i].start;
+ i = 0;
+ while (i < msg_count) {
+ eh = (struct ethhdr *)msg[i].start;
switch (ntohs(eh->h_proto)) {
- case ETH_P_ARP:
- tap4_handler(c, msg + i, 1);
- i++;
- break;
- case ETH_P_IP:
- for (same = 1; i + same < msg_count; same++) {
- eh = (struct ethhdr *)msg[i + same].start;
- if (ntohs(eh->h_proto) != ETH_P_IP)
- break;
- }
+ case ETH_P_ARP:
+ tap4_handler(c, msg + i, 1);
+ i++;
+ break;
+ case ETH_P_IP:
+ for (same = 1; i + same < msg_count &&
+ same < UIO_MAXIOV; same++) {
+ struct tap_msg *next = &msg[i + same];
+
+ eh = (struct ethhdr *)next->start;
+ if (ntohs(eh->h_proto) != ETH_P_IP)
+ break;
+ }
+
i += tap4_handler(c, msg + i, same);
- break;
- case ETH_P_IPV6:
- for (same = 1; i + same < msg_count; same++) {
- eh = (struct ethhdr *)msg[i + same].start;
- if (ntohs(eh->h_proto) != ETH_P_IPV6)
- break;
- }
+ break;
+ case ETH_P_IPV6:
+ for (same = 1; i + same < msg_count &&
+ same < UIO_MAXIOV; same++) {
+ struct tap_msg *next = &msg[i + same];
+
+ eh = (struct ethhdr *)next->start;
+ if (ntohs(eh->h_proto) != ETH_P_IPV6)
+ break;
+ }
+
i += tap6_handler(c, msg + i, same);
- break;
- default:
- i++;
- break;
+ break;
+ default:
+ i++;
+ break;
+ }
}
+
+ p = tap_buf;
}
- if (n >= 0 || rcv == EINTR || rcv == EAGAIN || rcv == EWOULDBLOCK)
+ if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
return 0;
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL);
@@ -614,8 +623,21 @@ static void sock_handler(struct ctx *c, int fd, uint32_t events)
sl = sizeof(so);
- if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &so, &sl))
+#define IN(x, proto) (x >= c->proto.fd_min && x <= c->proto.fd_max)
+
+ if (IN(fd, udp) && !IN(fd, icmp) && !IN(fd, tcp))
+ so = IPPROTO_UDP;
+ else if (IN(fd, tcp) && !IN(fd, icmp) && !IN(fd, udp))
+ so = IPPROTO_TCP;
+ else if (IN(fd, icmp) && !IN(fd, udp) && !IN(fd, tcp))
+ so = IPPROTO_ICMP; /* Fits ICMPv6 below, too */
+ else if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &so, &sl)) {
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL);
+ close(fd);
return;
+ }
+
+#undef IN
debug("%s: packet from socket %i", getprotobynumber(so)->p_name, fd);
@@ -771,7 +793,10 @@ loop:
for (i = 0; i < nfds; i++) {
if (events[i].data.fd == c.fd_unix) {
- if (tap_handler(&c))
+ if (events[i].events & EPOLLRDHUP ||
+ events[i].events & EPOLLHUP ||
+ events[i].events & EPOLLERR ||
+ tap_handler(&c))
goto listen;
} else {
sock_handler(&c, events[i].data.fd, events[i].events);
diff --git a/passt.h b/passt.h
index 87d91e5..d8b2dce 100644
--- a/passt.h
+++ b/passt.h
@@ -16,6 +16,7 @@ struct tap_msg {
#include "icmp.h"
#include "tcp.h"
+#include "udp.h"
/**
* struct ctx - Execution context
@@ -56,4 +57,5 @@ struct ctx {
struct icmp_ctx icmp;
struct tcp_ctx tcp;
+ struct tcp_ctx udp;
};
diff --git a/tap.c b/tap.c
index f8b8b4f..c11191c 100644
--- a/tap.c
+++ b/tap.c
@@ -37,9 +37,9 @@
int tap_send(int fd, void *data, size_t len, int flags)
{
uint32_t vnet_len = htonl(len);
- send(fd, &vnet_len, 4, 0);
+ send(fd, &vnet_len, 4, MSG_DONTWAIT | MSG_NOSIGNAL);
- return send(fd, data, len, flags);
+ return send(fd, data, len, flags | MSG_DONTWAIT | MSG_NOSIGNAL);
}
/**
diff --git a/tcp.c b/tcp.c
index 330e21a..3d47f35 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1003,6 +1003,11 @@ static void tcp_conn_from_sock(struct ctx *c, int fd)
if (s == -1)
return;
+ if (s < c->tcp.fd_min)
+ c->tcp.fd_min = s;
+ if (s > c->tcp.fd_max)
+ c->tcp.fd_max = s;
+
if (sa_l.ss_family == AF_INET) {
struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa_r;
@@ -1445,6 +1450,9 @@ int tcp_sock_init(struct ctx *c)
{
in_port_t port;
+ c->tcp.fd_min = INT_MAX;
+ c->tcp.fd_max = 0;
+
for (port = 0; port < (1 << 15) + (1 << 14); port++) {
if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, port) < 0)
return -1;
diff --git a/tcp.h b/tcp.h
index da081ad..2a79a75 100644
--- a/tcp.h
+++ b/tcp.h
@@ -12,9 +12,13 @@ void tcp_timer(struct ctx *c, struct timespec *ts);
/**
* struct tcp_ctx - Execution context for TCP routines
* @hash_secret: 128-bit secret for hash functions, ISN and hash table
+ * @fd_min: Lowest file descriptor number for TCP ever used
+ * @fd_max: Highest file descriptor number for TCP ever used
*/
struct tcp_ctx {
uint64_t hash_secret[2];
+ int fd_min;
+ int fd_max;
};
#endif /* TCP_H */
diff --git a/udp.c b/udp.c
index 7be88f6..edb73de 100644
--- a/udp.c
+++ b/udp.c
@@ -68,7 +68,8 @@ void udp_sock_handler(struct ctx *c, int s, uint32_t events)
struct udphdr *uh;
ssize_t n;
- (void)events;
+ if (events == EPOLLERR)
+ return;
n = recvfrom(s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh),
MSG_DONTWAIT, (struct sockaddr *)&sr, &slen);
@@ -179,7 +180,11 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
return count;
}
- return sendmmsg(s, mm, count, MSG_DONTWAIT | MSG_NOSIGNAL);
+ count = sendmmsg(s, mm, count, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ZEROCOPY);
+ if (count < 0)
+ return 1;
+
+ return count;
}
/**
@@ -191,13 +196,19 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
int udp_sock_init(struct ctx *c)
{
in_port_t port;
- int s;
+ int s, one = 1;
+
+ c->udp.fd_min = INT_MAX;
+ c->udp.fd_max = 0;
for (port = 0; port < USHRT_MAX; port++) {
if (c->v4) {
if ((s = sock_l4_add(c, 4, IPPROTO_UDP, port)) < 0)
return -1;
+ setsockopt(s, SOL_SOCKET, SO_ZEROCOPY,
+ &one, sizeof(one));
+
udp4_sock_port[port] = s;
}
@@ -205,6 +216,9 @@ int udp_sock_init(struct ctx *c)
if ((s = sock_l4_add(c, 6, IPPROTO_UDP, port)) < 0)
return -1;
+ setsockopt(s, SOL_SOCKET, SO_ZEROCOPY,
+ &one, sizeof(one));
+
udp6_sock_port[port] = s;
}
}
diff --git a/udp.h b/udp.h
index 0179fa2..b9ac2e0 100644
--- a/udp.h
+++ b/udp.h
@@ -1,4 +1,19 @@
+#ifndef UDP_H
+#define UDP_H
+
void udp_sock_handler(struct ctx *c, int s, uint32_t events);
int udp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count);
int udp_sock_init(struct ctx *c);
+
+/**
+ * struct udp_ctx - Execution context for UDP
+ * @fd_min: Lowest file descriptor number for UDP ever used
+ * @fd_max: Highest file descriptor number for UDP ever used
+ */
+struct udp_ctx {
+ int fd_min;
+ int fd_max;
+};
+
+#endif /* UDP_H */
diff --git a/util.c b/util.c
index 7a75e02..cc96a1a 100644
--- a/util.c
+++ b/util.c
@@ -189,6 +189,21 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port)
return -1;
}
+#define CHECK_SET_MIN_MAX(ipproto, proto_ctx, fd) \
+ if (proto == (ipproto)) { \
+ if (fd < c->proto_ctx.fd_min) \
+ c->proto_ctx.fd_min = (fd); \
+ if (fd > c->proto_ctx.fd_max) \
+ c->proto_ctx.fd_max = (fd); \
+ }
+
+ CHECK_SET_MIN_MAX(IPPROTO_ICMP, icmp, fd);
+ CHECK_SET_MIN_MAX(IPPROTO_ICMPV6, icmp, fd);
+ CHECK_SET_MIN_MAX(IPPROTO_TCP, tcp, fd);
+ CHECK_SET_MIN_MAX(IPPROTO_UDP, udp, fd);
+
+#undef CHECK_SET_MIN_MAX
+
if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
goto epoll_add;