From 38b50dba4704856194ac02b98e492d2349d64058 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 23 Apr 2021 22:22:37 +0200 Subject: passt: Spare some syscalls, add some optimisations from profiling Avoid a bunch of syscalls on forwarding paths by: - storing minimum and maximum file descriptor numbers for each protocol, fall back to SO_PROTOCOL query only on overlaps - allocating a larger receive buffer -- this can result in more coalesced packets than sendmmsg() can take (UIO_MAXIOV, i.e. 1024), so make sure we don't exceed that within a single call to protocol tap handlers - nesting the handling loop in tap_handler() in the receive loop, so that we have better chances of filling our receive buffer in fewer calls - skipping the recvfrom() in the UDP handler on EPOLLERR -- there's nothing to be done in that case and while at it: - restore the 20ms timer interval for periodic (TCP) events, I accidentally changed that to 100ms in an earlier commit - attempt using SO_ZEROCOPY for UDP -- if it's not available, sendmmsg() will succeed anyway - fix the handling of the status code from sendmmsg(), if it fails, we'll try to discard the first message, hence return 1 from the UDP handler Signed-off-by: Stefano Brivio --- icmp.c | 3 ++ icmp.h | 4 +++ passt.c | 113 +++++++++++++++++++++++++++++++++++++++------------------------- passt.h | 2 ++ tap.c | 4 +-- tcp.c | 8 +++++ tcp.h | 4 +++ udp.c | 20 ++++++++++-- udp.h | 15 +++++++++ util.c | 15 +++++++++ 10 files changed, 139 insertions(+), 49 deletions(-) diff --git a/icmp.c b/icmp.c index 9a3c740..dd4e3a4 100644 --- a/icmp.c +++ b/icmp.c @@ -135,6 +135,9 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr, */ int icmp_sock_init(struct ctx *c) { + c->icmp.fd_min = INT_MAX; + c->icmp.fd_max = 0; + if (c->v4 && (c->icmp.s4 = sock_l4_add(c, 4, IPPROTO_ICMP, 0)) < 0) return -1; diff --git a/icmp.h b/icmp.h index 9d26050..1941028 100644 --- a/icmp.h +++ b/icmp.h @@ -12,10 +12,14 @@ int icmp_sock_init(struct ctx *c); * struct icmp_ctx - Execution context for ICMP routines * @s4: ICMP socket number * @s6: ICMPv6 socket number + * @fd_min: Lowest file descriptor number for ICMP/ICMPv6 ever used + * @fd_max: Highest file descriptor number for ICMP/ICMPv6 ever used */ struct icmp_ctx { int s4; int s6; + int fd_min; + int fd_max; }; #endif /* ICMP_H */ diff --git a/passt.c b/passt.c index 9550c68..2fc88cf 100644 --- a/passt.c +++ b/passt.c @@ -57,9 +57,11 @@ #define EPOLL_EVENTS 10 -#define TAP_NMSG 32 /* maximum messages to buffer from tap */ +#define TAP_BUF_BYTES (ETH_MAX_MTU * 8) +#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t)) +#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1) -#define TIMER_INTERVAL 100 /* ms, for protocol periodic handlers */ +#define TIMER_INTERVAL 20 /* ms, for protocol periodic handlers */ /** * sock_unix() - Create and bind AF_UNIX socket, add to epoll list @@ -515,7 +517,7 @@ static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count) return 1; } -static char tap_buf[ETH_MAX_MTU * TAP_NMSG]; +static char tap_buf[TAP_BUF_BYTES]; /** * tap_handler() - Packet handler for tap file descriptor @@ -525,32 +527,30 @@ static char tap_buf[ETH_MAX_MTU * TAP_NMSG]; */ static int tap_handler(struct ctx *c) { - int msg_count = 0, same, rcv = 0, i = 0; - struct tap_msg msg[UIO_MAXIOV]; - ssize_t n, rem, fill; + struct tap_msg msg[TAP_MSGS]; + int msg_count, same, i; struct ethhdr *eh; char *p = tap_buf; + ssize_t n, rem; - fill = ETH_MAX_MTU * (TAP_NMSG - 1); + while ((n = recv(c->fd_unix, p, TAP_BUF_FILL, MSG_DONTWAIT)) > 0) { + msg_count = 0; - while ((n = recv(c->fd_unix, p, fill, MSG_DONTWAIT)) > 0) { - fill -= n; - while (n > 0) { + while (n > (ssize_t)sizeof(uint32_t)) { ssize_t len = ntohl(*(uint32_t *)p); p += sizeof(uint32_t); n -= sizeof(uint32_t); if (len < (ssize_t)sizeof(*eh)) - break; + return 0; /* At most one packet might not fit in a single read */ if (len > n) { - rem = recv(c->fd_unix, p + n, fill, + rem = recv(c->fd_unix, p + n, len - n, MSG_DONTWAIT); - rcv = errno; - if (rem <= 0 || rem + n != len) - break; + if ((n += rem) != len) + return 0; } msg[msg_count].start = p; @@ -559,40 +559,49 @@ static int tap_handler(struct ctx *c) n -= len; p += len; } - } - - rcv = errno; - while (i < msg_count) { - eh = (struct ethhdr *)msg[i].start; + i = 0; + while (i < msg_count) { + eh = (struct ethhdr *)msg[i].start; switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - tap4_handler(c, msg + i, 1); - i++; - break; - case ETH_P_IP: - for (same = 1; i + same < msg_count; same++) { - eh = (struct ethhdr *)msg[i + same].start; - if (ntohs(eh->h_proto) != ETH_P_IP) - break; - } + case ETH_P_ARP: + tap4_handler(c, msg + i, 1); + i++; + break; + case ETH_P_IP: + for (same = 1; i + same < msg_count && + same < UIO_MAXIOV; same++) { + struct tap_msg *next = &msg[i + same]; + + eh = (struct ethhdr *)next->start; + if (ntohs(eh->h_proto) != ETH_P_IP) + break; + } + i += tap4_handler(c, msg + i, same); - break; - case ETH_P_IPV6: - for (same = 1; i + same < msg_count; same++) { - eh = (struct ethhdr *)msg[i + same].start; - if (ntohs(eh->h_proto) != ETH_P_IPV6) - break; - } + break; + case ETH_P_IPV6: + for (same = 1; i + same < msg_count && + same < UIO_MAXIOV; same++) { + struct tap_msg *next = &msg[i + same]; + + eh = (struct ethhdr *)next->start; + if (ntohs(eh->h_proto) != ETH_P_IPV6) + break; + } + i += tap6_handler(c, msg + i, same); - break; - default: - i++; - break; + break; + default: + i++; + break; + } } + + p = tap_buf; } - if (n >= 0 || rcv == EINTR || rcv == EAGAIN || rcv == EWOULDBLOCK) + if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) return 0; epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL); @@ -614,8 +623,21 @@ static void sock_handler(struct ctx *c, int fd, uint32_t events) sl = sizeof(so); - if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &so, &sl)) +#define IN(x, proto) (x >= c->proto.fd_min && x <= c->proto.fd_max) + + if (IN(fd, udp) && !IN(fd, icmp) && !IN(fd, tcp)) + so = IPPROTO_UDP; + else if (IN(fd, tcp) && !IN(fd, icmp) && !IN(fd, udp)) + so = IPPROTO_TCP; + else if (IN(fd, icmp) && !IN(fd, udp) && !IN(fd, tcp)) + so = IPPROTO_ICMP; /* Fits ICMPv6 below, too */ + else if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &so, &sl)) { + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); + close(fd); return; + } + +#undef IN debug("%s: packet from socket %i", getprotobynumber(so)->p_name, fd); @@ -771,7 +793,10 @@ loop: for (i = 0; i < nfds; i++) { if (events[i].data.fd == c.fd_unix) { - if (tap_handler(&c)) + if (events[i].events & EPOLLRDHUP || + events[i].events & EPOLLHUP || + events[i].events & EPOLLERR || + tap_handler(&c)) goto listen; } else { sock_handler(&c, events[i].data.fd, events[i].events); diff --git a/passt.h b/passt.h index 87d91e5..d8b2dce 100644 --- a/passt.h +++ b/passt.h @@ -16,6 +16,7 @@ struct tap_msg { #include "icmp.h" #include "tcp.h" +#include "udp.h" /** * struct ctx - Execution context @@ -56,4 +57,5 @@ struct ctx { struct icmp_ctx icmp; struct tcp_ctx tcp; + struct tcp_ctx udp; }; diff --git a/tap.c b/tap.c index f8b8b4f..c11191c 100644 --- a/tap.c +++ b/tap.c @@ -37,9 +37,9 @@ int tap_send(int fd, void *data, size_t len, int flags) { uint32_t vnet_len = htonl(len); - send(fd, &vnet_len, 4, 0); + send(fd, &vnet_len, 4, MSG_DONTWAIT | MSG_NOSIGNAL); - return send(fd, data, len, flags); + return send(fd, data, len, flags | MSG_DONTWAIT | MSG_NOSIGNAL); } /** diff --git a/tcp.c b/tcp.c index 330e21a..3d47f35 100644 --- a/tcp.c +++ b/tcp.c @@ -1003,6 +1003,11 @@ static void tcp_conn_from_sock(struct ctx *c, int fd) if (s == -1) return; + if (s < c->tcp.fd_min) + c->tcp.fd_min = s; + if (s > c->tcp.fd_max) + c->tcp.fd_max = s; + if (sa_l.ss_family == AF_INET) { struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa_r; @@ -1445,6 +1450,9 @@ int tcp_sock_init(struct ctx *c) { in_port_t port; + c->tcp.fd_min = INT_MAX; + c->tcp.fd_max = 0; + for (port = 0; port < (1 << 15) + (1 << 14); port++) { if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, port) < 0) return -1; diff --git a/tcp.h b/tcp.h index da081ad..2a79a75 100644 --- a/tcp.h +++ b/tcp.h @@ -12,9 +12,13 @@ void tcp_timer(struct ctx *c, struct timespec *ts); /** * struct tcp_ctx - Execution context for TCP routines * @hash_secret: 128-bit secret for hash functions, ISN and hash table + * @fd_min: Lowest file descriptor number for TCP ever used + * @fd_max: Highest file descriptor number for TCP ever used */ struct tcp_ctx { uint64_t hash_secret[2]; + int fd_min; + int fd_max; }; #endif /* TCP_H */ diff --git a/udp.c b/udp.c index 7be88f6..edb73de 100644 --- a/udp.c +++ b/udp.c @@ -68,7 +68,8 @@ void udp_sock_handler(struct ctx *c, int s, uint32_t events) struct udphdr *uh; ssize_t n; - (void)events; + if (events == EPOLLERR) + return; n = recvfrom(s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh), MSG_DONTWAIT, (struct sockaddr *)&sr, &slen); @@ -179,7 +180,11 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, return count; } - return sendmmsg(s, mm, count, MSG_DONTWAIT | MSG_NOSIGNAL); + count = sendmmsg(s, mm, count, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ZEROCOPY); + if (count < 0) + return 1; + + return count; } /** @@ -191,13 +196,19 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, int udp_sock_init(struct ctx *c) { in_port_t port; - int s; + int s, one = 1; + + c->udp.fd_min = INT_MAX; + c->udp.fd_max = 0; for (port = 0; port < USHRT_MAX; port++) { if (c->v4) { if ((s = sock_l4_add(c, 4, IPPROTO_UDP, port)) < 0) return -1; + setsockopt(s, SOL_SOCKET, SO_ZEROCOPY, + &one, sizeof(one)); + udp4_sock_port[port] = s; } @@ -205,6 +216,9 @@ int udp_sock_init(struct ctx *c) if ((s = sock_l4_add(c, 6, IPPROTO_UDP, port)) < 0) return -1; + setsockopt(s, SOL_SOCKET, SO_ZEROCOPY, + &one, sizeof(one)); + udp6_sock_port[port] = s; } } diff --git a/udp.h b/udp.h index 0179fa2..b9ac2e0 100644 --- a/udp.h +++ b/udp.h @@ -1,4 +1,19 @@ +#ifndef UDP_H +#define UDP_H + void udp_sock_handler(struct ctx *c, int s, uint32_t events); int udp_tap_handler(struct ctx *c, int af, void *addr, struct tap_msg *msg, int count); int udp_sock_init(struct ctx *c); + +/** + * struct udp_ctx - Execution context for UDP + * @fd_min: Lowest file descriptor number for UDP ever used + * @fd_max: Highest file descriptor number for UDP ever used + */ +struct udp_ctx { + int fd_min; + int fd_max; +}; + +#endif /* UDP_H */ diff --git a/util.c b/util.c index 7a75e02..cc96a1a 100644 --- a/util.c +++ b/util.c @@ -189,6 +189,21 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port) return -1; } +#define CHECK_SET_MIN_MAX(ipproto, proto_ctx, fd) \ + if (proto == (ipproto)) { \ + if (fd < c->proto_ctx.fd_min) \ + c->proto_ctx.fd_min = (fd); \ + if (fd > c->proto_ctx.fd_max) \ + c->proto_ctx.fd_max = (fd); \ + } + + CHECK_SET_MIN_MAX(IPPROTO_ICMP, icmp, fd); + CHECK_SET_MIN_MAX(IPPROTO_ICMPV6, icmp, fd); + CHECK_SET_MIN_MAX(IPPROTO_TCP, tcp, fd); + CHECK_SET_MIN_MAX(IPPROTO_UDP, udp, fd); + +#undef CHECK_SET_MIN_MAX + if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) goto epoll_add; -- cgit v1.2.3