From e07f539ae0aa3ad623c4e8afcaca26906fd1eb17 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 30 Apr 2021 14:52:18 +0200 Subject: udp, passt: Introduce socket packet buffer, avoid getsockname() for UDP This is in preparation for scatter-gather IO on the UDP receive path: save a getsockname() syscall by setting a flag if we get the numbering of all bound sockets in a strict sequence (expected, in practice) and repurpose the tap buffer to be also a socket receive buffer, passing it down to protocol handlers. Signed-off-by: Stefano Brivio --- icmp.c | 4 +++- icmp.h | 2 +- passt.c | 15 +++++++------ passt.h | 4 +++- tcp.c | 5 ++++- tcp.h | 2 +- udp.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----------- udp.h | 4 +++- util.c | 24 ++++++++++++++------- util.h | 1 + 10 files changed, 103 insertions(+), 34 deletions(-) diff --git a/icmp.c b/icmp.c index 493452d..2966cb3 100644 --- a/icmp.c +++ b/icmp.c @@ -38,9 +38,10 @@ * @c: Execution context * @s: File descriptor number for socket * @events: epoll events bitmap + * @pkt_buf: Buffer to receive packets, currently unused * @now: Current timestamp, unused */ -void icmp_sock_handler(struct ctx *c, int s, uint32_t events, +void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, struct timespec *now) { struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0, @@ -53,6 +54,7 @@ void icmp_sock_handler(struct ctx *c, int s, uint32_t events, ssize_t n; (void)events; + (void)pkt_buf; (void)now; n = recvfrom(s, buf, sizeof(buf), MSG_DONTWAIT, diff --git a/icmp.h b/icmp.h index 930097a..4152b7e 100644 --- a/icmp.h +++ b/icmp.h @@ -3,7 +3,7 @@ struct ctx; -void icmp_sock_handler(struct ctx *c, int s, uint32_t events, +void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, struct timespec *now); int icmp_tap_handler(struct ctx *c, int af, void *addr, struct tap_msg *msg, int count, struct timespec *now); diff --git a/passt.c b/passt.c index 765f153..4b0497c 100644 --- a/passt.c +++ b/passt.c @@ -60,6 +60,9 @@ #define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t)) #define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1) +#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, SOCK_BUF_BYTES) +static char pkt_buf [PKT_BUF_BYTES]; + #define TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL) /** @@ -530,8 +533,6 @@ static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count, return 1; } -static char tap_buf[TAP_BUF_BYTES]; - /** * tap_handler() - Packet handler for tap file descriptor * @c: Execution context @@ -544,7 +545,7 @@ static int tap_handler(struct ctx *c, struct timespec *now) struct tap_msg msg[TAP_MSGS]; int msg_count, same, i; struct ethhdr *eh; - char *p = tap_buf; + char *p = pkt_buf; ssize_t n, rem; while ((n = recv(c->fd_unix, p, TAP_BUF_FILL, MSG_DONTWAIT)) > 0) { @@ -615,7 +616,7 @@ static int tap_handler(struct ctx *c, struct timespec *now) } } - p = tap_buf; + p = pkt_buf; } if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) @@ -660,11 +661,11 @@ static void sock_handler(struct ctx *c, int s, uint32_t events, debug("%s: packet from socket %i", getprotobynumber(proto)->p_name, s); if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) - icmp_sock_handler(c, s, events, now); + icmp_sock_handler(c, s, events, pkt_buf, now); else if (proto == IPPROTO_TCP) - tcp_sock_handler(c, s, events, now); + tcp_sock_handler( c, s, events, pkt_buf, now); else if (proto == IPPROTO_UDP) - udp_sock_handler(c, s, events, now); + udp_sock_handler( c, s, events, pkt_buf, now); } /** diff --git a/passt.h b/passt.h index 009cedb..22a91d6 100644 --- a/passt.h +++ b/passt.h @@ -14,6 +14,8 @@ struct tap_msg { size_t l4_len; }; +#define SOCK_BUF_BYTES (ETH_MAX_MTU * 4) + #include "icmp.h" #include "tcp.h" #include "udp.h" @@ -62,5 +64,5 @@ struct ctx { struct icmp_ctx icmp; struct tcp_ctx tcp; - struct tcp_ctx udp; + struct udp_ctx udp; }; diff --git a/tcp.c b/tcp.c index df9508c..3c92d37 100644 --- a/tcp.c +++ b/tcp.c @@ -1402,14 +1402,17 @@ static void tcp_connect_finish(struct ctx *c, int s) * @c: Execution context * @s: File descriptor number for socket * @events: epoll events bitmap + * @pkt_buf: Buffer to receive packets, currently unused * @now: Current timestamp */ -void tcp_sock_handler(struct ctx *c, int s, uint32_t events, +void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, struct timespec *now) { socklen_t sl; int accept; + (void)pkt_buf; + if (tc[s].s == LAST_ACK) { tcp_send_to_tap(c, s, ACK, NULL, 0); tcp_close_and_epoll_del(c, s); diff --git a/tcp.h b/tcp.h index 163ba96..7435c41 100644 --- a/tcp.h +++ b/tcp.h @@ -5,7 +5,7 @@ struct ctx; -void tcp_sock_handler(struct ctx *c, int s, uint32_t events, +void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, struct timespec *now); int tcp_tap_handler(struct ctx *c, int af, void *addr, struct tap_msg *msg, int count, struct timespec *now); diff --git a/udp.c b/udp.c index 1fb8406..46a3302 100644 --- a/udp.c +++ b/udp.c @@ -124,25 +124,69 @@ static void udp_sock_handler_local(struct ctx *c, int af, void *sa, } } +/** + * udp_sock_name() - Get address family and port for bound UDP socket + * @c: Execution context + * @s: File descriptor number for socket + * @port: Local port, set on return, network order + * + * Return: address family, AF_INET or AF_INET6, negative error code on failure + */ +static int udp_sock_name(struct ctx *c, int s, in_port_t *port) +{ + if (!c->udp.fd_in_seq) { + struct sockaddr_storage sa; + socklen_t sl; + + sl = sizeof(sa); + if (getsockname(s, (struct sockaddr *)&sa, &sl)) + return -errno; + + if (sa.ss_family == AF_INET) { + *port = ((struct sockaddr_in *)&sa)->sin_port; + return AF_INET; + } + + if (sa.ss_family == AF_INET6) { + *port = ((struct sockaddr_in6 *)&sa)->sin6_port; + return AF_INET6; + } + + return -ENOTSUP; + } + + if (c->v4 && c->v6) { + *port = htons((s - c->udp.fd_min) / 2); + return ((s - c->udp.fd_min) % 2) ? AF_INET6 : AF_INET; + } + + *port = htons(s - c->udp.fd_min); + return c->v4 ? AF_INET : AF_INET6; +} + /** * udp_sock_handler() - Handle new data from socket * @c: Execution context * @s: File descriptor number for socket * @events: epoll events bitmap + * @pkt_buf: Buffer to receive packets, currently unused * @now: Current timestamp */ -void udp_sock_handler(struct ctx *c, int s, uint32_t events, +void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, struct timespec *now) { struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0, 0, 0, 0 } }; - struct sockaddr_storage sr, sl; + struct sockaddr_storage sr; socklen_t slen = sizeof(sr); char buf[USHRT_MAX]; struct udphdr *uh; ssize_t n; + int af; + + (void)pkt_buf; if (events == EPOLLERR) return; @@ -153,13 +197,10 @@ void udp_sock_handler(struct ctx *c, int s, uint32_t events, return; uh = (struct udphdr *)buf; + af = udp_sock_name(c, s, &uh->dest); - if (getsockname(s, (struct sockaddr *)&sl, &slen)) - return; - - if (sl.ss_family == AF_INET) { + if (af == AF_INET) { struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr; - struct sockaddr_in *sl4 = (struct sockaddr_in *)&sl; if (ntohl(sr4->sin_addr.s_addr) == INADDR_LOOPBACK || ntohl(sr4->sin_addr.s_addr) == INADDR_ANY) @@ -167,19 +208,16 @@ void udp_sock_handler(struct ctx *c, int s, uint32_t events, memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr)); uh->source = sr4->sin_port; - uh->dest = sl4->sin_port; uh->len = htons(n + sizeof(*uh)); tap_ip_send(c, &a6, IPPROTO_UDP, buf, n + sizeof(*uh)); - } else if (sl.ss_family == AF_INET6) { + } else if (af == AF_INET6) { struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr; - struct sockaddr_in6 *sl6 = (struct sockaddr_in6 *)&sl; if (IN6_IS_ADDR_LOOPBACK(&sr6->sin6_addr)) udp_sock_handler_local(c, AF_INET6, sr6, now); uh->source = sr6->sin6_port; - uh->dest = sl6->sin6_port; uh->len = htons(n + sizeof(*uh)); tap_ip_send(c, &sr6->sin6_addr, IPPROTO_UDP, @@ -363,17 +401,23 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, */ int udp_sock_init(struct ctx *c) { + int s, prev = -1; in_port_t port; - int s; c->udp.fd_min = INT_MAX; c->udp.fd_max = 0; + c->udp.fd_in_seq = 1; for (port = 0; port < USHRT_MAX; port++) { if (c->v4) { if ((s = sock_l4(c, AF_INET, IPPROTO_UDP, port)) < 0) return -1; + if (c->udp.fd_in_seq && prev != -1 && s != prev + 1) + c->udp.fd_in_seq = 0; + else + prev = s; + up4[port].s = s; } @@ -381,6 +425,11 @@ int udp_sock_init(struct ctx *c) if ((s = sock_l4(c, AF_INET6, IPPROTO_UDP, port)) < 0) return -1; + if (c->udp.fd_in_seq && prev != -1 && s != prev + 1) + c->udp.fd_in_seq = 0; + else + prev = s; + up6[port].s = s; } } @@ -424,7 +473,8 @@ static void udp_timer_one(struct ctx *c, int af, in_port_t p, if (s != -1) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); close(s); - sock_l4(c, af, IPPROTO_UDP, p); + if (sock_l4(c, af, IPPROTO_UDP, p) != s) + c->udp.fd_in_seq = 0; } } diff --git a/udp.h b/udp.h index 201714a..a126488 100644 --- a/udp.h +++ b/udp.h @@ -3,7 +3,7 @@ #define UDP_TIMER_INTERVAL 1000 /* ms */ -void udp_sock_handler(struct ctx *c, int s, uint32_t events, +void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, struct timespec *now); int udp_tap_handler(struct ctx *c, int af, void *addr, struct tap_msg *msg, int count, struct timespec *now); @@ -14,11 +14,13 @@ void udp_timer(struct ctx *c, struct timespec *ts); * struct udp_ctx - Execution context for UDP * @fd_min: Lowest file descriptor number for UDP ever used * @fd_max: Highest file descriptor number for UDP ever used + * @fd_in_seq: 1 if all socket numbers are in sequence, 0 otherwise * @timer_run: Timestamp of most recent timer run */ struct udp_ctx { int fd_min; int fd_max; + int fd_in_seq; struct timespec timer_run; }; diff --git a/util.c b/util.c index c48f2f6..9ccd9f6 100644 --- a/util.c +++ b/util.c @@ -189,11 +189,6 @@ int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port) return -1; } - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMP, icmp, fd); - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMPV6, icmp, fd); - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_TCP, tcp, fd); - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_UDP, udp, fd); - if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) goto epoll_add; @@ -207,16 +202,29 @@ int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port) setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)); } + CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMP, icmp, fd); + CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMPV6, icmp, fd); + CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_TCP, tcp, fd); + CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_UDP, udp, fd); + if (proto == IPPROTO_UDP && PORT_IS_EPHEMERAL(port)) goto epoll_add; if (bind(fd, sa, sl) < 0) { /* We'll fail to bind to low ports if we don't have enough * capabilities, and we'll fail to bind on already bound ports, - * this is fine. + * this is fine. If this isn't the socket with the lowest number + * for a given protocol, leave it open, to avoid unnecessary + * holes in the numbering. */ - close(fd); - return 0; + if ((proto == IPPROTO_TCP && fd == c->tcp.fd_min) || + (proto == IPPROTO_UDP && fd == c->udp.fd_min) || + ((proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) && + fd == c->icmp.fd_min)) { + close(fd); + return 0; + } + return fd; } if (proto == IPPROTO_TCP && listen(fd, 128) < 0) { diff --git a/util.h b/util.h index 0bb39a7..fe129ee 100644 --- a/util.h +++ b/util.h @@ -23,6 +23,7 @@ void debug(const char *format, ...); } while (0) #define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) #define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b)) -- cgit v1.2.3