diff options
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | arp.c | 51 | ||||
-rw-r--r-- | arp.h | 2 | ||||
-rw-r--r-- | dhcp.c | 54 | ||||
-rw-r--r-- | dhcp.h | 2 | ||||
-rw-r--r-- | dhcpv6.c | 151 | ||||
-rw-r--r-- | dhcpv6.h | 3 | ||||
-rw-r--r-- | icmp.c | 28 | ||||
-rw-r--r-- | icmp.h | 4 | ||||
-rw-r--r-- | ndp.c | 59 | ||||
-rw-r--r-- | ndp.h | 3 | ||||
-rw-r--r-- | packet.c | 134 | ||||
-rw-r--r-- | packet.h | 81 | ||||
-rw-r--r-- | passt.h | 1 | ||||
-rw-r--r-- | tap.c | 331 | ||||
-rw-r--r-- | tcp.c | 442 | ||||
-rw-r--r-- | tcp.h | 16 | ||||
-rw-r--r-- | tcp_splice.c | 216 | ||||
-rw-r--r-- | tcp_splice.h | 2 | ||||
-rw-r--r-- | udp.c | 46 | ||||
-rw-r--r-- | udp.h | 6 | ||||
-rw-r--r-- | util.c | 60 | ||||
-rw-r--r-- | util.h | 5 |
23 files changed, 999 insertions, 700 deletions
@@ -291,7 +291,7 @@ speeding up local connections, and usually requiring NAT. _pasta_: * ✅ restrictive seccomp profiles (25 syscalls allowed for _passt_, 37 for _pasta_ on x86_64) * ✅ static checkers in continuous integration (clang-tidy, cppcheck) -* 🛠️ clearly defined packet abstraction +* ✅️ clearly defined boundary-checked packet abstraction * 🛠️ ~5 000 LoC target * ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests * ⌚ stricter [synflood protection](https://bugs.passt.top/show_bug.cgi?id=10) @@ -30,53 +30,56 @@ #include "tap.h" /** - * arp() - Check if this is an ARP message, reply as needed + * arp() - Check if this is a supported ARP message, reply as needed * @c: Execution context - * @len: Total L2 packet length - * @eh: Packet buffer, Ethernet header + * @p: Packet pool, single packet with Ethernet buffer * - * Return: 0 if it's not an ARP message, 1 if handled, -1 on failure + * Return: 1 if handled, -1 on failure */ -int arp(struct ctx *c, struct ethhdr *eh, size_t len) +int arp(struct ctx *c, struct pool *p) { - struct arphdr *ah = (struct arphdr *)(eh + 1); - struct arpmsg *am = (struct arpmsg *)(ah + 1); unsigned char swap[4]; + struct ethhdr *eh; + struct arphdr *ah; + struct arpmsg *am; + size_t len; - if (eh->h_proto != htons(ETH_P_ARP)) - return 0; + eh = packet_get(p, 0, 0, sizeof(*eh), NULL); + ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL); + am = packet_get(p, 0, sizeof(*eh) + sizeof(*ah), sizeof(*am), NULL); - if (len < sizeof(*eh) + sizeof(*ah) + sizeof(*am)) + if (!eh || !ah || !am) return -1; - if (ah->ar_hrd != htons(ARPHRD_ETHER) || - ah->ar_pro != htons(ETH_P_IP) || - ah->ar_hln != ETH_ALEN || ah->ar_pln != 4 || - ah->ar_op != htons(ARPOP_REQUEST)) + if (ah->ar_hrd != htons(ARPHRD_ETHER) || + ah->ar_pro != htons(ETH_P_IP) || + ah->ar_hln != ETH_ALEN || + ah->ar_pln != 4 || + ah->ar_op != htons(ARPOP_REQUEST)) return 1; /* Discard announcements (but not 0.0.0.0 "probes"): we might have the * same IP address, hide that. */ - if (memcmp(am->sip, (unsigned char[4]){ 0, 0, 0, 0 }, 4) && - !memcmp(am->sip, am->tip, 4)) + if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) && + !memcmp(am->sip, am->tip, sizeof(am->sip))) return 1; /* Don't resolve our own address, either. */ - if (!memcmp(am->tip, &c->addr4, 4)) + if (!memcmp(am->tip, &c->addr4, sizeof(am->tip))) return 1; ah->ar_op = htons(ARPOP_REPLY); - memcpy(am->tha, am->sha, ETH_ALEN); - memcpy(am->sha, c->mac, ETH_ALEN); + memcpy(am->tha, am->sha, sizeof(am->tha)); + memcpy(am->sha, c->mac, sizeof(am->sha)); - memcpy(swap, am->tip, 4); - memcpy(am->tip, am->sip, 4); - memcpy(am->sip, swap, 4); + memcpy(swap, am->tip, sizeof(am->tip)); + memcpy(am->tip, am->sip, sizeof(am->tip)); + memcpy(am->sip, swap, sizeof(am->sip)); len = sizeof(*eh) + sizeof(*ah) + sizeof(*am); - memcpy(eh->h_dest, eh->h_source, ETH_ALEN); - memcpy(eh->h_source, c->mac, ETH_ALEN); + memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); if (tap_send(c, eh, len, 0) < 0) perror("ARP: send"); @@ -17,4 +17,4 @@ struct arpmsg { unsigned char tip[4]; } __attribute__((__packed__)); -int arp(struct ctx *c, struct ethhdr *eh, size_t len); +int arp(struct ctx *c, struct pool *p); @@ -22,9 +22,11 @@ #include <stdint.h> #include <unistd.h> #include <string.h> +#include <limits.h> #include "util.h" #include "checksum.h" +#include "packet.h" #include "passt.h" #include "tap.h" #include "dhcp.h" @@ -257,27 +259,32 @@ static void opt_set_dns_search(struct ctx *c, size_t max_len) /** * dhcp() - Check if this is a DHCP message, reply as needed * @c: Execution context - * @len: Total L2 packet length - * @eh: Packet buffer, Ethernet header + * @p: Packet pool, single packet with Ethernet buffer * * Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure */ -int dhcp(struct ctx *c, struct ethhdr *eh, size_t len) +int dhcp(struct ctx *c, struct pool *p) { - struct iphdr *iph = (struct iphdr *)(eh + 1); - size_t mlen, olen; + size_t mlen, len, offset = 0, opt_len, opt_off = 0; + struct ethhdr *eh; + struct iphdr *iph; struct udphdr *uh; unsigned int i; struct msg *m; - if (len < sizeof(*eh) + sizeof(*iph)) - return 0; + eh = packet_get(p, 0, offset, sizeof(*eh), NULL); + offset += sizeof(*eh); - if (len < sizeof(*eh) + (long)iph->ihl * 4 + sizeof(*uh)) - return 0; + iph = packet_get(p, 0, offset, sizeof(*iph), NULL); + if (!eh || !iph) + return -1; - uh = (struct udphdr *)((char *)iph + (long)(iph->ihl * 4)); - m = (struct msg *)(uh + 1); + offset += iph->ihl * 4UL; + uh = packet_get(p, 0, offset, sizeof(*uh), &mlen); + offset += sizeof(*uh); + + if (!uh) + return -1; if (uh->dest != htons(67)) return 0; @@ -285,18 +292,29 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len) if (c->no_dhcp) return 1; - mlen = len - sizeof(*eh) - (long)iph->ihl * 4 - sizeof(*uh); - if (mlen != ntohs(uh->len) - sizeof(*uh) || - mlen < offsetof(struct msg, o) || + m = packet_get(p, 0, offset, offsetof(struct msg, o), &opt_len); + if (!m || + mlen != ntohs(uh->len) - sizeof(*uh) || + mlen < offsetof(struct msg, o) || m->op != BOOTREQUEST) return -1; - olen = mlen - offsetof(struct msg, o); - for (i = 0; i + 2 < olen; i += m->o[i + 1] + 2) { - if (m->o[i + 1] + i + 2 >= olen) + offset += offsetof(struct msg, o); + + while (opt_off + 2 < opt_len) { + uint8_t *olen, *type, *val; + + type = packet_get(p, 0, offset + opt_off, 1, NULL); + olen = packet_get(p, 0, offset + opt_off + 1, 1, NULL); + if (!type || !olen) + return -1; + + val = packet_get(p, 0, offset + opt_off + 2, *olen, NULL); + if (!val) return -1; - memcpy(&opts[m->o[i]].c, &m->o[i + 2], m->o[i + 1]); + memcpy(&opts[*type].c, val, *olen); + opt_off += *olen + 2; } if (opts[53].c[0] == DHCPDISCOVER) { @@ -3,5 +3,5 @@ * Author: Stefano Brivio <sbrivio@redhat.com> */ -int dhcp(struct ctx *c, struct ethhdr *eh, size_t len); +int dhcp(struct ctx *c, struct pool *p); void dhcp_init(void); @@ -24,7 +24,9 @@ #include <unistd.h> #include <string.h> #include <time.h> +#include <limits.h> +#include "packet.h" #include "util.h" #include "passt.h" #include "tap.h" @@ -69,6 +71,8 @@ struct opt_hdr { #endif #define OPT_SIZE(x) OPT_SIZE_CONV(sizeof(struct opt_##x) - \ sizeof(struct opt_hdr)) +#define OPT_VSIZE(x) (sizeof(struct opt_##x) - \ + sizeof(struct opt_hdr)) /** * struct opt_client_id - DHCPv6 Client Identifier option @@ -265,10 +269,10 @@ static const struct opt_status_code sc_not_on_link = { /** * struct resp_not_on_link_t - NotOnLink error (mandated by RFC 8415, 18.3.2.) - * @uh: UDP header - * @hdr: DHCP message header - * @server_id: Server Identifier option - * @var: Payload: IA_NA from client, status code, client ID + * @uh: UDP header + * @hdr: DHCP message header + * @server_id: Server Identifier option + * @var: Payload: IA_NA from client, status code, client ID */ static struct resp_not_on_link_t { struct udphdr uh; @@ -287,26 +291,30 @@ static struct resp_not_on_link_t { /** * dhcpv6_opt() - Get option from DHCPv6 message - * @o: First option header to check - * @type: Option type to look up, network order - * @len: Remaining length, host order, modified on return + * @p: Packet pool, single packet with UDP header + * @offset: Offset to look at, 0: end of header, set to option start + * @type: Option type to look up, network order * * Return: pointer to option header, or NULL on malformed or missing option */ -static struct opt_hdr *dhcpv6_opt(struct opt_hdr *o, uint16_t type, size_t *len) +static struct opt_hdr *dhcpv6_opt(struct pool *p, size_t *offset, uint16_t type) { - while (*len >= sizeof(struct opt_hdr)) { - unsigned int opt_len = ntohs(o->l) + sizeof(struct opt_hdr); + struct opt_hdr *o; + size_t left; - if (opt_len > *len) - return NULL; + if (!*offset) + *offset = sizeof(struct udphdr) + sizeof(struct msg_hdr); + + while ((o = packet_get_try(p, 0, *offset, sizeof(*o), &left))) { + unsigned int opt_len = ntohs(o->l) + sizeof(*o); - *len -= opt_len; + if (ntohs(o->l) > left) + return NULL; if (o->t == type) return o; - o = (struct opt_hdr *)((uint8_t *)o + opt_len); + *offset += opt_len; } return NULL; @@ -314,61 +322,45 @@ static struct opt_hdr *dhcpv6_opt(struct opt_hdr *o, uint16_t type, size_t *len) /** * dhcpv6_ia_notonlink() - Check if any IA contains non-appropriate addresses - * @o: First option header to check for IAs - * @rem_len: Remaining message length, host order - * @addr: Address we want to lease to the client + * @o: First option header to check for IAs + * @rem_len: Remaining message length, host order + * @addr: Address we want to lease to the client * * Return: pointer to non-appropriate IA_NA or IA_TA, if any, NULL otherwise */ -static struct opt_hdr *dhcpv6_ia_notonlink(struct opt_hdr *o, size_t rem_len, - struct in6_addr *addr) +static struct opt_hdr *dhcpv6_ia_notonlink(struct pool *p, struct in6_addr *la) { - struct opt_hdr *ia, *ia_addr; char buf[INET6_ADDRSTRLEN]; struct in6_addr *req_addr; - size_t len; + struct opt_hdr *ia, *h; + size_t offset; int ia_type; ia_type = OPT_IA_NA; ia_ta: - len = rem_len; - ia = o; - - while ((ia = dhcpv6_opt(ia, ia_type, &len))) { - size_t ia_len = ntohs(ia->l); - - if (ia_type == OPT_IA_NA) { - struct opt_ia_na *subopt = (struct opt_ia_na *)ia + 1; - - ia_addr = (struct opt_hdr *)subopt; - } else if (ia_type == OPT_IA_TA) { - struct opt_ia_ta *subopt = (struct opt_ia_ta *)ia + 1; - - ia_addr = (struct opt_hdr *)subopt; - } + offset = 0; + while ((ia = dhcpv6_opt(p, &offset, ia_type))) { + if (ntohs(ia->l) < OPT_VSIZE(ia_na)) + return NULL; - ia_len -= sizeof(struct opt_ia_na) - sizeof(struct opt_hdr); + offset += sizeof(struct opt_ia_na); - while ((ia_addr = dhcpv6_opt(ia_addr, OPT_IAAADR, &ia_len))) { - struct opt_ia_addr *next; + while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) { + struct opt_ia_addr *opt_addr = (struct opt_ia_addr *)h; - req_addr = (struct in6_addr *)(ia_addr + 1); + if (ntohs(h->l) != OPT_VSIZE(ia_addr)) + return NULL; - if (!IN6_ARE_ADDR_EQUAL(addr, req_addr)) { + req_addr = &opt_addr->addr; + if (!IN6_ARE_ADDR_EQUAL(la, req_addr)) { info("DHCPv6: requested address %s not on link", inet_ntop(AF_INET6, req_addr, buf, sizeof(buf))); return ia; } - next = (struct opt_ia_addr *)ia_addr + 1; - ia_addr = (struct opt_hdr *)next; + offset += sizeof(struct opt_ia_addr); } - - if (!ia_addr) - break; - - ia = ia_addr; } if (ia_type == OPT_IA_NA) { @@ -449,59 +441,58 @@ search: /** * dhcpv6() - Check if this is a DHCPv6 message, reply as needed * @c: Execution context - * @eh: Packet buffer, Ethernet header - * @len: Total L2 packet length + * @p: Packet pool, single packet starting from UDP header + * @saddr: Source IPv6 address of original message + * @daddr: Destination IPv6 address of original message * * Return: 0 if it's not a DHCPv6 message, 1 if handled, -1 on failure */ -int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len) +int dhcpv6(struct ctx *c, struct pool *p, + const struct in6_addr *saddr, const struct in6_addr *daddr) { - struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); struct opt_hdr *ia, *bad_ia, *client_id, *server_id; struct in6_addr *src; struct msg_hdr *mh; struct udphdr *uh; - uint8_t proto; - size_t mlen; - size_t n; + size_t mlen, n; - uh = (struct udphdr *)ipv6_l4hdr(ip6h, &proto); - if (!uh || proto != IPPROTO_UDP || uh->dest != htons(547)) + uh = packet_get(p, 0, 0, sizeof(*uh), &mlen); + if (!uh) + return -1; + + if (uh->dest != htons(547)) return 0; if (c->no_dhcpv6) return 1; - if (!IN6_IS_ADDR_MULTICAST(&ip6h->daddr)) + if (!IN6_IS_ADDR_MULTICAST(daddr)) return -1; - mlen = len - ((intptr_t)uh - (intptr_t)eh) - sizeof(*uh); - - if (mlen != ntohs(uh->len) - sizeof(*uh) || - mlen < sizeof(struct msg_hdr)) + if (mlen + sizeof(*uh) != ntohs(uh->len) || mlen < sizeof(*mh)) return -1; - c->addr6_ll_seen = ip6h->saddr; + c->addr6_ll_seen = *saddr; if (IN6_IS_ADDR_LINKLOCAL(&c->gw6)) src = &c->gw6; else src = &c->addr6_ll; - mh = (struct msg_hdr *)(uh + 1); - mlen -= sizeof(struct msg_hdr); + mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL); + if (!mh) + return -1; - n = mlen; - client_id = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_CLIENTID, &n); - if (!client_id || ntohs(client_id->l) > ntohs(OPT_SIZE(client_id))) + client_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_CLIENTID); + if (!client_id || ntohs(client_id->l) > OPT_VSIZE(client_id)) return -1; - n = mlen; - server_id = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_SERVERID, &n); + server_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_SERVERID); + if (server_id && ntohs(server_id->l) != OPT_VSIZE(server_id)) + return -1; - n = mlen; - ia = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_IA_NA, &n); - if (ia && ntohs(ia->l) < ntohs(OPT_SIZE(ia_na))) + ia = dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_NA); + if (ia && ntohs(ia->l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta))) return -1; resp.hdr.type = TYPE_REPLY; @@ -516,18 +507,17 @@ int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len) if (mh->type == TYPE_CONFIRM && server_id) return -1; - if ((bad_ia = dhcpv6_ia_notonlink((struct opt_hdr *)(mh + 1), - mlen, &c->addr6))) { + if ((bad_ia = dhcpv6_ia_notonlink(p, &c->addr6))) { info("DHCPv6: received CONFIRM with inappropriate IA," " sending NotOnLink status in REPLY"); - n = ntohs(bad_ia->l) + sizeof(struct opt_hdr); - bad_ia->l = htons(n - sizeof(struct opt_hdr) + + bad_ia->l = htons(OPT_VSIZE(ia_na) + sizeof(sc_not_on_link)); + n = sizeof(struct opt_ia_na); memcpy(resp_not_on_link.var, bad_ia, n); - memcpy(resp_not_on_link.var + n, &sc_not_on_link, - sizeof(sc_not_on_link)); + memcpy(resp_not_on_link.var + n, + &sc_not_on_link, sizeof(sc_not_on_link)); n += sizeof(sc_not_on_link); memcpy(resp_not_on_link.var + n, client_id, @@ -552,8 +542,7 @@ int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len) memcmp(&resp.server_id, server_id, sizeof(resp.server_id))) return -1; - n = mlen; - if (ia || dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_IA_TA, &n)) + if (ia || dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_TA)) return -1; info("DHCPv6: received INFORMATION_REQUEST, sending REPLY"); @@ -3,5 +3,6 @@ * Author: Stefano Brivio <sbrivio@redhat.com> */ -int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len); +int dhcpv6(struct ctx *c, struct pool *p, + struct in6_addr *saddr, struct in6_addr *daddr); void dhcpv6_init(struct ctx *c); @@ -31,9 +31,11 @@ #include <linux/icmpv6.h> +#include "packet.h" #include "util.h" #include "passt.h" #include "tap.h" +#include "packet.h" #include "icmp.h" #define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ @@ -134,17 +136,15 @@ void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, * icmp_tap_handler() - Handle packets from tap * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @ - * @msg: Input message - * @count: Message count (always 1 for ICMP) + * @p: Packet pool, single packet with ICMP/ICMPv6 header * @now: Current timestamp * * Return: count of consumed packets (always 1, even if malformed) */ -int icmp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now) +int icmp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now) { - (void)count; + size_t plen; if (af == AF_INET) { union icmp_epoll_ref iref = { .icmp.v6 = 0 }; @@ -155,9 +155,8 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr, struct icmphdr *ih; int id, s; - ih = (struct icmphdr *)(pkt_buf + msg[0].pkt_buf_offset); - - if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO) + ih = packet_get(p, 0, 0, sizeof(*ih), &plen); + if (!ih) return 1; sa.sin_port = ih->un.echo.id; @@ -175,7 +174,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr, bitmap_set(icmp_act[V4], id); sa.sin_addr = *(struct in_addr *)addr; - sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL, + sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, (struct sockaddr *)&sa, sizeof(sa)); } else if (af == AF_INET6) { union icmp_epoll_ref iref = { .icmp.v6 = 1 }; @@ -186,10 +185,11 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr, struct icmp6hdr *ih; int id, s; - ih = (struct icmp6hdr *)(pkt_buf + msg[0].pkt_buf_offset); + ih = packet_get(p, 0, 0, sizeof(struct icmp6hdr), &plen); + if (!ih) + return 1; - if (msg[0].l4_len < sizeof(*ih) || - (ih->icmp6_type != 128 && ih->icmp6_type != 129)) + if (ih->icmp6_type != 128 && ih->icmp6_type != 129) return 1; sa.sin6_port = ih->icmp6_identifier; @@ -207,7 +207,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr, bitmap_set(icmp_act[V6], id); sa.sin6_addr = *(struct in6_addr *)addr; - sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL, + sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, (struct sockaddr *)&sa, sizeof(sa)); } @@ -12,8 +12,8 @@ struct ctx; void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now); -int icmp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now); +int icmp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now); void icmp_timer(struct ctx *c, struct timespec *ts); /** @@ -39,28 +39,23 @@ /** * ndp() - Check for NDP solicitations, reply as needed * @c: Execution context - * @len: Total L2 packet length - * @eh: Packet buffer, Ethernet header + * @ih: ICMPv6 header + * @eh_source: Source Ethernet address + * @saddr Source IPv6 address * * Return: 0 if not handled here, 1 if handled, -1 on failure */ -int ndp(struct ctx *c, struct ethhdr *eh, size_t len) +int ndp(struct ctx *c, struct icmp6hdr *ih, unsigned char *eh_source, + struct in6_addr *saddr) { - struct ethhdr *ehr; - struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1), *ip6hr; - struct icmp6hdr *ih, *ihr; char buf[BUFSIZ] = { 0 }; - uint8_t proto, *p; - - if (len < sizeof(*ehr) + sizeof(*ip6h) + sizeof(*ih)) - return 0; - - ih = (struct icmp6hdr *)ipv6_l4hdr(ip6h, &proto); - if (!ih) - return -1; + struct ipv6hdr *ip6hr; + struct icmp6hdr *ihr; + struct ethhdr *ehr; + unsigned char *p; + size_t len; - if (proto != IPPROTO_ICMPV6 || - ih->icmp6_type < RS || ih->icmp6_type > NA) + if (ih->icmp6_type < RS || ih->icmp6_type > NA) return 0; if (c->no_ndp) @@ -71,11 +66,7 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len) ihr = (struct icmp6hdr *)(ip6hr + 1); if (ih->icmp6_type == NS) { - if (len < sizeof(*ehr) + sizeof(*ip6h) + sizeof(*ih) + - sizeof(struct in6_addr)) - return -1; - - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->saddr)) + if (IN6_IS_ADDR_UNSPECIFIED(saddr)) return 1; info("NDP: received NS, sending NA"); @@ -132,10 +123,10 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len) for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->dns6[n]); n++); if (n) { - *p++ = 25; /* RDNSS */ - *p++ = 1 + 2 * n; /* length */ - p += 2; /* reserved */ - *(uint32_t *)p = htonl(60); /* lifetime */ + *p++ = 25; /* RDNSS */ + *p++ = 1 + 2 * n; /* length */ + p += 2; /* reserved */ + *(uint32_t *)p = htonl(60); /* lifetime */ p += 4; for (i = 0; i < n; i++) { @@ -148,10 +139,10 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len) } if (!c->no_dhcp_dns_search && dns_s_len) { - *p++ = 31; /* DNSSL */ - *p++ = (len + 8 - 1) / 8 + 1; /* length */ - p += 2; /* reserved */ - *(uint32_t *)p = htonl(60); /* lifetime */ + *p++ = 31; /* DNSSL */ + *p++ = (dns_s_len + 8 - 1) / 8 + 1; /* length */ + p += 2; /* reserved */ + *(uint32_t *)p = htonl(60); /* lifetime */ p += 4; for (i = 0; i < n; i++) { @@ -185,12 +176,12 @@ dns_done: len = (uintptr_t)p - (uintptr_t)ihr - sizeof(*ihr); - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) - c->addr6_ll_seen = ip6h->saddr; + if (IN6_IS_ADDR_LINKLOCAL(saddr)) + c->addr6_ll_seen = *saddr; else - c->addr6_seen = ip6h->saddr; + c->addr6_seen = *saddr; - ip6hr->daddr = ip6h->saddr; + ip6hr->daddr = *saddr; if (IN6_IS_ADDR_LINKLOCAL(&c->gw6)) ip6hr->saddr = c->gw6; else @@ -207,7 +198,7 @@ dns_done: ip6hr->hop_limit = 255; len += sizeof(*ehr) + sizeof(*ip6hr) + sizeof(*ihr); - memcpy(ehr->h_dest, eh->h_source, ETH_ALEN); + memcpy(ehr->h_dest, eh_source, ETH_ALEN); memcpy(ehr->h_source, c->mac, ETH_ALEN); ehr->h_proto = htons(ETH_P_IPV6); @@ -3,4 +3,5 @@ * Author: Stefano Brivio <sbrivio@redhat.com> */ -int ndp(struct ctx *c, struct ethhdr *eh, size_t len); +int ndp(struct ctx *c, struct icmp6hdr *ih, unsigned char *eh_source, + struct in6_addr *saddr); diff --git a/packet.c b/packet.c new file mode 100644 index 0000000..0b7cb20 --- /dev/null +++ b/packet.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * packet.c - Packet abstraction: add packets to pool, flush, get packet data + * + * Copyright (c) 2020-2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <limits.h> +#include <stddef.h> +#include <stdint.h> + +#include <netinet/ip6.h> + +#include "packet.h" +#include "util.h" + +/** + * packet_add_do() - Add data as packet descriptor to given pool + * @p: Existing pool + * @len: Length of new descriptor + * @start: Start of data + * @func: For tracing: name of calling function, NULL means no trace() + * @line: For tracing: caller line of function call + */ +void packet_add_do(struct pool *p, size_t len, const char *start, + const char *func, const int line) +{ + size_t index = p->count; + + if (index >= p->size) { + trace("add packet index %lu to pool with size %lu, %s:%i", + index, p->size, func, line); + return; + } + + if (start < p->buf) { + trace("add packet start %p before buffer start %p, %s:%i", + start, p->buf, func, line); + return; + } + + if (start + len > p->buf + p->buf_size) { + trace("add packet start %p, length: %lu, buffer end %p, %s:%i", + start, len, p->buf + p->buf_size, func, line); + return; + } + + if (len > UINT16_MAX) { + trace("add packet length %lu, %s:%i", func, line); + return; + } + + if ((unsigned int)((intptr_t)start - (intptr_t)p->buf) > UINT32_MAX) { + trace("add packet start %p, buffer start %lu, %s:%i", + start, p->buf, func, line); + return; + } + + p->pkt[index].offset = start - p->buf; + p->pkt[index].len = len; + + p->count++; +} + +/** + * packet_get_do() - Get data range from packet descriptor from given pool + * @p: Packet pool + * @index: Index of packet descriptor in pool + * @offset: Offset of data range in packet descriptor + * @len: Length of desired data range + * @left: Length of available data after range, set on return, can be NULL + * @func: For tracing: name of calling function, NULL means no trace() + * @line: For tracing: caller line of function call + * + * Return: pointer to start of data range, NULL on invalid range or descriptor + */ +void *packet_get_do(struct pool *p, size_t index, size_t offset, size_t len, + size_t *left, const char *func, const int line) +{ + if (index > p->size || index > p->count) { + if (func) { + trace("packet %lu from pool size: %lu, count: %lu, " + "%s:%i", index, p->size, p->count, func, line); + } + return NULL; + } + + if (len > UINT16_MAX || len + offset > UINT32_MAX) { + if (func) { + trace("packet data length %lu, offset %lu, %s:%i", + len, offset, func, line); + } + return NULL; + } + + if (p->pkt[index].offset + len + offset > p->buf_size) { + if (func) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", p->pkt[index].offset + len + offset, + p->buf_size, func, line); + } + return NULL; + } + + if (len + offset > p->pkt[index].len) { + if (func) { + trace("data length %lu, offset %lu from length %lu, " + "%s:%i", len, offset, p->pkt[index].len, + func, line); + } + return NULL; + } + + if (left) + *left = p->pkt[index].len - offset - len; + + return p->buf + p->pkt[index].offset + offset; +} + +/** + * pool_flush() - Flush a packet pool + * @p: Pointer to packet pool + */ +void pool_flush(struct pool *p) +{ + p->count = 0; +} diff --git a/packet.h b/packet.h new file mode 100644 index 0000000..0ef8849 --- /dev/null +++ b/packet.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: AGPL-3.0-or-later + * Copyright (c) 2022 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef PACKET_H +#define PACKET_H + +/** + * struct desc - Generic offset-based descriptor within buffer + * @offset: Offset of descriptor relative to buffer start, 32-bit limit + * @len: Length of descriptor, host order, 16-bit limit + */ +struct desc { + uint32_t offset; + uint16_t len; +}; + +/** + * struct pool - Generic pool of packets stored in a buffer + * @buf: Buffer storing packet descriptors + * @buf_size: Total size of buffer + * @size: Number of usable descriptors for the pool + * @count: Number of used descriptors for the pool + * @pkt: Descriptors: see macros below + */ +struct pool { + char *buf; + size_t buf_size; + size_t size; + size_t count; + struct desc pkt[1]; +}; + +void packet_add_do(struct pool *p, size_t len, const char *start, + const char *func, const int line); +void *packet_get_do(struct pool *p, size_t index, size_t offset, size_t len, + size_t *left, const char *func, const int line); +void pool_flush(struct pool *p); + +#define packet_add(p, len, start) \ + packet_add_do(p, len, start, __func__, __LINE__); + +#define packet_get(p, index, offset, len, left) \ + packet_get_do(p, index, offset, len, left, __func__, __LINE__); + +#define packet_get_try(p, index, offset, len, left) \ + packet_get_do(p, index, offset, len, left, NULL, 0) + +#define PACKET_POOL_DECL(_name, _size, _buf) \ +struct _name ## _t { \ + char *buf; \ + size_t buf_size; \ + size_t size; \ + size_t count; \ + struct desc pkt[_size]; \ +} + +#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \ +{ \ + .buf_size = _buf_size, \ + .buf = _buf, \ + .size = _size, \ +} + +#define PACKET_POOL(name, size, buf, buf_size) \ + PACKET_POOL_DECL(name, size, buf) name = \ + PACKET_POOL_INIT_NOCAST(size, buf, buf_size) + +#define PACKET_INIT(name, size, buf, buf_size) \ + (struct name ## _t) PACKET_POOL_INIT_NOCAST(size, buf, buf_size) + +#define PACKET_POOL_NOINIT(name, size, buf) \ + PACKET_POOL_DECL(name, size, buf) name ## _storage; \ + static struct pool *name = (struct pool *)&name ## _storage + +#define PACKET_POOL_P(name, size, buf, buf_size) \ + PACKET_POOL(name ## _storage, size, buf, buf_size); \ + struct pool *name = (struct pool *)&name ## _storage + +#endif /* PACKET_H */ @@ -28,6 +28,7 @@ struct tap_l4_msg { union epoll_ref; +#include "packet.h" #include "icmp.h" #include "tcp.h" #include "udp.h" @@ -51,10 +51,11 @@ #include "pcap.h" #include "netlink.h" #include "pasta.h" +#include "packet.h" /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ -static struct tap_msg seq4[TAP_MSGS]; -static struct tap_msg seq6[TAP_MSGS]; +static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); +static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); /** * tap_send() - Send frame, with qemu socket header if needed @@ -202,6 +203,8 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, } } +PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); + /** * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4 * @msgs: Count of messages in sequence @@ -212,8 +215,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, * @daddr: Destination address * @msg: Array of messages that can be handled in a single call */ -static struct tap_l4_seq4 { - uint16_t msgs; +static struct tap4_l4_t { uint8_t protocol; uint16_t source; @@ -222,8 +224,8 @@ static struct tap_l4_seq4 { uint32_t saddr; uint32_t daddr; - struct tap_l4_msg msg[UIO_MAXIOV]; -} l4_seq4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; + struct pool_l4_t p; +} tap4_l4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; /** * struct l4_seq6_t - Message sequence for one protocol handler call, IPv6 @@ -235,8 +237,7 @@ static struct tap_l4_seq4 { * @daddr: Destination address * @msg: Array of messages that can be handled in a single call */ -static struct tap_l4_seq6 { - uint16_t msgs; +static struct tap6_l4_t { uint8_t protocol; uint16_t source; @@ -245,8 +246,8 @@ static struct tap_l4_seq6 { struct in6_addr saddr; struct in6_addr daddr; - struct tap_l4_msg msg[UIO_MAXIOV]; -} l4_seq6[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; + struct pool_l4_t p; +} tap6_l4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; /** * tap_packet_debug() - Print debug message for packet(s) from guest/tap @@ -258,8 +259,8 @@ static struct tap_l4_seq6 { * @count: Count of packets in this sequence */ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h, - struct tap_l4_seq4 *seq4, uint8_t proto6, - struct tap_l4_seq6 *seq6, int count) + struct tap4_l4_t *seq4, uint8_t proto6, + struct tap6_l4_t *seq6, int count) { char buf6s[INET6_ADDRSTRLEN], buf6d[INET6_ADDRSTRLEN]; char buf4s[INET_ADDRSTRLEN], buf4d[INET_ADDRSTRLEN]; @@ -283,14 +284,15 @@ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h, } if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) { - trace("protocol %i from tap: %s:%i -> %s:%i (%i packet%s)", - proto, seq4 ? buf4s : buf6s, + trace("tap: protocol %i, %s%s%s:%i -> %s%s%s:%i (%i packet%s)", + proto, + seq4 ? "" : "[", seq4 ? buf4s : buf6s, seq4 ? "" : "]", ntohs(seq4 ? seq4->source : seq6->source), - seq4 ? buf4d : buf6d, + seq4 ? "" : "[", seq4 ? buf4d : buf6d, seq4 ? "" : "]", ntohs(seq4 ? seq4->dest : seq6->dest), count, count == 1 ? "" : "s"); } else { - trace("protocol %i from tap: %s -> %s (%i packet%s)", + trace("tap: protocol %i, %s -> %s (%i packet%s)", proto, iph ? buf4s : buf6s, iph ? buf4d : buf6d, count, count == 1 ? "" : "s"); } @@ -299,78 +301,83 @@ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h, /** * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor * @c: Execution context - * @msg: Array of messages with IPv4 or ARP protocol - * @count: Count of messages + * @in: Ingress packet pool, packets with Ethernet headers * @now: Current timestamp * * Return: count of packets consumed by handlers */ -static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count, - struct timespec *now) +static int tap4_handler(struct ctx *c, struct pool *in, struct timespec *now) { unsigned int i, j, seq_count; - struct tap_l4_msg *l4_msg; - struct tap_l4_seq4 *seq; - size_t len, l4_len; - struct ethhdr *eh; - struct iphdr *iph; - struct udphdr *uh; - char *l4h; + struct tap4_l4_t *seq; - if (!c->v4) - return count; + if (!c->v4 || !in->count) + return in->count; i = 0; resume: - for (seq_count = 0, seq = NULL; i < count; i++) { - eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset); - len = msg[i].len; + for (seq_count = 0, seq = NULL; i < in->count; i++) { + size_t l2_len, l3_len, hlen, l4_len; + struct ethhdr *eh; + struct iphdr *iph; + struct udphdr *uh; + char *l4h; - if (len < sizeof(*eh)) - continue; + packet_get(in, i, 0, 0, &l2_len); - if (ntohs(eh->h_proto) == ETH_P_ARP && arp(c, eh, len)) + eh = packet_get(in, i, 0, sizeof(*eh), &l3_len); + if (!eh) continue; + if (ntohs(eh->h_proto) == ETH_P_ARP) { + PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); - if (len < sizeof(*eh) + sizeof(*iph)) + packet_add(pkt, l2_len, (char *)eh); + arp(c, pkt); continue; + } - iph = (struct iphdr *)(eh + 1); - if ((size_t)iph->ihl * 4 + sizeof(*eh) > len) + iph = packet_get(in, i, sizeof(*eh), sizeof(*iph), NULL); + if (!iph) continue; - if ((size_t)iph->ihl * 4 < (int)sizeof(*iph)) + + hlen = iph->ihl * 4UL; + if (hlen < sizeof(*iph) || htons(iph->tot_len) != l3_len || + hlen > l3_len) continue; + l4_len = l3_len - hlen; + if (iph->saddr && c->addr4_seen != iph->saddr) { c->addr4_seen = iph->saddr; proto_update_l2_buf(NULL, NULL, &c->addr4_seen); } - l4h = (char *)iph + (size_t)iph->ihl * 4; - l4_len = len - ((intptr_t)l4h - (intptr_t)eh); + l4h = packet_get(in, i, sizeof(*eh) + hlen, l4_len, NULL); + if (!l4h) + continue; if (iph->protocol == IPPROTO_ICMP) { - struct tap_l4_msg icmp_msg = { l4h - pkt_buf, - l4_len }; + PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); - if (l4_len < sizeof(struct icmphdr)) + if (c->no_icmp) continue; - tap_packet_debug(iph, NULL, NULL, 0, NULL, 1); - if (!c->no_icmp) { - icmp_tap_handler(c, AF_INET, &iph->daddr, - &icmp_msg, 1, now); - } + packet_add(pkt, l4_len, l4h); + icmp_tap_handler(c, AF_INET, &iph->daddr, pkt, now); continue; } - if (l4_len < sizeof(*uh)) + uh = packet_get(in, i, sizeof(*eh) + hlen, sizeof(*uh), NULL); + if (!uh) continue; - uh = (struct udphdr *)l4h; + if (iph->protocol == IPPROTO_UDP) { + PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); - if (iph->protocol == IPPROTO_UDP && dhcp(c, eh, len)) - continue; + packet_add(pkt, l2_len, (char *)eh); + if (dhcp(c, pkt)) + continue; + } if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP) { @@ -392,147 +399,145 @@ resume: seq->daddr = iph->daddr; \ } while (0) - if (seq && L4_MATCH(iph, uh, seq) && seq->msgs < UIO_MAXIOV) + if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV) goto append; - for (seq = l4_seq4 + seq_count - 1; seq >= l4_seq4; seq--) { + for (seq = tap4_l4 + seq_count - 1; seq >= tap4_l4; seq--) { if (L4_MATCH(iph, uh, seq)) { - if (seq->msgs >= UIO_MAXIOV) + if (seq->p.count >= UIO_MAXIOV) seq = NULL; break; } } - if (!seq || seq < l4_seq4) { - seq = l4_seq4 + seq_count++; + if (!seq || seq < tap4_l4) { + seq = tap4_l4 + seq_count++; L4_SET(iph, uh, seq); - seq->msgs = 0; + pool_flush((struct pool *)&seq->p); } #undef L4_MATCH #undef L4_SET append: - l4_msg = &seq->msg[seq->msgs++]; - - l4_msg->pkt_buf_offset = l4h - pkt_buf; - l4_msg->l4_len = l4_len; + packet_add((struct pool *)&seq->p, l4_len, l4h); if (seq_count == UIO_MAXIOV) break; /* Resume after flushing if i < count */ } - for (j = 0, seq = l4_seq4; j < seq_count; j++, seq++) { - int n = seq->msgs; - - l4_msg = seq->msg; + for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) { + struct pool *p = (struct pool *)&seq->p; + uint32_t *da = &seq->daddr; + size_t n = p->count; tap_packet_debug(NULL, NULL, seq, 0, NULL, n); if (seq->protocol == IPPROTO_TCP) { if (c->no_tcp) continue; - while ((n -= tcp_tap_handler(c, AF_INET, &seq->daddr, - l4_msg, n, now))); + while ((n -= tcp_tap_handler(c, AF_INET, da, p, now))); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; - while ((n -= udp_tap_handler(c, AF_INET, &seq->daddr, - l4_msg, n, now))); + while ((n -= udp_tap_handler(c, AF_INET, da, p, now))); } } - if (i < count) + if (i < in->count) goto resume; - return count; + return in->count; } /** * tap6_handler() - IPv6 packet handler for tap file descriptor * @c: Execution context - * @msg: Array of messages with IPv6 protocol - * @count: Count of messages + * @in: Ingress packet pool, packets with Ethernet headers * @now: Current timestamp * * Return: count of packets consumed by handlers */ -static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count, - struct timespec *now) +static int tap6_handler(struct ctx *c, struct pool *in, struct timespec *now) { unsigned int i, j, seq_count = 0; - struct tap_l4_msg *l4_msg; - struct tap_l4_seq6 *seq; - struct ipv6hdr *ip6h; - size_t len, l4_len; - struct ethhdr *eh; - struct udphdr *uh; - uint8_t proto; - char *l4h; + struct tap6_l4_t *seq; - if (!c->v6) - return count; + if (!c->v6 || !in->count) + return in->count; i = 0; resume: - for (seq_count = 0, seq = NULL; i < count; i++) { - eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset); - len = msg[i].len; + for (seq_count = 0, seq = NULL; i < in->count; i++) { + size_t l4_len, plen, check; + struct in6_addr *saddr, *daddr; + struct ipv6hdr *ip6h; + struct ethhdr *eh; + struct udphdr *uh; + uint8_t proto; + char *l4h; + + eh = packet_get(in, i, 0, sizeof(*eh), NULL); + if (!eh) + continue; - if (len < sizeof(*eh)) + ip6h = packet_get(in, i, sizeof(*eh), sizeof(*ip6h), &check); + if (!ip6h) continue; - if (len < sizeof(*eh) + sizeof(*ip6h)) - return 1; + saddr = &ip6h->saddr; + daddr = &ip6h->daddr; - ip6h = (struct ipv6hdr *)(eh + 1); + plen = ntohs(ip6h->payload_len); + if (plen != check) + continue; + + if (!(l4h = ipv6_l4hdr(in, i, sizeof(*eh), &proto, &l4_len))) + continue; - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) { - c->addr6_ll_seen = ip6h->saddr; + if (IN6_IS_ADDR_LINKLOCAL(saddr)) { + c->addr6_ll_seen = *saddr; if (IN6_IS_ADDR_UNSPECIFIED(&c->addr6_seen)) { - c->addr6_seen = ip6h->saddr; + c->addr6_seen = *saddr; } } else { - c->addr6_seen = ip6h->saddr; + c->addr6_seen = *saddr; } - if (ntohs(ip6h->payload_len) > - len - sizeof(*eh) - sizeof(*ip6h)) - continue; - - if (!(l4h = ipv6_l4hdr(ip6h, &proto))) - continue; - - l4_len = len - ((intptr_t)l4h - (intptr_t)eh); - if (proto == IPPROTO_ICMPV6) { - struct tap_l4_msg icmpv6_msg = { l4h - pkt_buf, - l4_len }; + PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + + if (c->no_icmp) + continue; if (l4_len < sizeof(struct icmp6hdr)) continue; - if (ndp(c, eh, len)) + if (ndp(c, (struct icmp6hdr *)l4h, eh->h_source, saddr)) continue; tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); - if (!c->no_icmp) { - icmp_tap_handler(c, AF_INET6, &ip6h->daddr, - &icmpv6_msg, 1, now); - } + + packet_add(pkt, l4_len, l4h); + icmp_tap_handler(c, AF_INET6, daddr, pkt, now); continue; } if (l4_len < sizeof(*uh)) continue; - uh = (struct udphdr *)l4h; - if (proto == IPPROTO_UDP && dhcpv6(c, eh, len)) - continue; + if (proto == IPPROTO_UDP) { + PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + + packet_add(pkt, l4_len, l4h); + + if (dhcpv6(c, pkt, saddr, daddr)) + continue; + } - ip6h->saddr = c->addr6; + *saddr = c->addr6; if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); @@ -542,73 +547,68 @@ resume: #define L4_MATCH(ip6h, proto, uh, seq) \ (seq->protocol == proto && \ seq->source == uh->source && seq->dest == uh->dest && \ - IN6_ARE_ADDR_EQUAL(&seq->saddr, &ip6h->saddr) && \ - IN6_ARE_ADDR_EQUAL(&seq->daddr, &ip6h->daddr)) + IN6_ARE_ADDR_EQUAL(&seq->saddr, saddr) && \ + IN6_ARE_ADDR_EQUAL(&seq->daddr, daddr)) #define L4_SET(ip6h, proto, uh, seq) \ do { \ seq->protocol = proto; \ seq->source = uh->source; \ seq->dest = uh->dest; \ - seq->saddr = ip6h->saddr; \ - seq->daddr = ip6h->daddr; \ + seq->saddr = *saddr; \ + seq->daddr = *daddr; \ } while (0) if (seq && L4_MATCH(ip6h, proto, uh, seq) && - seq->msgs < UIO_MAXIOV) + seq->p.count < UIO_MAXIOV) goto append; - for (seq = l4_seq6 + seq_count - 1; seq >= l4_seq6; seq--) { + for (seq = tap6_l4 + seq_count - 1; seq >= tap6_l4; seq--) { if (L4_MATCH(ip6h, proto, uh, seq)) { - if (seq->msgs >= UIO_MAXIOV) + if (seq->p.count >= UIO_MAXIOV) seq = NULL; break; } } - if (!seq || seq < l4_seq6) { - seq = l4_seq6 + seq_count++; + if (!seq || seq < tap6_l4) { + seq = tap6_l4 + seq_count++; L4_SET(ip6h, proto, uh, seq); - seq->msgs = 0; + pool_flush((struct pool *)&seq->p); } #undef L4_MATCH #undef L4_SET append: - l4_msg = &seq->msg[seq->msgs++]; - - l4_msg->pkt_buf_offset = l4h - pkt_buf; - l4_msg->l4_len = l4_len; + packet_add((struct pool *)&seq->p, l4_len, l4h); if (seq_count == UIO_MAXIOV) break; /* Resume after flushing if i < count */ } - for (j = 0, seq = l4_seq6; j < seq_count; j++, seq++) { - int n = seq->msgs; - - l4_msg = seq->msg; + for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) { + struct pool *p = (struct pool *)&seq->p; + struct in6_addr *da = &seq->daddr; + size_t n = p->count; tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n); if (seq->protocol == IPPROTO_TCP) { if (c->no_tcp) continue; - while ((n -= tcp_tap_handler(c, AF_INET6, &seq->daddr, - l4_msg, n, now))); + while ((n -= tcp_tap_handler(c, AF_INET6, da, p, now))); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; - while ((n -= udp_tap_handler(c, AF_INET6, &seq->daddr, - l4_msg, n, now))); + while ((n -= udp_tap_handler(c, AF_INET6, da, p, now))); } } - if (i < count) + if (i < in->count) goto resume; - return count; + return in->count; } /** @@ -620,14 +620,16 @@ append: */ static int tap_handler_passt(struct ctx *c, struct timespec *now) { - int seq4_i, seq6_i; struct ethhdr *eh; ssize_t n, rem; char *p; redo: p = pkt_buf; - seq4_i = seq6_i = rem = 0; + rem = 0; + + pool_flush(pool_tap4); + pool_flush(pool_tap6); n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT); if (n < 0) { @@ -673,12 +675,10 @@ redo: switch (ntohs(eh->h_proto)) { case ETH_P_ARP: case ETH_P_IP: - seq4[seq4_i].pkt_buf_offset = p - pkt_buf; - seq4[seq4_i++].len = len; + packet_add(pool_tap4, len, p); break; case ETH_P_IPV6: - seq6[seq6_i].pkt_buf_offset = p - pkt_buf; - seq6[seq6_i++].len = len; + packet_add(pool_tap6, len, p); break; default: break; @@ -689,11 +689,8 @@ next: n -= len; } - if (seq4_i) - tap4_handler(c, seq4, seq4_i, now); - - if (seq6_i) - tap6_handler(c, seq6, seq6_i, now); + tap4_handler(c, pool_tap4, now); + tap6_handler(c, pool_tap6, now); /* We can't use EPOLLET otherwise. */ if (rem) @@ -712,8 +709,10 @@ next: static int tap_handler_pasta(struct ctx *c, struct timespec *now) { ssize_t n = 0, len; - int ret, seq4_i = 0, seq6_i = 0; + int ret; + pool_flush(pool_tap4); + pool_flush(pool_tap6); restart: while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) { struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n); @@ -733,12 +732,10 @@ restart: switch (ntohs(eh->h_proto)) { case ETH_P_ARP: case ETH_P_IP: - seq4[seq4_i].pkt_buf_offset = n; - seq4[seq4_i++].len = len; + packet_add(pool_tap4, len, pkt_buf + n); break; case ETH_P_IPV6: - seq6[seq6_i].pkt_buf_offset = n; - seq6[seq6_i++].len = len; + packet_add(pool_tap6, len, pkt_buf + n); break; default: break; @@ -752,11 +749,8 @@ restart: ret = errno; - if (seq4_i) - tap4_handler(c, seq4, seq4_i, now); - - if (seq6_i) - tap6_handler(c, seq6, seq6_i, now); + tap4_handler(c, pool_tap4, now); + tap6_handler(c, pool_tap6, now); if (len > 0 || ret == EAGAIN) return 0; @@ -920,6 +914,17 @@ static void tap_sock_tun_init(struct ctx *c) */ void tap_sock_init(struct ctx *c) { + size_t sz = sizeof(pkt_buf); + int i; + + pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz); + pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz); + + for (i = 0; i < UIO_MAXIOV; i++) { + tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); + tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); + } + if (c->fd_tap != -1) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); close(c->fd_tap); @@ -303,6 +303,9 @@ #define TCP_FRAMES \ (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) +#define TCP_FILE_PRESSURE 30 /* % of c->nofile */ +#define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */ + #define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1) #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \ @@ -440,6 +443,7 @@ struct tcp_conn { #define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) #define TCP_WS_BITS 4 /* RFC 7323 */ +#define TCP_WS_MAX 14 unsigned int ws_from_tap :TCP_WS_BITS; unsigned int ws_to_tap :TCP_WS_BITS; @@ -476,7 +480,6 @@ struct tcp_conn { uint32_t seq_init_from_tap; }; -#define CONN_IS_CLOSED(conn) (conn->events == CLOSED) #define CONN_IS_CLOSING(conn) \ ((conn->events & ESTABLISHED) && \ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) @@ -699,7 +702,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) return EPOLLET; if (conn_flags & STALLED) - return EPOLLIN | EPOLLRDHUP | EPOLLET; + return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET; return EPOLLIN | EPOLLRDHUP; } @@ -733,8 +736,11 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn) .r.p.tcp.tcp.v6 = CONN_V6(conn) }; struct epoll_event ev = { .data.u64 = ref.u64 }; - if (CONN_IS_CLOSED(conn)) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); + if (conn->events == CLOSED) { + if (conn->flags & IN_EPOLL) + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); + if (conn->timer != -1) + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev); return 0; } @@ -745,6 +751,18 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn) conn->flags |= IN_EPOLL; /* No need to log this */ + if (conn->timer != -1) { + union epoll_ref ref_t = { .r.proto = IPPROTO_TCP, + .r.s = conn->sock, + .r.p.tcp.tcp.timer = 1, + .r.p.tcp.tcp.index = conn - tc }; + struct epoll_event ev_t = { .data.u64 = ref_t.u64, + .events = EPOLLIN | EPOLLET }; + + if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t)) + return -errno; + } + return 0; } @@ -759,6 +777,9 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn) { struct itimerspec it = { { 0 }, { 0 } }; + if (conn->events == CLOSED) + return; + if (conn->timer == -1) { union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, @@ -783,15 +804,11 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn) } } - if (conn->events == CLOSED) { - it.it_value.tv_nsec = 1; - } else if (conn->flags & ACK_TO_TAP_DUE) { + if (conn->flags & ACK_TO_TAP_DUE) { it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) it.it_value.tv_sec = SYN_TIMEOUT; - else if (conn->events & TAP_FIN_SENT) - it.it_value.tv_sec = FIN_TIMEOUT; else it.it_value.tv_sec = ACK_TIMEOUT; } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { @@ -834,7 +851,9 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn *conn, if (flag == STALLED || flag == ~STALLED) tcp_epoll_ctl(c, conn); - if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE) + if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || + (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || + (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE))) tcp_timer_ctl(c, conn); } @@ -888,7 +907,7 @@ static void conn_event_do(struct ctx *c, struct tcp_conn *conn, else tcp_epoll_ctl(c, conn); - if (event == CLOSED || CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) + if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) tcp_timer_ctl(c, conn); } @@ -1182,36 +1201,32 @@ static void tcp_sock6_iov_init(void) /** * tcp_opt_get() - Get option, and value if any, from TCP header - * @th: Pointer to TCP header - * @len: Length of buffer, including TCP header + * @opts: Pointer to start of TCP options in header + * @len: Length of buffer, excluding TCP header -- NOT checked here! * @type_find: Option type to look for * @optlen_set: Optional, filled with option length if passed * @value_set: Optional, set to start of option value if passed * * Return: option value, meaningful for up to 4 bytes, -1 if not found */ -static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find, +static int tcp_opt_get(char *opts, size_t len, uint8_t type_find, uint8_t *optlen_set, char **value_set) { uint8_t type, optlen; - char *p; - - if (len > (size_t)th->doff * 4) - len = (size_t)th->doff * 4; - len -= sizeof(*th); - p = (char *)(th + 1); + if (!len) + return -1; - for (; len >= 2; p += optlen, len -= optlen) { - switch (*p) { + for (; len >= 2; opts += optlen, len -= optlen) { + switch (*opts) { case OPT_EOL: return -1; case OPT_NOP: optlen = 1; break; default: - type = *(p++); - optlen = *(p++) - 2; + type = *(opts++); + optlen = *(opts++) - 2; len -= 2; if (type != type_find) @@ -1220,17 +1235,17 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find, if (optlen_set) *optlen_set = optlen; if (value_set) - *value_set = p; + *value_set = opts; switch (optlen) { case 0: return 0; case 1: - return *p; + return *opts; case 2: - return ntohs(*(uint16_t *)p); + return ntohs(*(uint16_t *)opts); default: - return ntohl(*(uint32_t *)p); + return ntohl(*(uint32_t *)opts); } } } @@ -1415,12 +1430,12 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole) if ((hole - tc) == --c->tcp.conn_count) { debug("TCP: hash table compaction: index %i (%p) was max index", hole - tc, hole); + memset(hole, 0, sizeof(*hole)); return; } from = CONN(c->tcp.conn_count); memcpy(hole, from, sizeof(*hole)); - from->flags = from->events = 0; to = hole; tcp_hash_update(from, to); @@ -1430,25 +1445,23 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole) debug("TCP: hash table compaction: old index %i, new index %i, " "sock %i, from: %p, to: %p", from - tc, to - tc, from->sock, from, to); + + memset(from, 0, sizeof(*from)); } /** - * tcp_conn_destroy() - Close connection, drop from epoll file descriptor + * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction * @c: Execution context * @conn: Connection pointer */ static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn) { - if (CONN_IS_CLOSED(conn)) - return; - - conn_event(c, conn, CLOSED); - conn->flags = 0; close(conn->sock); + if (conn->timer != -1) + close(conn->timer); - /* Removal from hash table and connection table compaction deferred to - * timer. - */ + tcp_hash_remove(conn); + tcp_table_compact(c, conn); } static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn); @@ -1582,9 +1595,23 @@ static void tcp_l2_data_buf_flush(struct ctx *c) */ void tcp_defer_handler(struct ctx *c) { + int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE; + int max_files = c->nofile / 100 * TCP_FILE_PRESSURE; + struct tcp_conn *conn; + tcp_l2_flags_buf_flush(c); tcp_l2_data_buf_flush(c); + tcp_splice_defer_handler(c); + + if (c->tcp.conn_count < MIN(max_files, max_conns)) + return; + + for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) { + if (conn->events == CLOSED) + tcp_conn_destroy(c, conn); + } + } /** @@ -1605,13 +1632,19 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn, size_t ip_len, eth_len; #define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \ - do { \ - b->th.source = htons(conn->sock_port); \ - b->th.dest = htons(conn->tap_port); \ - b->th.seq = htonl(seq); \ - b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ - b->th.window = htons(MIN(conn->wnd_to_tap, USHRT_MAX)); \ - } while (0) +do { \ + b->th.source = htons(conn->sock_port); \ + b->th.dest = htons(conn->tap_port); \ + b->th.seq = htonl(seq); \ + b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ + if (conn->events & ESTABLISHED) { \ + b->th.window = htons(conn->wnd_to_tap); \ + } else { \ + unsigned wnd = conn->wnd_to_tap << conn->ws_to_tap; \ + \ + b->th.window = htons(MIN(wnd, USHRT_MAX)); \ + } \ +} while (0) if (CONN_V6(conn)) { struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; @@ -1692,7 +1725,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, conn->seq_ack_to_tap = prev_ack_to_tap; #else if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn) - || CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) { + || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) { conn->seq_ack_to_tap = conn->seq_from_tap; } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { if (!tinfo) { @@ -1712,17 +1745,19 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, if (!KERNEL_REPORTS_SND_WND(c)) { tcp_get_sndbuf(conn); new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); - conn->wnd_to_tap = new_wnd_to_tap >> conn->ws_to_tap; + conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, + USHRT_MAX); goto out; } if (!tinfo) { - if (prev_wnd_to_tap > WINDOW_DEFAULT) + if (prev_wnd_to_tap > WINDOW_DEFAULT) { goto out; - +} tinfo = &tinfo_new; - if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) + if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) { goto out; +} } #ifdef HAS_SND_WND @@ -1735,10 +1770,15 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, } #endif - conn->wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW) >> conn->ws_to_tap; + new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); + if (!(conn->events & ESTABLISHED)) + new_wnd_to_tap = MAX(new_wnd_to_tap, WINDOW_DEFAULT); + + conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, USHRT_MAX); if (!conn->wnd_to_tap) conn_flag(c, conn, ACK_TO_TAP_DUE); + out: return new_wnd_to_tap != prev_wnd_to_tap || conn->seq_ack_to_tap != prev_ack_to_tap; @@ -1772,10 +1812,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) return 0; if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); return -ECONNRESET; } +#ifdef HAS_SND_WND + if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd) + c->tcp.kernel_snd_wnd = 1; +#endif + if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); @@ -1825,11 +1870,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) data += OPT_MSS_LEN - 2; th->doff += OPT_MSS_LEN / 4; -#ifdef HAS_SND_WND - if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd) - c->tcp.kernel_snd_wnd = 1; -#endif - conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale); *data++ = OPT_NOP; @@ -1854,10 +1894,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) NULL, conn->seq_to_tap); iov->iov_len = eth_len + sizeof(uint32_t); - /* First value is not scaled: scale now */ - if (flags & SYN) - conn->wnd_to_tap >>= conn->ws_to_tap; - if (CONN_V4(conn)) tcp4_l2_flags_buf_bytes += iov->iov_len; else @@ -1905,68 +1941,55 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) */ static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn) { - if (CONN_IS_CLOSED(conn)) + if (conn->events == CLOSED) return; if (!tcp_send_flag(c, conn, RST)) - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); } /** - * tcp_clamp_window() - Set window and scaling from option, clamp on socket + * tcp_get_tap_ws() - Get Window Scaling option for connection from tap/guest * @conn: Connection pointer - * @th: TCP header, from tap, can be NULL if window is passed - * @len: Buffer length, at L4, can be 0 if no header is passed - * @window: Window value, host order, unscaled, if no header is passed - * @init: Set if this is the very first segment from tap - */ -static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, - struct tcphdr *th, int len, unsigned int window, - int init) + * @opts: Pointer to start of TCP options + * @optlen: Bytes in options: caller MUST ensure available length + */ +static void tcp_get_tap_ws(struct tcp_conn *conn, char *opts, size_t optlen) { - if (init && th) { - int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); + int ws = tcp_opt_get(opts, optlen, OPT_WS, NULL, NULL); - conn->ws_from_tap = ws & 0xf; + if (ws >= 0 && ws <= TCP_WS_MAX) + conn->ws_from_tap = ws; + else + conn->ws_from_tap = 0; +} - /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp - * yet, to avoid getting a zero scale just because we set a - * small window now. - */ - conn->wnd_from_tap = ntohs(th->window); - } else { - uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap; +/** + * tcp_clamp_window() - Set new window for connection, clamp on socket + * @c: Execution context + * @conn: Connection pointer + * @window: Window value, host order, unscaled + */ +static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, unsigned wnd) +{ + uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap; - if (th) - window = ntohs(th->window) << conn->ws_from_tap; - else - window <<= conn->ws_from_tap; - - window = MIN(MAX_WINDOW, window); - - if (conn->flags & WND_CLAMPED) { - if (prev_scaled == window) - return; - - /* Discard +/- 1% updates to spare some syscalls. */ - if ((window > prev_scaled && - window * 99 / 100 < prev_scaled) || - (window < prev_scaled && - window * 101 / 100 > prev_scaled)) { - conn->wnd_from_tap = window >> - conn->ws_from_tap; - return; - } - } + wnd <<= conn->ws_from_tap; + wnd = MIN(MAX_WINDOW, wnd); - if (window < 256) - window = 256; + if (conn->flags & WND_CLAMPED) { + if (prev_scaled == wnd) + return; - conn->wnd_from_tap = window >> conn->ws_from_tap; - setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, - &window, sizeof(window)); - conn_flag(c, conn, WND_CLAMPED); + /* Discard +/- 1% updates to spare some syscalls. */ + if ((wnd > prev_scaled && wnd * 99 / 100 < prev_scaled) || + (wnd < prev_scaled && wnd * 101 / 100 > prev_scaled)) + return; } + + conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); + setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &wnd, sizeof(wnd)); + conn_flag(c, conn, WND_CLAMPED); } /** @@ -2059,18 +2082,18 @@ static int tcp_conn_new_sock(struct ctx *c, sa_family_t af) * tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest * @c: Execution context * @conn: Connection pointer - * @th: TCP header send by tap/guest - * @len: L4 packet length, host order + * @opts: Pointer to start of TCP options + * @optlen: Bytes in options: caller MUST ensure available length * * Return: clamped MSS value */ static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn, - struct tcphdr *th, size_t len) + char *opts, size_t optlen) { unsigned int mss; int ret; - if ((ret = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0) + if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0) mss = MSS_DEFAULT; else mss = ret; @@ -2091,12 +2114,13 @@ static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn, * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @addr: Remote address, pointer to sin_addr or sin6_addr - * @th: TCP header from tap - * @len: Packet length at L4 + * @th: TCP header from tap: caller MUST ensure it's there + * @opts: Pointer to start of options + * @optlen: Bytes in options: caller MUST ensure available length * @now: Current timestamp */ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, - struct tcphdr *th, size_t len, + struct tcphdr *th, char *opts, size_t optlen, struct timespec *now) { struct sockaddr_in addr4 = { @@ -2142,16 +2166,21 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, conn = CONN(c->tcp.conn_count++); conn->sock = s; conn->timer = -1; - conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; - mss = tcp_conn_tap_mss(c, conn, th, len); + mss = tcp_conn_tap_mss(c, conn, opts, optlen); setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)); MSS_SET(conn, mss); - tcp_clamp_window(c, conn, th, len, 0, 1); + tcp_get_tap_ws(conn, opts, optlen); + + /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp yet, to + * avoid getting a zero scale just because we set a small window now. + */ + if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) + conn->wnd_from_tap = 1; if (af == AF_INET) { sa = (struct sockaddr *)&addr4; @@ -2395,53 +2424,52 @@ zero_len: } /** - * tcp_data_from_tap() - tap data for established connection + * tcp_data_from_tap() - tap/guest data for established connection * @c: Execution context * @conn: Connection pointer - * @msg: Array of messages from tap - * @count: Count of messages + * @p: Pool of TCP packets, with TCP headers * * #syscalls sendmsg */ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, - struct tap_l4_msg *msg, int count) + struct pool *p) { - int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1; + int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; uint32_t max_ack_seq = conn->seq_ack_from_tap; uint32_t seq_from_tap = conn->seq_from_tap; struct msghdr mh = { .msg_iov = tcp_iov }; - int partial_send = 0; - uint16_t len; + size_t len; ssize_t n; - for (i = 0, iov_i = 0; i < count; i++) { + for (i = 0, iov_i = 0; i < (int)p->count; i++) { uint32_t seq, seq_offset, ack_seq; struct tcphdr *th; char *data; size_t off; - th = (struct tcphdr *)(pkt_buf + msg[i].pkt_buf_offset); - len = msg[i].l4_len; - - if (len < sizeof(*th)) { + packet_get(p, i, 0, 0, &len); + th = packet_get(p, i, 0, sizeof(*th), NULL); + if (!th) { tcp_rst(c, conn); return; } - off = (size_t)th->doff * 4; + off = th->doff * 4UL; if (off < sizeof(*th) || off > len) { tcp_rst(c, conn); return; } if (th->rst) { - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); return; } len -= off; - data = (char *)th + off; + data = packet_get(p, i, off, len, NULL); + if (!data) + continue; seq = ntohl(th->seq); ack_seq = ntohl(th->ack_seq); @@ -2511,7 +2539,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, i = keep - 1; } - tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0); + tcp_clamp_window(c, conn, max_ack_seq_wnd); if (ack) { if (max_ack_seq == conn->seq_to_tap) { @@ -2595,14 +2623,22 @@ out: * tcp_conn_from_sock_finish() - Complete connection setup after connect() * @c: Execution context * @conn: Connection pointer - * @th: TCP header of SYN, ACK segment from tap/guest - * @len: Packet length of SYN, ACK segment at L4, host order + * @th: TCP header of SYN, ACK segment: caller MUST ensure it's there + * @opts: Pointer to start of options + * @optlen: Bytes in options: caller MUST ensure available length */ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, - struct tcphdr *th, size_t len) + struct tcphdr *th, + char *opts, size_t optlen) { - tcp_clamp_window(c, conn, th, len, 0, 1); - MSS_SET(conn, tcp_conn_tap_mss(c, conn, th, len)); + tcp_clamp_window(c, conn, ntohs(th->window)); + tcp_get_tap_ws(conn, opts, optlen); + + /* First value is not scaled */ + if (!(conn->wnd_from_tap >>= conn->ws_from_tap)) + conn->wnd_from_tap = 1; + + MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen)); conn->seq_init_from_tap = ntohl(th->seq) + 1; conn->seq_from_tap = conn->seq_init_from_tap; @@ -2622,32 +2658,42 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @addr: Destination address - * @msg: Input messages - * @count: Message count + * @p: Pool of TCP packets, with TCP headers * @now: Current timestamp * * Return: count of consumed packets */ -int tcp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now) +int tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now) { - struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset); - uint16_t len = msg[0].l4_len; struct tcp_conn *conn; + size_t optlen, len; + struct tcphdr *th; int ack_due = 0; + char *opts; + + packet_get(p, 0, 0, 0, &len); + th = packet_get(p, 0, 0, sizeof(*th), NULL); + if (!th) + return 1; + + optlen = th->doff * 4UL - sizeof(*th); + opts = packet_get(p, 0, sizeof(*th), optlen, NULL); conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest)); /* New connection from tap */ if (!conn) { if (th->syn && !th->ack) - tcp_conn_from_tap(c, af, addr, th, len, now); + tcp_conn_from_tap(c, af, addr, th, opts, optlen, now); return 1; } + trace("TCP: packet length %lu from tap for index %lu", len, conn - tc); + if (th->rst) { - tcp_conn_destroy(c, conn); - return count; + conn_event(c, conn, CLOSED); + return p->count; } if (th->ack) { @@ -2660,7 +2706,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, /* Establishing connection from socket */ if (conn->events & SOCK_ACCEPTED) { if (th->syn && th->ack && !th->fin) - tcp_conn_from_sock_finish(c, conn, th, len); + tcp_conn_from_sock_finish(c, conn, th, opts, optlen); else tcp_rst(c, conn); @@ -2671,7 +2717,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, if (conn->events & TAP_SYN_RCVD) { if (!(conn->events & TAP_SYN_ACK_SENT)) { tcp_rst(c, conn); - return count; + return p->count; } conn_event(c, conn, ESTABLISHED); @@ -2683,17 +2729,19 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, tcp_send_flag(c, conn, ACK); conn_event(c, conn, SOCK_FIN_SENT); - return count; + return p->count; } if (!th->ack) { tcp_rst(c, conn); - return count; + return p->count; } - tcp_clamp_window(c, conn, th, len, 0, 0); + tcp_clamp_window(c, conn, ntohs(th->window)); + + tcp_data_from_sock(c, conn); - if (count == 1) + if (p->count == 1) return 1; } @@ -2701,13 +2749,13 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, if (conn->events & TAP_FIN_RCVD) { if (conn->events & SOCK_FIN_RCVD && conn->seq_ack_from_tap == conn->seq_to_tap) - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); return 1; } /* Established connections accepting data from tap */ - tcp_data_from_tap(c, conn, msg, count); + tcp_data_from_tap(c, conn, p); if (conn->seq_ack_to_tap != conn->seq_from_tap) ack_due = 1; @@ -2721,7 +2769,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, if (ack_due) conn_flag(c, conn, ACK_TO_TAP_DUE); - return count; + return p->count; } /** @@ -2872,7 +2920,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) if (!(conn->events & ESTABLISHED)) { debug("TCP: index %i, handshake timeout", conn - tc); tcp_rst(c, conn); - } else if (conn->events & TAP_FIN_SENT) { + } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { debug("TCP: index %i, FIN timeout", conn - tc); tcp_rst(c, conn); } else if (conn->retrans == TCP_MAX_RETRANS) { @@ -2884,6 +2932,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; tcp_data_from_sock(c, conn); + tcp_timer_ctl(c, conn); } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -2933,19 +2982,22 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, if (!(conn = CONN_OR_NULL(ref.r.p.tcp.tcp.index))) return; + if (conn->events == CLOSED) + return; + if (events & EPOLLERR) { tcp_rst(c, conn); return; } if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) { - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); return; } if (conn->events & ESTABLISHED) { if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); if (events & (EPOLLRDHUP | EPOLLHUP)) conn_event(c, conn, SOCK_FIN_RCVD); @@ -3159,7 +3211,7 @@ static int tcp_sock_refill(void *arg) * * Return: 0 on success, -1 on failure */ -int tcp_sock_init(struct ctx *c, struct timespec *now) +int tcp_sock_init(struct ctx *c) { struct tcp_sock_refill_arg refill_arg = { c, 0 }; int i, port; @@ -3215,7 +3267,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext)); memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns)); - c->tcp.refill_ts = *now; tcp_sock_refill(&refill_arg); if (c->mode == MODE_PASTA) { @@ -3226,7 +3277,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) refill_arg.ns = 1; NS_CALL(tcp_sock_refill, &refill_arg); - c->tcp.port_detect_ts = *now; + tcp_splice_timer(c); } return 0; @@ -3349,47 +3400,48 @@ static int tcp_port_rebind(void *arg) } /** - * tcp_timer() - Scan activity bitmap for sockets waiting for timed events + * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill * @c: Execution context - * @now: Timestamp from caller + * @ts: Unused */ -void tcp_timer(struct ctx *c, struct timespec *now) +void tcp_timer(struct ctx *c, struct timespec *ts) { struct tcp_sock_refill_arg refill_arg = { c, 0 }; + struct tcp_conn *conn; - if (c->mode == MODE_PASTA) { - if (timespec_diff_ms(now, &c->tcp.port_detect_ts) > - PORT_DETECT_INTERVAL) { - struct tcp_port_detect_arg detect_arg = { c, 0 }; - struct tcp_port_rebind_arg rebind_arg = { c, 0 }; - - if (c->tcp.init_detect_ports) { - detect_arg.detect_in_ns = 0; - tcp_port_detect(&detect_arg); - rebind_arg.bind_in_ns = 1; - NS_CALL(tcp_port_rebind, &rebind_arg); - } + (void)ts; - if (c->tcp.ns_detect_ports) { - detect_arg.detect_in_ns = 1; - NS_CALL(tcp_port_detect, &detect_arg); - rebind_arg.bind_in_ns = 0; - tcp_port_rebind(&rebind_arg); - } + if (c->mode == MODE_PASTA) { + struct tcp_port_detect_arg detect_arg = { c, 0 }; + struct tcp_port_rebind_arg rebind_arg = { c, 0 }; + + if (c->tcp.init_detect_ports) { + detect_arg.detect_in_ns = 0; + tcp_port_detect(&detect_arg); + rebind_arg.bind_in_ns = 1; + NS_CALL(tcp_port_rebind, &rebind_arg); + } - c->tcp.port_detect_ts = *now; + if (c->tcp.ns_detect_ports) { + detect_arg.detect_in_ns = 1; + NS_CALL(tcp_port_detect, &detect_arg); + rebind_arg.bind_in_ns = 0; + tcp_port_rebind(&rebind_arg); } + } - tcp_splice_timer(c, now); + for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) { + if (conn->events == CLOSED) + tcp_conn_destroy(c, conn); } - if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) { - tcp_sock_refill(&refill_arg); - if (c->mode == MODE_PASTA) { - refill_arg.ns = 1; - if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) || - (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0)) - NS_CALL(tcp_sock_refill, &refill_arg); - } + tcp_sock_refill(&refill_arg); + if (c->mode == MODE_PASTA) { + refill_arg.ns = 1; + if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) || + (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0)) + NS_CALL(tcp_sock_refill, &refill_arg); + + tcp_splice_timer(c); } } @@ -6,9 +6,7 @@ #ifndef TCP_H #define TCP_H -#define REFILL_INTERVAL 1000 /* ms */ -#define PORT_DETECT_INTERVAL 1000 -#define TCP_TIMER_INTERVAL MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL) +#define TCP_TIMER_INTERVAL 1000 /* ms */ #define TCP_CONN_INDEX_BITS 17 /* 128k */ #define TCP_MAX_CONNS (1 << TCP_CONN_INDEX_BITS) @@ -20,10 +18,10 @@ struct ctx; void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now); -int tcp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now); -int tcp_sock_init(struct ctx *c, struct timespec *now); -void tcp_timer(struct ctx *c, struct timespec *now); +int tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now); +int tcp_sock_init(struct ctx *c); +void tcp_timer(struct ctx *c, struct timespec *ts); void tcp_defer_handler(struct ctx *c); void tcp_sock_set_bufsize(struct ctx *c, int s); @@ -64,8 +62,6 @@ union tcp_epoll_ref { * @timer_run: Timestamp of most recent timer run * @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035) * @pipe_size: Size of pipes for spliced connections - * @refill_ts: Time of last refill operation for pools of sockets/pipes - * @port_detect_ts: Time of last TCP port detection/rebind, if enabled */ struct tcp_ctx { uint64_t hash_secret[2]; @@ -80,8 +76,6 @@ struct tcp_ctx { int kernel_snd_wnd; #endif size_t pipe_size; - struct timespec refill_ts; - struct timespec port_detect_ts; }; #endif /* TCP_H */ diff --git a/tcp_splice.c b/tcp_splice.c index bcafd33..0095740 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -51,7 +51,7 @@ #define MAX_PIPE_SIZE (2UL * 1024 * 1024) #define TCP_SPLICE_MAX_CONNS (128 * 1024) #define TCP_SPLICE_PIPE_POOL_SIZE 16 -#define REFILL_INTERVAL 1000 /* ms, refill pool of pipes */ +#define TCP_SPLICE_CONN_PRESSURE 30 /* % of splice_conn_count */ #define TCP_SPLICE_FILE_PRESSURE 30 /* % of c->nofile */ /* From tcp.c */ @@ -83,24 +83,24 @@ struct tcp_splice_conn { int pipe_b_a[2]; uint8_t events; -#define SPLICE_CLOSED 0 -#define SPLICE_CONNECT BIT(0) -#define SPLICE_ESTABLISHED BIT(1) -#define SPLICE_A_OUT_WAIT BIT(2) -#define SPLICE_B_OUT_WAIT BIT(3) -#define SPLICE_A_FIN_RCVD BIT(4) -#define SPLICE_B_FIN_RCVD BIT(5) -#define SPLICE_A_FIN_SENT BIT(6) -#define SPLICE_B_FIN_SENT BIT(7) +#define CLOSED 0 +#define CONNECT BIT(0) +#define ESTABLISHED BIT(1) +#define A_OUT_WAIT BIT(2) +#define B_OUT_WAIT BIT(3) +#define A_FIN_RCVD BIT(4) +#define B_FIN_RCVD BIT(5) +#define A_FIN_SENT BIT(6) +#define B_FIN_SENT BIT(7) uint8_t flags; -#define SPLICE_V6 BIT(0) -#define SPLICE_IN_EPOLL BIT(1) -#define SPLICE_RCVLOWAT_SET_A BIT(2) -#define SPLICE_RCVLOWAT_SET_B BIT(3) -#define SPLICE_RCVLOWAT_ACT_A BIT(4) -#define SPLICE_RCVLOWAT_ACT_B BIT(5) -#define SPLICE_CLOSING BIT(6) +#define SOCK_V6 BIT(0) +#define IN_EPOLL BIT(1) +#define RCVLOWAT_SET_A BIT(2) +#define RCVLOWAT_SET_B BIT(3) +#define RCVLOWAT_ACT_A BIT(4) +#define RCVLOWAT_ACT_B BIT(5) +#define CLOSING BIT(6) uint64_t a_read; uint64_t a_written; @@ -108,7 +108,7 @@ struct tcp_splice_conn { uint64_t b_written; }; -#define CONN_V6(x) (x->flags & SPLICE_V6) +#define CONN_V6(x) (x->flags & SOCK_V6) #define CONN_V4(x) (!CONN_V6(x)) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) #define CONN(index) (tc + (index)) @@ -118,15 +118,13 @@ static struct tcp_splice_conn tc[TCP_SPLICE_MAX_CONNS]; /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { - "SPLICE_CONNECT", "SPLICE_ESTABLISHED", - "SPLICE_A_OUT_WAIT", "SPLICE_B_OUT_WAIT", - "SPLICE_A_FIN_RCVD", "SPLICE_B_FIN_RCVD", - "SPLICE_A_FIN_SENT", "SPLICE_B_FIN_SENT", + "CONNECT", "ESTABLISHED", "A_OUT_WAIT", "B_OUT_WAIT", + "A_FIN_RCVD", "B_FIN_RCVD", "A_FIN_SENT", "B_FIN_SENT", }; /* Display strings for connection flags */ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = { - "V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B", + "SOCK_V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B", "RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING", }; @@ -141,23 +139,27 @@ static void tcp_splice_conn_epoll_events(uint16_t events, { *a = *b = 0; - if (events & SPLICE_CLOSED) + if (events & CLOSED) return; - if (events & SPLICE_ESTABLISHED) - *a = *b = EPOLLIN | EPOLLRDHUP; - else if (events & SPLICE_CONNECT) + if (events & ESTABLISHED) { + if (!(events & B_FIN_SENT)) + *a = EPOLLIN | EPOLLRDHUP; + if (!(events & A_FIN_SENT)) + *b = EPOLLIN | EPOLLRDHUP; + } else if (events & CONNECT) { *b = EPOLLOUT; + } - *a |= (events & SPLICE_A_OUT_WAIT) ? EPOLLOUT : 0; - *b |= (events & SPLICE_B_OUT_WAIT) ? EPOLLOUT : 0; + *a |= (events & A_OUT_WAIT) ? EPOLLOUT : 0; + *b |= (events & B_OUT_WAIT) ? EPOLLOUT : 0; } static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn); /** - * conn_flag_do() - Set/unset given flag, log, update epoll on SPLICE_CLOSING + * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag * @c: Execution context * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset @@ -181,7 +183,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn, tcp_splice_flag_str[fls(flag)]); } - if (flag == SPLICE_CLOSING) + if (flag == CLOSING) tcp_splice_epoll_ctl(c, conn); } @@ -201,7 +203,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn, */ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn) { - int m = (conn->flags & SPLICE_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a, .r.p.tcp.tcp.splice = 1, .r.p.tcp.tcp.index = conn - tc, @@ -214,15 +216,8 @@ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn) struct epoll_event ev_b = { .data.u64 = ref_b.u64 }; uint32_t events_a, events_b; - if (conn->flags & SPLICE_CLOSING) { - if (conn->flags & SPLICE_IN_EPOLL) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a); - - if (conn->events & SPLICE_CONNECT) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b); - - return 0; - } + if (conn->flags & CLOSING) + goto delete; tcp_splice_conn_epoll_events(conn->events, &events_a, &events_b); ev_a.events = events_a; @@ -230,13 +225,13 @@ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn) if (epoll_ctl(c->epollfd, m, conn->a, &ev_a) || epoll_ctl(c->epollfd, m, conn->b, &ev_b)) - goto err; + goto delete; - conn->flags |= SPLICE_IN_EPOLL; /* No need to log this */ + conn->flags |= IN_EPOLL; /* No need to log this */ return 0; -err: +delete: epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b); return -errno; @@ -251,12 +246,6 @@ err: static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn, unsigned long event) { - if (event == SPLICE_CLOSED) { - conn->events = SPLICE_CLOSED; - debug("TCP (spliced): index %i, CLOSED", conn - tc); - return; - } - if (event & (event - 1)) { if (!(conn->events & ~event)) return; @@ -274,7 +263,7 @@ static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn, } if (tcp_splice_epoll_ctl(c, conn)) - conn_flag(c, conn, SPLICE_CLOSING); + conn_flag(c, conn, CLOSING); } #define conn_event(c, conn, event) \ @@ -304,22 +293,25 @@ static void tcp_table_splice_compact(struct ctx *c, memcpy(hole, move, sizeof(*hole)); move->a = move->b = -1; - move->flags = move->events = 0; move->a_read = move->a_written = move->b_read = move->b_written = 0; + move->pipe_a_b[0] = move->pipe_a_b[1] = -1; + move->pipe_b_a[0] = move->pipe_b_a[1] = -1; + move->flags = move->events = 0; debug("TCP (spliced): index %i moved to %i", move - tc, hole - tc); + tcp_splice_epoll_ctl(c, hole); if (tcp_splice_epoll_ctl(c, hole)) - conn_flag(c, hole, SPLICE_CLOSING); + conn_flag(c, hole, CLOSING); } /** - * tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll + * tcp_splice_destroy() - Close spliced connection and pipes, clear * @c: Execution context * @conn: Connection pointer */ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) { - if (conn->events & SPLICE_ESTABLISHED) { + if (conn->events & ESTABLISHED) { /* Flushing might need to block: don't recycle them. */ if (conn->pipe_a_b[0] != -1) { close(conn->pipe_a_b[0]); @@ -333,18 +325,19 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) } } - if (conn->events & SPLICE_CONNECT) { + if (conn->events & CONNECT) { close(conn->b); conn->b = -1; } - conn_event(c, conn, SPLICE_CLOSED); - close(conn->a); conn->a = -1; - conn->flags = 0; conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0; + conn->events = CLOSED; + conn->flags = 0; + debug("TCP (spliced): index %i, CLOSED", conn - tc); + tcp_table_splice_compact(c, conn); } @@ -364,7 +357,7 @@ static int tcp_splice_connect_finish(struct ctx *c, conn->pipe_a_b[1] = conn->pipe_b_a[1] = -1; for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { - if (splice_pipe_pool[i][0][0] > 0) { + if (splice_pipe_pool[i][0][0] >= 0) { SWAP(conn->pipe_a_b[0], splice_pipe_pool[i][0][0]); SWAP(conn->pipe_a_b[1], splice_pipe_pool[i][0][1]); @@ -377,7 +370,7 @@ static int tcp_splice_connect_finish(struct ctx *c, if (conn->pipe_a_b[0] < 0) { if (pipe2(conn->pipe_a_b, O_NONBLOCK) || pipe2(conn->pipe_b_a, O_NONBLOCK)) { - conn_flag(c, conn, SPLICE_CLOSING); + conn_flag(c, conn, CLOSING); return -EIO; } @@ -385,8 +378,8 @@ static int tcp_splice_connect_finish(struct ctx *c, fcntl(conn->pipe_b_a[0], F_SETPIPE_SZ, c->tcp.pipe_size); } - if (!(conn->events & SPLICE_ESTABLISHED)) - conn_event(c, conn, SPLICE_ESTABLISHED); + if (!(conn->events & ESTABLISHED)) + conn_event(c, conn, ESTABLISHED); return 0; } @@ -450,9 +443,9 @@ static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn, close(sock_conn); return ret; } - conn_event(c, conn, SPLICE_CONNECT); + conn_event(c, conn, CONNECT); } else { - conn_event(c, conn, SPLICE_ESTABLISHED); + conn_event(c, conn, ESTABLISHED); return tcp_splice_connect_finish(c, conn); } @@ -575,20 +568,23 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, conn = CONN(c->tcp.splice_conn_count++); conn->a = s; - conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0; + conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0; if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index)) - conn_flag(c, conn, SPLICE_CLOSING); + conn_flag(c, conn, CLOSING); return; } conn = CONN(ref.r.p.tcp.tcp.index); - if (events & EPOLLERR || events & EPOLLHUP) + if (conn->events == CLOSED) + return; + + if (events & EPOLLERR) goto close; - if (conn->events == SPLICE_CONNECT) { + if (conn->events == CONNECT) { if (!(events & EPOLLOUT)) goto close; if (tcp_splice_connect_finish(c, conn)) @@ -597,9 +593,9 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, if (events & EPOLLOUT) { if (ref.r.s == conn->a) - conn_event(c, conn, ~SPLICE_A_OUT_WAIT); + conn_event(c, conn, ~A_OUT_WAIT); else - conn_event(c, conn, ~SPLICE_B_OUT_WAIT); + conn_event(c, conn, ~B_OUT_WAIT); tcp_splice_dir(conn, ref.r.s, 1, &from, &to, &pipes); } else { @@ -608,9 +604,16 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, if (events & EPOLLRDHUP) { if (ref.r.s == conn->a) - conn_event(c, conn, SPLICE_A_FIN_RCVD); + conn_event(c, conn, A_FIN_RCVD); + else + conn_event(c, conn, B_FIN_RCVD); + } + + if (events & EPOLLHUP) { + if (ref.r.s == conn->a) + conn_event(c, conn, A_FIN_SENT); /* Fake, but implied */ else - conn_event(c, conn, SPLICE_B_FIN_RCVD); + conn_event(c, conn, B_FIN_SENT); } swap: @@ -620,13 +623,13 @@ swap: if (from == conn->a) { seq_read = &conn->a_read; seq_write = &conn->a_written; - lowat_set_flag = SPLICE_RCVLOWAT_SET_A; - lowat_act_flag = SPLICE_RCVLOWAT_ACT_A; + lowat_set_flag = RCVLOWAT_SET_A; + lowat_act_flag = RCVLOWAT_ACT_A; } else { seq_read = &conn->b_read; seq_write = &conn->b_written; - lowat_set_flag = SPLICE_RCVLOWAT_SET_B; - lowat_act_flag = SPLICE_RCVLOWAT_ACT_B; + lowat_set_flag = RCVLOWAT_SET_B; + lowat_act_flag = RCVLOWAT_ACT_B; } while (1) { @@ -636,6 +639,7 @@ swap: retry: readlen = splice(from, NULL, pipes[1], NULL, c->tcp.pipe_size, SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + trace("TCP (spliced): %li from read-side call", readlen); if (readlen < 0) { if (errno == EINTR) goto retry; @@ -660,6 +664,8 @@ retry: eintr: written = splice(pipes[0], NULL, to, NULL, to_write, SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + trace("TCP (spliced): %li from write-side call (passed %lu)", + written, to_write); /* Most common case: skip updating counters. */ if (readlen > 0 && readlen == written) { @@ -697,9 +703,9 @@ eintr: goto retry; if (to == conn->a) - conn_event(c, conn, SPLICE_A_OUT_WAIT); + conn_event(c, conn, A_OUT_WAIT); else - conn_event(c, conn, SPLICE_B_OUT_WAIT); + conn_event(c, conn, B_OUT_WAIT); break; } @@ -715,23 +721,21 @@ eintr: break; } - if ( (conn->events & SPLICE_A_FIN_RCVD) && - !(conn->events & SPLICE_B_FIN_SENT)) { - if (*seq_read == *seq_write) { + if ((conn->events & A_FIN_RCVD) && !(conn->events & B_FIN_SENT)) { + if (*seq_read == *seq_write && eof) { shutdown(conn->b, SHUT_WR); - conn_event(c, conn, SPLICE_B_FIN_SENT); + conn_event(c, conn, B_FIN_SENT); } } - if ( (conn->events & SPLICE_B_FIN_RCVD) && - !(conn->events & SPLICE_A_FIN_SENT)) { - if (*seq_read == *seq_write) { + if ((conn->events & B_FIN_RCVD) && !(conn->events & A_FIN_SENT)) { + if (*seq_read == *seq_write && eof) { shutdown(conn->a, SHUT_WR); - conn_event(c, conn, SPLICE_A_FIN_SENT); + conn_event(c, conn, A_FIN_SENT); } } - if (CONN_HAS(conn, SPLICE_A_FIN_SENT | SPLICE_B_FIN_SENT)) + if (CONN_HAS(conn, A_FIN_SENT | B_FIN_SENT)) goto close; if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) { @@ -746,10 +750,13 @@ eintr: goto swap; } + if (events & EPOLLHUP) + goto close; + return; close: - conn_flag(c, conn, SPLICE_CLOSING); + conn_flag(c, conn, CLOSING); } /** @@ -829,38 +836,36 @@ void tcp_splice_init(struct ctx *c) /** * tcp_splice_timer() - Timer for spliced connections * @c: Execution context - * @now: Current timestamp */ -void tcp_splice_timer(struct ctx *c, struct timespec *now) +void tcp_splice_timer(struct ctx *c) { struct tcp_splice_conn *conn; for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) { - if (conn->flags & SPLICE_CLOSING) { + if (conn->flags & CLOSING) { tcp_splice_destroy(c, conn); - continue; + return; } - if ( (conn->flags & SPLICE_RCVLOWAT_SET_A) && - !(conn->flags & SPLICE_RCVLOWAT_ACT_A)) { + if ( (conn->flags & RCVLOWAT_SET_A) && + !(conn->flags & RCVLOWAT_ACT_A)) { setsockopt(conn->a, SOL_SOCKET, SO_RCVLOWAT, &((int){ 1 }), sizeof(int)); - conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_A); + conn_flag(c, conn, ~RCVLOWAT_SET_A); } - if ( (conn->flags & SPLICE_RCVLOWAT_SET_B) && - !(conn->flags & SPLICE_RCVLOWAT_ACT_B)) { + if ( (conn->flags & RCVLOWAT_SET_B) && + !(conn->flags & RCVLOWAT_ACT_B)) { setsockopt(conn->b, SOL_SOCKET, SO_RCVLOWAT, &((int){ 1 }), sizeof(int)); - conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_B); + conn_flag(c, conn, ~RCVLOWAT_SET_B); } - conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_A); - conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_B); + conn_flag(c, conn, ~RCVLOWAT_ACT_A); + conn_flag(c, conn, ~RCVLOWAT_ACT_B); } - if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) - tcp_splice_pipe_refill(c); + tcp_splice_pipe_refill(c); } /** @@ -869,14 +874,15 @@ void tcp_splice_timer(struct ctx *c, struct timespec *now) */ void tcp_splice_defer_handler(struct ctx *c) { + int max_conns = c->tcp.conn_count / 100 * TCP_SPLICE_CONN_PRESSURE; int max_files = c->nofile / 100 * TCP_SPLICE_FILE_PRESSURE; struct tcp_splice_conn *conn; - if (c->tcp.splice_conn_count * 6 < max_files) + if (c->tcp.splice_conn_count < MIN(max_files / 6, max_conns)) return; for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) { - if (conn->flags & SPLICE_CLOSING) + if (conn->flags & CLOSING) tcp_splice_destroy(c, conn); } } diff --git a/tcp_splice.h b/tcp_splice.h index b744ba7..f7c2f86 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -11,5 +11,5 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, uint32_t events); void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); void tcp_splice_init(struct ctx *c); -void tcp_splice_timer(struct ctx *c, struct timespec *now); +void tcp_splice_timer(struct ctx *c); void tcp_splice_defer_handler(struct ctx *c); @@ -951,35 +951,35 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @addr: Destination address - * @msg: Input messages - * @count: Message count + * @p: Pool of UDP packets, with UDP headers * @now: Current timestamp * * Return: count of consumed packets * * #syscalls sendmmsg */ -int udp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now) +int udp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now) { - /* The caller already checks that all the messages have the same source - * and destination, so we can just take those from the first message. - */ - struct udphdr *uh = (struct udphdr *)(pkt_buf + msg[0].pkt_buf_offset); struct mmsghdr mm[UIO_MAXIOV] = { 0 }; struct iovec m[UIO_MAXIOV]; struct sockaddr_in6 s_in6; struct sockaddr_in s_in; struct sockaddr *sa; + int i, s, count = 0; in_port_t src, dst; + struct udphdr *uh; socklen_t sl; - int i, s; (void)c; - if (msg[0].l4_len < sizeof(*uh)) + uh = packet_get(p, 0, 0, sizeof(*uh), NULL); + if (!uh) return 1; + /* The caller already checks that all the messages have the same source + * and destination, so we can just take those from the first message. + */ src = ntohs(uh->source); dst = ntohs(uh->dest); @@ -998,8 +998,8 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, .udp.port = src }; s = sock_l4(c, AF_INET, IPPROTO_UDP, src, 0, uref.u32); - if (s <= 0) - return count; + if (s < 0) + return p->count; udp_tap_map[V4][src].sock = s; bitmap_set(udp_act[V4][UDP_ACT_TAP], src); @@ -1050,8 +1050,8 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, s = sock_l4(c, AF_INET6, IPPROTO_UDP, src, bind_to, uref.u32); - if (s <= 0) - return count; + if (s < 0) + return p->count; udp_tap_map[V6][src].sock = s; bitmap_set(udp_act[V6][UDP_ACT_TAP], src); @@ -1060,18 +1060,26 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, udp_tap_map[V6][src].ts = now->tv_sec; } - for (i = 0; i < count; i++) { + for (i = 0; i < (int)p->count; i++) { struct udphdr *uh_send; + size_t len; + + uh_send = packet_get(p, i, 0, sizeof(*uh), &len); + if (!uh_send) + return p->count; + if (!len) + continue; - uh_send = (struct udphdr *)(msg[i].pkt_buf_offset + pkt_buf); m[i].iov_base = (char *)(uh_send + 1); - m[i].iov_len = msg[i].l4_len - sizeof(*uh_send); + m[i].iov_len = len; mm[i].msg_hdr.msg_name = sa; mm[i].msg_hdr.msg_namelen = sl; mm[i].msg_hdr.msg_iov = m + i; mm[i].msg_hdr.msg_iovlen = 1; + + count++; } count = sendmmsg(s, mm, count, MSG_NOSIGNAL); @@ -1172,13 +1180,11 @@ static void udp_splice_iov_init(void) * * Return: 0 on success, -1 on failure */ -int udp_sock_init(struct ctx *c, struct timespec *now) +int udp_sock_init(struct ctx *c) { union udp_epoll_ref uref = { .udp.bound = 1 }; int dst, s; - (void)now; - for (dst = 0; dst < USHRT_MAX; dst++) { if (!bitmap_isset(c->udp.port_to_tap, dst)) continue; @@ -10,9 +10,9 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now); -int udp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now); -int udp_sock_init(struct ctx *c, struct timespec *now); +int udp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now); +int udp_sock_init(struct ctx *c); void udp_timer(struct ctx *c, struct timespec *ts); void udp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s, const uint32_t *ip_da); @@ -38,6 +38,7 @@ #include "util.h" #include "passt.h" +#include "packet.h" /* For __openlog() and __setlogmask() wrappers, and passt_vsyslog() */ static int log_mask; @@ -156,46 +157,59 @@ void passt_vsyslog(int pri, const char *format, va_list ap) send(log_sock, buf, n, 0); } +#define IPV6_NH_OPT(nh) \ + ((nh) == 0 || (nh) == 43 || (nh) == 44 || (nh) == 50 || \ + (nh) == 51 || (nh) == 60 || (nh) == 135 || (nh) == 139 || \ + (nh) == 140 || (nh) == 253 || (nh) == 254) + /** * ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol - * @ip6h: IPv6 header + * @p: Packet pool, packet number @index has IPv6 header at @offset + * @index: Index of packet in pool + * @offset: Pre-calculated IPv6 header offset * @proto: Filled with L4 protocol number + * @dlen: Data length (payload excluding header extensions), set on return * * Return: pointer to L4 header, NULL if not found */ -char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto) +char *ipv6_l4hdr(struct pool *p, int index, size_t offset, uint8_t *proto, + size_t *dlen) { - int offset, len, hdrlen; struct ipv6_opt_hdr *o; + struct ipv6hdr *ip6h; + char *base; + int hdrlen; uint8_t nh; - len = ntohs(ip6h->payload_len); - offset = 0; + base = packet_get(p, index, 0, 0, NULL); + ip6h = packet_get(p, index, offset, sizeof(*ip6h), dlen); + if (!ip6h) + return NULL; - while (offset < len) { - if (!offset) { - nh = ip6h->nexthdr; - hdrlen = sizeof(struct ipv6hdr); - } else { - o = (struct ipv6_opt_hdr *)(((char *)ip6h) + offset); - nh = o->nexthdr; - hdrlen = (o->hdrlen + 1) * 8; - } + offset += sizeof(*ip6h); - if (nh == 59) - return NULL; + nh = ip6h->nexthdr; + if (!IPV6_NH_OPT(nh)) + goto found; + + while ((o = packet_get_try(p, index, offset, sizeof(*o), dlen))) { + nh = o->nexthdr; + hdrlen = (o->hdrlen + 1) * 8; - if (nh == 0 || nh == 43 || nh == 44 || nh == 50 || - nh == 51 || nh == 60 || nh == 135 || nh == 139 || - nh == 140 || nh == 253 || nh == 254) { + if (IPV6_NH_OPT(nh)) offset += hdrlen; - } else { - *proto = nh; - return (char *)(ip6h + 1) + offset; - } + else + goto found; } return NULL; + +found: + if (nh == 59) + return NULL; + + *proto = nh; + return base + offset; } /** @@ -153,6 +153,8 @@ enum { #include <limits.h> #include <stdarg.h> +#include "packet.h" + enum bind_type { BIND_ANY = 0, BIND_LOOPBACK, @@ -194,7 +196,8 @@ __attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); } void __openlog(const char *ident, int option, int facility); void passt_vsyslog(int pri, const char *format, va_list ap); void __setlogmask(int mask); -char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto); +char *ipv6_l4hdr(struct pool *p, int index, size_t offset, uint8_t *proto, + size_t *dlen); int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, enum bind_type bind_addr, uint32_t data); void sock_probe_mem(struct ctx *c); |