aboutgitcodebugslistschat
diff options
context:
space:
mode:
-rw-r--r--README.md2
-rw-r--r--arp.c51
-rw-r--r--arp.h2
-rw-r--r--dhcp.c54
-rw-r--r--dhcp.h2
-rw-r--r--dhcpv6.c151
-rw-r--r--dhcpv6.h3
-rw-r--r--icmp.c28
-rw-r--r--icmp.h4
-rw-r--r--ndp.c59
-rw-r--r--ndp.h3
-rw-r--r--packet.c134
-rw-r--r--packet.h81
-rw-r--r--passt.h1
-rw-r--r--tap.c331
-rw-r--r--tcp.c442
-rw-r--r--tcp.h16
-rw-r--r--tcp_splice.c216
-rw-r--r--tcp_splice.h2
-rw-r--r--udp.c46
-rw-r--r--udp.h6
-rw-r--r--util.c60
-rw-r--r--util.h5
23 files changed, 999 insertions, 700 deletions
diff --git a/README.md b/README.md
index f01f89f..9d08470 100644
--- a/README.md
+++ b/README.md
@@ -291,7 +291,7 @@ speeding up local connections, and usually requiring NAT. _pasta_:
* ✅ restrictive seccomp profiles (25 syscalls allowed for _passt_, 37 for
_pasta_ on x86_64)
* ✅ static checkers in continuous integration (clang-tidy, cppcheck)
-* 🛠️ clearly defined packet abstraction
+* ✅️ clearly defined boundary-checked packet abstraction
* 🛠️ ~5 000 LoC target
* ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests
* ⌚ stricter [synflood protection](https://bugs.passt.top/show_bug.cgi?id=10)
diff --git a/arp.c b/arp.c
index 3195692..bcce804 100644
--- a/arp.c
+++ b/arp.c
@@ -30,53 +30,56 @@
#include "tap.h"
/**
- * arp() - Check if this is an ARP message, reply as needed
+ * arp() - Check if this is a supported ARP message, reply as needed
* @c: Execution context
- * @len: Total L2 packet length
- * @eh: Packet buffer, Ethernet header
+ * @p: Packet pool, single packet with Ethernet buffer
*
- * Return: 0 if it's not an ARP message, 1 if handled, -1 on failure
+ * Return: 1 if handled, -1 on failure
*/
-int arp(struct ctx *c, struct ethhdr *eh, size_t len)
+int arp(struct ctx *c, struct pool *p)
{
- struct arphdr *ah = (struct arphdr *)(eh + 1);
- struct arpmsg *am = (struct arpmsg *)(ah + 1);
unsigned char swap[4];
+ struct ethhdr *eh;
+ struct arphdr *ah;
+ struct arpmsg *am;
+ size_t len;
- if (eh->h_proto != htons(ETH_P_ARP))
- return 0;
+ eh = packet_get(p, 0, 0, sizeof(*eh), NULL);
+ ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL);
+ am = packet_get(p, 0, sizeof(*eh) + sizeof(*ah), sizeof(*am), NULL);
- if (len < sizeof(*eh) + sizeof(*ah) + sizeof(*am))
+ if (!eh || !ah || !am)
return -1;
- if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
- ah->ar_pro != htons(ETH_P_IP) ||
- ah->ar_hln != ETH_ALEN || ah->ar_pln != 4 ||
- ah->ar_op != htons(ARPOP_REQUEST))
+ if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
+ ah->ar_pro != htons(ETH_P_IP) ||
+ ah->ar_hln != ETH_ALEN ||
+ ah->ar_pln != 4 ||
+ ah->ar_op != htons(ARPOP_REQUEST))
return 1;
/* Discard announcements (but not 0.0.0.0 "probes"): we might have the
* same IP address, hide that.
*/
- if (memcmp(am->sip, (unsigned char[4]){ 0, 0, 0, 0 }, 4) &&
- !memcmp(am->sip, am->tip, 4))
+ if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) &&
+ !memcmp(am->sip, am->tip, sizeof(am->sip)))
return 1;
/* Don't resolve our own address, either. */
- if (!memcmp(am->tip, &c->addr4, 4))
+ if (!memcmp(am->tip, &c->addr4, sizeof(am->tip)))
return 1;
ah->ar_op = htons(ARPOP_REPLY);
- memcpy(am->tha, am->sha, ETH_ALEN);
- memcpy(am->sha, c->mac, ETH_ALEN);
+ memcpy(am->tha, am->sha, sizeof(am->tha));
+ memcpy(am->sha, c->mac, sizeof(am->sha));
- memcpy(swap, am->tip, 4);
- memcpy(am->tip, am->sip, 4);
- memcpy(am->sip, swap, 4);
+ memcpy(swap, am->tip, sizeof(am->tip));
+ memcpy(am->tip, am->sip, sizeof(am->tip));
+ memcpy(am->sip, swap, sizeof(am->sip));
len = sizeof(*eh) + sizeof(*ah) + sizeof(*am);
- memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
- memcpy(eh->h_source, c->mac, ETH_ALEN);
+ memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest));
+ memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
if (tap_send(c, eh, len, 0) < 0)
perror("ARP: send");
diff --git a/arp.h b/arp.h
index a198969..6ef3736 100644
--- a/arp.h
+++ b/arp.h
@@ -17,4 +17,4 @@ struct arpmsg {
unsigned char tip[4];
} __attribute__((__packed__));
-int arp(struct ctx *c, struct ethhdr *eh, size_t len);
+int arp(struct ctx *c, struct pool *p);
diff --git a/dhcp.c b/dhcp.c
index d24ef86..7f2191d 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -22,9 +22,11 @@
#include <stdint.h>
#include <unistd.h>
#include <string.h>
+#include <limits.h>
#include "util.h"
#include "checksum.h"
+#include "packet.h"
#include "passt.h"
#include "tap.h"
#include "dhcp.h"
@@ -257,27 +259,32 @@ static void opt_set_dns_search(struct ctx *c, size_t max_len)
/**
* dhcp() - Check if this is a DHCP message, reply as needed
* @c: Execution context
- * @len: Total L2 packet length
- * @eh: Packet buffer, Ethernet header
+ * @p: Packet pool, single packet with Ethernet buffer
*
* Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure
*/
-int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
+int dhcp(struct ctx *c, struct pool *p)
{
- struct iphdr *iph = (struct iphdr *)(eh + 1);
- size_t mlen, olen;
+ size_t mlen, len, offset = 0, opt_len, opt_off = 0;
+ struct ethhdr *eh;
+ struct iphdr *iph;
struct udphdr *uh;
unsigned int i;
struct msg *m;
- if (len < sizeof(*eh) + sizeof(*iph))
- return 0;
+ eh = packet_get(p, 0, offset, sizeof(*eh), NULL);
+ offset += sizeof(*eh);
- if (len < sizeof(*eh) + (long)iph->ihl * 4 + sizeof(*uh))
- return 0;
+ iph = packet_get(p, 0, offset, sizeof(*iph), NULL);
+ if (!eh || !iph)
+ return -1;
- uh = (struct udphdr *)((char *)iph + (long)(iph->ihl * 4));
- m = (struct msg *)(uh + 1);
+ offset += iph->ihl * 4UL;
+ uh = packet_get(p, 0, offset, sizeof(*uh), &mlen);
+ offset += sizeof(*uh);
+
+ if (!uh)
+ return -1;
if (uh->dest != htons(67))
return 0;
@@ -285,18 +292,29 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
if (c->no_dhcp)
return 1;
- mlen = len - sizeof(*eh) - (long)iph->ihl * 4 - sizeof(*uh);
- if (mlen != ntohs(uh->len) - sizeof(*uh) ||
- mlen < offsetof(struct msg, o) ||
+ m = packet_get(p, 0, offset, offsetof(struct msg, o), &opt_len);
+ if (!m ||
+ mlen != ntohs(uh->len) - sizeof(*uh) ||
+ mlen < offsetof(struct msg, o) ||
m->op != BOOTREQUEST)
return -1;
- olen = mlen - offsetof(struct msg, o);
- for (i = 0; i + 2 < olen; i += m->o[i + 1] + 2) {
- if (m->o[i + 1] + i + 2 >= olen)
+ offset += offsetof(struct msg, o);
+
+ while (opt_off + 2 < opt_len) {
+ uint8_t *olen, *type, *val;
+
+ type = packet_get(p, 0, offset + opt_off, 1, NULL);
+ olen = packet_get(p, 0, offset + opt_off + 1, 1, NULL);
+ if (!type || !olen)
+ return -1;
+
+ val = packet_get(p, 0, offset + opt_off + 2, *olen, NULL);
+ if (!val)
return -1;
- memcpy(&opts[m->o[i]].c, &m->o[i + 2], m->o[i + 1]);
+ memcpy(&opts[*type].c, val, *olen);
+ opt_off += *olen + 2;
}
if (opts[53].c[0] == DHCPDISCOVER) {
diff --git a/dhcp.h b/dhcp.h
index 91697b5..7c72fd2 100644
--- a/dhcp.h
+++ b/dhcp.h
@@ -3,5 +3,5 @@
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
-int dhcp(struct ctx *c, struct ethhdr *eh, size_t len);
+int dhcp(struct ctx *c, struct pool *p);
void dhcp_init(void);
diff --git a/dhcpv6.c b/dhcpv6.c
index 375ba79..5c9ea88 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -24,7 +24,9 @@
#include <unistd.h>
#include <string.h>
#include <time.h>
+#include <limits.h>
+#include "packet.h"
#include "util.h"
#include "passt.h"
#include "tap.h"
@@ -69,6 +71,8 @@ struct opt_hdr {
#endif
#define OPT_SIZE(x) OPT_SIZE_CONV(sizeof(struct opt_##x) - \
sizeof(struct opt_hdr))
+#define OPT_VSIZE(x) (sizeof(struct opt_##x) - \
+ sizeof(struct opt_hdr))
/**
* struct opt_client_id - DHCPv6 Client Identifier option
@@ -265,10 +269,10 @@ static const struct opt_status_code sc_not_on_link = {
/**
* struct resp_not_on_link_t - NotOnLink error (mandated by RFC 8415, 18.3.2.)
- * @uh: UDP header
- * @hdr: DHCP message header
- * @server_id: Server Identifier option
- * @var: Payload: IA_NA from client, status code, client ID
+ * @uh: UDP header
+ * @hdr: DHCP message header
+ * @server_id: Server Identifier option
+ * @var: Payload: IA_NA from client, status code, client ID
*/
static struct resp_not_on_link_t {
struct udphdr uh;
@@ -287,26 +291,30 @@ static struct resp_not_on_link_t {
/**
* dhcpv6_opt() - Get option from DHCPv6 message
- * @o: First option header to check
- * @type: Option type to look up, network order
- * @len: Remaining length, host order, modified on return
+ * @p: Packet pool, single packet with UDP header
+ * @offset: Offset to look at, 0: end of header, set to option start
+ * @type: Option type to look up, network order
*
* Return: pointer to option header, or NULL on malformed or missing option
*/
-static struct opt_hdr *dhcpv6_opt(struct opt_hdr *o, uint16_t type, size_t *len)
+static struct opt_hdr *dhcpv6_opt(struct pool *p, size_t *offset, uint16_t type)
{
- while (*len >= sizeof(struct opt_hdr)) {
- unsigned int opt_len = ntohs(o->l) + sizeof(struct opt_hdr);
+ struct opt_hdr *o;
+ size_t left;
- if (opt_len > *len)
- return NULL;
+ if (!*offset)
+ *offset = sizeof(struct udphdr) + sizeof(struct msg_hdr);
+
+ while ((o = packet_get_try(p, 0, *offset, sizeof(*o), &left))) {
+ unsigned int opt_len = ntohs(o->l) + sizeof(*o);
- *len -= opt_len;
+ if (ntohs(o->l) > left)
+ return NULL;
if (o->t == type)
return o;
- o = (struct opt_hdr *)((uint8_t *)o + opt_len);
+ *offset += opt_len;
}
return NULL;
@@ -314,61 +322,45 @@ static struct opt_hdr *dhcpv6_opt(struct opt_hdr *o, uint16_t type, size_t *len)
/**
* dhcpv6_ia_notonlink() - Check if any IA contains non-appropriate addresses
- * @o: First option header to check for IAs
- * @rem_len: Remaining message length, host order
- * @addr: Address we want to lease to the client
+ * @p: Packet pool, single packet starting from UDP header
+ * @la: Address we want to lease to the client, any other address
+ * requested in an IA makes that IA non-appropriate
*
* Return: pointer to non-appropriate IA_NA or IA_TA, if any, NULL otherwise
*/
-static struct opt_hdr *dhcpv6_ia_notonlink(struct opt_hdr *o, size_t rem_len,
- struct in6_addr *addr)
+static struct opt_hdr *dhcpv6_ia_notonlink(struct pool *p, struct in6_addr *la)
{
- struct opt_hdr *ia, *ia_addr;
char buf[INET6_ADDRSTRLEN];
struct in6_addr *req_addr;
- size_t len;
+ struct opt_hdr *ia, *h;
+ size_t offset;
int ia_type;
ia_type = OPT_IA_NA;
ia_ta:
- len = rem_len;
- ia = o;
-
- while ((ia = dhcpv6_opt(ia, ia_type, &len))) {
- size_t ia_len = ntohs(ia->l);
-
- if (ia_type == OPT_IA_NA) {
- struct opt_ia_na *subopt = (struct opt_ia_na *)ia + 1;
-
- ia_addr = (struct opt_hdr *)subopt;
- } else if (ia_type == OPT_IA_TA) {
- struct opt_ia_ta *subopt = (struct opt_ia_ta *)ia + 1;
-
- ia_addr = (struct opt_hdr *)subopt;
- }
+ offset = 0;
+ while ((ia = dhcpv6_opt(p, &offset, ia_type))) {
+ if (ntohs(ia->l) < OPT_VSIZE(ia_na))
+ return NULL;
- ia_len -= sizeof(struct opt_ia_na) - sizeof(struct opt_hdr);
+ offset += sizeof(struct opt_ia_na);
- while ((ia_addr = dhcpv6_opt(ia_addr, OPT_IAAADR, &ia_len))) {
- struct opt_ia_addr *next;
+ while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) {
+ struct opt_ia_addr *opt_addr = (struct opt_ia_addr *)h;
- req_addr = (struct in6_addr *)(ia_addr + 1);
+ if (ntohs(h->l) != OPT_VSIZE(ia_addr))
+ return NULL;
- if (!IN6_ARE_ADDR_EQUAL(addr, req_addr)) {
+ req_addr = &opt_addr->addr;
+ if (!IN6_ARE_ADDR_EQUAL(la, req_addr)) {
info("DHCPv6: requested address %s not on link",
inet_ntop(AF_INET6, req_addr,
buf, sizeof(buf)));
return ia;
}
- next = (struct opt_ia_addr *)ia_addr + 1;
- ia_addr = (struct opt_hdr *)next;
+ offset += sizeof(struct opt_ia_addr);
}
-
- if (!ia_addr)
- break;
-
- ia = ia_addr;
}
if (ia_type == OPT_IA_NA) {
@@ -449,59 +441,58 @@ search:
/**
* dhcpv6() - Check if this is a DHCPv6 message, reply as needed
* @c: Execution context
- * @eh: Packet buffer, Ethernet header
- * @len: Total L2 packet length
+ * @p: Packet pool, single packet starting from UDP header
+ * @saddr: Source IPv6 address of original message
+ * @daddr: Destination IPv6 address of original message
*
* Return: 0 if it's not a DHCPv6 message, 1 if handled, -1 on failure
*/
-int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len)
+int dhcpv6(struct ctx *c, struct pool *p,
+ const struct in6_addr *saddr, const struct in6_addr *daddr)
{
- struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
struct opt_hdr *ia, *bad_ia, *client_id, *server_id;
struct in6_addr *src;
struct msg_hdr *mh;
struct udphdr *uh;
- uint8_t proto;
- size_t mlen;
- size_t n;
+ size_t mlen, n;
- uh = (struct udphdr *)ipv6_l4hdr(ip6h, &proto);
- if (!uh || proto != IPPROTO_UDP || uh->dest != htons(547))
+ uh = packet_get(p, 0, 0, sizeof(*uh), &mlen);
+ if (!uh)
+ return -1;
+
+ if (uh->dest != htons(547))
return 0;
if (c->no_dhcpv6)
return 1;
- if (!IN6_IS_ADDR_MULTICAST(&ip6h->daddr))
+ if (!IN6_IS_ADDR_MULTICAST(daddr))
return -1;
- mlen = len - ((intptr_t)uh - (intptr_t)eh) - sizeof(*uh);
-
- if (mlen != ntohs(uh->len) - sizeof(*uh) ||
- mlen < sizeof(struct msg_hdr))
+ if (mlen + sizeof(*uh) != ntohs(uh->len) || mlen < sizeof(*mh))
return -1;
- c->addr6_ll_seen = ip6h->saddr;
+ c->addr6_ll_seen = *saddr;
if (IN6_IS_ADDR_LINKLOCAL(&c->gw6))
src = &c->gw6;
else
src = &c->addr6_ll;
- mh = (struct msg_hdr *)(uh + 1);
- mlen -= sizeof(struct msg_hdr);
+ mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL);
+ if (!mh)
+ return -1;
- n = mlen;
- client_id = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_CLIENTID, &n);
- if (!client_id || ntohs(client_id->l) > ntohs(OPT_SIZE(client_id)))
+ client_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_CLIENTID);
+ if (!client_id || ntohs(client_id->l) > OPT_VSIZE(client_id))
return -1;
- n = mlen;
- server_id = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_SERVERID, &n);
+ server_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_SERVERID);
+ if (server_id && ntohs(server_id->l) != OPT_VSIZE(server_id))
+ return -1;
- n = mlen;
- ia = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_IA_NA, &n);
- if (ia && ntohs(ia->l) < ntohs(OPT_SIZE(ia_na)))
+ ia = dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_NA);
+ if (ia && ntohs(ia->l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta)))
return -1;
resp.hdr.type = TYPE_REPLY;
@@ -516,18 +507,17 @@ int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len)
if (mh->type == TYPE_CONFIRM && server_id)
return -1;
- if ((bad_ia = dhcpv6_ia_notonlink((struct opt_hdr *)(mh + 1),
- mlen, &c->addr6))) {
+ if ((bad_ia = dhcpv6_ia_notonlink(p, &c->addr6))) {
info("DHCPv6: received CONFIRM with inappropriate IA,"
" sending NotOnLink status in REPLY");
- n = ntohs(bad_ia->l) + sizeof(struct opt_hdr);
- bad_ia->l = htons(n - sizeof(struct opt_hdr) +
+ bad_ia->l = htons(OPT_VSIZE(ia_na) +
sizeof(sc_not_on_link));
+ n = sizeof(struct opt_ia_na);
memcpy(resp_not_on_link.var, bad_ia, n);
- memcpy(resp_not_on_link.var + n, &sc_not_on_link,
- sizeof(sc_not_on_link));
+ memcpy(resp_not_on_link.var + n,
+ &sc_not_on_link, sizeof(sc_not_on_link));
n += sizeof(sc_not_on_link);
memcpy(resp_not_on_link.var + n, client_id,
@@ -552,8 +542,7 @@ int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len)
memcmp(&resp.server_id, server_id, sizeof(resp.server_id)))
return -1;
- n = mlen;
- if (ia || dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_IA_TA, &n))
+ if (ia || dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_TA))
return -1;
info("DHCPv6: received INFORMATION_REQUEST, sending REPLY");
diff --git a/dhcpv6.h b/dhcpv6.h
index 36b6a57..73d28d3 100644
--- a/dhcpv6.h
+++ b/dhcpv6.h
@@ -3,5 +3,6 @@
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
-int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len);
+int dhcpv6(struct ctx *c, struct pool *p,
+	   const struct in6_addr *saddr, const struct in6_addr *daddr);
void dhcpv6_init(struct ctx *c);
diff --git a/icmp.c b/icmp.c
index 67859e0..075bc4c 100644
--- a/icmp.c
+++ b/icmp.c
@@ -31,9 +31,11 @@
#include <linux/icmpv6.h>
+#include "packet.h"
#include "util.h"
#include "passt.h"
#include "tap.h"
+#include "packet.h"
#include "icmp.h"
#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */
@@ -134,17 +136,15 @@ void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* icmp_tap_handler() - Handle packets from tap
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
- * @
- * @msg: Input message
- * @count: Message count (always 1 for ICMP)
+ * @addr: Destination address, struct in_addr or in6_addr depending on @af
+ * @p: Packet pool, single packet with ICMP/ICMPv6 header
* @now: Current timestamp
*
* Return: count of consumed packets (always 1, even if malformed)
*/
-int icmp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_l4_msg *msg, int count, struct timespec *now)
+int icmp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
+ struct timespec *now)
{
- (void)count;
+ size_t plen;
if (af == AF_INET) {
union icmp_epoll_ref iref = { .icmp.v6 = 0 };
@@ -155,9 +155,8 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct icmphdr *ih;
int id, s;
- ih = (struct icmphdr *)(pkt_buf + msg[0].pkt_buf_offset);
-
- if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO)
+ ih = packet_get(p, 0, 0, sizeof(*ih), &plen);
+ if (!ih)
return 1;
sa.sin_port = ih->un.echo.id;
@@ -175,7 +174,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
bitmap_set(icmp_act[V4], id);
sa.sin_addr = *(struct in_addr *)addr;
- sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL,
+ sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
} else if (af == AF_INET6) {
union icmp_epoll_ref iref = { .icmp.v6 = 1 };
@@ -186,10 +185,11 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
struct icmp6hdr *ih;
int id, s;
- ih = (struct icmp6hdr *)(pkt_buf + msg[0].pkt_buf_offset);
+ ih = packet_get(p, 0, 0, sizeof(struct icmp6hdr), &plen);
+ if (!ih)
+ return 1;
- if (msg[0].l4_len < sizeof(*ih) ||
- (ih->icmp6_type != 128 && ih->icmp6_type != 129))
+ if (ih->icmp6_type != 128 && ih->icmp6_type != 129)
return 1;
sa.sin6_port = ih->icmp6_identifier;
@@ -207,7 +207,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
bitmap_set(icmp_act[V6], id);
sa.sin6_addr = *(struct in6_addr *)addr;
- sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL,
+ sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
}
diff --git a/icmp.h b/icmp.h
index 89b5f55..2152a66 100644
--- a/icmp.h
+++ b/icmp.h
@@ -12,8 +12,8 @@ struct ctx;
void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
-int icmp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_l4_msg *msg, int count, struct timespec *now);
+int icmp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
+ struct timespec *now);
void icmp_timer(struct ctx *c, struct timespec *ts);
/**
diff --git a/ndp.c b/ndp.c
index 6b1c1a8..b40a0c4 100644
--- a/ndp.c
+++ b/ndp.c
@@ -39,28 +39,23 @@
/**
* ndp() - Check for NDP solicitations, reply as needed
* @c: Execution context
- * @len: Total L2 packet length
- * @eh: Packet buffer, Ethernet header
+ * @ih: ICMPv6 header
+ * @eh_source: Source Ethernet address
+ * @saddr: Source IPv6 address
*
* Return: 0 if not handled here, 1 if handled, -1 on failure
*/
-int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
+int ndp(struct ctx *c, struct icmp6hdr *ih, unsigned char *eh_source,
+ struct in6_addr *saddr)
{
- struct ethhdr *ehr;
- struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1), *ip6hr;
- struct icmp6hdr *ih, *ihr;
char buf[BUFSIZ] = { 0 };
- uint8_t proto, *p;
-
- if (len < sizeof(*ehr) + sizeof(*ip6h) + sizeof(*ih))
- return 0;
-
- ih = (struct icmp6hdr *)ipv6_l4hdr(ip6h, &proto);
- if (!ih)
- return -1;
+ struct ipv6hdr *ip6hr;
+ struct icmp6hdr *ihr;
+ struct ethhdr *ehr;
+ unsigned char *p;
+ size_t len;
- if (proto != IPPROTO_ICMPV6 ||
- ih->icmp6_type < RS || ih->icmp6_type > NA)
+ if (ih->icmp6_type < RS || ih->icmp6_type > NA)
return 0;
if (c->no_ndp)
@@ -71,11 +66,7 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
ihr = (struct icmp6hdr *)(ip6hr + 1);
if (ih->icmp6_type == NS) {
- if (len < sizeof(*ehr) + sizeof(*ip6h) + sizeof(*ih) +
- sizeof(struct in6_addr))
- return -1;
-
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->saddr))
+ if (IN6_IS_ADDR_UNSPECIFIED(saddr))
return 1;
info("NDP: received NS, sending NA");
@@ -132,10 +123,10 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->dns6[n]); n++);
if (n) {
- *p++ = 25; /* RDNSS */
- *p++ = 1 + 2 * n; /* length */
- p += 2; /* reserved */
- *(uint32_t *)p = htonl(60); /* lifetime */
+ *p++ = 25; /* RDNSS */
+ *p++ = 1 + 2 * n; /* length */
+ p += 2; /* reserved */
+ *(uint32_t *)p = htonl(60); /* lifetime */
p += 4;
for (i = 0; i < n; i++) {
@@ -148,10 +139,10 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
}
if (!c->no_dhcp_dns_search && dns_s_len) {
- *p++ = 31; /* DNSSL */
- *p++ = (len + 8 - 1) / 8 + 1; /* length */
- p += 2; /* reserved */
- *(uint32_t *)p = htonl(60); /* lifetime */
+ *p++ = 31; /* DNSSL */
+ *p++ = (dns_s_len + 8 - 1) / 8 + 1; /* length */
+ p += 2; /* reserved */
+ *(uint32_t *)p = htonl(60); /* lifetime */
p += 4;
for (i = 0; i < n; i++) {
@@ -185,12 +176,12 @@ dns_done:
len = (uintptr_t)p - (uintptr_t)ihr - sizeof(*ihr);
- if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
- c->addr6_ll_seen = ip6h->saddr;
+ if (IN6_IS_ADDR_LINKLOCAL(saddr))
+ c->addr6_ll_seen = *saddr;
else
- c->addr6_seen = ip6h->saddr;
+ c->addr6_seen = *saddr;
- ip6hr->daddr = ip6h->saddr;
+ ip6hr->daddr = *saddr;
if (IN6_IS_ADDR_LINKLOCAL(&c->gw6))
ip6hr->saddr = c->gw6;
else
@@ -207,7 +198,7 @@ dns_done:
ip6hr->hop_limit = 255;
len += sizeof(*ehr) + sizeof(*ip6hr) + sizeof(*ihr);
- memcpy(ehr->h_dest, eh->h_source, ETH_ALEN);
+ memcpy(ehr->h_dest, eh_source, ETH_ALEN);
memcpy(ehr->h_source, c->mac, ETH_ALEN);
ehr->h_proto = htons(ETH_P_IPV6);
diff --git a/ndp.h b/ndp.h
index 918fb66..a26673e 100644
--- a/ndp.h
+++ b/ndp.h
@@ -3,4 +3,5 @@
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
-int ndp(struct ctx *c, struct ethhdr *eh, size_t len);
+int ndp(struct ctx *c, struct icmp6hdr *ih, unsigned char *eh_source,
+ struct in6_addr *saddr);
diff --git a/packet.c b/packet.c
new file mode 100644
index 0000000..0b7cb20
--- /dev/null
+++ b/packet.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * packet.c - Packet abstraction: add packets to pool, flush, get packet data
+ *
+ * Copyright (c) 2020-2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <netinet/ip6.h>
+
+#include "packet.h"
+#include "util.h"
+
+/**
+ * packet_add_do() - Add data as packet descriptor to given pool
+ * @p:		Existing pool
+ * @len:	Length of new descriptor
+ * @start:	Start of data
+ * @func:	For tracing: name of calling function, must not be NULL
+ * @line:	For tracing: caller line of function call
+ *
+ * Nothing is added, and a trace message is emitted, if the pool is full, if
+ * the data doesn't lie within the pool buffer, or if length or offset don't
+ * fit the descriptor fields.
+ */
+void packet_add_do(struct pool *p, size_t len, const char *start,
+		   const char *func, const int line)
+{
+	size_t index = p->count;
+
+	if (index >= p->size) {
+		trace("add packet index %zu to pool with size %zu, %s:%i",
+		      index, p->size, func, line);
+		return;
+	}
+
+	if (start < p->buf) {
+		trace("add packet start %p before buffer start %p, %s:%i",
+		      (void *)start, (void *)p->buf, func, line);
+		return;
+	}
+
+	if (start + len > p->buf + p->buf_size) {
+		trace("add packet start %p, length: %zu, buffer end %p, %s:%i",
+		      (void *)start, len, (void *)(p->buf + p->buf_size),
+		      func, line);
+		return;
+	}
+
+	if (len > UINT16_MAX) {
+		trace("add packet length %zu, %s:%i", len, func, line);
+		return;
+	}
+
+#if UINTPTR_MAX == UINT64_MAX
+	/* Descriptor offsets are 32-bit: the distance from the buffer start can
+	 * only exceed that if pointers are wider than 32 bits. Compare at full
+	 * pointer width, a narrowing cast here would make the check a no-op.
+	 */
+	if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {
+		trace("add packet start %p, buffer start %p, %s:%i",
+		      (void *)start, (void *)p->buf, func, line);
+		return;
+	}
+#endif
+
+	p->pkt[index].offset = start - p->buf;
+	p->pkt[index].len = len;
+
+	p->count++;
+}
+
+/**
+ * packet_get_do() - Get data range from packet descriptor from given pool
+ * @p:		Packet pool
+ * @index:	Index of packet descriptor in pool, must be below used count
+ * @offset:	Offset of data range in packet descriptor
+ * @len:	Length of desired data range
+ * @left:	Length of available data after range, set on return, can be NULL
+ * @func:	For tracing: name of calling function, NULL means no trace()
+ * @line:	For tracing: caller line of function call
+ *
+ * Return: pointer to start of data range, NULL on invalid range or descriptor
+ */
+void *packet_get_do(struct pool *p, size_t index, size_t offset, size_t len,
+		    size_t *left, const char *func, const int line)
+{
+	/* Used descriptors are 0 to count - 1: the one at count was not added,
+	 * the one at size is past the end of the descriptor array.
+	 */
+	if (index >= p->size || index >= p->count) {
+		if (func) {
+			trace("packet %zu from pool size: %zu, count: %zu, "
+			      "%s:%i", index, p->size, p->count, func, line);
+		}
+		return NULL;
+	}
+
+	if (len > UINT16_MAX || len + offset > UINT32_MAX) {
+		if (func) {
+			trace("packet data length %zu, offset %zu, %s:%i",
+			      len, offset, func, line);
+		}
+		return NULL;
+	}
+
+	if (p->pkt[index].offset + len + offset > p->buf_size) {
+		if (func) {
+			trace("packet offset plus length %zu from size %zu, "
+			      "%s:%i", p->pkt[index].offset + len + offset,
+			      p->buf_size, func, line);
+		}
+		return NULL;
+	}
+
+	if (len + offset > p->pkt[index].len) {
+		if (func) {
+			/* Descriptor length is 16-bit: widen it explicitly to
+			 * match the conversion specifier
+			 */
+			trace("data length %zu, offset %zu from length %zu, "
+			      "%s:%i", len, offset,
+			      (size_t)p->pkt[index].len, func, line);
+		}
+		return NULL;
+	}
+
+	if (left)
+		*left = p->pkt[index].len - offset - len;
+
+	return p->buf + p->pkt[index].offset + offset;
+}
+
+/**
+ * pool_flush() - Flush a packet pool: reset count of used descriptors to zero
+ * @p: Pointer to packet pool
+ */
+void pool_flush(struct pool *p)
+{
+ p->count = 0;
+}
diff --git a/packet.h b/packet.h
new file mode 100644
index 0000000..0ef8849
--- /dev/null
+++ b/packet.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: AGPL-3.0-or-later
+ * Copyright (c) 2022 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef PACKET_H
+#define PACKET_H
+
+/**
+ * struct desc - Generic offset-based descriptor within buffer
+ * @offset: Offset of described data relative to buffer start, 32-bit limit
+ * @len: Length of described data, host order, 16-bit limit
+ */
+struct desc {
+ uint32_t offset;
+ uint16_t len;
+};
+
+/**
+ * struct pool - Generic pool of packets stored in a buffer
+ * @buf: Buffer storing packet data, descriptors in @pkt refer to it
+ * @buf_size: Total size of buffer
+ * @size: Number of usable descriptors for the pool
+ * @count: Number of used descriptors for the pool
+ * @pkt: Descriptors: see PACKET_POOL_DECL() below for actual array size
+ */
+struct pool {
+ char *buf;
+ size_t buf_size;
+ size_t size;
+ size_t count;
+ struct desc pkt[1];
+};
+
+void packet_add_do(struct pool *p, size_t len, const char *start,
+ const char *func, const int line);
+void *packet_get_do(struct pool *p, size_t index, size_t offset, size_t len,
+ size_t *left, const char *func, const int line);
+void pool_flush(struct pool *p);
+
+/* Wrappers passing the caller's function name and line for tracing, with
+ * packet_get_try() skipping trace messages on failure. No trailing semicolons
+ * in the expansions: callers supply their own, so that these can be used as
+ * expressions and as unbraced if/else bodies.
+ */
+#define packet_add(p, len, start)					\
+	packet_add_do(p, len, start, __func__, __LINE__)
+
+#define packet_get(p, index, offset, len, left)				\
+	packet_get_do(p, index, offset, len, left, __func__, __LINE__)
+
+#define packet_get_try(p, index, offset, len, left)			\
+	packet_get_do(p, index, offset, len, left, NULL, 0)
+
+/* PACKET_POOL_DECL() declares struct <_name>_t, with the same members as
+ * struct pool but room for _size descriptors. _buf is not used in the
+ * expansion.
+ */
+#define PACKET_POOL_DECL(_name, _size, _buf) \
+struct _name ## _t { \
+ char *buf; \
+ size_t buf_size; \
+ size_t size; \
+ size_t count; \
+ struct desc pkt[_size]; \
+}
+
+/* Initialiser setting buffer, buffer size and number of usable descriptors,
+ * members not listed (used count, descriptors) are implicitly zeroed
+ */
+#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \
+{ \
+ .buf_size = _buf_size, \
+ .buf = _buf, \
+ .size = _size, \
+}
+
+/* Declare struct <name>_t and define an initialised variable called name */
+#define PACKET_POOL(name, size, buf, buf_size) \
+ PACKET_POOL_DECL(name, size, buf) name = \
+ PACKET_POOL_INIT_NOCAST(size, buf, buf_size)
+
+/* Compound literal of type struct <name>_t, which must be declared already */
+#define PACKET_INIT(name, size, buf, buf_size) \
+ (struct name ## _t) PACKET_POOL_INIT_NOCAST(size, buf, buf_size)
+
+/* Define <name>_storage without explicit initialiser, plus a static pointer
+ * called name, referring to it as generic struct pool
+ */
+#define PACKET_POOL_NOINIT(name, size, buf) \
+ PACKET_POOL_DECL(name, size, buf) name ## _storage; \
+ static struct pool *name = (struct pool *)&name ## _storage
+
+/* Define initialised <name>_storage, plus a pointer called name, referring to
+ * it as generic struct pool
+ */
+#define PACKET_POOL_P(name, size, buf, buf_size) \
+ PACKET_POOL(name ## _storage, size, buf, buf_size); \
+ struct pool *name = (struct pool *)&name ## _storage
+
+#endif /* PACKET_H */
diff --git a/passt.h b/passt.h
index 9ea8f8d..cd28973 100644
--- a/passt.h
+++ b/passt.h
@@ -28,6 +28,7 @@ struct tap_l4_msg {
union epoll_ref;
+#include "packet.h"
#include "icmp.h"
#include "tcp.h"
#include "udp.h"
diff --git a/tap.c b/tap.c
index 59a87f9..ca2c86a 100644
--- a/tap.c
+++ b/tap.c
@@ -51,10 +51,11 @@
#include "pcap.h"
#include "netlink.h"
#include "pasta.h"
+#include "packet.h"
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
-static struct tap_msg seq4[TAP_MSGS];
-static struct tap_msg seq6[TAP_MSGS];
+static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
+static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
/**
* tap_send() - Send frame, with qemu socket header if needed
@@ -202,6 +203,8 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
}
}
+PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
+
/**
* struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
* @msgs: Count of messages in sequence
@@ -212,8 +215,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
* @daddr: Destination address
* @msg: Array of messages that can be handled in a single call
*/
-static struct tap_l4_seq4 {
- uint16_t msgs;
+static struct tap4_l4_t {
uint8_t protocol;
uint16_t source;
@@ -222,8 +224,8 @@ static struct tap_l4_seq4 {
uint32_t saddr;
uint32_t daddr;
- struct tap_l4_msg msg[UIO_MAXIOV];
-} l4_seq4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
+ struct pool_l4_t p;
+} tap4_l4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
/**
* struct l4_seq6_t - Message sequence for one protocol handler call, IPv6
@@ -235,8 +237,7 @@ static struct tap_l4_seq4 {
* @daddr: Destination address
* @msg: Array of messages that can be handled in a single call
*/
-static struct tap_l4_seq6 {
- uint16_t msgs;
+static struct tap6_l4_t {
uint8_t protocol;
uint16_t source;
@@ -245,8 +246,8 @@ static struct tap_l4_seq6 {
struct in6_addr saddr;
struct in6_addr daddr;
- struct tap_l4_msg msg[UIO_MAXIOV];
-} l4_seq6[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
+ struct pool_l4_t p;
+} tap6_l4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
/**
* tap_packet_debug() - Print debug message for packet(s) from guest/tap
@@ -258,8 +259,8 @@ static struct tap_l4_seq6 {
* @count: Count of packets in this sequence
*/
static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h,
- struct tap_l4_seq4 *seq4, uint8_t proto6,
- struct tap_l4_seq6 *seq6, int count)
+ struct tap4_l4_t *seq4, uint8_t proto6,
+ struct tap6_l4_t *seq6, int count)
{
char buf6s[INET6_ADDRSTRLEN], buf6d[INET6_ADDRSTRLEN];
char buf4s[INET_ADDRSTRLEN], buf4d[INET_ADDRSTRLEN];
@@ -283,14 +284,15 @@ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h,
}
if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) {
- trace("protocol %i from tap: %s:%i -> %s:%i (%i packet%s)",
- proto, seq4 ? buf4s : buf6s,
+ trace("tap: protocol %i, %s%s%s:%i -> %s%s%s:%i (%i packet%s)",
+ proto,
+ seq4 ? "" : "[", seq4 ? buf4s : buf6s, seq4 ? "" : "]",
ntohs(seq4 ? seq4->source : seq6->source),
- seq4 ? buf4d : buf6d,
+ seq4 ? "" : "[", seq4 ? buf4d : buf6d, seq4 ? "" : "]",
ntohs(seq4 ? seq4->dest : seq6->dest),
count, count == 1 ? "" : "s");
} else {
- trace("protocol %i from tap: %s -> %s (%i packet%s)",
+ trace("tap: protocol %i, %s -> %s (%i packet%s)",
proto, iph ? buf4s : buf6s, iph ? buf4d : buf6d,
count, count == 1 ? "" : "s");
}
@@ -299,78 +301,83 @@ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h,
/**
* tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
* @c: Execution context
- * @msg: Array of messages with IPv4 or ARP protocol
- * @count: Count of messages
+ * @in: Ingress packet pool, packets with Ethernet headers
* @now: Current timestamp
*
* Return: count of packets consumed by handlers
*/
-static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count,
- struct timespec *now)
+static int tap4_handler(struct ctx *c, struct pool *in, struct timespec *now)
{
unsigned int i, j, seq_count;
- struct tap_l4_msg *l4_msg;
- struct tap_l4_seq4 *seq;
- size_t len, l4_len;
- struct ethhdr *eh;
- struct iphdr *iph;
- struct udphdr *uh;
- char *l4h;
+ struct tap4_l4_t *seq;
- if (!c->v4)
- return count;
+ if (!c->v4 || !in->count)
+ return in->count;
i = 0;
resume:
- for (seq_count = 0, seq = NULL; i < count; i++) {
- eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset);
- len = msg[i].len;
+ for (seq_count = 0, seq = NULL; i < in->count; i++) {
+ size_t l2_len, l3_len, hlen, l4_len;
+ struct ethhdr *eh;
+ struct iphdr *iph;
+ struct udphdr *uh;
+ char *l4h;
- if (len < sizeof(*eh))
- continue;
+ packet_get(in, i, 0, 0, &l2_len);
- if (ntohs(eh->h_proto) == ETH_P_ARP && arp(c, eh, len))
+ eh = packet_get(in, i, 0, sizeof(*eh), &l3_len);
+ if (!eh)
continue;
+ if (ntohs(eh->h_proto) == ETH_P_ARP) {
+ PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
- if (len < sizeof(*eh) + sizeof(*iph))
+ packet_add(pkt, l2_len, (char *)eh);
+ arp(c, pkt);
continue;
+ }
- iph = (struct iphdr *)(eh + 1);
- if ((size_t)iph->ihl * 4 + sizeof(*eh) > len)
+ iph = packet_get(in, i, sizeof(*eh), sizeof(*iph), NULL);
+ if (!iph)
continue;
- if ((size_t)iph->ihl * 4 < (int)sizeof(*iph))
+
+ hlen = iph->ihl * 4UL;
+ if (hlen < sizeof(*iph) || htons(iph->tot_len) != l3_len ||
+ hlen > l3_len)
continue;
+ l4_len = l3_len - hlen;
+
if (iph->saddr && c->addr4_seen != iph->saddr) {
c->addr4_seen = iph->saddr;
proto_update_l2_buf(NULL, NULL, &c->addr4_seen);
}
- l4h = (char *)iph + (size_t)iph->ihl * 4;
- l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
+ l4h = packet_get(in, i, sizeof(*eh) + hlen, l4_len, NULL);
+ if (!l4h)
+ continue;
if (iph->protocol == IPPROTO_ICMP) {
- struct tap_l4_msg icmp_msg = { l4h - pkt_buf,
- l4_len };
+ PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
- if (l4_len < sizeof(struct icmphdr))
+ if (c->no_icmp)
continue;
- tap_packet_debug(iph, NULL, NULL, 0, NULL, 1);
- if (!c->no_icmp) {
- icmp_tap_handler(c, AF_INET, &iph->daddr,
- &icmp_msg, 1, now);
- }
+ packet_add(pkt, l4_len, l4h);
+ icmp_tap_handler(c, AF_INET, &iph->daddr, pkt, now);
continue;
}
- if (l4_len < sizeof(*uh))
+ uh = packet_get(in, i, sizeof(*eh) + hlen, sizeof(*uh), NULL);
+ if (!uh)
continue;
- uh = (struct udphdr *)l4h;
+ if (iph->protocol == IPPROTO_UDP) {
+ PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
- if (iph->protocol == IPPROTO_UDP && dhcp(c, eh, len))
- continue;
+ packet_add(pkt, l2_len, (char *)eh);
+ if (dhcp(c, pkt))
+ continue;
+ }
if (iph->protocol != IPPROTO_TCP &&
iph->protocol != IPPROTO_UDP) {
@@ -392,147 +399,145 @@ resume:
seq->daddr = iph->daddr; \
} while (0)
- if (seq && L4_MATCH(iph, uh, seq) && seq->msgs < UIO_MAXIOV)
+ if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
goto append;
- for (seq = l4_seq4 + seq_count - 1; seq >= l4_seq4; seq--) {
+ for (seq = tap4_l4 + seq_count - 1; seq >= tap4_l4; seq--) {
if (L4_MATCH(iph, uh, seq)) {
- if (seq->msgs >= UIO_MAXIOV)
+ if (seq->p.count >= UIO_MAXIOV)
seq = NULL;
break;
}
}
- if (!seq || seq < l4_seq4) {
- seq = l4_seq4 + seq_count++;
+ if (!seq || seq < tap4_l4) {
+ seq = tap4_l4 + seq_count++;
L4_SET(iph, uh, seq);
- seq->msgs = 0;
+ pool_flush((struct pool *)&seq->p);
}
#undef L4_MATCH
#undef L4_SET
append:
- l4_msg = &seq->msg[seq->msgs++];
-
- l4_msg->pkt_buf_offset = l4h - pkt_buf;
- l4_msg->l4_len = l4_len;
+ packet_add((struct pool *)&seq->p, l4_len, l4h);
if (seq_count == UIO_MAXIOV)
break; /* Resume after flushing if i < count */
}
- for (j = 0, seq = l4_seq4; j < seq_count; j++, seq++) {
- int n = seq->msgs;
-
- l4_msg = seq->msg;
+ for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) {
+ struct pool *p = (struct pool *)&seq->p;
+ uint32_t *da = &seq->daddr;
+ size_t n = p->count;
tap_packet_debug(NULL, NULL, seq, 0, NULL, n);
if (seq->protocol == IPPROTO_TCP) {
if (c->no_tcp)
continue;
- while ((n -= tcp_tap_handler(c, AF_INET, &seq->daddr,
- l4_msg, n, now)));
+ while ((n -= tcp_tap_handler(c, AF_INET, da, p, now)));
} else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp)
continue;
- while ((n -= udp_tap_handler(c, AF_INET, &seq->daddr,
- l4_msg, n, now)));
+ while ((n -= udp_tap_handler(c, AF_INET, da, p, now)));
}
}
- if (i < count)
+ if (i < in->count)
goto resume;
- return count;
+ return in->count;
}
/**
* tap6_handler() - IPv6 packet handler for tap file descriptor
* @c: Execution context
- * @msg: Array of messages with IPv6 protocol
- * @count: Count of messages
+ * @in: Ingress packet pool, packets with Ethernet headers
* @now: Current timestamp
*
* Return: count of packets consumed by handlers
*/
-static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
- struct timespec *now)
+static int tap6_handler(struct ctx *c, struct pool *in, struct timespec *now)
{
unsigned int i, j, seq_count = 0;
- struct tap_l4_msg *l4_msg;
- struct tap_l4_seq6 *seq;
- struct ipv6hdr *ip6h;
- size_t len, l4_len;
- struct ethhdr *eh;
- struct udphdr *uh;
- uint8_t proto;
- char *l4h;
+ struct tap6_l4_t *seq;
- if (!c->v6)
- return count;
+ if (!c->v6 || !in->count)
+ return in->count;
i = 0;
resume:
- for (seq_count = 0, seq = NULL; i < count; i++) {
- eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset);
- len = msg[i].len;
+ for (seq_count = 0, seq = NULL; i < in->count; i++) {
+ size_t l4_len, plen, check;
+ struct in6_addr *saddr, *daddr;
+ struct ipv6hdr *ip6h;
+ struct ethhdr *eh;
+ struct udphdr *uh;
+ uint8_t proto;
+ char *l4h;
+
+ eh = packet_get(in, i, 0, sizeof(*eh), NULL);
+ if (!eh)
+ continue;
- if (len < sizeof(*eh))
+ ip6h = packet_get(in, i, sizeof(*eh), sizeof(*ip6h), &check);
+ if (!ip6h)
continue;
- if (len < sizeof(*eh) + sizeof(*ip6h))
- return 1;
+ saddr = &ip6h->saddr;
+ daddr = &ip6h->daddr;
- ip6h = (struct ipv6hdr *)(eh + 1);
+ plen = ntohs(ip6h->payload_len);
+ if (plen != check)
+ continue;
+
+ if (!(l4h = ipv6_l4hdr(in, i, sizeof(*eh), &proto, &l4_len)))
+ continue;
- if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) {
- c->addr6_ll_seen = ip6h->saddr;
+ if (IN6_IS_ADDR_LINKLOCAL(saddr)) {
+ c->addr6_ll_seen = *saddr;
if (IN6_IS_ADDR_UNSPECIFIED(&c->addr6_seen)) {
- c->addr6_seen = ip6h->saddr;
+ c->addr6_seen = *saddr;
}
} else {
- c->addr6_seen = ip6h->saddr;
+ c->addr6_seen = *saddr;
}
- if (ntohs(ip6h->payload_len) >
- len - sizeof(*eh) - sizeof(*ip6h))
- continue;
-
- if (!(l4h = ipv6_l4hdr(ip6h, &proto)))
- continue;
-
- l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
-
if (proto == IPPROTO_ICMPV6) {
- struct tap_l4_msg icmpv6_msg = { l4h - pkt_buf,
- l4_len };
+ PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
+
+ if (c->no_icmp)
+ continue;
if (l4_len < sizeof(struct icmp6hdr))
continue;
- if (ndp(c, eh, len))
+ if (ndp(c, (struct icmp6hdr *)l4h, eh->h_source, saddr))
continue;
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
- if (!c->no_icmp) {
- icmp_tap_handler(c, AF_INET6, &ip6h->daddr,
- &icmpv6_msg, 1, now);
- }
+
+ packet_add(pkt, l4_len, l4h);
+ icmp_tap_handler(c, AF_INET6, daddr, pkt, now);
continue;
}
if (l4_len < sizeof(*uh))
continue;
-
uh = (struct udphdr *)l4h;
- if (proto == IPPROTO_UDP && dhcpv6(c, eh, len))
- continue;
+ if (proto == IPPROTO_UDP) {
+ PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf));
+
+ packet_add(pkt, l4_len, l4h);
+
+ if (dhcpv6(c, pkt, saddr, daddr))
+ continue;
+ }
- ip6h->saddr = c->addr6;
+ *saddr = c->addr6;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
@@ -542,73 +547,68 @@ resume:
#define L4_MATCH(ip6h, proto, uh, seq) \
(seq->protocol == proto && \
seq->source == uh->source && seq->dest == uh->dest && \
- IN6_ARE_ADDR_EQUAL(&seq->saddr, &ip6h->saddr) && \
- IN6_ARE_ADDR_EQUAL(&seq->daddr, &ip6h->daddr))
+ IN6_ARE_ADDR_EQUAL(&seq->saddr, saddr) && \
+ IN6_ARE_ADDR_EQUAL(&seq->daddr, daddr))
#define L4_SET(ip6h, proto, uh, seq) \
do { \
seq->protocol = proto; \
seq->source = uh->source; \
seq->dest = uh->dest; \
- seq->saddr = ip6h->saddr; \
- seq->daddr = ip6h->daddr; \
+ seq->saddr = *saddr; \
+ seq->daddr = *daddr; \
} while (0)
if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
- seq->msgs < UIO_MAXIOV)
+ seq->p.count < UIO_MAXIOV)
goto append;
- for (seq = l4_seq6 + seq_count - 1; seq >= l4_seq6; seq--) {
+ for (seq = tap6_l4 + seq_count - 1; seq >= tap6_l4; seq--) {
if (L4_MATCH(ip6h, proto, uh, seq)) {
- if (seq->msgs >= UIO_MAXIOV)
+ if (seq->p.count >= UIO_MAXIOV)
seq = NULL;
break;
}
}
- if (!seq || seq < l4_seq6) {
- seq = l4_seq6 + seq_count++;
+ if (!seq || seq < tap6_l4) {
+ seq = tap6_l4 + seq_count++;
L4_SET(ip6h, proto, uh, seq);
- seq->msgs = 0;
+ pool_flush((struct pool *)&seq->p);
}
#undef L4_MATCH
#undef L4_SET
append:
- l4_msg = &seq->msg[seq->msgs++];
-
- l4_msg->pkt_buf_offset = l4h - pkt_buf;
- l4_msg->l4_len = l4_len;
+ packet_add((struct pool *)&seq->p, l4_len, l4h);
if (seq_count == UIO_MAXIOV)
break; /* Resume after flushing if i < count */
}
- for (j = 0, seq = l4_seq6; j < seq_count; j++, seq++) {
- int n = seq->msgs;
-
- l4_msg = seq->msg;
+ for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) {
+ struct pool *p = (struct pool *)&seq->p;
+ struct in6_addr *da = &seq->daddr;
+ size_t n = p->count;
tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n);
if (seq->protocol == IPPROTO_TCP) {
if (c->no_tcp)
continue;
- while ((n -= tcp_tap_handler(c, AF_INET6, &seq->daddr,
- l4_msg, n, now)));
+ while ((n -= tcp_tap_handler(c, AF_INET6, da, p, now)));
} else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp)
continue;
- while ((n -= udp_tap_handler(c, AF_INET6, &seq->daddr,
- l4_msg, n, now)));
+ while ((n -= udp_tap_handler(c, AF_INET6, da, p, now)));
}
}
- if (i < count)
+ if (i < in->count)
goto resume;
- return count;
+ return in->count;
}
/**
@@ -620,14 +620,16 @@ append:
*/
static int tap_handler_passt(struct ctx *c, struct timespec *now)
{
- int seq4_i, seq6_i;
struct ethhdr *eh;
ssize_t n, rem;
char *p;
redo:
p = pkt_buf;
- seq4_i = seq6_i = rem = 0;
+ rem = 0;
+
+ pool_flush(pool_tap4);
+ pool_flush(pool_tap6);
n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT);
if (n < 0) {
@@ -673,12 +675,10 @@ redo:
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
case ETH_P_IP:
- seq4[seq4_i].pkt_buf_offset = p - pkt_buf;
- seq4[seq4_i++].len = len;
+ packet_add(pool_tap4, len, p);
break;
case ETH_P_IPV6:
- seq6[seq6_i].pkt_buf_offset = p - pkt_buf;
- seq6[seq6_i++].len = len;
+ packet_add(pool_tap6, len, p);
break;
default:
break;
@@ -689,11 +689,8 @@ next:
n -= len;
}
- if (seq4_i)
- tap4_handler(c, seq4, seq4_i, now);
-
- if (seq6_i)
- tap6_handler(c, seq6, seq6_i, now);
+ tap4_handler(c, pool_tap4, now);
+ tap6_handler(c, pool_tap6, now);
/* We can't use EPOLLET otherwise. */
if (rem)
@@ -712,8 +709,10 @@ next:
static int tap_handler_pasta(struct ctx *c, struct timespec *now)
{
ssize_t n = 0, len;
- int ret, seq4_i = 0, seq6_i = 0;
+ int ret;
+ pool_flush(pool_tap4);
+ pool_flush(pool_tap6);
restart:
while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n);
@@ -733,12 +732,10 @@ restart:
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
case ETH_P_IP:
- seq4[seq4_i].pkt_buf_offset = n;
- seq4[seq4_i++].len = len;
+ packet_add(pool_tap4, len, pkt_buf + n);
break;
case ETH_P_IPV6:
- seq6[seq6_i].pkt_buf_offset = n;
- seq6[seq6_i++].len = len;
+ packet_add(pool_tap6, len, pkt_buf + n);
break;
default:
break;
@@ -752,11 +749,8 @@ restart:
ret = errno;
- if (seq4_i)
- tap4_handler(c, seq4, seq4_i, now);
-
- if (seq6_i)
- tap6_handler(c, seq6, seq6_i, now);
+ tap4_handler(c, pool_tap4, now);
+ tap6_handler(c, pool_tap6, now);
if (len > 0 || ret == EAGAIN)
return 0;
@@ -920,6 +914,17 @@ static void tap_sock_tun_init(struct ctx *c)
*/
void tap_sock_init(struct ctx *c)
{
+ size_t sz = sizeof(pkt_buf);
+ int i;
+
+ pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz);
+ pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz);
+
+ for (i = 0; i < UIO_MAXIOV; i++) {
+ tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
+ tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz);
+ }
+
if (c->fd_tap != -1) {
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
close(c->fd_tap);
diff --git a/tcp.c b/tcp.c
index 802d6d8..e03561a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -303,6 +303,9 @@
#define TCP_FRAMES \
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
+#define TCP_FILE_PRESSURE 30 /* % of c->nofile */
+#define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */
+
#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1)
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \
@@ -440,6 +443,7 @@ struct tcp_conn {
#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1)
#define TCP_WS_BITS 4 /* RFC 7323 */
+#define TCP_WS_MAX 14
unsigned int ws_from_tap :TCP_WS_BITS;
unsigned int ws_to_tap :TCP_WS_BITS;
@@ -476,7 +480,6 @@ struct tcp_conn {
uint32_t seq_init_from_tap;
};
-#define CONN_IS_CLOSED(conn) (conn->events == CLOSED)
#define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \
(conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@@ -699,7 +702,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
return EPOLLET;
if (conn_flags & STALLED)
- return EPOLLIN | EPOLLRDHUP | EPOLLET;
+ return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
return EPOLLIN | EPOLLRDHUP;
}
@@ -733,8 +736,11 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
.r.p.tcp.tcp.v6 = CONN_V6(conn) };
struct epoll_event ev = { .data.u64 = ref.u64 };
- if (CONN_IS_CLOSED(conn)) {
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+ if (conn->events == CLOSED) {
+ if (conn->flags & IN_EPOLL)
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+ if (conn->timer != -1)
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
return 0;
}
@@ -745,6 +751,18 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
conn->flags |= IN_EPOLL; /* No need to log this */
+ if (conn->timer != -1) {
+ union epoll_ref ref_t = { .r.proto = IPPROTO_TCP,
+ .r.s = conn->sock,
+ .r.p.tcp.tcp.timer = 1,
+ .r.p.tcp.tcp.index = conn - tc };
+ struct epoll_event ev_t = { .data.u64 = ref_t.u64,
+ .events = EPOLLIN | EPOLLET };
+
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
+ return -errno;
+ }
+
return 0;
}
@@ -759,6 +777,9 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn)
{
struct itimerspec it = { { 0 }, { 0 } };
+ if (conn->events == CLOSED)
+ return;
+
if (conn->timer == -1) {
union epoll_ref ref = { .r.proto = IPPROTO_TCP,
.r.s = conn->sock,
@@ -783,15 +804,11 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn)
}
}
- if (conn->events == CLOSED) {
- it.it_value.tv_nsec = 1;
- } else if (conn->flags & ACK_TO_TAP_DUE) {
+ if (conn->flags & ACK_TO_TAP_DUE) {
it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
} else if (conn->flags & ACK_FROM_TAP_DUE) {
if (!(conn->events & ESTABLISHED))
it.it_value.tv_sec = SYN_TIMEOUT;
- else if (conn->events & TAP_FIN_SENT)
- it.it_value.tv_sec = FIN_TIMEOUT;
else
it.it_value.tv_sec = ACK_TIMEOUT;
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
@@ -834,7 +851,9 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn *conn,
if (flag == STALLED || flag == ~STALLED)
tcp_epoll_ctl(c, conn);
- if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE)
+ if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
+ (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
+ (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE)))
tcp_timer_ctl(c, conn);
}
@@ -888,7 +907,7 @@ static void conn_event_do(struct ctx *c, struct tcp_conn *conn,
else
tcp_epoll_ctl(c, conn);
- if (event == CLOSED || CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
+ if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
tcp_timer_ctl(c, conn);
}
@@ -1182,36 +1201,32 @@ static void tcp_sock6_iov_init(void)
/**
* tcp_opt_get() - Get option, and value if any, from TCP header
- * @th: Pointer to TCP header
- * @len: Length of buffer, including TCP header
+ * @opts: Pointer to start of TCP options in header
+ * @len: Length of buffer, excluding TCP header -- NOT checked here!
* @type_find: Option type to look for
* @optlen_set: Optional, filled with option length if passed
* @value_set: Optional, set to start of option value if passed
*
* Return: option value, meaningful for up to 4 bytes, -1 if not found
*/
-static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find,
+static int tcp_opt_get(char *opts, size_t len, uint8_t type_find,
uint8_t *optlen_set, char **value_set)
{
uint8_t type, optlen;
- char *p;
-
- if (len > (size_t)th->doff * 4)
- len = (size_t)th->doff * 4;
- len -= sizeof(*th);
- p = (char *)(th + 1);
+ if (!len)
+ return -1;
- for (; len >= 2; p += optlen, len -= optlen) {
- switch (*p) {
+ for (; len >= 2; opts += optlen, len -= optlen) {
+ switch (*opts) {
case OPT_EOL:
return -1;
case OPT_NOP:
optlen = 1;
break;
default:
- type = *(p++);
- optlen = *(p++) - 2;
+ type = *(opts++);
+ optlen = *(opts++) - 2;
len -= 2;
if (type != type_find)
@@ -1220,17 +1235,17 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find,
if (optlen_set)
*optlen_set = optlen;
if (value_set)
- *value_set = p;
+ *value_set = opts;
switch (optlen) {
case 0:
return 0;
case 1:
- return *p;
+ return *opts;
case 2:
- return ntohs(*(uint16_t *)p);
+ return ntohs(*(uint16_t *)opts);
default:
- return ntohl(*(uint32_t *)p);
+ return ntohl(*(uint32_t *)opts);
}
}
}
@@ -1415,12 +1430,12 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
if ((hole - tc) == --c->tcp.conn_count) {
debug("TCP: hash table compaction: index %i (%p) was max index",
hole - tc, hole);
+ memset(hole, 0, sizeof(*hole));
return;
}
from = CONN(c->tcp.conn_count);
memcpy(hole, from, sizeof(*hole));
- from->flags = from->events = 0;
to = hole;
tcp_hash_update(from, to);
@@ -1430,25 +1445,23 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
debug("TCP: hash table compaction: old index %i, new index %i, "
"sock %i, from: %p, to: %p",
from - tc, to - tc, from->sock, from, to);
+
+ memset(from, 0, sizeof(*from));
}
/**
- * tcp_conn_destroy() - Close connection, drop from epoll file descriptor
+ * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction
* @c: Execution context
* @conn: Connection pointer
*/
static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn)
{
- if (CONN_IS_CLOSED(conn))
- return;
-
- conn_event(c, conn, CLOSED);
- conn->flags = 0;
close(conn->sock);
+ if (conn->timer != -1)
+ close(conn->timer);
- /* Removal from hash table and connection table compaction deferred to
- * timer.
- */
+ tcp_hash_remove(conn);
+ tcp_table_compact(c, conn);
}
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn);
@@ -1582,9 +1595,23 @@ static void tcp_l2_data_buf_flush(struct ctx *c)
*/
void tcp_defer_handler(struct ctx *c)
{
+ int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE;
+ int max_files = c->nofile / 100 * TCP_FILE_PRESSURE;
+ struct tcp_conn *conn;
+
tcp_l2_flags_buf_flush(c);
tcp_l2_data_buf_flush(c);
+
tcp_splice_defer_handler(c);
+
+ if (c->tcp.conn_count < MIN(max_files, max_conns))
+ return;
+
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
+ if (conn->events == CLOSED)
+ tcp_conn_destroy(c, conn);
+ }
+
}
/**
@@ -1605,13 +1632,19 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn,
size_t ip_len, eth_len;
#define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \
- do { \
- b->th.source = htons(conn->sock_port); \
- b->th.dest = htons(conn->tap_port); \
- b->th.seq = htonl(seq); \
- b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
- b->th.window = htons(MIN(conn->wnd_to_tap, USHRT_MAX)); \
- } while (0)
+do { \
+ b->th.source = htons(conn->sock_port); \
+ b->th.dest = htons(conn->tap_port); \
+ b->th.seq = htonl(seq); \
+ b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
+ if (conn->events & ESTABLISHED) { \
+ b->th.window = htons(conn->wnd_to_tap); \
+ } else { \
+ unsigned wnd = conn->wnd_to_tap << conn->ws_to_tap; \
+ \
+ b->th.window = htons(MIN(wnd, USHRT_MAX)); \
+ } \
+} while (0)
if (CONN_V6(conn)) {
struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
@@ -1692,7 +1725,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
conn->seq_ack_to_tap = prev_ack_to_tap;
#else
if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
- || CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) {
+ || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
conn->seq_ack_to_tap = conn->seq_from_tap;
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
if (!tinfo) {
@@ -1712,17 +1745,19 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
if (!KERNEL_REPORTS_SND_WND(c)) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
- conn->wnd_to_tap = new_wnd_to_tap >> conn->ws_to_tap;
+ conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
+ USHRT_MAX);
goto out;
}
if (!tinfo) {
- if (prev_wnd_to_tap > WINDOW_DEFAULT)
+ if (prev_wnd_to_tap > WINDOW_DEFAULT) {
goto out;
-
+}
tinfo = &tinfo_new;
- if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
+ if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
goto out;
+}
}
#ifdef HAS_SND_WND
@@ -1735,10 +1770,15 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
}
#endif
- conn->wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW) >> conn->ws_to_tap;
+ new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
+ if (!(conn->events & ESTABLISHED))
+ new_wnd_to_tap = MAX(new_wnd_to_tap, WINDOW_DEFAULT);
+
+ conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, USHRT_MAX);
if (!conn->wnd_to_tap)
conn_flag(c, conn, ACK_TO_TAP_DUE);
+
out:
return new_wnd_to_tap != prev_wnd_to_tap ||
conn->seq_ack_to_tap != prev_ack_to_tap;
@@ -1772,10 +1812,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
return 0;
if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return -ECONNRESET;
}
+#ifdef HAS_SND_WND
+ if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
+ c->tcp.kernel_snd_wnd = 1;
+#endif
+
if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo);
@@ -1825,11 +1870,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
data += OPT_MSS_LEN - 2;
th->doff += OPT_MSS_LEN / 4;
-#ifdef HAS_SND_WND
- if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
- c->tcp.kernel_snd_wnd = 1;
-#endif
-
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
*data++ = OPT_NOP;
@@ -1854,10 +1894,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
NULL, conn->seq_to_tap);
iov->iov_len = eth_len + sizeof(uint32_t);
- /* First value is not scaled: scale now */
- if (flags & SYN)
- conn->wnd_to_tap >>= conn->ws_to_tap;
-
if (CONN_V4(conn))
tcp4_l2_flags_buf_bytes += iov->iov_len;
else
@@ -1905,68 +1941,55 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
*/
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn)
{
- if (CONN_IS_CLOSED(conn))
+ if (conn->events == CLOSED)
return;
if (!tcp_send_flag(c, conn, RST))
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
}
/**
- * tcp_clamp_window() - Set window and scaling from option, clamp on socket
+ * tcp_get_tap_ws() - Get Window Scaling option for connection from tap/guest
* @conn: Connection pointer
- * @th: TCP header, from tap, can be NULL if window is passed
- * @len: Buffer length, at L4, can be 0 if no header is passed
- * @window: Window value, host order, unscaled, if no header is passed
- * @init: Set if this is the very first segment from tap
- */
-static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn,
- struct tcphdr *th, int len, unsigned int window,
- int init)
+ * @opts: Pointer to start of TCP options
+ * @optlen: Bytes in options: caller MUST ensure available length
+ */
+static void tcp_get_tap_ws(struct tcp_conn *conn, char *opts, size_t optlen)
{
- if (init && th) {
- int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
+ int ws = tcp_opt_get(opts, optlen, OPT_WS, NULL, NULL);
- conn->ws_from_tap = ws & 0xf;
+ if (ws >= 0 && ws <= TCP_WS_MAX)
+ conn->ws_from_tap = ws;
+ else
+ conn->ws_from_tap = 0;
+}
- /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp
- * yet, to avoid getting a zero scale just because we set a
- * small window now.
- */
- conn->wnd_from_tap = ntohs(th->window);
- } else {
- uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap;
+/**
+ * tcp_clamp_window() - Set new window for connection, clamp on socket
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @wnd: Window value, host order, unscaled
+ */
+static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, unsigned wnd)
+{
+ uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap;
- if (th)
- window = ntohs(th->window) << conn->ws_from_tap;
- else
- window <<= conn->ws_from_tap;
-
- window = MIN(MAX_WINDOW, window);
-
- if (conn->flags & WND_CLAMPED) {
- if (prev_scaled == window)
- return;
-
- /* Discard +/- 1% updates to spare some syscalls. */
- if ((window > prev_scaled &&
- window * 99 / 100 < prev_scaled) ||
- (window < prev_scaled &&
- window * 101 / 100 > prev_scaled)) {
- conn->wnd_from_tap = window >>
- conn->ws_from_tap;
- return;
- }
- }
+ wnd <<= conn->ws_from_tap;
+ wnd = MIN(MAX_WINDOW, wnd);
- if (window < 256)
- window = 256;
+ if (conn->flags & WND_CLAMPED) {
+ if (prev_scaled == wnd)
+ return;
- conn->wnd_from_tap = window >> conn->ws_from_tap;
- setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP,
- &window, sizeof(window));
- conn_flag(c, conn, WND_CLAMPED);
+ /* Discard +/- 1% updates to spare some syscalls. */
+ if ((wnd > prev_scaled && wnd * 99 / 100 < prev_scaled) ||
+ (wnd < prev_scaled && wnd * 101 / 100 > prev_scaled))
+ return;
}
+
+ conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
+ setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &wnd, sizeof(wnd));
+ conn_flag(c, conn, WND_CLAMPED);
}
/**
@@ -2059,18 +2082,18 @@ static int tcp_conn_new_sock(struct ctx *c, sa_family_t af)
* tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest
* @c: Execution context
* @conn: Connection pointer
- * @th: TCP header send by tap/guest
- * @len: L4 packet length, host order
+ * @opts: Pointer to start of TCP options
+ * @optlen: Bytes in options: caller MUST ensure available length
*
* Return: clamped MSS value
*/
static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
- struct tcphdr *th, size_t len)
+ char *opts, size_t optlen)
{
unsigned int mss;
int ret;
- if ((ret = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0)
+ if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
mss = MSS_DEFAULT;
else
mss = ret;
@@ -2091,12 +2114,13 @@ static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to sin_addr or sin6_addr
- * @th: TCP header from tap
- * @len: Packet length at L4
+ * @th: TCP header from tap: caller MUST ensure it's there
+ * @opts: Pointer to start of options
+ * @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
*/
static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
- struct tcphdr *th, size_t len,
+ struct tcphdr *th, char *opts, size_t optlen,
struct timespec *now)
{
struct sockaddr_in addr4 = {
@@ -2142,16 +2166,21 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
conn = CONN(c->tcp.conn_count++);
conn->sock = s;
conn->timer = -1;
- conn->ws_to_tap = conn->ws_from_tap = 0;
conn_event(c, conn, TAP_SYN_RCVD);
conn->wnd_to_tap = WINDOW_DEFAULT;
- mss = tcp_conn_tap_mss(c, conn, th, len);
+ mss = tcp_conn_tap_mss(c, conn, opts, optlen);
setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss));
MSS_SET(conn, mss);
- tcp_clamp_window(c, conn, th, len, 0, 1);
+ tcp_get_tap_ws(conn, opts, optlen);
+
+ /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp yet, to
+ * avoid getting a zero scale just because we set a small window now.
+ */
+ if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
+ conn->wnd_from_tap = 1;
if (af == AF_INET) {
sa = (struct sockaddr *)&addr4;
@@ -2395,53 +2424,52 @@ zero_len:
}
/**
- * tcp_data_from_tap() - tap data for established connection
+ * tcp_data_from_tap() - tap/guest data for established connection
* @c: Execution context
* @conn: Connection pointer
- * @msg: Array of messages from tap
- * @count: Count of messages
+ * @p: Pool of TCP packets, with TCP headers
*
* #syscalls sendmsg
*/
static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
- struct tap_l4_msg *msg, int count)
+ struct pool *p)
{
- int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1;
+ int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
uint32_t max_ack_seq = conn->seq_ack_from_tap;
uint32_t seq_from_tap = conn->seq_from_tap;
struct msghdr mh = { .msg_iov = tcp_iov };
- int partial_send = 0;
- uint16_t len;
+ size_t len;
ssize_t n;
- for (i = 0, iov_i = 0; i < count; i++) {
+ for (i = 0, iov_i = 0; i < (int)p->count; i++) {
uint32_t seq, seq_offset, ack_seq;
struct tcphdr *th;
char *data;
size_t off;
- th = (struct tcphdr *)(pkt_buf + msg[i].pkt_buf_offset);
- len = msg[i].l4_len;
-
- if (len < sizeof(*th)) {
+ packet_get(p, i, 0, 0, &len);
+ th = packet_get(p, i, 0, sizeof(*th), NULL);
+ if (!th) {
tcp_rst(c, conn);
return;
}
- off = (size_t)th->doff * 4;
+ off = th->doff * 4UL;
if (off < sizeof(*th) || off > len) {
tcp_rst(c, conn);
return;
}
if (th->rst) {
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return;
}
len -= off;
- data = (char *)th + off;
+ data = packet_get(p, i, off, len, NULL);
+ if (!data)
+ continue;
seq = ntohl(th->seq);
ack_seq = ntohl(th->ack_seq);
@@ -2511,7 +2539,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
i = keep - 1;
}
- tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0);
+ tcp_clamp_window(c, conn, max_ack_seq_wnd);
if (ack) {
if (max_ack_seq == conn->seq_to_tap) {
@@ -2595,14 +2623,22 @@ out:
* tcp_conn_from_sock_finish() - Complete connection setup after connect()
* @c: Execution context
* @conn: Connection pointer
- * @th: TCP header of SYN, ACK segment from tap/guest
- * @len: Packet length of SYN, ACK segment at L4, host order
+ * @th: TCP header of SYN, ACK segment: caller MUST ensure it's there
+ * @opts: Pointer to start of options
+ * @optlen: Bytes in options: caller MUST ensure available length
*/
static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
- struct tcphdr *th, size_t len)
+ struct tcphdr *th,
+ char *opts, size_t optlen)
{
- tcp_clamp_window(c, conn, th, len, 0, 1);
- MSS_SET(conn, tcp_conn_tap_mss(c, conn, th, len));
+ tcp_clamp_window(c, conn, ntohs(th->window));
+ tcp_get_tap_ws(conn, opts, optlen);
+
+ /* First value is not scaled */
+ if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
+ conn->wnd_from_tap = 1;
+
+ MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
conn->seq_init_from_tap = ntohl(th->seq) + 1;
conn->seq_from_tap = conn->seq_init_from_tap;
@@ -2622,32 +2658,42 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Destination address
- * @msg: Input messages
- * @count: Message count
+ * @p: Pool of TCP packets, with TCP headers
* @now: Current timestamp
*
* Return: count of consumed packets
*/
-int tcp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_l4_msg *msg, int count, struct timespec *now)
+int tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
+ struct timespec *now)
{
- struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset);
- uint16_t len = msg[0].l4_len;
struct tcp_conn *conn;
+ size_t optlen, len;
+ struct tcphdr *th;
int ack_due = 0;
+ char *opts;
+
+ packet_get(p, 0, 0, 0, &len);
+ th = packet_get(p, 0, 0, sizeof(*th), NULL);
+ if (!th)
+ return 1;
+
+ optlen = th->doff * 4UL - sizeof(*th);
+ opts = packet_get(p, 0, sizeof(*th), optlen, NULL);
conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
/* New connection from tap */
if (!conn) {
if (th->syn && !th->ack)
- tcp_conn_from_tap(c, af, addr, th, len, now);
+ tcp_conn_from_tap(c, af, addr, th, opts, optlen, now);
return 1;
}
+ trace("TCP: packet length %lu from tap for index %lu", len, conn - tc);
+
if (th->rst) {
- tcp_conn_destroy(c, conn);
- return count;
+ conn_event(c, conn, CLOSED);
+ return p->count;
}
if (th->ack) {
@@ -2660,7 +2706,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
/* Establishing connection from socket */
if (conn->events & SOCK_ACCEPTED) {
if (th->syn && th->ack && !th->fin)
- tcp_conn_from_sock_finish(c, conn, th, len);
+ tcp_conn_from_sock_finish(c, conn, th, opts, optlen);
else
tcp_rst(c, conn);
@@ -2671,7 +2717,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->events & TAP_SYN_RCVD) {
if (!(conn->events & TAP_SYN_ACK_SENT)) {
tcp_rst(c, conn);
- return count;
+ return p->count;
}
conn_event(c, conn, ESTABLISHED);
@@ -2683,17 +2729,19 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
tcp_send_flag(c, conn, ACK);
conn_event(c, conn, SOCK_FIN_SENT);
- return count;
+ return p->count;
}
if (!th->ack) {
tcp_rst(c, conn);
- return count;
+ return p->count;
}
- tcp_clamp_window(c, conn, th, len, 0, 0);
+ tcp_clamp_window(c, conn, ntohs(th->window));
+
+ tcp_data_from_sock(c, conn);
- if (count == 1)
+ if (p->count == 1)
return 1;
}
@@ -2701,13 +2749,13 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->events & TAP_FIN_RCVD) {
if (conn->events & SOCK_FIN_RCVD &&
conn->seq_ack_from_tap == conn->seq_to_tap)
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return 1;
}
/* Established connections accepting data from tap */
- tcp_data_from_tap(c, conn, msg, count);
+ tcp_data_from_tap(c, conn, p);
if (conn->seq_ack_to_tap != conn->seq_from_tap)
ack_due = 1;
@@ -2721,7 +2769,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (ack_due)
conn_flag(c, conn, ACK_TO_TAP_DUE);
- return count;
+ return p->count;
}
/**
@@ -2872,7 +2920,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
if (!(conn->events & ESTABLISHED)) {
debug("TCP: index %i, handshake timeout", conn - tc);
tcp_rst(c, conn);
- } else if (conn->events & TAP_FIN_SENT) {
+ } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
debug("TCP: index %i, FIN timeout", conn - tc);
tcp_rst(c, conn);
} else if (conn->retrans == TCP_MAX_RETRANS) {
@@ -2884,6 +2932,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
conn->retrans++;
conn->seq_to_tap = conn->seq_ack_from_tap;
tcp_data_from_sock(c, conn);
+ tcp_timer_ctl(c, conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -2933,19 +2982,22 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
if (!(conn = CONN_OR_NULL(ref.r.p.tcp.tcp.index)))
return;
+ if (conn->events == CLOSED)
+ return;
+
if (events & EPOLLERR) {
tcp_rst(c, conn);
return;
}
if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return;
}
if (conn->events & ESTABLISHED) {
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
if (events & (EPOLLRDHUP | EPOLLHUP))
conn_event(c, conn, SOCK_FIN_RCVD);
@@ -3159,7 +3211,7 @@ static int tcp_sock_refill(void *arg)
*
* Return: 0 on success, -1 on failure
*/
-int tcp_sock_init(struct ctx *c, struct timespec *now)
+int tcp_sock_init(struct ctx *c)
{
struct tcp_sock_refill_arg refill_arg = { c, 0 };
int i, port;
@@ -3215,7 +3267,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
- c->tcp.refill_ts = *now;
tcp_sock_refill(&refill_arg);
if (c->mode == MODE_PASTA) {
@@ -3226,7 +3277,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
refill_arg.ns = 1;
NS_CALL(tcp_sock_refill, &refill_arg);
- c->tcp.port_detect_ts = *now;
+ tcp_splice_timer(c);
}
return 0;
@@ -3349,47 +3400,48 @@ static int tcp_port_rebind(void *arg)
}
/**
- * tcp_timer() - Scan activity bitmap for sockets waiting for timed events
+ * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill
* @c: Execution context
- * @now: Timestamp from caller
+ * @ts: Unused
*/
-void tcp_timer(struct ctx *c, struct timespec *now)
+void tcp_timer(struct ctx *c, struct timespec *ts)
{
struct tcp_sock_refill_arg refill_arg = { c, 0 };
+ struct tcp_conn *conn;
- if (c->mode == MODE_PASTA) {
- if (timespec_diff_ms(now, &c->tcp.port_detect_ts) >
- PORT_DETECT_INTERVAL) {
- struct tcp_port_detect_arg detect_arg = { c, 0 };
- struct tcp_port_rebind_arg rebind_arg = { c, 0 };
-
- if (c->tcp.init_detect_ports) {
- detect_arg.detect_in_ns = 0;
- tcp_port_detect(&detect_arg);
- rebind_arg.bind_in_ns = 1;
- NS_CALL(tcp_port_rebind, &rebind_arg);
- }
+ (void)ts;
- if (c->tcp.ns_detect_ports) {
- detect_arg.detect_in_ns = 1;
- NS_CALL(tcp_port_detect, &detect_arg);
- rebind_arg.bind_in_ns = 0;
- tcp_port_rebind(&rebind_arg);
- }
+ if (c->mode == MODE_PASTA) {
+ struct tcp_port_detect_arg detect_arg = { c, 0 };
+ struct tcp_port_rebind_arg rebind_arg = { c, 0 };
+
+ if (c->tcp.init_detect_ports) {
+ detect_arg.detect_in_ns = 0;
+ tcp_port_detect(&detect_arg);
+ rebind_arg.bind_in_ns = 1;
+ NS_CALL(tcp_port_rebind, &rebind_arg);
+ }
- c->tcp.port_detect_ts = *now;
+ if (c->tcp.ns_detect_ports) {
+ detect_arg.detect_in_ns = 1;
+ NS_CALL(tcp_port_detect, &detect_arg);
+ rebind_arg.bind_in_ns = 0;
+ tcp_port_rebind(&rebind_arg);
}
+ }
- tcp_splice_timer(c, now);
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
+ if (conn->events == CLOSED)
+ tcp_conn_destroy(c, conn);
}
- if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) {
- tcp_sock_refill(&refill_arg);
- if (c->mode == MODE_PASTA) {
- refill_arg.ns = 1;
- if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
- (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
- NS_CALL(tcp_sock_refill, &refill_arg);
- }
+ tcp_sock_refill(&refill_arg);
+ if (c->mode == MODE_PASTA) {
+ refill_arg.ns = 1;
+ if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
+ (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
+ NS_CALL(tcp_sock_refill, &refill_arg);
+
+ tcp_splice_timer(c);
}
}
diff --git a/tcp.h b/tcp.h
index 109516d..cf52f32 100644
--- a/tcp.h
+++ b/tcp.h
@@ -6,9 +6,7 @@
#ifndef TCP_H
#define TCP_H
-#define REFILL_INTERVAL 1000 /* ms */
-#define PORT_DETECT_INTERVAL 1000
-#define TCP_TIMER_INTERVAL MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL)
+#define TCP_TIMER_INTERVAL 1000 /* ms */
#define TCP_CONN_INDEX_BITS 17 /* 128k */
#define TCP_MAX_CONNS (1 << TCP_CONN_INDEX_BITS)
@@ -20,10 +18,10 @@ struct ctx;
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
-int tcp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_l4_msg *msg, int count, struct timespec *now);
-int tcp_sock_init(struct ctx *c, struct timespec *now);
-void tcp_timer(struct ctx *c, struct timespec *now);
+int tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
+ struct timespec *now);
+int tcp_sock_init(struct ctx *c);
+void tcp_timer(struct ctx *c, struct timespec *ts);
void tcp_defer_handler(struct ctx *c);
void tcp_sock_set_bufsize(struct ctx *c, int s);
@@ -64,8 +62,6 @@ union tcp_epoll_ref {
* @timer_run: Timestamp of most recent timer run
* @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
* @pipe_size: Size of pipes for spliced connections
- * @refill_ts: Time of last refill operation for pools of sockets/pipes
- * @port_detect_ts: Time of last TCP port detection/rebind, if enabled
*/
struct tcp_ctx {
uint64_t hash_secret[2];
@@ -80,8 +76,6 @@ struct tcp_ctx {
int kernel_snd_wnd;
#endif
size_t pipe_size;
- struct timespec refill_ts;
- struct timespec port_detect_ts;
};
#endif /* TCP_H */
diff --git a/tcp_splice.c b/tcp_splice.c
index bcafd33..0095740 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -51,7 +51,7 @@
#define MAX_PIPE_SIZE (2UL * 1024 * 1024)
#define TCP_SPLICE_MAX_CONNS (128 * 1024)
#define TCP_SPLICE_PIPE_POOL_SIZE 16
-#define REFILL_INTERVAL 1000 /* ms, refill pool of pipes */
+#define TCP_SPLICE_CONN_PRESSURE 30 /* % of c->tcp.conn_count */
#define TCP_SPLICE_FILE_PRESSURE 30 /* % of c->nofile */
/* From tcp.c */
@@ -83,24 +83,24 @@ struct tcp_splice_conn {
int pipe_b_a[2];
uint8_t events;
-#define SPLICE_CLOSED 0
-#define SPLICE_CONNECT BIT(0)
-#define SPLICE_ESTABLISHED BIT(1)
-#define SPLICE_A_OUT_WAIT BIT(2)
-#define SPLICE_B_OUT_WAIT BIT(3)
-#define SPLICE_A_FIN_RCVD BIT(4)
-#define SPLICE_B_FIN_RCVD BIT(5)
-#define SPLICE_A_FIN_SENT BIT(6)
-#define SPLICE_B_FIN_SENT BIT(7)
+#define CLOSED 0
+#define CONNECT BIT(0)
+#define ESTABLISHED BIT(1)
+#define A_OUT_WAIT BIT(2)
+#define B_OUT_WAIT BIT(3)
+#define A_FIN_RCVD BIT(4)
+#define B_FIN_RCVD BIT(5)
+#define A_FIN_SENT BIT(6)
+#define B_FIN_SENT BIT(7)
uint8_t flags;
-#define SPLICE_V6 BIT(0)
-#define SPLICE_IN_EPOLL BIT(1)
-#define SPLICE_RCVLOWAT_SET_A BIT(2)
-#define SPLICE_RCVLOWAT_SET_B BIT(3)
-#define SPLICE_RCVLOWAT_ACT_A BIT(4)
-#define SPLICE_RCVLOWAT_ACT_B BIT(5)
-#define SPLICE_CLOSING BIT(6)
+#define SOCK_V6 BIT(0)
+#define IN_EPOLL BIT(1)
+#define RCVLOWAT_SET_A BIT(2)
+#define RCVLOWAT_SET_B BIT(3)
+#define RCVLOWAT_ACT_A BIT(4)
+#define RCVLOWAT_ACT_B BIT(5)
+#define CLOSING BIT(6)
uint64_t a_read;
uint64_t a_written;
@@ -108,7 +108,7 @@ struct tcp_splice_conn {
uint64_t b_written;
};
-#define CONN_V6(x) (x->flags & SPLICE_V6)
+#define CONN_V6(x) (x->flags & SOCK_V6)
#define CONN_V4(x) (!CONN_V6(x))
#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
#define CONN(index) (tc + (index))
@@ -118,15 +118,13 @@ static struct tcp_splice_conn tc[TCP_SPLICE_MAX_CONNS];
/* Display strings for connection events */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
- "SPLICE_CONNECT", "SPLICE_ESTABLISHED",
- "SPLICE_A_OUT_WAIT", "SPLICE_B_OUT_WAIT",
- "SPLICE_A_FIN_RCVD", "SPLICE_B_FIN_RCVD",
- "SPLICE_A_FIN_SENT", "SPLICE_B_FIN_SENT",
+ "CONNECT", "ESTABLISHED", "A_OUT_WAIT", "B_OUT_WAIT",
+ "A_FIN_RCVD", "B_FIN_RCVD", "A_FIN_SENT", "B_FIN_SENT",
};
/* Display strings for connection flags */
static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
- "V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B",
+ "SOCK_V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B",
"RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING",
};
@@ -141,23 +139,27 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
{
*a = *b = 0;
- if (events & SPLICE_CLOSED)
+ if (events & CLOSED)
return;
- if (events & SPLICE_ESTABLISHED)
- *a = *b = EPOLLIN | EPOLLRDHUP;
- else if (events & SPLICE_CONNECT)
+ if (events & ESTABLISHED) {
+ if (!(events & B_FIN_SENT))
+ *a = EPOLLIN | EPOLLRDHUP;
+ if (!(events & A_FIN_SENT))
+ *b = EPOLLIN | EPOLLRDHUP;
+ } else if (events & CONNECT) {
*b = EPOLLOUT;
+ }
- *a |= (events & SPLICE_A_OUT_WAIT) ? EPOLLOUT : 0;
- *b |= (events & SPLICE_B_OUT_WAIT) ? EPOLLOUT : 0;
+ *a |= (events & A_OUT_WAIT) ? EPOLLOUT : 0;
+ *b |= (events & B_OUT_WAIT) ? EPOLLOUT : 0;
}
static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn);
static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn);
/**
- * conn_flag_do() - Set/unset given flag, log, update epoll on SPLICE_CLOSING
+ * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag
* @c: Execution context
* @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset
@@ -181,7 +183,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn,
tcp_splice_flag_str[fls(flag)]);
}
- if (flag == SPLICE_CLOSING)
+ if (flag == CLOSING)
tcp_splice_epoll_ctl(c, conn);
}
@@ -201,7 +203,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn,
*/
static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn)
{
- int m = (conn->flags & SPLICE_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
.r.p.tcp.tcp.splice = 1,
.r.p.tcp.tcp.index = conn - tc,
@@ -214,15 +216,8 @@ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn)
struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
uint32_t events_a, events_b;
- if (conn->flags & SPLICE_CLOSING) {
- if (conn->flags & SPLICE_IN_EPOLL)
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a);
-
- if (conn->events & SPLICE_CONNECT)
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b);
-
- return 0;
- }
+ if (conn->flags & CLOSING)
+ goto delete;
tcp_splice_conn_epoll_events(conn->events, &events_a, &events_b);
ev_a.events = events_a;
@@ -230,13 +225,13 @@ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn)
if (epoll_ctl(c->epollfd, m, conn->a, &ev_a) ||
epoll_ctl(c->epollfd, m, conn->b, &ev_b))
- goto err;
+ goto delete;
- conn->flags |= SPLICE_IN_EPOLL; /* No need to log this */
+ conn->flags |= IN_EPOLL; /* No need to log this */
return 0;
-err:
+delete:
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a);
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b);
return -errno;
@@ -251,12 +246,6 @@ err:
static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn,
unsigned long event)
{
- if (event == SPLICE_CLOSED) {
- conn->events = SPLICE_CLOSED;
- debug("TCP (spliced): index %i, CLOSED", conn - tc);
- return;
- }
-
if (event & (event - 1)) {
if (!(conn->events & ~event))
return;
@@ -274,7 +263,7 @@ static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn,
}
if (tcp_splice_epoll_ctl(c, conn))
- conn_flag(c, conn, SPLICE_CLOSING);
+ conn_flag(c, conn, CLOSING);
}
#define conn_event(c, conn, event) \
@@ -304,22 +293,25 @@ static void tcp_table_splice_compact(struct ctx *c,
memcpy(hole, move, sizeof(*hole));
move->a = move->b = -1;
- move->flags = move->events = 0;
move->a_read = move->a_written = move->b_read = move->b_written = 0;
+ move->pipe_a_b[0] = move->pipe_a_b[1] = -1;
+ move->pipe_b_a[0] = move->pipe_b_a[1] = -1;
+ move->flags = move->events = 0;
debug("TCP (spliced): index %i moved to %i", move - tc, hole - tc);
+ tcp_splice_epoll_ctl(c, hole);
if (tcp_splice_epoll_ctl(c, hole))
- conn_flag(c, hole, SPLICE_CLOSING);
+ conn_flag(c, hole, CLOSING);
}
/**
- * tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll
+ * tcp_splice_destroy() - Close spliced connection and pipes, clear entry
* @c: Execution context
* @conn: Connection pointer
*/
static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
{
- if (conn->events & SPLICE_ESTABLISHED) {
+ if (conn->events & ESTABLISHED) {
/* Flushing might need to block: don't recycle them. */
if (conn->pipe_a_b[0] != -1) {
close(conn->pipe_a_b[0]);
@@ -333,18 +325,19 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
}
}
- if (conn->events & SPLICE_CONNECT) {
+ if (conn->events & CONNECT) {
close(conn->b);
conn->b = -1;
}
- conn_event(c, conn, SPLICE_CLOSED);
-
close(conn->a);
conn->a = -1;
- conn->flags = 0;
conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0;
+ conn->events = CLOSED;
+ conn->flags = 0;
+ debug("TCP (spliced): index %i, CLOSED", conn - tc);
+
tcp_table_splice_compact(c, conn);
}
@@ -364,7 +357,7 @@ static int tcp_splice_connect_finish(struct ctx *c,
conn->pipe_a_b[1] = conn->pipe_b_a[1] = -1;
for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
- if (splice_pipe_pool[i][0][0] > 0) {
+ if (splice_pipe_pool[i][0][0] >= 0) {
SWAP(conn->pipe_a_b[0], splice_pipe_pool[i][0][0]);
SWAP(conn->pipe_a_b[1], splice_pipe_pool[i][0][1]);
@@ -377,7 +370,7 @@ static int tcp_splice_connect_finish(struct ctx *c,
if (conn->pipe_a_b[0] < 0) {
if (pipe2(conn->pipe_a_b, O_NONBLOCK) ||
pipe2(conn->pipe_b_a, O_NONBLOCK)) {
- conn_flag(c, conn, SPLICE_CLOSING);
+ conn_flag(c, conn, CLOSING);
return -EIO;
}
@@ -385,8 +378,8 @@ static int tcp_splice_connect_finish(struct ctx *c,
fcntl(conn->pipe_b_a[0], F_SETPIPE_SZ, c->tcp.pipe_size);
}
- if (!(conn->events & SPLICE_ESTABLISHED))
- conn_event(c, conn, SPLICE_ESTABLISHED);
+ if (!(conn->events & ESTABLISHED))
+ conn_event(c, conn, ESTABLISHED);
return 0;
}
@@ -450,9 +443,9 @@ static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn,
close(sock_conn);
return ret;
}
- conn_event(c, conn, SPLICE_CONNECT);
+ conn_event(c, conn, CONNECT);
} else {
- conn_event(c, conn, SPLICE_ESTABLISHED);
+ conn_event(c, conn, ESTABLISHED);
return tcp_splice_connect_finish(c, conn);
}
@@ -575,20 +568,23 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
conn = CONN(c->tcp.splice_conn_count++);
conn->a = s;
- conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0;
+ conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0;
if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index))
- conn_flag(c, conn, SPLICE_CLOSING);
+ conn_flag(c, conn, CLOSING);
return;
}
conn = CONN(ref.r.p.tcp.tcp.index);
- if (events & EPOLLERR || events & EPOLLHUP)
+ if (conn->events == CLOSED)
+ return;
+
+ if (events & EPOLLERR)
goto close;
- if (conn->events == SPLICE_CONNECT) {
+ if (conn->events == CONNECT) {
if (!(events & EPOLLOUT))
goto close;
if (tcp_splice_connect_finish(c, conn))
@@ -597,9 +593,9 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
if (events & EPOLLOUT) {
if (ref.r.s == conn->a)
- conn_event(c, conn, ~SPLICE_A_OUT_WAIT);
+ conn_event(c, conn, ~A_OUT_WAIT);
else
- conn_event(c, conn, ~SPLICE_B_OUT_WAIT);
+ conn_event(c, conn, ~B_OUT_WAIT);
tcp_splice_dir(conn, ref.r.s, 1, &from, &to, &pipes);
} else {
@@ -608,9 +604,16 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
if (events & EPOLLRDHUP) {
if (ref.r.s == conn->a)
- conn_event(c, conn, SPLICE_A_FIN_RCVD);
+ conn_event(c, conn, A_FIN_RCVD);
+ else
+ conn_event(c, conn, B_FIN_RCVD);
+ }
+
+ if (events & EPOLLHUP) {
+ if (ref.r.s == conn->a)
+ conn_event(c, conn, A_FIN_SENT); /* Fake, but implied */
else
- conn_event(c, conn, SPLICE_B_FIN_RCVD);
+ conn_event(c, conn, B_FIN_SENT);
}
swap:
@@ -620,13 +623,13 @@ swap:
if (from == conn->a) {
seq_read = &conn->a_read;
seq_write = &conn->a_written;
- lowat_set_flag = SPLICE_RCVLOWAT_SET_A;
- lowat_act_flag = SPLICE_RCVLOWAT_ACT_A;
+ lowat_set_flag = RCVLOWAT_SET_A;
+ lowat_act_flag = RCVLOWAT_ACT_A;
} else {
seq_read = &conn->b_read;
seq_write = &conn->b_written;
- lowat_set_flag = SPLICE_RCVLOWAT_SET_B;
- lowat_act_flag = SPLICE_RCVLOWAT_ACT_B;
+ lowat_set_flag = RCVLOWAT_SET_B;
+ lowat_act_flag = RCVLOWAT_ACT_B;
}
while (1) {
@@ -636,6 +639,7 @@ swap:
retry:
readlen = splice(from, NULL, pipes[1], NULL, c->tcp.pipe_size,
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+ trace("TCP (spliced): %li from read-side call", readlen);
if (readlen < 0) {
if (errno == EINTR)
goto retry;
@@ -660,6 +664,8 @@ retry:
eintr:
written = splice(pipes[0], NULL, to, NULL, to_write,
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+ trace("TCP (spliced): %li from write-side call (passed %lu)",
+ written, to_write);
/* Most common case: skip updating counters. */
if (readlen > 0 && readlen == written) {
@@ -697,9 +703,9 @@ eintr:
goto retry;
if (to == conn->a)
- conn_event(c, conn, SPLICE_A_OUT_WAIT);
+ conn_event(c, conn, A_OUT_WAIT);
else
- conn_event(c, conn, SPLICE_B_OUT_WAIT);
+ conn_event(c, conn, B_OUT_WAIT);
break;
}
@@ -715,23 +721,21 @@ eintr:
break;
}
- if ( (conn->events & SPLICE_A_FIN_RCVD) &&
- !(conn->events & SPLICE_B_FIN_SENT)) {
- if (*seq_read == *seq_write) {
+ if ((conn->events & A_FIN_RCVD) && !(conn->events & B_FIN_SENT)) {
+ if (*seq_read == *seq_write && eof) {
shutdown(conn->b, SHUT_WR);
- conn_event(c, conn, SPLICE_B_FIN_SENT);
+ conn_event(c, conn, B_FIN_SENT);
}
}
- if ( (conn->events & SPLICE_B_FIN_RCVD) &&
- !(conn->events & SPLICE_A_FIN_SENT)) {
- if (*seq_read == *seq_write) {
+ if ((conn->events & B_FIN_RCVD) && !(conn->events & A_FIN_SENT)) {
+ if (*seq_read == *seq_write && eof) {
shutdown(conn->a, SHUT_WR);
- conn_event(c, conn, SPLICE_A_FIN_SENT);
+ conn_event(c, conn, A_FIN_SENT);
}
}
- if (CONN_HAS(conn, SPLICE_A_FIN_SENT | SPLICE_B_FIN_SENT))
+ if (CONN_HAS(conn, A_FIN_SENT | B_FIN_SENT))
goto close;
if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
@@ -746,10 +750,13 @@ eintr:
goto swap;
}
+ if (events & EPOLLHUP)
+ goto close;
+
return;
close:
- conn_flag(c, conn, SPLICE_CLOSING);
+ conn_flag(c, conn, CLOSING);
}
/**
@@ -829,38 +836,36 @@ void tcp_splice_init(struct ctx *c)
/**
* tcp_splice_timer() - Timer for spliced connections
* @c: Execution context
- * @now: Current timestamp
*/
-void tcp_splice_timer(struct ctx *c, struct timespec *now)
+void tcp_splice_timer(struct ctx *c)
{
struct tcp_splice_conn *conn;
for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) {
- if (conn->flags & SPLICE_CLOSING) {
+ if (conn->flags & CLOSING) {
tcp_splice_destroy(c, conn);
- continue;
+ return;
}
- if ( (conn->flags & SPLICE_RCVLOWAT_SET_A) &&
- !(conn->flags & SPLICE_RCVLOWAT_ACT_A)) {
+ if ( (conn->flags & RCVLOWAT_SET_A) &&
+ !(conn->flags & RCVLOWAT_ACT_A)) {
setsockopt(conn->a, SOL_SOCKET, SO_RCVLOWAT,
&((int){ 1 }), sizeof(int));
- conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_A);
+ conn_flag(c, conn, ~RCVLOWAT_SET_A);
}
- if ( (conn->flags & SPLICE_RCVLOWAT_SET_B) &&
- !(conn->flags & SPLICE_RCVLOWAT_ACT_B)) {
+ if ( (conn->flags & RCVLOWAT_SET_B) &&
+ !(conn->flags & RCVLOWAT_ACT_B)) {
setsockopt(conn->b, SOL_SOCKET, SO_RCVLOWAT,
&((int){ 1 }), sizeof(int));
- conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_B);
+ conn_flag(c, conn, ~RCVLOWAT_SET_B);
}
- conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_A);
- conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_B);
+ conn_flag(c, conn, ~RCVLOWAT_ACT_A);
+ conn_flag(c, conn, ~RCVLOWAT_ACT_B);
}
- if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL)
- tcp_splice_pipe_refill(c);
+ tcp_splice_pipe_refill(c);
}
/**
@@ -869,14 +874,15 @@ void tcp_splice_timer(struct ctx *c, struct timespec *now)
*/
void tcp_splice_defer_handler(struct ctx *c)
{
+ int max_conns = c->tcp.conn_count / 100 * TCP_SPLICE_CONN_PRESSURE;
int max_files = c->nofile / 100 * TCP_SPLICE_FILE_PRESSURE;
struct tcp_splice_conn *conn;
- if (c->tcp.splice_conn_count * 6 < max_files)
+ if (c->tcp.splice_conn_count < MIN(max_files / 6, max_conns))
return;
for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) {
- if (conn->flags & SPLICE_CLOSING)
+ if (conn->flags & CLOSING)
tcp_splice_destroy(c, conn);
}
}
diff --git a/tcp_splice.h b/tcp_splice.h
index b744ba7..f7c2f86 100644
--- a/tcp_splice.h
+++ b/tcp_splice.h
@@ -11,5 +11,5 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
uint32_t events);
void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn);
void tcp_splice_init(struct ctx *c);
-void tcp_splice_timer(struct ctx *c, struct timespec *now);
+void tcp_splice_timer(struct ctx *c);
void tcp_splice_defer_handler(struct ctx *c);
diff --git a/udp.c b/udp.c
index e22f3ac..9032e47 100644
--- a/udp.c
+++ b/udp.c
@@ -951,35 +951,35 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Destination address
- * @msg: Input messages
- * @count: Message count
+ * @p: Pool of UDP packets, with UDP headers
* @now: Current timestamp
*
* Return: count of consumed packets
*
* #syscalls sendmmsg
*/
-int udp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_l4_msg *msg, int count, struct timespec *now)
+int udp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
+ struct timespec *now)
{
- /* The caller already checks that all the messages have the same source
- * and destination, so we can just take those from the first message.
- */
- struct udphdr *uh = (struct udphdr *)(pkt_buf + msg[0].pkt_buf_offset);
struct mmsghdr mm[UIO_MAXIOV] = { 0 };
struct iovec m[UIO_MAXIOV];
struct sockaddr_in6 s_in6;
struct sockaddr_in s_in;
struct sockaddr *sa;
+ int i, s, count = 0;
in_port_t src, dst;
+ struct udphdr *uh;
socklen_t sl;
- int i, s;
(void)c;
- if (msg[0].l4_len < sizeof(*uh))
+ uh = packet_get(p, 0, 0, sizeof(*uh), NULL);
+ if (!uh)
return 1;
+ /* The caller already checks that all the messages have the same source
+ * and destination, so we can just take those from the first message.
+ */
src = ntohs(uh->source);
dst = ntohs(uh->dest);
@@ -998,8 +998,8 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
.udp.port = src };
s = sock_l4(c, AF_INET, IPPROTO_UDP, src, 0, uref.u32);
- if (s <= 0)
- return count;
+ if (s < 0)
+ return p->count;
udp_tap_map[V4][src].sock = s;
bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
@@ -1050,8 +1050,8 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
s = sock_l4(c, AF_INET6, IPPROTO_UDP, src, bind_to,
uref.u32);
- if (s <= 0)
- return count;
+ if (s < 0)
+ return p->count;
udp_tap_map[V6][src].sock = s;
bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
@@ -1060,18 +1060,26 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
udp_tap_map[V6][src].ts = now->tv_sec;
}
- for (i = 0; i < count; i++) {
+ for (i = 0; i < (int)p->count; i++) {
struct udphdr *uh_send;
+ size_t len;
+
+ uh_send = packet_get(p, i, 0, sizeof(*uh), &len);
+ if (!uh_send)
+ return p->count;
+ if (!len)
+ continue;
- uh_send = (struct udphdr *)(msg[i].pkt_buf_offset + pkt_buf);
m[i].iov_base = (char *)(uh_send + 1);
- m[i].iov_len = msg[i].l4_len - sizeof(*uh_send);
+ m[i].iov_len = len;
mm[i].msg_hdr.msg_name = sa;
mm[i].msg_hdr.msg_namelen = sl;
mm[i].msg_hdr.msg_iov = m + i;
mm[i].msg_hdr.msg_iovlen = 1;
+
+ count++;
}
count = sendmmsg(s, mm, count, MSG_NOSIGNAL);
@@ -1172,13 +1180,11 @@ static void udp_splice_iov_init(void)
*
* Return: 0 on success, -1 on failure
*/
-int udp_sock_init(struct ctx *c, struct timespec *now)
+int udp_sock_init(struct ctx *c)
{
union udp_epoll_ref uref = { .udp.bound = 1 };
int dst, s;
- (void)now;
-
for (dst = 0; dst < USHRT_MAX; dst++) {
if (!bitmap_isset(c->udp.port_to_tap, dst))
continue;
diff --git a/udp.h b/udp.h
index 2c9066b..ce40b07 100644
--- a/udp.h
+++ b/udp.h
@@ -10,9 +10,9 @@
void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
-int udp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_l4_msg *msg, int count, struct timespec *now);
-int udp_sock_init(struct ctx *c, struct timespec *now);
+int udp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
+ struct timespec *now);
+int udp_sock_init(struct ctx *c);
void udp_timer(struct ctx *c, struct timespec *ts);
void udp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
const uint32_t *ip_da);
diff --git a/util.c b/util.c
index 0adc6b9..f16cd61 100644
--- a/util.c
+++ b/util.c
@@ -38,6 +38,7 @@
#include "util.h"
#include "passt.h"
+#include "packet.h"
/* For __openlog() and __setlogmask() wrappers, and passt_vsyslog() */
static int log_mask;
@@ -156,46 +157,59 @@ void passt_vsyslog(int pri, const char *format, va_list ap)
send(log_sock, buf, n, 0);
}
+#define IPV6_NH_OPT(nh) \
+ ((nh) == 0 || (nh) == 43 || (nh) == 44 || (nh) == 50 || \
+ (nh) == 51 || (nh) == 60 || (nh) == 135 || (nh) == 139 || \
+ (nh) == 140 || (nh) == 253 || (nh) == 254)
+
/**
* ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol
- * @ip6h: IPv6 header
+ * @p: Packet pool, packet number @index has IPv6 header at @offset
+ * @index: Index of packet in pool
+ * @offset: Pre-calculated IPv6 header offset
* @proto: Filled with L4 protocol number
+ * @dlen: Data length (payload excluding header extensions), set on return
*
* Return: pointer to L4 header, NULL if not found
*/
-char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto)
+char *ipv6_l4hdr(struct pool *p, int index, size_t offset, uint8_t *proto,
+ size_t *dlen)
{
- int offset, len, hdrlen;
struct ipv6_opt_hdr *o;
+ struct ipv6hdr *ip6h;
+ char *base;
+ int hdrlen;
uint8_t nh;
- len = ntohs(ip6h->payload_len);
- offset = 0;
+ base = packet_get(p, index, 0, 0, NULL);
+ ip6h = packet_get(p, index, offset, sizeof(*ip6h), dlen);
+ if (!ip6h)
+ return NULL;
- while (offset < len) {
- if (!offset) {
- nh = ip6h->nexthdr;
- hdrlen = sizeof(struct ipv6hdr);
- } else {
- o = (struct ipv6_opt_hdr *)(((char *)ip6h) + offset);
- nh = o->nexthdr;
- hdrlen = (o->hdrlen + 1) * 8;
- }
+ offset += sizeof(*ip6h);
- if (nh == 59)
- return NULL;
+ nh = ip6h->nexthdr;
+ if (!IPV6_NH_OPT(nh))
+ goto found;
+
+ while ((o = packet_get_try(p, index, offset, sizeof(*o), dlen))) {
+ nh = o->nexthdr;
+ hdrlen = (o->hdrlen + 1) * 8;
- if (nh == 0 || nh == 43 || nh == 44 || nh == 50 ||
- nh == 51 || nh == 60 || nh == 135 || nh == 139 ||
- nh == 140 || nh == 253 || nh == 254) {
+ if (IPV6_NH_OPT(nh))
offset += hdrlen;
- } else {
- *proto = nh;
- return (char *)(ip6h + 1) + offset;
- }
+ else
+ goto found;
}
return NULL;
+
+found:
+ if (nh == 59)
+ return NULL;
+
+ *proto = nh;
+ return base + offset;
}
/**
diff --git a/util.h b/util.h
index 3073f58..073a913 100644
--- a/util.h
+++ b/util.h
@@ -153,6 +153,8 @@ enum {
#include <limits.h>
#include <stdarg.h>
+#include "packet.h"
+
enum bind_type {
BIND_ANY = 0,
BIND_LOOPBACK,
@@ -194,7 +196,8 @@ __attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); }
void __openlog(const char *ident, int option, int facility);
void passt_vsyslog(int pri, const char *format, va_list ap);
void __setlogmask(int mask);
-char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto);
+char *ipv6_l4hdr(struct pool *p, int index, size_t offset, uint8_t *proto,
+ size_t *dlen);
int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port,
enum bind_type bind_addr, uint32_t data);
void sock_probe_mem(struct ctx *c);