about git code bugs lists chat
diff options
context:
space:
mode:
-rw-r--r-- icmp.c 20
-rw-r--r-- icmp.h 2
-rw-r--r-- passt.h 18
-rw-r--r-- tap.c 618
-rw-r--r-- tcp.c 26
-rw-r--r-- udp.c 13
-rw-r--r-- udp.h 4
7 files changed, 435 insertions, 266 deletions
diff --git a/icmp.c b/icmp.c
index 49fdf91..51848c2 100644
--- a/icmp.c
+++ b/icmp.c
@@ -141,23 +141,26 @@ void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* Return: count of consumed packets (always 1, even if malformed)
*/
int icmp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_msg *msg, int count, struct timespec *now)
+ struct tap_l4_msg *msg, int count, struct timespec *now)
{
(void)count;
if (af == AF_INET) {
- struct icmphdr *ih = (struct icmphdr *)msg[0].l4h;
union icmp_epoll_ref iref = { .v6 = 0 };
struct sockaddr_in sa = {
.sin_family = AF_INET,
.sin_addr = { .s_addr = INADDR_ANY },
- .sin_port = ih->un.echo.id,
};
+ struct icmphdr *ih;
int id, s;
+ ih = (struct icmphdr *)(pkt_buf + msg[0].pkt_buf_offset);
+
if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO)
return 1;
+ sa.sin_port = ih->un.echo.id;
+
iref.id = id = ntohs(ih->un.echo.id);
if ((s = icmp_id_map[V4][id].sock) <= 0) {
@@ -171,22 +174,25 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
bitmap_set(icmp_act[V4], id);
sa.sin_addr = *(struct in_addr *)addr;
- sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL,
+ sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
} else if (af == AF_INET6) {
- struct icmp6hdr *ih = (struct icmp6hdr *)msg[0].l4h;
union icmp_epoll_ref iref = { .v6 = 1 };
struct sockaddr_in6 sa = {
.sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_ANY_INIT,
- .sin6_port = ih->icmp6_identifier,
};
+ struct icmp6hdr *ih;
int id, s;
+ ih = (struct icmp6hdr *)(pkt_buf + msg[0].pkt_buf_offset);
+
if (msg[0].l4_len < sizeof(*ih) ||
(ih->icmp6_type != 128 && ih->icmp6_type != 129))
return 1;
+ sa.sin6_port = ih->icmp6_identifier;
+
iref.id = id = ntohs(ih->icmp6_identifier);
if ((s = icmp_id_map[V6][id].sock) <= 0) {
s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, id, 0,
@@ -200,7 +206,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr,
bitmap_set(icmp_act[V6], id);
sa.sin6_addr = *(struct in6_addr *)addr;
- sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL,
+ sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
}
diff --git a/icmp.h b/icmp.h
index 27f0a5c..78f54aa 100644
--- a/icmp.h
+++ b/icmp.h
@@ -8,7 +8,7 @@ struct ctx;
void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
int icmp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_msg *msg, int count, struct timespec *now);
+ struct tap_l4_msg *msg, int count, struct timespec *now);
void icmp_timer(struct ctx *c, struct timespec *ts);
/**
diff --git a/passt.h b/passt.h
index bb41e90..57b5c09 100644
--- a/passt.h
+++ b/passt.h
@@ -3,15 +3,21 @@
/**
* struct tap_msg - Generic message descriptor for arrays of messages
- * @start: Pointer to message start
- * @l4_start: Pointer to L4 header
- * @len: Message length, with L2 headers
- * @l4_len: Message length, with L4 headers
+ * @pkt_buf_offset: Offset of frame start (Ethernet header) from @pkt_buf
+ * @len: Message length, with L2 headers
*/
struct tap_msg {
- char *start;
- char *l4h;
+ uint32_t pkt_buf_offset;
uint16_t len;
+};
+
+/**
+ * struct tap_l4_msg - Layer-4 message descriptor for protocol handlers
+ * @pkt_buf_offset: Offset of Layer-4 header from @pkt_buf
+ * @l4_len: Length of Layer-4 message, L4 header included, host order
+ */
+struct tap_l4_msg {
+ uint32_t pkt_buf_offset;
uint16_t l4_len;
};
diff --git a/tap.c b/tap.c
index cac7e4f..162d02b 100644
--- a/tap.c
+++ b/tap.c
@@ -50,7 +50,9 @@
#include "dhcpv6.h"
#include "pcap.h"
-static struct tap_msg tap_msgs[TAP_MSGS];
+/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
+static struct tap_msg seq4[TAP_MSGS];
+static struct tap_msg seq6[TAP_MSGS];
/**
* tap_send() - Send frame, with qemu socket header if needed
@@ -199,256 +201,409 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
}
/**
+ * struct tap_l4_seq4 - Message sequence for one protocol handler call, IPv4
+ * @msgs: Count of messages in sequence
+ * @protocol: Protocol number
+ * @source: Source port, network order
+ * @dest: Destination port, network order
+ * @saddr: Source address, network order
+ * @daddr: Destination address, network order
+ * @msg: Array of messages that can be handled in a single call
+ */
+static struct tap_l4_seq4 {
+ uint16_t msgs;
+ uint8_t protocol;
+
+ uint16_t source;
+ uint16_t dest;
+
+ uint32_t saddr;
+ uint32_t daddr;
+
+ struct tap_l4_msg msg[UIO_MAXIOV];
+} l4_seq4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
+
+/**
+ * struct tap_l4_seq6 - Message sequence for one protocol handler call, IPv6
+ * @msgs: Count of messages in sequence
+ * @protocol: Protocol number
+ * @source: Source port, network order
+ * @dest: Destination port, network order
+ * @saddr: Source address
+ * @daddr: Destination address
+ * @msg: Array of messages that can be handled in a single call
+ */
+static struct tap_l4_seq6 {
+ uint16_t msgs;
+ uint8_t protocol;
+
+ uint16_t source;
+ uint16_t dest;
+
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+
+ struct tap_l4_msg msg[UIO_MAXIOV];
+} l4_seq6[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */];
+
+/**
+ * tap_packet_debug() - Print debug message for packet(s) from guest/tap
+ * @iph: IPv4 header, can be NULL
+ * @ip6h: IPv6 header, can be NULL
+ * @seq4: Pointer to @struct tap_l4_seq4 (ports for IPv4 TCP/UDP), can be NULL
+ * @proto6: IPv6 protocol, for IPv6
+ * @seq6: Pointer to @struct tap_l4_seq6 (ports for IPv6 TCP/UDP), can be NULL
+ * @count: Count of packets in this sequence
+ */
+static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h,
+ struct tap_l4_seq4 *seq4, uint8_t proto6,
+ struct tap_l4_seq6 *seq6, int count)
+{
+ char buf6s[INET6_ADDRSTRLEN], buf6d[INET6_ADDRSTRLEN];
+ char buf4s[INET_ADDRSTRLEN], buf4d[INET_ADDRSTRLEN];
+ uint8_t proto;
+
+ if (iph || seq4) {
+ inet_ntop(AF_INET, iph ? &iph->saddr : &seq4->saddr,
+ buf4s, sizeof(buf4s));
+ inet_ntop(AF_INET, iph ? &iph->daddr : &seq4->daddr,
+ buf4d, sizeof(buf4d));
+ proto = iph ? iph->protocol : seq4->protocol;
+ } else {
+ inet_ntop(AF_INET6, ip6h ? &ip6h->saddr : &seq6->saddr,
+ buf6s, sizeof(buf6s));
+ inet_ntop(AF_INET6, ip6h ? &ip6h->daddr : &seq6->daddr,
+ buf6d, sizeof(buf6d));
+ proto = proto6;
+ }
+
+ if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) {
+ debug("protocol %i from tap: %s:%i -> %s:%i (%i packet%s)",
+ proto, seq4 ? buf4s : buf6s,
+ ntohs(seq4 ? seq4->source : seq6->source),
+ seq4 ? buf4d : buf6d,
+ ntohs(seq4 ? seq4->dest : seq6->dest),
+ count, count == 1 ? "" : "s");
+ } else {
+ debug("protocol %i from tap: %s -> %s (%i packet%s)",
+ proto, iph ? buf4s : buf6s, iph ? buf4d : buf6d,
+ count, count == 1 ? "" : "s");
+ }
+}
+
+/**
* tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
* @c: Execution context
- * @msg: Array of messages with the same L3 protocol
- * @count: Count of messages with the same L3 protocol
+ * @msg: Array of messages with IPv4 or ARP protocol
+ * @count: Count of messages
* @now: Current timestamp
- * @first: First call for an IPv4 packet in this batch
*
* Return: count of packets consumed by handlers
*/
static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count,
- struct timespec *now, int first)
+ struct timespec *now)
{
- char buf_s[INET_ADDRSTRLEN] __attribute((__unused__));
- char buf_d[INET_ADDRSTRLEN] __attribute((__unused__));
- struct ethhdr *eh = (struct ethhdr *)msg[0].start;
- struct iphdr *iph, *prev_iph = NULL;
- struct udphdr *uh, *prev_uh = NULL;
- size_t len = msg[0].len;
- unsigned int i;
+ unsigned int i, j, seq_count;
+ struct tap_l4_msg *l4_msg;
+ struct tap_l4_seq4 *seq;
+ size_t len, l4_len;
+ struct ethhdr *eh;
+ struct iphdr *iph;
+ struct udphdr *uh;
char *l4h;
if (!c->v4)
return count;
- if (len < sizeof(*eh) + sizeof(*iph))
- return 1;
+ i = 0;
+resume:
+ for (seq_count = 0, seq = NULL; i < count; i++) {
+ eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset);
+ len = msg[i].len;
- if (arp(c, eh, len) || dhcp(c, eh, len))
- return 1;
+ if (len < sizeof(*eh))
+ continue;
+
+ if (ntohs(eh->h_proto) == ETH_P_ARP && arp(c, eh, len))
+ continue;
- for (i = 0; i < count; i++) {
- len = msg[i].len;
if (len < sizeof(*eh) + sizeof(*iph))
- return 1;
+ continue;
- eh = (struct ethhdr *)msg[i].start;
iph = (struct iphdr *)(eh + 1);
- l4h = (char *)iph + iph->ihl * 4;
+ if ((iph->ihl * 4) + sizeof(*eh) > len)
+ continue;
+ if (iph->ihl * 4 < sizeof(*iph))
+ continue;
- if (first && c->addr4_seen != iph->saddr) {
+ if (iph->saddr && c->addr4_seen != iph->saddr) {
c->addr4_seen = iph->saddr;
proto_update_l2_buf(NULL, NULL, &c->addr4_seen);
}
- msg[i].l4h = l4h;
- msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
+ l4h = (char *)iph + iph->ihl * 4;
+ l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
- if (iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
- break;
+ if (iph->protocol == IPPROTO_ICMP) {
+ struct tap_l4_msg icmp_msg = { l4h - pkt_buf,
+ l4_len };
- if (len < sizeof(*uh))
- break;
+ if (l4_len < sizeof(struct icmphdr))
+ continue;
+
+ tap_packet_debug(iph, NULL, NULL, 0, NULL, 1);
+ if (!c->no_icmp) {
+ icmp_tap_handler(c, AF_INET, &iph->daddr,
+ &icmp_msg, 1, now);
+ }
+ continue;
+ }
+
+ if (l4_len < sizeof(*uh))
+ continue;
uh = (struct udphdr *)l4h;
- if (!i) {
- prev_iph = iph;
- prev_uh = uh;
+ if (iph->protocol == IPPROTO_UDP && dhcp(c, eh, len))
+ continue;
+
+ if (iph->protocol != IPPROTO_TCP &&
+ iph->protocol != IPPROTO_UDP) {
+ tap_packet_debug(iph, NULL, NULL, 0, NULL, 1);
continue;
}
- if (iph->tos != prev_iph->tos ||
- iph->frag_off != prev_iph->frag_off ||
- iph->protocol != prev_iph->protocol ||
- iph->saddr != prev_iph->saddr ||
- iph->daddr != prev_iph->daddr ||
- uh->source != prev_uh->source ||
- uh->dest != prev_uh->dest)
- break;
+#define L4_MATCH(iph, uh, seq) \
+ (seq->protocol == iph->protocol && \
+ seq->source == uh->source && seq->dest == uh->dest && \
+ seq->saddr == iph->saddr && seq->daddr == iph->daddr)
+
+#define L4_SET(iph, uh, seq) \
+ do { \
+ seq->protocol = iph->protocol; \
+ seq->source = uh->source; \
+ seq->dest = uh->dest; \
+ seq->saddr = iph->saddr; \
+ seq->daddr = iph->daddr; \
+ } while (0)
+
+ if (seq && L4_MATCH(iph, uh, seq) && seq->msgs < UIO_MAXIOV)
+ goto append;
+
+ for (seq = l4_seq4 + seq_count - 1; seq >= l4_seq4; seq--) {
+ if (L4_MATCH(iph, uh, seq)) {
+ if (seq->msgs >= UIO_MAXIOV)
+ seq = l4_seq4 - 1;
+ break;
+ }
+ }
- prev_iph = iph;
- prev_uh = uh;
- }
+ if (seq < l4_seq4) {
+ seq = l4_seq4 + seq_count++;
+ L4_SET(iph, uh, seq);
+ seq->msgs = 0;
+ }
- eh = (struct ethhdr *)msg[0].start;
- iph = (struct iphdr *)(eh + 1);
+#undef L4_MATCH
+#undef L4_SET
- if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP ||
- iph->protocol == IPPROTO_SCTP) {
- uh = (struct udphdr *)msg[0].l4h;
+append:
+ l4_msg = &seq->msg[seq->msgs++];
- if (msg[0].len < sizeof(*uh))
- return 1;
+ l4_msg->pkt_buf_offset = l4h - pkt_buf;
+ l4_msg->l4_len = l4_len;
- debug("%s (%i) from tap: %s:%i -> %s:%i (%i packet%s)",
- IP_PROTO_STR(iph->protocol), iph->protocol,
- inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
- ntohs(uh->source),
- inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)),
- ntohs(uh->dest),
- i, i > 1 ? "s" : "");
- } else if (iph->protocol == IPPROTO_ICMP) {
- debug("icmp from tap: %s -> %s",
- inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)));
+ if (seq_count == UIO_MAXIOV) {
+ i++; /* break skips the loop increment: this message is queued */
+ break; /* Resume after flushing if i < count */
+ }
}
- if (iph->protocol == IPPROTO_TCP) {
- if (c->no_tcp)
- return i;
- return tcp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
- }
+ for (j = 0, seq = l4_seq4; j < seq_count; j++, seq++) {
+ int n = seq->msgs;
- if (iph->protocol == IPPROTO_UDP) {
- if (c->no_udp)
- return i;
- return udp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now);
- }
+ l4_msg = seq->msg;
- if (iph->protocol == IPPROTO_ICMP) {
- if (c->no_icmp)
- return 1;
- icmp_tap_handler(c, AF_INET, &iph->daddr, msg, 1, now);
+ tap_packet_debug(NULL, NULL, seq, 0, NULL, n);
+
+ if (seq->protocol == IPPROTO_TCP) {
+ if (c->no_tcp)
+ continue;
+ while ((n -= tcp_tap_handler(c, AF_INET, &seq->daddr,
+ l4_msg, n, now)));
+ } else if (seq->protocol == IPPROTO_UDP) {
+ if (c->no_udp)
+ continue;
+ while ((n -= udp_tap_handler(c, AF_INET, &seq->daddr,
+ l4_msg, n, now)));
+ }
}
- return 1;
+ if (i < count)
+ goto resume;
+
+ return count;
}
/**
* tap6_handler() - IPv6 packet handler for tap file descriptor
* @c: Execution context
- * @msg: Array of messages with the same L3 protocol
- * @count: Count of messages with the same L3 protocol
+ * @msg: Array of messages with IPv6 protocol
+ * @count: Count of messages
* @now: Current timestamp
- * @first: First call for an IPv6 packet in this batch
*
* Return: count of packets consumed by handlers
*/
static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
- struct timespec *now, int first)
+ struct timespec *now)
{
- char buf_s[INET6_ADDRSTRLEN], buf_d[INET6_ADDRSTRLEN];
- struct ethhdr *eh = (struct ethhdr *)msg[0].start;
- struct udphdr *uh, *prev_uh = NULL;
- uint8_t proto = 0, prev_proto = 0;
- size_t len = msg[0].len;
+ unsigned int i, j, seq_count = 0;
+ struct tap_l4_msg *l4_msg;
+ struct tap_l4_seq6 *seq;
struct ipv6hdr *ip6h;
- unsigned int i;
+ size_t len, l4_len;
+ struct ethhdr *eh;
+ struct udphdr *uh;
+ uint8_t proto;
char *l4h;
if (!c->v6)
return count;
- if (len < sizeof(*eh) + sizeof(*ip6h))
- return 1;
-
- if (ndp(c, eh, len) || dhcpv6(c, eh, len))
- return 1;
+ i = 0;
+resume:
+ for (seq_count = 0, seq = NULL; i < count; i++) {
+ eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset);
+ len = msg[i].len;
- for (i = 0; i < count; i++) {
- struct ipv6hdr *p_ip6h;
+ if (len < sizeof(*eh))
+ continue;
- len = msg[i].len;
if (len < sizeof(*eh) + sizeof(*ip6h))
- return 1;
+ continue; /* Skip short frame only: keep the rest of the batch */
- eh = (struct ethhdr *)msg[i].start;
ip6h = (struct ipv6hdr *)(eh + 1);
- l4h = ipv6_l4hdr(ip6h, &proto);
-
- msg[i].l4h = l4h;
- msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
- if (first) {
- if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) {
- c->addr6_ll_seen = ip6h->saddr;
+ if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) {
+ c->addr6_ll_seen = ip6h->saddr;
- if (IN6_IS_ADDR_UNSPECIFIED(&c->addr6_seen)) {
- c->addr6_seen = ip6h->saddr;
- }
- } else {
+ if (IN6_IS_ADDR_UNSPECIFIED(&c->addr6_seen)) {
c->addr6_seen = ip6h->saddr;
}
+ } else {
+ c->addr6_seen = ip6h->saddr;
}
- ip6h->saddr = c->addr6;
+ if (ntohs(ip6h->payload_len) >
+ len - sizeof(*eh) - sizeof(*ip6h))
+ continue;
- if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
- break;
+ if (!(l4h = ipv6_l4hdr(ip6h, &proto)))
+ continue;
- if (len < sizeof(*uh))
- break;
+ l4_len = len - ((intptr_t)l4h - (intptr_t)eh);
- uh = (struct udphdr *)l4h;
+ if (proto == IPPROTO_ICMPV6) {
+ struct tap_l4_msg icmpv6_msg = { l4h - pkt_buf,
+ l4_len };
- if (!i) {
- p_ip6h = ip6h;
- prev_proto = proto;
- prev_uh = uh;
+ if (l4_len < sizeof(struct icmp6hdr))
+ continue;
+
+ if (ndp(c, eh, len))
+ continue;
+
+ tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
+ if (!c->no_icmp) {
+ icmp_tap_handler(c, AF_INET6, &ip6h->daddr,
+ &icmpv6_msg, 1, now);
+ }
continue;
}
- if (proto != prev_proto ||
- memcmp(&ip6h->saddr, &p_ip6h->saddr, sizeof(ip6h->saddr)) ||
- memcmp(&ip6h->daddr, &p_ip6h->daddr, sizeof(ip6h->daddr)) ||
- uh->source != prev_uh->source ||
- uh->dest != prev_uh->dest)
- break;
+ if (l4_len < sizeof(*uh))
+ continue;
- p_ip6h = ip6h;
- prev_proto = proto;
- prev_uh = uh;
- }
+ uh = (struct udphdr *)l4h;
- if (prev_proto)
- proto = prev_proto;
+ if (proto == IPPROTO_UDP && dhcpv6(c, eh, len))
+ continue;
- eh = (struct ethhdr *)msg[0].start;
- ip6h = (struct ipv6hdr *)(eh + 1);
+ ip6h->saddr = c->addr6;
- if (proto == IPPROTO_ICMPV6) {
- debug("icmpv6 from tap: %s ->\n\t%s",
- inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
- inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)));
- } else if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
- proto == IPPROTO_SCTP) {
- uh = (struct udphdr *)msg[0].l4h;
+ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
+ tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
+ continue;
+ }
- if (msg[0].len < sizeof(*uh))
- return 1;
+#define L4_MATCH(ip6h, proto, uh, seq) \
+ (seq->protocol == proto && \
+ seq->source == uh->source && seq->dest == uh->dest && \
+ !memcmp(&seq->saddr, &ip6h->saddr, sizeof(seq->saddr)) && \
+ !memcmp(&seq->daddr, &ip6h->daddr, sizeof(seq->daddr)))
+
+#define L4_SET(ip6h, proto, uh, seq) \
+ do { \
+ seq->protocol = proto; \
+ seq->source = uh->source; \
+ seq->dest = uh->dest; \
+ seq->saddr = ip6h->saddr; \
+ seq->daddr = ip6h->daddr; \
+ } while (0)
+
+ if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
+ seq->msgs < UIO_MAXIOV)
+ goto append;
+
+ for (seq = l4_seq6 + seq_count - 1; seq >= l4_seq6; seq--) {
+ if (L4_MATCH(ip6h, proto, uh, seq)) {
+ if (seq->msgs >= UIO_MAXIOV)
+ seq = l4_seq6 - 1;
+ break;
+ }
+ }
- debug("%s (%i) from tap: [%s]:%i\n\t-> [%s]:%i (%i packet%s)",
- IP_PROTO_STR(proto), proto,
- inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)),
- ntohs(uh->source),
- inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)),
- ntohs(uh->dest),
- i, i > 1 ? "s" : "");
- }
+ if (seq < l4_seq6) {
+ seq = l4_seq6 + seq_count++;
+ L4_SET(ip6h, proto, uh, seq);
+ seq->msgs = 0;
+ }
- if (proto == IPPROTO_TCP) {
- if (c->no_tcp)
- return i;
- return tcp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
- }
+#undef L4_MATCH
+#undef L4_SET
- if (proto == IPPROTO_UDP) {
- if (c->no_udp)
- return i;
- return udp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now);
+append:
+ l4_msg = &seq->msg[seq->msgs++];
+
+ l4_msg->pkt_buf_offset = l4h - pkt_buf;
+ l4_msg->l4_len = l4_len;
+
+ if (seq_count == UIO_MAXIOV) {
+ i++; /* break skips the loop increment: this message is queued */
+ break; /* Resume after flushing if i < count */
+ }
}
- if (proto == IPPROTO_ICMPV6) {
- if (c->no_icmp)
- return 1;
- icmp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, 1, now);
+ for (j = 0, seq = l4_seq6; j < seq_count; j++, seq++) {
+ int n = seq->msgs;
+
+ l4_msg = seq->msg;
+
+ tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n);
+
+ if (seq->protocol == IPPROTO_TCP) {
+ if (c->no_tcp)
+ continue;
+ while ((n -= tcp_tap_handler(c, AF_INET6, &seq->daddr,
+ l4_msg, n, now)));
+ } else if (seq->protocol == IPPROTO_UDP) {
+ if (c->no_udp)
+ continue;
+ while ((n -= udp_tap_handler(c, AF_INET6, &seq->daddr,
+ l4_msg, n, now)));
+ }
}
- return 1;
+ if (i < count)
+ goto resume;
+
+ return count;
}
/**
@@ -460,10 +615,14 @@ static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count,
*/
static int tap_handler_passt(struct ctx *c, struct timespec *now)
{
- int msg_count = 0, same, i = 0, first_v4 = 1, first_v6 = 1;
+ int seq4_i, seq6_i;
struct ethhdr *eh;
- char *p = pkt_buf;
ssize_t n, rem;
+ char *p;
+
+redo:
+ p = pkt_buf;
+ seq4_i = seq6_i = rem = 0;
n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT);
if (n < 0) {
@@ -479,30 +638,27 @@ static int tap_handler_passt(struct ctx *c, struct timespec *now)
while (n > (ssize_t)sizeof(uint32_t)) {
ssize_t len = ntohl(*(uint32_t *)p);
- if (len < (ssize_t)sizeof(*eh) || len > ETH_MAX_MTU)
- return 0;
-
p += sizeof(uint32_t);
n -= sizeof(uint32_t);
- /* At most one packet might not fit in a single read */
+ /* At most one packet might not fit in a single read, and this
+ * needs to be blocking.
+ */
if (len > n) {
- rem = recv(c->fd_tap, p + n, len - n, MSG_DONTWAIT);
+ rem = recv(c->fd_tap, p + n, len - n, 0);
if ((n += rem) != len)
return 0;
}
- pcap(p, len);
-
- tap_msgs[msg_count].start = p;
- tap_msgs[msg_count++].len = len;
+ /* Complete the partial read above before discarding a malformed
+ * frame, otherwise the stream will be inconsistent.
+ */
+ if (len < (ssize_t)sizeof(*eh) || len > ETH_MAX_MTU)
+ goto next;
- n -= len;
- p += len;
- }
+ pcap(p, len);
- while (i < msg_count) {
- eh = (struct ethhdr *)tap_msgs[i].start;
+ eh = (struct ethhdr *)p;
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
@@ -511,52 +667,33 @@ static int tap_handler_passt(struct ctx *c, struct timespec *now)
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
- if (c->v4)
- tap4_handler(c, tap_msgs + i, 1, now, 1);
- i++;
- break;
case ETH_P_IP:
- for (same = 1; i + same < msg_count &&
- same < UIO_MAXIOV; same++) {
- struct tap_msg *next = &tap_msgs[i + same];
-
- eh = (struct ethhdr *)next->start;
- if (ntohs(eh->h_proto) != ETH_P_IP)
- break;
- }
-
- if (!c->v4) {
- i += same;
- break;
- }
-
- i += tap4_handler(c, tap_msgs + i, same, now, first_v4);
- first_v4 = 0;
+ seq4[seq4_i].pkt_buf_offset = p - pkt_buf;
+ seq4[seq4_i++].len = len;
break;
case ETH_P_IPV6:
- for (same = 1; i + same < msg_count &&
- same < UIO_MAXIOV; same++) {
- struct tap_msg *next = &tap_msgs[i + same];
-
- eh = (struct ethhdr *)next->start;
- if (ntohs(eh->h_proto) != ETH_P_IPV6)
- break;
- }
-
- if (!c->v6) {
- i += same;
- break;
- }
-
- i += tap6_handler(c, tap_msgs + i, same, now, first_v6);
- first_v6 = 0;
+ seq6[seq6_i].pkt_buf_offset = p - pkt_buf;
+ seq6[seq6_i++].len = len;
break;
default:
- i++;
break;
}
+
+next:
+ p += len;
+ n -= len;
}
+ if (seq4_i)
+ tap4_handler(c, seq4, seq4_i, now);
+
+ if (seq6_i)
+ tap6_handler(c, seq6, seq6_i, now);
+
+ /* We can't use EPOLLET otherwise. */
+ if (rem)
+ goto redo;
+
return 0;
}
@@ -569,14 +706,19 @@ static int tap_handler_passt(struct ctx *c, struct timespec *now)
*/
static int tap_handler_pasta(struct ctx *c, struct timespec *now)
{
- struct tap_msg msg = { .start = pkt_buf };
- ssize_t n;
+ ssize_t n = 0, len;
+ int err, seq4_i = 0, seq6_i = 0;
+
+restart:
+ while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
+ struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n);
- while ((n = read(c->fd_tap, pkt_buf, TAP_BUF_BYTES)) > 0) {
- struct ethhdr *eh = (struct ethhdr *)pkt_buf;
- msg.len = n;
+ if (len < (ssize_t)sizeof(*eh) || len > ETH_MAX_MTU) {
+ n += len;
+ continue;
+ }
- pcap(msg.start, msg.len);
+ pcap(pkt_buf + n, len);
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
@@ -585,21 +727,33 @@ static int tap_handler_pasta(struct ctx *c, struct timespec *now)
switch (ntohs(eh->h_proto)) {
case ETH_P_ARP:
- if (c->v4)
- tap4_handler(c, &msg, 1, now, 1);
- break;
case ETH_P_IP:
- if (c->v4)
- tap4_handler(c, &msg, 1, now, 1);
+ seq4[seq4_i].pkt_buf_offset = n;
+ seq4[seq4_i++].len = len;
break;
case ETH_P_IPV6:
- if (c->v6)
- tap6_handler(c, &msg, 1, now, 1);
+ seq6[seq6_i].pkt_buf_offset = n;
+ seq6[seq6_i++].len = len;
+ break;
+ default:
break;
}
+
+ n += len;
}
- if (!n || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
+ if (len < 0 && errno == EINTR)
+ goto restart;
+
+ err = errno;
+
+ if (seq4_i)
+ tap4_handler(c, seq4, seq4_i, now);
+
+ if (seq6_i)
+ tap6_handler(c, seq6, seq6_i, now);
+
+ if (len > 0 || err == EAGAIN)
return 0;
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
@@ -753,12 +907,14 @@ void tap_sock_init(struct ctx *c)
close(c->fd_tap);
}
- if (c->mode == MODE_PASST)
+ if (c->mode == MODE_PASST) {
tap_sock_init_unix(c);
- else
+ ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+ } else {
tap_sock_init_tun(c);
+ ev.events = EPOLLIN | EPOLLRDHUP;
+ }
- ev.events = EPOLLIN | EPOLLRDHUP;
ev.data.fd = c->fd_tap;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
}
diff --git a/tcp.c b/tcp.c
index 645d525..b269e0e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -333,6 +333,7 @@
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/types.h>
+#include <sys/uio.h>
#include <unistd.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -645,7 +646,7 @@ static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES] = {
};
/* sendmsg() to socket */
-static struct iovec tcp_tap_iov [TAP_MSGS];
+static struct iovec tcp_tap_iov [UIO_MAXIOV];
/* Bitmap, activity monitoring needed for connection via tap */
static uint8_t tcp_act[MAX_TAP_CONNS / 8] = { 0 };
@@ -1968,7 +1969,7 @@ out_restore_iov:
* @now: Current timestamp
*/
static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
- struct tap_msg *msg, int count,
+ struct tap_l4_msg *msg, int count,
struct timespec *now)
{
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1;
@@ -1979,10 +1980,13 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
ssize_t len;
for (i = 0, iov_i = 0; i < count; i++) {
- struct tcphdr *th = (struct tcphdr *)msg[i].l4h;
uint32_t seq, seq_offset, ack_seq;
- size_t len = msg[i].l4_len, off;
+ struct tcphdr *th;
char *data;
+ size_t off;
+
+ th = (struct tcphdr *)(pkt_buf + msg[i].pkt_buf_offset);
+ len = msg[i].l4_len;
if (len < sizeof(*th)) {
tcp_rst(c, conn);
@@ -2152,19 +2156,11 @@ out:
* Return: count of consumed packets
*/
int tcp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_msg *msg, int count, struct timespec *now)
+ struct tap_l4_msg *msg, int count, struct timespec *now)
{
- struct tcphdr *th = (struct tcphdr *)msg[0].l4h;
- size_t len = msg[0].l4_len, off;
+ struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset);
+ uint16_t len = msg[0].l4_len;
struct tcp_tap_conn *conn;
- int ws;
-
- if (len < sizeof(*th))
- return 1;
-
- off = th->doff * 4;
- if (off < sizeof(*th) || off > len)
- return 1;
conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
if (!conn) {
diff --git a/udp.c b/udp.c
index e640f16..6b2ee36 100644
--- a/udp.c
+++ b/udp.c
@@ -879,12 +879,12 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* Return: count of consumed packets
*/
int udp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_msg *msg, int count, struct timespec *now)
+ struct tap_l4_msg *msg, int count, struct timespec *now)
{
/* The caller already checks that all the messages have the same source
* and destination, so we can just take those from the first message.
*/
- struct udphdr *uh = (struct udphdr *)msg[0].l4h;
+ struct udphdr *uh = (struct udphdr *)(pkt_buf + msg[0].pkt_buf_offset);
struct mmsghdr mm[UIO_MAXIOV] = { 0 };
struct iovec m[UIO_MAXIOV];
struct sockaddr_in6 s_in6;
@@ -972,7 +972,10 @@ int udp_tap_handler(struct ctx *c, int af, void *addr,
}
for (i = 0; i < count; i++) {
- m[i].iov_base = (char *)((struct udphdr *)msg[i].l4h + 1);
+ struct udphdr *uh;
+
+ uh = (struct udphdr *)(msg[i].pkt_buf_offset + pkt_buf);
+ m[i].iov_base = (char *)(uh + 1);
m[i].iov_len = msg[i].l4_len - sizeof(*uh);
mm[i].msg_hdr.msg_name = sa;
@@ -1084,12 +1087,14 @@ static void udp_splice_iov_init(void)
*
* Return: 0 on success, -1 on failure
*/
-int udp_sock_init(struct ctx *c)
+int udp_sock_init(struct ctx *c, struct timespec *now)
{
union udp_epoll_ref uref = { .bound = 1 };
in_port_t dst;
int s;
+ (void)now;
+
for (dst = 0; dst < USHRT_MAX; dst++) {
if (!bitmap_isset(c->udp.port_to_tap, dst))
continue;
diff --git a/udp.h b/udp.h
index bdafcaf..c20f936 100644
--- a/udp.h
+++ b/udp.h
@@ -6,8 +6,8 @@
void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
int udp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_msg *msg, int count, struct timespec *now);
-int udp_sock_init(struct ctx *c);
+ struct tap_l4_msg *msg, int count, struct timespec *now);
+int udp_sock_init(struct ctx *c, struct timespec *now);
void udp_timer(struct ctx *c, struct timespec *ts);
void udp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
uint32_t *ip_da);