8 files changed, 332 insertions, 92 deletions
diff --git a/Makefile b/Makefile
index 2f48c35..5a6692e 100644
--- a/Makefile
+++ b/Makefile
@@ -3,8 +3,23 @@ CFLAGS += -DRLIMIT_STACK_VAL=$(shell ulimit -s)
 
 all: passt pasta passt4netns qrap
 
-passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h dhcpv6.c dhcpv6.h pcap.c pcap.h  ndp.c ndp.h siphash.c siphash.h tap.c tap.h icmp.c icmp.h tcp.c tcp.h udp.c udp.h util.c util.h
-	$(CC) $(CFLAGS) passt.c arp.c dhcp.c dhcpv6.c pcap.c ndp.c siphash.c tap.c icmp.c tcp.c udp.c util.c -o passt
+avx2: CFLAGS += -Ofast -mavx2 -ftree-vectorize -funroll-loops
+avx2: clean all
+
+avx2_debug: CFLAGS += -Ofast -mavx2 -ftree-vectorize -funroll-loops -DDEBUG -g
+avx2_debug: clean all
+
+static: CFLAGS += -static
+static: clean all
+
+debug: CFLAGS += -static -DDEBUG -g
+debug: clean all
+
+passt: passt.c passt.h arp.c arp.h checksum.c checksum.h dhcp.c dhcp.h \
+	dhcpv6.c dhcpv6.h pcap.c pcap.h ndp.c ndp.h siphash.c siphash.h \
+	tap.c tap.h icmp.c icmp.h tcp.c tcp.h udp.c udp.h util.c util.h
+	$(CC) $(CFLAGS) passt.c arp.c checksum.c dhcp.c dhcpv6.c pcap.c ndp.c \
+		siphash.c tap.c icmp.c tcp.c udp.c util.c -o passt
 
 pasta: passt
 	ln -s passt pasta
diff --git a/checksum.c b/checksum.c
new file mode 100644
index 0000000..9c8a458
--- /dev/null
+++ b/checksum.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+// SPDX-License-Identifier: BSD-3-Clause
+
+/* PASST - Plug A Simple Socket Transport
+ *  for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ *  for network namespace/tap device mode
+ *
+ * checksum.c - TCP/IP checksum routines
+ *
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ * This file also contains code originally licensed under the following terms:
+ *
+ * Copyright (c) 2014-2016, The Regents of the University of California.
+ * Copyright (c) 2016-2017, Nefeli Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the names of the copyright holders nor the names of their
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * See the comment to csum_avx2() for further details.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <arpa/inet.h>
+
+/**
+ * sum_16b() - Calculate sum of 16-bit words
+ * @buf:	Input buffer
+ * @len:	Buffer length
+ *
+ * Return: 32-bit sum of 16-bit words
+*/
+uint32_t sum_16b(const void *buf, size_t len)
+{
+	const uint16_t *p = buf;
+	uint32_t sum = 0;
+
+	while (len > 1) {
+		sum += *p++;
+		len -= 2;
+	}
+
+	if (len > 0)
+		sum += *p & htons(0xff00);
+
+	return sum;
+}
+
+/**
+ * csum_fold() - Fold long sum for IP and TCP checksum
+ * @sum:	Original long sum
+ *
+ * Return: 16-bit folded sum
+ */
+uint16_t csum_fold(uint32_t sum)
+{
+	while (sum >> 16)
+		sum = (sum & 0xffff) + (sum >> 16);
+
+	return sum;
+}
+
+/**
+ * csum_unaligned() - Compute TCP/IP-style checksum for not 32-byte aligned data
+ * @buf:	Input data
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit IPv4-style checksum
+ */
+uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init)
+{
+	return (uint16_t)~csum_fold(sum_16b(buf, len) + init);
+}
+
+/**
+ * csum_tcp4() - Calculate TCP checksum for IPv4 and set in place
+ * @iph:	Packet buffer, IP header
+ */
+void csum_tcp4(struct iphdr *iph)
+{
+	struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
+	uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th;
+	uint32_t sum = 0;
+
+	sum += (iph->saddr >> 16) & 0xffff;
+	sum += iph->saddr & 0xffff;
+	sum += (iph->daddr >> 16) & 0xffff;
+	sum += iph->daddr & 0xffff;
+
+	sum += htons(IPPROTO_TCP);
+	sum += htons(tlen);
+
+	th->check = 0;
+	while (tlen > 1) {
+		sum += *p++;
+		tlen -= 2;
+	}
+
+	if (tlen > 0) {
+		sum += *p & htons(0xff00);
+	}
+
+	th->check = (uint16_t)~csum_fold(sum);
+}
+
+#ifdef __AVX2__
+#include <immintrin.h>
+
+/**
+ * csum_avx2() - Compute 32-bit checksum using AVX2 SIMD instructions
+ * @buf:	Input buffer, must be aligned to 32-byte boundary
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 32-bit checksum, not complemented, not folded
+ *
+ * This implementation is mostly sourced from BESS ("Berkeley Extensible
+ * Software Switch"), core/utils/checksum.h, distributed under the terms of the
+ * 3-Clause BSD license. Notable changes:
+ * - input buffer data is loaded (streamed) with a non-temporal aligned hint
+ *   (VMOVNTDQA, _mm256_stream_load_si256() intrinsic) instead of the original
+ *   unaligned load with temporal hint (VMOVDQU, _mm256_loadu_si256() intrinsic)
+ *   given that the input buffer layout guarantees 32-byte alignment of TCP and
+ *   UDP headers, and that the data is not used immediately afterwards, reducing
+ *   cache pollution significantly and latency (e.g. on Intel Skylake: 0 instead
+ *   of 7)
+ * - replace the ADCQ implementation for the portion remaining after the
+ *   checksum computation for 128-byte blocks by a load/unpack/add loop on a
+ *   single stream, and do the rest with a for loop, auto-vectorisation seems to
+ *   outperforms the original hand-coded loop there
+ * - sum_a/sum_b unpacking is interleaved and not sequential to reduce stalls
+ * - coding style adaptation
+ */
+static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init)
+{
+	__m256i zero, a, b, sum256, sum_a_hi, sum_a_lo, sum_b_hi, sum_b_lo;
+	const uint64_t *buf64 = (const uint64_t *)buf;
+	const __m256i *buf256 = (__m256i *)buf64;
+	const uint16_t *buf16;
+	uint64_t sum64 = init;
+	int odd = len & 1;
+	__m128i sum128;
+
+	zero = _mm256_setzero_si256();
+	buf256 = (const __m256i *)buf64;
+
+	if (len < sizeof(__m256i) * 4)
+		goto less_than_128_bytes;
+
+
+	/* We parallelize two ymm streams to minimize register dependency:
+	 *
+	 * a: buf256,             buf256 + 2,             ...
+	 * b:         buf256 + 1,             buf256 + 3, ...
+	 */
+	a = _mm256_stream_load_si256(buf256);
+	b = _mm256_stream_load_si256(buf256 + 1);
+
+	/* For each stream, accumulate unpackhi and unpacklo in parallel (as
+	 * 4x64bit vectors, so that each upper 0000 can hold carries):
+	 *
+	 * 32B data: aaaaAAAA bbbbBBBB ccccCCCC ddddDDDD (1 letter: 1 byte)
+	 * unpackhi: bbbb0000 BBBB0000 dddd0000 DDDD0000
+	 * unpacklo: aaaa0000 AAAA0000 cccc0000 CCCC0000
+	 */
+	sum_a_hi = _mm256_unpackhi_epi32(a, zero);
+	sum_b_hi = _mm256_unpackhi_epi32(b, zero);
+	sum_a_lo = _mm256_unpacklo_epi32(a, zero);
+	sum_b_lo = _mm256_unpacklo_epi32(b, zero);
+
+	len -= sizeof(__m256i) * 2;
+	buf256 += 2;
+
+	for (; len >= sizeof(a) * 2; len -= sizeof(a) * 2, buf256 += 2) {
+		a = _mm256_stream_load_si256(buf256);
+		b = _mm256_stream_load_si256(buf256 + 1);
+
+		sum_a_hi = _mm256_add_epi64(sum_a_hi,
+					    _mm256_unpackhi_epi32(a, zero));
+		sum_b_hi = _mm256_add_epi64(sum_b_hi,
+					    _mm256_unpackhi_epi32(b, zero));
+		sum_a_lo = _mm256_add_epi64(sum_a_lo,
+					    _mm256_unpacklo_epi32(a, zero));
+		sum_b_lo = _mm256_add_epi64(sum_b_lo,
+					    _mm256_unpacklo_epi32(b, zero));
+	}
+
+	/* Fold four 256bit sums into one 128-bit sum. TODO */
+	sum256 = _mm256_add_epi64(_mm256_add_epi64(sum_a_hi, sum_a_lo),
+				  _mm256_add_epi64(sum_b_hi, sum_b_lo));
+	sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
+			       _mm256_extracti128_si256(sum256, 1));
+
+	/* Fold 128-bit sum into 64 bits. */
+	sum64 += _mm_extract_epi64(sum128, 0) + _mm_extract_epi64(sum128, 1);
+	buf64 = (const uint64_t *)buf256;
+
+less_than_128_bytes:
+	for (; len >= sizeof(a); len -= sizeof(a), buf256++) {
+		a = _mm256_stream_load_si256(buf256);
+
+		sum_a_hi = _mm256_unpackhi_epi32(a, zero);
+		sum_a_lo = _mm256_unpacklo_epi32(a, zero);
+
+		sum256 = _mm256_add_epi64(sum_a_hi, sum_a_lo);
+		sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
+				       _mm256_extracti128_si256(sum256, 1));
+
+		sum64 += _mm_extract_epi64(sum128, 0);
+		sum64 += _mm_extract_epi64(sum128, 1);
+	}
+	buf64 = (const uint64_t *)buf256;
+
+	/* Repeat 16-bit one's complement sum (at sum64). */
+	buf16 = (const uint16_t *)buf64;
+	while (len >= sizeof(uint16_t)) {
+		sum64 += *buf16++;
+		len -= sizeof(uint16_t);
+	}
+
+	/* Add remaining 8 bits to the one's complement sum. */
+	if (odd)
+		sum64 += *(const uint8_t *)buf16;
+
+	/* Reduce 64-bit unsigned int to 32-bit unsigned int. */
+	sum64 = (sum64 >> 32) + (sum64 & 0xffffffff);
+	sum64 += sum64 >> 32;
+
+	return (uint32_t)sum64;
+}
+
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf:	Input buffer, must be aligned to 32-byte boundary
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum sum
+ */
+uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+	return (uint16_t)~csum_fold(csum_avx2(buf, len, init));
+}
+
+#else /* __AVX2__ */
+
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf:	Input buffer
+ * @len:	Input length
+ * @sum:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum
+ */
+uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+	return csum_unaligned(buf, len, init);
+}
+
+#endif /* !__AVX2__ */
diff --git a/dhcp.c b/dhcp.c
index 4a83627..ed5df27 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -24,6 +24,7 @@
 #include <net/if.h>
 #include <arpa/inet.h>
 
+#include "checksum.h"
 #include "util.h"
 #include "passt.h"
 #include "dhcp.h"
@@ -320,7 +321,7 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
 	iph->daddr = c->addr4;
 	iph->saddr = c->gw4;
 	iph->check = 0;
-	iph->check = csum_ip4(iph, iph->ihl * 4);
+	iph->check = csum_unaligned(iph, iph->ihl * 4, 0);
 
 	len += sizeof(*eh);
 	memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
diff --git a/ndp.c b/ndp.c
index acc0473..b676825 100644
--- a/ndp.c
+++ b/ndp.c
@@ -27,6 +27,7 @@
 #include <net/if.h>
 #include <net/if_arp.h>
 
+#include "checksum.h"
 #include "util.h"
 #include "passt.h"
 #include "tap.h"
@@ -172,8 +173,8 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
 	ip6hr->payload_len = htons(sizeof(*ihr) + len);
 	ip6hr->hop_limit = IPPROTO_ICMPV6;
 	ihr->icmp6_cksum = 0;
-	ihr->icmp6_cksum = csum_ip4(ip6hr, sizeof(*ip6hr) +
-					   sizeof(*ihr) + len);
+	ihr->icmp6_cksum = csum_unaligned(ip6hr, sizeof(*ip6hr) +
+						 sizeof(*ihr) + len, 0);
 
 	ip6hr->version = 6;
 	ip6hr->nexthdr = IPPROTO_ICMPV6;
diff --git a/tap.c b/tap.c
index 2af1c7a..08b85ff 100644
--- a/tap.c
+++ b/tap.c
@@ -41,6 +41,7 @@
 #include <linux/icmp.h>
 #include <linux/icmpv6.h>
 
+#include "checksum.h"
 #include "util.h"
 #include "passt.h"
 #include "arp.h"
@@ -122,7 +123,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
 		memcpy(&iph->saddr, &src->s6_addr[12], 4);
 
 		iph->check = 0;
-		iph->check = csum_ip4(iph, iph->ihl * 4);
+		iph->check = csum_unaligned(iph, iph->ihl * 4, 0);
 
 		memcpy(data, in, len);
 
diff --git a/udp.c b/udp.c
index 30659e0..00a34a9 100644
--- a/udp.c
+++ b/udp.c
@@ -110,6 +110,7 @@
 #include <linux/udp.h>
 #include <time.h>
 
+#include "checksum.h"
 #include "util.h"
 #include "passt.h"
 #include "tap.h"
@@ -210,6 +211,11 @@ udp4_l2_buf[UDP_TAP_FRAMES] = {
  */
 __extension__ struct udp6_l2_buf_t {
 	struct sockaddr_in6 s_in6;
+#ifdef __AVX2__
+	/* Align ip6h to 32-byte boundary. */
+	uint8_t pad[64 - (sizeof(struct sockaddr_in6) + sizeof(struct ethhdr) +
+			  sizeof(uint32_t))];
+#endif
 
 	uint32_t vnet_len;
 	struct ethhdr eh;
@@ -217,10 +223,18 @@ __extension__ struct udp6_l2_buf_t {
 	struct udphdr uh;
 	uint8_t data[USHRT_MAX -
 		     (sizeof(struct ipv6hdr) + sizeof(struct udphdr))];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)))
+#else
 } __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+#endif
 udp6_l2_buf[UDP_TAP_FRAMES] = {
 	[ 0 ... UDP_TAP_FRAMES - 1 ] = {
-		{ 0 }, 0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_UDP),
+		{ 0 },
+#ifdef __AVX2__
+		{ 0 },
+#endif
+		0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_UDP),
 		{ 0 }, { 0 },
 	},
 };
@@ -656,7 +670,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 			b->ip6h.version = 0;
 			b->ip6h.nexthdr = 0;
 			b->uh.check = 0;
-			b->uh.check = csum_ip4(&b->ip6h, ip_len);
+			b->uh.check = csum(&b->ip6h, ip_len, 0);
 			b->ip6h.version = 6;
 			b->ip6h.nexthdr = IPPROTO_UDP;
 			b->ip6h.hop_limit = 255;
diff --git a/util.c b/util.c
index 4d4661a..33894f7 100644
--- a/util.c
+++ b/util.c
@@ -77,86 +77,6 @@ logfn(debug, LOG_DEBUG)
 #endif
 
 /**
- * sum_16b() - Calculate sum of 16-bit words
- * @buf:	Input buffer
- * @len:	Buffer length
- *
- * Return: 32-bit sum of 16-bit words
-*/
-uint32_t sum_16b(void *buf, size_t len)
-{
-	uint32_t sum = 0;
-	uint16_t *p = buf;
-	size_t len1 = len / 2;
-	size_t off;
-
-	for (off = 0; off < len1; off++, p++)
-		sum += *p;
-
-	if (len % 2)
-		sum += *p & 0xff;
-
-	return sum;
-}
-
-/**
- * csum_fold() - Fold long sum for IP and TCP checksum
- * @sum:	Original long sum
- *
- * Return: 16-bit folded sum
- */
-uint16_t csum_fold(uint32_t sum)
-{
-	while (sum >> 16)
-		sum = (sum & 0xffff) + (sum >> 16);
-
-	return sum;
-}
-
-/**
- * csum_ip4() - Calculate IPv4 checksum
- * @buf:	Packet buffer, L3 headers
- * @len:	Total L3 packet length
- *
- * Return: 16-bit IPv4-style checksum
- */
-uint16_t csum_ip4(void *buf, size_t len)
-{
-	return ~csum_fold(sum_16b(buf, len));
-}
-
-/**
- * csum_tcp4() - Calculate TCP checksum for IPv4 and set in place
- * @iph:	Packet buffer, IP header
- */
-void csum_tcp4(struct iphdr *iph)
-{
-	struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
-	uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th;
-	uint32_t sum = 0;
-
-	sum += (iph->saddr >> 16) & 0xffff;
-	sum += iph->saddr & 0xffff;
-	sum += (iph->daddr >> 16) & 0xffff;
-	sum += iph->daddr & 0xffff;
-
-	sum += htons(IPPROTO_TCP);
-	sum += htons(tlen);
-
-	th->check = 0;
-	while (tlen > 1) {
-		sum += *p++;
-		tlen -= 2;
-	}
-
-	if (tlen > 0) {
-		sum += *p & htons(0xff00);
-	}
-
-	th->check = (uint16_t)~csum_fold(sum);
-}
-
-/**
  * ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol
  * @ip6h:	IPv6 header
  * @proto:	Filled with L4 protocol number
diff --git a/util.h b/util.h
index 7fbce1f..1c11474 100644
--- a/util.h
+++ b/util.h
@@ -117,10 +117,6 @@ void debug(const char *format, ...);
 
 struct ctx;
 
-uint32_t sum_16b(void *buf, size_t len);
-uint16_t csum_fold(uint32_t sum);
-uint16_t csum_ip4(void *buf, size_t len);
-void csum_tcp4(struct iphdr *iph);
 char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto);
 int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo,
 	    uint32_t data);