aboutgitcodebugslistschat
diff options
context:
space:
mode:
-rw-r--r--Makefile19
-rw-r--r--checksum.c292
-rw-r--r--dhcp.c3
-rw-r--r--ndp.c5
-rw-r--r--tap.c3
-rw-r--r--udp.c18
-rw-r--r--util.c80
-rw-r--r--util.h4
8 files changed, 332 insertions, 92 deletions
diff --git a/Makefile b/Makefile
index 2f48c35..5a6692e 100644
--- a/Makefile
+++ b/Makefile
@@ -3,8 +3,23 @@ CFLAGS += -DRLIMIT_STACK_VAL=$(shell ulimit -s)
all: passt pasta passt4netns qrap
-passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h dhcpv6.c dhcpv6.h pcap.c pcap.h ndp.c ndp.h siphash.c siphash.h tap.c tap.h icmp.c icmp.h tcp.c tcp.h udp.c udp.h util.c util.h
- $(CC) $(CFLAGS) passt.c arp.c dhcp.c dhcpv6.c pcap.c ndp.c siphash.c tap.c icmp.c tcp.c udp.c util.c -o passt
+avx2: CFLAGS += -Ofast -mavx2 -ftree-vectorize -funroll-loops
+avx2: clean all
+
+avx2_debug: CFLAGS += -Ofast -mavx2 -ftree-vectorize -funroll-loops -DDEBUG -g
+avx2_debug: clean all
+
+static: CFLAGS += -static
+static: clean all
+
+debug: CFLAGS += -static -DDEBUG -g
+debug: clean all
+
+passt: passt.c passt.h arp.c arp.h checksum.c checksum.h dhcp.c dhcp.h \
+ dhcpv6.c dhcpv6.h pcap.c pcap.h ndp.c ndp.h siphash.c siphash.h \
+ tap.c tap.h icmp.c icmp.h tcp.c tcp.h udp.c udp.h util.c util.h
+ $(CC) $(CFLAGS) passt.c arp.c checksum.c dhcp.c dhcpv6.c pcap.c ndp.c \
+ siphash.c tap.c icmp.c tcp.c udp.c util.c -o passt
pasta: passt
ln -s passt pasta
diff --git a/checksum.c b/checksum.c
new file mode 100644
index 0000000..9c8a458
--- /dev/null
+++ b/checksum.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+// SPDX-License-Identifier: BSD-3-Clause
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * checksum.c - TCP/IP checksum routines
+ *
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ * This file also contains code originally licensed under the following terms:
+ *
+ * Copyright (c) 2014-2016, The Regents of the University of California.
+ * Copyright (c) 2016-2017, Nefeli Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the names of the copyright holders nor the names of their
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * See the comment to csum_avx2() for further details.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <arpa/inet.h>
+
+/**
+ * sum_16b() - Calculate sum of 16-bit words
+ * @buf: Input buffer
+ * @len: Buffer length
+ *
+ * Return: 32-bit sum of 16-bit words
+*/
+uint32_t sum_16b(const void *buf, size_t len)
+{
+ const uint16_t *p = buf;
+ uint32_t sum = 0;
+
+ while (len > 1) {
+ sum += *p++;
+ len -= 2;
+ }
+
+ if (len > 0)
+ sum += *p & htons(0xff00);
+
+ return sum;
+}
+
+/**
+ * csum_fold() - Fold long sum for IP and TCP checksum
+ * @sum: Original long sum
+ *
+ * Return: 16-bit folded sum
+ */
+uint16_t csum_fold(uint32_t sum)
+{
+ while (sum >> 16)
+ sum = (sum & 0xffff) + (sum >> 16);
+
+ return sum;
+}
+
+/**
+ * csum_unaligned() - Compute TCP/IP-style checksum for not 32-byte aligned data
+ * @buf: Input data
+ * @len: Input length
+ * @init: Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit IPv4-style checksum
+ */
+uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init)
+{
+ return (uint16_t)~csum_fold(sum_16b(buf, len) + init);
+}
+
+/**
+ * csum_tcp4() - Calculate TCP checksum for IPv4 and set in place
+ * @iph: Packet buffer, IP header
+ */
+void csum_tcp4(struct iphdr *iph)
+{
+ struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
+ uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th;
+ uint32_t sum = 0;
+
+ sum += (iph->saddr >> 16) & 0xffff;
+ sum += iph->saddr & 0xffff;
+ sum += (iph->daddr >> 16) & 0xffff;
+ sum += iph->daddr & 0xffff;
+
+ sum += htons(IPPROTO_TCP);
+ sum += htons(tlen);
+
+ th->check = 0;
+ while (tlen > 1) {
+ sum += *p++;
+ tlen -= 2;
+ }
+
+ if (tlen > 0) {
+ sum += *p & htons(0xff00);
+ }
+
+ th->check = (uint16_t)~csum_fold(sum);
+}
+
+#ifdef __AVX2__
+#include <immintrin.h>
+
+/**
+ * csum_avx2() - Compute 32-bit checksum using AVX2 SIMD instructions
+ * @buf: Input buffer, must be aligned to 32-byte boundary
+ * @len: Input length
+ * @init: Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 32-bit checksum, not complemented, not folded
+ *
+ * This implementation is mostly sourced from BESS ("Berkeley Extensible
+ * Software Switch"), core/utils/checksum.h, distributed under the terms of the
+ * 3-Clause BSD license. Notable changes:
+ * - input buffer data is loaded (streamed) with a non-temporal aligned hint
+ * (VMOVNTDQA, _mm256_stream_load_si256() intrinsic) instead of the original
+ * unaligned load with temporal hint (VMOVDQU, _mm256_loadu_si256() intrinsic)
+ * given that the input buffer layout guarantees 32-byte alignment of TCP and
+ * UDP headers, and that the data is not used immediately afterwards, reducing
+ * cache pollution significantly and latency (e.g. on Intel Skylake: 0 instead
+ * of 7)
+ * - replace the ADCQ implementation for the portion remaining after the
+ * checksum computation for 128-byte blocks by a load/unpack/add loop on a
+ * single stream, and do the rest with a for loop, auto-vectorisation seems to
+ * outperforms the original hand-coded loop there
+ * - sum_a/sum_b unpacking is interleaved and not sequential to reduce stalls
+ * - coding style adaptation
+ */
+static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init)
+{
+ __m256i zero, a, b, sum256, sum_a_hi, sum_a_lo, sum_b_hi, sum_b_lo;
+ const uint64_t *buf64 = (const uint64_t *)buf;
+ const __m256i *buf256 = (__m256i *)buf64;
+ const uint16_t *buf16;
+ uint64_t sum64 = init;
+ int odd = len & 1;
+ __m128i sum128;
+
+ zero = _mm256_setzero_si256();
+ buf256 = (const __m256i *)buf64;
+
+ if (len < sizeof(__m256i) * 4)
+ goto less_than_128_bytes;
+
+
+ /* We parallelize two ymm streams to minimize register dependency:
+ *
+ * a: buf256, buf256 + 2, ...
+ * b: buf256 + 1, buf256 + 3, ...
+ */
+ a = _mm256_stream_load_si256(buf256);
+ b = _mm256_stream_load_si256(buf256 + 1);
+
+ /* For each stream, accumulate unpackhi and unpacklo in parallel (as
+ * 4x64bit vectors, so that each upper 0000 can hold carries):
+ *
+ * 32B data: aaaaAAAA bbbbBBBB ccccCCCC ddddDDDD (1 letter: 1 byte)
+ * unpackhi: bbbb0000 BBBB0000 dddd0000 DDDD0000
+ * unpacklo: aaaa0000 AAAA0000 cccc0000 CCCC0000
+ */
+ sum_a_hi = _mm256_unpackhi_epi32(a, zero);
+ sum_b_hi = _mm256_unpackhi_epi32(b, zero);
+ sum_a_lo = _mm256_unpacklo_epi32(a, zero);
+ sum_b_lo = _mm256_unpacklo_epi32(b, zero);
+
+ len -= sizeof(__m256i) * 2;
+ buf256 += 2;
+
+ for (; len >= sizeof(a) * 2; len -= sizeof(a) * 2, buf256 += 2) {
+ a = _mm256_stream_load_si256(buf256);
+ b = _mm256_stream_load_si256(buf256 + 1);
+
+ sum_a_hi = _mm256_add_epi64(sum_a_hi,
+ _mm256_unpackhi_epi32(a, zero));
+ sum_b_hi = _mm256_add_epi64(sum_b_hi,
+ _mm256_unpackhi_epi32(b, zero));
+ sum_a_lo = _mm256_add_epi64(sum_a_lo,
+ _mm256_unpacklo_epi32(a, zero));
+ sum_b_lo = _mm256_add_epi64(sum_b_lo,
+ _mm256_unpacklo_epi32(b, zero));
+ }
+
+ /* Fold four 256bit sums into one 128-bit sum. TODO */
+ sum256 = _mm256_add_epi64(_mm256_add_epi64(sum_a_hi, sum_a_lo),
+ _mm256_add_epi64(sum_b_hi, sum_b_lo));
+ sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
+ _mm256_extracti128_si256(sum256, 1));
+
+ /* Fold 128-bit sum into 64 bits. */
+ sum64 += _mm_extract_epi64(sum128, 0) + _mm_extract_epi64(sum128, 1);
+ buf64 = (const uint64_t *)buf256;
+
+less_than_128_bytes:
+ for (; len >= sizeof(a); len -= sizeof(a), buf256++) {
+ a = _mm256_stream_load_si256(buf256);
+
+ sum_a_hi = _mm256_unpackhi_epi32(a, zero);
+ sum_a_lo = _mm256_unpacklo_epi32(a, zero);
+
+ sum256 = _mm256_add_epi64(sum_a_hi, sum_a_lo);
+ sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
+ _mm256_extracti128_si256(sum256, 1));
+
+ sum64 += _mm_extract_epi64(sum128, 0);
+ sum64 += _mm_extract_epi64(sum128, 1);
+ }
+ buf64 = (const uint64_t *)buf256;
+
+ /* Repeat 16-bit one's complement sum (at sum64). */
+ buf16 = (const uint16_t *)buf64;
+ while (len >= sizeof(uint16_t)) {
+ sum64 += *buf16++;
+ len -= sizeof(uint16_t);
+ }
+
+ /* Add remaining 8 bits to the one's complement sum. */
+ if (odd)
+ sum64 += *(const uint8_t *)buf16;
+
+ /* Reduce 64-bit unsigned int to 32-bit unsigned int. */
+ sum64 = (sum64 >> 32) + (sum64 & 0xffffffff);
+ sum64 += sum64 >> 32;
+
+ return (uint32_t)sum64;
+}
+
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf: Input buffer, must be aligned to 32-byte boundary
+ * @len: Input length
+ * @init: Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum sum
+ */
+uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+ return (uint16_t)~csum_fold(csum_avx2(buf, len, init));
+}
+
+#else /* __AVX2__ */
+
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf: Input buffer
+ * @len: Input length
+ * @sum: Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum
+ */
+uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+ return csum_unaligned(buf, len, init);
+}
+
+#endif /* !__AVX2__ */
diff --git a/dhcp.c b/dhcp.c
index 4a83627..ed5df27 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -24,6 +24,7 @@
#include <net/if.h>
#include <arpa/inet.h>
+#include "checksum.h"
#include "util.h"
#include "passt.h"
#include "dhcp.h"
@@ -320,7 +321,7 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
iph->daddr = c->addr4;
iph->saddr = c->gw4;
iph->check = 0;
- iph->check = csum_ip4(iph, iph->ihl * 4);
+ iph->check = csum_unaligned(iph, iph->ihl * 4, 0);
len += sizeof(*eh);
memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
diff --git a/ndp.c b/ndp.c
index acc0473..b676825 100644
--- a/ndp.c
+++ b/ndp.c
@@ -27,6 +27,7 @@
#include <net/if.h>
#include <net/if_arp.h>
+#include "checksum.h"
#include "util.h"
#include "passt.h"
#include "tap.h"
@@ -172,8 +173,8 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
ip6hr->payload_len = htons(sizeof(*ihr) + len);
ip6hr->hop_limit = IPPROTO_ICMPV6;
ihr->icmp6_cksum = 0;
- ihr->icmp6_cksum = csum_ip4(ip6hr, sizeof(*ip6hr) +
- sizeof(*ihr) + len);
+ ihr->icmp6_cksum = csum_unaligned(ip6hr, sizeof(*ip6hr) +
+ sizeof(*ihr) + len, 0);
ip6hr->version = 6;
ip6hr->nexthdr = IPPROTO_ICMPV6;
diff --git a/tap.c b/tap.c
index 2af1c7a..08b85ff 100644
--- a/tap.c
+++ b/tap.c
@@ -41,6 +41,7 @@
#include <linux/icmp.h>
#include <linux/icmpv6.h>
+#include "checksum.h"
#include "util.h"
#include "passt.h"
#include "arp.h"
@@ -122,7 +123,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto,
memcpy(&iph->saddr, &src->s6_addr[12], 4);
iph->check = 0;
- iph->check = csum_ip4(iph, iph->ihl * 4);
+ iph->check = csum_unaligned(iph, iph->ihl * 4, 0);
memcpy(data, in, len);
diff --git a/udp.c b/udp.c
index 30659e0..00a34a9 100644
--- a/udp.c
+++ b/udp.c
@@ -110,6 +110,7 @@
#include <linux/udp.h>
#include <time.h>
+#include "checksum.h"
#include "util.h"
#include "passt.h"
#include "tap.h"
@@ -210,6 +211,11 @@ udp4_l2_buf[UDP_TAP_FRAMES] = {
*/
__extension__ struct udp6_l2_buf_t {
struct sockaddr_in6 s_in6;
+#ifdef __AVX2__
+ /* Align ip6h to 32-byte boundary. */
+ uint8_t pad[64 - (sizeof(struct sockaddr_in6) + sizeof(struct ethhdr) +
+ sizeof(uint32_t))];
+#endif
uint32_t vnet_len;
struct ethhdr eh;
@@ -217,10 +223,18 @@ __extension__ struct udp6_l2_buf_t {
struct udphdr uh;
uint8_t data[USHRT_MAX -
(sizeof(struct ipv6hdr) + sizeof(struct udphdr))];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)))
+#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+#endif
udp6_l2_buf[UDP_TAP_FRAMES] = {
[ 0 ... UDP_TAP_FRAMES - 1 ] = {
- { 0 }, 0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_UDP),
+ { 0 },
+#ifdef __AVX2__
+ { 0 },
+#endif
+ 0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_UDP),
{ 0 }, { 0 },
},
};
@@ -656,7 +670,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
b->ip6h.version = 0;
b->ip6h.nexthdr = 0;
b->uh.check = 0;
- b->uh.check = csum_ip4(&b->ip6h, ip_len);
+ b->uh.check = csum(&b->ip6h, ip_len, 0);
b->ip6h.version = 6;
b->ip6h.nexthdr = IPPROTO_UDP;
b->ip6h.hop_limit = 255;
diff --git a/util.c b/util.c
index 4d4661a..33894f7 100644
--- a/util.c
+++ b/util.c
@@ -77,86 +77,6 @@ logfn(debug, LOG_DEBUG)
#endif
/**
- * sum_16b() - Calculate sum of 16-bit words
- * @buf: Input buffer
- * @len: Buffer length
- *
- * Return: 32-bit sum of 16-bit words
-*/
-uint32_t sum_16b(void *buf, size_t len)
-{
- uint32_t sum = 0;
- uint16_t *p = buf;
- size_t len1 = len / 2;
- size_t off;
-
- for (off = 0; off < len1; off++, p++)
- sum += *p;
-
- if (len % 2)
- sum += *p & 0xff;
-
- return sum;
-}
-
-/**
- * csum_fold() - Fold long sum for IP and TCP checksum
- * @sum: Original long sum
- *
- * Return: 16-bit folded sum
- */
-uint16_t csum_fold(uint32_t sum)
-{
- while (sum >> 16)
- sum = (sum & 0xffff) + (sum >> 16);
-
- return sum;
-}
-
-/**
- * csum_ip4() - Calculate IPv4 checksum
- * @buf: Packet buffer, L3 headers
- * @len: Total L3 packet length
- *
- * Return: 16-bit IPv4-style checksum
- */
-uint16_t csum_ip4(void *buf, size_t len)
-{
- return ~csum_fold(sum_16b(buf, len));
-}
-
-/**
- * csum_tcp4() - Calculate TCP checksum for IPv4 and set in place
- * @iph: Packet buffer, IP header
- */
-void csum_tcp4(struct iphdr *iph)
-{
- struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
- uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th;
- uint32_t sum = 0;
-
- sum += (iph->saddr >> 16) & 0xffff;
- sum += iph->saddr & 0xffff;
- sum += (iph->daddr >> 16) & 0xffff;
- sum += iph->daddr & 0xffff;
-
- sum += htons(IPPROTO_TCP);
- sum += htons(tlen);
-
- th->check = 0;
- while (tlen > 1) {
- sum += *p++;
- tlen -= 2;
- }
-
- if (tlen > 0) {
- sum += *p & htons(0xff00);
- }
-
- th->check = (uint16_t)~csum_fold(sum);
-}
-
-/**
* ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol
* @ip6h: IPv6 header
* @proto: Filled with L4 protocol number
diff --git a/util.h b/util.h
index 7fbce1f..1c11474 100644
--- a/util.h
+++ b/util.h
@@ -117,10 +117,6 @@ void debug(const char *format, ...);
struct ctx;
-uint32_t sum_16b(void *buf, size_t len);
-uint16_t csum_fold(uint32_t sum);
-uint16_t csum_ip4(void *buf, size_t len);
-void csum_tcp4(struct iphdr *iph);
char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto);
int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo,
uint32_t data);