Diffstat (limited to 'checksum.c')
-rw-r--r--  checksum.c  292
1 file changed, 292 insertions, 0 deletions
diff --git a/checksum.c b/checksum.c
new file mode 100644
index 0000000..9c8a458
--- /dev/null
+++ b/checksum.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+// SPDX-License-Identifier: BSD-3-Clause
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * checksum.c - TCP/IP checksum routines
+ *
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ * This file also contains code originally licensed under the following terms:
+ *
+ * Copyright (c) 2014-2016, The Regents of the University of California.
+ * Copyright (c) 2016-2017, Nefeli Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the names of the copyright holders nor the names of their
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * See the comment to csum_avx2() for further details.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <arpa/inet.h>
+
+/**
+ * sum_16b() - Calculate sum of 16-bit words
+ * @buf: Input buffer
+ * @len: Buffer length
+ *
+ * Return: 32-bit sum of 16-bit words
+ */
+uint32_t sum_16b(const void *buf, size_t len)
+{
+ const uint16_t *p = buf;
+ uint32_t sum = 0;
+
+ while (len > 1) {
+ sum += *p++;
+ len -= 2;
+ }
+
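+ /* Add a trailing odd byte, if any, masking out the byte read past the end */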
+ if (len > 0)
+ sum += *p & htons(0xff00);
+
+ return sum;
+}
+
+/**
+ * csum_fold() - Fold long sum for IP and TCP checksum
+ * @sum: Original long sum
+ *
+ * Return: 16-bit folded sum
+ */
+uint16_t csum_fold(uint32_t sum)
+{
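+ /* Add carries back into the low 16 bits until none remain */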
+ while (sum >> 16)
+ sum = (sum & 0xffff) + (sum >> 16);
+
+ return sum;
+}
+
+/**
+ * csum_unaligned() - Compute TCP/IP-style checksum for data not 32-byte aligned
+ * @buf: Input data
+ * @len: Input length
+ * @init: Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit IPv4-style checksum
+ */
+uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init)
+{
+ return (uint16_t)~csum_fold(sum_16b(buf, len) + init);
+}
+
+/**
+ * csum_tcp4() - Calculate TCP checksum for IPv4 and set in place
+ * @iph: Packet buffer, IP header
+ */
+void csum_tcp4(struct iphdr *iph)
+{
+ struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
+ uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th;
+ uint32_t sum = 0;
+
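+ /* Pseudo-header: source and destination addresses, protocol, TCP length */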
+ sum += (iph->saddr >> 16) & 0xffff;
+ sum += iph->saddr & 0xffff;
+ sum += (iph->daddr >> 16) & 0xffff;
+ sum += iph->daddr & 0xffff;
+
+ sum += htons(IPPROTO_TCP);
+ sum += htons(tlen);
+
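+ /* Sum the TCP header and payload with the checksum field zeroed */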
+ th->check = 0;
+ while (tlen > 1) {
+ sum += *p++;
+ tlen -= 2;
+ }
+
+ if (tlen > 0)
+ sum += *p & htons(0xff00);
+
+ th->check = (uint16_t)~csum_fold(sum);
+}
+
+#ifdef __AVX2__
+#include <immintrin.h>
+
+/**
+ * csum_avx2() - Compute 32-bit checksum using AVX2 SIMD instructions
+ * @buf: Input buffer, must be aligned to 32-byte boundary
+ * @len: Input length
+ * @init: Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 32-bit checksum, not complemented, not folded
+ *
+ * This implementation is mostly sourced from BESS ("Berkeley Extensible
+ * Software Switch"), core/utils/checksum.h, distributed under the terms of the
+ * 3-Clause BSD license. Notable changes:
+ * - input buffer data is loaded (streamed) with a non-temporal aligned hint
+ * (VMOVNTDQA, _mm256_stream_load_si256() intrinsic) instead of the original
+ * unaligned load with temporal hint (VMOVDQU, _mm256_loadu_si256() intrinsic),
+ * given that the input buffer layout guarantees 32-byte alignment of TCP and
+ * UDP headers, and that the data is not used immediately afterwards; this
+ * significantly reduces cache pollution and latency (e.g. on Intel Skylake:
+ * 0 instead of 7)
+ * - the ADCQ implementation for the portion remaining after the 128-byte block
+ * checksum computation is replaced with a load/unpack/add loop on a single
+ * stream, with the rest handled by a plain for loop, as auto-vectorisation
+ * seems to outperform the original hand-coded loop there
+ * - sum_a/sum_b unpacking is interleaved and not sequential to reduce stalls
+ * - coding style adaptation
+ */
+static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init)
+{
+ __m256i zero, a, b, sum256, sum_a_hi, sum_a_lo, sum_b_hi, sum_b_lo;
+ const uint64_t *buf64 = (const uint64_t *)buf;
+ const __m256i *buf256;
+ const uint16_t *buf16;
+ uint64_t sum64 = init;
+ int odd = len & 1;
+ __m128i sum128;
+
+ zero = _mm256_setzero_si256();
+ buf256 = (const __m256i *)buf64;
+
+ if (len < sizeof(__m256i) * 4)
+ goto less_than_128_bytes;
+
+ /* We parallelize two ymm streams to minimize register dependency:
+ *
+ * a: buf256, buf256 + 2, ...
+ * b: buf256 + 1, buf256 + 3, ...
+ */
+ a = _mm256_stream_load_si256(buf256);
+ b = _mm256_stream_load_si256(buf256 + 1);
+
+ /* For each stream, accumulate unpackhi and unpacklo in parallel (as
+ * 4x64bit vectors, so that each upper 0000 can hold carries):
+ *
+ * 32B data: aaaaAAAA bbbbBBBB ccccCCCC ddddDDDD (1 letter: 1 byte)
+ * unpackhi: bbbb0000 BBBB0000 dddd0000 DDDD0000
+ * unpacklo: aaaa0000 AAAA0000 cccc0000 CCCC0000
+ */
+ sum_a_hi = _mm256_unpackhi_epi32(a, zero);
+ sum_b_hi = _mm256_unpackhi_epi32(b, zero);
+ sum_a_lo = _mm256_unpacklo_epi32(a, zero);
+ sum_b_lo = _mm256_unpacklo_epi32(b, zero);
+
+ len -= sizeof(__m256i) * 2;
+ buf256 += 2;
+
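+ /* Walk the rest of the buffer in 64-byte steps, two 32-byte loads per pass */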
+ for (; len >= sizeof(a) * 2; len -= sizeof(a) * 2, buf256 += 2) {
+ a = _mm256_stream_load_si256(buf256);
+ b = _mm256_stream_load_si256(buf256 + 1);
+
+ sum_a_hi = _mm256_add_epi64(sum_a_hi,
+ _mm256_unpackhi_epi32(a, zero));
+ sum_b_hi = _mm256_add_epi64(sum_b_hi,
+ _mm256_unpackhi_epi32(b, zero));
+ sum_a_lo = _mm256_add_epi64(sum_a_lo,
+ _mm256_unpacklo_epi32(a, zero));
+ sum_b_lo = _mm256_add_epi64(sum_b_lo,
+ _mm256_unpacklo_epi32(b, zero));
+ }
+
+ /* Fold the four 256-bit sums into one 128-bit sum. */
+ sum256 = _mm256_add_epi64(_mm256_add_epi64(sum_a_hi, sum_a_lo),
+ _mm256_add_epi64(sum_b_hi, sum_b_lo));
+ sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
+ _mm256_extracti128_si256(sum256, 1));
+
+ /* Fold 128-bit sum into 64 bits. */
+ sum64 += _mm_extract_epi64(sum128, 0) + _mm_extract_epi64(sum128, 1);
+ buf64 = (const uint64_t *)buf256;
+
+less_than_128_bytes:
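+ /* Sum any remaining 32-byte blocks, folding each down to 64 bits */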
+ for (; len >= sizeof(a); len -= sizeof(a), buf256++) {
+ a = _mm256_stream_load_si256(buf256);
+
+ sum_a_hi = _mm256_unpackhi_epi32(a, zero);
+ sum_a_lo = _mm256_unpacklo_epi32(a, zero);
+
+ sum256 = _mm256_add_epi64(sum_a_hi, sum_a_lo);
+ sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
+ _mm256_extracti128_si256(sum256, 1));
+
+ sum64 += _mm_extract_epi64(sum128, 0);
+ sum64 += _mm_extract_epi64(sum128, 1);
+ }
+ buf64 = (const uint64_t *)buf256;
+
+ /* Add any remaining 16-bit words to the running sum (sum64). */
+ buf16 = (const uint16_t *)buf64;
+ while (len >= sizeof(uint16_t)) {
+ sum64 += *buf16++;
+ len -= sizeof(uint16_t);
+ }
+
+ /* Add remaining 8 bits to the one's complement sum. */
+ if (odd)
+ sum64 += *(const uint8_t *)buf16;
+
+ /* Reduce 64-bit unsigned int to 32-bit unsigned int. */
+ sum64 = (sum64 >> 32) + (sum64 & 0xffffffff);
+ sum64 += sum64 >> 32;
+
+ return (uint32_t)sum64;
+}
+
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf: Input buffer, must be aligned to 32-byte boundary
+ * @len: Input length
+ * @init: Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum
+ */
+uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+ return (uint16_t)~csum_fold(csum_avx2(buf, len, init));
+}
+
+#else /* __AVX2__ */
+
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf: Input buffer
+ * @len: Input length
+ * @init: Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum
+ */
+uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+ return csum_unaligned(buf, len, init);
+}
+
+#endif /* !__AVX2__ */