Diffstat (limited to 'checksum.c')
-rw-r--r-- | checksum.c | 292 |
1 file changed, 292 insertions, 0 deletions
diff --git a/checksum.c b/checksum.c
new file mode 100644
index 0000000..9c8a458
--- /dev/null
+++ b/checksum.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+// SPDX-License-Identifier: BSD-3-Clause
+
+/* PASST - Plug A Simple Socket Transport
+ *  for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ *  for network namespace/tap device mode
+ *
+ * checksum.c - TCP/IP checksum routines
+ *
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ * This file also contains code originally licensed under the following terms:
+ *
+ * Copyright (c) 2014-2016, The Regents of the University of California.
+ * Copyright (c) 2016-2017, Nefeli Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the names of the copyright holders nor the names of their
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * See the comment to csum_avx2() for further details.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <arpa/inet.h>
+
+/**
+ * sum_16b() - Calculate sum of 16-bit words
+ * @buf:	Input buffer
+ * @len:	Buffer length
+ *
+ * Return: 32-bit sum of 16-bit words
+ */
+uint32_t sum_16b(const void *buf, size_t len)
+{
+	const uint16_t *p = buf;
+	uint32_t sum = 0;
+
+	while (len > 1) {
+		sum += *p++;
+		len -= 2;
+	}
+
+	/* Pad a trailing odd byte with zero (the mask is endianness-aware) */
+	if (len > 0)
+		sum += *p & htons(0xff00);
+
+	return sum;
+}
+
+/**
+ * csum_fold() - Fold long sum for IP and TCP checksum
+ * @sum:	Original long sum
+ *
+ * Return: 16-bit folded sum
+ */
+uint16_t csum_fold(uint32_t sum)
+{
+	/* Add carries back in until the sum fits in 16 bits */
+	while (sum >> 16)
+		sum = (sum & 0xffff) + (sum >> 16);
+
+	return sum;
+}
+
+/**
+ * csum_unaligned() - Compute TCP/IP-style checksum for data without 32-byte alignment
+ * @buf:	Input data
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit IPv4-style checksum
+ */
+uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init)
+{
+	return (uint16_t)~csum_fold(sum_16b(buf, len) + init);
+}
+
+/**
+ * csum_tcp4() - Calculate TCP checksum for IPv4 and set it in place
+ * @iph:	Packet buffer, IP header
+ */
+void csum_tcp4(struct iphdr *iph)
+{
+	struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4);
+	uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th;
+	uint32_t sum = 0;
+
+	/* Sum the pseudo-header: addresses, protocol, TCP segment length */
+	sum += (iph->saddr >> 16) & 0xffff;
+	sum += iph->saddr & 0xffff;
+	sum += (iph->daddr >> 16) & 0xffff;
+	sum += iph->daddr & 0xffff;
+
+	sum += htons(IPPROTO_TCP);
+	sum += htons(tlen);
+
+	/* Then the TCP header and payload, with a zeroed checksum field */
+	th->check = 0;
+	while (tlen > 1) {
+		sum += *p++;
+		tlen -= 2;
+	}
+
+	if (tlen > 0)
+		sum += *p & htons(0xff00);
+
+	th->check = (uint16_t)~csum_fold(sum);
+}
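+
+/* A minimal usage sketch (illustrative only): given a contiguous buffer pkt
+ * holding an IPv4 header followed by a TCP segment, the IPv4 header checksum
+ * can be computed with csum_unaligned(), and the TCP checksum filled in with
+ * csum_tcp4():
+ *
+ *	struct iphdr *iph = (struct iphdr *)pkt;
+ *
+ *	iph->check = 0;
+ *	iph->check = csum_unaligned(iph, (size_t)iph->ihl * 4, 0);
+ *	csum_tcp4(iph);
+ */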
+
+#ifdef __AVX2__
+#include <immintrin.h>
+
+/**
+ * csum_avx2() - Compute 32-bit checksum using AVX2 SIMD instructions
+ * @buf:	Input buffer, must be aligned to 32-byte boundary
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 32-bit checksum, not complemented, not folded
+ *
+ * This implementation is mostly sourced from BESS ("Berkeley Extensible
+ * Software Switch"), core/utils/checksum.h, distributed under the terms of
+ * the 3-Clause BSD license. Notable changes:
+ * - input buffer data is loaded (streamed) with a non-temporal aligned hint
+ *   (VMOVNTDQA, _mm256_stream_load_si256() intrinsic) instead of the original
+ *   unaligned load with temporal hint (VMOVDQU, _mm256_loadu_si256()
+ *   intrinsic), given that the input buffer layout guarantees 32-byte
+ *   alignment of TCP and UDP headers, and that the data is not used
+ *   immediately afterwards, significantly reducing cache pollution as well
+ *   as latency (e.g. on Intel Skylake: 0 instead of 7)
+ * - the ADCQ implementation for the portion remaining after the checksum
+ *   computation on 128-byte blocks is replaced by a load/unpack/add loop on
+ *   a single stream, with the rest done in a plain for loop:
+ *   auto-vectorisation seems to outperform the original hand-coded loop there
+ * - sum_a/sum_b unpacking is interleaved and not sequential, to reduce stalls
+ * - coding style adaptation
+ */
+static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init)
+{
+	__m256i zero, a, b, sum256, sum_a_hi, sum_a_lo, sum_b_hi, sum_b_lo;
+	const uint64_t *buf64 = (const uint64_t *)buf;
+	const __m256i *buf256 = (const __m256i *)buf64;
+	const uint16_t *buf16;
+	uint64_t sum64 = init;
+	int odd = len & 1;
+	__m128i sum128;
+
+	zero = _mm256_setzero_si256();
+
+	if (len < sizeof(__m256i) * 4)
+		goto less_than_128_bytes;
+
+	/* We parallelize two ymm streams to minimize register dependency:
+	 *
+	 * a: buf256,     buf256 + 2, ...
+	 * b: buf256 + 1, buf256 + 3, ...
+	 */
+	a = _mm256_stream_load_si256(buf256);
+	b = _mm256_stream_load_si256(buf256 + 1);
+
+	/* For each stream, accumulate unpackhi and unpacklo in parallel (as
+	 * 4x64-bit vectors, so that each upper 0000 can hold carries):
+	 *
+	 * 32B data:  aaaaAAAA bbbbBBBB ccccCCCC ddddDDDD (1 letter: 1 byte)
+	 * unpackhi:  bbbb0000 BBBB0000 dddd0000 DDDD0000
+	 * unpacklo:  aaaa0000 AAAA0000 cccc0000 CCCC0000
+	 */
+	sum_a_hi = _mm256_unpackhi_epi32(a, zero);
+	sum_b_hi = _mm256_unpackhi_epi32(b, zero);
+	sum_a_lo = _mm256_unpacklo_epi32(a, zero);
+	sum_b_lo = _mm256_unpacklo_epi32(b, zero);
+
+	len -= sizeof(__m256i) * 2;
+	buf256 += 2;
+
+	for (; len >= sizeof(a) * 2; len -= sizeof(a) * 2, buf256 += 2) {
+		a = _mm256_stream_load_si256(buf256);
+		b = _mm256_stream_load_si256(buf256 + 1);
+
+		sum_a_hi = _mm256_add_epi64(sum_a_hi,
+					    _mm256_unpackhi_epi32(a, zero));
+		sum_b_hi = _mm256_add_epi64(sum_b_hi,
+					    _mm256_unpackhi_epi32(b, zero));
+		sum_a_lo = _mm256_add_epi64(sum_a_lo,
+					    _mm256_unpacklo_epi32(a, zero));
+		sum_b_lo = _mm256_add_epi64(sum_b_lo,
+					    _mm256_unpacklo_epi32(b, zero));
+	}
+
+	/* Fold the four 256-bit sums into one 128-bit sum */
+	sum256 = _mm256_add_epi64(_mm256_add_epi64(sum_a_hi, sum_a_lo),
+				  _mm256_add_epi64(sum_b_hi, sum_b_lo));
+	sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
+			       _mm256_extracti128_si256(sum256, 1));
+
+	/* Fold the 128-bit sum into 64 bits */
+	sum64 += _mm_extract_epi64(sum128, 0) + _mm_extract_epi64(sum128, 1);
+	buf64 = (const uint64_t *)buf256;
+
+less_than_128_bytes:
+	for (; len >= sizeof(a); len -= sizeof(a), buf256++) {
+		a = _mm256_stream_load_si256(buf256);
+
+		sum_a_hi = _mm256_unpackhi_epi32(a, zero);
+		sum_a_lo = _mm256_unpacklo_epi32(a, zero);
+
+		sum256 = _mm256_add_epi64(sum_a_hi, sum_a_lo);
+		sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
+				       _mm256_extracti128_si256(sum256, 1));
+
+		sum64 += _mm_extract_epi64(sum128, 0);
+		sum64 += _mm_extract_epi64(sum128, 1);
+	}
+	buf64 = (const uint64_t *)buf256;
+
+	/* Sum any remaining 16-bit words into the one's complement sum */
+	buf16 = (const uint16_t *)buf64;
+	while (len >= sizeof(uint16_t)) {
+		sum64 += *buf16++;
+		len -= sizeof(uint16_t);
+	}
+
+	/* Add the remaining 8 bits, if any, to the one's complement sum */
+	if (odd)
+		sum64 += *(const uint8_t *)buf16;
+
+	/* Reduce the 64-bit sum to a 32-bit one, adding carries back in */
+	sum64 = (sum64 >> 32) + (sum64 & 0xffffffff);
+	sum64 += sum64 >> 32;
+
+	return (uint32_t)sum64;
+}
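+
+/* Correctness note: csum_avx2() accumulates 32-bit words rather than 16-bit
+ * ones, and folds the carries afterwards. This yields the same result as a
+ * plain 16-bit one's complement sum, since that sum is associative and can be
+ * computed in parallel over wider groupings (see RFC 1071, section 2).
+ */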
+
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf:	Input buffer, must be aligned to 32-byte boundary
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum
+ */
+uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+	return (uint16_t)~csum_fold(csum_avx2(buf, len, init));
+}
+
+#else /* __AVX2__ */
+
+/**
+ * csum() - Compute TCP/IP-style checksum
+ * @buf:	Input buffer
+ * @len:	Input length
+ * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
+ *
+ * Return: 16-bit folded, complemented checksum
+ */
+uint16_t csum(const void *buf, size_t len, uint32_t init)
+{
+	return csum_unaligned(buf, len, init);
+}
+
+#endif /* !__AVX2__ */
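Since the AVX2 and scalar paths must produce identical results, they can be
cross-checked with a small standalone program. A minimal sketch follows, not
part of this commit: it assumes linking against checksum.c built with -mavx2
(e.g. cc -O2 -mavx2 checksum.c test.c, test.c being a hypothetical file name),
and over-aligns the buffer to satisfy the 32-byte alignment contract of csum():

	#include <assert.h>
	#include <stdint.h>
	#include <stdlib.h>

	uint16_t csum(const void *buf, size_t len, uint32_t init);
	uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init);

	int main(void)
	{
		/* 32-byte alignment satisfies the csum() contract */
		static uint8_t buf[1024] __attribute__((aligned(32)));
		size_t i, len;

		srand(1);
		for (i = 0; i < sizeof(buf); i++)
			buf[i] = rand() & 0xff;

		/* Both implementations must agree for every length */
		for (len = 1; len <= sizeof(buf); len++)
			assert(csum(buf, len, 0) ==
			       csum_unaligned(buf, len, 0));

		return 0;
	}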