From 74f29d3148d34d166c040e6cf1f626245c0d479a Mon Sep 17 00:00:00 2001
From: Stefano Brivio
Date: Fri, 15 Oct 2021 16:59:12 +0200
Subject: checksum: Interleave lo/hi sums while folding into 128-bit sums, drop TODO

I left a TODO and never checked -- this actually seems to slightly
improve CPIs on AMD Naples (two 128-bit FMA units glued together).

Signed-off-by: Stefano Brivio
---
 checksum.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/checksum.c b/checksum.c
index 9c8a458..c79c9d7 100644
--- a/checksum.c
+++ b/checksum.c
@@ -217,9 +217,9 @@ static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init)
 					    _mm256_unpacklo_epi32(b, zero));
 	}
 
-	/* Fold four 256bit sums into one 128-bit sum. TODO */
-	sum256 = _mm256_add_epi64(_mm256_add_epi64(sum_a_hi, sum_a_lo),
-				  _mm256_add_epi64(sum_b_hi, sum_b_lo));
+	/* Fold four 256bit sums into one 128-bit sum. */
+	sum256 = _mm256_add_epi64(_mm256_add_epi64(sum_a_hi, sum_b_lo),
+				  _mm256_add_epi64(sum_b_hi, sum_a_lo));
 
 	sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
 			       _mm256_extracti128_si256(sum256, 1));
-- 
cgit v1.2.3