From 4b12cf94f078dd54e2ed7a2202104ddf6f3be69a Mon Sep 17 00:00:00 2001
From: Stefano Brivio
Date: Fri, 15 Oct 2021 17:04:46 +0200
Subject: checksum: Stream load into four registers at a time with > 128 bytes

...and further interleave register usage. This brings the csum() overhead
reported by perf(1) for 30 seconds of 64KiB TCP IPv4 frames, host to guest,
from 7.2% to 5.8%.

Signed-off-by: Stefano Brivio
---
 checksum.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

(limited to 'checksum.c')

diff --git a/checksum.c b/checksum.c
index c79c9d7..72a7bba 100644
--- a/checksum.c
+++ b/checksum.c
@@ -156,6 +156,8 @@ void csum_tcp4(struct iphdr *iph)
  * UDP headers, and that the data is not used immediately afterwards, reducing
  * cache pollution significantly and latency (e.g. on Intel Skylake: 0 instead
  * of 7)
+ * - read from four streams in parallel as long as we have more than 128 bytes,
+ *   not just two
  * - replace the ADCQ implementation for the portion remaining after the
  *   checksum computation for 128-byte blocks by a load/unpack/add loop on a
  *   single stream, and do the rest with a for loop, auto-vectorisation seems to
@@ -165,13 +167,15 @@ void csum_tcp4(struct iphdr *iph)
  */
 static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init)
 {
-        __m256i zero, a, b, sum256, sum_a_hi, sum_a_lo, sum_b_hi, sum_b_lo;
+        __m256i a, b, sum256, sum_a_hi, sum_a_lo, sum_b_hi, sum_b_lo, c, d;
+        __m256i __sum_a_hi, __sum_a_lo, __sum_b_hi, __sum_b_lo;
         const uint64_t *buf64 = (const uint64_t *)buf;
-        const __m256i *buf256 = (__m256i *)buf64;
+        const __m256i *buf256;
         const uint16_t *buf16;
         uint64_t sum64 = init;
         int odd = len & 1;
         __m128i sum128;
+        __m256i zero;
 
         zero = _mm256_setzero_si256();
         buf256 = (const __m256i *)buf64;
@@ -179,7 +183,6 @@ static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init)
         if (len < sizeof(__m256i) * 4)
                 goto less_than_128_bytes;
 
-
         /* We parallelize two ymm streams to minimize register dependency:
          *
          * a: buf256, buf256 + 2, ...
@@ -203,6 +206,47 @@ static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init)
         len -= sizeof(__m256i) * 2;
         buf256 += 2;
 
+        /* As long as we have more than 128 bytes, (stream) load from four
+         * streams instead of two, interleaving loads and register usage, to
+         * further decrease stalls, but don't double the number of accumulators
+         * and don't make this a general case to keep branching reasonable.
+         */
+        if (len >= sizeof(a) * 4) {
+                a = _mm256_stream_load_si256(buf256);
+                b = _mm256_stream_load_si256(buf256 + 1);
+                c = _mm256_stream_load_si256(buf256 + 2);
+                d = _mm256_stream_load_si256(buf256 + 3);
+        }
+        for (; len >= sizeof(a) * 4; len -= sizeof(a) * 4, buf256 += 4) {
+                __sum_a_hi = _mm256_add_epi64(sum_a_hi,
+                                              _mm256_unpackhi_epi32(a, zero));
+                __sum_b_hi = _mm256_add_epi64(sum_b_hi,
+                                              _mm256_unpackhi_epi32(b, zero));
+                __sum_a_lo = _mm256_add_epi64(sum_a_lo,
+                                              _mm256_unpacklo_epi32(a, zero));
+                __sum_b_lo = _mm256_add_epi64(sum_b_lo,
+                                              _mm256_unpacklo_epi32(b, zero));
+
+                if (len >= sizeof(a) * 8) {
+                        a = _mm256_stream_load_si256(buf256 + 4);
+                        b = _mm256_stream_load_si256(buf256 + 5);
+                }
+
+                sum_a_hi = _mm256_add_epi64(__sum_a_hi,
+                                            _mm256_unpackhi_epi32(c, zero));
+                sum_b_hi = _mm256_add_epi64(__sum_b_hi,
+                                            _mm256_unpackhi_epi32(d, zero));
+                sum_a_lo = _mm256_add_epi64(__sum_a_lo,
+                                            _mm256_unpacklo_epi32(c, zero));
+                sum_b_lo = _mm256_add_epi64(__sum_b_lo,
+                                            _mm256_unpacklo_epi32(d, zero));
+
+                if (len >= sizeof(a) * 8) {
+                        c = _mm256_stream_load_si256(buf256 + 6);
+                        d = _mm256_stream_load_si256(buf256 + 7);
+                }
+        }
+
         for (; len >= sizeof(a) * 2; len -= sizeof(a) * 2, buf256 += 2) {
                 a = _mm256_stream_load_si256(buf256);
                 b = _mm256_stream_load_si256(buf256 + 1);
-- 
cgit v1.2.3
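
Note (not part of the patch): the pattern behind both the existing two-stream
loop and the new four-stream block is easier to see without intrinsics. The
scalar sketch below, with made-up names (csum_scalar_4way(), fold16()), shows
the same idea: keep several independent accumulators so that consecutive
additions don't pile up on a single dependency chain, the same reason the
comment in csum_avx2() gives for splitting work across sum_a_*/sum_b_*, then
combine them and fold with end-around carry. Byte order, odd lengths and the
non-temporal stream loads that the real csum_avx2() deals with are left out.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Fold a 64-bit running sum down to 16 bits by repeatedly adding the
     * carry back in (end-around carry, as in RFC 1071). */
    static uint16_t fold16(uint64_t sum)
    {
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)sum;
    }

    /* Hypothetical scalar counterpart of the multi-stream accumulation:
     * four independent 64-bit accumulators over 32-bit words. */
    static uint16_t csum_scalar_4way(const uint32_t *buf, size_t n_words,
                                     uint32_t init)
    {
            uint64_t s0 = init, s1 = 0, s2 = 0, s3 = 0;
            size_t i = 0;

            for (; i + 4 <= n_words; i += 4) {
                    s0 += buf[i];           /* each sum depends only on   */
                    s1 += buf[i + 1];       /* its own previous value, so */
                    s2 += buf[i + 2];       /* the additions can overlap  */
                    s3 += buf[i + 3];
            }
            for (; i < n_words; i++)        /* leftover words, one stream */
                    s0 += buf[i];

            return (uint16_t)~fold16(s0 + s1 + s2 + s3);
    }

    int main(void)
    {
            uint32_t data[32] = { 0x01020304, 0xa5a5a5a5 };

            printf("0x%04x\n", (unsigned)csum_scalar_4way(data, 32, 0));
            return 0;
    }

On top of this, the AVX2 version adds the software pipelining visible in the
last hunk: the stream loads for the next 128-byte block (buf256 + 4 through
buf256 + 7) are issued in between the unpack/add steps for the current one,
which is the "interleaving loads and register usage" the new comment refers to.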