From ac8295b68aae46b1608d7fe540852c9d61e05426 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Thu, 26 Feb 2026 22:08:53 +0100 Subject: [PATCH] style(simd): rename sse to avx (#821) * style(simd): rename sse to avx * fix(exp,simd): apply the right avx512 constraints to a few methods * fix(exp,simd): apply the right avx512 constraints to a few methods --- docs/data/simd-clamp.md | 6 +- docs/data/simd-contains.md | 6 +- docs/data/simd-max.md | 6 +- docs/data/simd-mean.md | 6 +- docs/data/simd-meanby.md | 4 +- docs/data/simd-min.md | 6 +- docs/data/simd-sum.md | 6 +- docs/data/simd-sumby.md | 4 +- docs/docs/exp/simd.md | 8 +- exp/simd/BENCHMARK.md | 384 ++++++++--------- exp/simd/README.md | 11 +- exp/simd/cpu_amd64_test.go | 15 +- exp/simd/intersect_avx512.go | 20 +- exp/simd/intersect_bench_test.go | 68 +-- exp/simd/math.go | 18 +- exp/simd/{math_sse.go => math_avx.go} | 386 +++--------------- exp/simd/math_avx512.go | 258 ++++++++++++ exp/simd/math_avx512_test.go | 224 ++++++++++ .../{math_sse_test.go => math_avx_test.go} | 309 ++++---------- exp/simd/math_bench_test.go | 50 ++- exp/simd/simd_test.go | 22 +- 21 files changed, 964 insertions(+), 853 deletions(-) rename exp/simd/{math_sse.go => math_avx.go} (82%) rename exp/simd/{math_sse_test.go => math_avx_test.go} (91%) diff --git a/docs/data/simd-clamp.md b/docs/data/simd-clamp.md index 2c1b247..1c936cc 100644 --- a/docs/data/simd-clamp.md +++ b/docs/data/simd-clamp.md @@ -1,7 +1,7 @@ --- name: Clamp slug: clamp -sourceRef: exp/simd/math_sse.go#L424 +sourceRef: exp/simd/math_avx.go#L453 category: exp subCategory: simd similarHelpers: @@ -51,7 +51,7 @@ Clamps each element in a collection between min and max values using SIMD instru | SIMD variant | Lanes | Required flags | Typical CPUs | | ------------ | ----- | -------------- | ------------------------------ | -| SSE (xN) | 2-16 | `sse2` | All amd64 | +| AVX (xN) | 2-16 | `avx` | All amd64 | | AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ | | 
AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons | @@ -70,7 +70,7 @@ result := simd.ClampFloat32x16([]float32{0.5, 1.5, 2.5, 3.5}, 1.0, 3.0) ``` ```go -// Using SSE variant (8 lanes at once) - works on all amd64 +// Using AVX variant (8 lanes at once) - requires AVX support result := simd.ClampInt16x8([]int16{100, 150, 200, 250}, 120, 220) // []int16{120, 150, 200, 220} ``` diff --git a/docs/data/simd-contains.md b/docs/data/simd-contains.md index 9c35610..ae0178c 100644 --- a/docs/data/simd-contains.md +++ b/docs/data/simd-contains.md @@ -1,7 +1,7 @@ --- name: Contains slug: contains -sourceRef: exp/simd/intersect_sse.go#L11 +sourceRef: exp/simd/intersect_avx512.go#L9 category: exp subCategory: simd similarHelpers: @@ -51,7 +51,7 @@ Checks if a target value is present in a collection using SIMD instructions. The | SIMD variant | Lanes | Required flags | Typical CPUs | | ------------ | ----- | -------------- | ------------------------------ | -| SSE (xN) | 2-16 | `sse2` | All amd64 | +| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ | | AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ | | AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons | @@ -64,7 +64,7 @@ found := simd.ContainsInt8x32([]int8{1, 2, 3, 4, 5}, 3) ``` ```go -// Using SSE variant (16 lanes at once) - works on all amd64 +// Using AVX variant (2 lanes at once) - not available on every amd64 CPU found := simd.ContainsInt64x2([]int64{1000000, 2000000, 3000000}, 2000000) // true ``` diff --git a/docs/data/simd-max.md b/docs/data/simd-max.md index 551837c..746ddc1 100644 --- a/docs/data/simd-max.md +++ b/docs/data/simd-max.md @@ -1,7 +1,7 @@ --- name: Max slug: max -sourceRef: exp/simd/math_sse.go#L1328 +sourceRef: exp/simd/math_avx.go#L1279 category: exp subCategory: simd similarHelpers: @@ -51,7 +51,7 @@ Finds the maximum value in a collection using SIMD instructions. 
The suffix (x2, | SIMD variant | Lanes | Required flags | Typical CPUs | | ------------ | ----- | -------------- | ------------------------------ | -| SSE (xN) | 2-16 | `sse2` | All amd64 | +| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ | | AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ | | AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons | @@ -70,7 +70,7 @@ max := simd.MaxFloat32x16([]float32{3.5, 1.2, 4.8, 2.1}) ``` ```go -// Using SSE variant (4 lanes at once) - works on all amd64 +// Using AVX variant (4 lanes at once) - requires AVX support max := simd.MaxInt32x4([]int32{100, 50, 200, 75}) // 200 ``` diff --git a/docs/data/simd-mean.md b/docs/data/simd-mean.md index 35403e9..a245ef4 100644 --- a/docs/data/simd-mean.md +++ b/docs/data/simd-mean.md @@ -1,7 +1,7 @@ --- name: Mean slug: mean -sourceRef: exp/simd/math_sse.go#L333 +sourceRef: exp/simd/math_avx.go#L352 category: exp subCategory: simd similarHelpers: @@ -52,7 +52,7 @@ Calculates the arithmetic mean of a collection using SIMD instructions. 
The suff | SIMD variant | Lanes | Required flags | Typical CPUs | | ------------ | ----- | -------------- | ------------------------------ | -| SSE (xN) | 2-16 | `sse2` | All amd64 | +| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ | | AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ | | AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons | @@ -71,7 +71,7 @@ mean := simd.MeanFloat32x16([]float32{1.0, 2.0, 3.0, 4.0}) ``` ```go -// Using SSE variant (8 lanes at once) - works on all amd64 +// Using AVX variant (8 lanes at once) - requires AVX support mean := simd.MeanInt16x8([]int16{10, 20, 30, 40}) // 25 ``` diff --git a/docs/data/simd-meanby.md b/docs/data/simd-meanby.md index 83dfd48..75a96dc 100644 --- a/docs/data/simd-meanby.md +++ b/docs/data/simd-meanby.md @@ -62,7 +62,7 @@ MeanBy transforms a collection using an iteratee function and calculates the ari | SIMD variant | Lanes | Required flags | Typical CPUs | | ------------ | ----- | -------------- | ------------------------------ | -| SSE (xN) | 2-16 | `sse2` | All amd64 | +| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ | | AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ | | AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons | @@ -118,7 +118,7 @@ metrics := []Metric{ {Value: 400}, } -// Using SSE variant - works on all amd64 +// Using AVX variant - requires AVX support mean := simd.MeanByUint16x8(metrics, func(m Metric) uint16 { return m.Value }) diff --git a/docs/data/simd-min.md b/docs/data/simd-min.md index 6bc6694..b92e373 100644 --- a/docs/data/simd-min.md +++ b/docs/data/simd-min.md @@ -1,7 +1,7 @@ --- name: Min slug: min -sourceRef: exp/simd/math_sse.go#L834 +sourceRef: exp/simd/math_avx.go#L833 category: exp subCategory: simd similarHelpers: @@ -51,7 +51,7 @@ Finds the minimum value in a collection using SIMD instructions. 
The suffix (x2, | SIMD variant | Lanes | Required flags | Typical CPUs | | ------------ | ----- | -------------- | ------------------------------ | -| SSE (xN) | 2-16 | `sse2` | All amd64 | +| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ | | AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ | | AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons | @@ -70,7 +70,7 @@ min := simd.MinFloat32x16([]float32{3.5, 1.2, 4.8, 2.1}) ``` ```go -// Using SSE variant (4 lanes at once) - works on all amd64 +// Using AVX variant (4 lanes at once) - requires AVX support min := simd.MinInt32x4([]int32{100, 50, 200, 75}) // 50 ``` diff --git a/docs/data/simd-sum.md b/docs/data/simd-sum.md index e6675da..fe8223c 100644 --- a/docs/data/simd-sum.md +++ b/docs/data/simd-sum.md @@ -1,7 +1,7 @@ --- name: Sum slug: sum -sourceRef: exp/simd/math_sse.go#L13 +sourceRef: exp/simd/math_avx.go#L14 category: exp subCategory: simd similarHelpers: @@ -52,7 +52,7 @@ Sums the values in a collection using SIMD instructions. 
The suffix (x2, x4, x8, | SIMD variant | Lanes | Required flags | Typical CPUs | | ------------ | ----- | -------------- | ------------------------------ | -| SSE (xN) | 2-16 | `sse2` | All amd64 | +| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ | | AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ | | AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons | @@ -71,7 +71,7 @@ sum := simd.SumFloat32x16([]float32{1.1, 2.2, 3.3, 4.4}) ``` ```go -// Using SSE variant (4 lanes at once) - works on all amd64 +// Using AVX variant (4 lanes at once) - requires AVX support sum := simd.SumInt32x4([]int32{1000000, 2000000, 3000000}) // 6000000 ``` diff --git a/docs/data/simd-sumby.md b/docs/data/simd-sumby.md index 82afef0..6498558 100644 --- a/docs/data/simd-sumby.md +++ b/docs/data/simd-sumby.md @@ -62,7 +62,7 @@ SumBy transforms a collection using an iteratee function and sums the result usi | SIMD variant | Lanes | Required flags | Typical CPUs | | ------------ | ----- | -------------- | ------------------------------ | -| SSE (xN) | 2-16 | `sse2` | All amd64 | +| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ | | AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ | | AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons | @@ -119,7 +119,7 @@ metrics := []Metric{ {Value: 400}, } -// Using SSE variant - works on all amd64 +// Using AVX variant - requires AVX support sum := simd.SumByUint16x8(metrics, func(m Metric) uint16 { return m.Value }) diff --git a/docs/docs/exp/simd.md b/docs/docs/exp/simd.md index e844c8d..fd2f6cd 100644 --- a/docs/docs/exp/simd.md +++ b/docs/docs/exp/simd.md @@ -1,6 +1,6 @@ --- title: SIMD operations -description: High-performance slice operations using SSE, AVX2 and AVX512 SIMD when built with Go 1.26+ and GOEXPERIMENT=simd on amd64. +description: High-performance slice operations using AVX, AVX2 and AVX512 SIMD when built with Go 1.26+ and GOEXPERIMENT=simd on amd64. 
sidebar_position: 0 hide_table_of_contents: true --- @@ -14,7 +14,7 @@ Your feedback helps us improve! # ## SIMD helpers -This page lists all operations on slices, available in the `exp/simd` sub-package. These helpers use **SSE** (128-bit), **AVX2** (256-bit) or **AVX512** (512-bit) SIMD when built with Go 1.26+, the `GOEXPERIMENT=simd` flag, and on amd64. +This page lists all operations on slices, available in the `exp/simd` sub-package. These helpers use **AVX** (128-bit), **AVX2** (256-bit) or **AVX512** (512-bit) SIMD when built with Go 1.26+, the `GOEXPERIMENT=simd` flag, and on amd64. :::warning Unstable API SIMD helpers are experimental. The API may break in the future. @@ -26,7 +26,7 @@ Benchmarks show that running SIMD operators on small datasets is slower: ```txt BenchmarkSumInt8/small/Fallback-lo-4 203616572 5.875 ns/op -BenchmarkSumInt8/small/SSE-x16-4 100000000 12.04 ns/op +BenchmarkSumInt8/small/AVX-x16-4 100000000 12.04 ns/op BenchmarkSumInt8/small/AVX2-x32-4 64041816 17.93 ns/op BenchmarkSumInt8/small/AVX512-x64-4 26947528 44.75 ns/op ``` @@ -35,7 +35,7 @@ But much much faster on big datasets: ```txt BenchmarkSumInt8/xlarge/Fallback-lo-4 247677 4860 ns/op -BenchmarkSumInt8/xlarge/SSE-x16-4 3851040 311.4 ns/op +BenchmarkSumInt8/xlarge/AVX-x16-4 3851040 311.4 ns/op BenchmarkSumInt8/xlarge/AVX2-x32-4 7100002 169.2 ns/op BenchmarkSumInt8/xlarge/AVX512-x64-4 10107534 118.1 ns/op ``` diff --git a/exp/simd/BENCHMARK.md b/exp/simd/BENCHMARK.md index 1a5877c..d6cf100 100644 --- a/exp/simd/BENCHMARK.md +++ b/exp/simd/BENCHMARK.md @@ -6,7 +6,7 @@ Benchmarks show that running SIMD operations on small datasets is slower: ```txt BenchmarkSumInt8/small/Fallback-lo-2 248740710 5.218 ns/op -BenchmarkSumInt8/small/SSE-x16-2 126181464 9.485 ns/op +BenchmarkSumInt8/small/AVX-x16-2 126181464 9.485 ns/op BenchmarkSumInt8/small/AVX2-x32-2 73059427 14.44 ns/op BenchmarkSumInt8/small/AVX512-x64-2 49913169 24.41 ns/op ``` @@ -15,7 +15,7 @@ But SIMD is much faster on large 
datasets: ```txt BenchmarkSumInt8/xlarge/Fallback-lo-2 273898 4383 ns/op -BenchmarkSumInt8/xlarge/SSE-x16-2 6928408 173.1 ns/op +BenchmarkSumInt8/xlarge/AVX-x16-2 6928408 173.1 ns/op BenchmarkSumInt8/xlarge/AVX2-x32-2 12639586 94.09 ns/op BenchmarkSumInt8/xlarge/AVX512-x64-2 13509693 89.67 ns/op ``` @@ -50,397 +50,397 @@ ok github.com/samber/lo/exp/simd 596.213s | Benchmark | Iterations | Time/op | Bytes/op | Allocs/op | | ---------------------------------------------- | ---------- | ----------- | -------- | ----------- | -| BenchmarkContainsInt8/tiny/SSE-x16-2 | 312359204 | 3.625 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/tiny/AVX2-x32-2 | 277194441 | 4.531 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/tiny/AVX512-x16-2 | 312359204 | 3.625 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/tiny/AVX512-x32-2 | 277194441 | 4.531 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt8/tiny/AVX512-x64-2 | 336853209 | 3.401 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/small/SSE-x16-2 | 449132103 | 2.670 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/small/AVX2-x32-2 | 148648339 | 8.332 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/small/AVX512-x16-2 | 449132103 | 2.670 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/small/AVX512-x32-2 | 148648339 | 8.332 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt8/small/AVX512-x64-2 | 143124861 | 7.982 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/medium/SSE-x16-2 | 276816714 | 4.302 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/medium/AVX2-x32-2 | 345774957 | 3.529 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/medium/AVX512-x16-2 | 276816714 | 4.302 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/medium/AVX512-x32-2 | 345774957 | 3.529 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt8/medium/AVX512-x64-2 | 449868722 | 2.669 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/large/SSE-x16-2 | 100000000 | 10.68 
ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/large/AVX2-x32-2 | 172934200 | 6.941 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/large/AVX512-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/large/AVX512-x32-2 | 172934200 | 6.941 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt8/large/AVX512-x64-2 | 280992625 | 4.384 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/xlarge/SSE-x16-2 | 187189599 | 6.203 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/xlarge/AVX2-x32-2 | 274289563 | 4.042 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/xlarge/AVX512-x16-2 | 187189599 | 6.203 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/xlarge/AVX512-x32-2 | 274289563 | 4.042 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt8/xlarge/AVX512-x64-2 | 375048555 | 2.953 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/massive/SSE-x16-2 | 86434948 | 14.02 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8/massive/AVX2-x32-2 | 153742346 | 8.012 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/massive/AVX512-x16-2 | 86434948 | 14.02 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8/massive/AVX512-x32-2 | 153742346 | 8.012 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt8/massive/AVX512-x64-2 | 259404483 | 5.214 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/tiny/SSE-x8-2 | 270309470 | 4.315 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/tiny/AVX2-x16-2 | 264874646 | 4.281 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/tiny/AVX512-x8-2 | 270309470 | 4.315 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/tiny/AVX512-x16-2 | 264874646 | 4.281 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt16/tiny/AVX512-x32-2 | 328810479 | 3.593 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/small/SSE-x8-2 | 374742561 | 3.206 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/small/AVX2-x16-2 | 449838870 | 2.678 ns/op | 0 B/op | 0 
allocs/op | +| BenchmarkContainsInt16/small/AVX512-x8-2 | 374742561 | 3.206 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/small/AVX512-x16-2 | 449838870 | 2.678 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt16/small/AVX512-x32-2 | 143845734 | 8.484 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/medium/SSE-x8-2 | 185415590 | 6.448 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/medium/AVX2-x16-2 | 273780868 | 4.268 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/medium/AVX512-x8-2 | 185415590 | 6.448 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/medium/AVX512-x16-2 | 273780868 | 4.268 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt16/medium/AVX512-x32-2 | 350067484 | 3.431 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/large/SSE-x8-2 | 61109778 | 19.66 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/large/AVX2-x16-2 | 100000000 | 10.74 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/large/AVX512-x8-2 | 61109778 | 19.66 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/large/AVX512-x16-2 | 100000000 | 10.74 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt16/large/AVX512-x32-2 | 182886646 | 6.575 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/xlarge/SSE-x8-2 | 15220682 | 71.53 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/xlarge/AVX2-x16-2 | 31876572 | 37.57 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/xlarge/AVX512-x8-2 | 15220682 | 71.53 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/xlarge/AVX512-x16-2 | 31876572 | 37.57 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt16/xlarge/AVX512-x32-2 | 61992217 | 19.55 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/massive/SSE-x8-2 | 4372000 | 262.8 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt16/massive/AVX2-x16-2 | 9019658 | 131.1 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt16/massive/AVX512-x8-2 | 4372000 | 262.8 ns/op | 0 B/op | 0 allocs/op | 
+| BenchmarkContainsInt16/massive/AVX512-x16-2 | 9019658 | 131.1 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt16/massive/AVX512-x32-2 | 16568430 | 74.25 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/tiny/SSE-x4-2 | 499209442 | 2.406 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/tiny/AVX2-x8-2 | 350479609 | 3.433 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/tiny/AVX512-x4-2 | 499209442 | 2.406 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/tiny/AVX512-x8-2 | 350479609 | 3.433 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt32/tiny/AVX512-x16-2 | 280918554 | 4.309 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/small/SSE-x4-2 | 299561596 | 4.028 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/small/AVX2-x8-2 | 374064310 | 3.205 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/small/AVX512-x4-2 | 299561596 | 4.028 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/small/AVX512-x8-2 | 374064310 | 3.205 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt32/small/AVX512-x16-2 | 499219765 | 2.418 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/medium/SSE-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/medium/AVX2-x8-2 | 187391635 | 6.403 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/medium/AVX512-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/medium/AVX512-x8-2 | 187391635 | 6.403 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt32/medium/AVX512-x16-2 | 307955800 | 3.875 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/large/SSE-x4-2 | 33256420 | 36.05 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/large/AVX2-x8-2 | 62421526 | 19.23 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/large/AVX512-x4-2 | 33256420 | 36.05 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/large/AVX512-x8-2 | 62421526 | 19.23 ns/op | 0 B/op | 0 allocs/op | | 
BenchmarkContainsInt32/large/AVX512-x16-2 | 100000000 | 10.36 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/xlarge/SSE-x4-2 | 8328856 | 144.9 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/xlarge/AVX2-x8-2 | 17039037 | 71.14 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/xlarge/AVX512-x4-2 | 8328856 | 144.9 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/xlarge/AVX512-x8-2 | 17039037 | 71.14 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt32/xlarge/AVX512-x16-2 | 28740241 | 41.77 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/massive/SSE-x4-2 | 3525885 | 332.3 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt32/massive/AVX2-x8-2 | 7318027 | 164.5 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/massive/AVX512-x4-2 | 3525885 | 332.3 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt32/massive/AVX512-x8-2 | 7318027 | 164.5 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt32/massive/AVX512-x16-2 | 12181366 | 99.08 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/tiny/SSE-x2-2 | 409014308 | 2.934 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/tiny/AVX2-x4-2 | 449210791 | 2.667 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/tiny/AVX512-x2-2 | 409014308 | 2.934 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/tiny/AVX512-x4-2 | 449210791 | 2.667 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt64/tiny/AVX512-x8-2 | 280998146 | 4.293 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/small/SSE-x2-2 | 195631429 | 6.172 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/small/AVX2-x4-2 | 281272394 | 4.308 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/small/AVX512-x2-2 | 195631429 | 6.172 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/small/AVX512-x4-2 | 281272394 | 4.308 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt64/small/AVX512-x8-2 | 408933924 | 3.044 ns/op | 0 B/op | 0 allocs/op | -| 
BenchmarkContainsInt64/medium/SSE-x2-2 | 63006909 | 18.94 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/medium/AVX2-x4-2 | 100000000 | 10.67 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/medium/AVX512-x2-2 | 63006909 | 18.94 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/medium/AVX512-x4-2 | 100000000 | 10.67 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt64/medium/AVX512-x8-2 | 197411126 | 6.016 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/large/SSE-x2-2 | 17098578 | 70.57 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/large/AVX2-x4-2 | 32558013 | 37.07 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/large/AVX512-x2-2 | 17098578 | 70.57 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/large/AVX512-x4-2 | 32558013 | 37.07 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt64/large/AVX512-x8-2 | 57629485 | 20.94 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/xlarge/SSE-x2-2 | 4286155 | 281.8 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/xlarge/AVX2-x4-2 | 8344772 | 143.8 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/xlarge/AVX512-x2-2 | 4286155 | 281.8 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/xlarge/AVX512-x4-2 | 8344772 | 143.8 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt64/xlarge/AVX512-x8-2 | 14428276 | 83.14 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/massive/SSE-x2-2 | 1000000 | 1012 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64/massive/AVX2-x4-2 | 2350525 | 510.6 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/massive/AVX512-x2-2 | 1000000 | 1012 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64/massive/AVX512-x4-2 | 2350525 | 510.6 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt64/massive/AVX512-x8-2 | 3773523 | 318.1 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/tiny/SSE-x16-2 | 338880315 | 3.332 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/tiny/AVX2-x32-2 
| 320784217 | 3.559 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/tiny/AVX512-x16-2 | 338880315 | 3.332 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/tiny/AVX512-x32-2 | 320784217 | 3.559 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint8/tiny/AVX512-x64-2 | 341599854 | 3.331 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/small/SSE-x16-2 | 449579424 | 2.670 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/small/AVX2-x32-2 | 140368142 | 8.648 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/small/AVX512-x16-2 | 449579424 | 2.670 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/small/AVX512-x32-2 | 140368142 | 8.648 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint8/small/AVX512-x64-2 | 146828888 | 8.182 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/medium/SSE-x16-2 | 374443974 | 3.472 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/medium/AVX2-x32-2 | 449271607 | 2.672 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/medium/AVX512-x16-2 | 374443974 | 3.472 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/medium/AVX512-x32-2 | 449271607 | 2.672 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint8/medium/AVX512-x64-2 | 598525731 | 2.018 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/large/SSE-x16-2 | 254828565 | 4.956 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/large/AVX2-x32-2 | 407777484 | 2.938 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/large/AVX512-x16-2 | 254828565 | 4.956 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/large/AVX512-x32-2 | 407777484 | 2.938 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint8/large/AVX512-x64-2 | 443472316 | 2.666 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/xlarge/SSE-x16-2 | 162196827 | 7.867 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/xlarge/AVX2-x32-2 | 268324950 | 4.518 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/xlarge/AVX512-x16-2 | 
162196827 | 7.867 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/xlarge/AVX512-x32-2 | 268324950 | 4.518 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint8/xlarge/AVX512-x64-2 | 400437789 | 2.952 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/massive/SSE-x16-2 | 214548872 | 5.640 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint8/massive/AVX2-x32-2 | 348431553 | 3.391 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/massive/AVX512-x16-2 | 214548872 | 5.640 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint8/massive/AVX512-x32-2 | 348431553 | 3.391 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint8/massive/AVX512-x64-2 | 459781908 | 2.455 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/tiny/SSE-x8-2 | 276271912 | 4.297 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/tiny/AVX2-x16-2 | 281145528 | 4.270 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/tiny/AVX512-x8-2 | 276271912 | 4.297 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/tiny/AVX512-x16-2 | 281145528 | 4.270 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint16/tiny/AVX512-x32-2 | 315343911 | 3.667 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/small/SSE-x8-2 | 374632351 | 3.204 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/small/AVX2-x16-2 | 449355727 | 2.670 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/small/AVX512-x8-2 | 374632351 | 3.204 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/small/AVX512-x16-2 | 449355727 | 2.670 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint16/small/AVX512-x32-2 | 138088146 | 8.395 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/medium/SSE-x8-2 | 187276191 | 6.582 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/medium/AVX2-x16-2 | 281107980 | 4.306 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/medium/AVX512-x8-2 | 187276191 | 6.582 ns/op | 0 B/op | 0 allocs/op | +| 
BenchmarkContainsUint16/medium/AVX512-x16-2 | 281107980 | 4.306 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint16/medium/AVX512-x32-2 | 358850328 | 3.516 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/large/SSE-x8-2 | 59025931 | 19.98 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/large/AVX2-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/large/AVX512-x8-2 | 59025931 | 19.98 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/large/AVX512-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint16/large/AVX512-x32-2 | 179631354 | 6.569 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/xlarge/SSE-x8-2 | 16576267 | 71.63 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/xlarge/AVX2-x16-2 | 32578981 | 36.96 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/xlarge/AVX512-x8-2 | 16576267 | 71.63 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/xlarge/AVX512-x16-2 | 32578981 | 36.96 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint16/xlarge/AVX512-x32-2 | 61464870 | 19.44 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/massive/SSE-x8-2 | 2153736 | 557.4 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint16/massive/AVX2-x16-2 | 4225728 | 281.3 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/massive/AVX512-x8-2 | 2153736 | 557.4 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint16/massive/AVX512-x16-2 | 4225728 | 281.3 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint16/massive/AVX512-x32-2 | 7829936 | 145.1 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/tiny/SSE-x4-2 | 499390296 | 2.403 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/tiny/AVX2-x8-2 | 362964080 | 3.342 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/tiny/AVX512-x4-2 | 499390296 | 2.403 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/tiny/AVX512-x8-2 | 362964080 | 3.342 ns/op | 0 B/op | 0 allocs/op | | 
BenchmarkContainsUint32/tiny/AVX512-x16-2 | 281063364 | 4.268 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/small/SSE-x4-2 | 293867554 | 4.004 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/small/AVX2-x8-2 | 374510434 | 3.203 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/small/AVX512-x4-2 | 293867554 | 4.004 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/small/AVX512-x8-2 | 374510434 | 3.203 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint32/small/AVX512-x16-2 | 499714206 | 2.402 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/medium/SSE-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/medium/AVX2-x8-2 | 187258657 | 6.405 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/medium/AVX512-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/medium/AVX512-x8-2 | 187258657 | 6.405 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint32/medium/AVX512-x16-2 | 312999210 | 3.881 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/large/SSE-x4-2 | 33298366 | 36.02 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/large/AVX2-x8-2 | 62409421 | 19.23 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/large/AVX512-x4-2 | 33298366 | 36.02 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/large/AVX512-x8-2 | 62409421 | 19.23 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint32/large/AVX512-x16-2 | 100000000 | 10.10 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/xlarge/SSE-x4-2 | 7948898 | 143.6 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/xlarge/AVX2-x8-2 | 17021738 | 70.49 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/xlarge/AVX512-x4-2 | 7948898 | 143.6 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/xlarge/AVX512-x8-2 | 17021738 | 70.49 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint32/xlarge/AVX512-x16-2 | 28742320 | 41.77 ns/op | 0 B/op | 0 allocs/op | -| 
BenchmarkContainsUint32/massive/SSE-x4-2 | 1595774 | 751.1 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint32/massive/AVX2-x8-2 | 3094242 | 381.1 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/massive/AVX512-x4-2 | 1595774 | 751.1 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint32/massive/AVX512-x8-2 | 3094242 | 381.1 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint32/massive/AVX512-x16-2 | 5080051 | 238.3 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/tiny/SSE-x2-2 | 374760351 | 3.203 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/tiny/AVX2-x4-2 | 498763054 | 2.419 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/tiny/AVX512-x2-2 | 374760351 | 3.203 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/tiny/AVX512-x4-2 | 498763054 | 2.419 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint64/tiny/AVX512-x8-2 | 319635274 | 3.582 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/small/SSE-x2-2 | 187032452 | 6.447 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/small/AVX2-x4-2 | 299546244 | 4.009 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/small/AVX512-x2-2 | 187032452 | 6.447 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/small/AVX512-x4-2 | 299546244 | 4.009 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint64/small/AVX512-x8-2 | 373937659 | 3.207 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/medium/SSE-x2-2 | 62413118 | 19.23 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/medium/AVX2-x4-2 | 113978791 | 10.42 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/medium/AVX512-x2-2 | 62413118 | 19.23 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/medium/AVX512-x4-2 | 113978791 | 10.42 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint64/medium/AVX512-x8-2 | 186965330 | 6.484 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/large/SSE-x2-2 | 17005768 | 70.57 ns/op | 0 B/op | 0 allocs/op | -| 
BenchmarkContainsUint64/large/AVX2-x4-2 | 33286495 | 36.69 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/large/AVX512-x2-2 | 17005768 | 70.57 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/large/AVX512-x4-2 | 33286495 | 36.69 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint64/large/AVX512-x8-2 | 61486065 | 19.93 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/xlarge/SSE-x2-2 | 4154370 | 280.8 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/xlarge/AVX2-x4-2 | 8371358 | 148.2 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/xlarge/AVX512-x2-2 | 4154370 | 280.8 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/xlarge/AVX512-x4-2 | 8371358 | 148.2 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint64/xlarge/AVX512-x8-2 | 14193795 | 72.36 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/massive/SSE-x2-2 | 1773937 | 676.4 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsUint64/massive/AVX2-x4-2 | 3500168 | 343.0 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/massive/AVX512-x2-2 | 1773937 | 676.4 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsUint64/massive/AVX512-x4-2 | 3500168 | 343.0 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsUint64/massive/AVX512-x8-2 | 7097266 | 249.3 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/tiny/SSE-x4-2 | 410522160 | 2.675 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/tiny/AVX2-x8-2 | 308565882 | 3.814 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/tiny/AVX512-x4-2 | 410522160 | 2.675 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/tiny/AVX512-x8-2 | 308565882 | 3.814 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat32/tiny/AVX512-x16-2 | 315331897 | 3.755 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/small/SSE-x4-2 | 278219434 | 4.642 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/small/AVX2-x8-2 | 362945481 | 3.287 ns/op | 0 B/op | 0 allocs/op | +| 
BenchmarkContainsFloat32/small/AVX512-x4-2 | 278219434 | 4.642 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/small/AVX512-x8-2 | 362945481 | 3.287 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat32/small/AVX512-x16-2 | 408523153 | 2.941 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/medium/SSE-x4-2 | 100000000 | 10.77 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/medium/AVX2-x8-2 | 186186376 | 6.409 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/medium/AVX512-x4-2 | 100000000 | 10.77 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/medium/AVX512-x8-2 | 186186376 | 6.409 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat32/medium/AVX512-x16-2 | 264255108 | 4.619 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/large/SSE-x4-2 | 33028701 | 36.27 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/large/AVX2-x8-2 | 62465360 | 19.53 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/large/AVX512-x4-2 | 33028701 | 36.27 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/large/AVX512-x8-2 | 62465360 | 19.53 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat32/large/AVX512-x16-2 | 108213310 | 10.95 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/xlarge/SSE-x4-2 | 8359381 | 143.6 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/xlarge/AVX2-x8-2 | 17042701 | 70.46 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/xlarge/AVX512-x4-2 | 8359381 | 143.6 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/xlarge/AVX512-x8-2 | 17042701 | 70.46 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat32/xlarge/AVX512-x16-2 | 31806921 | 37.13 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/massive/SSE-x4-2 | 1000000 | 1100 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat32/massive/AVX2-x8-2 | 2164672 | 554.4 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat32/massive/AVX512-x4-2 | 1000000 | 1100 ns/op | 0 B/op | 0 
allocs/op | +| BenchmarkContainsFloat32/massive/AVX512-x8-2 | 2164672 | 554.4 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat32/massive/AVX512-x16-2 | 4201453 | 293.9 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/tiny/SSE-x2-2 | 362183925 | 3.223 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/tiny/AVX2-x4-2 | 449021466 | 2.687 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/tiny/AVX512-x2-2 | 362183925 | 3.223 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/tiny/AVX512-x4-2 | 449021466 | 2.687 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat64/tiny/AVX512-x8-2 | 320176149 | 3.820 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/small/SSE-x2-2 | 187139116 | 6.415 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/small/AVX2-x4-2 | 280722585 | 4.300 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/small/AVX512-x2-2 | 187139116 | 6.415 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/small/AVX512-x4-2 | 280722585 | 4.300 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat64/small/AVX512-x8-2 | 335670502 | 3.472 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/medium/SSE-x2-2 | 62343927 | 19.23 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/medium/AVX2-x4-2 | 112332902 | 10.69 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/medium/AVX512-x2-2 | 62343927 | 19.23 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/medium/AVX512-x4-2 | 112332902 | 10.69 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat64/medium/AVX512-x8-2 | 179610780 | 6.741 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/large/SSE-x2-2 | 16996959 | 70.51 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/large/AVX2-x4-2 | 33017950 | 36.29 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/large/AVX512-x2-2 | 16996959 | 70.51 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/large/AVX512-x4-2 | 33017950 | 36.29 ns/op | 0 
B/op | 0 allocs/op | | BenchmarkContainsFloat64/large/AVX512-x8-2 | 60322328 | 19.73 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/xlarge/SSE-x2-2 | 4141281 | 282.9 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/xlarge/AVX2-x4-2 | 7856590 | 145.0 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/xlarge/AVX512-x2-2 | 4141281 | 282.9 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/xlarge/AVX512-x4-2 | 7856590 | 145.0 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat64/xlarge/AVX512-x8-2 | 16623739 | 72.06 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/massive/SSE-x2-2 | 541202 | 2195 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsFloat64/massive/AVX2-x4-2 | 1000000 | 1158 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/massive/AVX512-x2-2 | 541202 | 2195 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsFloat64/massive/AVX512-x4-2 | 1000000 | 1158 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsFloat64/massive/AVX512-x8-2 | 2115301 | 560.4 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsWorstCase/SSE-x4-2 | 7651734 | 145.6 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsWorstCase/AVX2-x8-2 | 14921599 | 70.49 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsWorstCase/AVX512-x4-2 | 7651734 | 145.6 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsWorstCase/AVX512-x8-2 | 14921599 | 70.49 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsWorstCase/AVX512-x16-2 | 28708478 | 41.38 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsBestCase/SSE-x4-2 | 534237578 | 2.136 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsBestCase/AVX2-x8-2 | 561252645 | 2.159 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsBestCase/AVX512-x4-2 | 534237578 | 2.136 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsBestCase/AVX512-x8-2 | 561252645 | 2.159 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsBestCase/AVX512-x16-2 | 560396454 | 2.137 ns/op | 0 B/op | 0 allocs/op | -| 
BenchmarkContainsNegative/tiny/SSE-x4-2 | 499649139 | 2.401 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/tiny/AVX2-x8-2 | 329743240 | 3.421 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/tiny/AVX512-x4-2 | 499649139 | 2.401 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/tiny/AVX512-x8-2 | 329743240 | 3.421 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsNegative/tiny/AVX512-x16-2 | 280516392 | 4.276 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/small/SSE-x4-2 | 299373171 | 4.006 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/small/AVX2-x8-2 | 374407988 | 3.267 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/small/AVX512-x4-2 | 299373171 | 4.006 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/small/AVX512-x8-2 | 374407988 | 3.267 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsNegative/small/AVX512-x16-2 | 486948346 | 2.424 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/medium/SSE-x4-2 | 100000000 | 10.41 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/medium/AVX2-x8-2 | 182899621 | 6.412 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/medium/AVX512-x4-2 | 100000000 | 10.41 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/medium/AVX512-x8-2 | 182899621 | 6.412 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsNegative/medium/AVX512-x16-2 | 311969776 | 3.829 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/large/SSE-x4-2 | 33309816 | 36.04 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/large/AVX2-x8-2 | 59912676 | 19.74 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/large/AVX512-x4-2 | 33309816 | 36.04 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/large/AVX512-x8-2 | 59912676 | 19.74 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsNegative/large/AVX512-x16-2 | 100000000 | 10.65 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/xlarge/SSE-x4-2 | 8346818 | 143.7 
ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/xlarge/AVX2-x8-2 | 16980399 | 70.54 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/xlarge/AVX512-x4-2 | 8346818 | 143.7 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/xlarge/AVX512-x8-2 | 16980399 | 70.54 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsNegative/xlarge/AVX512-x16-2 | 28676455 | 42.94 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/massive/SSE-x4-2 | 1000000 | 1151 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsNegative/massive/AVX2-x8-2 | 2161594 | 555.2 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/massive/AVX512-x4-2 | 1000000 | 1151 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsNegative/massive/AVX512-x8-2 | 2161594 | 555.2 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsNegative/massive/AVX512-x16-2 | 3549094 | 350.5 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8ByWidth/SSE-x16-2 | 331533141 | 3.222 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt8ByWidth/AVX2-x32-2 | 408741681 | 3.193 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8ByWidth/AVX512-x16-2 | 331533141 | 3.222 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt8ByWidth/AVX512-x32-2 | 408741681 | 3.193 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt8ByWidth/AVX512-x64-2 | 365382873 | 3.241 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64SteadyState/SSE-x2-2 | 5722603 | 211.5 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkContainsInt64SteadyState/AVX2-x4-2 | 11711869 | 103.1 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64SteadyState/AVX512-x2-2 | 5722603 | 211.5 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkContainsInt64SteadyState/AVX512-x4-2 | 11711869 | 103.1 ns/op | 0 B/op | 0 allocs/op | | BenchmarkContainsInt64SteadyState/AVX512-x8-2 | 19671033 | 61.36 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/small/Fallback-lo-2 | 248740710 | 5.218 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt8/small/SSE-x16-2 | 
126181464 | 9.485 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt8/small/AVX-x16-2 | 126181464 | 9.485 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/small/AVX2-x32-2 | 73059427 | 14.44 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/small/AVX512-x64-2 | 49913169 | 24.41 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/medium/Fallback-lo-2 | 17278075 | 69.96 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt8/medium/SSE-x16-2 | 100000000 | 10.58 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt8/medium/AVX-x16-2 | 100000000 | 10.58 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/medium/AVX2-x32-2 | 91620999 | 13.10 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/medium/AVX512-x64-2 | 54082130 | 22.20 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/large/Fallback-lo-2 | 2006178 | 576.3 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt8/large/SSE-x16-2 | 41836690 | 27.82 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt8/large/AVX-x16-2 | 41836690 | 27.82 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/large/AVX2-x32-2 | 51735399 | 23.04 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/large/AVX512-x64-2 | 40861586 | 29.40 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/xlarge/Fallback-lo-2 | 273898 | 4383 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt8/xlarge/SSE-x16-2 | 6928408 | 173.1 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt8/xlarge/AVX-x16-2 | 6928408 | 173.1 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/xlarge/AVX2-x32-2 | 12639586 | 94.09 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8/xlarge/AVX512-x64-2 | 13509693 | 89.67 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/small/Fallback-lo-2 | 249444103 | 5.012 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt16/small/SSE-x8-2 | 244927230 | 5.052 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt16/small/AVX-x8-2 | 244927230 | 5.052 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/small/AVX2-x16-2 | 122088517 | 9.715 ns/op | 0 B/op | 0 allocs/op | | 
BenchmarkSumInt16/small/AVX512-x32-2 | 54098370 | 22.00 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/medium/Fallback-lo-2 | 15782683 | 72.54 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt16/medium/SSE-x8-2 | 100000000 | 10.51 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt16/medium/AVX-x8-2 | 100000000 | 10.51 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/medium/AVX2-x16-2 | 100000000 | 10.75 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/medium/AVX512-x32-2 | 56147455 | 21.38 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/large/Fallback-lo-2 | 2173214 | 598.1 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt16/large/SSE-x8-2 | 26319481 | 44.73 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt16/large/AVX-x8-2 | 26319481 | 44.73 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/large/AVX2-x16-2 | 40459519 | 27.91 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/large/AVX512-x32-2 | 39359752 | 31.28 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/xlarge/Fallback-lo-2 | 273932 | 4382 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt16/xlarge/SSE-x8-2 | 3557265 | 331.2 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt16/xlarge/AVX-x8-2 | 3557265 | 331.2 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/xlarge/AVX2-x16-2 | 6930166 | 173.4 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt16/xlarge/AVX512-x32-2 | 12100244 | 97.01 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/small/Fallback-lo-2 | 249566539 | 4.808 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt32/small/SSE-x4-2 | 259250019 | 4.581 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt32/small/AVX-x4-2 | 259250019 | 4.581 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/small/AVX2-x8-2 | 232858933 | 5.404 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/small/AVX512-x16-2 | 100000000 | 11.18 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/medium/Fallback-lo-2 | 17274441 | 72.28 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt32/medium/SSE-x4-2 | 58400258 | 20.56 
ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt32/medium/AVX-x4-2 | 58400258 | 20.56 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/medium/AVX2-x8-2 | 110851756 | 10.67 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/medium/AVX512-x16-2 | 106593603 | 11.25 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/large/Fallback-lo-2 | 2171817 | 551.8 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt32/large/SSE-x4-2 | 8270253 | 146.0 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt32/large/AVX-x4-2 | 8270253 | 146.0 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/large/AVX2-x8-2 | 22234518 | 46.06 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/large/AVX512-x16-2 | 37448763 | 32.31 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/xlarge/Fallback-lo-2 | 273699 | 4559 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt32/xlarge/SSE-x4-2 | 1000000 | 1102 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt32/xlarge/AVX-x4-2 | 1000000 | 1102 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/xlarge/AVX2-x8-2 | 3586887 | 332.4 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt32/xlarge/AVX512-x16-2 | 7214437 | 170.5 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/small/Fallback-lo-2 | 417473124 | 2.886 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt64/small/SSE-x2-2 | 287521756 | 4.169 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt64/small/AVX-x2-2 | 287521756 | 4.169 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/small/AVX2-x4-2 | 277783513 | 4.311 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/small/AVX512-x8-2 | 172823103 | 6.993 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/medium/Fallback-lo-2 | 34022653 | 35.27 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt64/medium/SSE-x2-2 | 49241248 | 24.05 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt64/medium/AVX-x2-2 | 49241248 | 24.05 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/medium/AVX2-x4-2 | 78897342 | 14.58 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/medium/AVX512-x8-2 | 
84361297 | 14.03 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/large/Fallback-lo-2 | 3680988 | 282.3 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt64/large/SSE-x2-2 | 6293607 | 170.7 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt64/large/AVX-x2-2 | 6293607 | 170.7 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/large/AVX2-x4-2 | 12739849 | 91.28 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/large/AVX512-x8-2 | 25508130 | 46.30 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/xlarge/Fallback-lo-2 | 546321 | 2283 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt64/xlarge/SSE-x2-2 | 877434 | 1289 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt64/xlarge/AVX-x2-2 | 877434 | 1289 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/xlarge/AVX2-x4-2 | 1845892 | 650.4 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64/xlarge/AVX512-x8-2 | 2148355 | 550.8 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/small/Fallback-lo-2 | 411100770 | 2.951 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumFloat32/small/SSE-x4-2 | 264013596 | 4.572 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumFloat32/small/AVX-x4-2 | 264013596 | 4.572 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/small/AVX2-x8-2 | 174478266 | 6.911 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/small/AVX512-x16-2 | 61182673 | 19.78 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/medium/Fallback-lo-2 | 33815070 | 35.68 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumFloat32/medium/SSE-x4-2 | 58238188 | 20.66 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumFloat32/medium/AVX-x4-2 | 58238188 | 20.66 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/medium/AVX2-x8-2 | 91316544 | 13.26 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/medium/AVX512-x16-2 | 80046624 | 15.08 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/large/Fallback-lo-2 | 4304168 | 278.7 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumFloat32/large/SSE-x4-2 | 6198957 | 184.8 ns/op | 0 B/op | 0 allocs/op | +| 
BenchmarkSumFloat32/large/AVX-x4-2 | 6198957 | 184.8 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/large/AVX2-x8-2 | 12260169 | 86.60 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/large/AVX512-x16-2 | 22147112 | 45.34 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/xlarge/Fallback-lo-2 | 546901 | 2193 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumFloat32/xlarge/SSE-x4-2 | 736503 | 1622 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumFloat32/xlarge/AVX-x4-2 | 736503 | 1622 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/xlarge/AVX2-x8-2 | 1493887 | 810.5 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat32/xlarge/AVX512-x16-2 | 2959298 | 393.4 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/small/Fallback-lo-2 | 410778070 | 3.043 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumFloat64/small/SSE-x2-2 | 254156008 | 4.714 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumFloat64/small/AVX-x2-2 | 254156008 | 4.714 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/small/AVX2-x4-2 | 227604434 | 5.323 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/small/AVX512-x8-2 | 170099748 | 7.115 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/medium/Fallback-lo-2 | 33646345 | 35.78 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumFloat64/medium/SSE-x2-2 | 32931152 | 34.92 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumFloat64/medium/AVX-x2-2 | 32931152 | 34.92 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/medium/AVX2-x4-2 | 75389446 | 16.79 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/medium/AVX512-x8-2 | 89826181 | 13.33 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/large/Fallback-lo-2 | 4293837 | 302.8 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumFloat64/large/SSE-x2-2 | 3146601 | 381.4 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumFloat64/large/AVX-x2-2 | 3146601 | 381.4 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/large/AVX2-x4-2 | 6373876 | 184.3 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/large/AVX512-x8-2 | 
13464712 | 88.96 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/xlarge/Fallback-lo-2 | 545764 | 2193 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumFloat64/xlarge/SSE-x2-2 | 368846 | 3390 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumFloat64/xlarge/AVX-x2-2 | 368846 | 3390 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/xlarge/AVX2-x4-2 | 709940 | 1613 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumFloat64/xlarge/AVX512-x8-2 | 1480214 | 808.6 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/small/Fallback-lo-2 | 411529147 | 3.043 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMeanInt32/small/SSE-x4-2 | 204428401 | 5.872 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMeanInt32/small/AVX-x4-2 | 204428401 | 5.872 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/small/AVX2-x8-2 | 187573928 | 6.214 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/small/AVX512-x16-2 | 98346700 | 12.12 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/medium/Fallback-lo-2 | 33481442 | 35.72 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMeanInt32/medium/SSE-x4-2 | 52042394 | 22.12 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMeanInt32/medium/AVX-x4-2 | 52042394 | 22.12 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/medium/AVX2-x8-2 | 96288541 | 13.44 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/medium/AVX512-x16-2 | 100995780 | 11.90 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/large/Fallback-lo-2 | 4296570 | 289.9 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMeanInt32/large/SSE-x4-2 | 7743022 | 146.4 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMeanInt32/large/AVX-x4-2 | 7743022 | 146.4 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/large/AVX2-x8-2 | 24355988 | 46.26 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/large/AVX512-x16-2 | 37322655 | 32.89 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/xlarge/Fallback-lo-2 | 547008 | 2193 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMeanInt32/xlarge/SSE-x4-2 | 1087246 | 1112 ns/op | 0 B/op | 0 allocs/op | +| 
BenchmarkMeanInt32/xlarge/AVX-x4-2 | 1087246 | 1112 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/xlarge/AVX2-x8-2 | 1386868 | 761.9 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanInt32/xlarge/AVX512-x16-2 | 7166142 | 170.7 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/small/Fallback-lo-2 | 349760005 | 3.449 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMeanFloat64/small/SSE-x2-2 | 189674538 | 6.293 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMeanFloat64/small/AVX-x2-2 | 189674538 | 6.293 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/small/AVX2-x4-2 | 159228600 | 7.531 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/small/AVX512-x8-2 | 110196433 | 10.89 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/medium/Fallback-lo-2 | 32968618 | 36.17 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMeanFloat64/medium/SSE-x2-2 | 30863817 | 37.69 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMeanFloat64/medium/AVX-x2-2 | 30863817 | 37.69 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/medium/AVX2-x4-2 | 62428772 | 19.66 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/medium/AVX512-x8-2 | 77140984 | 15.54 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/large/Fallback-lo-2 | 4281057 | 280.6 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMeanFloat64/large/SSE-x2-2 | 3057349 | 389.4 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMeanFloat64/large/AVX-x2-2 | 3057349 | 389.4 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/large/AVX2-x4-2 | 6509438 | 185.9 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/large/AVX512-x8-2 | 12668032 | 93.50 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/xlarge/Fallback-lo-2 | 545898 | 2288 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMeanFloat64/xlarge/SSE-x2-2 | 367671 | 4048 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMeanFloat64/xlarge/AVX-x2-2 | 367671 | 4048 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMeanFloat64/xlarge/AVX2-x4-2 | 739941 | 1621 ns/op | 0 B/op | 0 allocs/op | | 
BenchmarkMeanFloat64/xlarge/AVX512-x8-2 | 1434867 | 811.3 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMinInt32/small/SSE-x4-2 | 312338268 | 3.860 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMinInt32/small/AVX-x4-2 | 312338268 | 3.860 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinInt32/small/AVX2-x8-2 | 238034872 | 5.042 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinInt32/small/AVX512-x16-2 | 152600943 | 6.661 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMinInt32/medium/SSE-x4-2 | 61051266 | 19.73 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMinInt32/medium/AVX-x4-2 | 61051266 | 19.73 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinInt32/medium/AVX2-x8-2 | 91792144 | 13.11 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinInt32/medium/AVX512-x16-2 | 99994540 | 12.18 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMinInt32/large/SSE-x4-2 | 8604774 | 140.5 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMinInt32/large/AVX-x4-2 | 8604774 | 140.5 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinInt32/large/AVX2-x8-2 | 15581037 | 77.56 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinInt32/large/AVX512-x16-2 | 30512421 | 40.24 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMinInt32/xlarge/SSE-x4-2 | 1000000 | 1110 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMinInt32/xlarge/AVX-x4-2 | 1000000 | 1110 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinInt32/xlarge/AVX2-x8-2 | 2158272 | 557.2 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinInt32/xlarge/AVX512-x16-2 | 4253668 | 282.6 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMinFloat64/small/SSE-x2-2 | 264129410 | 4.544 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMinFloat64/small/AVX-x2-2 | 264129410 | 4.544 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinFloat64/small/AVX2-x4-2 | 299587609 | 4.008 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinFloat64/small/AVX512-x8-2 | 100000000 | 10.05 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMinFloat64/medium/SSE-x2-2 | 32778514 | 36.93 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMinFloat64/medium/AVX-x2-2 | 32778514 | 36.93 ns/op | 0 
B/op | 0 allocs/op | | BenchmarkMinFloat64/medium/AVX2-x4-2 | 53356347 | 20.30 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinFloat64/medium/AVX512-x8-2 | 74832976 | 16.21 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMinFloat64/large/SSE-x2-2 | 3863326 | 300.0 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMinFloat64/large/AVX-x2-2 | 3863326 | 300.0 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinFloat64/large/AVX2-x4-2 | 7670576 | 146.5 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinFloat64/large/AVX512-x8-2 | 14017984 | 78.21 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMinFloat64/xlarge/SSE-x2-2 | 492739 | 2195 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMinFloat64/xlarge/AVX-x2-2 | 492739 | 2195 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinFloat64/xlarge/AVX2-x4-2 | 1000000 | 1103 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMinFloat64/xlarge/AVX512-x8-2 | 2145290 | 560.3 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMaxInt32/small/SSE-x4-2 | 306585705 | 3.860 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMaxInt32/small/AVX-x4-2 | 306585705 | 3.860 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxInt32/small/AVX2-x8-2 | 237347997 | 5.086 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxInt32/small/AVX512-x16-2 | 201433966 | 6.130 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMaxInt32/medium/SSE-x4-2 | 60759631 | 19.92 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMaxInt32/medium/AVX-x4-2 | 60759631 | 19.92 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxInt32/medium/AVX2-x8-2 | 90934662 | 13.13 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxInt32/medium/AVX512-x16-2 | 98517944 | 12.18 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMaxInt32/large/SSE-x4-2 | 8590542 | 139.6 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMaxInt32/large/AVX-x4-2 | 8590542 | 139.6 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxInt32/large/AVX2-x8-2 | 15770372 | 77.69 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxInt32/large/AVX512-x16-2 | 30197324 | 39.32 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMaxInt32/xlarge/SSE-x4-2 | 1000000 | 
1104 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMaxInt32/xlarge/AVX-x4-2 | 1000000 | 1104 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxInt32/xlarge/AVX2-x8-2 | 2152038 | 562.1 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxInt32/xlarge/AVX512-x16-2 | 3917990 | 296.7 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMaxFloat64/small/SSE-x2-2 | 249617162 | 4.816 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMaxFloat64/small/AVX-x2-2 | 249617162 | 4.816 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxFloat64/small/AVX2-x4-2 | 207017514 | 5.855 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxFloat64/small/AVX512-x8-2 | 66520290 | 17.74 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMaxFloat64/medium/SSE-x2-2 | 32307492 | 36.92 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMaxFloat64/medium/AVX-x2-2 | 32307492 | 36.92 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxFloat64/medium/AVX2-x4-2 | 57306838 | 20.77 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxFloat64/medium/AVX512-x8-2 | 56911946 | 21.12 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMaxFloat64/large/SSE-x2-2 | 4259366 | 287.1 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMaxFloat64/large/AVX-x2-2 | 4259366 | 287.1 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxFloat64/large/AVX2-x4-2 | 7905420 | 148.9 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxFloat64/large/AVX512-x8-2 | 14100686 | 83.43 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkMaxFloat64/xlarge/SSE-x2-2 | 545378 | 2243 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkMaxFloat64/xlarge/AVX-x2-2 | 545378 | 2243 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxFloat64/xlarge/AVX2-x4-2 | 1000000 | 1113 ns/op | 0 B/op | 0 allocs/op | | BenchmarkMaxFloat64/xlarge/AVX512-x8-2 | 2119741 | 565.7 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8ByWidth/Fallback-lo-2 | 896775 | 1335 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt8ByWidth/SSE-x16-2 | 12557700 | 94.52 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt8ByWidth/AVX-x16-2 | 12557700 | 94.52 ns/op | 0 B/op | 0 allocs/op | | 
BenchmarkSumInt8ByWidth/AVX2-x32-2 | 18702537 | 55.03 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt8ByWidth/AVX512-x64-2 | 21342572 | 56.10 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64SteadyState/Fallback-lo-2 | 513738 | 2195 ns/op | 0 B/op | 0 allocs/op | -| BenchmarkSumInt64SteadyState/SSE-x2-2 | 928376 | 1296 ns/op | 0 B/op | 0 allocs/op | +| BenchmarkSumInt64SteadyState/AVX-x2-2 | 928376 | 1296 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64SteadyState/AVX2-x4-2 | 1836968 | 888.1 ns/op | 0 B/op | 0 allocs/op | | BenchmarkSumInt64SteadyState/AVX512-x8-2 | 2141715 | 551.3 ns/op | 0 B/op | 0 allocs/op | diff --git a/exp/simd/README.md b/exp/simd/README.md index ea7b56f..03322d4 100644 --- a/exp/simd/README.md +++ b/exp/simd/README.md @@ -12,7 +12,7 @@ If you see **SIGILL: illegal instruction** when running tests, the CPU or VM doe ```bash # List SIMD-related flags -grep -E 'avx|sse' /proc/cpuinfo +grep -E 'avx' /proc/cpuinfo # Or with lscpu lscpu | grep -i avx @@ -22,21 +22,22 @@ lscpu | grep -i avx | Tests / code | Required flag(s) | Typical CPUs | | ----------------- | -------------------------- | ----------------------------------------------------------------------- | -| SSE (128-bit) | `sse2` (baseline on amd64) | All amd64 | +| AVX (128-bit) | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ | | AVX2 (256-bit) | `avx2` | Intel Haswell+, AMD Excavator+ | | AVX-512 (512-bit) | `avx512f` | Intel Skylake-X+, some Xeons; many AMD/consumer CPUs do **not** have it | ### What the tests do +- **AVX tests** (128-bit) call `requireAVX(t)` and are **skipped** if the CPU does not support AVX. - **AVX2 tests** call `requireAVX2(t)` and are **skipped** if the CPU does not support AVX2 (no SIGILL). - **AVX-512 tests** (when enabled) should call `requireAVX512(t)` and skip when AVX-512 is not available. So on a machine without AVX2, AVX2 tests will show as skipped instead of crashing. 
-### Run only SSE tests +### Run only AVX tests -If your environment does not support AVX2/AVX-512, you can still run the SSE tests: +If your environment does not support AVX2/AVX-512, you can still run the AVX (128-bit) tests: ```bash -GOEXPERIMENT=simd go test -run SSE ./... +GOEXPERIMENT=simd go test -run AVX ./... ``` diff --git a/exp/simd/cpu_amd64_test.go b/exp/simd/cpu_amd64_test.go index 77a3196..07aab69 100644 --- a/exp/simd/cpu_amd64_test.go +++ b/exp/simd/cpu_amd64_test.go @@ -19,16 +19,25 @@ type skipHelper interface { // How to check if your Linux CPU supports SIMD (avoids SIGILL): // -// grep -E 'avx|sse' /proc/cpuinfo +// grep -E 'avx' /proc/cpuinfo // // Or: lscpu | grep -i avx // // You need: -// - SSE tests (128-bit): sse2 (baseline on amd64), sse4.1/sse4.2 often used +// - AVX tests (128-bit): avx in flags (not baseline on amd64; Intel Sandy Bridge+, AMD Bulldozer+) // - AVX2 tests (256-bit): avx2 in flags // - AVX-512 tests: avx512f (and often avx512bw, avx512vl) // -// If your CPU lacks AVX2 or AVX-512, tests that use them will be skipped automatically. +// If your CPU lacks AVX, AVX2, or AVX-512, tests that use them will be skipped automatically. + +// requireAVX skips the test/benchmark if the CPU does not support AVX (128-bit SIMD). +// Use at the start of each AVX test/benchmark to avoid SIGILL on older or non-x86 systems. +func requireAVX(t skipHelper) { + t.Helper() + if !archsimd.X86.AVX() { + t.Skipf("CPU does not support AVX; skipping. Check compatibility: grep avx /proc/cpuinfo") + } +} // requireAVX2 skips the test/benchmark if the CPU does not support AVX2 (256-bit SIMD). // Use at the start of each AVX2 test/benchmark to avoid SIGILL on older or non-x86 systems. 
diff --git a/exp/simd/intersect_avx512.go b/exp/simd/intersect_avx512.go index f55cb4f..9b435c6 100644 --- a/exp/simd/intersect_avx512.go +++ b/exp/simd/intersect_avx512.go @@ -6,7 +6,7 @@ import ( "simd/archsimd" ) -// ContainsInt8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsInt8x16 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsInt8x16[T ~int8](collection []T, target T) bool { length := uint(len(collection)) if length == 0 { @@ -40,7 +40,7 @@ func ContainsInt8x16[T ~int8](collection []T, target T) bool { return false } -// ContainsInt16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsInt16x8 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsInt16x8[T ~int16](collection []T, target T) bool { length := uint(len(collection)) if length == 0 { @@ -72,7 +72,7 @@ func ContainsInt16x8[T ~int16](collection []T, target T) bool { return false } -// ContainsInt32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsInt32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsInt32x4[T ~int32](collection []T, target T) bool { length := uint(len(collection)) if length == 0 { @@ -104,7 +104,7 @@ func ContainsInt32x4[T ~int32](collection []T, target T) bool { return false } -// ContainsInt64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsInt64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsInt64x2[T ~int64](collection []T, target T) bool { length := uint(len(collection)) if length == 0 { @@ -136,7 +136,7 @@ func ContainsInt64x2[T ~int64](collection []T, target T) bool { return false } -// ContainsUint8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsUint8x16 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsUint8x16[T ~uint8](collection []T, target 
T) bool { length := uint(len(collection)) if length == 0 { @@ -168,7 +168,7 @@ func ContainsUint8x16[T ~uint8](collection []T, target T) bool { return false } -// ContainsUint16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsUint16x8 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsUint16x8[T ~uint16](collection []T, target T) bool { length := uint(len(collection)) if length == 0 { @@ -200,7 +200,7 @@ func ContainsUint16x8[T ~uint16](collection []T, target T) bool { return false } -// ContainsUint32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsUint32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsUint32x4[T ~uint32](collection []T, target T) bool { length := uint(len(collection)) if length == 0 { @@ -232,7 +232,7 @@ func ContainsUint32x4[T ~uint32](collection []T, target T) bool { return false } -// ContainsUint64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsUint64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsUint64x2[T ~uint64](collection []T, target T) bool { length := uint(len(collection)) if length == 0 { @@ -264,7 +264,7 @@ func ContainsUint64x2[T ~uint64](collection []T, target T) bool { return false } -// ContainsFloat32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsFloat32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsFloat32x4[T ~float32](collection []T, target T) bool { length := uint(len(collection)) if length == 0 { @@ -296,7 +296,7 @@ func ContainsFloat32x4[T ~float32](collection []T, target T) bool { return false } -// ContainsFloat64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD +// ContainsFloat64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD func ContainsFloat64x2[T ~float64](collection []T, target T) bool { 
length := uint(len(collection)) if length == 0 { diff --git a/exp/simd/intersect_bench_test.go b/exp/simd/intersect_bench_test.go index 5ad7af0..387d863 100644 --- a/exp/simd/intersect_bench_test.go +++ b/exp/simd/intersect_bench_test.go @@ -8,16 +8,16 @@ import ( // Benchmark suite for SIMD Contains operations compared to core lo package fallbacks. // These benchmarks measure the performance of element lookup operations -// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes. +// across different SIMD implementations (AVX, AVX2, AVX512) and data sizes. // Benchmark sizes for Contains operations var containsBenchmarkSizes = []struct { name string size int }{ - {"tiny", 4}, // Smaller than SSE width (16 lanes for int8) - {"small", 16}, // Exactly SSE width for int8 - {"medium", 64}, // Multiple of SSE, between SSE and AVX2 for int8 + {"tiny", 4}, // Smaller than AVX width (16 lanes for int8) + {"small", 16}, // Exactly AVX width for int8 + {"medium", 64}, // Multiple of AVX, between AVX and AVX2 for int8 {"large", 256}, // Multiple of AVX2 (32 lanes for int8) {"xlarge", 1024}, // Multiple of AVX512 (64 lanes for int8) {"massive", 8192}, // Very large dataset @@ -33,14 +33,14 @@ func BenchmarkContainsInt8(b *testing.B) { data := generateInt8(bs.size) target := int8(42) - b.Run("SSE-x16", func(b *testing.B) { + b.Run("AVX512-x16", func(b *testing.B) { requireAVX512(b) // ContainsInt8x16 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsInt8x16(data, target) } }) - b.Run("AVX2-x32", func(b *testing.B) { + b.Run("AVX512-x32", func(b *testing.B) { requireAVX512(b) // ContainsInt8x32 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -68,14 +68,14 @@ func BenchmarkContainsInt16(b *testing.B) { data := generateInt16(bs.size) target := int16(42) - b.Run("SSE-x8", func(b *testing.B) { + b.Run("AVX512-x8", func(b *testing.B) { requireAVX512(b) // ContainsInt16x8 is 
in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsInt16x8(data, target) } }) - b.Run("AVX2-x16", func(b *testing.B) { + b.Run("AVX512-x16", func(b *testing.B) { requireAVX512(b) // ContainsInt16x16 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -103,14 +103,14 @@ func BenchmarkContainsInt32(b *testing.B) { data := generateInt32(bs.size) target := int32(42) - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsInt32x4(data, target) } }) - b.Run("AVX2-x8", func(b *testing.B) { + b.Run("AVX512-x8", func(b *testing.B) { requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -138,14 +138,14 @@ func BenchmarkContainsInt64(b *testing.B) { data := generateInt64(bs.size) target := int64(42) - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX512-x2", func(b *testing.B) { requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsInt64x2(data, target) } }) - b.Run("AVX2-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -173,14 +173,14 @@ func BenchmarkContainsUint8(b *testing.B) { data := generateUint8(bs.size) target := uint8(255) - b.Run("SSE-x16", func(b *testing.B) { + b.Run("AVX512-x16", func(b *testing.B) { requireAVX512(b) // ContainsUint8x16 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsUint8x16(data, target) } }) - b.Run("AVX2-x32", func(b *testing.B) { + b.Run("AVX512-x32", func(b *testing.B) { requireAVX512(b) // ContainsUint8x32 is in intersect_avx512.go which uses 
AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -208,14 +208,14 @@ func BenchmarkContainsUint16(b *testing.B) { data := generateUint16(bs.size) target := uint16(42) - b.Run("SSE-x8", func(b *testing.B) { + b.Run("AVX512-x8", func(b *testing.B) { requireAVX512(b) // ContainsUint16x8 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsUint16x8(data, target) } }) - b.Run("AVX2-x16", func(b *testing.B) { + b.Run("AVX512-x16", func(b *testing.B) { requireAVX512(b) // ContainsUint16x16 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -243,14 +243,14 @@ func BenchmarkContainsUint32(b *testing.B) { data := generateUint32(bs.size) target := uint32(42) - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { requireAVX512(b) // ContainsUint32x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsUint32x4(data, target) } }) - b.Run("AVX2-x8", func(b *testing.B) { + b.Run("AVX512-x8", func(b *testing.B) { requireAVX512(b) // ContainsUint32x8 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -278,14 +278,14 @@ func BenchmarkContainsUint64(b *testing.B) { data := generateUint64(bs.size) target := uint64(42) - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX512-x2", func(b *testing.B) { requireAVX512(b) // ContainsUint64x2 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsUint64x2(data, target) } }) - b.Run("AVX2-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { requireAVX512(b) // ContainsUint64x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -313,14 +313,14 @@ func BenchmarkContainsFloat32(b *testing.B) { data := generateFloat32(bs.size) target := float32(42.5) - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { 
requireAVX512(b) // ContainsFloat32x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsFloat32x4(data, target) } }) - b.Run("AVX2-x8", func(b *testing.B) { + b.Run("AVX512-x8", func(b *testing.B) { requireAVX512(b) // ContainsFloat32x8 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -348,14 +348,14 @@ func BenchmarkContainsFloat64(b *testing.B) { data := generateFloat64(bs.size) target := float64(42.5) - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX512-x2", func(b *testing.B) { requireAVX512(b) // ContainsFloat64x2 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsFloat64x2(data, target) } }) - b.Run("AVX2-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { requireAVX512(b) // ContainsFloat64x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -386,14 +386,14 @@ func BenchmarkContainsWorstCase(b *testing.B) { } target := int32(size - 1) // Target at the very end - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsInt32x4(data, target) } }) - b.Run("AVX2-x8", func(b *testing.B) { + b.Run("AVX512-x8", func(b *testing.B) { requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -422,14 +422,14 @@ func BenchmarkContainsBestCase(b *testing.B) { } target := int32(0) // Target at the very beginning - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsInt32x4(data, target) } }) - b.Run("AVX2-x8", func(b *testing.B) { + b.Run("AVX512-x8", func(b *testing.B) { 
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -456,14 +456,14 @@ func BenchmarkContainsNegative(b *testing.B) { data := generateInt32(bs.size) target := int32(999999) // Target that's unlikely to be in the data - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsInt32x4(data, target) } }) - b.Run("AVX2-x8", func(b *testing.B) { + b.Run("AVX512-x8", func(b *testing.B) { requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -497,8 +497,8 @@ func BenchmarkContainsInt8ByWidth(b *testing.B) { name string fn func() bool }{ - {"SSE-x16", func() bool { return ContainsInt8x16(data, target) }}, - {"AVX2-x32", func() bool { return ContainsInt8x32(data, target) }}, + {"AVX512-x16", func() bool { return ContainsInt8x16(data, target) }}, + {"AVX512-x32", func() bool { return ContainsInt8x32(data, target) }}, {"AVX512-x64", func() bool { return ContainsInt8x64(data, target) }}, } @@ -533,14 +533,14 @@ func BenchmarkContainsInt64SteadyState(b *testing.B) { b.ResetTimer() // Reset timer to exclude warmup - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX512-x2", func(b *testing.B) { requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { _ = ContainsInt64x2(data, target) } }) - b.Run("AVX2-x4", func(b *testing.B) { + b.Run("AVX512-x4", func(b *testing.B) { requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512 b.ReportAllocs() for i := 0; i < b.N; i++ { diff --git a/exp/simd/math.go b/exp/simd/math.go index be7d329..629534a 100644 --- a/exp/simd/math.go +++ b/exp/simd/math.go @@ -364,7 +364,8 @@ func MinInt64[T ~int64](collection []T) T { case simdFeatureAVX2: return 
MinInt64x4(collection) case simdFeatureAVX: - return MinInt64x2(collection) + // MinInt64x2 requires AVX-512 (archsimd Int64x2.Min); use scalar fallback + fallthrough default: return lo.Min(collection) } @@ -420,7 +421,8 @@ func MinUint64[T ~uint64](collection []T) T { case simdFeatureAVX2: return MinUint64x4(collection) case simdFeatureAVX: - return MinUint64x2(collection) + // MinUint64x2 requires AVX-512; use scalar fallback + fallthrough default: return lo.Min(collection) } @@ -504,7 +506,8 @@ func MaxInt64[T ~int64](collection []T) T { case simdFeatureAVX2: return MaxInt64x4(collection) case simdFeatureAVX: - return MaxInt64x2(collection) + // MaxInt64x2 requires AVX-512; use scalar fallback + fallthrough default: return lo.Max(collection) } @@ -560,7 +563,8 @@ func MaxUint64[T ~uint64](collection []T) T { case simdFeatureAVX2: return MaxUint64x4(collection) case simdFeatureAVX: - return MaxUint64x2(collection) + // MaxUint64x2 requires AVX-512; use scalar fallback + fallthrough default: return lo.Max(collection) } @@ -674,7 +678,8 @@ func ClampInt64[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice { case simdFeatureAVX2: return ClampInt64x4(collection, min, max) case simdFeatureAVX: - return ClampInt64x2(collection, min, max) + // ClampInt64x2 requires AVX-512; use scalar fallback + fallthrough default: result := make(Slice, len(collection)) for i, v := range collection { @@ -770,7 +775,8 @@ func ClampUint64[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice { case simdFeatureAVX2: return ClampUint64x4(collection, min, max) case simdFeatureAVX: - return ClampUint64x2(collection, min, max) + // ClampUint64x2 requires AVX-512; use scalar fallback + fallthrough default: result := make(Slice, len(collection)) for i, v := range collection { diff --git a/exp/simd/math_sse.go b/exp/simd/math_avx.go similarity index 82% rename from exp/simd/math_sse.go rename to exp/simd/math_avx.go index fb4277d..909d6b6 100644 --- a/exp/simd/math_sse.go +++ 
b/exp/simd/math_avx.go @@ -9,9 +9,9 @@ import ( "github.com/samber/lo" ) -// SSE (128-bit) SIMD sum functions - 16/8/4/2 lanes +// AVX (128-bit) SIMD sum functions - 16/8/4/2 lanes -// SumInt8x16 sums a slice of int8 using SSE SIMD (Int8x16, 16 lanes). +// SumInt8x16 sums a slice of int8 using AVX SIMD (Int8x16, 16 lanes). // Overflow: The accumulation is performed using int8, which can overflow for large collections. // If the sum exceeds the int8 range (-128 to 127), the result will wrap around silently. // For collections that may overflow, consider using a wider type or handle overflow detection externally. @@ -45,7 +45,7 @@ func SumInt8x16[T ~int8](collection []T) T { return sum } -// SumInt16x8 sums a slice of int16 using SSE SIMD (Int16x8, 8 lanes). +// SumInt16x8 sums a slice of int16 using AVX SIMD (Int16x8, 8 lanes). // Overflow: The accumulation is performed using int16, which can overflow for large collections. // If the sum exceeds the int16 range (-32768 to 32767), the result will wrap around silently. // For collections that may overflow, consider using a wider type or handle overflow detection externally. @@ -79,7 +79,7 @@ func SumInt16x8[T ~int16](collection []T) T { return sum } -// SumInt32x4 sums a slice of int32 using SSE SIMD (Int32x4, 4 lanes). +// SumInt32x4 sums a slice of int32 using AVX SIMD (Int32x4, 4 lanes). // Overflow: The accumulation is performed using int32, which can overflow for very large collections. // If the sum exceeds the int32 range (-2147483648 to 2147483647), the result will wrap around silently. // For collections that may overflow, consider using SumInt64x2 or handle overflow detection externally. @@ -113,7 +113,7 @@ func SumInt32x4[T ~int32](collection []T) T { return sum } -// SumInt64x2 sums a slice of int64 using SSE SIMD (Int64x2, 2 lanes). +// SumInt64x2 sums a slice of int64 using AVX SIMD (Int64x2, 2 lanes). 
// Overflow: The accumulation is performed using int64, which can overflow for extremely large collections. // If the sum exceeds the int64 range, the result will wrap around silently. // For collections that may overflow, handle overflow detection externally (e.g., using big.Int). @@ -147,7 +147,7 @@ func SumInt64x2[T ~int64](collection []T) T { return sum } -// SumUint8x16 sums a slice of uint8 using SSE SIMD (Uint8x16, 16 lanes). +// SumUint8x16 sums a slice of uint8 using AVX SIMD (Uint8x16, 16 lanes). // Overflow: The accumulation is performed using uint8, which can overflow for large collections. // If the sum exceeds the uint8 range (0 to 255), the result will wrap around silently. // For collections that may overflow, consider using a wider type or handle overflow detection externally. @@ -181,7 +181,7 @@ func SumUint8x16[T ~uint8](collection []T) T { return sum } -// SumUint16x8 sums a slice of uint16 using SSE SIMD (Uint16x8, 8 lanes). +// SumUint16x8 sums a slice of uint16 using AVX SIMD (Uint16x8, 8 lanes). // Overflow: The accumulation is performed using uint16, which can overflow for large collections. // If the sum exceeds the uint16 range (0 to 65535), the result will wrap around silently. // For collections that may overflow, consider using a wider type or handle overflow detection externally. @@ -215,7 +215,7 @@ func SumUint16x8[T ~uint16](collection []T) T { return sum } -// SumUint32x4 sums a slice of uint32 using SSE SIMD (Uint32x4, 4 lanes). +// SumUint32x4 sums a slice of uint32 using AVX SIMD (Uint32x4, 4 lanes). // Overflow: The accumulation is performed using uint32, which can overflow for very large collections. // If the sum exceeds the uint32 range (0 to 4294967295), the result will wrap around silently. // For collections that may overflow, consider using SumUint64x2 or handle overflow detection externally. 
@@ -249,7 +249,7 @@ func SumUint32x4[T ~uint32](collection []T) T { return sum } -// SumUint64x2 sums a slice of uint64 using SSE SIMD (Uint64x2, 2 lanes). +// SumUint64x2 sums a slice of uint64 using AVX SIMD (Uint64x2, 2 lanes). // Overflow: The accumulation is performed using uint64, which can overflow for extremely large collections. // If the sum exceeds the uint64 range, the result will wrap around silently. // For collections that may overflow, handle overflow detection externally (e.g., using big.Int). @@ -283,7 +283,7 @@ func SumUint64x2[T ~uint64](collection []T) T { return sum } -// SumFloat32x4 sums a slice of float32 using SSE SIMD (Float32x4, 4 lanes). +// SumFloat32x4 sums a slice of float32 using AVX SIMD (Float32x4, 4 lanes). // Overflow: The accumulation is performed using float32. Overflow will result in +/-Inf rather than wrapping. // For collections requiring high precision or large sums, consider using SumFloat64x2. func SumFloat32x4[T ~float32](collection []T) T { @@ -316,7 +316,7 @@ func SumFloat32x4[T ~float32](collection []T) T { return sum } -// SumFloat64x2 sums a slice of float64 using SSE SIMD (Float64x2, 2 lanes). +// SumFloat64x2 sums a slice of float64 using AVX SIMD (Float64x2, 2 lanes). // Overflow: The accumulation is performed using float64. Overflow will result in +/-Inf rather than wrapping. // For collections that may overflow, handle overflow detection externally (e.g., using big.Float). 
func SumFloat64x2[T ~float64](collection []T) T { @@ -349,7 +349,7 @@ func SumFloat64x2[T ~float64](collection []T) T { return sum } -// MeanInt8x16 calculates the mean of a slice of int8 using SSE SIMD +// MeanInt8x16 calculates the mean of a slice of int8 using AVX SIMD func MeanInt8x16[T ~int8](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -359,7 +359,7 @@ func MeanInt8x16[T ~int8](collection []T) T { return sum / T(length) } -// MeanInt16x8 calculates the mean of a slice of int16 using SSE SIMD +// MeanInt16x8 calculates the mean of a slice of int16 using AVX SIMD func MeanInt16x8[T ~int16](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -369,7 +369,7 @@ func MeanInt16x8[T ~int16](collection []T) T { return sum / T(length) } -// MeanInt32x4 calculates the mean of a slice of int32 using SSE SIMD +// MeanInt32x4 calculates the mean of a slice of int32 using AVX SIMD func MeanInt32x4[T ~int32](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -379,7 +379,7 @@ func MeanInt32x4[T ~int32](collection []T) T { return sum / T(length) } -// MeanInt64x2 calculates the mean of a slice of int64 using SSE SIMD +// MeanInt64x2 calculates the mean of a slice of int64 using AVX SIMD func MeanInt64x2[T ~int64](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -389,7 +389,7 @@ func MeanInt64x2[T ~int64](collection []T) T { return sum / T(length) } -// MeanUint8x16 calculates the mean of a slice of uint8 using SSE SIMD +// MeanUint8x16 calculates the mean of a slice of uint8 using AVX SIMD func MeanUint8x16[T ~uint8](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -399,7 +399,7 @@ func MeanUint8x16[T ~uint8](collection []T) T { return sum / T(length) } -// MeanUint16x8 calculates the mean of a slice of uint16 using SSE SIMD +// MeanUint16x8 calculates the mean of a slice of uint16 using AVX SIMD func MeanUint16x8[T ~uint16](collection []T) T { length := 
uint(len(collection)) if length == 0 { @@ -409,7 +409,7 @@ func MeanUint16x8[T ~uint16](collection []T) T { return sum / T(length) } -// MeanUint32x4 calculates the mean of a slice of uint32 using SSE SIMD +// MeanUint32x4 calculates the mean of a slice of uint32 using AVX SIMD func MeanUint32x4[T ~uint32](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -419,7 +419,7 @@ func MeanUint32x4[T ~uint32](collection []T) T { return sum / T(length) } -// MeanUint64x2 calculates the mean of a slice of uint64 using SSE SIMD +// MeanUint64x2 calculates the mean of a slice of uint64 using AVX SIMD func MeanUint64x2[T ~uint64](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -429,7 +429,7 @@ func MeanUint64x2[T ~uint64](collection []T) T { return sum / T(length) } -// MeanFloat32x4 calculates the mean of a slice of float32 using SSE SIMD +// MeanFloat32x4 calculates the mean of a slice of float32 using AVX SIMD func MeanFloat32x4[T ~float32](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -440,7 +440,7 @@ func MeanFloat32x4[T ~float32](collection []T) T { return sum / T(length) } -// MeanFloat64x2 calculates the mean of a slice of float64 using SSE SIMD +// MeanFloat64x2 calculates the mean of a slice of float64 using AVX SIMD func MeanFloat64x2[T ~float64](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -450,7 +450,7 @@ func MeanFloat64x2[T ~float64](collection []T) T { return sum / T(length) } -// ClampInt8x16 clamps each element in collection between min and max values using SSE SIMD +// ClampInt8x16 clamps each element in collection between min and max values using AVX SIMD func ClampInt8x16[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice { length := uint(len(collection)) if length == 0 { @@ -488,7 +488,7 @@ func ClampInt8x16[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice { return result } -// ClampInt16x8 clamps each element in collection between min and max 
values using SSE SIMD +// ClampInt16x8 clamps each element in collection between min and max values using AVX SIMD func ClampInt16x8[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice { length := uint(len(collection)) if length == 0 { @@ -526,7 +526,7 @@ func ClampInt16x8[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice { return result } -// ClampInt32x4 clamps each element in collection between min and max values using SSE SIMD +// ClampInt32x4 clamps each element in collection between min and max values using AVX SIMD func ClampInt32x4[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice { length := uint(len(collection)) if length == 0 { @@ -564,45 +564,7 @@ func ClampInt32x4[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice { return result } -// ClampInt64x2 clamps each element in collection between min and max values using SSE SIMD and AVX-512 SIMD. -func ClampInt64x2[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice { - length := uint(len(collection)) - if length == 0 { - return collection - } - - result := make(Slice, length) - const lanes = simdLanes2 - - base := unsafeSliceInt64(collection, length) - - minVec := archsimd.BroadcastInt64x2(int64(min)) - maxVec := archsimd.BroadcastInt64x2(int64(max)) - - i := uint(0) - for ; i+lanes <= length; i += lanes { - v := archsimd.LoadInt64x2Slice(base[i : i+lanes]) - - clamped := v.Max(minVec).Min(maxVec) - - // bearer:disable go_gosec_unsafe_unsafe - clamped.Store((*[lanes]int64)(unsafe.Pointer(&result[i]))) - } - - for ; i < length; i++ { - val := collection[i] - if val < min { - val = min - } else if val > max { - val = max - } - result[i] = val - } - - return result -} - -// ClampUint8x16 clamps each element in collection between min and max values using SSE SIMD +// ClampUint8x16 clamps each element in collection between min and max values using AVX SIMD func ClampUint8x16[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice { length := uint(len(collection)) if length == 
0 { @@ -640,7 +602,7 @@ func ClampUint8x16[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice { return result } -// ClampUint16x8 clamps each element in collection between min and max values using SSE SIMD +// ClampUint16x8 clamps each element in collection between min and max values using AVX SIMD func ClampUint16x8[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice { length := uint(len(collection)) if length == 0 { @@ -678,7 +640,7 @@ func ClampUint16x8[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice { return result } -// ClampUint32x4 clamps each element in collection between min and max values using SSE SIMD +// ClampUint32x4 clamps each element in collection between min and max values using AVX SIMD func ClampUint32x4[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice { length := uint(len(collection)) if length == 0 { @@ -716,45 +678,7 @@ func ClampUint32x4[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice { return result } -// ClampUint64x2 clamps each element in collection between min and max values using SSE SIMD and AVX-512 SIMD. 
-func ClampUint64x2[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice { - length := uint(len(collection)) - if length == 0 { - return collection - } - - result := make(Slice, length) - const lanes = simdLanes2 - - base := unsafeSliceUint64(collection, length) - - minVec := archsimd.BroadcastUint64x2(uint64(min)) - maxVec := archsimd.BroadcastUint64x2(uint64(max)) - - i := uint(0) - for ; i+lanes <= length; i += lanes { - v := archsimd.LoadUint64x2Slice(base[i : i+lanes]) - - clamped := v.Max(minVec).Min(maxVec) - - // bearer:disable go_gosec_unsafe_unsafe - clamped.Store((*[lanes]uint64)(unsafe.Pointer(&result[i]))) - } - - for ; i < length; i++ { - val := collection[i] - if val < min { - val = min - } else if val > max { - val = max - } - result[i] = val - } - - return result -} - -// ClampFloat32x4 clamps each element in collection between min and max values using SSE SIMD +// ClampFloat32x4 clamps each element in collection between min and max values using AVX SIMD func ClampFloat32x4[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice { length := uint(len(collection)) if length == 0 { @@ -792,7 +716,7 @@ func ClampFloat32x4[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice return result } -// ClampFloat64x2 clamps each element in collection between min and max values using SSE SIMD +// ClampFloat64x2 clamps each element in collection between min and max values using AVX SIMD func ClampFloat64x2[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice { length := uint(len(collection)) if length == 0 { @@ -830,7 +754,7 @@ func ClampFloat64x2[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice return result } -// MinInt8x16 finds the minimum value in a collection of int8 using SSE SIMD +// MinInt8x16 finds the minimum value in a collection of int8 using AVX SIMD func MinInt8x16[T ~int8](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -877,7 +801,7 @@ func MinInt8x16[T ~int8](collection []T) T { return 
T(minVal) } -// MinInt16x8 finds the minimum value in a collection of int16 using SSE SIMD +// MinInt16x8 finds the minimum value in a collection of int16 using AVX SIMD func MinInt16x8[T ~int16](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -921,7 +845,7 @@ func MinInt16x8[T ~int16](collection []T) T { return T(minVal) } -// MinInt32x4 finds the minimum value in a collection of int32 using SSE SIMD +// MinInt32x4 finds the minimum value in a collection of int32 using AVX SIMD func MinInt32x4[T ~int32](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -965,51 +889,7 @@ func MinInt32x4[T ~int32](collection []T) T { return T(minVal) } -// MinInt64x2 finds the minimum value in a collection of int64 using SSE SIMD -func MinInt64x2[T ~int64](collection []T) T { - length := uint(len(collection)) - if length == 0 { - return 0 - } - - const lanes = simdLanes2 - base := unsafeSliceInt64(collection, length) - - var minVec archsimd.Int64x2 - firstInitialized := false - - i := uint(0) - for ; i+lanes <= length; i += lanes { - v := archsimd.LoadInt64x2Slice(base[i : i+lanes]) - - if !firstInitialized { - minVec = v - firstInitialized = true - } else { - minVec = minVec.Min(v) - } - } - - // Find minimum in the vector (only if we processed any vectors) - var minVal int64 - if firstInitialized { - var buf [lanes]int64 - minVec.Store(&buf) - minVal = min(buf[0], buf[1]) - } - - // Handle remaining elements - for ; i < length; i++ { - if !firstInitialized || collection[i] < T(minVal) { - minVal = int64(collection[i]) - firstInitialized = true - } - } - - return T(minVal) -} - -// MinUint8x16 finds the minimum value in a collection of uint8 using SSE SIMD +// MinUint8x16 finds the minimum value in a collection of uint8 using AVX SIMD func MinUint8x16[T ~uint8](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1056,7 +936,7 @@ func MinUint8x16[T ~uint8](collection []T) T { return T(minVal) } -// MinUint16x8 finds 
the minimum value in a collection of uint16 using SSE SIMD +// MinUint16x8 finds the minimum value in a collection of uint16 using AVX SIMD func MinUint16x8[T ~uint16](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1100,7 +980,7 @@ func MinUint16x8[T ~uint16](collection []T) T { return T(minVal) } -// MinUint32x4 finds the minimum value in a collection of uint32 using SSE SIMD +// MinUint32x4 finds the minimum value in a collection of uint32 using AVX SIMD func MinUint32x4[T ~uint32](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1144,51 +1024,7 @@ func MinUint32x4[T ~uint32](collection []T) T { return T(minVal) } -// MinUint64x2 finds the minimum value in a collection of uint64 using SSE SIMD -func MinUint64x2[T ~uint64](collection []T) T { - length := uint(len(collection)) - if length == 0 { - return 0 - } - - const lanes = simdLanes2 - base := unsafeSliceUint64(collection, length) - - var minVec archsimd.Uint64x2 - firstInitialized := false - - i := uint(0) - for ; i+lanes <= length; i += lanes { - v := archsimd.LoadUint64x2Slice(base[i : i+lanes]) - - if !firstInitialized { - minVec = v - firstInitialized = true - } else { - minVec = minVec.Min(v) - } - } - - // Find minimum in the vector (only if we processed any vectors) - var minVal uint64 - if firstInitialized { - var buf [lanes]uint64 - minVec.Store(&buf) - minVal = min(buf[0], buf[1]) - } - - // Handle remaining elements - for ; i < length; i++ { - if !firstInitialized || collection[i] < T(minVal) { - minVal = uint64(collection[i]) - firstInitialized = true - } - } - - return T(minVal) -} - -// MinFloat32x4 finds the minimum value in a collection of float32 using SSE SIMD +// MinFloat32x4 finds the minimum value in a collection of float32 using AVX SIMD func MinFloat32x4[T ~float32](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1232,7 +1068,7 @@ func MinFloat32x4[T ~float32](collection []T) T { return T(minVal) } -// 
MinFloat64x2 finds the minimum value in a collection of float64 using SSE SIMD +// MinFloat64x2 finds the minimum value in a collection of float64 using AVX SIMD func MinFloat64x2[T ~float64](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1276,7 +1112,7 @@ func MinFloat64x2[T ~float64](collection []T) T { return T(minVal) } -// MaxInt8x16 finds the maximum value in a collection of int8 using SSE SIMD +// MaxInt8x16 finds the maximum value in a collection of int8 using AVX SIMD func MaxInt8x16[T ~int8](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1323,7 +1159,7 @@ func MaxInt8x16[T ~int8](collection []T) T { return T(maxVal) } -// MaxInt16x8 finds the maximum value in a collection of int16 using SSE SIMD +// MaxInt16x8 finds the maximum value in a collection of int16 using AVX SIMD func MaxInt16x8[T ~int16](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1367,7 +1203,7 @@ func MaxInt16x8[T ~int16](collection []T) T { return T(maxVal) } -// MaxInt32x4 finds the maximum value in a collection of int32 using SSE SIMD +// MaxInt32x4 finds the maximum value in a collection of int32 using AVX SIMD func MaxInt32x4[T ~int32](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1411,51 +1247,7 @@ func MaxInt32x4[T ~int32](collection []T) T { return T(maxVal) } -// MaxInt64x2 finds the maximum value in a collection of int64 using SSE SIMD -func MaxInt64x2[T ~int64](collection []T) T { - length := uint(len(collection)) - if length == 0 { - return 0 - } - - const lanes = simdLanes2 - base := unsafeSliceInt64(collection, length) - - var maxVec archsimd.Int64x2 - firstInitialized := false - - i := uint(0) - for ; i+lanes <= length; i += lanes { - v := archsimd.LoadInt64x2Slice(base[i : i+lanes]) - - if !firstInitialized { - maxVec = v - firstInitialized = true - } else { - maxVec = maxVec.Max(v) - } - } - - // Find maximum in the vector (only if we processed any vectors) - var maxVal 
int64 - if firstInitialized { - var buf [lanes]int64 - maxVec.Store(&buf) - maxVal = max(buf[0], buf[1]) - } - - // Handle remaining elements - for ; i < length; i++ { - if !firstInitialized || collection[i] > T(maxVal) { - maxVal = int64(collection[i]) - firstInitialized = true - } - } - - return T(maxVal) -} - -// MaxUint8x16 finds the maximum value in a collection of uint8 using SSE SIMD +// MaxUint8x16 finds the maximum value in a collection of uint8 using AVX SIMD func MaxUint8x16[T ~uint8](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1502,7 +1294,7 @@ func MaxUint8x16[T ~uint8](collection []T) T { return T(maxVal) } -// MaxUint16x8 finds the maximum value in a collection of uint16 using SSE SIMD +// MaxUint16x8 finds the maximum value in a collection of uint16 using AVX SIMD func MaxUint16x8[T ~uint16](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1546,7 +1338,7 @@ func MaxUint16x8[T ~uint16](collection []T) T { return T(maxVal) } -// MaxUint32x4 finds the maximum value in a collection of uint32 using SSE SIMD +// MaxUint32x4 finds the maximum value in a collection of uint32 using AVX SIMD func MaxUint32x4[T ~uint32](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1590,51 +1382,7 @@ func MaxUint32x4[T ~uint32](collection []T) T { return T(maxVal) } -// MaxUint64x2 finds the maximum value in a collection of uint64 using SSE SIMD -func MaxUint64x2[T ~uint64](collection []T) T { - length := uint(len(collection)) - if length == 0 { - return 0 - } - - const lanes = simdLanes2 - base := unsafeSliceUint64(collection, length) - - var maxVec archsimd.Uint64x2 - firstInitialized := false - - i := uint(0) - for ; i+lanes <= length; i += lanes { - v := archsimd.LoadUint64x2Slice(base[i : i+lanes]) - - if !firstInitialized { - maxVec = v - firstInitialized = true - } else { - maxVec = maxVec.Max(v) - } - } - - // Find maximum in the vector (only if we processed any vectors) - var maxVal uint64 
- if firstInitialized { - var buf [lanes]uint64 - maxVec.Store(&buf) - maxVal = max(buf[0], buf[1]) - } - - // Handle remaining elements - for ; i < length; i++ { - if !firstInitialized || collection[i] > T(maxVal) { - maxVal = uint64(collection[i]) - firstInitialized = true - } - } - - return T(maxVal) -} - -// MaxFloat32x4 finds the maximum value in a collection of float32 using SSE SIMD +// MaxFloat32x4 finds the maximum value in a collection of float32 using AVX SIMD func MaxFloat32x4[T ~float32](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1678,7 +1426,7 @@ func MaxFloat32x4[T ~float32](collection []T) T { return T(maxVal) } -// MaxFloat64x2 finds the maximum value in a collection of float64 using SSE SIMD +// MaxFloat64x2 finds the maximum value in a collection of float64 using AVX SIMD func MaxFloat64x2[T ~float64](collection []T) T { length := uint(len(collection)) if length == 0 { @@ -1722,127 +1470,127 @@ func MaxFloat64x2[T ~float64](collection []T) T { return T(maxVal) } -// SSE (128-bit) SIMD sumBy functions - 16/8/4/2 lanes +// AVX (128-bit) SIMD sumBy functions - 16/8/4/2 lanes // These implementations use lo.Map to apply the iteratee, then chain with SIMD sum functions. -// SumByInt8x16 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByInt8x16 sums the values extracted by iteratee from a slice using AVX SIMD. func SumByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumInt8x16(mapped) } -// SumByInt16x8 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByInt16x8 sums the values extracted by iteratee from a slice using AVX SIMD. 
func SumByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumInt16x8(mapped) } -// SumByInt32x4 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByInt32x4 sums the values extracted by iteratee from a slice using AVX SIMD. func SumByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumInt32x4(mapped) } -// SumByInt64x2 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByInt64x2 sums the values extracted by iteratee from a slice using AVX SIMD. func SumByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumInt64x2(mapped) } -// SumByUint8x16 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByUint8x16 sums the values extracted by iteratee from a slice using AVX SIMD. func SumByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumUint8x16(mapped) } -// SumByUint16x8 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByUint16x8 sums the values extracted by iteratee from a slice using AVX SIMD. func SumByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumUint16x8(mapped) } -// SumByUint32x4 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByUint32x4 sums the values extracted by iteratee from a slice using AVX SIMD. 
func SumByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumUint32x4(mapped) } -// SumByUint64x2 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByUint64x2 sums the values extracted by iteratee from a slice using AVX SIMD. func SumByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumUint64x2(mapped) } -// SumByFloat32x4 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByFloat32x4 sums the values extracted by iteratee from a slice using AVX SIMD. func SumByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumFloat32x4(mapped) } -// SumByFloat64x2 sums the values extracted by iteratee from a slice using SSE SIMD. +// SumByFloat64x2 sums the values extracted by iteratee from a slice using AVX SIMD. func SumByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return SumFloat64x2(mapped) } -// SSE (128-bit) SIMD meanBy functions - 16/8/4/2 lanes +// AVX (128-bit) SIMD meanBy functions - 16/8/4/2 lanes // These implementations use lo.Map to apply the iteratee, then chain with SIMD mean functions. -// MeanByInt8x16 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. +// MeanByInt8x16 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanInt8x16(mapped) } -// MeanByInt16x8 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. 
+// MeanByInt16x8 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanInt16x8(mapped) } -// MeanByInt32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. +// MeanByInt32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanInt32x4(mapped) } -// MeanByInt64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. +// MeanByInt64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanInt64x2(mapped) } -// MeanByUint8x16 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. +// MeanByUint8x16 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanUint8x16(mapped) } -// MeanByUint16x8 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. +// MeanByUint16x8 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanUint16x8(mapped) } -// MeanByUint32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. 
+// MeanByUint32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanUint32x4(mapped) } -// MeanByUint64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. +// MeanByUint64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanUint64x2(mapped) } -// MeanByFloat32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. +// MeanByFloat32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanFloat32x4(mapped) } -// MeanByFloat64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD. +// MeanByFloat64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD. func MeanByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R { mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) }) return MeanFloat64x2(mapped) diff --git a/exp/simd/math_avx512.go b/exp/simd/math_avx512.go index 1bf95cd..7714b1c 100644 --- a/exp/simd/math_avx512.go +++ b/exp/simd/math_avx512.go @@ -566,6 +566,84 @@ func ClampInt32x16[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice { return result } +// ClampInt64x2 clamps each element in collection between min and max values using AVX-512 SIMD. +// Int64x2 Min/Max operations in archsimd require AVX-512 (VPMAXSQ/VPMINSQ). 
+func ClampInt64x2[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice { + length := uint(len(collection)) + if length == 0 { + return collection + } + + result := make(Slice, length) + const lanes = simdLanes2 + + base := unsafeSliceInt64(collection, length) + + minVec := archsimd.BroadcastInt64x2(int64(min)) + maxVec := archsimd.BroadcastInt64x2(int64(max)) + + i := uint(0) + for ; i+lanes <= length; i += lanes { + v := archsimd.LoadInt64x2Slice(base[i : i+lanes]) + + clamped := v.Max(minVec).Min(maxVec) + + // bearer:disable go_gosec_unsafe_unsafe + clamped.Store((*[lanes]int64)(unsafe.Pointer(&result[i]))) + } + + for ; i < length; i++ { + val := collection[i] + if val < min { + val = min + } else if val > max { + val = max + } + result[i] = val + } + + return result +} + +// ClampUint64x2 clamps each element in collection between min and max values using AVX-512 SIMD. +// Uint64x2 Min/Max operations in archsimd require AVX-512. +func ClampUint64x2[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice { + length := uint(len(collection)) + if length == 0 { + return collection + } + + result := make(Slice, length) + const lanes = simdLanes2 + + base := unsafeSliceUint64(collection, length) + + minVec := archsimd.BroadcastUint64x2(uint64(min)) + maxVec := archsimd.BroadcastUint64x2(uint64(max)) + + i := uint(0) + for ; i+lanes <= length; i += lanes { + v := archsimd.LoadUint64x2Slice(base[i : i+lanes]) + + clamped := v.Max(minVec).Min(maxVec) + + // bearer:disable go_gosec_unsafe_unsafe + clamped.Store((*[lanes]uint64)(unsafe.Pointer(&result[i]))) + } + + for ; i < length; i++ { + val := collection[i] + if val < min { + val = min + } else if val > max { + val = max + } + result[i] = val + } + + return result +} + // ClampInt64x8 clamps each element in collection between min and max values using AVX-512 SIMD func ClampInt64x8[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice { length := uint(len(collection)) @@ -991,6 +1069,96 @@ func 
MinInt32x16[T ~int32](collection []T) T { return T(minVal) } +// MinInt64x2 finds the minimum value in a collection of int64 using AVX-512 SIMD. +// Int64x2 Min operations in archsimd require AVX-512. +func MinInt64x2[T ~int64](collection []T) T { + length := uint(len(collection)) + if length == 0 { + return 0 + } + + const lanes = simdLanes2 + base := unsafeSliceInt64(collection, length) + + var minVec archsimd.Int64x2 + firstInitialized := false + + i := uint(0) + for ; i+lanes <= length; i += lanes { + v := archsimd.LoadInt64x2Slice(base[i : i+lanes]) + + if !firstInitialized { + minVec = v + firstInitialized = true + } else { + minVec = minVec.Min(v) + } + } + + // Find minimum in the vector (only if we processed any vectors) + var minVal int64 + if firstInitialized { + var buf [lanes]int64 + minVec.Store(&buf) + minVal = min(buf[0], buf[1]) + } + + // Handle remaining elements + for ; i < length; i++ { + if !firstInitialized || collection[i] < T(minVal) { + minVal = int64(collection[i]) + firstInitialized = true + } + } + + return T(minVal) +} + +// MinUint64x2 finds the minimum value in a collection of uint64 using AVX-512 SIMD. +// Uint64x2 Min operations in archsimd require AVX-512. 
+func MinUint64x2[T ~uint64](collection []T) T { + length := uint(len(collection)) + if length == 0 { + return 0 + } + + const lanes = simdLanes2 + base := unsafeSliceUint64(collection, length) + + var minVec archsimd.Uint64x2 + firstInitialized := false + + i := uint(0) + for ; i+lanes <= length; i += lanes { + v := archsimd.LoadUint64x2Slice(base[i : i+lanes]) + + if !firstInitialized { + minVec = v + firstInitialized = true + } else { + minVec = minVec.Min(v) + } + } + + // Find minimum in the vector (only if we processed any vectors) + var minVal uint64 + if firstInitialized { + var buf [lanes]uint64 + minVec.Store(&buf) + minVal = min(buf[0], buf[1]) + } + + // Handle remaining elements + for ; i < length; i++ { + if !firstInitialized || collection[i] < T(minVal) { + minVal = uint64(collection[i]) + firstInitialized = true + } + } + + return T(minVal) +} + // MinInt64x8 finds the minimum value in a collection of int64 using AVX-512 SIMD func MinInt64x8[T ~int64](collection []T) T { length := uint(len(collection)) @@ -1478,6 +1646,96 @@ func MaxInt32x16[T ~int32](collection []T) T { return T(maxVal) } +// MaxInt64x2 finds the maximum value in a collection of int64 using AVX-512 SIMD. +// Int64x2 Max operations in archsimd require AVX-512. 
+func MaxInt64x2[T ~int64](collection []T) T { + length := uint(len(collection)) + if length == 0 { + return 0 + } + + const lanes = simdLanes2 + base := unsafeSliceInt64(collection, length) + + var maxVec archsimd.Int64x2 + firstInitialized := false + + i := uint(0) + for ; i+lanes <= length; i += lanes { + v := archsimd.LoadInt64x2Slice(base[i : i+lanes]) + + if !firstInitialized { + maxVec = v + firstInitialized = true + } else { + maxVec = maxVec.Max(v) + } + } + + // Find maximum in the vector (only if we processed any vectors) + var maxVal int64 + if firstInitialized { + var buf [lanes]int64 + maxVec.Store(&buf) + maxVal = max(buf[0], buf[1]) + } + + // Handle remaining elements + for ; i < length; i++ { + if !firstInitialized || collection[i] > T(maxVal) { + maxVal = int64(collection[i]) + firstInitialized = true + } + } + + return T(maxVal) +} + +// MaxUint64x2 finds the maximum value in a collection of uint64 using AVX-512 SIMD. +// Uint64x2 Max operations in archsimd require AVX-512. 
+func MaxUint64x2[T ~uint64](collection []T) T { + length := uint(len(collection)) + if length == 0 { + return 0 + } + + const lanes = simdLanes2 + base := unsafeSliceUint64(collection, length) + + var maxVec archsimd.Uint64x2 + firstInitialized := false + + i := uint(0) + for ; i+lanes <= length; i += lanes { + v := archsimd.LoadUint64x2Slice(base[i : i+lanes]) + + if !firstInitialized { + maxVec = v + firstInitialized = true + } else { + maxVec = maxVec.Max(v) + } + } + + // Find maximum in the vector (only if we processed any vectors) + var maxVal uint64 + if firstInitialized { + var buf [lanes]uint64 + maxVec.Store(&buf) + maxVal = max(buf[0], buf[1]) + } + + // Handle remaining elements + for ; i < length; i++ { + if !firstInitialized || collection[i] > T(maxVal) { + maxVal = uint64(collection[i]) + firstInitialized = true + } + } + + return T(maxVal) +} + // MaxInt64x8 finds the maximum value in a collection of int64 using AVX-512 SIMD func MaxInt64x8[T ~int64](collection []T) T { length := uint(len(collection)) diff --git a/exp/simd/math_avx512_test.go b/exp/simd/math_avx512_test.go index 649f657..8056ead 100644 --- a/exp/simd/math_avx512_test.go +++ b/exp/simd/math_avx512_test.go @@ -819,6 +819,55 @@ func TestClampInt32x16(t *testing.T) { } } +func TestClampInt64x2(t *testing.T) { + requireAVX512(t) + testCases := []struct { + name string + input []int64 + min int64 + max int64 + }{ + {"empty", []int64{}, -100, 100}, + {"single", []int64{42}, -10, 10}, + {"small", []int64{1, 2, 3, 4, 5}, 2, 4}, + {"exactly 2", []int64{-100, 200}, -50, 50}, + {"large", make([]int64, 1000), -50, 50}, + {"all below min", []int64{-1000, -2000, -3000}, -500, 100}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { + for i := range tc.input { + tc.input[i] = rand.Int64() + } + } + + got := ClampInt64x2(tc.input, tc.min, tc.max) + + if len(got) != len(tc.input) { + t.Errorf("ClampInt64x2() 
returned length %d, want %d", len(got), len(tc.input)) + } + + for i, v := range got { + if v < tc.min || v > tc.max { + t.Errorf("ClampInt64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max) + } + original := tc.input[i] + expected := original + if expected < tc.min { + expected = tc.min + } else if expected > tc.max { + expected = tc.max + } + if v != expected { + t.Errorf("ClampInt64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original) + } + } + }) + } +} + func TestClampInt64x8(t *testing.T) { requireAVX512(t) testCases := []struct { @@ -1018,6 +1067,55 @@ func TestClampUint32x16(t *testing.T) { } } +func TestClampUint64x2(t *testing.T) { + requireAVX512(t) + testCases := []struct { + name string + input []uint64 + min uint64 + max uint64 + }{ + {"empty", []uint64{}, 100, 1000}, + {"single", []uint64{42}, 10, 100}, + {"small", []uint64{1, 2, 3, 4, 5}, 2, 4}, + {"exactly 2", []uint64{50, 2000}, 100, 1000}, + {"large", make([]uint64, 1000), 500, 5000}, + {"all below min", []uint64{1, 2, 3}, 10, 100}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { + for i := range tc.input { + tc.input[i] = rand.Uint64() + } + } + + got := ClampUint64x2(tc.input, tc.min, tc.max) + + if len(got) != len(tc.input) { + t.Errorf("ClampUint64x2() returned length %d, want %d", len(got), len(tc.input)) + } + + for i, v := range got { + if v < tc.min || v > tc.max { + t.Errorf("ClampUint64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max) + } + original := tc.input[i] + expected := original + if expected < tc.min { + expected = tc.min + } else if expected > tc.max { + expected = tc.max + } + if v != expected { + t.Errorf("ClampUint64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original) + } + } + }) + } +} + func TestClampUint64x8(t *testing.T) { requireAVX512(t) testCases := []struct { @@ -1292,6 +1390,38 @@ func TestMinInt32x16(t *testing.T) { } } 
+func TestMinInt64x2(t *testing.T) { + requireAVX512(t) + testCases := []struct { + name string + input []int64 + }{ + {"empty", []int64{}}, + {"single", []int64{42}}, + {"small", []int64{1, 2, 3, 4, 5}}, + {"exactly 2", []int64{1, 2}}, + {"large", make([]int64, 1000)}, + {"negative", []int64{-1, -2, -3, 4, 5}}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { + for i := range tc.input { + tc.input[i] = rand.Int64() + } + } + + got := MinInt64x2(tc.input) + want := lo.Min(tc.input) + + if got != want { + t.Errorf("MinInt64x2() = %v, want %v", got, want) + } + }) + } +} + func TestMinInt64x8(t *testing.T) { requireAVX512(t) testCases := []struct { @@ -1419,6 +1549,37 @@ func TestMinUint32x16(t *testing.T) { } } +func TestMinUint64x2(t *testing.T) { + requireAVX512(t) + testCases := []struct { + name string + input []uint64 + }{ + {"empty", []uint64{}}, + {"single", []uint64{42}}, + {"small", []uint64{1, 2, 3, 4, 5}}, + {"exactly 2", []uint64{1, 2}}, + {"large", make([]uint64, 1000)}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { + for i := range tc.input { + tc.input[i] = rand.Uint64() + } + } + + got := MinUint64x2(tc.input) + want := lo.Min(tc.input) + + if got != want { + t.Errorf("MinUint64x2() = %v, want %v", got, want) + } + }) + } +} + func TestMinUint64x8(t *testing.T) { requireAVX512(t) testCases := []struct { @@ -1625,6 +1786,38 @@ func TestMaxInt32x16(t *testing.T) { } } +func TestMaxInt64x2(t *testing.T) { + requireAVX512(t) + testCases := []struct { + name string + input []int64 + }{ + {"empty", []int64{}}, + {"single", []int64{42}}, + {"small", []int64{1, 2, 3, 4, 5}}, + {"exactly 2", []int64{1, 2}}, + {"large", make([]int64, 1000)}, + {"negative", []int64{-1, -2, -3, 4, 5}}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if 
len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { + for i := range tc.input { + tc.input[i] = rand.Int64() + } + } + + got := MaxInt64x2(tc.input) + want := lo.Max(tc.input) + + if got != want { + t.Errorf("MaxInt64x2() = %v, want %v", got, want) + } + }) + } +} + func TestMaxInt64x8(t *testing.T) { requireAVX512(t) testCases := []struct { @@ -1752,6 +1945,37 @@ func TestMaxUint32x16(t *testing.T) { } } +func TestMaxUint64x2(t *testing.T) { + requireAVX512(t) + testCases := []struct { + name string + input []uint64 + }{ + {"empty", []uint64{}}, + {"single", []uint64{42}}, + {"small", []uint64{1, 2, 3, 4, 5}}, + {"exactly 2", []uint64{1, 2}}, + {"large", make([]uint64, 1000)}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { + for i := range tc.input { + tc.input[i] = rand.Uint64() + } + } + + got := MaxUint64x2(tc.input) + want := lo.Max(tc.input) + + if got != want { + t.Errorf("MaxUint64x2() = %v, want %v", got, want) + } + }) + } +} + func TestMaxUint64x8(t *testing.T) { requireAVX512(t) testCases := []struct { diff --git a/exp/simd/math_sse_test.go b/exp/simd/math_avx_test.go similarity index 91% rename from exp/simd/math_sse_test.go rename to exp/simd/math_avx_test.go index ef234e6..c0da14c 100644 --- a/exp/simd/math_sse_test.go +++ b/exp/simd/math_avx_test.go @@ -10,6 +10,7 @@ import ( ) func TestSumInt8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int8 @@ -42,6 +43,7 @@ func TestSumInt8x16(t *testing.T) { } func TestSumInt16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int16 @@ -74,6 +76,7 @@ func TestSumInt16x8(t *testing.T) { } func TestSumInt32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int32 @@ -105,6 +108,7 @@ func TestSumInt32x4(t *testing.T) { } func TestSumInt64x2(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int64 @@ 
-136,6 +140,7 @@ func TestSumInt64x2(t *testing.T) { } func TestSumUint8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint8 @@ -167,6 +172,7 @@ func TestSumUint8x16(t *testing.T) { } func TestSumUint16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint16 @@ -198,6 +204,7 @@ func TestSumUint16x8(t *testing.T) { } func TestSumUint32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint32 @@ -228,6 +235,7 @@ func TestSumUint32x4(t *testing.T) { } func TestSumUint64x2(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint64 @@ -258,6 +266,7 @@ func TestSumUint64x2(t *testing.T) { } func TestSumFloat32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float32 @@ -291,6 +300,7 @@ func TestSumFloat32x4(t *testing.T) { } func TestSumFloat64x2(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float64 @@ -323,7 +333,8 @@ func TestSumFloat64x2(t *testing.T) { } // Test type aliases work correctly -func TestSSETypeAlias(t *testing.T) { +func TestAVXTypeAlias(t *testing.T) { + requireAVX(t) input := []myInt8{1, 2, 3, 4, 5} got := SumInt8x16(input) want := lo.Sum(input) @@ -334,6 +345,7 @@ func TestSSETypeAlias(t *testing.T) { } func TestClampInt8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int8 @@ -385,6 +397,7 @@ func TestClampInt8x16(t *testing.T) { } func TestClampInt16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int16 @@ -434,6 +447,7 @@ func TestClampInt16x8(t *testing.T) { } func TestClampInt32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int32 @@ -481,56 +495,8 @@ func TestClampInt32x4(t *testing.T) { } } -func TestClampInt64x2(t *testing.T) { - requireAVX512(t) - testCases := []struct { - name string - input []int64 - min int64 - max int64 - }{ - {"empty", []int64{}, -100, 100}, - {"single", 
[]int64{42}, -10, 10}, - {"small", []int64{1, 2, 3, 4, 5}, 2, 4}, - {"exactly 2", []int64{-100, 200}, -50, 50}, - {"large", make([]int64, 1000), -50, 50}, - {"all below min", []int64{-1000, -2000, -3000}, -500, 100}, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { - for i := range tc.input { - tc.input[i] = rand.Int64() - } - } - - got := ClampInt64x2(tc.input, tc.min, tc.max) - - if len(got) != len(tc.input) { - t.Errorf("ClampInt64x2() returned length %d, want %d", len(got), len(tc.input)) - } - - for i, v := range got { - if v < tc.min || v > tc.max { - t.Errorf("ClampInt64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max) - } - original := tc.input[i] - expected := original - if expected < tc.min { - expected = tc.min - } else if expected > tc.max { - expected = tc.max - } - if v != expected { - t.Errorf("ClampInt64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original) - } - } - }) - } -} - func TestClampUint8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint8 @@ -581,6 +547,7 @@ func TestClampUint8x16(t *testing.T) { } func TestClampUint16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint16 @@ -630,6 +597,7 @@ func TestClampUint16x8(t *testing.T) { } func TestClampUint32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint32 @@ -677,56 +645,8 @@ func TestClampUint32x4(t *testing.T) { } } -func TestClampUint64x2(t *testing.T) { - requireAVX512(t) - testCases := []struct { - name string - input []uint64 - min uint64 - max uint64 - }{ - {"empty", []uint64{}, 100, 1000}, - {"single", []uint64{42}, 10, 100}, - {"small", []uint64{1, 2, 3, 4, 5}, 2, 4}, - {"exactly 2", []uint64{50, 2000}, 100, 1000}, - {"large", make([]uint64, 1000), 500, 5000}, - {"all below min", []uint64{1, 2, 3}, 10, 100}, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t 
*testing.T) { - if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { - for i := range tc.input { - tc.input[i] = rand.Uint64() - } - } - - got := ClampUint64x2(tc.input, tc.min, tc.max) - - if len(got) != len(tc.input) { - t.Errorf("ClampUint64x2() returned length %d, want %d", len(got), len(tc.input)) - } - - for i, v := range got { - if v < tc.min || v > tc.max { - t.Errorf("ClampUint64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max) - } - original := tc.input[i] - expected := original - if expected < tc.min { - expected = tc.min - } else if expected > tc.max { - expected = tc.max - } - if v != expected { - t.Errorf("ClampUint64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original) - } - } - }) - } -} - func TestClampFloat32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float32 @@ -778,6 +698,7 @@ func TestClampFloat32x4(t *testing.T) { } func TestClampFloat64x2(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float64 @@ -829,7 +750,8 @@ func TestClampFloat64x2(t *testing.T) { } // Test type aliases work correctly -func TestSSEClampTypeAlias(t *testing.T) { +func TestAVXClampTypeAlias(t *testing.T) { + requireAVX(t) input := []myInt8{-5, 0, 10, 15, 20} min := myInt8(0) max := myInt8(10) @@ -853,6 +775,7 @@ func TestSSEClampTypeAlias(t *testing.T) { } func TestMeanInt8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int8 @@ -884,6 +807,7 @@ func TestMeanInt8x16(t *testing.T) { } func TestMeanInt16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int16 @@ -915,6 +839,7 @@ func TestMeanInt16x8(t *testing.T) { } func TestMeanInt32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int32 @@ -946,6 +871,7 @@ func TestMeanInt32x4(t *testing.T) { } func TestMeanInt64x2(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int64 @@ -977,6 +903,7 @@ func TestMeanInt64x2(t 
*testing.T) { } func TestMeanUint8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint8 @@ -1008,6 +935,7 @@ func TestMeanUint8x16(t *testing.T) { } func TestMeanUint16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint16 @@ -1039,6 +967,7 @@ func TestMeanUint16x8(t *testing.T) { } func TestMeanUint32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint32 @@ -1069,6 +998,7 @@ func TestMeanUint32x4(t *testing.T) { } func TestMeanUint64x2(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint64 @@ -1099,6 +1029,7 @@ func TestMeanUint64x2(t *testing.T) { } func TestMeanFloat32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float32 @@ -1132,6 +1063,7 @@ func TestMeanFloat32x4(t *testing.T) { } func TestMeanFloat64x2(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float64 @@ -1164,7 +1096,8 @@ func TestMeanFloat64x2(t *testing.T) { } // Test type aliases work correctly -func TestSSEMeanTypeAlias(t *testing.T) { +func TestAVXMeanTypeAlias(t *testing.T) { + requireAVX(t) input := []myInt8{1, 2, 3, 4, 5} got := MeanInt8x16(input) want := lo.Mean(input) @@ -1175,6 +1108,7 @@ func TestSSEMeanTypeAlias(t *testing.T) { } func TestMinInt8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int8 @@ -1206,6 +1140,7 @@ func TestMinInt8x16(t *testing.T) { } func TestMinInt16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int16 @@ -1237,6 +1172,7 @@ func TestMinInt16x8(t *testing.T) { } func TestMinInt32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int32 @@ -1267,39 +1203,8 @@ func TestMinInt32x4(t *testing.T) { } } -func TestMinInt64x2(t *testing.T) { - requireAVX512(t) - testCases := []struct { - name string - input []int64 - }{ - {"empty", []int64{}}, - {"single", []int64{42}}, - {"small", []int64{1, 2, 3, 4, 5}}, 
- {"exactly 2", []int64{1, 2}}, - {"large", make([]int64, 1000)}, - {"negative", []int64{-1, -2, -3, 4, 5}}, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { - for i := range tc.input { - tc.input[i] = rand.Int64() - } - } - - got := MinInt64x2(tc.input) - want := lo.Min(tc.input) - - if got != want { - t.Errorf("MinInt64x2() = %v, want %v", got, want) - } - }) - } -} - func TestMinUint8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint8 @@ -1331,6 +1236,7 @@ func TestMinUint8x16(t *testing.T) { } func TestMinUint16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint16 @@ -1362,6 +1268,7 @@ func TestMinUint16x8(t *testing.T) { } func TestMinUint32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint32 @@ -1391,38 +1298,8 @@ func TestMinUint32x4(t *testing.T) { } } -func TestMinUint64x2(t *testing.T) { - requireAVX512(t) - testCases := []struct { - name string - input []uint64 - }{ - {"empty", []uint64{}}, - {"single", []uint64{42}}, - {"small", []uint64{1, 2, 3, 4, 5}}, - {"exactly 2", []uint64{1, 2}}, - {"large", make([]uint64, 1000)}, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { - for i := range tc.input { - tc.input[i] = rand.Uint64() - } - } - - got := MinUint64x2(tc.input) - want := lo.Min(tc.input) - - if got != want { - t.Errorf("MinUint64x2() = %v, want %v", got, want) - } - }) - } -} - func TestMinFloat32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float32 @@ -1456,6 +1333,7 @@ func TestMinFloat32x4(t *testing.T) { } func TestMinFloat64x2(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float64 @@ -1488,7 +1366,8 @@ func TestMinFloat64x2(t *testing.T) { } // Test type aliases work correctly -func 
TestSSEMinTypeAlias(t *testing.T) { +func TestAVXMinTypeAlias(t *testing.T) { + requireAVX(t) input := []myInt8{5, 2, 8, 1, 9} got := MinInt8x16(input) want := myInt8(1) @@ -1499,6 +1378,7 @@ func TestSSEMinTypeAlias(t *testing.T) { } func TestMaxInt8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int8 @@ -1530,6 +1410,7 @@ func TestMaxInt8x16(t *testing.T) { } func TestMaxInt16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int16 @@ -1561,6 +1442,7 @@ func TestMaxInt16x8(t *testing.T) { } func TestMaxInt32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []int32 @@ -1591,39 +1473,8 @@ func TestMaxInt32x4(t *testing.T) { } } -func TestMaxInt64x2(t *testing.T) { - requireAVX512(t) - testCases := []struct { - name string - input []int64 - }{ - {"empty", []int64{}}, - {"single", []int64{42}}, - {"small", []int64{1, 2, 3, 4, 5}}, - {"exactly 2", []int64{1, 2}}, - {"large", make([]int64, 1000)}, - {"negative", []int64{-1, -2, -3, 4, 5}}, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { - for i := range tc.input { - tc.input[i] = rand.Int64() - } - } - - got := MaxInt64x2(tc.input) - want := lo.Max(tc.input) - - if got != want { - t.Errorf("MaxInt64x2() = %v, want %v", got, want) - } - }) - } -} - func TestMaxUint8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint8 @@ -1655,6 +1506,7 @@ func TestMaxUint8x16(t *testing.T) { } func TestMaxUint16x8(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint16 @@ -1686,6 +1538,7 @@ func TestMaxUint16x8(t *testing.T) { } func TestMaxUint32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []uint32 @@ -1715,38 +1568,8 @@ func TestMaxUint32x4(t *testing.T) { } } -func TestMaxUint64x2(t *testing.T) { - requireAVX512(t) - testCases := []struct { - name string - input 
[]uint64 - }{ - {"empty", []uint64{}}, - {"single", []uint64{42}}, - {"small", []uint64{1, 2, 3, 4, 5}}, - {"exactly 2", []uint64{1, 2}}, - {"large", make([]uint64, 1000)}, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { - for i := range tc.input { - tc.input[i] = rand.Uint64() - } - } - - got := MaxUint64x2(tc.input) - want := lo.Max(tc.input) - - if got != want { - t.Errorf("MaxUint64x2() = %v, want %v", got, want) - } - }) - } -} - func TestMaxFloat32x4(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float32 @@ -1780,6 +1603,7 @@ func TestMaxFloat32x4(t *testing.T) { } func TestMaxFloat64x2(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []float64 @@ -1812,7 +1636,8 @@ func TestMaxFloat64x2(t *testing.T) { } // Test type aliases work correctly -func TestSSEMaxTypeAlias(t *testing.T) { +func TestAVXMaxTypeAlias(t *testing.T) { + requireAVX(t) input := []myInt8{5, 2, 8, 1, 9} got := MaxInt8x16(input) want := myInt8(9) @@ -1831,6 +1656,7 @@ type item struct { } func TestSumByInt8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []item @@ -1863,6 +1689,7 @@ func TestSumByInt8x16(t *testing.T) { } func TestSumByInt16x8(t *testing.T) { + requireAVX(t) type itemInt16 struct { Value int16 } @@ -1898,6 +1725,7 @@ func TestSumByInt16x8(t *testing.T) { } func TestSumByInt32x4(t *testing.T) { + requireAVX(t) type itemInt32 struct { Value int32 } @@ -1933,6 +1761,7 @@ func TestSumByInt32x4(t *testing.T) { } func TestSumByInt64x2(t *testing.T) { + requireAVX(t) type itemInt64 struct { Value int64 } @@ -1968,6 +1797,7 @@ func TestSumByInt64x2(t *testing.T) { } func TestSumByUint8x16(t *testing.T) { + requireAVX(t) type itemUint8 struct { Value uint8 } @@ -2003,6 +1833,7 @@ func TestSumByUint8x16(t *testing.T) { } func TestSumByUint16x8(t *testing.T) { + requireAVX(t) type itemUint16 struct { Value 
uint16 } @@ -2038,6 +1869,7 @@ func TestSumByUint16x8(t *testing.T) { } func TestSumByUint32x4(t *testing.T) { + requireAVX(t) type itemUint32 struct { Value uint32 } @@ -2072,6 +1904,7 @@ func TestSumByUint32x4(t *testing.T) { } func TestSumByUint64x2(t *testing.T) { + requireAVX(t) type itemUint64 struct { Value uint64 } @@ -2106,6 +1939,7 @@ func TestSumByUint64x2(t *testing.T) { } func TestSumByFloat32x4(t *testing.T) { + requireAVX(t) type itemFloat32 struct { Value float32 } @@ -2143,6 +1977,7 @@ func TestSumByFloat32x4(t *testing.T) { } func TestSumByFloat64x2(t *testing.T) { + requireAVX(t) type itemFloat64 struct { Value float64 } @@ -2179,7 +2014,8 @@ func TestSumByFloat64x2(t *testing.T) { } // Test type alias works correctly for SumBy -func TestSSESumByTypeAlias(t *testing.T) { +func TestAVXSumByTypeAlias(t *testing.T) { + requireAVX(t) type myItem struct { Value myInt8 } @@ -2196,6 +2032,7 @@ func TestSSESumByTypeAlias(t *testing.T) { // MeanBy tests func TestMeanByInt8x16(t *testing.T) { + requireAVX(t) testCases := []struct { name string input []item @@ -2227,6 +2064,7 @@ func TestMeanByInt8x16(t *testing.T) { } func TestMeanByInt16x8(t *testing.T) { + requireAVX(t) type itemInt16 struct { Value int16 } @@ -2262,6 +2100,7 @@ func TestMeanByInt16x8(t *testing.T) { } func TestMeanByInt32x4(t *testing.T) { + requireAVX(t) type itemInt32 struct { Value int32 } @@ -2297,6 +2136,7 @@ func TestMeanByInt32x4(t *testing.T) { } func TestMeanByInt64x2(t *testing.T) { + requireAVX(t) type itemInt64 struct { Value int64 } @@ -2332,6 +2172,7 @@ func TestMeanByInt64x2(t *testing.T) { } func TestMeanByUint8x16(t *testing.T) { + requireAVX(t) type itemUint8 struct { Value uint8 } @@ -2367,6 +2208,7 @@ func TestMeanByUint8x16(t *testing.T) { } func TestMeanByUint16x8(t *testing.T) { + requireAVX(t) type itemUint16 struct { Value uint16 } @@ -2402,6 +2244,7 @@ func TestMeanByUint16x8(t *testing.T) { } func TestMeanByUint32x4(t *testing.T) { + requireAVX(t) type 
itemUint32 struct { Value uint32 } @@ -2436,6 +2279,7 @@ func TestMeanByUint32x4(t *testing.T) { } func TestMeanByUint64x2(t *testing.T) { + requireAVX(t) type itemUint64 struct { Value uint64 } @@ -2470,6 +2314,7 @@ func TestMeanByUint64x2(t *testing.T) { } func TestMeanByFloat32x4(t *testing.T) { + requireAVX(t) type itemFloat32 struct { Value float32 } @@ -2507,6 +2352,7 @@ func TestMeanByFloat32x4(t *testing.T) { } func TestMeanByFloat64x2(t *testing.T) { + requireAVX(t) type itemFloat64 struct { Value float64 } @@ -2543,7 +2389,8 @@ func TestMeanByFloat64x2(t *testing.T) { } // Test type alias works correctly for MeanBy -func TestSSEMeanByTypeAlias(t *testing.T) { +func TestAVXMeanByTypeAlias(t *testing.T) { + requireAVX(t) type myItem struct { Value myInt8 } diff --git a/exp/simd/math_bench_test.go b/exp/simd/math_bench_test.go index ad35d2f..31918b5 100644 --- a/exp/simd/math_bench_test.go +++ b/exp/simd/math_bench_test.go @@ -13,15 +13,15 @@ import ( // Benchmark suite for SIMD math operations compared to core lo package fallbacks. // These benchmarks measure the performance of Sum, Mean, Min, and Max operations -// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes. +// across different SIMD implementations (AVX, AVX2, AVX512) and data sizes. 
// Benchmark sizes to demonstrate performance characteristics at different scales var benchmarkSizes = []struct { name string size int }{ - {"small", 8}, // Smaller than SSE width (16 lanes for int8) - {"medium", 128}, // Between SSE (16) and AVX2 (32) width for int8 + {"small", 8}, // Smaller than AVX width (16 lanes for int8) + {"medium", 128}, // Between AVX (16) and AVX2 (32) width for int8 {"large", 1024}, // Well above SIMD register widths {"xlarge", 8192}, // Large dataset for real-world performance } @@ -128,7 +128,8 @@ func BenchmarkSumInt8(b *testing.B) { _ = lo.Sum(data) } }) - b.Run("SSE-x16", func(b *testing.B) { + b.Run("AVX-x16", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = SumInt8x16(data) @@ -162,7 +163,8 @@ func BenchmarkSumInt16(b *testing.B) { _ = lo.Sum(data) } }) - b.Run("SSE-x8", func(b *testing.B) { + b.Run("AVX-x8", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = SumInt16x8(data) @@ -196,7 +198,8 @@ func BenchmarkSumInt32(b *testing.B) { _ = lo.Sum(data) } }) - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX-x4", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = SumInt32x4(data) @@ -230,7 +233,8 @@ func BenchmarkSumInt64(b *testing.B) { _ = lo.Sum(data) } }) - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX-x2", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = SumInt64x2(data) @@ -264,7 +268,8 @@ func BenchmarkSumFloat32(b *testing.B) { _ = lo.Sum(data) } }) - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX-x4", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = SumFloat32x4(data) @@ -298,7 +303,8 @@ func BenchmarkSumFloat64(b *testing.B) { _ = lo.Sum(data) } }) - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX-x2", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = SumFloat64x2(data) @@ -336,7 +342,8 @@ func 
BenchmarkMeanInt32(b *testing.B) { _ = lo.Mean(data) } }) - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX-x4", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = MeanInt32x4(data) @@ -370,7 +377,8 @@ func BenchmarkMeanFloat64(b *testing.B) { _ = lo.Mean(data) } }) - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX-x2", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = MeanFloat64x2(data) @@ -402,7 +410,8 @@ func BenchmarkMinInt32(b *testing.B) { for _, bs := range benchmarkSizes { b.Run(bs.name, func(b *testing.B) { data := generateInt32(bs.size) - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX-x4", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = MinInt32x4(data) @@ -430,7 +439,8 @@ func BenchmarkMinFloat64(b *testing.B) { for _, bs := range benchmarkSizes { b.Run(bs.name, func(b *testing.B) { data := generateFloat64(bs.size) - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX-x2", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = MinFloat64x2(data) @@ -462,7 +472,8 @@ func BenchmarkMaxInt32(b *testing.B) { for _, bs := range benchmarkSizes { b.Run(bs.name, func(b *testing.B) { data := generateInt32(bs.size) - b.Run("SSE-x4", func(b *testing.B) { + b.Run("AVX-x4", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = MaxInt32x4(data) @@ -490,7 +501,8 @@ func BenchmarkMaxFloat64(b *testing.B) { for _, bs := range benchmarkSizes { b.Run(bs.name, func(b *testing.B) { data := generateFloat64(bs.size) - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX-x2", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = MaxFloat64x2(data) @@ -528,13 +540,16 @@ func BenchmarkSumInt8ByWidth(b *testing.B) { fn func() int8 }{ {"Fallback-lo", func() int8 { return lo.Sum(data) }}, - {"SSE-x16", func() int8 { return SumInt8x16(data) }}, + {"AVX-x16", func() int8 { return 
SumInt8x16(data) }}, {"AVX2-x32", func() int8 { return SumInt8x32(data) }}, {"AVX512-x64", func() int8 { return SumInt8x64(data) }}, } for _, bm := range benchmarks { b.Run(bm.name, func(b *testing.B) { + if bm.name == "AVX-x16" { + requireAVX(b) + } if bm.name == "AVX2-x32" { requireAVX2(b) } @@ -578,7 +593,8 @@ func BenchmarkSumInt64SteadyState(b *testing.B) { _ = lo.Sum(data) } }) - b.Run("SSE-x2", func(b *testing.B) { + b.Run("AVX-x2", func(b *testing.B) { + requireAVX(b) b.ReportAllocs() for i := 0; i < b.N; i++ { _ = SumInt64x2(data) diff --git a/exp/simd/simd_test.go b/exp/simd/simd_test.go index 6366c21..2fb8359 100644 --- a/exp/simd/simd_test.go +++ b/exp/simd/simd_test.go @@ -24,13 +24,15 @@ func init() { } // Type aliases for testing -type myInt8 int8 -type myInt16 int16 -type myInt32 int32 -type myInt64 int64 -type myUint8 uint8 -type myUint16 uint16 -type myUint32 uint32 -type myUint64 uint64 -type myFloat32 float32 -type myFloat64 float64 +type ( + myInt8 int8 + myInt16 int16 + myInt32 int32 + myInt64 int64 + myUint8 uint8 + myUint16 uint16 + myUint32 uint32 + myUint64 uint64 + myFloat32 float32 + myFloat64 float64 +)