style(simd): rename sse to avx (#821)

* style(simd): rename sse to avx * fix(exp,simd): apply the right avx512 constraints to a few methods * fix(exp,simd): apply the right avx512 constraints to a few methods
2026-04-22 15:37:14 +08:00 · 2026-02-26 22:08:53 +01:00
parent c49f84658a
commit ac8295b68a
21 changed files with 964 additions and 853 deletions
@@ -1,7 +1,7 @@
 ---
 name: Clamp
 slug: clamp
-sourceRef: exp/simd/math_sse.go#L424
+sourceRef: exp/simd/math_avx.go#L453
 category: exp
 subCategory: simd
 similarHelpers:
@@ -51,7 +51,7 @@ Clamps each element in a collection between min and max values using SIMD instru

 | SIMD variant | Lanes | Required flags | Typical CPUs                   |
 | ------------ | ----- | -------------- | ------------------------------ |
-| SSE (xN)     | 2-16  | `sse2`         | All amd64                      |
+| AVX (xN)     | 2-16  | `avx`          | Sandy Bridge+, AMD Bulldozer+  |
 | AVX2 (xN)    | 4-32  | `avx2`         | Intel Haswell+, AMD Excavator+ |
 | AVX-512 (xN) | 8-64  | `avx512f`      | Intel Skylake-X+, some Xeons   |

@@ -70,7 +70,7 @@ result := simd.ClampFloat32x16([]float32{0.5, 1.5, 2.5, 3.5}, 1.0, 3.0)
 ```

 ```go
-// Using SSE variant (8 lanes at once) - works on all amd64
+// Using AVX variant (8 lanes at once) - works on any amd64 CPU with AVX
 result := simd.ClampInt16x8([]int16{100, 150, 200, 250}, 120, 220)
 // []int16{120, 150, 200, 220}
 ```
@@ -1,7 +1,7 @@
 ---
 name: Contains
 slug: contains
-sourceRef: exp/simd/intersect_sse.go#L11
+sourceRef: exp/simd/intersect_avx512.go#L9
 category: exp
 subCategory: simd
 similarHelpers:
@@ -51,7 +51,7 @@ Checks if a target value is present in a collection using SIMD instructions. The

 | SIMD variant | Lanes | Required flags | Typical CPUs                   |
 | ------------ | ----- | -------------- | ------------------------------ |
-| SSE (xN)     | 2-16  | `sse2`         | All amd64                      |
+| AVX (xN)     | 2-16  | `avx`          | All amd64                      |
 | AVX2 (xN)    | 4-32  | `avx2`         | Intel Haswell+, AMD Excavator+ |
 | AVX-512 (xN) | 8-64  | `avx512f`      | Intel Skylake-X+, some Xeons   |

@@ -64,7 +64,7 @@ found := simd.ContainsInt8x32([]int8{1, 2, 3, 4, 5}, 3)
 ```

 ```go
-// Using SSE variant (16 lanes at once) - works on all amd64
+// Using AVX variant (2 lanes at once) - works on all amd64
 found := simd.ContainsInt64x2([]int64{1000000, 2000000, 3000000}, 2000000)
 // true
 ```
@@ -1,7 +1,7 @@
 ---
 name: Max
 slug: max
-sourceRef: exp/simd/math_sse.go#L1328
+sourceRef: exp/simd/math_avx.go#L1279
 category: exp
 subCategory: simd
 similarHelpers:
@@ -51,7 +51,7 @@ Finds the maximum value in a collection using SIMD instructions. The suffix (x2,

 | SIMD variant | Lanes | Required flags | Typical CPUs                   |
 | ------------ | ----- | -------------- | ------------------------------ |
-| SSE (xN)     | 2-16  | `sse2`         | All amd64                      |
+| AVX (xN)     | 2-16  | `avx`          | Sandy Bridge+, AMD Bulldozer+  |
 | AVX2 (xN)    | 4-32  | `avx2`         | Intel Haswell+, AMD Excavator+ |
 | AVX-512 (xN) | 8-64  | `avx512f`      | Intel Skylake-X+, some Xeons   |

@@ -70,7 +70,7 @@ max := simd.MaxFloat32x16([]float32{3.5, 1.2, 4.8, 2.1})
 ```

 ```go
-// Using SSE variant (4 lanes at once) - works on all amd64
+// Using AVX variant (4 lanes at once) - works on any amd64 CPU with AVX
 max := simd.MaxInt32x4([]int32{100, 50, 200, 75})
 // 200
 ```
@@ -1,7 +1,7 @@
 ---
 name: Mean
 slug: mean
-sourceRef: exp/simd/math_sse.go#L333
+sourceRef: exp/simd/math_avx.go#L352
 category: exp
 subCategory: simd
 similarHelpers:
@@ -52,7 +52,7 @@ Calculates the arithmetic mean of a collection using SIMD instructions. The suff

 | SIMD variant | Lanes | Required flags | Typical CPUs                   |
 | ------------ | ----- | -------------- | ------------------------------ |
-| SSE (xN)     | 2-16  | `sse2`         | All amd64                      |
+| AVX (xN)     | 2-16  | `avx`          | Sandy Bridge+, AMD Bulldozer+  |
 | AVX2 (xN)    | 4-32  | `avx2`         | Intel Haswell+, AMD Excavator+ |
 | AVX-512 (xN) | 8-64  | `avx512f`      | Intel Skylake-X+, some Xeons   |

@@ -71,7 +71,7 @@ mean := simd.MeanFloat32x16([]float32{1.0, 2.0, 3.0, 4.0})
 ```

 ```go
-// Using SSE variant (8 lanes at once) - works on all amd64
+// Using AVX variant (8 lanes at once) - works on any amd64 CPU with AVX
 mean := simd.MeanInt16x8([]int16{10, 20, 30, 40})
 // 25
 ```
@@ -62,7 +62,7 @@ MeanBy transforms a collection using an iteratee function and calculates the ari

 | SIMD variant | Lanes | Required flags | Typical CPUs                   |
 | ------------ | ----- | -------------- | ------------------------------ |
-| SSE (xN)     | 2-16  | `sse2`         | All amd64                      |
+| AVX (xN)     | 2-16  | `avx`          | Sandy Bridge+, AMD Bulldozer+  |
 | AVX2 (xN)    | 4-32  | `avx2`         | Intel Haswell+, AMD Excavator+ |
 | AVX-512 (xN) | 8-64  | `avx512f`      | Intel Skylake-X+, some Xeons   |

@@ -118,7 +118,7 @@ metrics := []Metric{
    {Value: 400},
 }

-// Using SSE variant - works on all amd64
+// Using AVX variant - works on any amd64 CPU with AVX
 mean := simd.MeanByUint16x8(metrics, func(m Metric) uint16 {
    return m.Value
 })
@@ -1,7 +1,7 @@
 ---
 name: Min
 slug: min
-sourceRef: exp/simd/math_sse.go#L834
+sourceRef: exp/simd/math_avx.go#L833
 category: exp
 subCategory: simd
 similarHelpers:
@@ -51,7 +51,7 @@ Finds the minimum value in a collection using SIMD instructions. The suffix (x2,

 | SIMD variant | Lanes | Required flags | Typical CPUs                   |
 | ------------ | ----- | -------------- | ------------------------------ |
-| SSE (xN)     | 2-16  | `sse2`         | All amd64                      |
+| AVX (xN)     | 2-16  | `avx`          | Sandy Bridge+, AMD Bulldozer+  |
 | AVX2 (xN)    | 4-32  | `avx2`         | Intel Haswell+, AMD Excavator+ |
 | AVX-512 (xN) | 8-64  | `avx512f`      | Intel Skylake-X+, some Xeons   |

@@ -70,7 +70,7 @@ min := simd.MinFloat32x16([]float32{3.5, 1.2, 4.8, 2.1})
 ```

 ```go
-// Using SSE variant (4 lanes at once) - works on all amd64
+// Using AVX variant (4 lanes at once) - works on any amd64 CPU with AVX
 min := simd.MinInt32x4([]int32{100, 50, 200, 75})
 // 50
 ```
@@ -1,7 +1,7 @@
 ---
 name: Sum
 slug: sum
-sourceRef: exp/simd/math_sse.go#L13
+sourceRef: exp/simd/math_avx.go#L14
 category: exp
 subCategory: simd
 similarHelpers:
@@ -52,7 +52,7 @@ Sums the values in a collection using SIMD instructions. The suffix (x2, x4, x8,

 | SIMD variant | Lanes | Required flags | Typical CPUs                   |
 | ------------ | ----- | -------------- | ------------------------------ |
-| SSE (xN)     | 2-16  | `sse2`         | All amd64                      |
+| AVX (xN)     | 2-16  | `avx`          | Sandy Bridge+, AMD Bulldozer+  |
 | AVX2 (xN)    | 4-32  | `avx2`         | Intel Haswell+, AMD Excavator+ |
 | AVX-512 (xN) | 8-64  | `avx512f`      | Intel Skylake-X+, some Xeons   |

@@ -71,7 +71,7 @@ sum := simd.SumFloat32x16([]float32{1.1, 2.2, 3.3, 4.4})
 ```

 ```go
-// Using SSE variant (4 lanes at once) - works on all amd64
+// Using AVX variant (4 lanes at once) - works on any amd64 CPU with AVX
 sum := simd.SumInt32x4([]int32{1000000, 2000000, 3000000})
 // 6000000
 ```
@@ -62,7 +62,7 @@ SumBy transforms a collection using an iteratee function and sums the result usi

 | SIMD variant | Lanes | Required flags | Typical CPUs                   |
 | ------------ | ----- | -------------- | ------------------------------ |
-| SSE (xN)     | 2-16  | `sse2`         | All amd64                      |
+| AVX (xN)     | 2-16  | `avx`          | Sandy Bridge+, AMD Bulldozer+  |
 | AVX2 (xN)    | 4-32  | `avx2`         | Intel Haswell+, AMD Excavator+ |
 | AVX-512 (xN) | 8-64  | `avx512f`      | Intel Skylake-X+, some Xeons   |

@@ -119,7 +119,7 @@ metrics := []Metric{
    {Value: 400},
 }

-// Using SSE variant - works on all amd64
+// Using AVX variant - works on any amd64 CPU with AVX
 sum := simd.SumByUint16x8(metrics, func(m Metric) uint16 {
    return m.Value
 })
@@ -1,6 +1,6 @@
 ---
 title: SIMD operations
-description: High-performance slice operations using SSE, AVX2 and AVX512 SIMD when built with Go 1.26+ and GOEXPERIMENT=simd on amd64.
+description: High-performance slice operations using AVX, AVX2 and AVX512 SIMD when built with Go 1.26+ and GOEXPERIMENT=simd on amd64.
 sidebar_position: 0
 hide_table_of_contents: true
 ---
@@ -14,7 +14,7 @@ Your feedback helps us improve!
 #
 ## SIMD helpers

-This page lists all operations on slices, available in the `exp/simd` sub-package. These helpers use **SSE** (128-bit), **AVX2** (256-bit) or **AVX512** (512-bit) SIMD when built with Go 1.26+, the `GOEXPERIMENT=simd` flag, and on amd64.
+This page lists all slice operations available in the `exp/simd` sub-package. These helpers use **AVX** (128-bit), **AVX2** (256-bit) or **AVX512** (512-bit) SIMD when built for amd64 with Go 1.26+ and `GOEXPERIMENT=simd`.

 :::warning Unstable API
 SIMD helpers are experimental. The API may break in the future.
@@ -26,7 +26,7 @@ Benchmarks show that running SIMD operators on small datasets is slower:

 ```txt
 BenchmarkSumInt8/small/Fallback-lo-4             203616572        5.875 ns/op
-BenchmarkSumInt8/small/SSE-x16-4                 100000000        12.04 ns/op
+BenchmarkSumInt8/small/AVX-x16-4                 100000000        12.04 ns/op
 BenchmarkSumInt8/small/AVX2-x32-4                 64041816        17.93 ns/op
 BenchmarkSumInt8/small/AVX512-x64-4               26947528        44.75 ns/op
 ```
@@ -35,7 +35,7 @@ But much much faster on big datasets:

 ```txt
 BenchmarkSumInt8/xlarge/Fallback-lo-4               247677       4860 ns/op
-BenchmarkSumInt8/xlarge/SSE-x16-4                  3851040      311.4 ns/op
+BenchmarkSumInt8/xlarge/AVX-x16-4                  3851040      311.4 ns/op
 BenchmarkSumInt8/xlarge/AVX2-x32-4                 7100002      169.2 ns/op
 BenchmarkSumInt8/xlarge/AVX512-x64-4              10107534      118.1 ns/op
 ```
@@ -6,7 +6,7 @@ Benchmarks show that running SIMD operations on small datasets is slower:

 ```txt
 BenchmarkSumInt8/small/Fallback-lo-2            	248740710	        5.218 ns/op
-BenchmarkSumInt8/small/SSE-x16-2                	126181464	        9.485 ns/op
+BenchmarkSumInt8/small/AVX-x16-2                	126181464	        9.485 ns/op
 BenchmarkSumInt8/small/AVX2-x32-2               	 73059427	        14.44 ns/op
 BenchmarkSumInt8/small/AVX512-x64-2             	 49913169	        24.41 ns/op
 ```
@@ -15,7 +15,7 @@ But SIMD is much faster on large datasets:

 ```txt
 BenchmarkSumInt8/xlarge/Fallback-lo-2           	  273898	         4383 ns/op
-BenchmarkSumInt8/xlarge/SSE-x16-2               	 6928408	        173.1 ns/op
+BenchmarkSumInt8/xlarge/AVX-x16-2               	 6928408	        173.1 ns/op
 BenchmarkSumInt8/xlarge/AVX2-x32-2              	12639586	        94.09 ns/op
 BenchmarkSumInt8/xlarge/AVX512-x64-2            	13509693	        89.67 ns/op
 ```
@@ -50,397 +50,397 @@ ok  	github.com/samber/lo/exp/simd	596.213s

 | Benchmark                                      | Iterations | Time/op     | Bytes/op | Allocs/op   |
 | ---------------------------------------------- | ---------- | ----------- | -------- | ----------- |
-| BenchmarkContainsInt8/tiny/SSE-x16-2           | 312359204  | 3.625 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/tiny/AVX2-x32-2          | 277194441  | 4.531 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/tiny/AVX512-x16-2        | 312359204  | 3.625 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/tiny/AVX512-x32-2        | 277194441  | 4.531 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt8/tiny/AVX512-x64-2        | 336853209  | 3.401 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/small/SSE-x16-2          | 449132103  | 2.670 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/small/AVX2-x32-2         | 148648339  | 8.332 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/small/AVX512-x16-2       | 449132103  | 2.670 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/small/AVX512-x32-2       | 148648339  | 8.332 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt8/small/AVX512-x64-2       | 143124861  | 7.982 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/medium/SSE-x16-2         | 276816714  | 4.302 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/medium/AVX2-x32-2        | 345774957  | 3.529 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/medium/AVX512-x16-2      | 276816714  | 4.302 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/medium/AVX512-x32-2      | 345774957  | 3.529 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt8/medium/AVX512-x64-2      | 449868722  | 2.669 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/large/SSE-x16-2          | 100000000  | 10.68 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/large/AVX2-x32-2         | 172934200  | 6.941 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/large/AVX512-x16-2       | 100000000  | 10.68 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/large/AVX512-x32-2       | 172934200  | 6.941 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt8/large/AVX512-x64-2       | 280992625  | 4.384 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/xlarge/SSE-x16-2         | 187189599  | 6.203 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/xlarge/AVX2-x32-2        | 274289563  | 4.042 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/xlarge/AVX512-x16-2      | 187189599  | 6.203 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/xlarge/AVX512-x32-2      | 274289563  | 4.042 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt8/xlarge/AVX512-x64-2      | 375048555  | 2.953 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/massive/SSE-x16-2        | 86434948   | 14.02 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8/massive/AVX2-x32-2       | 153742346  | 8.012 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/massive/AVX512-x16-2     | 86434948   | 14.02 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8/massive/AVX512-x32-2     | 153742346  | 8.012 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt8/massive/AVX512-x64-2     | 259404483  | 5.214 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/tiny/SSE-x8-2           | 270309470  | 4.315 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/tiny/AVX2-x16-2         | 264874646  | 4.281 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/tiny/AVX512-x8-2        | 270309470  | 4.315 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/tiny/AVX512-x16-2       | 264874646  | 4.281 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt16/tiny/AVX512-x32-2       | 328810479  | 3.593 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/small/SSE-x8-2          | 374742561  | 3.206 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/small/AVX2-x16-2        | 449838870  | 2.678 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/small/AVX512-x8-2       | 374742561  | 3.206 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/small/AVX512-x16-2      | 449838870  | 2.678 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt16/small/AVX512-x32-2      | 143845734  | 8.484 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/medium/SSE-x8-2         | 185415590  | 6.448 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/medium/AVX2-x16-2       | 273780868  | 4.268 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/medium/AVX512-x8-2      | 185415590  | 6.448 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/medium/AVX512-x16-2     | 273780868  | 4.268 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt16/medium/AVX512-x32-2     | 350067484  | 3.431 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/large/SSE-x8-2          | 61109778   | 19.66 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/large/AVX2-x16-2        | 100000000  | 10.74 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/large/AVX512-x8-2       | 61109778   | 19.66 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/large/AVX512-x16-2      | 100000000  | 10.74 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt16/large/AVX512-x32-2      | 182886646  | 6.575 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/xlarge/SSE-x8-2         | 15220682   | 71.53 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/xlarge/AVX2-x16-2       | 31876572   | 37.57 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/xlarge/AVX512-x8-2      | 15220682   | 71.53 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/xlarge/AVX512-x16-2     | 31876572   | 37.57 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt16/xlarge/AVX512-x32-2     | 61992217   | 19.55 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/massive/SSE-x8-2        | 4372000    | 262.8 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt16/massive/AVX2-x16-2      | 9019658    | 131.1 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/massive/AVX512-x8-2     | 4372000    | 262.8 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt16/massive/AVX512-x16-2    | 9019658    | 131.1 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt16/massive/AVX512-x32-2    | 16568430   | 74.25 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/tiny/SSE-x4-2           | 499209442  | 2.406 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/tiny/AVX2-x8-2          | 350479609  | 3.433 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/tiny/AVX512-x4-2        | 499209442  | 2.406 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/tiny/AVX512-x8-2        | 350479609  | 3.433 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt32/tiny/AVX512-x16-2       | 280918554  | 4.309 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/small/SSE-x4-2          | 299561596  | 4.028 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/small/AVX2-x8-2         | 374064310  | 3.205 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/small/AVX512-x4-2       | 299561596  | 4.028 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/small/AVX512-x8-2       | 374064310  | 3.205 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt32/small/AVX512-x16-2      | 499219765  | 2.418 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/medium/SSE-x4-2         | 100000000  | 10.42 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/medium/AVX2-x8-2        | 187391635  | 6.403 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/medium/AVX512-x4-2      | 100000000  | 10.42 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/medium/AVX512-x8-2      | 187391635  | 6.403 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt32/medium/AVX512-x16-2     | 307955800  | 3.875 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/large/SSE-x4-2          | 33256420   | 36.05 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/large/AVX2-x8-2         | 62421526   | 19.23 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/large/AVX512-x4-2       | 33256420   | 36.05 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/large/AVX512-x8-2       | 62421526   | 19.23 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt32/large/AVX512-x16-2      | 100000000  | 10.36 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/xlarge/SSE-x4-2         | 8328856    | 144.9 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/xlarge/AVX2-x8-2        | 17039037   | 71.14 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/xlarge/AVX512-x4-2      | 8328856    | 144.9 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/xlarge/AVX512-x8-2      | 17039037   | 71.14 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt32/xlarge/AVX512-x16-2     | 28740241   | 41.77 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/massive/SSE-x4-2        | 3525885    | 332.3 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt32/massive/AVX2-x8-2       | 7318027    | 164.5 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/massive/AVX512-x4-2     | 3525885    | 332.3 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt32/massive/AVX512-x8-2     | 7318027    | 164.5 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt32/massive/AVX512-x16-2    | 12181366   | 99.08 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/tiny/SSE-x2-2           | 409014308  | 2.934 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/tiny/AVX2-x4-2          | 449210791  | 2.667 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/tiny/AVX512-x2-2        | 409014308  | 2.934 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/tiny/AVX512-x4-2        | 449210791  | 2.667 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt64/tiny/AVX512-x8-2        | 280998146  | 4.293 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/small/SSE-x2-2          | 195631429  | 6.172 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/small/AVX2-x4-2         | 281272394  | 4.308 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/small/AVX512-x2-2       | 195631429  | 6.172 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/small/AVX512-x4-2       | 281272394  | 4.308 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt64/small/AVX512-x8-2       | 408933924  | 3.044 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/medium/SSE-x2-2         | 63006909   | 18.94 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/medium/AVX2-x4-2        | 100000000  | 10.67 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/medium/AVX512-x2-2      | 63006909   | 18.94 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/medium/AVX512-x4-2      | 100000000  | 10.67 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt64/medium/AVX512-x8-2      | 197411126  | 6.016 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/large/SSE-x2-2          | 17098578   | 70.57 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/large/AVX2-x4-2         | 32558013   | 37.07 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/large/AVX512-x2-2       | 17098578   | 70.57 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/large/AVX512-x4-2       | 32558013   | 37.07 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt64/large/AVX512-x8-2       | 57629485   | 20.94 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/xlarge/SSE-x2-2         | 4286155    | 281.8 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/xlarge/AVX2-x4-2        | 8344772    | 143.8 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/xlarge/AVX512-x2-2      | 4286155    | 281.8 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/xlarge/AVX512-x4-2      | 8344772    | 143.8 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt64/xlarge/AVX512-x8-2      | 14428276   | 83.14 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/massive/SSE-x2-2        | 1000000    | 1012 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64/massive/AVX2-x4-2       | 2350525    | 510.6 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/massive/AVX512-x2-2     | 1000000    | 1012 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64/massive/AVX512-x4-2     | 2350525    | 510.6 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt64/massive/AVX512-x8-2     | 3773523    | 318.1 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/tiny/SSE-x16-2          | 338880315  | 3.332 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/tiny/AVX2-x32-2         | 320784217  | 3.559 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/tiny/AVX512-x16-2       | 338880315  | 3.332 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/tiny/AVX512-x32-2       | 320784217  | 3.559 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint8/tiny/AVX512-x64-2       | 341599854  | 3.331 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/small/SSE-x16-2         | 449579424  | 2.670 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/small/AVX2-x32-2        | 140368142  | 8.648 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/small/AVX512-x16-2      | 449579424  | 2.670 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/small/AVX512-x32-2      | 140368142  | 8.648 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint8/small/AVX512-x64-2      | 146828888  | 8.182 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/medium/SSE-x16-2        | 374443974  | 3.472 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/medium/AVX2-x32-2       | 449271607  | 2.672 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/medium/AVX512-x16-2     | 374443974  | 3.472 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/medium/AVX512-x32-2     | 449271607  | 2.672 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint8/medium/AVX512-x64-2     | 598525731  | 2.018 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/large/SSE-x16-2         | 254828565  | 4.956 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/large/AVX2-x32-2        | 407777484  | 2.938 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/large/AVX512-x16-2      | 254828565  | 4.956 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/large/AVX512-x32-2      | 407777484  | 2.938 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint8/large/AVX512-x64-2      | 443472316  | 2.666 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/xlarge/SSE-x16-2        | 162196827  | 7.867 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/xlarge/AVX2-x32-2       | 268324950  | 4.518 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/xlarge/AVX512-x16-2     | 162196827  | 7.867 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/xlarge/AVX512-x32-2     | 268324950  | 4.518 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint8/xlarge/AVX512-x64-2     | 400437789  | 2.952 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/massive/SSE-x16-2       | 214548872  | 5.640 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint8/massive/AVX2-x32-2      | 348431553  | 3.391 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/massive/AVX512-x16-2    | 214548872  | 5.640 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint8/massive/AVX512-x32-2    | 348431553  | 3.391 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint8/massive/AVX512-x64-2    | 459781908  | 2.455 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/tiny/SSE-x8-2          | 276271912  | 4.297 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/tiny/AVX2-x16-2        | 281145528  | 4.270 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/tiny/AVX512-x8-2       | 276271912  | 4.297 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/tiny/AVX512-x16-2      | 281145528  | 4.270 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint16/tiny/AVX512-x32-2      | 315343911  | 3.667 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/small/SSE-x8-2         | 374632351  | 3.204 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/small/AVX2-x16-2       | 449355727  | 2.670 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/small/AVX512-x8-2      | 374632351  | 3.204 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/small/AVX512-x16-2     | 449355727  | 2.670 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint16/small/AVX512-x32-2     | 138088146  | 8.395 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/medium/SSE-x8-2        | 187276191  | 6.582 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/medium/AVX2-x16-2      | 281107980  | 4.306 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/medium/AVX512-x8-2     | 187276191  | 6.582 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/medium/AVX512-x16-2    | 281107980  | 4.306 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint16/medium/AVX512-x32-2    | 358850328  | 3.516 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/large/SSE-x8-2         | 59025931   | 19.98 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/large/AVX2-x16-2       | 100000000  | 10.68 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/large/AVX512-x8-2      | 59025931   | 19.98 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/large/AVX512-x16-2     | 100000000  | 10.68 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint16/large/AVX512-x32-2     | 179631354  | 6.569 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/xlarge/SSE-x8-2        | 16576267   | 71.63 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/xlarge/AVX2-x16-2      | 32578981   | 36.96 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/xlarge/AVX512-x8-2     | 16576267   | 71.63 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/xlarge/AVX512-x16-2    | 32578981   | 36.96 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint16/xlarge/AVX512-x32-2    | 61464870   | 19.44 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/massive/SSE-x8-2       | 2153736    | 557.4 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint16/massive/AVX2-x16-2     | 4225728    | 281.3 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/massive/AVX512-x8-2    | 2153736    | 557.4 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint16/massive/AVX512-x16-2   | 4225728    | 281.3 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint16/massive/AVX512-x32-2   | 7829936    | 145.1 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/tiny/SSE-x4-2          | 499390296  | 2.403 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/tiny/AVX2-x8-2         | 362964080  | 3.342 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/tiny/AVX512-x4-2       | 499390296  | 2.403 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/tiny/AVX512-x8-2       | 362964080  | 3.342 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint32/tiny/AVX512-x16-2      | 281063364  | 4.268 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/small/SSE-x4-2         | 293867554  | 4.004 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/small/AVX2-x8-2        | 374510434  | 3.203 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/small/AVX512-x4-2      | 293867554  | 4.004 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/small/AVX512-x8-2      | 374510434  | 3.203 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint32/small/AVX512-x16-2     | 499714206  | 2.402 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/medium/SSE-x4-2        | 100000000  | 10.42 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/medium/AVX2-x8-2       | 187258657  | 6.405 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/medium/AVX512-x4-2     | 100000000  | 10.42 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/medium/AVX512-x8-2     | 187258657  | 6.405 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint32/medium/AVX512-x16-2    | 312999210  | 3.881 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/large/SSE-x4-2         | 33298366   | 36.02 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/large/AVX2-x8-2        | 62409421   | 19.23 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/large/AVX512-x4-2      | 33298366   | 36.02 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/large/AVX512-x8-2      | 62409421   | 19.23 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint32/large/AVX512-x16-2     | 100000000  | 10.10 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/xlarge/SSE-x4-2        | 7948898    | 143.6 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/xlarge/AVX2-x8-2       | 17021738   | 70.49 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/xlarge/AVX512-x4-2     | 7948898    | 143.6 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/xlarge/AVX512-x8-2     | 17021738   | 70.49 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint32/xlarge/AVX512-x16-2    | 28742320   | 41.77 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/massive/SSE-x4-2       | 1595774    | 751.1 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint32/massive/AVX2-x8-2      | 3094242    | 381.1 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/massive/AVX512-x4-2    | 1595774    | 751.1 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint32/massive/AVX512-x8-2    | 3094242    | 381.1 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint32/massive/AVX512-x16-2   | 5080051    | 238.3 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/tiny/SSE-x2-2          | 374760351  | 3.203 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/tiny/AVX2-x4-2         | 498763054  | 2.419 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/tiny/AVX512-x2-2       | 374760351  | 3.203 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/tiny/AVX512-x4-2       | 498763054  | 2.419 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint64/tiny/AVX512-x8-2       | 319635274  | 3.582 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/small/SSE-x2-2         | 187032452  | 6.447 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/small/AVX2-x4-2        | 299546244  | 4.009 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/small/AVX512-x2-2      | 187032452  | 6.447 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/small/AVX512-x4-2      | 299546244  | 4.009 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint64/small/AVX512-x8-2      | 373937659  | 3.207 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/medium/SSE-x2-2        | 62413118   | 19.23 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/medium/AVX2-x4-2       | 113978791  | 10.42 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/medium/AVX512-x2-2     | 62413118   | 19.23 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/medium/AVX512-x4-2     | 113978791  | 10.42 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint64/medium/AVX512-x8-2     | 186965330  | 6.484 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/large/SSE-x2-2         | 17005768   | 70.57 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/large/AVX2-x4-2        | 33286495   | 36.69 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/large/AVX512-x2-2      | 17005768   | 70.57 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/large/AVX512-x4-2      | 33286495   | 36.69 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint64/large/AVX512-x8-2      | 61486065   | 19.93 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/xlarge/SSE-x2-2        | 4154370    | 280.8 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/xlarge/AVX2-x4-2       | 8371358    | 148.2 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/xlarge/AVX512-x2-2     | 4154370    | 280.8 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/xlarge/AVX512-x4-2     | 8371358    | 148.2 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint64/xlarge/AVX512-x8-2     | 14193795   | 72.36 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/massive/SSE-x2-2       | 1773937    | 676.4 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsUint64/massive/AVX2-x4-2      | 3500168    | 343.0 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/massive/AVX512-x2-2    | 1773937    | 676.4 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsUint64/massive/AVX512-x4-2    | 3500168    | 343.0 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsUint64/massive/AVX512-x8-2    | 7097266    | 249.3 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/tiny/SSE-x4-2         | 410522160  | 2.675 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/tiny/AVX2-x8-2        | 308565882  | 3.814 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/tiny/AVX512-x4-2      | 410522160  | 2.675 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/tiny/AVX512-x8-2      | 308565882  | 3.814 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat32/tiny/AVX512-x16-2     | 315331897  | 3.755 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/small/SSE-x4-2        | 278219434  | 4.642 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/small/AVX2-x8-2       | 362945481  | 3.287 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/small/AVX512-x4-2     | 278219434  | 4.642 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/small/AVX512-x8-2     | 362945481  | 3.287 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat32/small/AVX512-x16-2    | 408523153  | 2.941 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/medium/SSE-x4-2       | 100000000  | 10.77 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/medium/AVX2-x8-2      | 186186376  | 6.409 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/medium/AVX512-x4-2    | 100000000  | 10.77 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/medium/AVX512-x8-2    | 186186376  | 6.409 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat32/medium/AVX512-x16-2   | 264255108  | 4.619 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/large/SSE-x4-2        | 33028701   | 36.27 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/large/AVX2-x8-2       | 62465360   | 19.53 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/large/AVX512-x4-2     | 33028701   | 36.27 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/large/AVX512-x8-2     | 62465360   | 19.53 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat32/large/AVX512-x16-2    | 108213310  | 10.95 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/xlarge/SSE-x4-2       | 8359381    | 143.6 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/xlarge/AVX2-x8-2      | 17042701   | 70.46 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/xlarge/AVX512-x4-2    | 8359381    | 143.6 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/xlarge/AVX512-x8-2    | 17042701   | 70.46 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat32/xlarge/AVX512-x16-2   | 31806921   | 37.13 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/massive/SSE-x4-2      | 1000000    | 1100 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat32/massive/AVX2-x8-2     | 2164672    | 554.4 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/massive/AVX512-x4-2   | 1000000    | 1100 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat32/massive/AVX512-x8-2   | 2164672    | 554.4 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat32/massive/AVX512-x16-2  | 4201453    | 293.9 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/tiny/SSE-x2-2         | 362183925  | 3.223 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/tiny/AVX2-x4-2        | 449021466  | 2.687 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/tiny/AVX512-x2-2      | 362183925  | 3.223 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/tiny/AVX512-x4-2      | 449021466  | 2.687 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat64/tiny/AVX512-x8-2      | 320176149  | 3.820 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/small/SSE-x2-2        | 187139116  | 6.415 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/small/AVX2-x4-2       | 280722585  | 4.300 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/small/AVX512-x2-2     | 187139116  | 6.415 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/small/AVX512-x4-2     | 280722585  | 4.300 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat64/small/AVX512-x8-2     | 335670502  | 3.472 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/medium/SSE-x2-2       | 62343927   | 19.23 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/medium/AVX2-x4-2      | 112332902  | 10.69 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/medium/AVX512-x2-2    | 62343927   | 19.23 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/medium/AVX512-x4-2    | 112332902  | 10.69 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat64/medium/AVX512-x8-2    | 179610780  | 6.741 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/large/SSE-x2-2        | 16996959   | 70.51 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/large/AVX2-x4-2       | 33017950   | 36.29 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/large/AVX512-x2-2     | 16996959   | 70.51 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/large/AVX512-x4-2     | 33017950   | 36.29 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat64/large/AVX512-x8-2     | 60322328   | 19.73 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/xlarge/SSE-x2-2       | 4141281    | 282.9 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/xlarge/AVX2-x4-2      | 7856590    | 145.0 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/xlarge/AVX512-x2-2    | 4141281    | 282.9 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/xlarge/AVX512-x4-2    | 7856590    | 145.0 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat64/xlarge/AVX512-x8-2    | 16623739   | 72.06 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/massive/SSE-x2-2      | 541202     | 2195 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsFloat64/massive/AVX2-x4-2     | 1000000    | 1158 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/massive/AVX512-x2-2   | 541202     | 2195 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsFloat64/massive/AVX512-x4-2   | 1000000    | 1158 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsFloat64/massive/AVX512-x8-2   | 2115301    | 560.4 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsWorstCase/SSE-x4-2            | 7651734    | 145.6 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsWorstCase/AVX2-x8-2           | 14921599   | 70.49 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsWorstCase/AVX512-x4-2         | 7651734    | 145.6 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsWorstCase/AVX512-x8-2         | 14921599   | 70.49 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsWorstCase/AVX512-x16-2        | 28708478   | 41.38 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsBestCase/SSE-x4-2             | 534237578  | 2.136 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsBestCase/AVX2-x8-2            | 561252645  | 2.159 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsBestCase/AVX512-x4-2          | 534237578  | 2.136 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsBestCase/AVX512-x8-2          | 561252645  | 2.159 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsBestCase/AVX512-x16-2         | 560396454  | 2.137 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/tiny/SSE-x4-2        | 499649139  | 2.401 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/tiny/AVX2-x8-2       | 329743240  | 3.421 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/tiny/AVX512-x4-2     | 499649139  | 2.401 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/tiny/AVX512-x8-2     | 329743240  | 3.421 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsNegative/tiny/AVX512-x16-2    | 280516392  | 4.276 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/small/SSE-x4-2       | 299373171  | 4.006 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/small/AVX2-x8-2      | 374407988  | 3.267 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/small/AVX512-x4-2    | 299373171  | 4.006 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/small/AVX512-x8-2    | 374407988  | 3.267 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsNegative/small/AVX512-x16-2   | 486948346  | 2.424 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/medium/SSE-x4-2      | 100000000  | 10.41 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/medium/AVX2-x8-2     | 182899621  | 6.412 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/medium/AVX512-x4-2   | 100000000  | 10.41 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/medium/AVX512-x8-2   | 182899621  | 6.412 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsNegative/medium/AVX512-x16-2  | 311969776  | 3.829 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/large/SSE-x4-2       | 33309816   | 36.04 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/large/AVX2-x8-2      | 59912676   | 19.74 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/large/AVX512-x4-2    | 33309816   | 36.04 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/large/AVX512-x8-2    | 59912676   | 19.74 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsNegative/large/AVX512-x16-2   | 100000000  | 10.65 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/xlarge/SSE-x4-2      | 8346818    | 143.7 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/xlarge/AVX2-x8-2     | 16980399   | 70.54 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/xlarge/AVX512-x4-2   | 8346818    | 143.7 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/xlarge/AVX512-x8-2   | 16980399   | 70.54 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsNegative/xlarge/AVX512-x16-2  | 28676455   | 42.94 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/massive/SSE-x4-2     | 1000000    | 1151 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsNegative/massive/AVX2-x8-2    | 2161594    | 555.2 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/massive/AVX512-x4-2  | 1000000    | 1151 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsNegative/massive/AVX512-x8-2  | 2161594    | 555.2 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsNegative/massive/AVX512-x16-2 | 3549094    | 350.5 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8ByWidth/SSE-x16-2         | 331533141  | 3.222 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt8ByWidth/AVX2-x32-2        | 408741681  | 3.193 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8ByWidth/AVX512-x16-2      | 331533141  | 3.222 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt8ByWidth/AVX512-x32-2      | 408741681  | 3.193 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt8ByWidth/AVX512-x64-2      | 365382873  | 3.241 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64SteadyState/SSE-x2-2     | 5722603    | 211.5 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkContainsInt64SteadyState/AVX2-x4-2    | 11711869   | 103.1 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64SteadyState/AVX512-x2-2  | 5722603    | 211.5 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkContainsInt64SteadyState/AVX512-x4-2  | 11711869   | 103.1 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkContainsInt64SteadyState/AVX512-x8-2  | 19671033   | 61.36 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/small/Fallback-lo-2           | 248740710  | 5.218 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt8/small/SSE-x16-2               | 126181464  | 9.485 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt8/small/AVX-x16-2               | 126181464  | 9.485 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/small/AVX2-x32-2              | 73059427   | 14.44 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/small/AVX512-x64-2            | 49913169   | 24.41 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/medium/Fallback-lo-2          | 17278075   | 69.96 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt8/medium/SSE-x16-2              | 100000000  | 10.58 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt8/medium/AVX-x16-2              | 100000000  | 10.58 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/medium/AVX2-x32-2             | 91620999   | 13.10 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/medium/AVX512-x64-2           | 54082130   | 22.20 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/large/Fallback-lo-2           | 2006178    | 576.3 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt8/large/SSE-x16-2               | 41836690   | 27.82 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt8/large/AVX-x16-2               | 41836690   | 27.82 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/large/AVX2-x32-2              | 51735399   | 23.04 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/large/AVX512-x64-2            | 40861586   | 29.40 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/xlarge/Fallback-lo-2          | 273898     | 4383 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt8/xlarge/SSE-x16-2              | 6928408    | 173.1 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt8/xlarge/AVX-x16-2              | 6928408    | 173.1 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/xlarge/AVX2-x32-2             | 12639586   | 94.09 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8/xlarge/AVX512-x64-2           | 13509693   | 89.67 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/small/Fallback-lo-2          | 249444103  | 5.012 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt16/small/SSE-x8-2               | 244927230  | 5.052 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt16/small/AVX-x8-2               | 244927230  | 5.052 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/small/AVX2-x16-2             | 122088517  | 9.715 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/small/AVX512-x32-2           | 54098370   | 22.00 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/medium/Fallback-lo-2         | 15782683   | 72.54 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt16/medium/SSE-x8-2              | 100000000  | 10.51 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt16/medium/AVX-x8-2              | 100000000  | 10.51 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/medium/AVX2-x16-2            | 100000000  | 10.75 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/medium/AVX512-x32-2          | 56147455   | 21.38 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/large/Fallback-lo-2          | 2173214    | 598.1 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt16/large/SSE-x8-2               | 26319481   | 44.73 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt16/large/AVX-x8-2               | 26319481   | 44.73 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/large/AVX2-x16-2             | 40459519   | 27.91 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/large/AVX512-x32-2           | 39359752   | 31.28 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/xlarge/Fallback-lo-2         | 273932     | 4382 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt16/xlarge/SSE-x8-2              | 3557265    | 331.2 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt16/xlarge/AVX-x8-2              | 3557265    | 331.2 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/xlarge/AVX2-x16-2            | 6930166    | 173.4 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt16/xlarge/AVX512-x32-2          | 12100244   | 97.01 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/small/Fallback-lo-2          | 249566539  | 4.808 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt32/small/SSE-x4-2               | 259250019  | 4.581 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt32/small/AVX-x4-2               | 259250019  | 4.581 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/small/AVX2-x8-2              | 232858933  | 5.404 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/small/AVX512-x16-2           | 100000000  | 11.18 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/medium/Fallback-lo-2         | 17274441   | 72.28 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt32/medium/SSE-x4-2              | 58400258   | 20.56 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt32/medium/AVX-x4-2              | 58400258   | 20.56 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/medium/AVX2-x8-2             | 110851756  | 10.67 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/medium/AVX512-x16-2          | 106593603  | 11.25 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/large/Fallback-lo-2          | 2171817    | 551.8 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt32/large/SSE-x4-2               | 8270253    | 146.0 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt32/large/AVX-x4-2               | 8270253    | 146.0 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/large/AVX2-x8-2              | 22234518   | 46.06 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/large/AVX512-x16-2           | 37448763   | 32.31 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/xlarge/Fallback-lo-2         | 273699     | 4559 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt32/xlarge/SSE-x4-2              | 1000000    | 1102 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt32/xlarge/AVX-x4-2              | 1000000    | 1102 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/xlarge/AVX2-x8-2             | 3586887    | 332.4 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt32/xlarge/AVX512-x16-2          | 7214437    | 170.5 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/small/Fallback-lo-2          | 417473124  | 2.886 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt64/small/SSE-x2-2               | 287521756  | 4.169 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt64/small/AVX-x2-2               | 287521756  | 4.169 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/small/AVX2-x4-2              | 277783513  | 4.311 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/small/AVX512-x8-2            | 172823103  | 6.993 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/medium/Fallback-lo-2         | 34022653   | 35.27 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt64/medium/SSE-x2-2              | 49241248   | 24.05 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt64/medium/AVX-x2-2              | 49241248   | 24.05 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/medium/AVX2-x4-2             | 78897342   | 14.58 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/medium/AVX512-x8-2           | 84361297   | 14.03 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/large/Fallback-lo-2          | 3680988    | 282.3 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt64/large/SSE-x2-2               | 6293607    | 170.7 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt64/large/AVX-x2-2               | 6293607    | 170.7 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/large/AVX2-x4-2              | 12739849   | 91.28 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/large/AVX512-x8-2            | 25508130   | 46.30 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/xlarge/Fallback-lo-2         | 546321     | 2283 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt64/xlarge/SSE-x2-2              | 877434     | 1289 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt64/xlarge/AVX-x2-2              | 877434     | 1289 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/xlarge/AVX2-x4-2             | 1845892    | 650.4 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64/xlarge/AVX512-x8-2           | 2148355    | 550.8 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/small/Fallback-lo-2        | 411100770  | 2.951 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumFloat32/small/SSE-x4-2             | 264013596  | 4.572 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumFloat32/small/AVX-x4-2             | 264013596  | 4.572 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/small/AVX2-x8-2            | 174478266  | 6.911 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/small/AVX512-x16-2         | 61182673   | 19.78 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/medium/Fallback-lo-2       | 33815070   | 35.68 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumFloat32/medium/SSE-x4-2            | 58238188   | 20.66 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumFloat32/medium/AVX-x4-2            | 58238188   | 20.66 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/medium/AVX2-x8-2           | 91316544   | 13.26 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/medium/AVX512-x16-2        | 80046624   | 15.08 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/large/Fallback-lo-2        | 4304168    | 278.7 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumFloat32/large/SSE-x4-2             | 6198957    | 184.8 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumFloat32/large/AVX-x4-2             | 6198957    | 184.8 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/large/AVX2-x8-2            | 12260169   | 86.60 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/large/AVX512-x16-2         | 22147112   | 45.34 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/xlarge/Fallback-lo-2       | 546901     | 2193 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkSumFloat32/xlarge/SSE-x4-2            | 736503     | 1622 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkSumFloat32/xlarge/AVX-x4-2            | 736503     | 1622 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/xlarge/AVX2-x8-2           | 1493887    | 810.5 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat32/xlarge/AVX512-x16-2        | 2959298    | 393.4 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/small/Fallback-lo-2        | 410778070  | 3.043 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumFloat64/small/SSE-x2-2             | 254156008  | 4.714 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumFloat64/small/AVX-x2-2             | 254156008  | 4.714 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/small/AVX2-x4-2            | 227604434  | 5.323 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/small/AVX512-x8-2          | 170099748  | 7.115 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/medium/Fallback-lo-2       | 33646345   | 35.78 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumFloat64/medium/SSE-x2-2            | 32931152   | 34.92 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumFloat64/medium/AVX-x2-2            | 32931152   | 34.92 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/medium/AVX2-x4-2           | 75389446   | 16.79 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/medium/AVX512-x8-2         | 89826181   | 13.33 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/large/Fallback-lo-2        | 4293837    | 302.8 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkSumFloat64/large/SSE-x2-2             | 3146601    | 381.4 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumFloat64/large/AVX-x2-2             | 3146601    | 381.4 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/large/AVX2-x4-2            | 6373876    | 184.3 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/large/AVX512-x8-2          | 13464712   | 88.96 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/xlarge/Fallback-lo-2       | 545764     | 2193 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkSumFloat64/xlarge/SSE-x2-2            | 368846     | 3390 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkSumFloat64/xlarge/AVX-x2-2            | 368846     | 3390 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/xlarge/AVX2-x4-2           | 709940     | 1613 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkSumFloat64/xlarge/AVX512-x8-2         | 1480214    | 808.6 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/small/Fallback-lo-2         | 411529147  | 3.043 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMeanInt32/small/SSE-x4-2              | 204428401  | 5.872 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMeanInt32/small/AVX-x4-2              | 204428401  | 5.872 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/small/AVX2-x8-2             | 187573928  | 6.214 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/small/AVX512-x16-2          | 98346700   | 12.12 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/medium/Fallback-lo-2        | 33481442   | 35.72 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMeanInt32/medium/SSE-x4-2             | 52042394   | 22.12 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMeanInt32/medium/AVX-x4-2             | 52042394   | 22.12 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/medium/AVX2-x8-2            | 96288541   | 13.44 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/medium/AVX512-x16-2         | 100995780  | 11.90 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/large/Fallback-lo-2         | 4296570    | 289.9 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMeanInt32/large/SSE-x4-2              | 7743022    | 146.4 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMeanInt32/large/AVX-x4-2              | 7743022    | 146.4 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/large/AVX2-x8-2             | 24355988   | 46.26 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/large/AVX512-x16-2          | 37322655   | 32.89 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/xlarge/Fallback-lo-2        | 547008     | 2193 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkMeanInt32/xlarge/SSE-x4-2             | 1087246    | 1112 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkMeanInt32/xlarge/AVX-x4-2             | 1087246    | 1112 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/xlarge/AVX2-x8-2            | 1386868    | 761.9 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanInt32/xlarge/AVX512-x16-2         | 7166142    | 170.7 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/small/Fallback-lo-2       | 349760005  | 3.449 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMeanFloat64/small/SSE-x2-2            | 189674538  | 6.293 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMeanFloat64/small/AVX-x2-2            | 189674538  | 6.293 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/small/AVX2-x4-2           | 159228600  | 7.531 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/small/AVX512-x8-2         | 110196433  | 10.89 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/medium/Fallback-lo-2      | 32968618   | 36.17 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMeanFloat64/medium/SSE-x2-2           | 30863817   | 37.69 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMeanFloat64/medium/AVX-x2-2           | 30863817   | 37.69 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/medium/AVX2-x4-2          | 62428772   | 19.66 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/medium/AVX512-x8-2        | 77140984   | 15.54 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/large/Fallback-lo-2       | 4281057    | 280.6 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMeanFloat64/large/SSE-x2-2            | 3057349    | 389.4 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMeanFloat64/large/AVX-x2-2            | 3057349    | 389.4 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/large/AVX2-x4-2           | 6509438    | 185.9 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/large/AVX512-x8-2         | 12668032   | 93.50 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/xlarge/Fallback-lo-2      | 545898     | 2288 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkMeanFloat64/xlarge/SSE-x2-2           | 367671     | 4048 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkMeanFloat64/xlarge/AVX-x2-2           | 367671     | 4048 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/xlarge/AVX2-x4-2          | 739941     | 1621 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkMeanFloat64/xlarge/AVX512-x8-2        | 1434867    | 811.3 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMinInt32/small/SSE-x4-2               | 312338268  | 3.860 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMinInt32/small/AVX-x4-2               | 312338268  | 3.860 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinInt32/small/AVX2-x8-2              | 238034872  | 5.042 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinInt32/small/AVX512-x16-2           | 152600943  | 6.661 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMinInt32/medium/SSE-x4-2              | 61051266   | 19.73 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMinInt32/medium/AVX-x4-2              | 61051266   | 19.73 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinInt32/medium/AVX2-x8-2             | 91792144   | 13.11 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinInt32/medium/AVX512-x16-2          | 99994540   | 12.18 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMinInt32/large/SSE-x4-2               | 8604774    | 140.5 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMinInt32/large/AVX-x4-2               | 8604774    | 140.5 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinInt32/large/AVX2-x8-2              | 15581037   | 77.56 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinInt32/large/AVX512-x16-2           | 30512421   | 40.24 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMinInt32/xlarge/SSE-x4-2              | 1000000    | 1110 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkMinInt32/xlarge/AVX-x4-2              | 1000000    | 1110 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkMinInt32/xlarge/AVX2-x8-2             | 2158272    | 557.2 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinInt32/xlarge/AVX512-x16-2          | 4253668    | 282.6 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMinFloat64/small/SSE-x2-2             | 264129410  | 4.544 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMinFloat64/small/AVX-x2-2             | 264129410  | 4.544 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinFloat64/small/AVX2-x4-2            | 299587609  | 4.008 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinFloat64/small/AVX512-x8-2          | 100000000  | 10.05 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMinFloat64/medium/SSE-x2-2            | 32778514   | 36.93 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMinFloat64/medium/AVX-x2-2            | 32778514   | 36.93 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinFloat64/medium/AVX2-x4-2           | 53356347   | 20.30 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinFloat64/medium/AVX512-x8-2         | 74832976   | 16.21 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMinFloat64/large/SSE-x2-2             | 3863326    | 300.0 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMinFloat64/large/AVX-x2-2             | 3863326    | 300.0 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinFloat64/large/AVX2-x4-2            | 7670576    | 146.5 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMinFloat64/large/AVX512-x8-2          | 14017984   | 78.21 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMinFloat64/xlarge/SSE-x2-2            | 492739     | 2195 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkMinFloat64/xlarge/AVX-x2-2            | 492739     | 2195 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkMinFloat64/xlarge/AVX2-x4-2           | 1000000    | 1103 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkMinFloat64/xlarge/AVX512-x8-2         | 2145290    | 560.3 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMaxInt32/small/SSE-x4-2               | 306585705  | 3.860 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMaxInt32/small/AVX-x4-2               | 306585705  | 3.860 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxInt32/small/AVX2-x8-2              | 237347997  | 5.086 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxInt32/small/AVX512-x16-2           | 201433966  | 6.130 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMaxInt32/medium/SSE-x4-2              | 60759631   | 19.92 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMaxInt32/medium/AVX-x4-2              | 60759631   | 19.92 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxInt32/medium/AVX2-x8-2             | 90934662   | 13.13 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxInt32/medium/AVX512-x16-2          | 98517944   | 12.18 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMaxInt32/large/SSE-x4-2               | 8590542    | 139.6 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMaxInt32/large/AVX-x4-2               | 8590542    | 139.6 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxInt32/large/AVX2-x8-2              | 15770372   | 77.69 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxInt32/large/AVX512-x16-2           | 30197324   | 39.32 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMaxInt32/xlarge/SSE-x4-2              | 1000000    | 1104 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkMaxInt32/xlarge/AVX-x4-2              | 1000000    | 1104 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxInt32/xlarge/AVX2-x8-2             | 2152038    | 562.1 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxInt32/xlarge/AVX512-x16-2          | 3917990    | 296.7 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMaxFloat64/small/SSE-x2-2             | 249617162  | 4.816 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMaxFloat64/small/AVX-x2-2             | 249617162  | 4.816 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxFloat64/small/AVX2-x4-2            | 207017514  | 5.855 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxFloat64/small/AVX512-x8-2          | 66520290   | 17.74 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMaxFloat64/medium/SSE-x2-2            | 32307492   | 36.92 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMaxFloat64/medium/AVX-x2-2            | 32307492   | 36.92 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxFloat64/medium/AVX2-x4-2           | 57306838   | 20.77 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxFloat64/medium/AVX512-x8-2         | 56911946   | 21.12 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMaxFloat64/large/SSE-x2-2             | 4259366    | 287.1 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkMaxFloat64/large/AVX-x2-2             | 4259366    | 287.1 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxFloat64/large/AVX2-x4-2            | 7905420    | 148.9 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxFloat64/large/AVX512-x8-2          | 14100686   | 83.43 ns/op | 0 B/op   | 0 allocs/op |
-| BenchmarkMaxFloat64/xlarge/SSE-x2-2            | 545378     | 2243 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkMaxFloat64/xlarge/AVX-x2-2            | 545378     | 2243 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxFloat64/xlarge/AVX2-x4-2           | 1000000    | 1113 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkMaxFloat64/xlarge/AVX512-x8-2         | 2119741    | 565.7 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8ByWidth/Fallback-lo-2          | 896775     | 1335 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt8ByWidth/SSE-x16-2              | 12557700   | 94.52 ns/op | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt8ByWidth/AVX-x16-2              | 12557700   | 94.52 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8ByWidth/AVX2-x32-2             | 18702537   | 55.03 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt8ByWidth/AVX512-x64-2           | 21342572   | 56.10 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64SteadyState/Fallback-lo-2     | 513738     | 2195 ns/op  | 0 B/op   | 0 allocs/op |
-| BenchmarkSumInt64SteadyState/SSE-x2-2          | 928376     | 1296 ns/op  | 0 B/op   | 0 allocs/op |
+| BenchmarkSumInt64SteadyState/AVX-x2-2          | 928376     | 1296 ns/op  | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64SteadyState/AVX2-x4-2         | 1836968    | 888.1 ns/op | 0 B/op   | 0 allocs/op |
 | BenchmarkSumInt64SteadyState/AVX512-x8-2       | 2141715    | 551.3 ns/op | 0 B/op   | 0 allocs/op |
@@ -12,7 +12,7 @@ If you see **SIGILL: illegal instruction** when running tests, the CPU or VM doe

 ```bash
 # List SIMD-related flags
-grep -E 'avx|sse' /proc/cpuinfo
+grep -E 'avx' /proc/cpuinfo

 # Or with lscpu
 lscpu | grep -i avx
@@ -22,21 +22,22 @@ lscpu | grep -i avx

 | Tests / code      | Required flag(s)           | Typical CPUs                                                            |
 | ----------------- | -------------------------- | ----------------------------------------------------------------------- |
-| SSE (128-bit)     | `sse2` (baseline on amd64) | All amd64                                                               |
+| AVX (128-bit)     | `avx`                      | Intel Sandy Bridge+, AMD Bulldozer+                                     |
 | AVX2 (256-bit)    | `avx2`                     | Intel Haswell+, AMD Excavator+                                          |
 | AVX-512 (512-bit) | `avx512f`                  | Intel Skylake-X+, some Xeons; many AMD/consumer CPUs do **not** have it |

 ### What the tests do

+- **AVX tests** (128-bit) call `requireAVX(t)` and are **skipped** if the CPU does not support AVX.
 - **AVX2 tests** call `requireAVX2(t)` and are **skipped** if the CPU does not support AVX2 (no SIGILL).
 - **AVX-512 tests** (when enabled) should call `requireAVX512(t)` and skip when AVX-512 is not available.

 So on a machine without AVX2, AVX2 tests will show as skipped instead of crashing.

-### Run only SSE tests
+### Run only AVX tests

-If your environment does not support AVX2/AVX-512, you can still run the SSE tests:
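+If your environment does not support AVX2/AVX-512, you can still run the AVX (128-bit) tests. Note that `-run AVX` is an unanchored regular expression, so it also selects any tests whose names contain `AVX2` or `AVX512`; those are skipped when the CPU lacks the feature: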

 ```bash
-GOEXPERIMENT=simd go test -run SSE ./...
+GOEXPERIMENT=simd go test -run AVX ./...
 ```
@@ -19,16 +19,25 @@ type skipHelper interface {

 // How to check if your Linux CPU supports SIMD (avoids SIGILL):
 //
-//   grep -E 'avx|sse' /proc/cpuinfo
+//   grep -E 'avx' /proc/cpuinfo
 //
 // Or:  lscpu | grep -i avx
 //
 // You need:
-//   - SSE tests (128-bit):  sse2 (baseline on amd64), sse4.1/sse4.2 often used
+//   - AVX tests (128-bit):  avx   in flags (not baseline on amd64; Intel Sandy Bridge+, AMD Bulldozer+)
 //   - AVX2 tests (256-bit):  avx2  in flags
 //   - AVX-512 tests:        avx512f (and often avx512bw, avx512vl)
 //
-// If your CPU lacks AVX2 or AVX-512, tests that use them will be skipped automatically.
+// If your CPU lacks AVX, AVX2, or AVX-512, tests that use them will be skipped automatically.
+
+// requireAVX skips the test/benchmark if the CPU does not support AVX (128-bit SIMD).
+// Use at the start of each AVX test/benchmark to avoid SIGILL on older or non-x86 systems.
+func requireAVX(t skipHelper) {
+	t.Helper()
+	if !archsimd.X86.AVX() {
+		t.Skipf("CPU does not support AVX; skipping. Check compatibility: grep avx /proc/cpuinfo")
+	}
+}

 // requireAVX2 skips the test/benchmark if the CPU does not support AVX2 (256-bit SIMD).
 // Use at the start of each AVX2 test/benchmark to avoid SIGILL on older or non-x86 systems.
@@ -6,7 +6,7 @@ import (
 	"simd/archsimd"
 )

-// ContainsInt8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsInt8x16 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsInt8x16[T ~int8](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -40,7 +40,7 @@ func ContainsInt8x16[T ~int8](collection []T, target T) bool {
 	return false
 }

-// ContainsInt16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsInt16x8 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsInt16x8[T ~int16](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -72,7 +72,7 @@ func ContainsInt16x8[T ~int16](collection []T, target T) bool {
 	return false
 }

-// ContainsInt32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsInt32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsInt32x4[T ~int32](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -104,7 +104,7 @@ func ContainsInt32x4[T ~int32](collection []T, target T) bool {
 	return false
 }

-// ContainsInt64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsInt64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsInt64x2[T ~int64](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -136,7 +136,7 @@ func ContainsInt64x2[T ~int64](collection []T, target T) bool {
 	return false
 }

-// ContainsUint8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsUint8x16 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsUint8x16[T ~uint8](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -168,7 +168,7 @@ func ContainsUint8x16[T ~uint8](collection []T, target T) bool {
 	return false
 }

-// ContainsUint16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsUint16x8 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsUint16x8[T ~uint16](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -200,7 +200,7 @@ func ContainsUint16x8[T ~uint16](collection []T, target T) bool {
 	return false
 }

-// ContainsUint32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsUint32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsUint32x4[T ~uint32](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -232,7 +232,7 @@ func ContainsUint32x4[T ~uint32](collection []T, target T) bool {
 	return false
 }

-// ContainsUint64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsUint64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsUint64x2[T ~uint64](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -264,7 +264,7 @@ func ContainsUint64x2[T ~uint64](collection []T, target T) bool {
 	return false
 }

-// ContainsFloat32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsFloat32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsFloat32x4[T ~float32](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -296,7 +296,7 @@ func ContainsFloat32x4[T ~float32](collection []T, target T) bool {
 	return false
 }

-// ContainsFloat64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
+// ContainsFloat64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD
 func ContainsFloat64x2[T ~float64](collection []T, target T) bool {
 	length := uint(len(collection))
 	if length == 0 {
@@ -8,16 +8,16 @@ import (

 // Benchmark suite for SIMD Contains operations compared to core lo package fallbacks.
 // These benchmarks measure the performance of element lookup operations
-// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes.
+// across different SIMD implementations (AVX, AVX2, AVX512) and data sizes.

 // Benchmark sizes for Contains operations
 var containsBenchmarkSizes = []struct {
 	name string
 	size int
 }{
-	{"tiny", 4},       // Smaller than SSE width (16 lanes for int8)
-	{"small", 16},     // Exactly SSE width for int8
-	{"medium", 64},    // Multiple of SSE, between SSE and AVX2 for int8
+	{"tiny", 4},       // Smaller than AVX width (16 lanes for int8)
+	{"small", 16},     // Exactly AVX width for int8
+	{"medium", 64},    // Multiple of AVX and AVX2 widths; exactly AVX512 width for int8 (64 lanes)
 	{"large", 256},    // Multiple of AVX2 (32 lanes for int8)
 	{"xlarge", 1024},  // Multiple of AVX512 (64 lanes for int8)
 	{"massive", 8192}, // Very large dataset
@@ -33,14 +33,14 @@ func BenchmarkContainsInt8(b *testing.B) {
 			data := generateInt8(bs.size)
 			target := int8(42)

-			b.Run("SSE-x16", func(b *testing.B) {
+			b.Run("AVX512-x16", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt8x16 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsInt8x16(data, target)
 				}
 			})
-			b.Run("AVX2-x32", func(b *testing.B) {
+			b.Run("AVX512-x32", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt8x32 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -68,14 +68,14 @@ func BenchmarkContainsInt16(b *testing.B) {
 			data := generateInt16(bs.size)
 			target := int16(42)

-			b.Run("SSE-x8", func(b *testing.B) {
+			b.Run("AVX512-x8", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt16x8 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsInt16x8(data, target)
 				}
 			})
-			b.Run("AVX2-x16", func(b *testing.B) {
+			b.Run("AVX512-x16", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt16x16 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -103,14 +103,14 @@ func BenchmarkContainsInt32(b *testing.B) {
 			data := generateInt32(bs.size)
 			target := int32(42)

-			b.Run("SSE-x4", func(b *testing.B) {
+			b.Run("AVX512-x4", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsInt32x4(data, target)
 				}
 			})
-			b.Run("AVX2-x8", func(b *testing.B) {
+			b.Run("AVX512-x8", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -138,14 +138,14 @@ func BenchmarkContainsInt64(b *testing.B) {
 			data := generateInt64(bs.size)
 			target := int64(42)

-			b.Run("SSE-x2", func(b *testing.B) {
+			b.Run("AVX512-x2", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsInt64x2(data, target)
 				}
 			})
-			b.Run("AVX2-x4", func(b *testing.B) {
+			b.Run("AVX512-x4", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -173,14 +173,14 @@ func BenchmarkContainsUint8(b *testing.B) {
 			data := generateUint8(bs.size)
 			target := uint8(255)

-			b.Run("SSE-x16", func(b *testing.B) {
+			b.Run("AVX512-x16", func(b *testing.B) {
 				requireAVX512(b) // ContainsUint8x16 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsUint8x16(data, target)
 				}
 			})
-			b.Run("AVX2-x32", func(b *testing.B) {
+			b.Run("AVX512-x32", func(b *testing.B) {
 				requireAVX512(b) // ContainsUint8x32 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -208,14 +208,14 @@ func BenchmarkContainsUint16(b *testing.B) {
 			data := generateUint16(bs.size)
 			target := uint16(42)

-			b.Run("SSE-x8", func(b *testing.B) {
+			b.Run("AVX512-x8", func(b *testing.B) {
 				requireAVX512(b) // ContainsUint16x8 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsUint16x8(data, target)
 				}
 			})
-			b.Run("AVX2-x16", func(b *testing.B) {
+			b.Run("AVX512-x16", func(b *testing.B) {
 				requireAVX512(b) // ContainsUint16x16 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -243,14 +243,14 @@ func BenchmarkContainsUint32(b *testing.B) {
 			data := generateUint32(bs.size)
 			target := uint32(42)

-			b.Run("SSE-x4", func(b *testing.B) {
+			b.Run("AVX512-x4", func(b *testing.B) {
 				requireAVX512(b) // ContainsUint32x4 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsUint32x4(data, target)
 				}
 			})
-			b.Run("AVX2-x8", func(b *testing.B) {
+			b.Run("AVX512-x8", func(b *testing.B) {
 				requireAVX512(b) // ContainsUint32x8 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -278,14 +278,14 @@ func BenchmarkContainsUint64(b *testing.B) {
 			data := generateUint64(bs.size)
 			target := uint64(42)

-			b.Run("SSE-x2", func(b *testing.B) {
+			b.Run("AVX512-x2", func(b *testing.B) {
 				requireAVX512(b) // ContainsUint64x2 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsUint64x2(data, target)
 				}
 			})
-			b.Run("AVX2-x4", func(b *testing.B) {
+			b.Run("AVX512-x4", func(b *testing.B) {
 				requireAVX512(b) // ContainsUint64x4 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -313,14 +313,14 @@ func BenchmarkContainsFloat32(b *testing.B) {
 			data := generateFloat32(bs.size)
 			target := float32(42.5)

-			b.Run("SSE-x4", func(b *testing.B) {
+			b.Run("AVX512-x4", func(b *testing.B) {
 				requireAVX512(b) // ContainsFloat32x4 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsFloat32x4(data, target)
 				}
 			})
-			b.Run("AVX2-x8", func(b *testing.B) {
+			b.Run("AVX512-x8", func(b *testing.B) {
 				requireAVX512(b) // ContainsFloat32x8 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -348,14 +348,14 @@ func BenchmarkContainsFloat64(b *testing.B) {
 			data := generateFloat64(bs.size)
 			target := float64(42.5)

-			b.Run("SSE-x2", func(b *testing.B) {
+			b.Run("AVX512-x2", func(b *testing.B) {
 				requireAVX512(b) // ContainsFloat64x2 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsFloat64x2(data, target)
 				}
 			})
-			b.Run("AVX2-x4", func(b *testing.B) {
+			b.Run("AVX512-x4", func(b *testing.B) {
 				requireAVX512(b) // ContainsFloat64x4 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -386,14 +386,14 @@ func BenchmarkContainsWorstCase(b *testing.B) {
 	}
 	target := int32(size - 1) // Target at the very end

-	b.Run("SSE-x4", func(b *testing.B) {
+	b.Run("AVX512-x4", func(b *testing.B) {
 		requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
 		b.ReportAllocs()
 		for i := 0; i < b.N; i++ {
 			_ = ContainsInt32x4(data, target)
 		}
 	})
-	b.Run("AVX2-x8", func(b *testing.B) {
+	b.Run("AVX512-x8", func(b *testing.B) {
 		requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
 		b.ReportAllocs()
 		for i := 0; i < b.N; i++ {
@@ -422,14 +422,14 @@ func BenchmarkContainsBestCase(b *testing.B) {
 	}
 	target := int32(0) // Target at the very beginning

-	b.Run("SSE-x4", func(b *testing.B) {
+	b.Run("AVX512-x4", func(b *testing.B) {
 		requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
 		b.ReportAllocs()
 		for i := 0; i < b.N; i++ {
 			_ = ContainsInt32x4(data, target)
 		}
 	})
-	b.Run("AVX2-x8", func(b *testing.B) {
+	b.Run("AVX512-x8", func(b *testing.B) {
 		requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
 		b.ReportAllocs()
 		for i := 0; i < b.N; i++ {
@@ -456,14 +456,14 @@ func BenchmarkContainsNegative(b *testing.B) {
 			data := generateInt32(bs.size)
 			target := int32(999999) // Target that's unlikely to be in the data

-			b.Run("SSE-x4", func(b *testing.B) {
+			b.Run("AVX512-x4", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = ContainsInt32x4(data, target)
 				}
 			})
-			b.Run("AVX2-x8", func(b *testing.B) {
+			b.Run("AVX512-x8", func(b *testing.B) {
 				requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
@@ -497,8 +497,8 @@ func BenchmarkContainsInt8ByWidth(b *testing.B) {
 		name string
 		fn   func() bool
 	}{
-		{"SSE-x16", func() bool { return ContainsInt8x16(data, target) }},
-		{"AVX2-x32", func() bool { return ContainsInt8x32(data, target) }},
+		{"AVX512-x16", func() bool { return ContainsInt8x16(data, target) }},
+		{"AVX512-x32", func() bool { return ContainsInt8x32(data, target) }},
 		{"AVX512-x64", func() bool { return ContainsInt8x64(data, target) }},
 	}

@@ -533,14 +533,14 @@ func BenchmarkContainsInt64SteadyState(b *testing.B) {

 	b.ResetTimer() // Reset timer to exclude warmup

-	b.Run("SSE-x2", func(b *testing.B) {
+	b.Run("AVX512-x2", func(b *testing.B) {
 		requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512
 		b.ReportAllocs()
 		for i := 0; i < b.N; i++ {
 			_ = ContainsInt64x2(data, target)
 		}
 	})
-	b.Run("AVX2-x4", func(b *testing.B) {
+	b.Run("AVX512-x4", func(b *testing.B) {
 		requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512
 		b.ReportAllocs()
 		for i := 0; i < b.N; i++ {
@@ -364,7 +364,8 @@ func MinInt64[T ~int64](collection []T) T {
 	case simdFeatureAVX2:
 		return MinInt64x4(collection)
 	case simdFeatureAVX:
-		return MinInt64x2(collection)
+		// MinInt64x2 requires AVX-512 (archsimd Int64x2.Min); use scalar fallback
+		fallthrough
 	default:
 		return lo.Min(collection)
 	}
@@ -420,7 +421,8 @@ func MinUint64[T ~uint64](collection []T) T {
 	case simdFeatureAVX2:
 		return MinUint64x4(collection)
 	case simdFeatureAVX:
-		return MinUint64x2(collection)
+		// MinUint64x2 requires AVX-512; use scalar fallback
+		fallthrough
 	default:
 		return lo.Min(collection)
 	}
@@ -504,7 +506,8 @@ func MaxInt64[T ~int64](collection []T) T {
 	case simdFeatureAVX2:
 		return MaxInt64x4(collection)
 	case simdFeatureAVX:
-		return MaxInt64x2(collection)
+		// MaxInt64x2 requires AVX-512; use scalar fallback
+		fallthrough
 	default:
 		return lo.Max(collection)
 	}
@@ -560,7 +563,8 @@ func MaxUint64[T ~uint64](collection []T) T {
 	case simdFeatureAVX2:
 		return MaxUint64x4(collection)
 	case simdFeatureAVX:
-		return MaxUint64x2(collection)
+		// MaxUint64x2 requires AVX-512; use scalar fallback
+		fallthrough
 	default:
 		return lo.Max(collection)
 	}
@@ -674,7 +678,8 @@ func ClampInt64[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
 	case simdFeatureAVX2:
 		return ClampInt64x4(collection, min, max)
 	case simdFeatureAVX:
-		return ClampInt64x2(collection, min, max)
+		// ClampInt64x2 requires AVX-512; use scalar fallback
+		fallthrough
 	default:
 		result := make(Slice, len(collection))
 		for i, v := range collection {
@@ -770,7 +775,8 @@ func ClampUint64[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice {
 	case simdFeatureAVX2:
 		return ClampUint64x4(collection, min, max)
 	case simdFeatureAVX:
-		return ClampUint64x2(collection, min, max)
+		// ClampUint64x2 requires AVX-512; use scalar fallback
+		fallthrough
 	default:
 		result := make(Slice, len(collection))
 		for i, v := range collection {
@@ -9,9 +9,9 @@ import (
 	"github.com/samber/lo"
 )

-// SSE (128-bit) SIMD sum functions - 16/8/4/2 lanes
+// AVX (128-bit) SIMD sum functions - 16/8/4/2 lanes

-// SumInt8x16 sums a slice of int8 using SSE SIMD (Int8x16, 16 lanes).
+// SumInt8x16 sums a slice of int8 using AVX SIMD (Int8x16, 16 lanes).
 // Overflow: The accumulation is performed using int8, which can overflow for large collections.
 // If the sum exceeds the int8 range (-128 to 127), the result will wrap around silently.
 // For collections that may overflow, consider using a wider type or handle overflow detection externally.
@@ -45,7 +45,7 @@ func SumInt8x16[T ~int8](collection []T) T {
 	return sum
 }

-// SumInt16x8 sums a slice of int16 using SSE SIMD (Int16x8, 8 lanes).
+// SumInt16x8 sums a slice of int16 using AVX SIMD (Int16x8, 8 lanes).
 // Overflow: The accumulation is performed using int16, which can overflow for large collections.
 // If the sum exceeds the int16 range (-32768 to 32767), the result will wrap around silently.
 // For collections that may overflow, consider using a wider type or handle overflow detection externally.
@@ -79,7 +79,7 @@ func SumInt16x8[T ~int16](collection []T) T {
 	return sum
 }

-// SumInt32x4 sums a slice of int32 using SSE SIMD (Int32x4, 4 lanes).
+// SumInt32x4 sums a slice of int32 using AVX SIMD (Int32x4, 4 lanes).
 // Overflow: The accumulation is performed using int32, which can overflow for very large collections.
 // If the sum exceeds the int32 range (-2147483648 to 2147483647), the result will wrap around silently.
 // For collections that may overflow, consider using SumInt64x2 or handle overflow detection externally.
@@ -113,7 +113,7 @@ func SumInt32x4[T ~int32](collection []T) T {
 	return sum
 }

-// SumInt64x2 sums a slice of int64 using SSE SIMD (Int64x2, 2 lanes).
+// SumInt64x2 sums a slice of int64 using AVX SIMD (Int64x2, 2 lanes).
 // Overflow: The accumulation is performed using int64, which can overflow for extremely large collections.
 // If the sum exceeds the int64 range, the result will wrap around silently.
 // For collections that may overflow, handle overflow detection externally (e.g., using big.Int).
@@ -147,7 +147,7 @@ func SumInt64x2[T ~int64](collection []T) T {
 	return sum
 }

-// SumUint8x16 sums a slice of uint8 using SSE SIMD (Uint8x16, 16 lanes).
+// SumUint8x16 sums a slice of uint8 using AVX SIMD (Uint8x16, 16 lanes).
 // Overflow: The accumulation is performed using uint8, which can overflow for large collections.
 // If the sum exceeds the uint8 range (0 to 255), the result will wrap around silently.
 // For collections that may overflow, consider using a wider type or handle overflow detection externally.
@@ -181,7 +181,7 @@ func SumUint8x16[T ~uint8](collection []T) T {
 	return sum
 }

-// SumUint16x8 sums a slice of uint16 using SSE SIMD (Uint16x8, 8 lanes).
+// SumUint16x8 sums a slice of uint16 using AVX SIMD (Uint16x8, 8 lanes).
 // Overflow: The accumulation is performed using uint16, which can overflow for large collections.
 // If the sum exceeds the uint16 range (0 to 65535), the result will wrap around silently.
 // For collections that may overflow, consider using a wider type or handle overflow detection externally.
@@ -215,7 +215,7 @@ func SumUint16x8[T ~uint16](collection []T) T {
 	return sum
 }

-// SumUint32x4 sums a slice of uint32 using SSE SIMD (Uint32x4, 4 lanes).
+// SumUint32x4 sums a slice of uint32 using AVX SIMD (Uint32x4, 4 lanes).
 // Overflow: The accumulation is performed using uint32, which can overflow for very large collections.
 // If the sum exceeds the uint32 range (0 to 4294967295), the result will wrap around silently.
 // For collections that may overflow, consider using SumUint64x2 or handle overflow detection externally.
@@ -249,7 +249,7 @@ func SumUint32x4[T ~uint32](collection []T) T {
 	return sum
 }

-// SumUint64x2 sums a slice of uint64 using SSE SIMD (Uint64x2, 2 lanes).
+// SumUint64x2 sums a slice of uint64 using AVX SIMD (Uint64x2, 2 lanes).
 // Overflow: The accumulation is performed using uint64, which can overflow for extremely large collections.
 // If the sum exceeds the uint64 range, the result will wrap around silently.
 // For collections that may overflow, handle overflow detection externally (e.g., using big.Int).
@@ -283,7 +283,7 @@ func SumUint64x2[T ~uint64](collection []T) T {
 	return sum
 }

-// SumFloat32x4 sums a slice of float32 using SSE SIMD (Float32x4, 4 lanes).
+// SumFloat32x4 sums a slice of float32 using AVX SIMD (Float32x4, 4 lanes).
 // Overflow: The accumulation is performed using float32. Overflow will result in +/-Inf rather than wrapping.
 // For collections requiring high precision or large sums, consider using SumFloat64x2.
 func SumFloat32x4[T ~float32](collection []T) T {
@@ -316,7 +316,7 @@ func SumFloat32x4[T ~float32](collection []T) T {
 	return sum
 }

-// SumFloat64x2 sums a slice of float64 using SSE SIMD (Float64x2, 2 lanes).
+// SumFloat64x2 sums a slice of float64 using AVX SIMD (Float64x2, 2 lanes).
 // Overflow: The accumulation is performed using float64. Overflow will result in +/-Inf rather than wrapping.
 // For collections that may overflow, handle overflow detection externally (e.g., using big.Float).
 func SumFloat64x2[T ~float64](collection []T) T {
@@ -349,7 +349,7 @@ func SumFloat64x2[T ~float64](collection []T) T {
 	return sum
 }

-// MeanInt8x16 calculates the mean of a slice of int8 using SSE SIMD
+// MeanInt8x16 calculates the mean of a slice of int8 using AVX SIMD
 func MeanInt8x16[T ~int8](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -359,7 +359,7 @@ func MeanInt8x16[T ~int8](collection []T) T {
 	return sum / T(length)
 }

-// MeanInt16x8 calculates the mean of a slice of int16 using SSE SIMD
+// MeanInt16x8 calculates the mean of a slice of int16 using AVX SIMD
 func MeanInt16x8[T ~int16](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -369,7 +369,7 @@ func MeanInt16x8[T ~int16](collection []T) T {
 	return sum / T(length)
 }

-// MeanInt32x4 calculates the mean of a slice of int32 using SSE SIMD
+// MeanInt32x4 calculates the mean of a slice of int32 using AVX SIMD
 func MeanInt32x4[T ~int32](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -379,7 +379,7 @@ func MeanInt32x4[T ~int32](collection []T) T {
 	return sum / T(length)
 }

-// MeanInt64x2 calculates the mean of a slice of int64 using SSE SIMD
+// MeanInt64x2 calculates the mean of a slice of int64 using AVX SIMD
 func MeanInt64x2[T ~int64](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -389,7 +389,7 @@ func MeanInt64x2[T ~int64](collection []T) T {
 	return sum / T(length)
 }

-// MeanUint8x16 calculates the mean of a slice of uint8 using SSE SIMD
+// MeanUint8x16 calculates the mean of a slice of uint8 using AVX SIMD
 func MeanUint8x16[T ~uint8](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -399,7 +399,7 @@ func MeanUint8x16[T ~uint8](collection []T) T {
 	return sum / T(length)
 }

-// MeanUint16x8 calculates the mean of a slice of uint16 using SSE SIMD
+// MeanUint16x8 calculates the mean of a slice of uint16 using AVX SIMD
 func MeanUint16x8[T ~uint16](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -409,7 +409,7 @@ func MeanUint16x8[T ~uint16](collection []T) T {
 	return sum / T(length)
 }

-// MeanUint32x4 calculates the mean of a slice of uint32 using SSE SIMD
+// MeanUint32x4 calculates the mean of a slice of uint32 using AVX SIMD
 func MeanUint32x4[T ~uint32](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -419,7 +419,7 @@ func MeanUint32x4[T ~uint32](collection []T) T {
 	return sum / T(length)
 }

-// MeanUint64x2 calculates the mean of a slice of uint64 using SSE SIMD
+// MeanUint64x2 calculates the mean of a slice of uint64 using AVX SIMD
 func MeanUint64x2[T ~uint64](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -429,7 +429,7 @@ func MeanUint64x2[T ~uint64](collection []T) T {
 	return sum / T(length)
 }

-// MeanFloat32x4 calculates the mean of a slice of float32 using SSE SIMD
+// MeanFloat32x4 calculates the mean of a slice of float32 using AVX SIMD
 func MeanFloat32x4[T ~float32](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -440,7 +440,7 @@ func MeanFloat32x4[T ~float32](collection []T) T {
 	return sum / T(length)
 }

-// MeanFloat64x2 calculates the mean of a slice of float64 using SSE SIMD
+// MeanFloat64x2 calculates the mean of a slice of float64 using AVX SIMD
 func MeanFloat64x2[T ~float64](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -450,7 +450,7 @@ func MeanFloat64x2[T ~float64](collection []T) T {
 	return sum / T(length)
 }

-// ClampInt8x16 clamps each element in collection between min and max values using SSE SIMD
+// ClampInt8x16 clamps each element in collection between min and max values using AVX SIMD
 func ClampInt8x16[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice {
 	length := uint(len(collection))
 	if length == 0 {
@@ -488,7 +488,7 @@ func ClampInt8x16[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice {
 	return result
 }

-// ClampInt16x8 clamps each element in collection between min and max values using SSE SIMD
+// ClampInt16x8 clamps each element in collection between min and max values using AVX SIMD
 func ClampInt16x8[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice {
 	length := uint(len(collection))
 	if length == 0 {
@@ -526,7 +526,7 @@ func ClampInt16x8[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice {
 	return result
 }

-// ClampInt32x4 clamps each element in collection between min and max values using SSE SIMD
+// ClampInt32x4 clamps each element in collection between min and max values using AVX SIMD
 func ClampInt32x4[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice {
 	length := uint(len(collection))
 	if length == 0 {
@@ -564,45 +564,7 @@ func ClampInt32x4[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice {
 	return result
 }

-// ClampInt64x2 clamps each element in collection between min and max values using SSE SIMD and AVX-512 SIMD.
-func ClampInt64x2[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
-	length := uint(len(collection))
-	if length == 0 {
-		return collection
-	}
-
-	result := make(Slice, length)
-	const lanes = simdLanes2
-
-	base := unsafeSliceInt64(collection, length)
-
-	minVec := archsimd.BroadcastInt64x2(int64(min))
-	maxVec := archsimd.BroadcastInt64x2(int64(max))
-
-	i := uint(0)
-	for ; i+lanes <= length; i += lanes {
-		v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
-
-		clamped := v.Max(minVec).Min(maxVec)
-
-		// bearer:disable go_gosec_unsafe_unsafe
-		clamped.Store((*[lanes]int64)(unsafe.Pointer(&result[i])))
-	}
-
-	for ; i < length; i++ {
-		val := collection[i]
-		if val < min {
-			val = min
-		} else if val > max {
-			val = max
-		}
-		result[i] = val
-	}
-
-	return result
-}
-
-// ClampUint8x16 clamps each element in collection between min and max values using SSE SIMD
+// ClampUint8x16 clamps each element in collection between min and max values using AVX SIMD
 func ClampUint8x16[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice {
 	length := uint(len(collection))
 	if length == 0 {
@@ -640,7 +602,7 @@ func ClampUint8x16[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice {
 	return result
 }

-// ClampUint16x8 clamps each element in collection between min and max values using SSE SIMD
+// ClampUint16x8 clamps each element in collection between min and max values using AVX SIMD
 func ClampUint16x8[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice {
 	length := uint(len(collection))
 	if length == 0 {
@@ -678,7 +640,7 @@ func ClampUint16x8[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice {
 	return result
 }

-// ClampUint32x4 clamps each element in collection between min and max values using SSE SIMD
+// ClampUint32x4 clamps each element in collection between min and max values using AVX SIMD
 func ClampUint32x4[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice {
 	length := uint(len(collection))
 	if length == 0 {
@@ -716,45 +678,7 @@ func ClampUint32x4[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice {
 	return result
 }

-// ClampUint64x2 clamps each element in collection between min and max values using SSE SIMD and AVX-512 SIMD.
-func ClampUint64x2[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice {
-	length := uint(len(collection))
-	if length == 0 {
-		return collection
-	}
-
-	result := make(Slice, length)
-	const lanes = simdLanes2
-
-	base := unsafeSliceUint64(collection, length)
-
-	minVec := archsimd.BroadcastUint64x2(uint64(min))
-	maxVec := archsimd.BroadcastUint64x2(uint64(max))
-
-	i := uint(0)
-	for ; i+lanes <= length; i += lanes {
-		v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
-
-		clamped := v.Max(minVec).Min(maxVec)
-
-		// bearer:disable go_gosec_unsafe_unsafe
-		clamped.Store((*[lanes]uint64)(unsafe.Pointer(&result[i])))
-	}
-
-	for ; i < length; i++ {
-		val := collection[i]
-		if val < min {
-			val = min
-		} else if val > max {
-			val = max
-		}
-		result[i] = val
-	}
-
-	return result
-}
-
-// ClampFloat32x4 clamps each element in collection between min and max values using SSE SIMD
+// ClampFloat32x4 clamps each element in collection between min and max values using AVX SIMD
 func ClampFloat32x4[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice {
 	length := uint(len(collection))
 	if length == 0 {
@@ -792,7 +716,7 @@ func ClampFloat32x4[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice
 	return result
 }

-// ClampFloat64x2 clamps each element in collection between min and max values using SSE SIMD
+// ClampFloat64x2 clamps each element in collection between min and max values using AVX SIMD
 func ClampFloat64x2[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice {
 	length := uint(len(collection))
 	if length == 0 {
@@ -830,7 +754,7 @@ func ClampFloat64x2[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice
 	return result
 }

-// MinInt8x16 finds the minimum value in a collection of int8 using SSE SIMD
+// MinInt8x16 finds the minimum value in a collection of int8 using AVX SIMD
 func MinInt8x16[T ~int8](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -877,7 +801,7 @@ func MinInt8x16[T ~int8](collection []T) T {
 	return T(minVal)
 }

-// MinInt16x8 finds the minimum value in a collection of int16 using SSE SIMD
+// MinInt16x8 finds the minimum value in a collection of int16 using AVX SIMD
 func MinInt16x8[T ~int16](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -921,7 +845,7 @@ func MinInt16x8[T ~int16](collection []T) T {
 	return T(minVal)
 }

-// MinInt32x4 finds the minimum value in a collection of int32 using SSE SIMD
+// MinInt32x4 finds the minimum value in a collection of int32 using AVX SIMD
 func MinInt32x4[T ~int32](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -965,51 +889,7 @@ func MinInt32x4[T ~int32](collection []T) T {
 	return T(minVal)
 }

-// MinInt64x2 finds the minimum value in a collection of int64 using SSE SIMD
-func MinInt64x2[T ~int64](collection []T) T {
-	length := uint(len(collection))
-	if length == 0 {
-		return 0
-	}
-
-	const lanes = simdLanes2
-	base := unsafeSliceInt64(collection, length)
-
-	var minVec archsimd.Int64x2
-	firstInitialized := false
-
-	i := uint(0)
-	for ; i+lanes <= length; i += lanes {
-		v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
-
-		if !firstInitialized {
-			minVec = v
-			firstInitialized = true
-		} else {
-			minVec = minVec.Min(v)
-		}
-	}
-
-	// Find minimum in the vector (only if we processed any vectors)
-	var minVal int64
-	if firstInitialized {
-		var buf [lanes]int64
-		minVec.Store(&buf)
-		minVal = min(buf[0], buf[1])
-	}
-
-	// Handle remaining elements
-	for ; i < length; i++ {
-		if !firstInitialized || collection[i] < T(minVal) {
-			minVal = int64(collection[i])
-			firstInitialized = true
-		}
-	}
-
-	return T(minVal)
-}
-
-// MinUint8x16 finds the minimum value in a collection of uint8 using SSE SIMD
+// MinUint8x16 finds the minimum value in a collection of uint8 using AVX SIMD
 func MinUint8x16[T ~uint8](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1056,7 +936,7 @@ func MinUint8x16[T ~uint8](collection []T) T {
 	return T(minVal)
 }

-// MinUint16x8 finds the minimum value in a collection of uint16 using SSE SIMD
+// MinUint16x8 finds the minimum value in a collection of uint16 using AVX SIMD
 func MinUint16x8[T ~uint16](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1100,7 +980,7 @@ func MinUint16x8[T ~uint16](collection []T) T {
 	return T(minVal)
 }

-// MinUint32x4 finds the minimum value in a collection of uint32 using SSE SIMD
+// MinUint32x4 finds the minimum value in a collection of uint32 using AVX SIMD
 func MinUint32x4[T ~uint32](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1144,51 +1024,7 @@ func MinUint32x4[T ~uint32](collection []T) T {
 	return T(minVal)
 }

-// MinUint64x2 finds the minimum value in a collection of uint64 using SSE SIMD
-func MinUint64x2[T ~uint64](collection []T) T {
-	length := uint(len(collection))
-	if length == 0 {
-		return 0
-	}
-
-	const lanes = simdLanes2
-	base := unsafeSliceUint64(collection, length)
-
-	var minVec archsimd.Uint64x2
-	firstInitialized := false
-
-	i := uint(0)
-	for ; i+lanes <= length; i += lanes {
-		v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
-
-		if !firstInitialized {
-			minVec = v
-			firstInitialized = true
-		} else {
-			minVec = minVec.Min(v)
-		}
-	}
-
-	// Find minimum in the vector (only if we processed any vectors)
-	var minVal uint64
-	if firstInitialized {
-		var buf [lanes]uint64
-		minVec.Store(&buf)
-		minVal = min(buf[0], buf[1])
-	}
-
-	// Handle remaining elements
-	for ; i < length; i++ {
-		if !firstInitialized || collection[i] < T(minVal) {
-			minVal = uint64(collection[i])
-			firstInitialized = true
-		}
-	}
-
-	return T(minVal)
-}
-
-// MinFloat32x4 finds the minimum value in a collection of float32 using SSE SIMD
+// MinFloat32x4 finds the minimum value in a collection of float32 using AVX SIMD
 func MinFloat32x4[T ~float32](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1232,7 +1068,7 @@ func MinFloat32x4[T ~float32](collection []T) T {
 	return T(minVal)
 }

-// MinFloat64x2 finds the minimum value in a collection of float64 using SSE SIMD
+// MinFloat64x2 finds the minimum value in a collection of float64 using AVX SIMD
 func MinFloat64x2[T ~float64](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1276,7 +1112,7 @@ func MinFloat64x2[T ~float64](collection []T) T {
 	return T(minVal)
 }

-// MaxInt8x16 finds the maximum value in a collection of int8 using SSE SIMD
+// MaxInt8x16 finds the maximum value in a collection of int8 using AVX SIMD
 func MaxInt8x16[T ~int8](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1323,7 +1159,7 @@ func MaxInt8x16[T ~int8](collection []T) T {
 	return T(maxVal)
 }

-// MaxInt16x8 finds the maximum value in a collection of int16 using SSE SIMD
+// MaxInt16x8 finds the maximum value in a collection of int16 using AVX SIMD
 func MaxInt16x8[T ~int16](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1367,7 +1203,7 @@ func MaxInt16x8[T ~int16](collection []T) T {
 	return T(maxVal)
 }

-// MaxInt32x4 finds the maximum value in a collection of int32 using SSE SIMD
+// MaxInt32x4 finds the maximum value in a collection of int32 using AVX SIMD
 func MaxInt32x4[T ~int32](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1411,51 +1247,7 @@ func MaxInt32x4[T ~int32](collection []T) T {
 	return T(maxVal)
 }

-// MaxInt64x2 finds the maximum value in a collection of int64 using SSE SIMD
-func MaxInt64x2[T ~int64](collection []T) T {
-	length := uint(len(collection))
-	if length == 0 {
-		return 0
-	}
-
-	const lanes = simdLanes2
-	base := unsafeSliceInt64(collection, length)
-
-	var maxVec archsimd.Int64x2
-	firstInitialized := false
-
-	i := uint(0)
-	for ; i+lanes <= length; i += lanes {
-		v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
-
-		if !firstInitialized {
-			maxVec = v
-			firstInitialized = true
-		} else {
-			maxVec = maxVec.Max(v)
-		}
-	}
-
-	// Find maximum in the vector (only if we processed any vectors)
-	var maxVal int64
-	if firstInitialized {
-		var buf [lanes]int64
-		maxVec.Store(&buf)
-		maxVal = max(buf[0], buf[1])
-	}
-
-	// Handle remaining elements
-	for ; i < length; i++ {
-		if !firstInitialized || collection[i] > T(maxVal) {
-			maxVal = int64(collection[i])
-			firstInitialized = true
-		}
-	}
-
-	return T(maxVal)
-}
-
-// MaxUint8x16 finds the maximum value in a collection of uint8 using SSE SIMD
+// MaxUint8x16 finds the maximum value in a collection of uint8 using AVX SIMD
 func MaxUint8x16[T ~uint8](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1502,7 +1294,7 @@ func MaxUint8x16[T ~uint8](collection []T) T {
 	return T(maxVal)
 }

-// MaxUint16x8 finds the maximum value in a collection of uint16 using SSE SIMD
+// MaxUint16x8 finds the maximum value in a collection of uint16 using AVX SIMD
 func MaxUint16x8[T ~uint16](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1546,7 +1338,7 @@ func MaxUint16x8[T ~uint16](collection []T) T {
 	return T(maxVal)
 }

-// MaxUint32x4 finds the maximum value in a collection of uint32 using SSE SIMD
+// MaxUint32x4 finds the maximum value in a collection of uint32 using AVX SIMD
 func MaxUint32x4[T ~uint32](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1590,51 +1382,7 @@ func MaxUint32x4[T ~uint32](collection []T) T {
 	return T(maxVal)
 }

-// MaxUint64x2 finds the maximum value in a collection of uint64 using SSE SIMD
-func MaxUint64x2[T ~uint64](collection []T) T {
-	length := uint(len(collection))
-	if length == 0 {
-		return 0
-	}
-
-	const lanes = simdLanes2
-	base := unsafeSliceUint64(collection, length)
-
-	var maxVec archsimd.Uint64x2
-	firstInitialized := false
-
-	i := uint(0)
-	for ; i+lanes <= length; i += lanes {
-		v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
-
-		if !firstInitialized {
-			maxVec = v
-			firstInitialized = true
-		} else {
-			maxVec = maxVec.Max(v)
-		}
-	}
-
-	// Find maximum in the vector (only if we processed any vectors)
-	var maxVal uint64
-	if firstInitialized {
-		var buf [lanes]uint64
-		maxVec.Store(&buf)
-		maxVal = max(buf[0], buf[1])
-	}
-
-	// Handle remaining elements
-	for ; i < length; i++ {
-		if !firstInitialized || collection[i] > T(maxVal) {
-			maxVal = uint64(collection[i])
-			firstInitialized = true
-		}
-	}
-
-	return T(maxVal)
-}
-
-// MaxFloat32x4 finds the maximum value in a collection of float32 using SSE SIMD
+// MaxFloat32x4 finds the maximum value in a collection of float32 using AVX SIMD
 func MaxFloat32x4[T ~float32](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1678,7 +1426,7 @@ func MaxFloat32x4[T ~float32](collection []T) T {
 	return T(maxVal)
 }

-// MaxFloat64x2 finds the maximum value in a collection of float64 using SSE SIMD
+// MaxFloat64x2 finds the maximum value in a collection of float64 using AVX SIMD
 func MaxFloat64x2[T ~float64](collection []T) T {
 	length := uint(len(collection))
 	if length == 0 {
@@ -1722,127 +1470,127 @@ func MaxFloat64x2[T ~float64](collection []T) T {
 	return T(maxVal)
 }

-// SSE (128-bit) SIMD sumBy functions - 16/8/4/2 lanes
+// AVX (128-bit) SIMD sumBy functions - 16/8/4/2 lanes
 // These implementations use lo.Map to apply the iteratee, then chain with SIMD sum functions.

-// SumByInt8x16 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByInt8x16 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumInt8x16(mapped)
 }

-// SumByInt16x8 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByInt16x8 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumInt16x8(mapped)
 }

-// SumByInt32x4 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByInt32x4 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumInt32x4(mapped)
 }

-// SumByInt64x2 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByInt64x2 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumInt64x2(mapped)
 }

-// SumByUint8x16 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByUint8x16 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumUint8x16(mapped)
 }

-// SumByUint16x8 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByUint16x8 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumUint16x8(mapped)
 }

-// SumByUint32x4 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByUint32x4 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumUint32x4(mapped)
 }

-// SumByUint64x2 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByUint64x2 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumUint64x2(mapped)
 }

-// SumByFloat32x4 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByFloat32x4 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumFloat32x4(mapped)
 }

-// SumByFloat64x2 sums the values extracted by iteratee from a slice using SSE SIMD.
+// SumByFloat64x2 sums the values extracted by iteratee from a slice using AVX SIMD.
 func SumByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return SumFloat64x2(mapped)
 }

-// SSE (128-bit) SIMD meanBy functions - 16/8/4/2 lanes
+// AVX (128-bit) SIMD meanBy functions - 16/8/4/2 lanes
 // These implementations use lo.Map to apply the iteratee, then chain with SIMD mean functions.

-// MeanByInt8x16 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByInt8x16 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanInt8x16(mapped)
 }

-// MeanByInt16x8 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByInt16x8 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanInt16x8(mapped)
 }

-// MeanByInt32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByInt32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanInt32x4(mapped)
 }

-// MeanByInt64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByInt64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanInt64x2(mapped)
 }

-// MeanByUint8x16 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByUint8x16 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanUint8x16(mapped)
 }

-// MeanByUint16x8 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByUint16x8 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanUint16x8(mapped)
 }

-// MeanByUint32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByUint32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanUint32x4(mapped)
 }

-// MeanByUint64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByUint64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanUint64x2(mapped)
 }

-// MeanByFloat32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByFloat32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanFloat32x4(mapped)
 }

-// MeanByFloat64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
+// MeanByFloat64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
 func MeanByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R {
 	mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
 	return MeanFloat64x2(mapped)
@@ -566,6 +566,84 @@ func ClampInt32x16[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice {
 	return result
 }

+// ClampInt64x2 returns a new slice in which every element of collection is limited to the range [min, max], processing two int64 lanes per step with Int64x2 vectors; an empty collection is returned as-is.
+// Int64x2 Min/Max operations in archsimd require AVX-512 (VPMAXSQ/VPMINSQ). NOTE(review): when min > max the vector loop yields max where the scalar tail yields min for elements below min, so callers should pass min <= max.
+func ClampInt64x2[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
+	length := uint(len(collection))
+	if length == 0 {
+		return collection // empty input is handed back unchanged (not copied)
+	}
+
+	result := make(Slice, length) // output buffer; this function only stores into result
+	const lanes = simdLanes2
+
+	base := unsafeSliceInt64(collection, length) // presumably views []T as []int64 without copying - confirm in helper
+
+	minVec := archsimd.BroadcastInt64x2(int64(min)) // bounds are loop-invariant, so build the vectors once
+	maxVec := archsimd.BroadcastInt64x2(int64(max))
+
+	i := uint(0)
+	for ; i+lanes <= length; i += lanes { // full vectors only; leftovers fall through to the scalar loop
+		v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
+
+		clamped := v.Max(minVec).Min(maxVec) // raise to min first, then cap at max
+
+		// bearer:disable go_gosec_unsafe_unsafe
+		clamped.Store((*[lanes]int64)(unsafe.Pointer(&result[i])))
+	}
+
+	for ; i < length; i++ { // scalar tail for the elements that did not fill a vector
+		val := collection[i]
+		if val < min {
+			val = min
+		} else if val > max {
+			val = max
+		}
+		result[i] = val
+	}
+
+	return result
+}
+
+// ClampUint64x2 returns a new slice in which every element of collection is limited to the range [min, max], processing two uint64 lanes per step with Uint64x2 vectors; an empty collection is returned as-is.
+// Uint64x2 Min/Max operations in archsimd require AVX-512. NOTE(review): when min > max the vector loop yields max where the scalar tail yields min for elements below min, so callers should pass min <= max.
+func ClampUint64x2[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice {
+	length := uint(len(collection))
+	if length == 0 {
+		return collection // empty input is handed back unchanged (not copied)
+	}
+
+	result := make(Slice, length) // output buffer; this function only stores into result
+	const lanes = simdLanes2
+
+	base := unsafeSliceUint64(collection, length) // presumably views []T as []uint64 without copying - confirm in helper
+
+	minVec := archsimd.BroadcastUint64x2(uint64(min)) // bounds are loop-invariant, so build the vectors once
+	maxVec := archsimd.BroadcastUint64x2(uint64(max))
+
+	i := uint(0)
+	for ; i+lanes <= length; i += lanes { // full vectors only; leftovers fall through to the scalar loop
+		v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
+
+		clamped := v.Max(minVec).Min(maxVec) // raise to min first, then cap at max
+
+		// bearer:disable go_gosec_unsafe_unsafe
+		clamped.Store((*[lanes]uint64)(unsafe.Pointer(&result[i])))
+	}
+
+	for ; i < length; i++ { // scalar tail for the elements that did not fill a vector
+		val := collection[i]
+		if val < min {
+			val = min
+		} else if val > max {
+			val = max
+		}
+		result[i] = val
+	}
+
+	return result
+}
+
 // ClampInt64x8 clamps each element in collection between min and max values using AVX-512 SIMD
 func ClampInt64x8[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
 	length := uint(len(collection))
@@ -991,6 +1069,96 @@ func MinInt32x16[T ~int32](collection []T) T {
 	return T(minVal)
 }

+// MinInt64x2 returns the smallest value in collection, or 0 when collection is empty, comparing two int64 lanes per step with Int64x2 vectors.
+// Int64x2 Min operations in archsimd require AVX-512.
+func MinInt64x2[T ~int64](collection []T) T {
+	length := uint(len(collection))
+	if length == 0 {
+		return 0 // no elements: the zero value stands in for "no minimum"
+	}
+
+	const lanes = simdLanes2
+	base := unsafeSliceInt64(collection, length) // presumably views []T as []int64 without copying - confirm in helper
+
+	var minVec archsimd.Int64x2
+	firstInitialized := false // becomes true once minVec/minVal holds a real element
+
+	i := uint(0)
+	for ; i+lanes <= length; i += lanes { // full vectors only; leftovers are handled after the reduction
+		v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
+
+		if !firstInitialized {
+			minVec = v // seed with the first vector so no sentinel value is needed
+			firstInitialized = true
+		} else {
+			minVec = minVec.Min(v)
+		}
+	}
+
+	// Reduce the per-lane minima to one scalar (skipped when no full vector was loaded).
+	var minVal int64
+	if firstInitialized {
+		var buf [lanes]int64
+		minVec.Store(&buf)
+		minVal = min(buf[0], buf[1])
+	}
+
+	// Scalar tail: fold in leftover elements; the first one seeds minVal when length < lanes.
+	for ; i < length; i++ {
+		if !firstInitialized || collection[i] < T(minVal) {
+			minVal = int64(collection[i])
+			firstInitialized = true
+		}
+	}
+
+	return T(minVal)
+}
+
+// MinUint64x2 returns the smallest value in collection, or 0 when collection is empty, comparing two uint64 lanes per step with Uint64x2 vectors.
+// Uint64x2 Min operations in archsimd require AVX-512.
+func MinUint64x2[T ~uint64](collection []T) T {
+	length := uint(len(collection))
+	if length == 0 {
+		return 0 // no elements: the zero value stands in for "no minimum"
+	}
+
+	const lanes = simdLanes2
+	base := unsafeSliceUint64(collection, length) // presumably views []T as []uint64 without copying - confirm in helper
+
+	var minVec archsimd.Uint64x2
+	firstInitialized := false // becomes true once minVec/minVal holds a real element
+
+	i := uint(0)
+	for ; i+lanes <= length; i += lanes { // full vectors only; leftovers are handled after the reduction
+		v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
+
+		if !firstInitialized {
+			minVec = v // seed with the first vector so no sentinel value is needed
+			firstInitialized = true
+		} else {
+			minVec = minVec.Min(v)
+		}
+	}
+
+	// Reduce the per-lane minima to one scalar (skipped when no full vector was loaded).
+	var minVal uint64
+	if firstInitialized {
+		var buf [lanes]uint64
+		minVec.Store(&buf)
+		minVal = min(buf[0], buf[1])
+	}
+
+	// Scalar tail: fold in leftover elements; the first one seeds minVal when length < lanes.
+	for ; i < length; i++ {
+		if !firstInitialized || collection[i] < T(minVal) {
+			minVal = uint64(collection[i])
+			firstInitialized = true
+		}
+	}
+
+	return T(minVal)
+}
+
 // MinInt64x8 finds the minimum value in a collection of int64 using AVX-512 SIMD
 func MinInt64x8[T ~int64](collection []T) T {
 	length := uint(len(collection))
@@ -1478,6 +1646,96 @@ func MaxInt32x16[T ~int32](collection []T) T {
 	return T(maxVal)
 }

+// MaxInt64x2 finds the maximum value in a collection of int64 using AVX-512 SIMD,
+// two lanes per step; an empty collection yields 0. Int64x2 Max operations in archsimd require AVX-512.
+func MaxInt64x2[T ~int64](collection []T) T {
+	length := uint(len(collection))
+	if length == 0 {
+		return 0 // nothing to compare: report the zero value
+	}
+
+	const lanes = simdLanes2
+	base := unsafeSliceInt64(collection, length) // presumably views []T as []int64 without copying — confirm in helper
+
+	var maxVec archsimd.Int64x2
+	firstInitialized := false // becomes true once maxVec or maxVal holds a real element
+
+	i := uint(0)
+	for ; i+lanes <= length; i += lanes { // full vectors only; a leftover element is handled by the scalar loop
+		v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
+
+		if !firstInitialized {
+			maxVec = v // seed with the first vector so negative-only inputs need no sentinel
+			firstInitialized = true
+		} else {
+			maxVec = maxVec.Max(v) // fold this vector into the running maximum
+		}
+	}
+
+	// Reduce the vector to a scalar (only if at least one full vector was processed)
+	var maxVal int64
+	if firstInitialized {
+		var buf [lanes]int64
+		maxVec.Store(&buf)
+		maxVal = max(buf[0], buf[1])
+	}
+
+	// Handle the remaining tail; if no vector was processed, the first tail element seeds maxVal
+	for ; i < length; i++ {
+		if !firstInitialized || collection[i] > T(maxVal) {
+			maxVal = int64(collection[i])
+			firstInitialized = true
+		}
+	}
+
+	return T(maxVal)
+}
+
+// MaxUint64x2 finds the maximum value in a collection of uint64 using AVX-512 SIMD,
+// two lanes per step; an empty collection yields 0. Uint64x2 Max operations in archsimd require AVX-512.
+func MaxUint64x2[T ~uint64](collection []T) T {
+	length := uint(len(collection))
+	if length == 0 {
+		return 0 // nothing to compare: report the zero value
+	}
+
+	const lanes = simdLanes2
+	base := unsafeSliceUint64(collection, length) // presumably views []T as []uint64 without copying — confirm in helper
+
+	var maxVec archsimd.Uint64x2
+	firstInitialized := false // becomes true once maxVec or maxVal holds a real element
+
+	i := uint(0)
+	for ; i+lanes <= length; i += lanes { // full vectors only; a leftover element is handled by the scalar loop
+		v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
+
+		if !firstInitialized {
+			maxVec = v // seed with the first vector instead of a sentinel value
+			firstInitialized = true
+		} else {
+			maxVec = maxVec.Max(v) // fold this vector into the running maximum
+		}
+	}
+
+	// Reduce the vector to a scalar (only if at least one full vector was processed)
+	var maxVal uint64
+	if firstInitialized {
+		var buf [lanes]uint64
+		maxVec.Store(&buf)
+		maxVal = max(buf[0], buf[1])
+	}
+
+	// Handle the remaining tail; if no vector was processed, the first tail element seeds maxVal
+	for ; i < length; i++ {
+		if !firstInitialized || collection[i] > T(maxVal) {
+			maxVal = uint64(collection[i])
+			firstInitialized = true
+		}
+	}
+
+	return T(maxVal)
+}
+
 // MaxInt64x8 finds the maximum value in a collection of int64 using AVX-512 SIMD
 func MaxInt64x8[T ~int64](collection []T) T {
 	length := uint(len(collection))
@@ -819,6 +819,55 @@ func TestClampInt32x16(t *testing.T) {
 	}
 }

+func TestClampInt64x2(t *testing.T) { // compares ClampInt64x2 with a scalar clamp over several input lengths
+	requireAVX512(t) // presumably skips when AVX-512 is unavailable — confirm in helper
+	testCases := []struct {
+		name  string
+		input []int64
+		min   int64
+		max   int64
+	}{
+		{"empty", []int64{}, -100, 100},
+		{"single", []int64{42}, -10, 10},
+		{"small", []int64{1, 2, 3, 4, 5}, 2, 4},
+		{"exactly 2", []int64{-100, 200}, -50, 50},
+		{"large", make([]int64, 1000), -50, 50}, // zero-filled here; randomized inside the subtest
+		{"all below min", []int64{-1000, -2000, -3000}, -500, 100},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { // matches only the zero-filled "large" case
+				for i := range tc.input {
+					tc.input[i] = rand.Int64()
+				}
+			}
+
+			got := ClampInt64x2(tc.input, tc.min, tc.max)
+
+			if len(got) != len(tc.input) {
+				t.Errorf("ClampInt64x2() returned length %d, want %d", len(got), len(tc.input))
+			}
+
+			for i, v := range got {
+				if v < tc.min || v > tc.max { // range check, independent of the exact-value check below
+					t.Errorf("ClampInt64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
+				}
+				original := tc.input[i]
+				expected := original // scalar reference: clamp the input element by hand
+				if expected < tc.min {
+					expected = tc.min
+				} else if expected > tc.max {
+					expected = tc.max
+				}
+				if v != expected {
+					t.Errorf("ClampInt64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
+				}
+			}
+		})
+	}
+}
+
 func TestClampInt64x8(t *testing.T) {
 	requireAVX512(t)
 	testCases := []struct {
@@ -1018,6 +1067,55 @@ func TestClampUint32x16(t *testing.T) {
 	}
 }

+func TestClampUint64x2(t *testing.T) { // compares ClampUint64x2 with a scalar clamp over several input lengths
+	requireAVX512(t) // presumably skips when AVX-512 is unavailable — confirm in helper
+	testCases := []struct {
+		name  string
+		input []uint64
+		min   uint64
+		max   uint64
+	}{
+		{"empty", []uint64{}, 100, 1000},
+		{"single", []uint64{42}, 10, 100},
+		{"small", []uint64{1, 2, 3, 4, 5}, 2, 4},
+		{"exactly 2", []uint64{50, 2000}, 100, 1000},
+		{"large", make([]uint64, 1000), 500, 5000}, // zero-filled here; randomized inside the subtest
+		{"all below min", []uint64{1, 2, 3}, 10, 100},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { // matches only the zero-filled "large" case
+				for i := range tc.input {
+					tc.input[i] = rand.Uint64()
+				}
+			}
+
+			got := ClampUint64x2(tc.input, tc.min, tc.max)
+
+			if len(got) != len(tc.input) {
+				t.Errorf("ClampUint64x2() returned length %d, want %d", len(got), len(tc.input))
+			}
+
+			for i, v := range got {
+				if v < tc.min || v > tc.max { // range check, independent of the exact-value check below
+					t.Errorf("ClampUint64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
+				}
+				original := tc.input[i]
+				expected := original // scalar reference: clamp the input element by hand
+				if expected < tc.min {
+					expected = tc.min
+				} else if expected > tc.max {
+					expected = tc.max
+				}
+				if v != expected {
+					t.Errorf("ClampUint64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
+				}
+			}
+		})
+	}
+}
+
 func TestClampUint64x8(t *testing.T) {
 	requireAVX512(t)
 	testCases := []struct {
@@ -1292,6 +1390,38 @@ func TestMinInt32x16(t *testing.T) {
 	}
 }

+func TestMinInt64x2(t *testing.T) { // checks MinInt64x2 against lo.Min for empty, sub-vector, exact and large inputs
+	requireAVX512(t) // presumably skips when AVX-512 is unavailable — confirm in helper
+	testCases := []struct {
+		name  string
+		input []int64
+	}{
+		{"empty", []int64{}},
+		{"single", []int64{42}},
+		{"small", []int64{1, 2, 3, 4, 5}},
+		{"exactly 2", []int64{1, 2}},
+		{"large", make([]int64, 1000)}, // zero-filled here; randomized inside the subtest
+		{"negative", []int64{-1, -2, -3, 4, 5}},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { // matches only the zero-filled "large" case
+				for i := range tc.input {
+					tc.input[i] = rand.Int64()
+				}
+			}
+
+			got := MinInt64x2(tc.input)
+			want := lo.Min(tc.input) // scalar reference result
+
+			if got != want {
+				t.Errorf("MinInt64x2() = %v, want %v", got, want)
+			}
+		})
+	}
+}
+
 func TestMinInt64x8(t *testing.T) {
 	requireAVX512(t)
 	testCases := []struct {
@@ -1419,6 +1549,37 @@ func TestMinUint32x16(t *testing.T) {
 	}
 }

+func TestMinUint64x2(t *testing.T) { // checks MinUint64x2 against lo.Min for empty, sub-vector, exact and large inputs
+	requireAVX512(t) // presumably skips when AVX-512 is unavailable — confirm in helper
+	testCases := []struct {
+		name  string
+		input []uint64
+	}{
+		{"empty", []uint64{}},
+		{"single", []uint64{42}},
+		{"small", []uint64{1, 2, 3, 4, 5}},
+		{"exactly 2", []uint64{1, 2}},
+		{"large", make([]uint64, 1000)}, // zero-filled here; randomized inside the subtest
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { // matches only the zero-filled "large" case
+				for i := range tc.input {
+					tc.input[i] = rand.Uint64()
+				}
+			}
+
+			got := MinUint64x2(tc.input)
+			want := lo.Min(tc.input) // scalar reference result
+
+			if got != want {
+				t.Errorf("MinUint64x2() = %v, want %v", got, want)
+			}
+		})
+	}
+}
+
 func TestMinUint64x8(t *testing.T) {
 	requireAVX512(t)
 	testCases := []struct {
@@ -1625,6 +1786,38 @@ func TestMaxInt32x16(t *testing.T) {
 	}
 }

+func TestMaxInt64x2(t *testing.T) { // checks MaxInt64x2 against lo.Max for empty, sub-vector, exact and large inputs
+	requireAVX512(t) // presumably skips when AVX-512 is unavailable — confirm in helper
+	testCases := []struct {
+		name  string
+		input []int64
+	}{
+		{"empty", []int64{}},
+		{"single", []int64{42}},
+		{"small", []int64{1, 2, 3, 4, 5}},
+		{"exactly 2", []int64{1, 2}},
+		{"large", make([]int64, 1000)}, // zero-filled here; randomized inside the subtest
+		{"negative", []int64{-1, -2, -3, 4, 5}},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { // matches only the zero-filled "large" case
+				for i := range tc.input {
+					tc.input[i] = rand.Int64()
+				}
+			}
+
+			got := MaxInt64x2(tc.input)
+			want := lo.Max(tc.input) // scalar reference result
+
+			if got != want {
+				t.Errorf("MaxInt64x2() = %v, want %v", got, want)
+			}
+		})
+	}
+}
+
 func TestMaxInt64x8(t *testing.T) {
 	requireAVX512(t)
 	testCases := []struct {
@@ -1752,6 +1945,37 @@ func TestMaxUint32x16(t *testing.T) {
 	}
 }

+func TestMaxUint64x2(t *testing.T) { // checks MaxUint64x2 against lo.Max for empty, sub-vector, exact and large inputs
+	requireAVX512(t) // presumably skips when AVX-512 is unavailable — confirm in helper
+	testCases := []struct {
+		name  string
+		input []uint64
+	}{
+		{"empty", []uint64{}},
+		{"single", []uint64{42}},
+		{"small", []uint64{1, 2, 3, 4, 5}},
+		{"exactly 2", []uint64{1, 2}},
+		{"large", make([]uint64, 1000)}, // zero-filled here; randomized inside the subtest
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 { // matches only the zero-filled "large" case
+				for i := range tc.input {
+					tc.input[i] = rand.Uint64()
+				}
+			}
+
+			got := MaxUint64x2(tc.input)
+			want := lo.Max(tc.input) // scalar reference result
+
+			if got != want {
+				t.Errorf("MaxUint64x2() = %v, want %v", got, want)
+			}
+		})
+	}
+}
+
 func TestMaxUint64x8(t *testing.T) {
 	requireAVX512(t)
 	testCases := []struct {
@@ -10,6 +10,7 @@ import (
 )

 func TestSumInt8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int8
@@ -42,6 +43,7 @@ func TestSumInt8x16(t *testing.T) {
 }

 func TestSumInt16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int16
@@ -74,6 +76,7 @@ func TestSumInt16x8(t *testing.T) {
 }

 func TestSumInt32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int32
@@ -105,6 +108,7 @@ func TestSumInt32x4(t *testing.T) {
 }

 func TestSumInt64x2(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int64
@@ -136,6 +140,7 @@ func TestSumInt64x2(t *testing.T) {
 }

 func TestSumUint8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint8
@@ -167,6 +172,7 @@ func TestSumUint8x16(t *testing.T) {
 }

 func TestSumUint16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint16
@@ -198,6 +204,7 @@ func TestSumUint16x8(t *testing.T) {
 }

 func TestSumUint32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint32
@@ -228,6 +235,7 @@ func TestSumUint32x4(t *testing.T) {
 }

 func TestSumUint64x2(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint64
@@ -258,6 +266,7 @@ func TestSumUint64x2(t *testing.T) {
 }

 func TestSumFloat32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float32
@@ -291,6 +300,7 @@ func TestSumFloat32x4(t *testing.T) {
 }

 func TestSumFloat64x2(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float64
@@ -323,7 +333,8 @@ func TestSumFloat64x2(t *testing.T) {
 }

 // Test type aliases work correctly
-func TestSSETypeAlias(t *testing.T) {
+func TestAVXTypeAlias(t *testing.T) {
+	requireAVX(t)
 	input := []myInt8{1, 2, 3, 4, 5}
 	got := SumInt8x16(input)
 	want := lo.Sum(input)
@@ -334,6 +345,7 @@ func TestSSETypeAlias(t *testing.T) {
 }

 func TestClampInt8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int8
@@ -385,6 +397,7 @@ func TestClampInt8x16(t *testing.T) {
 }

 func TestClampInt16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int16
@@ -434,6 +447,7 @@ func TestClampInt16x8(t *testing.T) {
 }

 func TestClampInt32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int32
@@ -481,56 +495,8 @@ func TestClampInt32x4(t *testing.T) {
 	}
 }

-func TestClampInt64x2(t *testing.T) {
-	requireAVX512(t)
-	testCases := []struct {
-		name  string
-		input []int64
-		min   int64
-		max   int64
-	}{
-		{"empty", []int64{}, -100, 100},
-		{"single", []int64{42}, -10, 10},
-		{"small", []int64{1, 2, 3, 4, 5}, 2, 4},
-		{"exactly 2", []int64{-100, 200}, -50, 50},
-		{"large", make([]int64, 1000), -50, 50},
-		{"all below min", []int64{-1000, -2000, -3000}, -500, 100},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
-				for i := range tc.input {
-					tc.input[i] = rand.Int64()
-				}
-			}
-
-			got := ClampInt64x2(tc.input, tc.min, tc.max)
-
-			if len(got) != len(tc.input) {
-				t.Errorf("ClampInt64x2() returned length %d, want %d", len(got), len(tc.input))
-			}
-
-			for i, v := range got {
-				if v < tc.min || v > tc.max {
-					t.Errorf("ClampInt64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
-				}
-				original := tc.input[i]
-				expected := original
-				if expected < tc.min {
-					expected = tc.min
-				} else if expected > tc.max {
-					expected = tc.max
-				}
-				if v != expected {
-					t.Errorf("ClampInt64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
-				}
-			}
-		})
-	}
-}
-
 func TestClampUint8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint8
@@ -581,6 +547,7 @@ func TestClampUint8x16(t *testing.T) {
 }

 func TestClampUint16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint16
@@ -630,6 +597,7 @@ func TestClampUint16x8(t *testing.T) {
 }

 func TestClampUint32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint32
@@ -677,56 +645,8 @@ func TestClampUint32x4(t *testing.T) {
 	}
 }

-func TestClampUint64x2(t *testing.T) {
-	requireAVX512(t)
-	testCases := []struct {
-		name  string
-		input []uint64
-		min   uint64
-		max   uint64
-	}{
-		{"empty", []uint64{}, 100, 1000},
-		{"single", []uint64{42}, 10, 100},
-		{"small", []uint64{1, 2, 3, 4, 5}, 2, 4},
-		{"exactly 2", []uint64{50, 2000}, 100, 1000},
-		{"large", make([]uint64, 1000), 500, 5000},
-		{"all below min", []uint64{1, 2, 3}, 10, 100},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
-				for i := range tc.input {
-					tc.input[i] = rand.Uint64()
-				}
-			}
-
-			got := ClampUint64x2(tc.input, tc.min, tc.max)
-
-			if len(got) != len(tc.input) {
-				t.Errorf("ClampUint64x2() returned length %d, want %d", len(got), len(tc.input))
-			}
-
-			for i, v := range got {
-				if v < tc.min || v > tc.max {
-					t.Errorf("ClampUint64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
-				}
-				original := tc.input[i]
-				expected := original
-				if expected < tc.min {
-					expected = tc.min
-				} else if expected > tc.max {
-					expected = tc.max
-				}
-				if v != expected {
-					t.Errorf("ClampUint64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
-				}
-			}
-		})
-	}
-}
-
 func TestClampFloat32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float32
@@ -778,6 +698,7 @@ func TestClampFloat32x4(t *testing.T) {
 }

 func TestClampFloat64x2(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float64
@@ -829,7 +750,8 @@ func TestClampFloat64x2(t *testing.T) {
 }

 // Test type aliases work correctly
-func TestSSEClampTypeAlias(t *testing.T) {
+func TestAVXClampTypeAlias(t *testing.T) {
+	requireAVX(t)
 	input := []myInt8{-5, 0, 10, 15, 20}
 	min := myInt8(0)
 	max := myInt8(10)
@@ -853,6 +775,7 @@ func TestSSEClampTypeAlias(t *testing.T) {
 }

 func TestMeanInt8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int8
@@ -884,6 +807,7 @@ func TestMeanInt8x16(t *testing.T) {
 }

 func TestMeanInt16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int16
@@ -915,6 +839,7 @@ func TestMeanInt16x8(t *testing.T) {
 }

 func TestMeanInt32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int32
@@ -946,6 +871,7 @@ func TestMeanInt32x4(t *testing.T) {
 }

 func TestMeanInt64x2(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int64
@@ -977,6 +903,7 @@ func TestMeanInt64x2(t *testing.T) {
 }

 func TestMeanUint8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint8
@@ -1008,6 +935,7 @@ func TestMeanUint8x16(t *testing.T) {
 }

 func TestMeanUint16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint16
@@ -1039,6 +967,7 @@ func TestMeanUint16x8(t *testing.T) {
 }

 func TestMeanUint32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint32
@@ -1069,6 +998,7 @@ func TestMeanUint32x4(t *testing.T) {
 }

 func TestMeanUint64x2(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint64
@@ -1099,6 +1029,7 @@ func TestMeanUint64x2(t *testing.T) {
 }

 func TestMeanFloat32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float32
@@ -1132,6 +1063,7 @@ func TestMeanFloat32x4(t *testing.T) {
 }

 func TestMeanFloat64x2(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float64
@@ -1164,7 +1096,8 @@ func TestMeanFloat64x2(t *testing.T) {
 }

 // Test type aliases work correctly
-func TestSSEMeanTypeAlias(t *testing.T) {
+func TestAVXMeanTypeAlias(t *testing.T) {
+	requireAVX(t)
 	input := []myInt8{1, 2, 3, 4, 5}
 	got := MeanInt8x16(input)
 	want := lo.Mean(input)
@@ -1175,6 +1108,7 @@ func TestSSEMeanTypeAlias(t *testing.T) {
 }

 func TestMinInt8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int8
@@ -1206,6 +1140,7 @@ func TestMinInt8x16(t *testing.T) {
 }

 func TestMinInt16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int16
@@ -1237,6 +1172,7 @@ func TestMinInt16x8(t *testing.T) {
 }

 func TestMinInt32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int32
@@ -1267,39 +1203,8 @@ func TestMinInt32x4(t *testing.T) {
 	}
 }

-func TestMinInt64x2(t *testing.T) {
-	requireAVX512(t)
-	testCases := []struct {
-		name  string
-		input []int64
-	}{
-		{"empty", []int64{}},
-		{"single", []int64{42}},
-		{"small", []int64{1, 2, 3, 4, 5}},
-		{"exactly 2", []int64{1, 2}},
-		{"large", make([]int64, 1000)},
-		{"negative", []int64{-1, -2, -3, 4, 5}},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
-				for i := range tc.input {
-					tc.input[i] = rand.Int64()
-				}
-			}
-
-			got := MinInt64x2(tc.input)
-			want := lo.Min(tc.input)
-
-			if got != want {
-				t.Errorf("MinInt64x2() = %v, want %v", got, want)
-			}
-		})
-	}
-}
-
 func TestMinUint8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint8
@@ -1331,6 +1236,7 @@ func TestMinUint8x16(t *testing.T) {
 }

 func TestMinUint16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint16
@@ -1362,6 +1268,7 @@ func TestMinUint16x8(t *testing.T) {
 }

 func TestMinUint32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint32
@@ -1391,38 +1298,8 @@ func TestMinUint32x4(t *testing.T) {
 	}
 }

-func TestMinUint64x2(t *testing.T) {
-	requireAVX512(t)
-	testCases := []struct {
-		name  string
-		input []uint64
-	}{
-		{"empty", []uint64{}},
-		{"single", []uint64{42}},
-		{"small", []uint64{1, 2, 3, 4, 5}},
-		{"exactly 2", []uint64{1, 2}},
-		{"large", make([]uint64, 1000)},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
-				for i := range tc.input {
-					tc.input[i] = rand.Uint64()
-				}
-			}
-
-			got := MinUint64x2(tc.input)
-			want := lo.Min(tc.input)
-
-			if got != want {
-				t.Errorf("MinUint64x2() = %v, want %v", got, want)
-			}
-		})
-	}
-}
-
 func TestMinFloat32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float32
@@ -1456,6 +1333,7 @@ func TestMinFloat32x4(t *testing.T) {
 }

 func TestMinFloat64x2(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float64
@@ -1488,7 +1366,8 @@ func TestMinFloat64x2(t *testing.T) {
 }

 // Test type aliases work correctly
-func TestSSEMinTypeAlias(t *testing.T) {
+func TestAVXMinTypeAlias(t *testing.T) {
+	requireAVX(t)
 	input := []myInt8{5, 2, 8, 1, 9}
 	got := MinInt8x16(input)
 	want := myInt8(1)
@@ -1499,6 +1378,7 @@ func TestSSEMinTypeAlias(t *testing.T) {
 }

 func TestMaxInt8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int8
@@ -1530,6 +1410,7 @@ func TestMaxInt8x16(t *testing.T) {
 }

 func TestMaxInt16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int16
@@ -1561,6 +1442,7 @@ func TestMaxInt16x8(t *testing.T) {
 }

 func TestMaxInt32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []int32
@@ -1591,39 +1473,8 @@ func TestMaxInt32x4(t *testing.T) {
 	}
 }

-func TestMaxInt64x2(t *testing.T) {
-	requireAVX512(t)
-	testCases := []struct {
-		name  string
-		input []int64
-	}{
-		{"empty", []int64{}},
-		{"single", []int64{42}},
-		{"small", []int64{1, 2, 3, 4, 5}},
-		{"exactly 2", []int64{1, 2}},
-		{"large", make([]int64, 1000)},
-		{"negative", []int64{-1, -2, -3, 4, 5}},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
-				for i := range tc.input {
-					tc.input[i] = rand.Int64()
-				}
-			}
-
-			got := MaxInt64x2(tc.input)
-			want := lo.Max(tc.input)
-
-			if got != want {
-				t.Errorf("MaxInt64x2() = %v, want %v", got, want)
-			}
-		})
-	}
-}
-
 func TestMaxUint8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint8
@@ -1655,6 +1506,7 @@ func TestMaxUint8x16(t *testing.T) {
 }

 func TestMaxUint16x8(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint16
@@ -1686,6 +1538,7 @@ func TestMaxUint16x8(t *testing.T) {
 }

 func TestMaxUint32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []uint32
@@ -1715,38 +1568,8 @@ func TestMaxUint32x4(t *testing.T) {
 	}
 }

-func TestMaxUint64x2(t *testing.T) {
-	requireAVX512(t)
-	testCases := []struct {
-		name  string
-		input []uint64
-	}{
-		{"empty", []uint64{}},
-		{"single", []uint64{42}},
-		{"small", []uint64{1, 2, 3, 4, 5}},
-		{"exactly 2", []uint64{1, 2}},
-		{"large", make([]uint64, 1000)},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
-				for i := range tc.input {
-					tc.input[i] = rand.Uint64()
-				}
-			}
-
-			got := MaxUint64x2(tc.input)
-			want := lo.Max(tc.input)
-
-			if got != want {
-				t.Errorf("MaxUint64x2() = %v, want %v", got, want)
-			}
-		})
-	}
-}
-
 func TestMaxFloat32x4(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float32
@@ -1780,6 +1603,7 @@ func TestMaxFloat32x4(t *testing.T) {
 }

 func TestMaxFloat64x2(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []float64
@@ -1812,7 +1636,8 @@ func TestMaxFloat64x2(t *testing.T) {
 }

 // Test type aliases work correctly
-func TestSSEMaxTypeAlias(t *testing.T) {
+func TestAVXMaxTypeAlias(t *testing.T) {
+	requireAVX(t)
 	input := []myInt8{5, 2, 8, 1, 9}
 	got := MaxInt8x16(input)
 	want := myInt8(9)
@@ -1831,6 +1656,7 @@ type item struct {
 }

 func TestSumByInt8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []item
@@ -1863,6 +1689,7 @@ func TestSumByInt8x16(t *testing.T) {
 }

 func TestSumByInt16x8(t *testing.T) {
+	requireAVX(t)
 	type itemInt16 struct {
 		Value int16
 	}
@@ -1898,6 +1725,7 @@ func TestSumByInt16x8(t *testing.T) {
 }

 func TestSumByInt32x4(t *testing.T) {
+	requireAVX(t)
 	type itemInt32 struct {
 		Value int32
 	}
@@ -1933,6 +1761,7 @@ func TestSumByInt32x4(t *testing.T) {
 }

 func TestSumByInt64x2(t *testing.T) {
+	requireAVX(t)
 	type itemInt64 struct {
 		Value int64
 	}
@@ -1968,6 +1797,7 @@ func TestSumByInt64x2(t *testing.T) {
 }

 func TestSumByUint8x16(t *testing.T) {
+	requireAVX(t)
 	type itemUint8 struct {
 		Value uint8
 	}
@@ -2003,6 +1833,7 @@ func TestSumByUint8x16(t *testing.T) {
 }

 func TestSumByUint16x8(t *testing.T) {
+	requireAVX(t)
 	type itemUint16 struct {
 		Value uint16
 	}
@@ -2038,6 +1869,7 @@ func TestSumByUint16x8(t *testing.T) {
 }

 func TestSumByUint32x4(t *testing.T) {
+	requireAVX(t)
 	type itemUint32 struct {
 		Value uint32
 	}
@@ -2072,6 +1904,7 @@ func TestSumByUint32x4(t *testing.T) {
 }

 func TestSumByUint64x2(t *testing.T) {
+	requireAVX(t)
 	type itemUint64 struct {
 		Value uint64
 	}
@@ -2106,6 +1939,7 @@ func TestSumByUint64x2(t *testing.T) {
 }

 func TestSumByFloat32x4(t *testing.T) {
+	requireAVX(t)
 	type itemFloat32 struct {
 		Value float32
 	}
@@ -2143,6 +1977,7 @@ func TestSumByFloat32x4(t *testing.T) {
 }

 func TestSumByFloat64x2(t *testing.T) {
+	requireAVX(t)
 	type itemFloat64 struct {
 		Value float64
 	}
@@ -2179,7 +2014,8 @@ func TestSumByFloat64x2(t *testing.T) {
 }

 // Test type alias works correctly for SumBy
-func TestSSESumByTypeAlias(t *testing.T) {
+func TestAVXSumByTypeAlias(t *testing.T) {
+	requireAVX(t)
 	type myItem struct {
 		Value myInt8
 	}
@@ -2196,6 +2032,7 @@ func TestSSESumByTypeAlias(t *testing.T) {
 // MeanBy tests

 func TestMeanByInt8x16(t *testing.T) {
+	requireAVX(t)
 	testCases := []struct {
 		name  string
 		input []item
@@ -2227,6 +2064,7 @@ func TestMeanByInt8x16(t *testing.T) {
 }

 func TestMeanByInt16x8(t *testing.T) {
+	requireAVX(t)
 	type itemInt16 struct {
 		Value int16
 	}
@@ -2262,6 +2100,7 @@ func TestMeanByInt16x8(t *testing.T) {
 }

 func TestMeanByInt32x4(t *testing.T) {
+	requireAVX(t)
 	type itemInt32 struct {
 		Value int32
 	}
@@ -2297,6 +2136,7 @@ func TestMeanByInt32x4(t *testing.T) {
 }

 func TestMeanByInt64x2(t *testing.T) {
+	requireAVX(t)
 	type itemInt64 struct {
 		Value int64
 	}
@@ -2332,6 +2172,7 @@ func TestMeanByInt64x2(t *testing.T) {
 }

 func TestMeanByUint8x16(t *testing.T) {
+	requireAVX(t)
 	type itemUint8 struct {
 		Value uint8
 	}
@@ -2367,6 +2208,7 @@ func TestMeanByUint8x16(t *testing.T) {
 }

 func TestMeanByUint16x8(t *testing.T) {
+	requireAVX(t)
 	type itemUint16 struct {
 		Value uint16
 	}
@@ -2402,6 +2244,7 @@ func TestMeanByUint16x8(t *testing.T) {
 }

 func TestMeanByUint32x4(t *testing.T) {
+	requireAVX(t)
 	type itemUint32 struct {
 		Value uint32
 	}
@@ -2436,6 +2279,7 @@ func TestMeanByUint32x4(t *testing.T) {
 }

 func TestMeanByUint64x2(t *testing.T) {
+	requireAVX(t)
 	type itemUint64 struct {
 		Value uint64
 	}
@@ -2470,6 +2314,7 @@ func TestMeanByUint64x2(t *testing.T) {
 }

 func TestMeanByFloat32x4(t *testing.T) {
+	requireAVX(t)
 	type itemFloat32 struct {
 		Value float32
 	}
@@ -2507,6 +2352,7 @@ func TestMeanByFloat32x4(t *testing.T) {
 }

 func TestMeanByFloat64x2(t *testing.T) {
+	requireAVX(t)
 	type itemFloat64 struct {
 		Value float64
 	}
@@ -2543,7 +2389,8 @@ func TestMeanByFloat64x2(t *testing.T) {
 }

 // Test type alias works correctly for MeanBy
-func TestSSEMeanByTypeAlias(t *testing.T) {
+func TestAVXMeanByTypeAlias(t *testing.T) {
+	requireAVX(t)
 	type myItem struct {
 		Value myInt8
 	}
@@ -13,15 +13,15 @@ import (

 // Benchmark suite for SIMD math operations compared to core lo package fallbacks.
 // These benchmarks measure the performance of Sum, Mean, Min, and Max operations
-// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes.
+// across different SIMD implementations (AVX, AVX2, AVX512) and data sizes.

 // Benchmark sizes to demonstrate performance characteristics at different scales
 var benchmarkSizes = []struct {
 	name string
 	size int
 }{
-	{"small", 8},     // Smaller than SSE width (16 lanes for int8)
-	{"medium", 128},  // Between SSE (16) and AVX2 (32) width for int8
+	{"small", 8},     // Smaller than AVX width (16 lanes for int8)
+	{"medium", 128},  // Above the AVX (16), AVX2 (32) and AVX-512 (64) lane counts for int8
 	{"large", 1024},  // Well above SIMD register widths
 	{"xlarge", 8192}, // Large dataset for real-world performance
 }
@@ -128,7 +128,8 @@ func BenchmarkSumInt8(b *testing.B) {
 					_ = lo.Sum(data)
 				}
 			})
-			b.Run("SSE-x16", func(b *testing.B) {
+			b.Run("AVX-x16", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = SumInt8x16(data)
@@ -162,7 +163,8 @@ func BenchmarkSumInt16(b *testing.B) {
 					_ = lo.Sum(data)
 				}
 			})
-			b.Run("SSE-x8", func(b *testing.B) {
+			b.Run("AVX-x8", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = SumInt16x8(data)
@@ -196,7 +198,8 @@ func BenchmarkSumInt32(b *testing.B) {
 					_ = lo.Sum(data)
 				}
 			})
-			b.Run("SSE-x4", func(b *testing.B) {
+			b.Run("AVX-x4", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = SumInt32x4(data)
@@ -230,7 +233,8 @@ func BenchmarkSumInt64(b *testing.B) {
 					_ = lo.Sum(data)
 				}
 			})
-			b.Run("SSE-x2", func(b *testing.B) {
+			b.Run("AVX-x2", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = SumInt64x2(data)
@@ -264,7 +268,8 @@ func BenchmarkSumFloat32(b *testing.B) {
 					_ = lo.Sum(data)
 				}
 			})
-			b.Run("SSE-x4", func(b *testing.B) {
+			b.Run("AVX-x4", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = SumFloat32x4(data)
@@ -298,7 +303,8 @@ func BenchmarkSumFloat64(b *testing.B) {
 					_ = lo.Sum(data)
 				}
 			})
-			b.Run("SSE-x2", func(b *testing.B) {
+			b.Run("AVX-x2", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = SumFloat64x2(data)
@@ -336,7 +342,8 @@ func BenchmarkMeanInt32(b *testing.B) {
 					_ = lo.Mean(data)
 				}
 			})
-			b.Run("SSE-x4", func(b *testing.B) {
+			b.Run("AVX-x4", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = MeanInt32x4(data)
@@ -370,7 +377,8 @@ func BenchmarkMeanFloat64(b *testing.B) {
 					_ = lo.Mean(data)
 				}
 			})
-			b.Run("SSE-x2", func(b *testing.B) {
+			b.Run("AVX-x2", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = MeanFloat64x2(data)
@@ -402,7 +410,8 @@ func BenchmarkMinInt32(b *testing.B) {
 	for _, bs := range benchmarkSizes {
 		b.Run(bs.name, func(b *testing.B) {
 			data := generateInt32(bs.size)
-			b.Run("SSE-x4", func(b *testing.B) {
+			b.Run("AVX-x4", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = MinInt32x4(data)
@@ -430,7 +439,8 @@ func BenchmarkMinFloat64(b *testing.B) {
 	for _, bs := range benchmarkSizes {
 		b.Run(bs.name, func(b *testing.B) {
 			data := generateFloat64(bs.size)
-			b.Run("SSE-x2", func(b *testing.B) {
+			b.Run("AVX-x2", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = MinFloat64x2(data)
@@ -462,7 +472,8 @@ func BenchmarkMaxInt32(b *testing.B) {
 	for _, bs := range benchmarkSizes {
 		b.Run(bs.name, func(b *testing.B) {
 			data := generateInt32(bs.size)
-			b.Run("SSE-x4", func(b *testing.B) {
+			b.Run("AVX-x4", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = MaxInt32x4(data)
@@ -490,7 +501,8 @@ func BenchmarkMaxFloat64(b *testing.B) {
 	for _, bs := range benchmarkSizes {
 		b.Run(bs.name, func(b *testing.B) {
 			data := generateFloat64(bs.size)
-			b.Run("SSE-x2", func(b *testing.B) {
+			b.Run("AVX-x2", func(b *testing.B) {
+				requireAVX(b)
 				b.ReportAllocs()
 				for i := 0; i < b.N; i++ {
 					_ = MaxFloat64x2(data)
@@ -528,13 +540,16 @@ func BenchmarkSumInt8ByWidth(b *testing.B) {
 		fn   func() int8
 	}{
 		{"Fallback-lo", func() int8 { return lo.Sum(data) }},
-		{"SSE-x16", func() int8 { return SumInt8x16(data) }},
+		{"AVX-x16", func() int8 { return SumInt8x16(data) }},
 		{"AVX2-x32", func() int8 { return SumInt8x32(data) }},
 		{"AVX512-x64", func() int8 { return SumInt8x64(data) }},
 	}

 	for _, bm := range benchmarks {
 		b.Run(bm.name, func(b *testing.B) {
+			if bm.name == "AVX-x16" {
+				requireAVX(b)
+			}
 			if bm.name == "AVX2-x32" {
 				requireAVX2(b)
 			}
@@ -578,7 +593,8 @@ func BenchmarkSumInt64SteadyState(b *testing.B) {
 			_ = lo.Sum(data)
 		}
 	})
-	b.Run("SSE-x2", func(b *testing.B) {
+	b.Run("AVX-x2", func(b *testing.B) {
+		requireAVX(b)
 		b.ReportAllocs()
 		for i := 0; i < b.N; i++ {
 			_ = SumInt64x2(data)
@@ -24,13 +24,15 @@ func init() {
 }

 // Type aliases for testing
-type myInt8 int8
-type myInt16 int16
-type myInt32 int32
-type myInt64 int64
-type myUint8 uint8
-type myUint16 uint16
-type myUint32 uint32
-type myUint64 uint64
-type myFloat32 float32
-type myFloat64 float64
+type ( // named (defined) types over the basic numeric kinds; tests pass them, e.g. []myInt8, to the generic helpers
+	myInt8    int8
+	myInt16   int16
+	myInt32   int32
+	myInt64   int64
+	myUint8   uint8
+	myUint16  uint16
+	myUint32  uint32
+	myUint64  uint64
+	myFloat32 float32
+	myFloat64 float64
+)