style(simd): rename sse to avx (#821)

* style(simd): rename sse to avx

* fix(exp,simd): apply the right avx512 constraints to a few methods

* fix(exp,simd): apply the right avx512 constraints to a few methods
This commit is contained in:
Samuel Berthe
2026-02-26 22:08:53 +01:00
committed by GitHub
parent c49f84658a
commit ac8295b68a
21 changed files with 964 additions and 853 deletions
+3 -3
View File
@@ -1,7 +1,7 @@
---
name: Clamp
slug: clamp
sourceRef: exp/simd/math_sse.go#L424
sourceRef: exp/simd/math_avx.go#L453
category: exp
subCategory: simd
similarHelpers:
@@ -51,7 +51,7 @@ Clamps each element in a collection between min and max values using SIMD instru
| SIMD variant | Lanes | Required flags | Typical CPUs |
| ------------ | ----- | -------------- | ------------------------------ |
| SSE (xN) | 2-16 | `sse2` | All amd64 |
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
@@ -70,7 +70,7 @@ result := simd.ClampFloat32x16([]float32{0.5, 1.5, 2.5, 3.5}, 1.0, 3.0)
```
```go
// Using SSE variant (8 lanes at once) - works on all amd64
// Using AVX variant (8 lanes at once) - requires an AVX-capable amd64 CPU
result := simd.ClampInt16x8([]int16{100, 150, 200, 250}, 120, 220)
// []int16{120, 150, 200, 220}
```
+3 -3
View File
@@ -1,7 +1,7 @@
---
name: Contains
slug: contains
sourceRef: exp/simd/intersect_sse.go#L11
sourceRef: exp/simd/intersect_avx512.go#L9
category: exp
subCategory: simd
similarHelpers:
@@ -51,7 +51,7 @@ Checks if a target value is present in a collection using SIMD instructions. The
| SIMD variant | Lanes | Required flags | Typical CPUs |
| ------------ | ----- | -------------- | ------------------------------ |
| SSE (xN) | 2-16 | `sse2` | All amd64 |
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
@@ -64,7 +64,7 @@ found := simd.ContainsInt8x32([]int8{1, 2, 3, 4, 5}, 3)
```
```go
// Using SSE variant (16 lanes at once) - works on all amd64
// Using AVX variant (2 lanes at once) - requires an AVX-capable amd64 CPU
found := simd.ContainsInt64x2([]int64{1000000, 2000000, 3000000}, 2000000)
// true
```
+3 -3
View File
@@ -1,7 +1,7 @@
---
name: Max
slug: max
sourceRef: exp/simd/math_sse.go#L1328
sourceRef: exp/simd/math_avx.go#L1279
category: exp
subCategory: simd
similarHelpers:
@@ -51,7 +51,7 @@ Finds the maximum value in a collection using SIMD instructions. The suffix (x2,
| SIMD variant | Lanes | Required flags | Typical CPUs |
| ------------ | ----- | -------------- | ------------------------------ |
| SSE (xN) | 2-16 | `sse2` | All amd64 |
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
@@ -70,7 +70,7 @@ max := simd.MaxFloat32x16([]float32{3.5, 1.2, 4.8, 2.1})
```
```go
// Using SSE variant (4 lanes at once) - works on all amd64
// Using AVX variant (4 lanes at once) - requires an AVX-capable amd64 CPU
max := simd.MaxInt32x4([]int32{100, 50, 200, 75})
// 200
```
+3 -3
View File
@@ -1,7 +1,7 @@
---
name: Mean
slug: mean
sourceRef: exp/simd/math_sse.go#L333
sourceRef: exp/simd/math_avx.go#L352
category: exp
subCategory: simd
similarHelpers:
@@ -52,7 +52,7 @@ Calculates the arithmetic mean of a collection using SIMD instructions. The suff
| SIMD variant | Lanes | Required flags | Typical CPUs |
| ------------ | ----- | -------------- | ------------------------------ |
| SSE (xN) | 2-16 | `sse2` | All amd64 |
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
@@ -71,7 +71,7 @@ mean := simd.MeanFloat32x16([]float32{1.0, 2.0, 3.0, 4.0})
```
```go
// Using SSE variant (8 lanes at once) - works on all amd64
// Using AVX variant (8 lanes at once) - requires an AVX-capable amd64 CPU
mean := simd.MeanInt16x8([]int16{10, 20, 30, 40})
// 25
```
+2 -2
View File
@@ -62,7 +62,7 @@ MeanBy transforms a collection using an iteratee function and calculates the ari
| SIMD variant | Lanes | Required flags | Typical CPUs |
| ------------ | ----- | -------------- | ------------------------------ |
| SSE (xN) | 2-16 | `sse2` | All amd64 |
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
@@ -118,7 +118,7 @@ metrics := []Metric{
{Value: 400},
}
// Using SSE variant - works on all amd64
// Using AVX variant - requires an AVX-capable amd64 CPU
mean := simd.MeanByUint16x8(metrics, func(m Metric) uint16 {
return m.Value
})
+3 -3
View File
@@ -1,7 +1,7 @@
---
name: Min
slug: min
sourceRef: exp/simd/math_sse.go#L834
sourceRef: exp/simd/math_avx.go#L833
category: exp
subCategory: simd
similarHelpers:
@@ -51,7 +51,7 @@ Finds the minimum value in a collection using SIMD instructions. The suffix (x2,
| SIMD variant | Lanes | Required flags | Typical CPUs |
| ------------ | ----- | -------------- | ------------------------------ |
| SSE (xN) | 2-16 | `sse2` | All amd64 |
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
@@ -70,7 +70,7 @@ min := simd.MinFloat32x16([]float32{3.5, 1.2, 4.8, 2.1})
```
```go
// Using SSE variant (4 lanes at once) - works on all amd64
// Using AVX variant (4 lanes at once) - requires an AVX-capable amd64 CPU
min := simd.MinInt32x4([]int32{100, 50, 200, 75})
// 50
```
+3 -3
View File
@@ -1,7 +1,7 @@
---
name: Sum
slug: sum
sourceRef: exp/simd/math_sse.go#L13
sourceRef: exp/simd/math_avx.go#L14
category: exp
subCategory: simd
similarHelpers:
@@ -52,7 +52,7 @@ Sums the values in a collection using SIMD instructions. The suffix (x2, x4, x8,
| SIMD variant | Lanes | Required flags | Typical CPUs |
| ------------ | ----- | -------------- | ------------------------------ |
| SSE (xN) | 2-16 | `sse2` | All amd64 |
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
@@ -71,7 +71,7 @@ sum := simd.SumFloat32x16([]float32{1.1, 2.2, 3.3, 4.4})
```
```go
// Using SSE variant (4 lanes at once) - works on all amd64
// Using AVX variant (4 lanes at once) - requires an AVX-capable amd64 CPU
sum := simd.SumInt32x4([]int32{1000000, 2000000, 3000000})
// 6000000
```
+2 -2
View File
@@ -62,7 +62,7 @@ SumBy transforms a collection using an iteratee function and sums the result usi
| SIMD variant | Lanes | Required flags | Typical CPUs |
| ------------ | ----- | -------------- | ------------------------------ |
| SSE (xN) | 2-16 | `sse2` | All amd64 |
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
@@ -119,7 +119,7 @@ metrics := []Metric{
{Value: 400},
}
// Using SSE variant - works on all amd64
// Using AVX variant - requires an AVX-capable amd64 CPU
sum := simd.SumByUint16x8(metrics, func(m Metric) uint16 {
return m.Value
})
+4 -4
View File
@@ -1,6 +1,6 @@
---
title: SIMD operations
description: High-performance slice operations using SSE, AVX2 and AVX512 SIMD when built with Go 1.26+ and GOEXPERIMENT=simd on amd64.
description: High-performance slice operations using AVX, AVX2 and AVX512 SIMD when built with Go 1.26+ and GOEXPERIMENT=simd on amd64.
sidebar_position: 0
hide_table_of_contents: true
---
@@ -14,7 +14,7 @@ Your feedback helps us improve!
#
## SIMD helpers
This page lists all operations on slices, available in the `exp/simd` sub-package. These helpers use **SSE** (128-bit), **AVX2** (256-bit) or **AVX512** (512-bit) SIMD when built with Go 1.26+, the `GOEXPERIMENT=simd` flag, and on amd64.
This page lists all operations on slices, available in the `exp/simd` sub-package. These helpers use **AVX** (128-bit), **AVX2** (256-bit) or **AVX512** (512-bit) SIMD when built with Go 1.26+, the `GOEXPERIMENT=simd` flag, and on amd64.
:::warning Unstable API
SIMD helpers are experimental. The API may break in the future.
@@ -26,7 +26,7 @@ Benchmarks show that running SIMD operators on small datasets is slower:
```txt
BenchmarkSumInt8/small/Fallback-lo-4 203616572 5.875 ns/op
BenchmarkSumInt8/small/SSE-x16-4 100000000 12.04 ns/op
BenchmarkSumInt8/small/AVX-x16-4 100000000 12.04 ns/op
BenchmarkSumInt8/small/AVX2-x32-4 64041816 17.93 ns/op
BenchmarkSumInt8/small/AVX512-x64-4 26947528 44.75 ns/op
```
@@ -35,7 +35,7 @@ But much much faster on big datasets:
```txt
BenchmarkSumInt8/xlarge/Fallback-lo-4 247677 4860 ns/op
BenchmarkSumInt8/xlarge/SSE-x16-4 3851040 311.4 ns/op
BenchmarkSumInt8/xlarge/AVX-x16-4 3851040 311.4 ns/op
BenchmarkSumInt8/xlarge/AVX2-x32-4 7100002 169.2 ns/op
BenchmarkSumInt8/xlarge/AVX512-x64-4 10107534 118.1 ns/op
```
+192 -192
View File
@@ -6,7 +6,7 @@ Benchmarks show that running SIMD operations on small datasets is slower:
```txt
BenchmarkSumInt8/small/Fallback-lo-2 248740710 5.218 ns/op
BenchmarkSumInt8/small/SSE-x16-2 126181464 9.485 ns/op
BenchmarkSumInt8/small/AVX-x16-2 126181464 9.485 ns/op
BenchmarkSumInt8/small/AVX2-x32-2 73059427 14.44 ns/op
BenchmarkSumInt8/small/AVX512-x64-2 49913169 24.41 ns/op
```
@@ -15,7 +15,7 @@ But SIMD is much faster on large datasets:
```txt
BenchmarkSumInt8/xlarge/Fallback-lo-2 273898 4383 ns/op
BenchmarkSumInt8/xlarge/SSE-x16-2 6928408 173.1 ns/op
BenchmarkSumInt8/xlarge/AVX-x16-2 6928408 173.1 ns/op
BenchmarkSumInt8/xlarge/AVX2-x32-2 12639586 94.09 ns/op
BenchmarkSumInt8/xlarge/AVX512-x64-2 13509693 89.67 ns/op
```
@@ -50,397 +50,397 @@ ok github.com/samber/lo/exp/simd 596.213s
| Benchmark | Iterations | Time/op | Bytes/op | Allocs/op |
| ---------------------------------------------- | ---------- | ----------- | -------- | ----------- |
| BenchmarkContainsInt8/tiny/SSE-x16-2 | 312359204 | 3.625 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/tiny/AVX2-x32-2 | 277194441 | 4.531 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/tiny/AVX512-x16-2 | 312359204 | 3.625 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/tiny/AVX512-x32-2 | 277194441 | 4.531 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/tiny/AVX512-x64-2 | 336853209 | 3.401 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/small/SSE-x16-2 | 449132103 | 2.670 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/small/AVX2-x32-2 | 148648339 | 8.332 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/small/AVX512-x16-2 | 449132103 | 2.670 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/small/AVX512-x32-2 | 148648339 | 8.332 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/small/AVX512-x64-2 | 143124861 | 7.982 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/medium/SSE-x16-2 | 276816714 | 4.302 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/medium/AVX2-x32-2 | 345774957 | 3.529 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/medium/AVX512-x16-2 | 276816714 | 4.302 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/medium/AVX512-x32-2 | 345774957 | 3.529 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/medium/AVX512-x64-2 | 449868722 | 2.669 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/large/SSE-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/large/AVX2-x32-2 | 172934200 | 6.941 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/large/AVX512-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/large/AVX512-x32-2 | 172934200 | 6.941 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/large/AVX512-x64-2 | 280992625 | 4.384 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/xlarge/SSE-x16-2 | 187189599 | 6.203 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/xlarge/AVX2-x32-2 | 274289563 | 4.042 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/xlarge/AVX512-x16-2 | 187189599 | 6.203 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/xlarge/AVX512-x32-2 | 274289563 | 4.042 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/xlarge/AVX512-x64-2 | 375048555 | 2.953 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/massive/SSE-x16-2 | 86434948 | 14.02 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/massive/AVX2-x32-2 | 153742346 | 8.012 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/massive/AVX512-x16-2 | 86434948 | 14.02 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/massive/AVX512-x32-2 | 153742346 | 8.012 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8/massive/AVX512-x64-2 | 259404483 | 5.214 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/tiny/SSE-x8-2 | 270309470 | 4.315 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/tiny/AVX2-x16-2 | 264874646 | 4.281 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/tiny/AVX512-x8-2 | 270309470 | 4.315 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/tiny/AVX512-x16-2 | 264874646 | 4.281 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/tiny/AVX512-x32-2 | 328810479 | 3.593 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/small/SSE-x8-2 | 374742561 | 3.206 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/small/AVX2-x16-2 | 449838870 | 2.678 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/small/AVX512-x8-2 | 374742561 | 3.206 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/small/AVX512-x16-2 | 449838870 | 2.678 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/small/AVX512-x32-2 | 143845734 | 8.484 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/medium/SSE-x8-2 | 185415590 | 6.448 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/medium/AVX2-x16-2 | 273780868 | 4.268 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/medium/AVX512-x8-2 | 185415590 | 6.448 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/medium/AVX512-x16-2 | 273780868 | 4.268 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/medium/AVX512-x32-2 | 350067484 | 3.431 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/large/SSE-x8-2 | 61109778 | 19.66 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/large/AVX2-x16-2 | 100000000 | 10.74 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/large/AVX512-x8-2 | 61109778 | 19.66 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/large/AVX512-x16-2 | 100000000 | 10.74 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/large/AVX512-x32-2 | 182886646 | 6.575 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/xlarge/SSE-x8-2 | 15220682 | 71.53 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/xlarge/AVX2-x16-2 | 31876572 | 37.57 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/xlarge/AVX512-x8-2 | 15220682 | 71.53 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/xlarge/AVX512-x16-2 | 31876572 | 37.57 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/xlarge/AVX512-x32-2 | 61992217 | 19.55 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/massive/SSE-x8-2 | 4372000 | 262.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/massive/AVX2-x16-2 | 9019658 | 131.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/massive/AVX512-x8-2 | 4372000 | 262.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/massive/AVX512-x16-2 | 9019658 | 131.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt16/massive/AVX512-x32-2 | 16568430 | 74.25 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/tiny/SSE-x4-2 | 499209442 | 2.406 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/tiny/AVX2-x8-2 | 350479609 | 3.433 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/tiny/AVX512-x4-2 | 499209442 | 2.406 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/tiny/AVX512-x8-2 | 350479609 | 3.433 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/tiny/AVX512-x16-2 | 280918554 | 4.309 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/small/SSE-x4-2 | 299561596 | 4.028 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/small/AVX2-x8-2 | 374064310 | 3.205 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/small/AVX512-x4-2 | 299561596 | 4.028 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/small/AVX512-x8-2 | 374064310 | 3.205 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/small/AVX512-x16-2 | 499219765 | 2.418 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/medium/SSE-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/medium/AVX2-x8-2 | 187391635 | 6.403 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/medium/AVX512-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/medium/AVX512-x8-2 | 187391635 | 6.403 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/medium/AVX512-x16-2 | 307955800 | 3.875 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/large/SSE-x4-2 | 33256420 | 36.05 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/large/AVX2-x8-2 | 62421526 | 19.23 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/large/AVX512-x4-2 | 33256420 | 36.05 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/large/AVX512-x8-2 | 62421526 | 19.23 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/large/AVX512-x16-2 | 100000000 | 10.36 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/xlarge/SSE-x4-2 | 8328856 | 144.9 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/xlarge/AVX2-x8-2 | 17039037 | 71.14 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/xlarge/AVX512-x4-2 | 8328856 | 144.9 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/xlarge/AVX512-x8-2 | 17039037 | 71.14 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/xlarge/AVX512-x16-2 | 28740241 | 41.77 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/massive/SSE-x4-2 | 3525885 | 332.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/massive/AVX2-x8-2 | 7318027 | 164.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/massive/AVX512-x4-2 | 3525885 | 332.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/massive/AVX512-x8-2 | 7318027 | 164.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt32/massive/AVX512-x16-2 | 12181366 | 99.08 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/tiny/SSE-x2-2 | 409014308 | 2.934 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/tiny/AVX2-x4-2 | 449210791 | 2.667 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/tiny/AVX512-x2-2 | 409014308 | 2.934 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/tiny/AVX512-x4-2 | 449210791 | 2.667 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/tiny/AVX512-x8-2 | 280998146 | 4.293 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/small/SSE-x2-2 | 195631429 | 6.172 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/small/AVX2-x4-2 | 281272394 | 4.308 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/small/AVX512-x2-2 | 195631429 | 6.172 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/small/AVX512-x4-2 | 281272394 | 4.308 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/small/AVX512-x8-2 | 408933924 | 3.044 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/medium/SSE-x2-2 | 63006909 | 18.94 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/medium/AVX2-x4-2 | 100000000 | 10.67 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/medium/AVX512-x2-2 | 63006909 | 18.94 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/medium/AVX512-x4-2 | 100000000 | 10.67 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/medium/AVX512-x8-2 | 197411126 | 6.016 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/large/SSE-x2-2 | 17098578 | 70.57 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/large/AVX2-x4-2 | 32558013 | 37.07 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/large/AVX512-x2-2 | 17098578 | 70.57 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/large/AVX512-x4-2 | 32558013 | 37.07 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/large/AVX512-x8-2 | 57629485 | 20.94 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/xlarge/SSE-x2-2 | 4286155 | 281.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/xlarge/AVX2-x4-2 | 8344772 | 143.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/xlarge/AVX512-x2-2 | 4286155 | 281.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/xlarge/AVX512-x4-2 | 8344772 | 143.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/xlarge/AVX512-x8-2 | 14428276 | 83.14 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/massive/SSE-x2-2 | 1000000 | 1012 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/massive/AVX2-x4-2 | 2350525 | 510.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/massive/AVX512-x2-2 | 1000000 | 1012 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/massive/AVX512-x4-2 | 2350525 | 510.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64/massive/AVX512-x8-2 | 3773523 | 318.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/tiny/SSE-x16-2 | 338880315 | 3.332 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/tiny/AVX2-x32-2 | 320784217 | 3.559 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/tiny/AVX512-x16-2 | 338880315 | 3.332 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/tiny/AVX512-x32-2 | 320784217 | 3.559 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/tiny/AVX512-x64-2 | 341599854 | 3.331 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/small/SSE-x16-2 | 449579424 | 2.670 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/small/AVX2-x32-2 | 140368142 | 8.648 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/small/AVX512-x16-2 | 449579424 | 2.670 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/small/AVX512-x32-2 | 140368142 | 8.648 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/small/AVX512-x64-2 | 146828888 | 8.182 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/medium/SSE-x16-2 | 374443974 | 3.472 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/medium/AVX2-x32-2 | 449271607 | 2.672 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/medium/AVX512-x16-2 | 374443974 | 3.472 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/medium/AVX512-x32-2 | 449271607 | 2.672 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/medium/AVX512-x64-2 | 598525731 | 2.018 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/large/SSE-x16-2 | 254828565 | 4.956 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/large/AVX2-x32-2 | 407777484 | 2.938 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/large/AVX512-x16-2 | 254828565 | 4.956 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/large/AVX512-x32-2 | 407777484 | 2.938 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/large/AVX512-x64-2 | 443472316 | 2.666 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/xlarge/SSE-x16-2 | 162196827 | 7.867 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/xlarge/AVX2-x32-2 | 268324950 | 4.518 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/xlarge/AVX512-x16-2 | 162196827 | 7.867 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/xlarge/AVX512-x32-2 | 268324950 | 4.518 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/xlarge/AVX512-x64-2 | 400437789 | 2.952 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/massive/SSE-x16-2 | 214548872 | 5.640 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/massive/AVX2-x32-2 | 348431553 | 3.391 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/massive/AVX512-x16-2 | 214548872 | 5.640 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/massive/AVX512-x32-2 | 348431553 | 3.391 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint8/massive/AVX512-x64-2 | 459781908 | 2.455 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/tiny/SSE-x8-2 | 276271912 | 4.297 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/tiny/AVX2-x16-2 | 281145528 | 4.270 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/tiny/AVX512-x8-2 | 276271912 | 4.297 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/tiny/AVX512-x16-2 | 281145528 | 4.270 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/tiny/AVX512-x32-2 | 315343911 | 3.667 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/small/SSE-x8-2 | 374632351 | 3.204 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/small/AVX2-x16-2 | 449355727 | 2.670 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/small/AVX512-x8-2 | 374632351 | 3.204 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/small/AVX512-x16-2 | 449355727 | 2.670 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/small/AVX512-x32-2 | 138088146 | 8.395 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/medium/SSE-x8-2 | 187276191 | 6.582 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/medium/AVX2-x16-2 | 281107980 | 4.306 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/medium/AVX512-x8-2 | 187276191 | 6.582 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/medium/AVX512-x16-2 | 281107980 | 4.306 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/medium/AVX512-x32-2 | 358850328 | 3.516 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/large/SSE-x8-2 | 59025931 | 19.98 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/large/AVX2-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/large/AVX512-x8-2 | 59025931 | 19.98 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/large/AVX512-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/large/AVX512-x32-2 | 179631354 | 6.569 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/xlarge/SSE-x8-2 | 16576267 | 71.63 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/xlarge/AVX2-x16-2 | 32578981 | 36.96 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/xlarge/AVX512-x8-2 | 16576267 | 71.63 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/xlarge/AVX512-x16-2 | 32578981 | 36.96 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/xlarge/AVX512-x32-2 | 61464870 | 19.44 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/massive/SSE-x8-2 | 2153736 | 557.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/massive/AVX2-x16-2 | 4225728 | 281.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/massive/AVX512-x8-2 | 2153736 | 557.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/massive/AVX512-x16-2 | 4225728 | 281.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint16/massive/AVX512-x32-2 | 7829936 | 145.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/tiny/SSE-x4-2 | 499390296 | 2.403 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/tiny/AVX2-x8-2 | 362964080 | 3.342 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/tiny/AVX512-x4-2 | 499390296 | 2.403 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/tiny/AVX512-x8-2 | 362964080 | 3.342 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/tiny/AVX512-x16-2 | 281063364 | 4.268 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/small/SSE-x4-2 | 293867554 | 4.004 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/small/AVX2-x8-2 | 374510434 | 3.203 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/small/AVX512-x4-2 | 293867554 | 4.004 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/small/AVX512-x8-2 | 374510434 | 3.203 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/small/AVX512-x16-2 | 499714206 | 2.402 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/medium/SSE-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/medium/AVX2-x8-2 | 187258657 | 6.405 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/medium/AVX512-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/medium/AVX512-x8-2 | 187258657 | 6.405 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/medium/AVX512-x16-2 | 312999210 | 3.881 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/large/SSE-x4-2 | 33298366 | 36.02 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/large/AVX2-x8-2 | 62409421 | 19.23 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/large/AVX512-x4-2 | 33298366 | 36.02 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/large/AVX512-x8-2 | 62409421 | 19.23 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/large/AVX512-x16-2 | 100000000 | 10.10 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/xlarge/SSE-x4-2 | 7948898 | 143.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/xlarge/AVX2-x8-2 | 17021738 | 70.49 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/xlarge/AVX512-x4-2 | 7948898 | 143.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/xlarge/AVX512-x8-2 | 17021738 | 70.49 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/xlarge/AVX512-x16-2 | 28742320 | 41.77 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/massive/SSE-x4-2 | 1595774 | 751.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/massive/AVX2-x8-2 | 3094242 | 381.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/massive/AVX512-x4-2 | 1595774 | 751.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/massive/AVX512-x8-2 | 3094242 | 381.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint32/massive/AVX512-x16-2 | 5080051 | 238.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/tiny/SSE-x2-2 | 374760351 | 3.203 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/tiny/AVX2-x4-2 | 498763054 | 2.419 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/tiny/AVX512-x2-2 | 374760351 | 3.203 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/tiny/AVX512-x4-2 | 498763054 | 2.419 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/tiny/AVX512-x8-2 | 319635274 | 3.582 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/small/SSE-x2-2 | 187032452 | 6.447 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/small/AVX2-x4-2 | 299546244 | 4.009 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/small/AVX512-x2-2 | 187032452 | 6.447 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/small/AVX512-x4-2 | 299546244 | 4.009 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/small/AVX512-x8-2 | 373937659 | 3.207 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/medium/SSE-x2-2 | 62413118 | 19.23 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/medium/AVX2-x4-2 | 113978791 | 10.42 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/medium/AVX512-x2-2 | 62413118 | 19.23 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/medium/AVX512-x4-2 | 113978791 | 10.42 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/medium/AVX512-x8-2 | 186965330 | 6.484 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/large/SSE-x2-2 | 17005768 | 70.57 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/large/AVX2-x4-2 | 33286495 | 36.69 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/large/AVX512-x2-2 | 17005768 | 70.57 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/large/AVX512-x4-2 | 33286495 | 36.69 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/large/AVX512-x8-2 | 61486065 | 19.93 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/xlarge/SSE-x2-2 | 4154370 | 280.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/xlarge/AVX2-x4-2 | 8371358 | 148.2 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/xlarge/AVX512-x2-2 | 4154370 | 280.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/xlarge/AVX512-x4-2 | 8371358 | 148.2 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/xlarge/AVX512-x8-2 | 14193795 | 72.36 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/massive/SSE-x2-2 | 1773937 | 676.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/massive/AVX2-x4-2 | 3500168 | 343.0 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/massive/AVX512-x2-2 | 1773937 | 676.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/massive/AVX512-x4-2 | 3500168 | 343.0 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsUint64/massive/AVX512-x8-2 | 7097266 | 249.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/tiny/SSE-x4-2 | 410522160 | 2.675 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/tiny/AVX2-x8-2 | 308565882 | 3.814 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/tiny/AVX512-x4-2 | 410522160 | 2.675 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/tiny/AVX512-x8-2 | 308565882 | 3.814 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/tiny/AVX512-x16-2 | 315331897 | 3.755 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/small/SSE-x4-2 | 278219434 | 4.642 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/small/AVX2-x8-2 | 362945481 | 3.287 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/small/AVX512-x4-2 | 278219434 | 4.642 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/small/AVX512-x8-2 | 362945481 | 3.287 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/small/AVX512-x16-2 | 408523153 | 2.941 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/medium/SSE-x4-2 | 100000000 | 10.77 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/medium/AVX2-x8-2 | 186186376 | 6.409 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/medium/AVX512-x4-2 | 100000000 | 10.77 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/medium/AVX512-x8-2 | 186186376 | 6.409 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/medium/AVX512-x16-2 | 264255108 | 4.619 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/large/SSE-x4-2 | 33028701 | 36.27 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/large/AVX2-x8-2 | 62465360 | 19.53 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/large/AVX512-x4-2 | 33028701 | 36.27 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/large/AVX512-x8-2 | 62465360 | 19.53 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/large/AVX512-x16-2 | 108213310 | 10.95 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/xlarge/SSE-x4-2 | 8359381 | 143.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/xlarge/AVX2-x8-2 | 17042701 | 70.46 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/xlarge/AVX512-x4-2 | 8359381 | 143.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/xlarge/AVX512-x8-2 | 17042701 | 70.46 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/xlarge/AVX512-x16-2 | 31806921 | 37.13 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/massive/SSE-x4-2 | 1000000 | 1100 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/massive/AVX2-x8-2 | 2164672 | 554.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/massive/AVX512-x4-2 | 1000000 | 1100 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/massive/AVX512-x8-2 | 2164672 | 554.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat32/massive/AVX512-x16-2 | 4201453 | 293.9 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/tiny/SSE-x2-2 | 362183925 | 3.223 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/tiny/AVX2-x4-2 | 449021466 | 2.687 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/tiny/AVX512-x2-2 | 362183925 | 3.223 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/tiny/AVX512-x4-2 | 449021466 | 2.687 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/tiny/AVX512-x8-2 | 320176149 | 3.820 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/small/SSE-x2-2 | 187139116 | 6.415 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/small/AVX2-x4-2 | 280722585 | 4.300 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/small/AVX512-x2-2 | 187139116 | 6.415 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/small/AVX512-x4-2 | 280722585 | 4.300 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/small/AVX512-x8-2 | 335670502 | 3.472 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/medium/SSE-x2-2 | 62343927 | 19.23 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/medium/AVX2-x4-2 | 112332902 | 10.69 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/medium/AVX512-x2-2 | 62343927 | 19.23 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/medium/AVX512-x4-2 | 112332902 | 10.69 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/medium/AVX512-x8-2 | 179610780 | 6.741 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/large/SSE-x2-2 | 16996959 | 70.51 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/large/AVX2-x4-2 | 33017950 | 36.29 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/large/AVX512-x2-2 | 16996959 | 70.51 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/large/AVX512-x4-2 | 33017950 | 36.29 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/large/AVX512-x8-2 | 60322328 | 19.73 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/xlarge/SSE-x2-2 | 4141281 | 282.9 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/xlarge/AVX2-x4-2 | 7856590 | 145.0 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/xlarge/AVX512-x2-2 | 4141281 | 282.9 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/xlarge/AVX512-x4-2 | 7856590 | 145.0 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/xlarge/AVX512-x8-2 | 16623739 | 72.06 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/massive/SSE-x2-2 | 541202 | 2195 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/massive/AVX2-x4-2 | 1000000 | 1158 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/massive/AVX512-x2-2 | 541202 | 2195 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/massive/AVX512-x4-2 | 1000000 | 1158 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsFloat64/massive/AVX512-x8-2 | 2115301 | 560.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsWorstCase/SSE-x4-2 | 7651734 | 145.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsWorstCase/AVX2-x8-2 | 14921599 | 70.49 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsWorstCase/AVX512-x4-2 | 7651734 | 145.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsWorstCase/AVX512-x8-2 | 14921599 | 70.49 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsWorstCase/AVX512-x16-2 | 28708478 | 41.38 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsBestCase/SSE-x4-2 | 534237578 | 2.136 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsBestCase/AVX2-x8-2 | 561252645 | 2.159 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsBestCase/AVX512-x4-2 | 534237578 | 2.136 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsBestCase/AVX512-x8-2 | 561252645 | 2.159 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsBestCase/AVX512-x16-2 | 560396454 | 2.137 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/tiny/SSE-x4-2 | 499649139 | 2.401 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/tiny/AVX2-x8-2 | 329743240 | 3.421 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/tiny/AVX512-x4-2 | 499649139 | 2.401 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/tiny/AVX512-x8-2 | 329743240 | 3.421 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/tiny/AVX512-x16-2 | 280516392 | 4.276 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/small/SSE-x4-2 | 299373171 | 4.006 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/small/AVX2-x8-2 | 374407988 | 3.267 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/small/AVX512-x4-2 | 299373171 | 4.006 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/small/AVX512-x8-2 | 374407988 | 3.267 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/small/AVX512-x16-2 | 486948346 | 2.424 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/medium/SSE-x4-2 | 100000000 | 10.41 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/medium/AVX2-x8-2 | 182899621 | 6.412 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/medium/AVX512-x4-2 | 100000000 | 10.41 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/medium/AVX512-x8-2 | 182899621 | 6.412 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/medium/AVX512-x16-2 | 311969776 | 3.829 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/large/SSE-x4-2 | 33309816 | 36.04 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/large/AVX2-x8-2 | 59912676 | 19.74 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/large/AVX512-x4-2 | 33309816 | 36.04 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/large/AVX512-x8-2 | 59912676 | 19.74 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/large/AVX512-x16-2 | 100000000 | 10.65 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/xlarge/SSE-x4-2 | 8346818 | 143.7 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/xlarge/AVX2-x8-2 | 16980399 | 70.54 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/xlarge/AVX512-x4-2 | 8346818 | 143.7 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/xlarge/AVX512-x8-2 | 16980399 | 70.54 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/xlarge/AVX512-x16-2 | 28676455 | 42.94 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/massive/SSE-x4-2 | 1000000 | 1151 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/massive/AVX2-x8-2 | 2161594 | 555.2 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/massive/AVX512-x4-2 | 1000000 | 1151 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/massive/AVX512-x8-2 | 2161594 | 555.2 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsNegative/massive/AVX512-x16-2 | 3549094 | 350.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8ByWidth/SSE-x16-2 | 331533141 | 3.222 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8ByWidth/AVX2-x32-2 | 408741681 | 3.193 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8ByWidth/AVX512-x16-2 | 331533141 | 3.222 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8ByWidth/AVX512-x32-2 | 408741681 | 3.193 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt8ByWidth/AVX512-x64-2 | 365382873 | 3.241 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64SteadyState/SSE-x2-2 | 5722603 | 211.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64SteadyState/AVX2-x4-2 | 11711869 | 103.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64SteadyState/AVX512-x2-2 | 5722603 | 211.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64SteadyState/AVX512-x4-2 | 11711869 | 103.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkContainsInt64SteadyState/AVX512-x8-2 | 19671033 | 61.36 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/small/Fallback-lo-2 | 248740710 | 5.218 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/small/SSE-x16-2 | 126181464 | 9.485 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/small/AVX-x16-2 | 126181464 | 9.485 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/small/AVX2-x32-2 | 73059427 | 14.44 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/small/AVX512-x64-2 | 49913169 | 24.41 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/medium/Fallback-lo-2 | 17278075 | 69.96 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/medium/SSE-x16-2 | 100000000 | 10.58 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/medium/AVX-x16-2 | 100000000 | 10.58 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/medium/AVX2-x32-2 | 91620999 | 13.10 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/medium/AVX512-x64-2 | 54082130 | 22.20 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/large/Fallback-lo-2 | 2006178 | 576.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/large/SSE-x16-2 | 41836690 | 27.82 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/large/AVX-x16-2 | 41836690 | 27.82 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/large/AVX2-x32-2 | 51735399 | 23.04 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/large/AVX512-x64-2 | 40861586 | 29.40 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/xlarge/Fallback-lo-2 | 273898 | 4383 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/xlarge/SSE-x16-2 | 6928408 | 173.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/xlarge/AVX-x16-2 | 6928408 | 173.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/xlarge/AVX2-x32-2 | 12639586 | 94.09 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8/xlarge/AVX512-x64-2 | 13509693 | 89.67 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/small/Fallback-lo-2 | 249444103 | 5.012 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/small/SSE-x8-2 | 244927230 | 5.052 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/small/AVX-x8-2 | 244927230 | 5.052 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/small/AVX2-x16-2 | 122088517 | 9.715 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/small/AVX512-x32-2 | 54098370 | 22.00 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/medium/Fallback-lo-2 | 15782683 | 72.54 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/medium/SSE-x8-2 | 100000000 | 10.51 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/medium/AVX-x8-2 | 100000000 | 10.51 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/medium/AVX2-x16-2 | 100000000 | 10.75 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/medium/AVX512-x32-2 | 56147455 | 21.38 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/large/Fallback-lo-2 | 2173214 | 598.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/large/SSE-x8-2 | 26319481 | 44.73 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/large/AVX-x8-2 | 26319481 | 44.73 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/large/AVX2-x16-2 | 40459519 | 27.91 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/large/AVX512-x32-2 | 39359752 | 31.28 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/xlarge/Fallback-lo-2 | 273932 | 4382 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/xlarge/SSE-x8-2 | 3557265 | 331.2 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/xlarge/AVX-x8-2 | 3557265 | 331.2 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/xlarge/AVX2-x16-2 | 6930166 | 173.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt16/xlarge/AVX512-x32-2 | 12100244 | 97.01 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/small/Fallback-lo-2 | 249566539 | 4.808 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/small/SSE-x4-2 | 259250019 | 4.581 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/small/AVX-x4-2 | 259250019 | 4.581 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/small/AVX2-x8-2 | 232858933 | 5.404 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/small/AVX512-x16-2 | 100000000 | 11.18 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/medium/Fallback-lo-2 | 17274441 | 72.28 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/medium/SSE-x4-2 | 58400258 | 20.56 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/medium/AVX-x4-2 | 58400258 | 20.56 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/medium/AVX2-x8-2 | 110851756 | 10.67 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/medium/AVX512-x16-2 | 106593603 | 11.25 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/large/Fallback-lo-2 | 2171817 | 551.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/large/SSE-x4-2 | 8270253 | 146.0 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/large/AVX-x4-2 | 8270253 | 146.0 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/large/AVX2-x8-2 | 22234518 | 46.06 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/large/AVX512-x16-2 | 37448763 | 32.31 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/xlarge/Fallback-lo-2 | 273699 | 4559 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/xlarge/SSE-x4-2 | 1000000 | 1102 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/xlarge/AVX-x4-2 | 1000000 | 1102 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/xlarge/AVX2-x8-2 | 3586887 | 332.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt32/xlarge/AVX512-x16-2 | 7214437 | 170.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/small/Fallback-lo-2 | 417473124 | 2.886 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/small/SSE-x2-2 | 287521756 | 4.169 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/small/AVX-x2-2 | 287521756 | 4.169 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/small/AVX2-x4-2 | 277783513 | 4.311 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/small/AVX512-x8-2 | 172823103 | 6.993 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/medium/Fallback-lo-2 | 34022653 | 35.27 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/medium/SSE-x2-2 | 49241248 | 24.05 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/medium/AVX-x2-2 | 49241248 | 24.05 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/medium/AVX2-x4-2 | 78897342 | 14.58 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/medium/AVX512-x8-2 | 84361297 | 14.03 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/large/Fallback-lo-2 | 3680988 | 282.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/large/SSE-x2-2 | 6293607 | 170.7 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/large/AVX-x2-2 | 6293607 | 170.7 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/large/AVX2-x4-2 | 12739849 | 91.28 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/large/AVX512-x8-2 | 25508130 | 46.30 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/xlarge/Fallback-lo-2 | 546321 | 2283 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/xlarge/SSE-x2-2 | 877434 | 1289 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/xlarge/AVX-x2-2 | 877434 | 1289 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/xlarge/AVX2-x4-2 | 1845892 | 650.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64/xlarge/AVX512-x8-2 | 2148355 | 550.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/small/Fallback-lo-2 | 411100770 | 2.951 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/small/SSE-x4-2 | 264013596 | 4.572 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/small/AVX-x4-2 | 264013596 | 4.572 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/small/AVX2-x8-2 | 174478266 | 6.911 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/small/AVX512-x16-2 | 61182673 | 19.78 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/medium/Fallback-lo-2 | 33815070 | 35.68 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/medium/SSE-x4-2 | 58238188 | 20.66 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/medium/AVX-x4-2 | 58238188 | 20.66 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/medium/AVX2-x8-2 | 91316544 | 13.26 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/medium/AVX512-x16-2 | 80046624 | 15.08 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/large/Fallback-lo-2 | 4304168 | 278.7 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/large/SSE-x4-2 | 6198957 | 184.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/large/AVX-x4-2 | 6198957 | 184.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/large/AVX2-x8-2 | 12260169 | 86.60 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/large/AVX512-x16-2 | 22147112 | 45.34 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/xlarge/Fallback-lo-2 | 546901 | 2193 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/xlarge/SSE-x4-2 | 736503 | 1622 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/xlarge/AVX-x4-2 | 736503 | 1622 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/xlarge/AVX2-x8-2 | 1493887 | 810.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat32/xlarge/AVX512-x16-2 | 2959298 | 393.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/small/Fallback-lo-2 | 410778070 | 3.043 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/small/SSE-x2-2 | 254156008 | 4.714 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/small/AVX-x2-2 | 254156008 | 4.714 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/small/AVX2-x4-2 | 227604434 | 5.323 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/small/AVX512-x8-2 | 170099748 | 7.115 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/medium/Fallback-lo-2 | 33646345 | 35.78 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/medium/SSE-x2-2 | 32931152 | 34.92 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/medium/AVX-x2-2 | 32931152 | 34.92 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/medium/AVX2-x4-2 | 75389446 | 16.79 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/medium/AVX512-x8-2 | 89826181 | 13.33 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/large/Fallback-lo-2 | 4293837 | 302.8 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/large/SSE-x2-2 | 3146601 | 381.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/large/AVX-x2-2 | 3146601 | 381.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/large/AVX2-x4-2 | 6373876 | 184.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/large/AVX512-x8-2 | 13464712 | 88.96 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/xlarge/Fallback-lo-2 | 545764 | 2193 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/xlarge/SSE-x2-2 | 368846 | 3390 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/xlarge/AVX-x2-2 | 368846 | 3390 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/xlarge/AVX2-x4-2 | 709940 | 1613 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumFloat64/xlarge/AVX512-x8-2 | 1480214 | 808.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/small/Fallback-lo-2 | 411529147 | 3.043 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/small/SSE-x4-2 | 204428401 | 5.872 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/small/AVX-x4-2 | 204428401 | 5.872 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/small/AVX2-x8-2 | 187573928 | 6.214 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/small/AVX512-x16-2 | 98346700 | 12.12 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/medium/Fallback-lo-2 | 33481442 | 35.72 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/medium/SSE-x4-2 | 52042394 | 22.12 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/medium/AVX-x4-2 | 52042394 | 22.12 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/medium/AVX2-x8-2 | 96288541 | 13.44 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/medium/AVX512-x16-2 | 100995780 | 11.90 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/large/Fallback-lo-2 | 4296570 | 289.9 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/large/SSE-x4-2 | 7743022 | 146.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/large/AVX-x4-2 | 7743022 | 146.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/large/AVX2-x8-2 | 24355988 | 46.26 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/large/AVX512-x16-2 | 37322655 | 32.89 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/xlarge/Fallback-lo-2 | 547008 | 2193 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/xlarge/SSE-x4-2 | 1087246 | 1112 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/xlarge/AVX-x4-2 | 1087246 | 1112 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/xlarge/AVX2-x8-2 | 1386868 | 761.9 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanInt32/xlarge/AVX512-x16-2 | 7166142 | 170.7 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/small/Fallback-lo-2 | 349760005 | 3.449 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/small/SSE-x2-2 | 189674538 | 6.293 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/small/AVX-x2-2 | 189674538 | 6.293 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/small/AVX2-x4-2 | 159228600 | 7.531 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/small/AVX512-x8-2 | 110196433 | 10.89 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/medium/Fallback-lo-2 | 32968618 | 36.17 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/medium/SSE-x2-2 | 30863817 | 37.69 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/medium/AVX-x2-2 | 30863817 | 37.69 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/medium/AVX2-x4-2 | 62428772 | 19.66 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/medium/AVX512-x8-2 | 77140984 | 15.54 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/large/Fallback-lo-2 | 4281057 | 280.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/large/SSE-x2-2 | 3057349 | 389.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/large/AVX-x2-2 | 3057349 | 389.4 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/large/AVX2-x4-2 | 6509438 | 185.9 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/large/AVX512-x8-2 | 12668032 | 93.50 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/xlarge/Fallback-lo-2 | 545898 | 2288 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/xlarge/SSE-x2-2 | 367671 | 4048 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/xlarge/AVX-x2-2 | 367671 | 4048 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/xlarge/AVX2-x4-2 | 739941 | 1621 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMeanFloat64/xlarge/AVX512-x8-2 | 1434867 | 811.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/small/SSE-x4-2 | 312338268 | 3.860 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/small/AVX-x4-2 | 312338268 | 3.860 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/small/AVX2-x8-2 | 238034872 | 5.042 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/small/AVX512-x16-2 | 152600943 | 6.661 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/medium/SSE-x4-2 | 61051266 | 19.73 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/medium/AVX-x4-2 | 61051266 | 19.73 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/medium/AVX2-x8-2 | 91792144 | 13.11 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/medium/AVX512-x16-2 | 99994540 | 12.18 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/large/SSE-x4-2 | 8604774 | 140.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/large/AVX-x4-2 | 8604774 | 140.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/large/AVX2-x8-2 | 15581037 | 77.56 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/large/AVX512-x16-2 | 30512421 | 40.24 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/xlarge/SSE-x4-2 | 1000000 | 1110 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/xlarge/AVX-x4-2 | 1000000 | 1110 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/xlarge/AVX2-x8-2 | 2158272 | 557.2 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinInt32/xlarge/AVX512-x16-2 | 4253668 | 282.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/small/SSE-x2-2 | 264129410 | 4.544 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/small/AVX-x2-2 | 264129410 | 4.544 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/small/AVX2-x4-2 | 299587609 | 4.008 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/small/AVX512-x8-2 | 100000000 | 10.05 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/medium/SSE-x2-2 | 32778514 | 36.93 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/medium/AVX-x2-2 | 32778514 | 36.93 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/medium/AVX2-x4-2 | 53356347 | 20.30 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/medium/AVX512-x8-2 | 74832976 | 16.21 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/large/SSE-x2-2 | 3863326 | 300.0 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/large/AVX-x2-2 | 3863326 | 300.0 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/large/AVX2-x4-2 | 7670576 | 146.5 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/large/AVX512-x8-2 | 14017984 | 78.21 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/xlarge/SSE-x2-2 | 492739 | 2195 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/xlarge/AVX-x2-2 | 492739 | 2195 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/xlarge/AVX2-x4-2 | 1000000 | 1103 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMinFloat64/xlarge/AVX512-x8-2 | 2145290 | 560.3 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/small/SSE-x4-2 | 306585705 | 3.860 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/small/AVX-x4-2 | 306585705 | 3.860 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/small/AVX2-x8-2 | 237347997 | 5.086 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/small/AVX512-x16-2 | 201433966 | 6.130 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/medium/SSE-x4-2 | 60759631 | 19.92 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/medium/AVX-x4-2 | 60759631 | 19.92 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/medium/AVX2-x8-2 | 90934662 | 13.13 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/medium/AVX512-x16-2 | 98517944 | 12.18 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/large/SSE-x4-2 | 8590542 | 139.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/large/AVX-x4-2 | 8590542 | 139.6 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/large/AVX2-x8-2 | 15770372 | 77.69 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/large/AVX512-x16-2 | 30197324 | 39.32 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/xlarge/SSE-x4-2 | 1000000 | 1104 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/xlarge/AVX-x4-2 | 1000000 | 1104 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/xlarge/AVX2-x8-2 | 2152038 | 562.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxInt32/xlarge/AVX512-x16-2 | 3917990 | 296.7 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/small/SSE-x2-2 | 249617162 | 4.816 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/small/AVX-x2-2 | 249617162 | 4.816 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/small/AVX2-x4-2 | 207017514 | 5.855 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/small/AVX512-x8-2 | 66520290 | 17.74 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/medium/SSE-x2-2 | 32307492 | 36.92 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/medium/AVX-x2-2 | 32307492 | 36.92 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/medium/AVX2-x4-2 | 57306838 | 20.77 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/medium/AVX512-x8-2 | 56911946 | 21.12 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/large/SSE-x2-2 | 4259366 | 287.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/large/AVX-x2-2 | 4259366 | 287.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/large/AVX2-x4-2 | 7905420 | 148.9 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/large/AVX512-x8-2 | 14100686 | 83.43 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/xlarge/SSE-x2-2 | 545378 | 2243 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/xlarge/AVX-x2-2 | 545378 | 2243 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/xlarge/AVX2-x4-2 | 1000000 | 1113 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkMaxFloat64/xlarge/AVX512-x8-2 | 2119741 | 565.7 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8ByWidth/Fallback-lo-2 | 896775 | 1335 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8ByWidth/SSE-x16-2 | 12557700 | 94.52 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8ByWidth/AVX-x16-2 | 12557700 | 94.52 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8ByWidth/AVX2-x32-2 | 18702537 | 55.03 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt8ByWidth/AVX512-x64-2 | 21342572 | 56.10 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64SteadyState/Fallback-lo-2 | 513738 | 2195 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64SteadyState/SSE-x2-2 | 928376 | 1296 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64SteadyState/AVX-x2-2 | 928376 | 1296 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64SteadyState/AVX2-x4-2 | 1836968 | 888.1 ns/op | 0 B/op | 0 allocs/op |
| BenchmarkSumInt64SteadyState/AVX512-x8-2 | 2141715 | 551.3 ns/op | 0 B/op | 0 allocs/op |
+6 -5
View File
@@ -12,7 +12,7 @@ If you see **SIGILL: illegal instruction** when running tests, the CPU or VM doe
```bash
# List SIMD-related flags
grep -E 'avx|sse' /proc/cpuinfo
grep -E 'avx' /proc/cpuinfo
# Or with lscpu
lscpu | grep -i avx
@@ -22,21 +22,22 @@ lscpu | grep -i avx
| Tests / code | Required flag(s) | Typical CPUs |
| ----------------- | -------------------------- | ----------------------------------------------------------------------- |
| SSE (128-bit) | `sse2` (baseline on amd64) | All amd64 |
| AVX (128-bit) | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
| AVX2 (256-bit) | `avx2` | Intel Haswell+, AMD Excavator+ |
| AVX-512 (512-bit) | `avx512f` | Intel Skylake-X+, some Xeons; many AMD/consumer CPUs do **not** have it |
### What the tests do
- **AVX tests** (128-bit) call `requireAVX(t)` and are **skipped** if the CPU does not support AVX.
- **AVX2 tests** call `requireAVX2(t)` and are **skipped** if the CPU does not support AVX2 (no SIGILL).
- **AVX-512 tests** (when enabled) should call `requireAVX512(t)` and skip when AVX-512 is not available.
So on a machine without AVX2, AVX2 tests will show as skipped instead of crashing.
### Run only SSE tests
### Run only AVX tests
If your environment does not support AVX2/AVX-512, you can still run the SSE tests:
If your environment does not support AVX2/AVX-512, you can still run the AVX (128-bit) tests:
```bash
GOEXPERIMENT=simd go test -run SSE ./...
GOEXPERIMENT=simd go test -run AVX ./...
```
+12 -3
View File
@@ -19,16 +19,25 @@ type skipHelper interface {
// How to check if your Linux CPU supports SIMD (avoids SIGILL):
//
// grep -E 'avx|sse' /proc/cpuinfo
// grep -E 'avx' /proc/cpuinfo
//
// Or: lscpu | grep -i avx
//
// You need:
// - SSE tests (128-bit): sse2 (baseline on amd64), sse4.1/sse4.2 often used
// - AVX tests (128-bit): avx in flags (not part of the amd64 baseline; Intel Sandy Bridge+, AMD Bulldozer+)
// - AVX2 tests (256-bit): avx2 in flags
// - AVX-512 tests: avx512f (and often avx512bw, avx512vl)
//
// If your CPU lacks AVX2 or AVX-512, tests that use them will be skipped automatically.
// If your CPU lacks AVX, AVX2, or AVX-512, tests that use them will be skipped automatically.
// requireAVX skips the test/benchmark if the CPU does not support AVX (128-bit SIMD).
// Use at the start of each AVX test/benchmark to avoid SIGILL on older or non-x86 systems.
func requireAVX(t skipHelper) {
t.Helper()
if !archsimd.X86.AVX() {
t.Skipf("CPU does not support AVX; skipping. Check compatibility: grep avx /proc/cpuinfo")
}
}
// requireAVX2 skips the test/benchmark if the CPU does not support AVX2 (256-bit SIMD).
// Use at the start of each AVX2 test/benchmark to avoid SIGILL on older or non-x86 systems.
+10 -10
View File
@@ -6,7 +6,7 @@ import (
"simd/archsimd"
)
// ContainsInt8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsInt8x16 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsInt8x16[T ~int8](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
@@ -40,7 +40,7 @@ func ContainsInt8x16[T ~int8](collection []T, target T) bool {
return false
}
// ContainsInt16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsInt16x8 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsInt16x8[T ~int16](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
@@ -72,7 +72,7 @@ func ContainsInt16x8[T ~int16](collection []T, target T) bool {
return false
}
// ContainsInt32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsInt32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsInt32x4[T ~int32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
@@ -104,7 +104,7 @@ func ContainsInt32x4[T ~int32](collection []T, target T) bool {
return false
}
// ContainsInt64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsInt64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsInt64x2[T ~int64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
@@ -136,7 +136,7 @@ func ContainsInt64x2[T ~int64](collection []T, target T) bool {
return false
}
// ContainsUint8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsUint8x16 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsUint8x16[T ~uint8](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
@@ -168,7 +168,7 @@ func ContainsUint8x16[T ~uint8](collection []T, target T) bool {
return false
}
// ContainsUint16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsUint16x8 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsUint16x8[T ~uint16](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
@@ -200,7 +200,7 @@ func ContainsUint16x8[T ~uint16](collection []T, target T) bool {
return false
}
// ContainsUint32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsUint32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsUint32x4[T ~uint32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
@@ -232,7 +232,7 @@ func ContainsUint32x4[T ~uint32](collection []T, target T) bool {
return false
}
// ContainsUint64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsUint64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsUint64x2[T ~uint64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
@@ -264,7 +264,7 @@ func ContainsUint64x2[T ~uint64](collection []T, target T) bool {
return false
}
// ContainsFloat32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsFloat32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsFloat32x4[T ~float32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
@@ -296,7 +296,7 @@ func ContainsFloat32x4[T ~float32](collection []T, target T) bool {
return false
}
// ContainsFloat64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
// ContainsFloat64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD
func ContainsFloat64x2[T ~float64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
+34 -34
View File
@@ -8,16 +8,16 @@ import (
// Benchmark suite for SIMD Contains operations compared to core lo package fallbacks.
// These benchmarks measure the performance of element lookup operations
// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes.
// across different SIMD implementations (AVX, AVX2, AVX512) and data sizes.
// Benchmark sizes for Contains operations
var containsBenchmarkSizes = []struct {
name string
size int
}{
{"tiny", 4}, // Smaller than SSE width (16 lanes for int8)
{"small", 16}, // Exactly SSE width for int8
{"medium", 64}, // Multiple of SSE, between SSE and AVX2 for int8
{"tiny", 4}, // Smaller than AVX width (16 lanes for int8)
{"small", 16}, // Exactly AVX width for int8
{"medium", 64}, // Multiple of AVX, between AVX and AVX2 for int8
{"large", 256}, // Multiple of AVX2 (32 lanes for int8)
{"xlarge", 1024}, // Multiple of AVX512 (64 lanes for int8)
{"massive", 8192}, // Very large dataset
@@ -33,14 +33,14 @@ func BenchmarkContainsInt8(b *testing.B) {
data := generateInt8(bs.size)
target := int8(42)
b.Run("SSE-x16", func(b *testing.B) {
b.Run("AVX512-x16", func(b *testing.B) {
requireAVX512(b) // ContainsInt8x16 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsInt8x16(data, target)
}
})
b.Run("AVX2-x32", func(b *testing.B) {
b.Run("AVX512-x32", func(b *testing.B) {
requireAVX512(b) // ContainsInt8x32 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -68,14 +68,14 @@ func BenchmarkContainsInt16(b *testing.B) {
data := generateInt16(bs.size)
target := int16(42)
b.Run("SSE-x8", func(b *testing.B) {
b.Run("AVX512-x8", func(b *testing.B) {
requireAVX512(b) // ContainsInt16x8 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsInt16x8(data, target)
}
})
b.Run("AVX2-x16", func(b *testing.B) {
b.Run("AVX512-x16", func(b *testing.B) {
requireAVX512(b) // ContainsInt16x16 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -103,14 +103,14 @@ func BenchmarkContainsInt32(b *testing.B) {
data := generateInt32(bs.size)
target := int32(42)
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsInt32x4(data, target)
}
})
b.Run("AVX2-x8", func(b *testing.B) {
b.Run("AVX512-x8", func(b *testing.B) {
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -138,14 +138,14 @@ func BenchmarkContainsInt64(b *testing.B) {
data := generateInt64(bs.size)
target := int64(42)
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX512-x2", func(b *testing.B) {
requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsInt64x2(data, target)
}
})
b.Run("AVX2-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -173,14 +173,14 @@ func BenchmarkContainsUint8(b *testing.B) {
data := generateUint8(bs.size)
target := uint8(255)
b.Run("SSE-x16", func(b *testing.B) {
b.Run("AVX512-x16", func(b *testing.B) {
requireAVX512(b) // ContainsUint8x16 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsUint8x16(data, target)
}
})
b.Run("AVX2-x32", func(b *testing.B) {
b.Run("AVX512-x32", func(b *testing.B) {
requireAVX512(b) // ContainsUint8x32 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -208,14 +208,14 @@ func BenchmarkContainsUint16(b *testing.B) {
data := generateUint16(bs.size)
target := uint16(42)
b.Run("SSE-x8", func(b *testing.B) {
b.Run("AVX512-x8", func(b *testing.B) {
requireAVX512(b) // ContainsUint16x8 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsUint16x8(data, target)
}
})
b.Run("AVX2-x16", func(b *testing.B) {
b.Run("AVX512-x16", func(b *testing.B) {
requireAVX512(b) // ContainsUint16x16 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -243,14 +243,14 @@ func BenchmarkContainsUint32(b *testing.B) {
data := generateUint32(bs.size)
target := uint32(42)
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsUint32x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsUint32x4(data, target)
}
})
b.Run("AVX2-x8", func(b *testing.B) {
b.Run("AVX512-x8", func(b *testing.B) {
requireAVX512(b) // ContainsUint32x8 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -278,14 +278,14 @@ func BenchmarkContainsUint64(b *testing.B) {
data := generateUint64(bs.size)
target := uint64(42)
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX512-x2", func(b *testing.B) {
requireAVX512(b) // ContainsUint64x2 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsUint64x2(data, target)
}
})
b.Run("AVX2-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsUint64x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -313,14 +313,14 @@ func BenchmarkContainsFloat32(b *testing.B) {
data := generateFloat32(bs.size)
target := float32(42.5)
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsFloat32x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsFloat32x4(data, target)
}
})
b.Run("AVX2-x8", func(b *testing.B) {
b.Run("AVX512-x8", func(b *testing.B) {
requireAVX512(b) // ContainsFloat32x8 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -348,14 +348,14 @@ func BenchmarkContainsFloat64(b *testing.B) {
data := generateFloat64(bs.size)
target := float64(42.5)
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX512-x2", func(b *testing.B) {
requireAVX512(b) // ContainsFloat64x2 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsFloat64x2(data, target)
}
})
b.Run("AVX2-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsFloat64x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -386,14 +386,14 @@ func BenchmarkContainsWorstCase(b *testing.B) {
}
target := int32(size - 1) // Target at the very end
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsInt32x4(data, target)
}
})
b.Run("AVX2-x8", func(b *testing.B) {
b.Run("AVX512-x8", func(b *testing.B) {
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -422,14 +422,14 @@ func BenchmarkContainsBestCase(b *testing.B) {
}
target := int32(0) // Target at the very beginning
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsInt32x4(data, target)
}
})
b.Run("AVX2-x8", func(b *testing.B) {
b.Run("AVX512-x8", func(b *testing.B) {
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -456,14 +456,14 @@ func BenchmarkContainsNegative(b *testing.B) {
data := generateInt32(bs.size)
target := int32(999999) // Target that's unlikely to be in the data
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsInt32x4(data, target)
}
})
b.Run("AVX2-x8", func(b *testing.B) {
b.Run("AVX512-x8", func(b *testing.B) {
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
@@ -497,8 +497,8 @@ func BenchmarkContainsInt8ByWidth(b *testing.B) {
name string
fn func() bool
}{
{"SSE-x16", func() bool { return ContainsInt8x16(data, target) }},
{"AVX2-x32", func() bool { return ContainsInt8x32(data, target) }},
{"AVX512-x16", func() bool { return ContainsInt8x16(data, target) }},
{"AVX512-x32", func() bool { return ContainsInt8x32(data, target) }},
{"AVX512-x64", func() bool { return ContainsInt8x64(data, target) }},
}
@@ -533,14 +533,14 @@ func BenchmarkContainsInt64SteadyState(b *testing.B) {
b.ResetTimer() // Reset timer to exclude warmup
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX512-x2", func(b *testing.B) {
requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ContainsInt64x2(data, target)
}
})
b.Run("AVX2-x4", func(b *testing.B) {
b.Run("AVX512-x4", func(b *testing.B) {
requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512
b.ReportAllocs()
for i := 0; i < b.N; i++ {
+12 -6
View File
@@ -364,7 +364,8 @@ func MinInt64[T ~int64](collection []T) T {
case simdFeatureAVX2:
return MinInt64x4(collection)
case simdFeatureAVX:
return MinInt64x2(collection)
// MinInt64x2 requires AVX-512 (archsimd Int64x2.Min); use scalar fallback
fallthrough
default:
return lo.Min(collection)
}
@@ -420,7 +421,8 @@ func MinUint64[T ~uint64](collection []T) T {
case simdFeatureAVX2:
return MinUint64x4(collection)
case simdFeatureAVX:
return MinUint64x2(collection)
// MinUint64x2 requires AVX-512; use scalar fallback
fallthrough
default:
return lo.Min(collection)
}
@@ -504,7 +506,8 @@ func MaxInt64[T ~int64](collection []T) T {
case simdFeatureAVX2:
return MaxInt64x4(collection)
case simdFeatureAVX:
return MaxInt64x2(collection)
// MaxInt64x2 requires AVX-512; use scalar fallback
fallthrough
default:
return lo.Max(collection)
}
@@ -560,7 +563,8 @@ func MaxUint64[T ~uint64](collection []T) T {
case simdFeatureAVX2:
return MaxUint64x4(collection)
case simdFeatureAVX:
return MaxUint64x2(collection)
// MaxUint64x2 requires AVX-512; use scalar fallback
fallthrough
default:
return lo.Max(collection)
}
@@ -674,7 +678,8 @@ func ClampInt64[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
case simdFeatureAVX2:
return ClampInt64x4(collection, min, max)
case simdFeatureAVX:
return ClampInt64x2(collection, min, max)
// ClampInt64x2 requires AVX-512; use scalar fallback
fallthrough
default:
result := make(Slice, len(collection))
for i, v := range collection {
@@ -770,7 +775,8 @@ func ClampUint64[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice {
case simdFeatureAVX2:
return ClampUint64x4(collection, min, max)
case simdFeatureAVX:
return ClampUint64x2(collection, min, max)
// ClampUint64x2 requires AVX-512; use scalar fallback
fallthrough
default:
result := make(Slice, len(collection))
for i, v := range collection {
+67 -319
View File
@@ -9,9 +9,9 @@ import (
"github.com/samber/lo"
)
// SSE (128-bit) SIMD sum functions - 16/8/4/2 lanes
// AVX (128-bit) SIMD sum functions - 16/8/4/2 lanes
// SumInt8x16 sums a slice of int8 using SSE SIMD (Int8x16, 16 lanes).
// SumInt8x16 sums a slice of int8 using AVX SIMD (Int8x16, 16 lanes).
// Overflow: The accumulation is performed using int8, which can overflow for large collections.
// If the sum exceeds the int8 range (-128 to 127), the result will wrap around silently.
// For collections that may overflow, consider using a wider type or handle overflow detection externally.
@@ -45,7 +45,7 @@ func SumInt8x16[T ~int8](collection []T) T {
return sum
}
// SumInt16x8 sums a slice of int16 using SSE SIMD (Int16x8, 8 lanes).
// SumInt16x8 sums a slice of int16 using AVX SIMD (Int16x8, 8 lanes).
// Overflow: The accumulation is performed using int16, which can overflow for large collections.
// If the sum exceeds the int16 range (-32768 to 32767), the result will wrap around silently.
// For collections that may overflow, consider using a wider type or handle overflow detection externally.
@@ -79,7 +79,7 @@ func SumInt16x8[T ~int16](collection []T) T {
return sum
}
// SumInt32x4 sums a slice of int32 using SSE SIMD (Int32x4, 4 lanes).
// SumInt32x4 sums a slice of int32 using AVX SIMD (Int32x4, 4 lanes).
// Overflow: The accumulation is performed using int32, which can overflow for very large collections.
// If the sum exceeds the int32 range (-2147483648 to 2147483647), the result will wrap around silently.
// For collections that may overflow, consider using SumInt64x2 or handle overflow detection externally.
@@ -113,7 +113,7 @@ func SumInt32x4[T ~int32](collection []T) T {
return sum
}
// SumInt64x2 sums a slice of int64 using SSE SIMD (Int64x2, 2 lanes).
// SumInt64x2 sums a slice of int64 using AVX SIMD (Int64x2, 2 lanes).
// Overflow: The accumulation is performed using int64, which can overflow for extremely large collections.
// If the sum exceeds the int64 range, the result will wrap around silently.
// For collections that may overflow, handle overflow detection externally (e.g., using big.Int).
@@ -147,7 +147,7 @@ func SumInt64x2[T ~int64](collection []T) T {
return sum
}
// SumUint8x16 sums a slice of uint8 using SSE SIMD (Uint8x16, 16 lanes).
// SumUint8x16 sums a slice of uint8 using AVX SIMD (Uint8x16, 16 lanes).
// Overflow: The accumulation is performed using uint8, which can overflow for large collections.
// If the sum exceeds the uint8 range (0 to 255), the result will wrap around silently.
// For collections that may overflow, consider using a wider type or handle overflow detection externally.
@@ -181,7 +181,7 @@ func SumUint8x16[T ~uint8](collection []T) T {
return sum
}
// SumUint16x8 sums a slice of uint16 using SSE SIMD (Uint16x8, 8 lanes).
// SumUint16x8 sums a slice of uint16 using AVX SIMD (Uint16x8, 8 lanes).
// Overflow: The accumulation is performed using uint16, which can overflow for large collections.
// If the sum exceeds the uint16 range (0 to 65535), the result will wrap around silently.
// For collections that may overflow, consider using a wider type or handle overflow detection externally.
@@ -215,7 +215,7 @@ func SumUint16x8[T ~uint16](collection []T) T {
return sum
}
// SumUint32x4 sums a slice of uint32 using SSE SIMD (Uint32x4, 4 lanes).
// SumUint32x4 sums a slice of uint32 using AVX SIMD (Uint32x4, 4 lanes).
// Overflow: The accumulation is performed using uint32, which can overflow for very large collections.
// If the sum exceeds the uint32 range (0 to 4294967295), the result will wrap around silently.
// For collections that may overflow, consider using SumUint64x2 or handle overflow detection externally.
@@ -249,7 +249,7 @@ func SumUint32x4[T ~uint32](collection []T) T {
return sum
}
// SumUint64x2 sums a slice of uint64 using SSE SIMD (Uint64x2, 2 lanes).
// SumUint64x2 sums a slice of uint64 using AVX SIMD (Uint64x2, 2 lanes).
// Overflow: The accumulation is performed using uint64, which can overflow for extremely large collections.
// If the sum exceeds the uint64 range, the result will wrap around silently.
// For collections that may overflow, handle overflow detection externally (e.g., using big.Int).
@@ -283,7 +283,7 @@ func SumUint64x2[T ~uint64](collection []T) T {
return sum
}
// SumFloat32x4 sums a slice of float32 using SSE SIMD (Float32x4, 4 lanes).
// SumFloat32x4 sums a slice of float32 using AVX SIMD (Float32x4, 4 lanes).
// Overflow: The accumulation is performed using float32. Overflow will result in +/-Inf rather than wrapping.
// For collections requiring high precision or large sums, consider using SumFloat64x2.
func SumFloat32x4[T ~float32](collection []T) T {
@@ -316,7 +316,7 @@ func SumFloat32x4[T ~float32](collection []T) T {
return sum
}
// SumFloat64x2 sums a slice of float64 using SSE SIMD (Float64x2, 2 lanes).
// SumFloat64x2 sums a slice of float64 using AVX SIMD (Float64x2, 2 lanes).
// Overflow: The accumulation is performed using float64. Overflow will result in +/-Inf rather than wrapping.
// For collections that may overflow, handle overflow detection externally (e.g., using big.Float).
func SumFloat64x2[T ~float64](collection []T) T {
@@ -349,7 +349,7 @@ func SumFloat64x2[T ~float64](collection []T) T {
return sum
}
// MeanInt8x16 calculates the mean of a slice of int8 using SSE SIMD
// MeanInt8x16 calculates the mean of a slice of int8 using AVX SIMD
func MeanInt8x16[T ~int8](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -359,7 +359,7 @@ func MeanInt8x16[T ~int8](collection []T) T {
return sum / T(length)
}
// MeanInt16x8 calculates the mean of a slice of int16 using SSE SIMD
// MeanInt16x8 calculates the mean of a slice of int16 using AVX SIMD
func MeanInt16x8[T ~int16](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -369,7 +369,7 @@ func MeanInt16x8[T ~int16](collection []T) T {
return sum / T(length)
}
// MeanInt32x4 calculates the mean of a slice of int32 using SSE SIMD
// MeanInt32x4 calculates the mean of a slice of int32 using AVX SIMD
func MeanInt32x4[T ~int32](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -379,7 +379,7 @@ func MeanInt32x4[T ~int32](collection []T) T {
return sum / T(length)
}
// MeanInt64x2 calculates the mean of a slice of int64 using SSE SIMD
// MeanInt64x2 calculates the mean of a slice of int64 using AVX SIMD
func MeanInt64x2[T ~int64](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -389,7 +389,7 @@ func MeanInt64x2[T ~int64](collection []T) T {
return sum / T(length)
}
// MeanUint8x16 calculates the mean of a slice of uint8 using SSE SIMD
// MeanUint8x16 calculates the mean of a slice of uint8 using AVX SIMD
func MeanUint8x16[T ~uint8](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -399,7 +399,7 @@ func MeanUint8x16[T ~uint8](collection []T) T {
return sum / T(length)
}
// MeanUint16x8 calculates the mean of a slice of uint16 using SSE SIMD
// MeanUint16x8 calculates the mean of a slice of uint16 using AVX SIMD
func MeanUint16x8[T ~uint16](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -409,7 +409,7 @@ func MeanUint16x8[T ~uint16](collection []T) T {
return sum / T(length)
}
// MeanUint32x4 calculates the mean of a slice of uint32 using SSE SIMD
// MeanUint32x4 calculates the mean of a slice of uint32 using AVX SIMD
func MeanUint32x4[T ~uint32](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -419,7 +419,7 @@ func MeanUint32x4[T ~uint32](collection []T) T {
return sum / T(length)
}
// MeanUint64x2 calculates the mean of a slice of uint64 using SSE SIMD
// MeanUint64x2 calculates the mean of a slice of uint64 using AVX SIMD
func MeanUint64x2[T ~uint64](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -429,7 +429,7 @@ func MeanUint64x2[T ~uint64](collection []T) T {
return sum / T(length)
}
// MeanFloat32x4 calculates the mean of a slice of float32 using SSE SIMD
// MeanFloat32x4 calculates the mean of a slice of float32 using AVX SIMD
func MeanFloat32x4[T ~float32](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -440,7 +440,7 @@ func MeanFloat32x4[T ~float32](collection []T) T {
return sum / T(length)
}
// MeanFloat64x2 calculates the mean of a slice of float64 using SSE SIMD
// MeanFloat64x2 calculates the mean of a slice of float64 using AVX SIMD
func MeanFloat64x2[T ~float64](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -450,7 +450,7 @@ func MeanFloat64x2[T ~float64](collection []T) T {
return sum / T(length)
}
// ClampInt8x16 clamps each element in collection between min and max values using SSE SIMD
// ClampInt8x16 clamps each element in collection between min and max values using AVX SIMD
func ClampInt8x16[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
@@ -488,7 +488,7 @@ func ClampInt8x16[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice {
return result
}
// ClampInt16x8 clamps each element in collection between min and max values using SSE SIMD
// ClampInt16x8 clamps each element in collection between min and max values using AVX SIMD
func ClampInt16x8[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
@@ -526,7 +526,7 @@ func ClampInt16x8[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice {
return result
}
// ClampInt32x4 clamps each element in collection between min and max values using SSE SIMD
// ClampInt32x4 clamps each element in collection between min and max values using AVX SIMD
func ClampInt32x4[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
@@ -564,45 +564,7 @@ func ClampInt32x4[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice {
return result
}
// ClampInt64x2 clamps each element in collection between min and max values using SSE SIMD and AVX-512 SIMD.
func ClampInt64x2[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
return collection
}
result := make(Slice, length)
const lanes = simdLanes2
base := unsafeSliceInt64(collection, length)
minVec := archsimd.BroadcastInt64x2(int64(min))
maxVec := archsimd.BroadcastInt64x2(int64(max))
i := uint(0)
for ; i+lanes <= length; i += lanes {
v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
clamped := v.Max(minVec).Min(maxVec)
// bearer:disable go_gosec_unsafe_unsafe
clamped.Store((*[lanes]int64)(unsafe.Pointer(&result[i])))
}
for ; i < length; i++ {
val := collection[i]
if val < min {
val = min
} else if val > max {
val = max
}
result[i] = val
}
return result
}
// ClampUint8x16 clamps each element in collection between min and max values using SSE SIMD
// ClampUint8x16 clamps each element in collection between min and max values using AVX SIMD
func ClampUint8x16[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
@@ -640,7 +602,7 @@ func ClampUint8x16[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice {
return result
}
// ClampUint16x8 clamps each element in collection between min and max values using SSE SIMD
// ClampUint16x8 clamps each element in collection between min and max values using AVX SIMD
func ClampUint16x8[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
@@ -678,7 +640,7 @@ func ClampUint16x8[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice {
return result
}
// ClampUint32x4 clamps each element in collection between min and max values using SSE SIMD
// ClampUint32x4 clamps each element in collection between min and max values using AVX SIMD
func ClampUint32x4[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
@@ -716,45 +678,7 @@ func ClampUint32x4[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice {
return result
}
// ClampUint64x2 clamps each element in collection between min and max values using SSE SIMD and AVX-512 SIMD.
func ClampUint64x2[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
return collection
}
result := make(Slice, length)
const lanes = simdLanes2
base := unsafeSliceUint64(collection, length)
minVec := archsimd.BroadcastUint64x2(uint64(min))
maxVec := archsimd.BroadcastUint64x2(uint64(max))
i := uint(0)
for ; i+lanes <= length; i += lanes {
v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
clamped := v.Max(minVec).Min(maxVec)
// bearer:disable go_gosec_unsafe_unsafe
clamped.Store((*[lanes]uint64)(unsafe.Pointer(&result[i])))
}
for ; i < length; i++ {
val := collection[i]
if val < min {
val = min
} else if val > max {
val = max
}
result[i] = val
}
return result
}
// ClampFloat32x4 clamps each element in collection between min and max values using SSE SIMD
// ClampFloat32x4 clamps each element in collection between min and max values using AVX SIMD
func ClampFloat32x4[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
@@ -792,7 +716,7 @@ func ClampFloat32x4[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice
return result
}
// ClampFloat64x2 clamps each element in collection between min and max values using SSE SIMD
// ClampFloat64x2 clamps each element in collection between min and max values using AVX SIMD
func ClampFloat64x2[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
if length == 0 {
@@ -830,7 +754,7 @@ func ClampFloat64x2[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice
return result
}
// MinInt8x16 finds the minimum value in a collection of int8 using SSE SIMD
// MinInt8x16 finds the minimum value in a collection of int8 using AVX SIMD
func MinInt8x16[T ~int8](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -877,7 +801,7 @@ func MinInt8x16[T ~int8](collection []T) T {
return T(minVal)
}
// MinInt16x8 finds the minimum value in a collection of int16 using SSE SIMD
// MinInt16x8 finds the minimum value in a collection of int16 using AVX SIMD
func MinInt16x8[T ~int16](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -921,7 +845,7 @@ func MinInt16x8[T ~int16](collection []T) T {
return T(minVal)
}
// MinInt32x4 finds the minimum value in a collection of int32 using SSE SIMD
// MinInt32x4 finds the minimum value in a collection of int32 using AVX SIMD
func MinInt32x4[T ~int32](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -965,51 +889,7 @@ func MinInt32x4[T ~int32](collection []T) T {
return T(minVal)
}
// MinInt64x2 returns the smallest element of collection, reducing it with
// 128-bit (two-lane) Int64x2 vectors. An empty collection yields 0.
func MinInt64x2[T ~int64](collection []T) T {
	n := uint(len(collection))
	if n == 0 {
		return 0
	}

	const lanes = simdLanes2
	raw := unsafeSliceInt64(collection, n)

	var (
		best   int64
		seeded bool
	)

	pos := uint(0)
	if pos+lanes <= n {
		// Seed the accumulator with the first full chunk, then fold in the rest.
		acc := archsimd.LoadInt64x2Slice(raw[pos : pos+lanes])
		for pos += lanes; pos+lanes <= n; pos += lanes {
			acc = acc.Min(archsimd.LoadInt64x2Slice(raw[pos : pos+lanes]))
		}

		// Collapse the two lanes into a single scalar.
		var lanesOut [lanes]int64
		acc.Store(&lanesOut)
		best = min(lanesOut[0], lanesOut[1])
		seeded = true
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch cur := int64(collection[pos]); {
		case !seeded:
			best, seeded = cur, true
		case cur < best:
			best = cur
		}
	}
	return T(best)
}
// MinUint8x16 finds the minimum value in a collection of uint8 using SSE SIMD
// MinUint8x16 finds the minimum value in a collection of uint8 using AVX SIMD
func MinUint8x16[T ~uint8](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1056,7 +936,7 @@ func MinUint8x16[T ~uint8](collection []T) T {
return T(minVal)
}
// MinUint16x8 finds the minimum value in a collection of uint16 using SSE SIMD
// MinUint16x8 finds the minimum value in a collection of uint16 using AVX SIMD
func MinUint16x8[T ~uint16](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1100,7 +980,7 @@ func MinUint16x8[T ~uint16](collection []T) T {
return T(minVal)
}
// MinUint32x4 finds the minimum value in a collection of uint32 using SSE SIMD
// MinUint32x4 finds the minimum value in a collection of uint32 using AVX SIMD
func MinUint32x4[T ~uint32](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1144,51 +1024,7 @@ func MinUint32x4[T ~uint32](collection []T) T {
return T(minVal)
}
// MinUint64x2 returns the smallest element of collection, reducing it with
// 128-bit (two-lane) Uint64x2 vectors. An empty collection yields 0.
func MinUint64x2[T ~uint64](collection []T) T {
	n := uint(len(collection))
	if n == 0 {
		return 0
	}

	const lanes = simdLanes2
	raw := unsafeSliceUint64(collection, n)

	var (
		best   uint64
		seeded bool
	)

	pos := uint(0)
	if pos+lanes <= n {
		// Seed the accumulator with the first full chunk, then fold in the rest.
		acc := archsimd.LoadUint64x2Slice(raw[pos : pos+lanes])
		for pos += lanes; pos+lanes <= n; pos += lanes {
			acc = acc.Min(archsimd.LoadUint64x2Slice(raw[pos : pos+lanes]))
		}

		// Collapse the two lanes into a single scalar.
		var lanesOut [lanes]uint64
		acc.Store(&lanesOut)
		best = min(lanesOut[0], lanesOut[1])
		seeded = true
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch cur := uint64(collection[pos]); {
		case !seeded:
			best, seeded = cur, true
		case cur < best:
			best = cur
		}
	}
	return T(best)
}
// MinFloat32x4 finds the minimum value in a collection of float32 using SSE SIMD
// MinFloat32x4 finds the minimum value in a collection of float32 using AVX SIMD
func MinFloat32x4[T ~float32](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1232,7 +1068,7 @@ func MinFloat32x4[T ~float32](collection []T) T {
return T(minVal)
}
// MinFloat64x2 finds the minimum value in a collection of float64 using SSE SIMD
// MinFloat64x2 finds the minimum value in a collection of float64 using AVX SIMD
func MinFloat64x2[T ~float64](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1276,7 +1112,7 @@ func MinFloat64x2[T ~float64](collection []T) T {
return T(minVal)
}
// MaxInt8x16 finds the maximum value in a collection of int8 using SSE SIMD
// MaxInt8x16 finds the maximum value in a collection of int8 using AVX SIMD
func MaxInt8x16[T ~int8](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1323,7 +1159,7 @@ func MaxInt8x16[T ~int8](collection []T) T {
return T(maxVal)
}
// MaxInt16x8 finds the maximum value in a collection of int16 using SSE SIMD
// MaxInt16x8 finds the maximum value in a collection of int16 using AVX SIMD
func MaxInt16x8[T ~int16](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1367,7 +1203,7 @@ func MaxInt16x8[T ~int16](collection []T) T {
return T(maxVal)
}
// MaxInt32x4 finds the maximum value in a collection of int32 using SSE SIMD
// MaxInt32x4 finds the maximum value in a collection of int32 using AVX SIMD
func MaxInt32x4[T ~int32](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1411,51 +1247,7 @@ func MaxInt32x4[T ~int32](collection []T) T {
return T(maxVal)
}
// MaxInt64x2 returns the largest element of collection, reducing it with
// 128-bit (two-lane) Int64x2 vectors. An empty collection yields 0.
func MaxInt64x2[T ~int64](collection []T) T {
	n := uint(len(collection))
	if n == 0 {
		return 0
	}

	const lanes = simdLanes2
	raw := unsafeSliceInt64(collection, n)

	var (
		best   int64
		seeded bool
	)

	pos := uint(0)
	if pos+lanes <= n {
		// Seed the accumulator with the first full chunk, then fold in the rest.
		acc := archsimd.LoadInt64x2Slice(raw[pos : pos+lanes])
		for pos += lanes; pos+lanes <= n; pos += lanes {
			acc = acc.Max(archsimd.LoadInt64x2Slice(raw[pos : pos+lanes]))
		}

		// Collapse the two lanes into a single scalar.
		var lanesOut [lanes]int64
		acc.Store(&lanesOut)
		best = max(lanesOut[0], lanesOut[1])
		seeded = true
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch cur := int64(collection[pos]); {
		case !seeded:
			best, seeded = cur, true
		case cur > best:
			best = cur
		}
	}
	return T(best)
}
// MaxUint8x16 finds the maximum value in a collection of uint8 using SSE SIMD
// MaxUint8x16 finds the maximum value in a collection of uint8 using AVX SIMD
func MaxUint8x16[T ~uint8](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1502,7 +1294,7 @@ func MaxUint8x16[T ~uint8](collection []T) T {
return T(maxVal)
}
// MaxUint16x8 finds the maximum value in a collection of uint16 using SSE SIMD
// MaxUint16x8 finds the maximum value in a collection of uint16 using AVX SIMD
func MaxUint16x8[T ~uint16](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1546,7 +1338,7 @@ func MaxUint16x8[T ~uint16](collection []T) T {
return T(maxVal)
}
// MaxUint32x4 finds the maximum value in a collection of uint32 using SSE SIMD
// MaxUint32x4 finds the maximum value in a collection of uint32 using AVX SIMD
func MaxUint32x4[T ~uint32](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1590,51 +1382,7 @@ func MaxUint32x4[T ~uint32](collection []T) T {
return T(maxVal)
}
// MaxUint64x2 returns the largest element of collection, reducing it with
// 128-bit (two-lane) Uint64x2 vectors. An empty collection yields 0.
func MaxUint64x2[T ~uint64](collection []T) T {
	n := uint(len(collection))
	if n == 0 {
		return 0
	}

	const lanes = simdLanes2
	raw := unsafeSliceUint64(collection, n)

	var (
		best   uint64
		seeded bool
	)

	pos := uint(0)
	if pos+lanes <= n {
		// Seed the accumulator with the first full chunk, then fold in the rest.
		acc := archsimd.LoadUint64x2Slice(raw[pos : pos+lanes])
		for pos += lanes; pos+lanes <= n; pos += lanes {
			acc = acc.Max(archsimd.LoadUint64x2Slice(raw[pos : pos+lanes]))
		}

		// Collapse the two lanes into a single scalar.
		var lanesOut [lanes]uint64
		acc.Store(&lanesOut)
		best = max(lanesOut[0], lanesOut[1])
		seeded = true
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch cur := uint64(collection[pos]); {
		case !seeded:
			best, seeded = cur, true
		case cur > best:
			best = cur
		}
	}
	return T(best)
}
// MaxFloat32x4 finds the maximum value in a collection of float32 using SSE SIMD
// MaxFloat32x4 finds the maximum value in a collection of float32 using AVX SIMD
func MaxFloat32x4[T ~float32](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1678,7 +1426,7 @@ func MaxFloat32x4[T ~float32](collection []T) T {
return T(maxVal)
}
// MaxFloat64x2 finds the maximum value in a collection of float64 using SSE SIMD
// MaxFloat64x2 finds the maximum value in a collection of float64 using AVX SIMD
func MaxFloat64x2[T ~float64](collection []T) T {
length := uint(len(collection))
if length == 0 {
@@ -1722,127 +1470,127 @@ func MaxFloat64x2[T ~float64](collection []T) T {
return T(maxVal)
}
// SSE (128-bit) SIMD sumBy functions - 16/8/4/2 lanes
// AVX (128-bit) SIMD sumBy functions - 16/8/4/2 lanes
// These implementations use lo.Map to apply the iteratee, then chain with SIMD sum functions.
// SumByInt8x16 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByInt8x16 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumInt8x16(mapped)
}
// SumByInt16x8 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByInt16x8 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumInt16x8(mapped)
}
// SumByInt32x4 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByInt32x4 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumInt32x4(mapped)
}
// SumByInt64x2 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByInt64x2 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumInt64x2(mapped)
}
// SumByUint8x16 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByUint8x16 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumUint8x16(mapped)
}
// SumByUint16x8 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByUint16x8 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumUint16x8(mapped)
}
// SumByUint32x4 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByUint32x4 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumUint32x4(mapped)
}
// SumByUint64x2 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByUint64x2 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumUint64x2(mapped)
}
// SumByFloat32x4 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByFloat32x4 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumFloat32x4(mapped)
}
// SumByFloat64x2 sums the values extracted by iteratee from a slice using SSE SIMD.
// SumByFloat64x2 sums the values extracted by iteratee from a slice using AVX SIMD.
func SumByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return SumFloat64x2(mapped)
}
// SSE (128-bit) SIMD meanBy functions - 16/8/4/2 lanes
// AVX (128-bit) SIMD meanBy functions - 16/8/4/2 lanes
// These implementations use lo.Map to apply the iteratee, then chain with SIMD mean functions.
// MeanByInt8x16 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByInt8x16 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanInt8x16(mapped)
}
// MeanByInt16x8 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByInt16x8 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanInt16x8(mapped)
}
// MeanByInt32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByInt32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanInt32x4(mapped)
}
// MeanByInt64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByInt64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanInt64x2(mapped)
}
// MeanByUint8x16 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByUint8x16 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanUint8x16(mapped)
}
// MeanByUint16x8 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByUint16x8 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanUint16x8(mapped)
}
// MeanByUint32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByUint32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanUint32x4(mapped)
}
// MeanByUint64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByUint64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanUint64x2(mapped)
}
// MeanByFloat32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByFloat32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanFloat32x4(mapped)
}
// MeanByFloat64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
// MeanByFloat64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
func MeanByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R {
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
return MeanFloat64x2(mapped)
+258
View File
@@ -566,6 +566,84 @@ func ClampInt32x16[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice {
return result
}
// ClampInt64x2 returns a new slice in which every element of collection is limited
// to the range [min, max], handling two int64 lanes per step with AVX-512 SIMD.
// An empty collection is returned as is.
// Int64x2 Min/Max operations in archsimd require AVX-512 (VPMAXSQ/VPMINSQ).
func ClampInt64x2[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
	n := uint(len(collection))
	if n == 0 {
		return collection
	}

	const lanes = simdLanes2
	var (
		out     = make(Slice, n)
		raw     = unsafeSliceInt64(collection, n)
		lowVec  = archsimd.BroadcastInt64x2(int64(min))
		highVec = archsimd.BroadcastInt64x2(int64(max))
	)

	// Vector pass: raise each chunk to the lower bound, then cap it at the upper bound.
	pos := uint(0)
	for pos+lanes <= n {
		chunk := archsimd.LoadInt64x2Slice(raw[pos : pos+lanes])
		raised := chunk.Max(lowVec)
		bounded := raised.Min(highVec)
		// bearer:disable go_gosec_unsafe_unsafe
		bounded.Store((*[lanes]int64)(unsafe.Pointer(&out[pos])))
		pos += lanes
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch x := collection[pos]; {
		case x < min:
			out[pos] = min
		case x > max:
			out[pos] = max
		default:
			out[pos] = x
		}
	}
	return out
}
// ClampUint64x2 returns a new slice in which every element of collection is limited
// to the range [min, max], handling two uint64 lanes per step with AVX-512 SIMD.
// An empty collection is returned as is.
// Uint64x2 Min/Max operations in archsimd require AVX-512.
func ClampUint64x2[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice {
	n := uint(len(collection))
	if n == 0 {
		return collection
	}

	const lanes = simdLanes2
	var (
		out     = make(Slice, n)
		raw     = unsafeSliceUint64(collection, n)
		lowVec  = archsimd.BroadcastUint64x2(uint64(min))
		highVec = archsimd.BroadcastUint64x2(uint64(max))
	)

	// Vector pass: raise each chunk to the lower bound, then cap it at the upper bound.
	pos := uint(0)
	for pos+lanes <= n {
		chunk := archsimd.LoadUint64x2Slice(raw[pos : pos+lanes])
		raised := chunk.Max(lowVec)
		bounded := raised.Min(highVec)
		// bearer:disable go_gosec_unsafe_unsafe
		bounded.Store((*[lanes]uint64)(unsafe.Pointer(&out[pos])))
		pos += lanes
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch x := collection[pos]; {
		case x < min:
			out[pos] = min
		case x > max:
			out[pos] = max
		default:
			out[pos] = x
		}
	}
	return out
}
// ClampInt64x8 clamps each element in collection between min and max values using AVX-512 SIMD
func ClampInt64x8[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
length := uint(len(collection))
@@ -991,6 +1069,96 @@ func MinInt32x16[T ~int32](collection []T) T {
return T(minVal)
}
// MinInt64x2 returns the smallest element of collection, reducing two int64
// lanes at a time with AVX-512 SIMD. An empty collection yields 0.
// Int64x2 Min operations in archsimd require AVX-512.
func MinInt64x2[T ~int64](collection []T) T {
	n := uint(len(collection))
	if n == 0 {
		return 0
	}

	const lanes = simdLanes2
	raw := unsafeSliceInt64(collection, n)

	var (
		best   int64
		seeded bool
	)

	pos := uint(0)
	if pos+lanes <= n {
		// Seed the accumulator with the first full chunk, then fold in the rest.
		acc := archsimd.LoadInt64x2Slice(raw[pos : pos+lanes])
		for pos += lanes; pos+lanes <= n; pos += lanes {
			acc = acc.Min(archsimd.LoadInt64x2Slice(raw[pos : pos+lanes]))
		}

		// Collapse the two lanes into a single scalar.
		var lanesOut [lanes]int64
		acc.Store(&lanesOut)
		best = min(lanesOut[0], lanesOut[1])
		seeded = true
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch cur := int64(collection[pos]); {
		case !seeded:
			best, seeded = cur, true
		case cur < best:
			best = cur
		}
	}
	return T(best)
}
// MinUint64x2 returns the smallest element of collection, reducing two uint64
// lanes at a time with AVX-512 SIMD. An empty collection yields 0.
// Uint64x2 Min operations in archsimd require AVX-512.
func MinUint64x2[T ~uint64](collection []T) T {
	n := uint(len(collection))
	if n == 0 {
		return 0
	}

	const lanes = simdLanes2
	raw := unsafeSliceUint64(collection, n)

	var (
		best   uint64
		seeded bool
	)

	pos := uint(0)
	if pos+lanes <= n {
		// Seed the accumulator with the first full chunk, then fold in the rest.
		acc := archsimd.LoadUint64x2Slice(raw[pos : pos+lanes])
		for pos += lanes; pos+lanes <= n; pos += lanes {
			acc = acc.Min(archsimd.LoadUint64x2Slice(raw[pos : pos+lanes]))
		}

		// Collapse the two lanes into a single scalar.
		var lanesOut [lanes]uint64
		acc.Store(&lanesOut)
		best = min(lanesOut[0], lanesOut[1])
		seeded = true
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch cur := uint64(collection[pos]); {
		case !seeded:
			best, seeded = cur, true
		case cur < best:
			best = cur
		}
	}
	return T(best)
}
// MinInt64x8 finds the minimum value in a collection of int64 using AVX-512 SIMD
func MinInt64x8[T ~int64](collection []T) T {
length := uint(len(collection))
@@ -1478,6 +1646,96 @@ func MaxInt32x16[T ~int32](collection []T) T {
return T(maxVal)
}
// MaxInt64x2 returns the largest element of collection, reducing two int64
// lanes at a time with AVX-512 SIMD. An empty collection yields 0.
// Int64x2 Max operations in archsimd require AVX-512.
func MaxInt64x2[T ~int64](collection []T) T {
	n := uint(len(collection))
	if n == 0 {
		return 0
	}

	const lanes = simdLanes2
	raw := unsafeSliceInt64(collection, n)

	var (
		best   int64
		seeded bool
	)

	pos := uint(0)
	if pos+lanes <= n {
		// Seed the accumulator with the first full chunk, then fold in the rest.
		acc := archsimd.LoadInt64x2Slice(raw[pos : pos+lanes])
		for pos += lanes; pos+lanes <= n; pos += lanes {
			acc = acc.Max(archsimd.LoadInt64x2Slice(raw[pos : pos+lanes]))
		}

		// Collapse the two lanes into a single scalar.
		var lanesOut [lanes]int64
		acc.Store(&lanesOut)
		best = max(lanesOut[0], lanesOut[1])
		seeded = true
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch cur := int64(collection[pos]); {
		case !seeded:
			best, seeded = cur, true
		case cur > best:
			best = cur
		}
	}
	return T(best)
}
// MaxUint64x2 returns the largest element of collection, reducing two uint64
// lanes at a time with AVX-512 SIMD. An empty collection yields 0.
// Uint64x2 Max operations in archsimd require AVX-512.
func MaxUint64x2[T ~uint64](collection []T) T {
	n := uint(len(collection))
	if n == 0 {
		return 0
	}

	const lanes = simdLanes2
	raw := unsafeSliceUint64(collection, n)

	var (
		best   uint64
		seeded bool
	)

	pos := uint(0)
	if pos+lanes <= n {
		// Seed the accumulator with the first full chunk, then fold in the rest.
		acc := archsimd.LoadUint64x2Slice(raw[pos : pos+lanes])
		for pos += lanes; pos+lanes <= n; pos += lanes {
			acc = acc.Max(archsimd.LoadUint64x2Slice(raw[pos : pos+lanes]))
		}

		// Collapse the two lanes into a single scalar.
		var lanesOut [lanes]uint64
		acc.Store(&lanesOut)
		best = max(lanesOut[0], lanesOut[1])
		seeded = true
	}

	// Scalar pass over whatever did not fill a whole vector.
	for ; pos < n; pos++ {
		switch cur := uint64(collection[pos]); {
		case !seeded:
			best, seeded = cur, true
		case cur > best:
			best = cur
		}
	}
	return T(best)
}
// MaxInt64x8 finds the maximum value in a collection of int64 using AVX-512 SIMD
func MaxInt64x8[T ~int64](collection []T) T {
length := uint(len(collection))
+224
View File
@@ -819,6 +819,55 @@ func TestClampInt32x16(t *testing.T) {
}
}
// TestClampInt64x2 checks ClampInt64x2 element by element against a scalar
// clamp, for empty, single-element, exact-width, odd-length and large inputs.
func TestClampInt64x2(t *testing.T) {
	requireAVX512(t)

	testCases := []struct {
		name  string
		input []int64
		min   int64
		max   int64
	}{
		{"empty", []int64{}, -100, 100},
		{"single", []int64{42}, -10, 10},
		{"small", []int64{1, 2, 3, 4, 5}, 2, 4},
		{"exactly 2", []int64{-100, 200}, -50, 50},
		{"large", make([]int64, 1000), -50, 50},
		{"all below min", []int64{-1000, -2000, -3000}, -500, 100},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// The "large" case is allocated zeroed; fill it with random values.
			if len(tc.input) > 6 && tc.input[0] == 0 {
				for i := range tc.input {
					tc.input[i] = rand.Int64()
				}
			}

			got := ClampInt64x2(tc.input, tc.min, tc.max)
			// Stop on a length mismatch: the loop below indexes tc.input with
			// positions of got and would panic if got were the longer slice.
			if len(got) != len(tc.input) {
				t.Fatalf("ClampInt64x2() returned length %d, want %d", len(got), len(tc.input))
			}

			for i, v := range got {
				if v < tc.min || v > tc.max {
					t.Errorf("ClampInt64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
				}

				// Scalar reference result for this element.
				original := tc.input[i]
				expected := original
				switch {
				case expected < tc.min:
					expected = tc.min
				case expected > tc.max:
					expected = tc.max
				}
				if v != expected {
					t.Errorf("ClampInt64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
				}
			}
		})
	}
}
func TestClampInt64x8(t *testing.T) {
requireAVX512(t)
testCases := []struct {
@@ -1018,6 +1067,55 @@ func TestClampUint32x16(t *testing.T) {
}
}
// TestClampUint64x2 checks ClampUint64x2 element by element against a scalar
// clamp, for empty, single-element, exact-width, odd-length and large inputs.
func TestClampUint64x2(t *testing.T) {
	requireAVX512(t)

	testCases := []struct {
		name  string
		input []uint64
		min   uint64
		max   uint64
	}{
		{"empty", []uint64{}, 100, 1000},
		{"single", []uint64{42}, 10, 100},
		{"small", []uint64{1, 2, 3, 4, 5}, 2, 4},
		{"exactly 2", []uint64{50, 2000}, 100, 1000},
		{"large", make([]uint64, 1000), 500, 5000},
		{"all below min", []uint64{1, 2, 3}, 10, 100},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// The "large" case is allocated zeroed; fill it with random values.
			if len(tc.input) > 6 && tc.input[0] == 0 {
				for i := range tc.input {
					tc.input[i] = rand.Uint64()
				}
			}

			got := ClampUint64x2(tc.input, tc.min, tc.max)
			// Stop on a length mismatch: the loop below indexes tc.input with
			// positions of got and would panic if got were the longer slice.
			if len(got) != len(tc.input) {
				t.Fatalf("ClampUint64x2() returned length %d, want %d", len(got), len(tc.input))
			}

			for i, v := range got {
				if v < tc.min || v > tc.max {
					t.Errorf("ClampUint64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
				}

				// Scalar reference result for this element.
				original := tc.input[i]
				expected := original
				switch {
				case expected < tc.min:
					expected = tc.min
				case expected > tc.max:
					expected = tc.max
				}
				if v != expected {
					t.Errorf("ClampUint64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
				}
			}
		})
	}
}
func TestClampUint64x8(t *testing.T) {
requireAVX512(t)
testCases := []struct {
@@ -1292,6 +1390,38 @@ func TestMinInt32x16(t *testing.T) {
}
}
// TestMinInt64x2 compares MinInt64x2 with lo.Min over empty, short, exact-width,
// large and negative inputs.
func TestMinInt64x2(t *testing.T) {
	requireAVX512(t)

	cases := []struct {
		name  string
		input []int64
	}{
		{"empty", []int64{}},
		{"single", []int64{42}},
		{"small", []int64{1, 2, 3, 4, 5}},
		{"exactly 2", []int64{1, 2}},
		{"large", make([]int64, 1000)},
		{"negative", []int64{-1, -2, -3, 4, 5}},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// A long input that still starts with zero is the zeroed
			// placeholder; fill it with random values.
			if len(c.input) > 6 && c.input[0] == 0 {
				for idx := range c.input {
					c.input[idx] = rand.Int64()
				}
			}

			if got, want := MinInt64x2(c.input), lo.Min(c.input); got != want {
				t.Errorf("MinInt64x2() = %v, want %v", got, want)
			}
		})
	}
}
func TestMinInt64x8(t *testing.T) {
requireAVX512(t)
testCases := []struct {
@@ -1419,6 +1549,37 @@ func TestMinUint32x16(t *testing.T) {
}
}
// TestMinUint64x2 compares MinUint64x2 with lo.Min over empty, short,
// exact-width and large inputs.
func TestMinUint64x2(t *testing.T) {
	requireAVX512(t)

	cases := []struct {
		name  string
		input []uint64
	}{
		{"empty", []uint64{}},
		{"single", []uint64{42}},
		{"small", []uint64{1, 2, 3, 4, 5}},
		{"exactly 2", []uint64{1, 2}},
		{"large", make([]uint64, 1000)},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// A long input that still starts with zero is the zeroed
			// placeholder; fill it with random values.
			if len(c.input) > 6 && c.input[0] == 0 {
				for idx := range c.input {
					c.input[idx] = rand.Uint64()
				}
			}

			if got, want := MinUint64x2(c.input), lo.Min(c.input); got != want {
				t.Errorf("MinUint64x2() = %v, want %v", got, want)
			}
		})
	}
}
func TestMinUint64x8(t *testing.T) {
requireAVX512(t)
testCases := []struct {
@@ -1625,6 +1786,38 @@ func TestMaxInt32x16(t *testing.T) {
}
}
// TestMaxInt64x2 compares MaxInt64x2 with lo.Max over empty, short, exact-width,
// large and negative inputs.
func TestMaxInt64x2(t *testing.T) {
	requireAVX512(t)

	cases := []struct {
		name  string
		input []int64
	}{
		{"empty", []int64{}},
		{"single", []int64{42}},
		{"small", []int64{1, 2, 3, 4, 5}},
		{"exactly 2", []int64{1, 2}},
		{"large", make([]int64, 1000)},
		{"negative", []int64{-1, -2, -3, 4, 5}},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// A long input that still starts with zero is the zeroed
			// placeholder; fill it with random values.
			if len(c.input) > 6 && c.input[0] == 0 {
				for idx := range c.input {
					c.input[idx] = rand.Int64()
				}
			}

			if got, want := MaxInt64x2(c.input), lo.Max(c.input); got != want {
				t.Errorf("MaxInt64x2() = %v, want %v", got, want)
			}
		})
	}
}
func TestMaxInt64x8(t *testing.T) {
requireAVX512(t)
testCases := []struct {
@@ -1752,6 +1945,37 @@ func TestMaxUint32x16(t *testing.T) {
}
}
// TestMaxUint64x2 compares MaxUint64x2 with lo.Max over empty, short,
// exact-width and large inputs.
func TestMaxUint64x2(t *testing.T) {
	requireAVX512(t)

	cases := []struct {
		name  string
		input []uint64
	}{
		{"empty", []uint64{}},
		{"single", []uint64{42}},
		{"small", []uint64{1, 2, 3, 4, 5}},
		{"exactly 2", []uint64{1, 2}},
		{"large", make([]uint64, 1000)},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// A long input that still starts with zero is the zeroed
			// placeholder; fill it with random values.
			if len(c.input) > 6 && c.input[0] == 0 {
				for idx := range c.input {
					c.input[idx] = rand.Uint64()
				}
			}

			if got, want := MaxUint64x2(c.input), lo.Max(c.input); got != want {
				t.Errorf("MaxUint64x2() = %v, want %v", got, want)
			}
		})
	}
}
func TestMaxUint64x8(t *testing.T) {
requireAVX512(t)
testCases := []struct {
@@ -10,6 +10,7 @@ import (
)
func TestSumInt8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int8
@@ -42,6 +43,7 @@ func TestSumInt8x16(t *testing.T) {
}
func TestSumInt16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int16
@@ -74,6 +76,7 @@ func TestSumInt16x8(t *testing.T) {
}
func TestSumInt32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int32
@@ -105,6 +108,7 @@ func TestSumInt32x4(t *testing.T) {
}
func TestSumInt64x2(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int64
@@ -136,6 +140,7 @@ func TestSumInt64x2(t *testing.T) {
}
func TestSumUint8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint8
@@ -167,6 +172,7 @@ func TestSumUint8x16(t *testing.T) {
}
func TestSumUint16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint16
@@ -198,6 +204,7 @@ func TestSumUint16x8(t *testing.T) {
}
func TestSumUint32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint32
@@ -228,6 +235,7 @@ func TestSumUint32x4(t *testing.T) {
}
func TestSumUint64x2(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint64
@@ -258,6 +266,7 @@ func TestSumUint64x2(t *testing.T) {
}
func TestSumFloat32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float32
@@ -291,6 +300,7 @@ func TestSumFloat32x4(t *testing.T) {
}
func TestSumFloat64x2(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float64
@@ -323,7 +333,8 @@ func TestSumFloat64x2(t *testing.T) {
}
// Test type aliases work correctly
func TestSSETypeAlias(t *testing.T) {
func TestAVXTypeAlias(t *testing.T) {
requireAVX(t)
input := []myInt8{1, 2, 3, 4, 5}
got := SumInt8x16(input)
want := lo.Sum(input)
@@ -334,6 +345,7 @@ func TestSSETypeAlias(t *testing.T) {
}
func TestClampInt8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int8
@@ -385,6 +397,7 @@ func TestClampInt8x16(t *testing.T) {
}
func TestClampInt16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int16
@@ -434,6 +447,7 @@ func TestClampInt16x8(t *testing.T) {
}
func TestClampInt32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int32
@@ -481,56 +495,8 @@ func TestClampInt32x4(t *testing.T) {
}
}
// TestClampInt64x2 checks ClampInt64x2 element by element against a scalar
// clamp, for empty, single-element, exact-width, odd-length and large inputs.
func TestClampInt64x2(t *testing.T) {
	requireAVX512(t)

	testCases := []struct {
		name  string
		input []int64
		min   int64
		max   int64
	}{
		{"empty", []int64{}, -100, 100},
		{"single", []int64{42}, -10, 10},
		{"small", []int64{1, 2, 3, 4, 5}, 2, 4},
		{"exactly 2", []int64{-100, 200}, -50, 50},
		{"large", make([]int64, 1000), -50, 50},
		{"all below min", []int64{-1000, -2000, -3000}, -500, 100},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// The "large" case is allocated zeroed; fill it with random values.
			if len(tc.input) > 6 && tc.input[0] == 0 {
				for i := range tc.input {
					tc.input[i] = rand.Int64()
				}
			}

			got := ClampInt64x2(tc.input, tc.min, tc.max)
			// Stop on a length mismatch: the loop below indexes tc.input with
			// positions of got and would panic if got were the longer slice.
			if len(got) != len(tc.input) {
				t.Fatalf("ClampInt64x2() returned length %d, want %d", len(got), len(tc.input))
			}

			for i, v := range got {
				if v < tc.min || v > tc.max {
					t.Errorf("ClampInt64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
				}

				// Scalar reference result for this element.
				original := tc.input[i]
				expected := original
				switch {
				case expected < tc.min:
					expected = tc.min
				case expected > tc.max:
					expected = tc.max
				}
				if v != expected {
					t.Errorf("ClampInt64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
				}
			}
		})
	}
}
func TestClampUint8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint8
@@ -581,6 +547,7 @@ func TestClampUint8x16(t *testing.T) {
}
func TestClampUint16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint16
@@ -630,6 +597,7 @@ func TestClampUint16x8(t *testing.T) {
}
func TestClampUint32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint32
@@ -677,56 +645,8 @@ func TestClampUint32x4(t *testing.T) {
}
}
// TestClampUint64x2 checks that ClampUint64x2 returns a slice with the same
// length as its input and that every returned element equals the matching
// input element limited to the inclusive range [min, max].
func TestClampUint64x2(t *testing.T) {
	requireAVX512(t) // NOTE(review): presumably skips when AVX-512 is unavailable — confirm in the helper
	testCases := []struct {
		name  string
		input []uint64
		min   uint64
		max   uint64
	}{
		{"empty", []uint64{}, 100, 1000},
		{"single", []uint64{42}, 10, 100},
		{"small", []uint64{1, 2, 3, 4, 5}, 2, 4},
		{"exactly 2", []uint64{50, 2000}, 100, 1000},
		{"large", make([]uint64, 1000), 500, 5000},
		{"all below min", []uint64{1, 2, 3}, 10, 100},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// A slice longer than six elements that still starts with zero is
			// the untouched make() placeholder (only the "large" case in the
			// table above); fill it with random values before clamping.
			if len(tc.input) > 6 && tc.input[0] == 0 {
				for i := range tc.input {
					tc.input[i] = rand.Uint64()
				}
			}

			got := ClampUint64x2(tc.input, tc.min, tc.max)
			if len(got) != len(tc.input) {
				// Abort the subtest: the loop below indexes tc.input with the
				// indices of got and would panic if got were longer.
				t.Fatalf("ClampUint64x2() returned length %d, want %d", len(got), len(tc.input))
			}

			for i, v := range got {
				if v < tc.min || v > tc.max {
					t.Errorf("ClampUint64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
				}

				// Reference result computed with plain scalar comparisons.
				original := tc.input[i]
				expected := original
				if expected < tc.min {
					expected = tc.min
				} else if expected > tc.max {
					expected = tc.max
				}
				if v != expected {
					t.Errorf("ClampUint64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
				}
			}
		})
	}
}
func TestClampFloat32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float32
@@ -778,6 +698,7 @@ func TestClampFloat32x4(t *testing.T) {
}
func TestClampFloat64x2(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float64
@@ -829,7 +750,8 @@ func TestClampFloat64x2(t *testing.T) {
}
// Test type aliases work correctly
func TestSSEClampTypeAlias(t *testing.T) {
func TestAVXClampTypeAlias(t *testing.T) {
requireAVX(t)
input := []myInt8{-5, 0, 10, 15, 20}
min := myInt8(0)
max := myInt8(10)
@@ -853,6 +775,7 @@ func TestSSEClampTypeAlias(t *testing.T) {
}
func TestMeanInt8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int8
@@ -884,6 +807,7 @@ func TestMeanInt8x16(t *testing.T) {
}
func TestMeanInt16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int16
@@ -915,6 +839,7 @@ func TestMeanInt16x8(t *testing.T) {
}
func TestMeanInt32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int32
@@ -946,6 +871,7 @@ func TestMeanInt32x4(t *testing.T) {
}
func TestMeanInt64x2(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int64
@@ -977,6 +903,7 @@ func TestMeanInt64x2(t *testing.T) {
}
func TestMeanUint8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint8
@@ -1008,6 +935,7 @@ func TestMeanUint8x16(t *testing.T) {
}
func TestMeanUint16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint16
@@ -1039,6 +967,7 @@ func TestMeanUint16x8(t *testing.T) {
}
func TestMeanUint32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint32
@@ -1069,6 +998,7 @@ func TestMeanUint32x4(t *testing.T) {
}
func TestMeanUint64x2(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint64
@@ -1099,6 +1029,7 @@ func TestMeanUint64x2(t *testing.T) {
}
func TestMeanFloat32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float32
@@ -1132,6 +1063,7 @@ func TestMeanFloat32x4(t *testing.T) {
}
func TestMeanFloat64x2(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float64
@@ -1164,7 +1096,8 @@ func TestMeanFloat64x2(t *testing.T) {
}
// Test type aliases work correctly
func TestSSEMeanTypeAlias(t *testing.T) {
func TestAVXMeanTypeAlias(t *testing.T) {
requireAVX(t)
input := []myInt8{1, 2, 3, 4, 5}
got := MeanInt8x16(input)
want := lo.Mean(input)
@@ -1175,6 +1108,7 @@ func TestSSEMeanTypeAlias(t *testing.T) {
}
func TestMinInt8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int8
@@ -1206,6 +1140,7 @@ func TestMinInt8x16(t *testing.T) {
}
func TestMinInt16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int16
@@ -1237,6 +1172,7 @@ func TestMinInt16x8(t *testing.T) {
}
func TestMinInt32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int32
@@ -1267,39 +1203,8 @@ func TestMinInt32x4(t *testing.T) {
}
}
// TestMinInt64x2 compares MinInt64x2 with lo.Min over a table of int64
// slices: empty, single element, a few elements, exactly two elements, a
// 1000-element slice and a slice containing negative values.
func TestMinInt64x2(t *testing.T) {
	requireAVX512(t)

	type scenario struct {
		name  string
		input []int64
	}
	scenarios := []scenario{
		{name: "empty", input: []int64{}},
		{name: "single", input: []int64{42}},
		{name: "small", input: []int64{1, 2, 3, 4, 5}},
		{name: "exactly 2", input: []int64{1, 2}},
		{name: "large", input: make([]int64, 1000)},
		{name: "negative", input: []int64{-1, -2, -3, 4, 5}},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			// Only the zero-filled "large" slice matches this condition; it is
			// populated with random values right before the comparison.
			if placeholder := len(sc.input) > 6 && sc.input[0] == 0; placeholder {
				for idx := range sc.input {
					sc.input[idx] = rand.Int64()
				}
			}

			if got, want := MinInt64x2(sc.input), lo.Min(sc.input); got != want {
				t.Errorf("MinInt64x2() = %v, want %v", got, want)
			}
		})
	}
}
func TestMinUint8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint8
@@ -1331,6 +1236,7 @@ func TestMinUint8x16(t *testing.T) {
}
func TestMinUint16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint16
@@ -1362,6 +1268,7 @@ func TestMinUint16x8(t *testing.T) {
}
func TestMinUint32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint32
@@ -1391,38 +1298,8 @@ func TestMinUint32x4(t *testing.T) {
}
}
// TestMinUint64x2 compares MinUint64x2 with lo.Min over a table of uint64
// slices: empty, single element, a few elements, exactly two elements and a
// 1000-element slice.
func TestMinUint64x2(t *testing.T) {
	requireAVX512(t)

	type scenario struct {
		name  string
		input []uint64
	}
	scenarios := []scenario{
		{name: "empty", input: []uint64{}},
		{name: "single", input: []uint64{42}},
		{name: "small", input: []uint64{1, 2, 3, 4, 5}},
		{name: "exactly 2", input: []uint64{1, 2}},
		{name: "large", input: make([]uint64, 1000)},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			// Only the zero-filled "large" slice matches this condition; it is
			// populated with random values right before the comparison.
			if placeholder := len(sc.input) > 6 && sc.input[0] == 0; placeholder {
				for idx := range sc.input {
					sc.input[idx] = rand.Uint64()
				}
			}

			if got, want := MinUint64x2(sc.input), lo.Min(sc.input); got != want {
				t.Errorf("MinUint64x2() = %v, want %v", got, want)
			}
		})
	}
}
func TestMinFloat32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float32
@@ -1456,6 +1333,7 @@ func TestMinFloat32x4(t *testing.T) {
}
func TestMinFloat64x2(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float64
@@ -1488,7 +1366,8 @@ func TestMinFloat64x2(t *testing.T) {
}
// Test type aliases work correctly
func TestSSEMinTypeAlias(t *testing.T) {
func TestAVXMinTypeAlias(t *testing.T) {
requireAVX(t)
input := []myInt8{5, 2, 8, 1, 9}
got := MinInt8x16(input)
want := myInt8(1)
@@ -1499,6 +1378,7 @@ func TestSSEMinTypeAlias(t *testing.T) {
}
func TestMaxInt8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int8
@@ -1530,6 +1410,7 @@ func TestMaxInt8x16(t *testing.T) {
}
func TestMaxInt16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int16
@@ -1561,6 +1442,7 @@ func TestMaxInt16x8(t *testing.T) {
}
func TestMaxInt32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []int32
@@ -1591,39 +1473,8 @@ func TestMaxInt32x4(t *testing.T) {
}
}
// TestMaxInt64x2 compares MaxInt64x2 with lo.Max over a table of int64
// slices: empty, single element, a few elements, exactly two elements, a
// 1000-element slice and a slice containing negative values.
func TestMaxInt64x2(t *testing.T) {
	requireAVX512(t)

	type scenario struct {
		name  string
		input []int64
	}
	scenarios := []scenario{
		{name: "empty", input: []int64{}},
		{name: "single", input: []int64{42}},
		{name: "small", input: []int64{1, 2, 3, 4, 5}},
		{name: "exactly 2", input: []int64{1, 2}},
		{name: "large", input: make([]int64, 1000)},
		{name: "negative", input: []int64{-1, -2, -3, 4, 5}},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			// Only the zero-filled "large" slice matches this condition; it is
			// populated with random values right before the comparison.
			if placeholder := len(sc.input) > 6 && sc.input[0] == 0; placeholder {
				for idx := range sc.input {
					sc.input[idx] = rand.Int64()
				}
			}

			if got, want := MaxInt64x2(sc.input), lo.Max(sc.input); got != want {
				t.Errorf("MaxInt64x2() = %v, want %v", got, want)
			}
		})
	}
}
func TestMaxUint8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint8
@@ -1655,6 +1506,7 @@ func TestMaxUint8x16(t *testing.T) {
}
func TestMaxUint16x8(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint16
@@ -1686,6 +1538,7 @@ func TestMaxUint16x8(t *testing.T) {
}
func TestMaxUint32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []uint32
@@ -1715,38 +1568,8 @@ func TestMaxUint32x4(t *testing.T) {
}
}
// TestMaxUint64x2 compares MaxUint64x2 with lo.Max over a table of uint64
// slices: empty, single element, a few elements, exactly two elements and a
// 1000-element slice.
func TestMaxUint64x2(t *testing.T) {
	requireAVX512(t)

	type scenario struct {
		name  string
		input []uint64
	}
	scenarios := []scenario{
		{name: "empty", input: []uint64{}},
		{name: "single", input: []uint64{42}},
		{name: "small", input: []uint64{1, 2, 3, 4, 5}},
		{name: "exactly 2", input: []uint64{1, 2}},
		{name: "large", input: make([]uint64, 1000)},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			// Only the zero-filled "large" slice matches this condition; it is
			// populated with random values right before the comparison.
			if placeholder := len(sc.input) > 6 && sc.input[0] == 0; placeholder {
				for idx := range sc.input {
					sc.input[idx] = rand.Uint64()
				}
			}

			if got, want := MaxUint64x2(sc.input), lo.Max(sc.input); got != want {
				t.Errorf("MaxUint64x2() = %v, want %v", got, want)
			}
		})
	}
}
func TestMaxFloat32x4(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float32
@@ -1780,6 +1603,7 @@ func TestMaxFloat32x4(t *testing.T) {
}
func TestMaxFloat64x2(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []float64
@@ -1812,7 +1636,8 @@ func TestMaxFloat64x2(t *testing.T) {
}
// Test type aliases work correctly
func TestSSEMaxTypeAlias(t *testing.T) {
func TestAVXMaxTypeAlias(t *testing.T) {
requireAVX(t)
input := []myInt8{5, 2, 8, 1, 9}
got := MaxInt8x16(input)
want := myInt8(9)
@@ -1831,6 +1656,7 @@ type item struct {
}
func TestSumByInt8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []item
@@ -1863,6 +1689,7 @@ func TestSumByInt8x16(t *testing.T) {
}
func TestSumByInt16x8(t *testing.T) {
requireAVX(t)
type itemInt16 struct {
Value int16
}
@@ -1898,6 +1725,7 @@ func TestSumByInt16x8(t *testing.T) {
}
func TestSumByInt32x4(t *testing.T) {
requireAVX(t)
type itemInt32 struct {
Value int32
}
@@ -1933,6 +1761,7 @@ func TestSumByInt32x4(t *testing.T) {
}
func TestSumByInt64x2(t *testing.T) {
requireAVX(t)
type itemInt64 struct {
Value int64
}
@@ -1968,6 +1797,7 @@ func TestSumByInt64x2(t *testing.T) {
}
func TestSumByUint8x16(t *testing.T) {
requireAVX(t)
type itemUint8 struct {
Value uint8
}
@@ -2003,6 +1833,7 @@ func TestSumByUint8x16(t *testing.T) {
}
func TestSumByUint16x8(t *testing.T) {
requireAVX(t)
type itemUint16 struct {
Value uint16
}
@@ -2038,6 +1869,7 @@ func TestSumByUint16x8(t *testing.T) {
}
func TestSumByUint32x4(t *testing.T) {
requireAVX(t)
type itemUint32 struct {
Value uint32
}
@@ -2072,6 +1904,7 @@ func TestSumByUint32x4(t *testing.T) {
}
func TestSumByUint64x2(t *testing.T) {
requireAVX(t)
type itemUint64 struct {
Value uint64
}
@@ -2106,6 +1939,7 @@ func TestSumByUint64x2(t *testing.T) {
}
func TestSumByFloat32x4(t *testing.T) {
requireAVX(t)
type itemFloat32 struct {
Value float32
}
@@ -2143,6 +1977,7 @@ func TestSumByFloat32x4(t *testing.T) {
}
func TestSumByFloat64x2(t *testing.T) {
requireAVX(t)
type itemFloat64 struct {
Value float64
}
@@ -2179,7 +2014,8 @@ func TestSumByFloat64x2(t *testing.T) {
}
// Test type alias works correctly for SumBy
func TestSSESumByTypeAlias(t *testing.T) {
func TestAVXSumByTypeAlias(t *testing.T) {
requireAVX(t)
type myItem struct {
Value myInt8
}
@@ -2196,6 +2032,7 @@ func TestSSESumByTypeAlias(t *testing.T) {
// MeanBy tests
func TestMeanByInt8x16(t *testing.T) {
requireAVX(t)
testCases := []struct {
name string
input []item
@@ -2227,6 +2064,7 @@ func TestMeanByInt8x16(t *testing.T) {
}
func TestMeanByInt16x8(t *testing.T) {
requireAVX(t)
type itemInt16 struct {
Value int16
}
@@ -2262,6 +2100,7 @@ func TestMeanByInt16x8(t *testing.T) {
}
func TestMeanByInt32x4(t *testing.T) {
requireAVX(t)
type itemInt32 struct {
Value int32
}
@@ -2297,6 +2136,7 @@ func TestMeanByInt32x4(t *testing.T) {
}
func TestMeanByInt64x2(t *testing.T) {
requireAVX(t)
type itemInt64 struct {
Value int64
}
@@ -2332,6 +2172,7 @@ func TestMeanByInt64x2(t *testing.T) {
}
func TestMeanByUint8x16(t *testing.T) {
requireAVX(t)
type itemUint8 struct {
Value uint8
}
@@ -2367,6 +2208,7 @@ func TestMeanByUint8x16(t *testing.T) {
}
func TestMeanByUint16x8(t *testing.T) {
requireAVX(t)
type itemUint16 struct {
Value uint16
}
@@ -2402,6 +2244,7 @@ func TestMeanByUint16x8(t *testing.T) {
}
func TestMeanByUint32x4(t *testing.T) {
requireAVX(t)
type itemUint32 struct {
Value uint32
}
@@ -2436,6 +2279,7 @@ func TestMeanByUint32x4(t *testing.T) {
}
func TestMeanByUint64x2(t *testing.T) {
requireAVX(t)
type itemUint64 struct {
Value uint64
}
@@ -2470,6 +2314,7 @@ func TestMeanByUint64x2(t *testing.T) {
}
func TestMeanByFloat32x4(t *testing.T) {
requireAVX(t)
type itemFloat32 struct {
Value float32
}
@@ -2507,6 +2352,7 @@ func TestMeanByFloat32x4(t *testing.T) {
}
func TestMeanByFloat64x2(t *testing.T) {
requireAVX(t)
type itemFloat64 struct {
Value float64
}
@@ -2543,7 +2389,8 @@ func TestMeanByFloat64x2(t *testing.T) {
}
// Test type alias works correctly for MeanBy
func TestSSEMeanByTypeAlias(t *testing.T) {
func TestAVXMeanByTypeAlias(t *testing.T) {
requireAVX(t)
type myItem struct {
Value myInt8
}
+33 -17
View File
@@ -13,15 +13,15 @@ import (
// Benchmark suite for SIMD math operations compared to core lo package fallbacks.
// These benchmarks measure the performance of Sum, Mean, Min, and Max operations
// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes.
// across different SIMD implementations (AVX, AVX2, AVX512) and data sizes.
// Benchmark sizes to demonstrate performance characteristics at different scales
var benchmarkSizes = []struct {
name string
size int
}{
{"small", 8}, // Smaller than SSE width (16 lanes for int8)
{"medium", 128}, // Between SSE (16) and AVX2 (32) width for int8
{"small", 8}, // Smaller than AVX width (16 lanes for int8)
{"medium", 128}, // Between AVX (16) and AVX2 (32) width for int8
{"large", 1024}, // Well above SIMD register widths
{"xlarge", 8192}, // Large dataset for real-world performance
}
@@ -128,7 +128,8 @@ func BenchmarkSumInt8(b *testing.B) {
_ = lo.Sum(data)
}
})
b.Run("SSE-x16", func(b *testing.B) {
b.Run("AVX-x16", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = SumInt8x16(data)
@@ -162,7 +163,8 @@ func BenchmarkSumInt16(b *testing.B) {
_ = lo.Sum(data)
}
})
b.Run("SSE-x8", func(b *testing.B) {
b.Run("AVX-x8", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = SumInt16x8(data)
@@ -196,7 +198,8 @@ func BenchmarkSumInt32(b *testing.B) {
_ = lo.Sum(data)
}
})
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX-x4", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = SumInt32x4(data)
@@ -230,7 +233,8 @@ func BenchmarkSumInt64(b *testing.B) {
_ = lo.Sum(data)
}
})
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX-x2", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = SumInt64x2(data)
@@ -264,7 +268,8 @@ func BenchmarkSumFloat32(b *testing.B) {
_ = lo.Sum(data)
}
})
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX-x4", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = SumFloat32x4(data)
@@ -298,7 +303,8 @@ func BenchmarkSumFloat64(b *testing.B) {
_ = lo.Sum(data)
}
})
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX-x2", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = SumFloat64x2(data)
@@ -336,7 +342,8 @@ func BenchmarkMeanInt32(b *testing.B) {
_ = lo.Mean(data)
}
})
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX-x4", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = MeanInt32x4(data)
@@ -370,7 +377,8 @@ func BenchmarkMeanFloat64(b *testing.B) {
_ = lo.Mean(data)
}
})
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX-x2", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = MeanFloat64x2(data)
@@ -402,7 +410,8 @@ func BenchmarkMinInt32(b *testing.B) {
for _, bs := range benchmarkSizes {
b.Run(bs.name, func(b *testing.B) {
data := generateInt32(bs.size)
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX-x4", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = MinInt32x4(data)
@@ -430,7 +439,8 @@ func BenchmarkMinFloat64(b *testing.B) {
for _, bs := range benchmarkSizes {
b.Run(bs.name, func(b *testing.B) {
data := generateFloat64(bs.size)
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX-x2", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = MinFloat64x2(data)
@@ -462,7 +472,8 @@ func BenchmarkMaxInt32(b *testing.B) {
for _, bs := range benchmarkSizes {
b.Run(bs.name, func(b *testing.B) {
data := generateInt32(bs.size)
b.Run("SSE-x4", func(b *testing.B) {
b.Run("AVX-x4", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = MaxInt32x4(data)
@@ -490,7 +501,8 @@ func BenchmarkMaxFloat64(b *testing.B) {
for _, bs := range benchmarkSizes {
b.Run(bs.name, func(b *testing.B) {
data := generateFloat64(bs.size)
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX-x2", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = MaxFloat64x2(data)
@@ -528,13 +540,16 @@ func BenchmarkSumInt8ByWidth(b *testing.B) {
fn func() int8
}{
{"Fallback-lo", func() int8 { return lo.Sum(data) }},
{"SSE-x16", func() int8 { return SumInt8x16(data) }},
{"AVX-x16", func() int8 { return SumInt8x16(data) }},
{"AVX2-x32", func() int8 { return SumInt8x32(data) }},
{"AVX512-x64", func() int8 { return SumInt8x64(data) }},
}
for _, bm := range benchmarks {
b.Run(bm.name, func(b *testing.B) {
if bm.name == "AVX-x16" {
requireAVX(b)
}
if bm.name == "AVX2-x32" {
requireAVX2(b)
}
@@ -578,7 +593,8 @@ func BenchmarkSumInt64SteadyState(b *testing.B) {
_ = lo.Sum(data)
}
})
b.Run("SSE-x2", func(b *testing.B) {
b.Run("AVX-x2", func(b *testing.B) {
requireAVX(b)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = SumInt64x2(data)
+12 -10
View File
@@ -24,13 +24,15 @@ func init() {
}
// Type aliases for testing
type myInt8 int8
type myInt16 int16
type myInt32 int32
type myInt64 int64
type myUint8 uint8
type myUint16 uint16
type myUint32 uint32
type myUint64 uint64
type myFloat32 float32
type myFloat64 float64
// Distinct named types, one per builtin numeric kind. The *TypeAlias tests
// pass slices of myInt8 to the SIMD helpers; the remaining types are
// presumably used the same way elsewhere in the test suite — TODO confirm.
type (
myInt8 int8
myInt16 int16
myInt32 int32
myInt64 int64
myUint8 uint8
myUint16 uint16
myUint32 uint32
myUint64 uint64
myFloat32 float32
myFloat64 float64
)