mirror of
https://github.com/samber/lo.git
synced 2026-04-22 15:37:14 +08:00
style(simd): rename sse to avx (#821)
* style(simd): rename sse to avx * fix(exp,simd): apply the right avx512 constraints to a few methods * fix(exp,simd): apply the right avx512 constraints to a few methods
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
---
|
||||
name: Clamp
|
||||
slug: clamp
|
||||
sourceRef: exp/simd/math_sse.go#L424
|
||||
sourceRef: exp/simd/math_avx.go#L453
|
||||
category: exp
|
||||
subCategory: simd
|
||||
similarHelpers:
|
||||
@@ -51,7 +51,7 @@ Clamps each element in a collection between min and max values using SIMD instru
|
||||
|
||||
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||
| ------------ | ----- | -------------- | ------------------------------ |
|
||||
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
|
||||
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||
|
||||
@@ -70,7 +70,7 @@ result := simd.ClampFloat32x16([]float32{0.5, 1.5, 2.5, 3.5}, 1.0, 3.0)
|
||||
```
|
||||
|
||||
```go
|
||||
// Using SSE variant (8 lanes at once) - works on all amd64
|
||||
// Using AVX variant (8 lanes at once) - requires a CPU with the `avx` flag
|
||||
result := simd.ClampInt16x8([]int16{100, 150, 200, 250}, 120, 220)
|
||||
// []int16{120, 150, 200, 220}
|
||||
```
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
name: Contains
|
||||
slug: contains
|
||||
sourceRef: exp/simd/intersect_sse.go#L11
|
||||
sourceRef: exp/simd/intersect_avx512.go#L9
|
||||
category: exp
|
||||
subCategory: simd
|
||||
similarHelpers:
|
||||
@@ -51,7 +51,7 @@ Checks if a target value is present in a collection using SIMD instructions. The
|
||||
|
||||
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||
| ------------ | ----- | -------------- | ------------------------------ |
|
||||
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
|
||||
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||
|
||||
@@ -64,7 +64,7 @@ found := simd.ContainsInt8x32([]int8{1, 2, 3, 4, 5}, 3)
|
||||
```
|
||||
|
||||
```go
|
||||
// Using SSE variant (16 lanes at once) - works on all amd64
|
||||
// Using AVX variant (2 lanes at once) - requires a CPU with the `avx` flag
|
||||
found := simd.ContainsInt64x2([]int64{1000000, 2000000, 3000000}, 2000000)
|
||||
// true
|
||||
```
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
name: Max
|
||||
slug: max
|
||||
sourceRef: exp/simd/math_sse.go#L1328
|
||||
sourceRef: exp/simd/math_avx.go#L1279
|
||||
category: exp
|
||||
subCategory: simd
|
||||
similarHelpers:
|
||||
@@ -51,7 +51,7 @@ Finds the maximum value in a collection using SIMD instructions. The suffix (x2,
|
||||
|
||||
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||
| ------------ | ----- | -------------- | ------------------------------ |
|
||||
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
|
||||
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||
|
||||
@@ -70,7 +70,7 @@ max := simd.MaxFloat32x16([]float32{3.5, 1.2, 4.8, 2.1})
|
||||
```
|
||||
|
||||
```go
|
||||
// Using SSE variant (4 lanes at once) - works on all amd64
|
||||
// Using AVX variant (4 lanes at once) - requires a CPU with the `avx` flag
|
||||
max := simd.MaxInt32x4([]int32{100, 50, 200, 75})
|
||||
// 200
|
||||
```
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
name: Mean
|
||||
slug: mean
|
||||
sourceRef: exp/simd/math_sse.go#L333
|
||||
sourceRef: exp/simd/math_avx.go#L352
|
||||
category: exp
|
||||
subCategory: simd
|
||||
similarHelpers:
|
||||
@@ -52,7 +52,7 @@ Calculates the arithmetic mean of a collection using SIMD instructions. The suff
|
||||
|
||||
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||
| ------------ | ----- | -------------- | ------------------------------ |
|
||||
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
|
||||
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||
|
||||
@@ -71,7 +71,7 @@ mean := simd.MeanFloat32x16([]float32{1.0, 2.0, 3.0, 4.0})
|
||||
```
|
||||
|
||||
```go
|
||||
// Using SSE variant (8 lanes at once) - works on all amd64
|
||||
// Using AVX variant (8 lanes at once) - requires a CPU with the `avx` flag
|
||||
mean := simd.MeanInt16x8([]int16{10, 20, 30, 40})
|
||||
// 25
|
||||
```
|
||||
|
||||
@@ -62,7 +62,7 @@ MeanBy transforms a collection using an iteratee function and calculates the ari
|
||||
|
||||
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||
| ------------ | ----- | -------------- | ------------------------------ |
|
||||
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
|
||||
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||
|
||||
@@ -118,7 +118,7 @@ metrics := []Metric{
|
||||
{Value: 400},
|
||||
}
|
||||
|
||||
// Using SSE variant - works on all amd64
|
||||
// Using AVX variant - requires a CPU with the `avx` flag
|
||||
mean := simd.MeanByUint16x8(metrics, func(m Metric) uint16 {
|
||||
return m.Value
|
||||
})
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
name: Min
|
||||
slug: min
|
||||
sourceRef: exp/simd/math_sse.go#L834
|
||||
sourceRef: exp/simd/math_avx.go#L833
|
||||
category: exp
|
||||
subCategory: simd
|
||||
similarHelpers:
|
||||
@@ -51,7 +51,7 @@ Finds the minimum value in a collection using SIMD instructions. The suffix (x2,
|
||||
|
||||
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||
| ------------ | ----- | -------------- | ------------------------------ |
|
||||
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
|
||||
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||
|
||||
@@ -70,7 +70,7 @@ min := simd.MinFloat32x16([]float32{3.5, 1.2, 4.8, 2.1})
|
||||
```
|
||||
|
||||
```go
|
||||
// Using SSE variant (4 lanes at once) - works on all amd64
|
||||
// Using AVX variant (4 lanes at once) - requires a CPU with the `avx` flag
|
||||
min := simd.MinInt32x4([]int32{100, 50, 200, 75})
|
||||
// 50
|
||||
```
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
name: Sum
|
||||
slug: sum
|
||||
sourceRef: exp/simd/math_sse.go#L13
|
||||
sourceRef: exp/simd/math_avx.go#L14
|
||||
category: exp
|
||||
subCategory: simd
|
||||
similarHelpers:
|
||||
@@ -52,7 +52,7 @@ Sums the values in a collection using SIMD instructions. The suffix (x2, x4, x8,
|
||||
|
||||
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||
| ------------ | ----- | -------------- | ------------------------------ |
|
||||
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
|
||||
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||
|
||||
@@ -71,7 +71,7 @@ sum := simd.SumFloat32x16([]float32{1.1, 2.2, 3.3, 4.4})
|
||||
```
|
||||
|
||||
```go
|
||||
// Using SSE variant (4 lanes at once) - works on all amd64
|
||||
// Using AVX variant (4 lanes at once) - requires a CPU with the `avx` flag
|
||||
sum := simd.SumInt32x4([]int32{1000000, 2000000, 3000000})
|
||||
// 6000000
|
||||
```
|
||||
|
||||
@@ -62,7 +62,7 @@ SumBy transforms a collection using an iteratee function and sums the result usi
|
||||
|
||||
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||
| ------------ | ----- | -------------- | ------------------------------ |
|
||||
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||
| AVX (xN) | 2-16 | `avx` | Intel Sandy Bridge+, AMD Bulldozer+ |
|
||||
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||
|
||||
@@ -119,7 +119,7 @@ metrics := []Metric{
|
||||
{Value: 400},
|
||||
}
|
||||
|
||||
// Using SSE variant - works on all amd64
|
||||
// Using AVX variant - requires a CPU with the `avx` flag
|
||||
sum := simd.SumByUint16x8(metrics, func(m Metric) uint16 {
|
||||
return m.Value
|
||||
})
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
title: SIMD operations
|
||||
description: High-performance slice operations using SSE, AVX2 and AVX512 SIMD when built with Go 1.26+ and GOEXPERIMENT=simd on amd64.
|
||||
description: High-performance slice operations using AVX, AVX2 and AVX512 SIMD when built with Go 1.26+ and GOEXPERIMENT=simd on amd64.
|
||||
sidebar_position: 0
|
||||
hide_table_of_contents: true
|
||||
---
|
||||
@@ -14,7 +14,7 @@ Your feedback helps us improve!
|
||||
#
|
||||
## SIMD helpers
|
||||
|
||||
This page lists all operations on slices, available in the `exp/simd` sub-package. These helpers use **SSE** (128-bit), **AVX2** (256-bit) or **AVX512** (512-bit) SIMD when built with Go 1.26+, the `GOEXPERIMENT=simd` flag, and on amd64.
|
||||
This page lists all operations on slices available in the `exp/simd` sub-package. These helpers use **AVX** (128-bit), **AVX2** (256-bit) or **AVX512** (512-bit) SIMD when built for amd64 with Go 1.26+ and the `GOEXPERIMENT=simd` flag.
|
||||
|
||||
:::warning Unstable API
|
||||
SIMD helpers are experimental. The API may break in the future.
|
||||
@@ -26,7 +26,7 @@ Benchmarks show that running SIMD operators on small datasets is slower:
|
||||
|
||||
```txt
|
||||
BenchmarkSumInt8/small/Fallback-lo-4 203616572 5.875 ns/op
|
||||
BenchmarkSumInt8/small/SSE-x16-4 100000000 12.04 ns/op
|
||||
BenchmarkSumInt8/small/AVX-x16-4 100000000 12.04 ns/op
|
||||
BenchmarkSumInt8/small/AVX2-x32-4 64041816 17.93 ns/op
|
||||
BenchmarkSumInt8/small/AVX512-x64-4 26947528 44.75 ns/op
|
||||
```
|
||||
@@ -35,7 +35,7 @@ But much much faster on big datasets:
|
||||
|
||||
```txt
|
||||
BenchmarkSumInt8/xlarge/Fallback-lo-4 247677 4860 ns/op
|
||||
BenchmarkSumInt8/xlarge/SSE-x16-4 3851040 311.4 ns/op
|
||||
BenchmarkSumInt8/xlarge/AVX-x16-4 3851040 311.4 ns/op
|
||||
BenchmarkSumInt8/xlarge/AVX2-x32-4 7100002 169.2 ns/op
|
||||
BenchmarkSumInt8/xlarge/AVX512-x64-4 10107534 118.1 ns/op
|
||||
```
|
||||
|
||||
+192
-192
@@ -6,7 +6,7 @@ Benchmarks show that running SIMD operations on small datasets is slower:
|
||||
|
||||
```txt
|
||||
BenchmarkSumInt8/small/Fallback-lo-2 248740710 5.218 ns/op
|
||||
BenchmarkSumInt8/small/SSE-x16-2 126181464 9.485 ns/op
|
||||
BenchmarkSumInt8/small/AVX-x16-2 126181464 9.485 ns/op
|
||||
BenchmarkSumInt8/small/AVX2-x32-2 73059427 14.44 ns/op
|
||||
BenchmarkSumInt8/small/AVX512-x64-2 49913169 24.41 ns/op
|
||||
```
|
||||
@@ -15,7 +15,7 @@ But SIMD is much faster on large datasets:
|
||||
|
||||
```txt
|
||||
BenchmarkSumInt8/xlarge/Fallback-lo-2 273898 4383 ns/op
|
||||
BenchmarkSumInt8/xlarge/SSE-x16-2 6928408 173.1 ns/op
|
||||
BenchmarkSumInt8/xlarge/AVX-x16-2 6928408 173.1 ns/op
|
||||
BenchmarkSumInt8/xlarge/AVX2-x32-2 12639586 94.09 ns/op
|
||||
BenchmarkSumInt8/xlarge/AVX512-x64-2 13509693 89.67 ns/op
|
||||
```
|
||||
@@ -50,397 +50,397 @@ ok github.com/samber/lo/exp/simd 596.213s
|
||||
|
||||
| Benchmark | Iterations | Time/op | Bytes/op | Allocs/op |
|
||||
| ---------------------------------------------- | ---------- | ----------- | -------- | ----------- |
|
||||
| BenchmarkContainsInt8/tiny/SSE-x16-2 | 312359204 | 3.625 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/tiny/AVX2-x32-2 | 277194441 | 4.531 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/tiny/AVX512-x16-2 | 312359204 | 3.625 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/tiny/AVX512-x32-2 | 277194441 | 4.531 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/tiny/AVX512-x64-2 | 336853209 | 3.401 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/small/SSE-x16-2 | 449132103 | 2.670 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/small/AVX2-x32-2 | 148648339 | 8.332 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/small/AVX512-x16-2 | 449132103 | 2.670 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/small/AVX512-x32-2 | 148648339 | 8.332 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/small/AVX512-x64-2 | 143124861 | 7.982 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/medium/SSE-x16-2 | 276816714 | 4.302 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/medium/AVX2-x32-2 | 345774957 | 3.529 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/medium/AVX512-x16-2 | 276816714 | 4.302 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/medium/AVX512-x32-2 | 345774957 | 3.529 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/medium/AVX512-x64-2 | 449868722 | 2.669 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/large/SSE-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/large/AVX2-x32-2 | 172934200 | 6.941 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/large/AVX512-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/large/AVX512-x32-2 | 172934200 | 6.941 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/large/AVX512-x64-2 | 280992625 | 4.384 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/xlarge/SSE-x16-2 | 187189599 | 6.203 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/xlarge/AVX2-x32-2 | 274289563 | 4.042 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/xlarge/AVX512-x16-2 | 187189599 | 6.203 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/xlarge/AVX512-x32-2 | 274289563 | 4.042 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/xlarge/AVX512-x64-2 | 375048555 | 2.953 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/massive/SSE-x16-2 | 86434948 | 14.02 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/massive/AVX2-x32-2 | 153742346 | 8.012 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/massive/AVX512-x16-2 | 86434948 | 14.02 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/massive/AVX512-x32-2 | 153742346 | 8.012 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8/massive/AVX512-x64-2 | 259404483 | 5.214 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/tiny/SSE-x8-2 | 270309470 | 4.315 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/tiny/AVX2-x16-2 | 264874646 | 4.281 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/tiny/AVX512-x8-2 | 270309470 | 4.315 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/tiny/AVX512-x16-2 | 264874646 | 4.281 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/tiny/AVX512-x32-2 | 328810479 | 3.593 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/small/SSE-x8-2 | 374742561 | 3.206 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/small/AVX2-x16-2 | 449838870 | 2.678 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/small/AVX512-x8-2 | 374742561 | 3.206 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/small/AVX512-x16-2 | 449838870 | 2.678 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/small/AVX512-x32-2 | 143845734 | 8.484 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/medium/SSE-x8-2 | 185415590 | 6.448 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/medium/AVX2-x16-2 | 273780868 | 4.268 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/medium/AVX512-x8-2 | 185415590 | 6.448 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/medium/AVX512-x16-2 | 273780868 | 4.268 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/medium/AVX512-x32-2 | 350067484 | 3.431 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/large/SSE-x8-2 | 61109778 | 19.66 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/large/AVX2-x16-2 | 100000000 | 10.74 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/large/AVX512-x8-2 | 61109778 | 19.66 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/large/AVX512-x16-2 | 100000000 | 10.74 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/large/AVX512-x32-2 | 182886646 | 6.575 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/xlarge/SSE-x8-2 | 15220682 | 71.53 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/xlarge/AVX2-x16-2 | 31876572 | 37.57 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/xlarge/AVX512-x8-2 | 15220682 | 71.53 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/xlarge/AVX512-x16-2 | 31876572 | 37.57 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/xlarge/AVX512-x32-2 | 61992217 | 19.55 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/massive/SSE-x8-2 | 4372000 | 262.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/massive/AVX2-x16-2 | 9019658 | 131.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/massive/AVX512-x8-2 | 4372000 | 262.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/massive/AVX512-x16-2 | 9019658 | 131.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt16/massive/AVX512-x32-2 | 16568430 | 74.25 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/tiny/SSE-x4-2 | 499209442 | 2.406 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/tiny/AVX2-x8-2 | 350479609 | 3.433 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/tiny/AVX512-x4-2 | 499209442 | 2.406 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/tiny/AVX512-x8-2 | 350479609 | 3.433 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/tiny/AVX512-x16-2 | 280918554 | 4.309 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/small/SSE-x4-2 | 299561596 | 4.028 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/small/AVX2-x8-2 | 374064310 | 3.205 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/small/AVX512-x4-2 | 299561596 | 4.028 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/small/AVX512-x8-2 | 374064310 | 3.205 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/small/AVX512-x16-2 | 499219765 | 2.418 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/medium/SSE-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/medium/AVX2-x8-2 | 187391635 | 6.403 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/medium/AVX512-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/medium/AVX512-x8-2 | 187391635 | 6.403 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/medium/AVX512-x16-2 | 307955800 | 3.875 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/large/SSE-x4-2 | 33256420 | 36.05 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/large/AVX2-x8-2 | 62421526 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/large/AVX512-x4-2 | 33256420 | 36.05 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/large/AVX512-x8-2 | 62421526 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/large/AVX512-x16-2 | 100000000 | 10.36 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/xlarge/SSE-x4-2 | 8328856 | 144.9 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/xlarge/AVX2-x8-2 | 17039037 | 71.14 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/xlarge/AVX512-x4-2 | 8328856 | 144.9 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/xlarge/AVX512-x8-2 | 17039037 | 71.14 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/xlarge/AVX512-x16-2 | 28740241 | 41.77 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/massive/SSE-x4-2 | 3525885 | 332.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/massive/AVX2-x8-2 | 7318027 | 164.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/massive/AVX512-x4-2 | 3525885 | 332.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/massive/AVX512-x8-2 | 7318027 | 164.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt32/massive/AVX512-x16-2 | 12181366 | 99.08 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/tiny/SSE-x2-2 | 409014308 | 2.934 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/tiny/AVX2-x4-2 | 449210791 | 2.667 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/tiny/AVX512-x2-2 | 409014308 | 2.934 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/tiny/AVX512-x4-2 | 449210791 | 2.667 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/tiny/AVX512-x8-2 | 280998146 | 4.293 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/small/SSE-x2-2 | 195631429 | 6.172 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/small/AVX2-x4-2 | 281272394 | 4.308 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/small/AVX512-x2-2 | 195631429 | 6.172 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/small/AVX512-x4-2 | 281272394 | 4.308 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/small/AVX512-x8-2 | 408933924 | 3.044 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/medium/SSE-x2-2 | 63006909 | 18.94 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/medium/AVX2-x4-2 | 100000000 | 10.67 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/medium/AVX512-x2-2 | 63006909 | 18.94 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/medium/AVX512-x4-2 | 100000000 | 10.67 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/medium/AVX512-x8-2 | 197411126 | 6.016 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/large/SSE-x2-2 | 17098578 | 70.57 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/large/AVX2-x4-2 | 32558013 | 37.07 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/large/AVX512-x2-2 | 17098578 | 70.57 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/large/AVX512-x4-2 | 32558013 | 37.07 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/large/AVX512-x8-2 | 57629485 | 20.94 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/xlarge/SSE-x2-2 | 4286155 | 281.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/xlarge/AVX2-x4-2 | 8344772 | 143.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/xlarge/AVX512-x2-2 | 4286155 | 281.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/xlarge/AVX512-x4-2 | 8344772 | 143.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/xlarge/AVX512-x8-2 | 14428276 | 83.14 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/massive/SSE-x2-2 | 1000000 | 1012 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/massive/AVX2-x4-2 | 2350525 | 510.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/massive/AVX512-x2-2 | 1000000 | 1012 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/massive/AVX512-x4-2 | 2350525 | 510.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64/massive/AVX512-x8-2 | 3773523 | 318.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/tiny/SSE-x16-2 | 338880315 | 3.332 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/tiny/AVX2-x32-2 | 320784217 | 3.559 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/tiny/AVX512-x16-2 | 338880315 | 3.332 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/tiny/AVX512-x32-2 | 320784217 | 3.559 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/tiny/AVX512-x64-2 | 341599854 | 3.331 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/small/SSE-x16-2 | 449579424 | 2.670 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/small/AVX2-x32-2 | 140368142 | 8.648 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/small/AVX512-x16-2 | 449579424 | 2.670 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/small/AVX512-x32-2 | 140368142 | 8.648 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/small/AVX512-x64-2 | 146828888 | 8.182 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/medium/SSE-x16-2 | 374443974 | 3.472 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/medium/AVX2-x32-2 | 449271607 | 2.672 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/medium/AVX512-x16-2 | 374443974 | 3.472 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/medium/AVX512-x32-2 | 449271607 | 2.672 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/medium/AVX512-x64-2 | 598525731 | 2.018 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/large/SSE-x16-2 | 254828565 | 4.956 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/large/AVX2-x32-2 | 407777484 | 2.938 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/large/AVX512-x16-2 | 254828565 | 4.956 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/large/AVX512-x32-2 | 407777484 | 2.938 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/large/AVX512-x64-2 | 443472316 | 2.666 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/xlarge/SSE-x16-2 | 162196827 | 7.867 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/xlarge/AVX2-x32-2 | 268324950 | 4.518 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/xlarge/AVX512-x16-2 | 162196827 | 7.867 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/xlarge/AVX512-x32-2 | 268324950 | 4.518 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/xlarge/AVX512-x64-2 | 400437789 | 2.952 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/massive/SSE-x16-2 | 214548872 | 5.640 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/massive/AVX2-x32-2 | 348431553 | 3.391 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/massive/AVX512-x16-2 | 214548872 | 5.640 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/massive/AVX512-x32-2 | 348431553 | 3.391 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint8/massive/AVX512-x64-2 | 459781908 | 2.455 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/tiny/SSE-x8-2 | 276271912 | 4.297 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/tiny/AVX2-x16-2 | 281145528 | 4.270 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/tiny/AVX512-x8-2 | 276271912 | 4.297 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/tiny/AVX512-x16-2 | 281145528 | 4.270 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/tiny/AVX512-x32-2 | 315343911 | 3.667 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/small/SSE-x8-2 | 374632351 | 3.204 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/small/AVX2-x16-2 | 449355727 | 2.670 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/small/AVX512-x8-2 | 374632351 | 3.204 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/small/AVX512-x16-2 | 449355727 | 2.670 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/small/AVX512-x32-2 | 138088146 | 8.395 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/medium/SSE-x8-2 | 187276191 | 6.582 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/medium/AVX2-x16-2 | 281107980 | 4.306 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/medium/AVX512-x8-2 | 187276191 | 6.582 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/medium/AVX512-x16-2 | 281107980 | 4.306 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/medium/AVX512-x32-2 | 358850328 | 3.516 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/large/SSE-x8-2 | 59025931 | 19.98 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/large/AVX2-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/large/AVX512-x8-2 | 59025931 | 19.98 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/large/AVX512-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/large/AVX512-x32-2 | 179631354 | 6.569 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/xlarge/SSE-x8-2 | 16576267 | 71.63 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/xlarge/AVX2-x16-2 | 32578981 | 36.96 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/xlarge/AVX512-x8-2 | 16576267 | 71.63 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/xlarge/AVX512-x16-2 | 32578981 | 36.96 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/xlarge/AVX512-x32-2 | 61464870 | 19.44 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/massive/SSE-x8-2 | 2153736 | 557.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/massive/AVX2-x16-2 | 4225728 | 281.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/massive/AVX512-x8-2 | 2153736 | 557.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/massive/AVX512-x16-2 | 4225728 | 281.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint16/massive/AVX512-x32-2 | 7829936 | 145.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/tiny/SSE-x4-2 | 499390296 | 2.403 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/tiny/AVX2-x8-2 | 362964080 | 3.342 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/tiny/AVX512-x4-2 | 499390296 | 2.403 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/tiny/AVX512-x8-2 | 362964080 | 3.342 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/tiny/AVX512-x16-2 | 281063364 | 4.268 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/small/SSE-x4-2 | 293867554 | 4.004 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/small/AVX2-x8-2 | 374510434 | 3.203 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/small/AVX512-x4-2 | 293867554 | 4.004 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/small/AVX512-x8-2 | 374510434 | 3.203 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/small/AVX512-x16-2 | 499714206 | 2.402 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/medium/SSE-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/medium/AVX2-x8-2 | 187258657 | 6.405 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/medium/AVX512-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/medium/AVX512-x8-2 | 187258657 | 6.405 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/medium/AVX512-x16-2 | 312999210 | 3.881 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/large/SSE-x4-2 | 33298366 | 36.02 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/large/AVX2-x8-2 | 62409421 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/large/AVX512-x4-2 | 33298366 | 36.02 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/large/AVX512-x8-2 | 62409421 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/large/AVX512-x16-2 | 100000000 | 10.10 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/xlarge/SSE-x4-2 | 7948898 | 143.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/xlarge/AVX2-x8-2 | 17021738 | 70.49 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/xlarge/AVX512-x4-2 | 7948898 | 143.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/xlarge/AVX512-x8-2 | 17021738 | 70.49 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/xlarge/AVX512-x16-2 | 28742320 | 41.77 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/massive/SSE-x4-2 | 1595774 | 751.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/massive/AVX2-x8-2 | 3094242 | 381.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/massive/AVX512-x4-2 | 1595774 | 751.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/massive/AVX512-x8-2 | 3094242 | 381.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint32/massive/AVX512-x16-2 | 5080051 | 238.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/tiny/SSE-x2-2 | 374760351 | 3.203 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/tiny/AVX2-x4-2 | 498763054 | 2.419 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/tiny/AVX512-x2-2 | 374760351 | 3.203 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/tiny/AVX512-x4-2 | 498763054 | 2.419 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/tiny/AVX512-x8-2 | 319635274 | 3.582 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/small/SSE-x2-2 | 187032452 | 6.447 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/small/AVX2-x4-2 | 299546244 | 4.009 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/small/AVX512-x2-2 | 187032452 | 6.447 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/small/AVX512-x4-2 | 299546244 | 4.009 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/small/AVX512-x8-2 | 373937659 | 3.207 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/medium/SSE-x2-2 | 62413118 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/medium/AVX2-x4-2 | 113978791 | 10.42 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/medium/AVX512-x2-2 | 62413118 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/medium/AVX512-x4-2 | 113978791 | 10.42 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/medium/AVX512-x8-2 | 186965330 | 6.484 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/large/SSE-x2-2 | 17005768 | 70.57 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/large/AVX2-x4-2 | 33286495 | 36.69 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/large/AVX512-x2-2 | 17005768 | 70.57 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/large/AVX512-x4-2 | 33286495 | 36.69 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/large/AVX512-x8-2 | 61486065 | 19.93 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/xlarge/SSE-x2-2 | 4154370 | 280.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/xlarge/AVX2-x4-2 | 8371358 | 148.2 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/xlarge/AVX512-x2-2 | 4154370 | 280.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/xlarge/AVX512-x4-2 | 8371358 | 148.2 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/xlarge/AVX512-x8-2 | 14193795 | 72.36 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/massive/SSE-x2-2 | 1773937 | 676.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/massive/AVX2-x4-2 | 3500168 | 343.0 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/massive/AVX512-x2-2 | 1773937 | 676.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/massive/AVX512-x4-2 | 3500168 | 343.0 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsUint64/massive/AVX512-x8-2 | 7097266 | 249.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/tiny/SSE-x4-2 | 410522160 | 2.675 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/tiny/AVX2-x8-2 | 308565882 | 3.814 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/tiny/AVX512-x4-2 | 410522160 | 2.675 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/tiny/AVX512-x8-2 | 308565882 | 3.814 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/tiny/AVX512-x16-2 | 315331897 | 3.755 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/small/SSE-x4-2 | 278219434 | 4.642 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/small/AVX2-x8-2 | 362945481 | 3.287 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/small/AVX512-x4-2 | 278219434 | 4.642 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/small/AVX512-x8-2 | 362945481 | 3.287 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/small/AVX512-x16-2 | 408523153 | 2.941 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/medium/SSE-x4-2 | 100000000 | 10.77 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/medium/AVX2-x8-2 | 186186376 | 6.409 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/medium/AVX512-x4-2 | 100000000 | 10.77 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/medium/AVX512-x8-2 | 186186376 | 6.409 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/medium/AVX512-x16-2 | 264255108 | 4.619 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/large/SSE-x4-2 | 33028701 | 36.27 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/large/AVX2-x8-2 | 62465360 | 19.53 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/large/AVX512-x4-2 | 33028701 | 36.27 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/large/AVX512-x8-2 | 62465360 | 19.53 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/large/AVX512-x16-2 | 108213310 | 10.95 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/xlarge/SSE-x4-2 | 8359381 | 143.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/xlarge/AVX2-x8-2 | 17042701 | 70.46 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/xlarge/AVX512-x4-2 | 8359381 | 143.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/xlarge/AVX512-x8-2 | 17042701 | 70.46 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/xlarge/AVX512-x16-2 | 31806921 | 37.13 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/massive/SSE-x4-2 | 1000000 | 1100 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/massive/AVX2-x8-2 | 2164672 | 554.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/massive/AVX512-x4-2 | 1000000 | 1100 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/massive/AVX512-x8-2 | 2164672 | 554.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat32/massive/AVX512-x16-2 | 4201453 | 293.9 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/tiny/SSE-x2-2 | 362183925 | 3.223 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/tiny/AVX2-x4-2 | 449021466 | 2.687 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/tiny/AVX512-x2-2 | 362183925 | 3.223 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/tiny/AVX512-x4-2 | 449021466 | 2.687 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/tiny/AVX512-x8-2 | 320176149 | 3.820 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/small/SSE-x2-2 | 187139116 | 6.415 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/small/AVX2-x4-2 | 280722585 | 4.300 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/small/AVX512-x2-2 | 187139116 | 6.415 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/small/AVX512-x4-2 | 280722585 | 4.300 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/small/AVX512-x8-2 | 335670502 | 3.472 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/medium/SSE-x2-2 | 62343927 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/medium/AVX2-x4-2 | 112332902 | 10.69 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/medium/AVX512-x2-2 | 62343927 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/medium/AVX512-x4-2 | 112332902 | 10.69 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/medium/AVX512-x8-2 | 179610780 | 6.741 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/large/SSE-x2-2 | 16996959 | 70.51 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/large/AVX2-x4-2 | 33017950 | 36.29 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/large/AVX512-x2-2 | 16996959 | 70.51 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/large/AVX512-x4-2 | 33017950 | 36.29 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/large/AVX512-x8-2 | 60322328 | 19.73 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/xlarge/SSE-x2-2 | 4141281 | 282.9 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/xlarge/AVX2-x4-2 | 7856590 | 145.0 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/xlarge/AVX512-x2-2 | 4141281 | 282.9 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/xlarge/AVX512-x4-2 | 7856590 | 145.0 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/xlarge/AVX512-x8-2 | 16623739 | 72.06 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/massive/SSE-x2-2 | 541202 | 2195 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/massive/AVX2-x4-2 | 1000000 | 1158 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/massive/AVX512-x2-2 | 541202 | 2195 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/massive/AVX512-x4-2 | 1000000 | 1158 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsFloat64/massive/AVX512-x8-2 | 2115301 | 560.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsWorstCase/SSE-x4-2 | 7651734 | 145.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsWorstCase/AVX2-x8-2 | 14921599 | 70.49 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsWorstCase/AVX512-x4-2 | 7651734 | 145.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsWorstCase/AVX512-x8-2 | 14921599 | 70.49 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsWorstCase/AVX512-x16-2 | 28708478 | 41.38 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsBestCase/SSE-x4-2 | 534237578 | 2.136 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsBestCase/AVX2-x8-2 | 561252645 | 2.159 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsBestCase/AVX512-x4-2 | 534237578 | 2.136 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsBestCase/AVX512-x8-2 | 561252645 | 2.159 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsBestCase/AVX512-x16-2 | 560396454 | 2.137 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/tiny/SSE-x4-2 | 499649139 | 2.401 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/tiny/AVX2-x8-2 | 329743240 | 3.421 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/tiny/AVX512-x4-2 | 499649139 | 2.401 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/tiny/AVX512-x8-2 | 329743240 | 3.421 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/tiny/AVX512-x16-2 | 280516392 | 4.276 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/small/SSE-x4-2 | 299373171 | 4.006 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/small/AVX2-x8-2 | 374407988 | 3.267 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/small/AVX512-x4-2 | 299373171 | 4.006 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/small/AVX512-x8-2 | 374407988 | 3.267 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/small/AVX512-x16-2 | 486948346 | 2.424 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/medium/SSE-x4-2 | 100000000 | 10.41 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/medium/AVX2-x8-2 | 182899621 | 6.412 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/medium/AVX512-x4-2 | 100000000 | 10.41 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/medium/AVX512-x8-2 | 182899621 | 6.412 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/medium/AVX512-x16-2 | 311969776 | 3.829 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/large/SSE-x4-2 | 33309816 | 36.04 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/large/AVX2-x8-2 | 59912676 | 19.74 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/large/AVX512-x4-2 | 33309816 | 36.04 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/large/AVX512-x8-2 | 59912676 | 19.74 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/large/AVX512-x16-2 | 100000000 | 10.65 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/xlarge/SSE-x4-2 | 8346818 | 143.7 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/xlarge/AVX2-x8-2 | 16980399 | 70.54 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/xlarge/AVX512-x4-2 | 8346818 | 143.7 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/xlarge/AVX512-x8-2 | 16980399 | 70.54 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/xlarge/AVX512-x16-2 | 28676455 | 42.94 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/massive/SSE-x4-2 | 1000000 | 1151 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/massive/AVX2-x8-2 | 2161594 | 555.2 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/massive/AVX512-x4-2 | 1000000 | 1151 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/massive/AVX512-x8-2 | 2161594 | 555.2 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsNegative/massive/AVX512-x16-2 | 3549094 | 350.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8ByWidth/SSE-x16-2 | 331533141 | 3.222 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8ByWidth/AVX2-x32-2 | 408741681 | 3.193 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8ByWidth/AVX512-x16-2 | 331533141 | 3.222 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8ByWidth/AVX512-x32-2 | 408741681 | 3.193 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt8ByWidth/AVX512-x64-2 | 365382873 | 3.241 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64SteadyState/SSE-x2-2 | 5722603 | 211.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64SteadyState/AVX2-x4-2 | 11711869 | 103.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64SteadyState/AVX512-x2-2 | 5722603 | 211.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64SteadyState/AVX512-x4-2 | 11711869 | 103.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkContainsInt64SteadyState/AVX512-x8-2 | 19671033 | 61.36 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/small/Fallback-lo-2 | 248740710 | 5.218 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/small/SSE-x16-2 | 126181464 | 9.485 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/small/AVX-x16-2 | 126181464 | 9.485 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/small/AVX2-x32-2 | 73059427 | 14.44 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/small/AVX512-x64-2 | 49913169 | 24.41 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/medium/Fallback-lo-2 | 17278075 | 69.96 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/medium/SSE-x16-2 | 100000000 | 10.58 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/medium/AVX-x16-2 | 100000000 | 10.58 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/medium/AVX2-x32-2 | 91620999 | 13.10 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/medium/AVX512-x64-2 | 54082130 | 22.20 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/large/Fallback-lo-2 | 2006178 | 576.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/large/SSE-x16-2 | 41836690 | 27.82 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/large/AVX-x16-2 | 41836690 | 27.82 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/large/AVX2-x32-2 | 51735399 | 23.04 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/large/AVX512-x64-2 | 40861586 | 29.40 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/xlarge/Fallback-lo-2 | 273898 | 4383 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/xlarge/SSE-x16-2 | 6928408 | 173.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/xlarge/AVX-x16-2 | 6928408 | 173.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/xlarge/AVX2-x32-2 | 12639586 | 94.09 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8/xlarge/AVX512-x64-2 | 13509693 | 89.67 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/small/Fallback-lo-2 | 249444103 | 5.012 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/small/SSE-x8-2 | 244927230 | 5.052 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/small/AVX-x8-2 | 244927230 | 5.052 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/small/AVX2-x16-2 | 122088517 | 9.715 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/small/AVX512-x32-2 | 54098370 | 22.00 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/medium/Fallback-lo-2 | 15782683 | 72.54 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/medium/SSE-x8-2 | 100000000 | 10.51 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/medium/AVX-x8-2 | 100000000 | 10.51 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/medium/AVX2-x16-2 | 100000000 | 10.75 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/medium/AVX512-x32-2 | 56147455 | 21.38 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/large/Fallback-lo-2 | 2173214 | 598.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/large/SSE-x8-2 | 26319481 | 44.73 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/large/AVX-x8-2 | 26319481 | 44.73 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/large/AVX2-x16-2 | 40459519 | 27.91 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/large/AVX512-x32-2 | 39359752 | 31.28 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/xlarge/Fallback-lo-2 | 273932 | 4382 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/xlarge/SSE-x8-2 | 3557265 | 331.2 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/xlarge/AVX-x8-2 | 3557265 | 331.2 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/xlarge/AVX2-x16-2 | 6930166 | 173.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt16/xlarge/AVX512-x32-2 | 12100244 | 97.01 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/small/Fallback-lo-2 | 249566539 | 4.808 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/small/SSE-x4-2 | 259250019 | 4.581 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/small/AVX-x4-2 | 259250019 | 4.581 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/small/AVX2-x8-2 | 232858933 | 5.404 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/small/AVX512-x16-2 | 100000000 | 11.18 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/medium/Fallback-lo-2 | 17274441 | 72.28 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/medium/SSE-x4-2 | 58400258 | 20.56 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/medium/AVX-x4-2 | 58400258 | 20.56 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/medium/AVX2-x8-2 | 110851756 | 10.67 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/medium/AVX512-x16-2 | 106593603 | 11.25 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/large/Fallback-lo-2 | 2171817 | 551.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/large/SSE-x4-2 | 8270253 | 146.0 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/large/AVX-x4-2 | 8270253 | 146.0 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/large/AVX2-x8-2 | 22234518 | 46.06 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/large/AVX512-x16-2 | 37448763 | 32.31 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/xlarge/Fallback-lo-2 | 273699 | 4559 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/xlarge/SSE-x4-2 | 1000000 | 1102 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/xlarge/AVX-x4-2 | 1000000 | 1102 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/xlarge/AVX2-x8-2 | 3586887 | 332.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt32/xlarge/AVX512-x16-2 | 7214437 | 170.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/small/Fallback-lo-2 | 417473124 | 2.886 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/small/SSE-x2-2 | 287521756 | 4.169 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/small/AVX-x2-2 | 287521756 | 4.169 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/small/AVX2-x4-2 | 277783513 | 4.311 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/small/AVX512-x8-2 | 172823103 | 6.993 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/medium/Fallback-lo-2 | 34022653 | 35.27 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/medium/SSE-x2-2 | 49241248 | 24.05 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/medium/AVX-x2-2 | 49241248 | 24.05 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/medium/AVX2-x4-2 | 78897342 | 14.58 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/medium/AVX512-x8-2 | 84361297 | 14.03 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/large/Fallback-lo-2 | 3680988 | 282.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/large/SSE-x2-2 | 6293607 | 170.7 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/large/AVX-x2-2 | 6293607 | 170.7 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/large/AVX2-x4-2 | 12739849 | 91.28 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/large/AVX512-x8-2 | 25508130 | 46.30 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/xlarge/Fallback-lo-2 | 546321 | 2283 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/xlarge/SSE-x2-2 | 877434 | 1289 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/xlarge/AVX-x2-2 | 877434 | 1289 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/xlarge/AVX2-x4-2 | 1845892 | 650.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64/xlarge/AVX512-x8-2 | 2148355 | 550.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/small/Fallback-lo-2 | 411100770 | 2.951 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/small/SSE-x4-2 | 264013596 | 4.572 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/small/AVX-x4-2 | 264013596 | 4.572 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/small/AVX2-x8-2 | 174478266 | 6.911 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/small/AVX512-x16-2 | 61182673 | 19.78 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/medium/Fallback-lo-2 | 33815070 | 35.68 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/medium/SSE-x4-2 | 58238188 | 20.66 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/medium/AVX-x4-2 | 58238188 | 20.66 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/medium/AVX2-x8-2 | 91316544 | 13.26 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/medium/AVX512-x16-2 | 80046624 | 15.08 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/large/Fallback-lo-2 | 4304168 | 278.7 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/large/SSE-x4-2 | 6198957 | 184.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/large/AVX-x4-2 | 6198957 | 184.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/large/AVX2-x8-2 | 12260169 | 86.60 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/large/AVX512-x16-2 | 22147112 | 45.34 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/xlarge/Fallback-lo-2 | 546901 | 2193 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/xlarge/SSE-x4-2 | 736503 | 1622 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/xlarge/AVX-x4-2 | 736503 | 1622 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/xlarge/AVX2-x8-2 | 1493887 | 810.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat32/xlarge/AVX512-x16-2 | 2959298 | 393.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/small/Fallback-lo-2 | 410778070 | 3.043 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/small/SSE-x2-2 | 254156008 | 4.714 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/small/AVX-x2-2 | 254156008 | 4.714 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/small/AVX2-x4-2 | 227604434 | 5.323 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/small/AVX512-x8-2 | 170099748 | 7.115 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/medium/Fallback-lo-2 | 33646345 | 35.78 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/medium/SSE-x2-2 | 32931152 | 34.92 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/medium/AVX-x2-2 | 32931152 | 34.92 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/medium/AVX2-x4-2 | 75389446 | 16.79 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/medium/AVX512-x8-2 | 89826181 | 13.33 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/large/Fallback-lo-2 | 4293837 | 302.8 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/large/SSE-x2-2 | 3146601 | 381.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/large/AVX-x2-2 | 3146601 | 381.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/large/AVX2-x4-2 | 6373876 | 184.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/large/AVX512-x8-2 | 13464712 | 88.96 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/xlarge/Fallback-lo-2 | 545764 | 2193 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/xlarge/SSE-x2-2 | 368846 | 3390 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/xlarge/AVX-x2-2 | 368846 | 3390 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/xlarge/AVX2-x4-2 | 709940 | 1613 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumFloat64/xlarge/AVX512-x8-2 | 1480214 | 808.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/small/Fallback-lo-2 | 411529147 | 3.043 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/small/SSE-x4-2 | 204428401 | 5.872 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/small/AVX-x4-2 | 204428401 | 5.872 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/small/AVX2-x8-2 | 187573928 | 6.214 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/small/AVX512-x16-2 | 98346700 | 12.12 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/medium/Fallback-lo-2 | 33481442 | 35.72 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/medium/SSE-x4-2 | 52042394 | 22.12 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/medium/AVX-x4-2 | 52042394 | 22.12 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/medium/AVX2-x8-2 | 96288541 | 13.44 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/medium/AVX512-x16-2 | 100995780 | 11.90 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/large/Fallback-lo-2 | 4296570 | 289.9 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/large/SSE-x4-2 | 7743022 | 146.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/large/AVX-x4-2 | 7743022 | 146.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/large/AVX2-x8-2 | 24355988 | 46.26 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/large/AVX512-x16-2 | 37322655 | 32.89 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/xlarge/Fallback-lo-2 | 547008 | 2193 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/xlarge/SSE-x4-2 | 1087246 | 1112 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/xlarge/AVX-x4-2 | 1087246 | 1112 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/xlarge/AVX2-x8-2 | 1386868 | 761.9 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanInt32/xlarge/AVX512-x16-2 | 7166142 | 170.7 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/small/Fallback-lo-2 | 349760005 | 3.449 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/small/SSE-x2-2 | 189674538 | 6.293 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/small/AVX-x2-2 | 189674538 | 6.293 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/small/AVX2-x4-2 | 159228600 | 7.531 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/small/AVX512-x8-2 | 110196433 | 10.89 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/medium/Fallback-lo-2 | 32968618 | 36.17 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/medium/SSE-x2-2 | 30863817 | 37.69 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/medium/AVX-x2-2 | 30863817 | 37.69 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/medium/AVX2-x4-2 | 62428772 | 19.66 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/medium/AVX512-x8-2 | 77140984 | 15.54 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/large/Fallback-lo-2 | 4281057 | 280.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/large/SSE-x2-2 | 3057349 | 389.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/large/AVX-x2-2 | 3057349 | 389.4 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/large/AVX2-x4-2 | 6509438 | 185.9 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/large/AVX512-x8-2 | 12668032 | 93.50 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/xlarge/Fallback-lo-2 | 545898 | 2288 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/xlarge/SSE-x2-2 | 367671 | 4048 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/xlarge/AVX-x2-2 | 367671 | 4048 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/xlarge/AVX2-x4-2 | 739941 | 1621 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMeanFloat64/xlarge/AVX512-x8-2 | 1434867 | 811.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/small/SSE-x4-2 | 312338268 | 3.860 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/small/AVX-x4-2 | 312338268 | 3.860 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/small/AVX2-x8-2 | 238034872 | 5.042 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/small/AVX512-x16-2 | 152600943 | 6.661 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/medium/SSE-x4-2 | 61051266 | 19.73 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/medium/AVX-x4-2 | 61051266 | 19.73 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/medium/AVX2-x8-2 | 91792144 | 13.11 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/medium/AVX512-x16-2 | 99994540 | 12.18 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/large/SSE-x4-2 | 8604774 | 140.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/large/AVX-x4-2 | 8604774 | 140.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/large/AVX2-x8-2 | 15581037 | 77.56 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/large/AVX512-x16-2 | 30512421 | 40.24 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/xlarge/SSE-x4-2 | 1000000 | 1110 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/xlarge/AVX-x4-2 | 1000000 | 1110 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/xlarge/AVX2-x8-2 | 2158272 | 557.2 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinInt32/xlarge/AVX512-x16-2 | 4253668 | 282.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/small/SSE-x2-2 | 264129410 | 4.544 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/small/AVX-x2-2 | 264129410 | 4.544 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/small/AVX2-x4-2 | 299587609 | 4.008 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/small/AVX512-x8-2 | 100000000 | 10.05 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/medium/SSE-x2-2 | 32778514 | 36.93 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/medium/AVX-x2-2 | 32778514 | 36.93 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/medium/AVX2-x4-2 | 53356347 | 20.30 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/medium/AVX512-x8-2 | 74832976 | 16.21 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/large/SSE-x2-2 | 3863326 | 300.0 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/large/AVX-x2-2 | 3863326 | 300.0 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/large/AVX2-x4-2 | 7670576 | 146.5 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/large/AVX512-x8-2 | 14017984 | 78.21 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/xlarge/SSE-x2-2 | 492739 | 2195 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/xlarge/AVX-x2-2 | 492739 | 2195 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/xlarge/AVX2-x4-2 | 1000000 | 1103 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMinFloat64/xlarge/AVX512-x8-2 | 2145290 | 560.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/small/SSE-x4-2 | 306585705 | 3.860 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/small/AVX-x4-2 | 306585705 | 3.860 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/small/AVX2-x8-2 | 237347997 | 5.086 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/small/AVX512-x16-2 | 201433966 | 6.130 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/medium/SSE-x4-2 | 60759631 | 19.92 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/medium/AVX-x4-2 | 60759631 | 19.92 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/medium/AVX2-x8-2 | 90934662 | 13.13 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/medium/AVX512-x16-2 | 98517944 | 12.18 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/large/SSE-x4-2 | 8590542 | 139.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/large/AVX-x4-2 | 8590542 | 139.6 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/large/AVX2-x8-2 | 15770372 | 77.69 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/large/AVX512-x16-2 | 30197324 | 39.32 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/xlarge/SSE-x4-2 | 1000000 | 1104 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/xlarge/AVX-x4-2 | 1000000 | 1104 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/xlarge/AVX2-x8-2 | 2152038 | 562.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxInt32/xlarge/AVX512-x16-2 | 3917990 | 296.7 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/small/SSE-x2-2 | 249617162 | 4.816 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/small/AVX-x2-2 | 249617162 | 4.816 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/small/AVX2-x4-2 | 207017514 | 5.855 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/small/AVX512-x8-2 | 66520290 | 17.74 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/medium/SSE-x2-2 | 32307492 | 36.92 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/medium/AVX-x2-2 | 32307492 | 36.92 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/medium/AVX2-x4-2 | 57306838 | 20.77 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/medium/AVX512-x8-2 | 56911946 | 21.12 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/large/SSE-x2-2 | 4259366 | 287.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/large/AVX-x2-2 | 4259366 | 287.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/large/AVX2-x4-2 | 7905420 | 148.9 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/large/AVX512-x8-2 | 14100686 | 83.43 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/xlarge/SSE-x2-2 | 545378 | 2243 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/xlarge/AVX-x2-2 | 545378 | 2243 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/xlarge/AVX2-x4-2 | 1000000 | 1113 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkMaxFloat64/xlarge/AVX512-x8-2 | 2119741 | 565.7 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8ByWidth/Fallback-lo-2 | 896775 | 1335 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8ByWidth/SSE-x16-2 | 12557700 | 94.52 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8ByWidth/AVX-x16-2 | 12557700 | 94.52 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8ByWidth/AVX2-x32-2 | 18702537 | 55.03 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt8ByWidth/AVX512-x64-2 | 21342572 | 56.10 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64SteadyState/Fallback-lo-2 | 513738 | 2195 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64SteadyState/SSE-x2-2 | 928376 | 1296 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64SteadyState/AVX-x2-2 | 928376 | 1296 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64SteadyState/AVX2-x4-2 | 1836968 | 888.1 ns/op | 0 B/op | 0 allocs/op |
|
||||
| BenchmarkSumInt64SteadyState/AVX512-x8-2 | 2141715 | 551.3 ns/op | 0 B/op | 0 allocs/op |
|
||||
|
||||
+6
-5
@@ -12,7 +12,7 @@ If you see **SIGILL: illegal instruction** when running tests, the CPU or VM doe
|
||||
|
||||
```bash
|
||||
# List SIMD-related flags
|
||||
grep -E 'avx|sse' /proc/cpuinfo
|
||||
grep -E 'avx' /proc/cpuinfo
|
||||
|
||||
# Or with lscpu
|
||||
lscpu | grep -i avx
|
||||
@@ -22,21 +22,22 @@ lscpu | grep -i avx
|
||||
|
||||
| Tests / code | Required flag(s) | Typical CPUs |
|
||||
| ----------------- | -------------------------- | ----------------------------------------------------------------------- |
|
||||
| SSE (128-bit) | `sse2` (baseline on amd64) | All amd64 |
|
||||
| AVX (128-bit) | `avx` (not part of the amd64 baseline) | Intel Sandy Bridge+, AMD Bulldozer+ |
|
||||
| AVX2 (256-bit) | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||
| AVX-512 (512-bit) | `avx512f` | Intel Skylake-X+, some Xeons; many AMD/consumer CPUs do **not** have it |
|
||||
|
||||
### What the tests do
|
||||
|
||||
- **AVX tests** (128-bit) call `requireAVX(t)` and are **skipped** if the CPU does not support AVX.
|
||||
- **AVX2 tests** call `requireAVX2(t)` and are **skipped** if the CPU does not support AVX2 (no SIGILL).
|
||||
- **AVX-512 tests** (when enabled) should call `requireAVX512(t)` and skip when AVX-512 is not available.
|
||||
|
||||
So on a machine without AVX2, AVX2 tests will show as skipped instead of crashing.
|
||||
|
||||
### Run only SSE tests
|
||||
### Run only AVX tests
|
||||
|
||||
If your environment does not support AVX2/AVX-512, you can still run the SSE tests:
|
||||
If your environment does not support AVX2/AVX-512, you can still run the AVX (128-bit) tests:
|
||||
|
||||
```bash
|
||||
GOEXPERIMENT=simd go test -run SSE ./...
|
||||
GOEXPERIMENT=simd go test -run AVX ./...
|
||||
```
|
||||
|
||||
@@ -19,16 +19,25 @@ type skipHelper interface {
|
||||
|
||||
// How to check if your Linux CPU supports SIMD (avoids SIGILL):
|
||||
//
|
||||
// grep -E 'avx|sse' /proc/cpuinfo
|
||||
// grep -E 'avx' /proc/cpuinfo
|
||||
//
|
||||
// Or: lscpu | grep -i avx
|
||||
//
|
||||
// You need:
|
||||
// - SSE tests (128-bit): sse2 (baseline on amd64), sse4.1/sse4.2 often used
|
||||
// - AVX tests (128-bit): avx in flags (Intel Sandy Bridge+, AMD Bulldozer+; not part of the amd64 baseline)
|
||||
// - AVX2 tests (256-bit): avx2 in flags
|
||||
// - AVX-512 tests: avx512f (and often avx512bw, avx512vl)
|
||||
//
|
||||
// If your CPU lacks AVX2 or AVX-512, tests that use them will be skipped automatically.
|
||||
// If your CPU lacks AVX, AVX2, or AVX-512, the tests that require the missing feature are skipped automatically.
|
||||
|
||||
// requireAVX skips the test/benchmark if the CPU does not support AVX (128-bit SIMD).
|
||||
// Use at the start of each AVX test/benchmark to avoid SIGILL on older or non-x86 systems.
|
||||
func requireAVX(t skipHelper) {
|
||||
t.Helper()
|
||||
if !archsimd.X86.AVX() {
|
||||
t.Skipf("CPU does not support AVX; skipping. Check compatibility: grep avx /proc/cpuinfo")
|
||||
}
|
||||
}
|
||||
|
||||
// requireAVX2 skips the test/benchmark if the CPU does not support AVX2 (256-bit SIMD).
|
||||
// Use at the start of each AVX2 test/benchmark to avoid SIGILL on older or non-x86 systems.
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"simd/archsimd"
|
||||
)
|
||||
|
||||
// ContainsInt8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsInt8x16 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsInt8x16[T ~int8](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -40,7 +40,7 @@ func ContainsInt8x16[T ~int8](collection []T, target T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ContainsInt16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsInt16x8 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsInt16x8[T ~int16](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -72,7 +72,7 @@ func ContainsInt16x8[T ~int16](collection []T, target T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ContainsInt32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsInt32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsInt32x4[T ~int32](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -104,7 +104,7 @@ func ContainsInt32x4[T ~int32](collection []T, target T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ContainsInt64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsInt64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsInt64x2[T ~int64](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -136,7 +136,7 @@ func ContainsInt64x2[T ~int64](collection []T, target T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ContainsUint8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsUint8x16 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsUint8x16[T ~uint8](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -168,7 +168,7 @@ func ContainsUint8x16[T ~uint8](collection []T, target T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ContainsUint16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsUint16x8 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsUint16x8[T ~uint16](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -200,7 +200,7 @@ func ContainsUint16x8[T ~uint16](collection []T, target T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ContainsUint32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsUint32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsUint32x4[T ~uint32](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -232,7 +232,7 @@ func ContainsUint32x4[T ~uint32](collection []T, target T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ContainsUint64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsUint64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsUint64x2[T ~uint64](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -264,7 +264,7 @@ func ContainsUint64x2[T ~uint64](collection []T, target T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ContainsFloat32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsFloat32x4 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsFloat32x4[T ~float32](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -296,7 +296,7 @@ func ContainsFloat32x4[T ~float32](collection []T, target T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ContainsFloat64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||
// ContainsFloat64x2 checks if collection contains target using AVX SIMD and AVX-512 SIMD
|
||||
func ContainsFloat64x2[T ~float64](collection []T, target T) bool {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
|
||||
@@ -8,16 +8,16 @@ import (
|
||||
|
||||
// Benchmark suite for SIMD Contains operations compared to core lo package fallbacks.
|
||||
// These benchmarks measure the performance of element lookup operations
|
||||
// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes.
|
||||
// across different SIMD implementations (AVX, AVX2, AVX512) and data sizes.
|
||||
|
||||
// Benchmark sizes for Contains operations
|
||||
var containsBenchmarkSizes = []struct {
|
||||
name string
|
||||
size int
|
||||
}{
|
||||
{"tiny", 4}, // Smaller than SSE width (16 lanes for int8)
|
||||
{"small", 16}, // Exactly SSE width for int8
|
||||
{"medium", 64}, // Multiple of SSE, between SSE and AVX2 for int8
|
||||
{"tiny", 4}, // Smaller than AVX width (16 lanes for int8)
|
||||
{"small", 16}, // Exactly AVX width for int8
|
||||
{"medium", 64}, // Multiple of AVX, between AVX and AVX2 for int8
|
||||
{"large", 256}, // Multiple of AVX2 (32 lanes for int8)
|
||||
{"xlarge", 1024}, // Multiple of AVX512 (64 lanes for int8)
|
||||
{"massive", 8192}, // Very large dataset
|
||||
@@ -33,14 +33,14 @@ func BenchmarkContainsInt8(b *testing.B) {
|
||||
data := generateInt8(bs.size)
|
||||
target := int8(42)
|
||||
|
||||
b.Run("SSE-x16", func(b *testing.B) {
|
||||
b.Run("AVX512-x16", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt8x16 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsInt8x16(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x32", func(b *testing.B) {
|
||||
b.Run("AVX512-x32", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt8x32 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -68,14 +68,14 @@ func BenchmarkContainsInt16(b *testing.B) {
|
||||
data := generateInt16(bs.size)
|
||||
target := int16(42)
|
||||
|
||||
b.Run("SSE-x8", func(b *testing.B) {
|
||||
b.Run("AVX512-x8", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt16x8 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsInt16x8(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x16", func(b *testing.B) {
|
||||
b.Run("AVX512-x16", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt16x16 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -103,14 +103,14 @@ func BenchmarkContainsInt32(b *testing.B) {
|
||||
data := generateInt32(bs.size)
|
||||
target := int32(42)
|
||||
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsInt32x4(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x8", func(b *testing.B) {
|
||||
b.Run("AVX512-x8", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -138,14 +138,14 @@ func BenchmarkContainsInt64(b *testing.B) {
|
||||
data := generateInt64(bs.size)
|
||||
target := int64(42)
|
||||
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX512-x2", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsInt64x2(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -173,14 +173,14 @@ func BenchmarkContainsUint8(b *testing.B) {
|
||||
data := generateUint8(bs.size)
|
||||
target := uint8(255)
|
||||
|
||||
b.Run("SSE-x16", func(b *testing.B) {
|
||||
b.Run("AVX512-x16", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsUint8x16 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsUint8x16(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x32", func(b *testing.B) {
|
||||
b.Run("AVX512-x32", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsUint8x32 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -208,14 +208,14 @@ func BenchmarkContainsUint16(b *testing.B) {
|
||||
data := generateUint16(bs.size)
|
||||
target := uint16(42)
|
||||
|
||||
b.Run("SSE-x8", func(b *testing.B) {
|
||||
b.Run("AVX512-x8", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsUint16x8 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsUint16x8(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x16", func(b *testing.B) {
|
||||
b.Run("AVX512-x16", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsUint16x16 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -243,14 +243,14 @@ func BenchmarkContainsUint32(b *testing.B) {
|
||||
data := generateUint32(bs.size)
|
||||
target := uint32(42)
|
||||
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsUint32x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsUint32x4(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x8", func(b *testing.B) {
|
||||
b.Run("AVX512-x8", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsUint32x8 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -278,14 +278,14 @@ func BenchmarkContainsUint64(b *testing.B) {
|
||||
data := generateUint64(bs.size)
|
||||
target := uint64(42)
|
||||
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX512-x2", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsUint64x2 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsUint64x2(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsUint64x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -313,14 +313,14 @@ func BenchmarkContainsFloat32(b *testing.B) {
|
||||
data := generateFloat32(bs.size)
|
||||
target := float32(42.5)
|
||||
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsFloat32x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsFloat32x4(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x8", func(b *testing.B) {
|
||||
b.Run("AVX512-x8", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsFloat32x8 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -348,14 +348,14 @@ func BenchmarkContainsFloat64(b *testing.B) {
|
||||
data := generateFloat64(bs.size)
|
||||
target := float64(42.5)
|
||||
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX512-x2", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsFloat64x2 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsFloat64x2(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsFloat64x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -386,14 +386,14 @@ func BenchmarkContainsWorstCase(b *testing.B) {
|
||||
}
|
||||
target := int32(size - 1) // Target at the very end
|
||||
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsInt32x4(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x8", func(b *testing.B) {
|
||||
b.Run("AVX512-x8", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -422,14 +422,14 @@ func BenchmarkContainsBestCase(b *testing.B) {
|
||||
}
|
||||
target := int32(0) // Target at the very beginning
|
||||
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsInt32x4(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x8", func(b *testing.B) {
|
||||
b.Run("AVX512-x8", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -456,14 +456,14 @@ func BenchmarkContainsNegative(b *testing.B) {
|
||||
data := generateInt32(bs.size)
|
||||
target := int32(999999) // Target that's unlikely to be in the data
|
||||
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsInt32x4(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x8", func(b *testing.B) {
|
||||
b.Run("AVX512-x8", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
@@ -497,8 +497,8 @@ func BenchmarkContainsInt8ByWidth(b *testing.B) {
|
||||
name string
|
||||
fn func() bool
|
||||
}{
|
||||
{"SSE-x16", func() bool { return ContainsInt8x16(data, target) }},
|
||||
{"AVX2-x32", func() bool { return ContainsInt8x32(data, target) }},
|
||||
{"AVX512-x16", func() bool { return ContainsInt8x16(data, target) }},
|
||||
{"AVX512-x32", func() bool { return ContainsInt8x32(data, target) }},
|
||||
{"AVX512-x64", func() bool { return ContainsInt8x64(data, target) }},
|
||||
}
|
||||
|
||||
@@ -533,14 +533,14 @@ func BenchmarkContainsInt64SteadyState(b *testing.B) {
|
||||
|
||||
b.ResetTimer() // Reset timer to exclude warmup
|
||||
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX512-x2", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = ContainsInt64x2(data, target)
|
||||
}
|
||||
})
|
||||
b.Run("AVX2-x4", func(b *testing.B) {
|
||||
b.Run("AVX512-x4", func(b *testing.B) {
|
||||
requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
|
||||
+12
-6
@@ -364,7 +364,8 @@ func MinInt64[T ~int64](collection []T) T {
|
||||
case simdFeatureAVX2:
|
||||
return MinInt64x4(collection)
|
||||
case simdFeatureAVX:
|
||||
return MinInt64x2(collection)
|
||||
// MinInt64x2 requires AVX-512 (archsimd Int64x2.Min); use scalar fallback
|
||||
fallthrough
|
||||
default:
|
||||
return lo.Min(collection)
|
||||
}
|
||||
@@ -420,7 +421,8 @@ func MinUint64[T ~uint64](collection []T) T {
|
||||
case simdFeatureAVX2:
|
||||
return MinUint64x4(collection)
|
||||
case simdFeatureAVX:
|
||||
return MinUint64x2(collection)
|
||||
// MinUint64x2 requires AVX-512; use scalar fallback
|
||||
fallthrough
|
||||
default:
|
||||
return lo.Min(collection)
|
||||
}
|
||||
@@ -504,7 +506,8 @@ func MaxInt64[T ~int64](collection []T) T {
|
||||
case simdFeatureAVX2:
|
||||
return MaxInt64x4(collection)
|
||||
case simdFeatureAVX:
|
||||
return MaxInt64x2(collection)
|
||||
// MaxInt64x2 requires AVX-512; use scalar fallback
|
||||
fallthrough
|
||||
default:
|
||||
return lo.Max(collection)
|
||||
}
|
||||
@@ -560,7 +563,8 @@ func MaxUint64[T ~uint64](collection []T) T {
|
||||
case simdFeatureAVX2:
|
||||
return MaxUint64x4(collection)
|
||||
case simdFeatureAVX:
|
||||
return MaxUint64x2(collection)
|
||||
// MaxUint64x2 requires AVX-512; use scalar fallback
|
||||
fallthrough
|
||||
default:
|
||||
return lo.Max(collection)
|
||||
}
|
||||
@@ -674,7 +678,8 @@ func ClampInt64[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
case simdFeatureAVX2:
|
||||
return ClampInt64x4(collection, min, max)
|
||||
case simdFeatureAVX:
|
||||
return ClampInt64x2(collection, min, max)
|
||||
// ClampInt64x2 requires AVX-512; use scalar fallback
|
||||
fallthrough
|
||||
default:
|
||||
result := make(Slice, len(collection))
|
||||
for i, v := range collection {
|
||||
@@ -770,7 +775,8 @@ func ClampUint64[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
case simdFeatureAVX2:
|
||||
return ClampUint64x4(collection, min, max)
|
||||
case simdFeatureAVX:
|
||||
return ClampUint64x2(collection, min, max)
|
||||
// ClampUint64x2 requires AVX-512; use scalar fallback
|
||||
fallthrough
|
||||
default:
|
||||
result := make(Slice, len(collection))
|
||||
for i, v := range collection {
|
||||
|
||||
@@ -9,9 +9,9 @@ import (
|
||||
"github.com/samber/lo"
|
||||
)
|
||||
|
||||
// SSE (128-bit) SIMD sum functions - 16/8/4/2 lanes
|
||||
// AVX (128-bit) SIMD sum functions - 16/8/4/2 lanes
|
||||
|
||||
// SumInt8x16 sums a slice of int8 using SSE SIMD (Int8x16, 16 lanes).
|
||||
// SumInt8x16 sums a slice of int8 using AVX SIMD (Int8x16, 16 lanes).
|
||||
// Overflow: The accumulation is performed using int8, which can overflow for large collections.
|
||||
// If the sum exceeds the int8 range (-128 to 127), the result will wrap around silently.
|
||||
// For collections that may overflow, consider using a wider type or handle overflow detection externally.
|
||||
@@ -45,7 +45,7 @@ func SumInt8x16[T ~int8](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// SumInt16x8 sums a slice of int16 using SSE SIMD (Int16x8, 8 lanes).
|
||||
// SumInt16x8 sums a slice of int16 using AVX SIMD (Int16x8, 8 lanes).
|
||||
// Overflow: The accumulation is performed using int16, which can overflow for large collections.
|
||||
// If the sum exceeds the int16 range (-32768 to 32767), the result will wrap around silently.
|
||||
// For collections that may overflow, consider using a wider type or handle overflow detection externally.
|
||||
@@ -79,7 +79,7 @@ func SumInt16x8[T ~int16](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// SumInt32x4 sums a slice of int32 using SSE SIMD (Int32x4, 4 lanes).
|
||||
// SumInt32x4 sums a slice of int32 using AVX SIMD (Int32x4, 4 lanes).
|
||||
// Overflow: The accumulation is performed using int32, which can overflow for very large collections.
|
||||
// If the sum exceeds the int32 range (-2147483648 to 2147483647), the result will wrap around silently.
|
||||
// For collections that may overflow, consider using SumInt64x2 or handle overflow detection externally.
|
||||
@@ -113,7 +113,7 @@ func SumInt32x4[T ~int32](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// SumInt64x2 sums a slice of int64 using SSE SIMD (Int64x2, 2 lanes).
|
||||
// SumInt64x2 sums a slice of int64 using AVX SIMD (Int64x2, 2 lanes).
|
||||
// Overflow: The accumulation is performed using int64, which can overflow for extremely large collections.
|
||||
// If the sum exceeds the int64 range, the result will wrap around silently.
|
||||
// For collections that may overflow, handle overflow detection externally (e.g., using big.Int).
|
||||
@@ -147,7 +147,7 @@ func SumInt64x2[T ~int64](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// SumUint8x16 sums a slice of uint8 using SSE SIMD (Uint8x16, 16 lanes).
|
||||
// SumUint8x16 sums a slice of uint8 using AVX SIMD (Uint8x16, 16 lanes).
|
||||
// Overflow: The accumulation is performed using uint8, which can overflow for large collections.
|
||||
// If the sum exceeds the uint8 range (0 to 255), the result will wrap around silently.
|
||||
// For collections that may overflow, consider using a wider type or handle overflow detection externally.
|
||||
@@ -181,7 +181,7 @@ func SumUint8x16[T ~uint8](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// SumUint16x8 sums a slice of uint16 using SSE SIMD (Uint16x8, 8 lanes).
|
||||
// SumUint16x8 sums a slice of uint16 using AVX SIMD (Uint16x8, 8 lanes).
|
||||
// Overflow: The accumulation is performed using uint16, which can overflow for large collections.
|
||||
// If the sum exceeds the uint16 range (0 to 65535), the result will wrap around silently.
|
||||
// For collections that may overflow, consider using a wider type or handle overflow detection externally.
|
||||
@@ -215,7 +215,7 @@ func SumUint16x8[T ~uint16](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// SumUint32x4 sums a slice of uint32 using SSE SIMD (Uint32x4, 4 lanes).
|
||||
// SumUint32x4 sums a slice of uint32 using AVX SIMD (Uint32x4, 4 lanes).
|
||||
// Overflow: The accumulation is performed using uint32, which can overflow for very large collections.
|
||||
// If the sum exceeds the uint32 range (0 to 4294967295), the result will wrap around silently.
|
||||
// For collections that may overflow, consider using SumUint64x2 or handle overflow detection externally.
|
||||
@@ -249,7 +249,7 @@ func SumUint32x4[T ~uint32](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// SumUint64x2 sums a slice of uint64 using SSE SIMD (Uint64x2, 2 lanes).
|
||||
// SumUint64x2 sums a slice of uint64 using AVX SIMD (Uint64x2, 2 lanes).
|
||||
// Overflow: The accumulation is performed using uint64, which can overflow for extremely large collections.
|
||||
// If the sum exceeds the uint64 range, the result will wrap around silently.
|
||||
// For collections that may overflow, handle overflow detection externally (e.g., using big.Int).
|
||||
@@ -283,7 +283,7 @@ func SumUint64x2[T ~uint64](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// SumFloat32x4 sums a slice of float32 using SSE SIMD (Float32x4, 4 lanes).
|
||||
// SumFloat32x4 sums a slice of float32 using AVX SIMD (Float32x4, 4 lanes).
|
||||
// Overflow: The accumulation is performed using float32. Overflow will result in +/-Inf rather than wrapping.
|
||||
// For collections requiring high precision or large sums, consider using SumFloat64x2.
|
||||
func SumFloat32x4[T ~float32](collection []T) T {
|
||||
@@ -316,7 +316,7 @@ func SumFloat32x4[T ~float32](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// SumFloat64x2 sums a slice of float64 using SSE SIMD (Float64x2, 2 lanes).
|
||||
// SumFloat64x2 sums a slice of float64 using AVX SIMD (Float64x2, 2 lanes).
|
||||
// Overflow: The accumulation is performed using float64. Overflow will result in +/-Inf rather than wrapping.
|
||||
// For collections that may overflow, handle overflow detection externally (e.g., using big.Float).
|
||||
func SumFloat64x2[T ~float64](collection []T) T {
|
||||
@@ -349,7 +349,7 @@ func SumFloat64x2[T ~float64](collection []T) T {
|
||||
return sum
|
||||
}
|
||||
|
||||
// MeanInt8x16 calculates the mean of a slice of int8 using SSE SIMD
|
||||
// MeanInt8x16 calculates the mean of a slice of int8 using AVX SIMD
|
||||
func MeanInt8x16[T ~int8](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -359,7 +359,7 @@ func MeanInt8x16[T ~int8](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// MeanInt16x8 calculates the mean of a slice of int16 using SSE SIMD
|
||||
// MeanInt16x8 calculates the mean of a slice of int16 using AVX SIMD
|
||||
func MeanInt16x8[T ~int16](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -369,7 +369,7 @@ func MeanInt16x8[T ~int16](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// MeanInt32x4 calculates the mean of a slice of int32 using SSE SIMD
|
||||
// MeanInt32x4 calculates the mean of a slice of int32 using AVX SIMD
|
||||
func MeanInt32x4[T ~int32](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -379,7 +379,7 @@ func MeanInt32x4[T ~int32](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// MeanInt64x2 calculates the mean of a slice of int64 using SSE SIMD
|
||||
// MeanInt64x2 calculates the mean of a slice of int64 using AVX SIMD
|
||||
func MeanInt64x2[T ~int64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -389,7 +389,7 @@ func MeanInt64x2[T ~int64](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// MeanUint8x16 calculates the mean of a slice of uint8 using SSE SIMD
|
||||
// MeanUint8x16 calculates the mean of a slice of uint8 using AVX SIMD
|
||||
func MeanUint8x16[T ~uint8](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -399,7 +399,7 @@ func MeanUint8x16[T ~uint8](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// MeanUint16x8 calculates the mean of a slice of uint16 using SSE SIMD
|
||||
// MeanUint16x8 calculates the mean of a slice of uint16 using AVX SIMD
|
||||
func MeanUint16x8[T ~uint16](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -409,7 +409,7 @@ func MeanUint16x8[T ~uint16](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// MeanUint32x4 calculates the mean of a slice of uint32 using SSE SIMD
|
||||
// MeanUint32x4 calculates the mean of a slice of uint32 using AVX SIMD
|
||||
func MeanUint32x4[T ~uint32](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -419,7 +419,7 @@ func MeanUint32x4[T ~uint32](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// MeanUint64x2 calculates the mean of a slice of uint64 using SSE SIMD
|
||||
// MeanUint64x2 calculates the mean of a slice of uint64 using AVX SIMD
|
||||
func MeanUint64x2[T ~uint64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -429,7 +429,7 @@ func MeanUint64x2[T ~uint64](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// MeanFloat32x4 calculates the mean of a slice of float32 using SSE SIMD
|
||||
// MeanFloat32x4 calculates the mean of a slice of float32 using AVX SIMD
|
||||
func MeanFloat32x4[T ~float32](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -440,7 +440,7 @@ func MeanFloat32x4[T ~float32](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// MeanFloat64x2 calculates the mean of a slice of float64 using SSE SIMD
|
||||
// MeanFloat64x2 calculates the mean of a slice of float64 using AVX SIMD
|
||||
func MeanFloat64x2[T ~float64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -450,7 +450,7 @@ func MeanFloat64x2[T ~float64](collection []T) T {
|
||||
return sum / T(length)
|
||||
}
|
||||
|
||||
// ClampInt8x16 clamps each element in collection between min and max values using SSE SIMD
|
||||
// ClampInt8x16 clamps each element in collection between min and max values using AVX SIMD
|
||||
func ClampInt8x16[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -488,7 +488,7 @@ func ClampInt8x16[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampInt16x8 clamps each element in collection between min and max values using SSE SIMD
|
||||
// ClampInt16x8 clamps each element in collection between min and max values using AVX SIMD
|
||||
func ClampInt16x8[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -526,7 +526,7 @@ func ClampInt16x8[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampInt32x4 clamps each element in collection between min and max values using SSE SIMD
|
||||
// ClampInt32x4 clamps each element in collection between min and max values using AVX SIMD
|
||||
func ClampInt32x4[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -564,45 +564,7 @@ func ClampInt32x4[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampInt64x2 clamps each element in collection between min and max values using SSE SIMD and AVX-512 SIMD.
|
||||
func ClampInt64x2[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return collection
|
||||
}
|
||||
|
||||
result := make(Slice, length)
|
||||
const lanes = simdLanes2
|
||||
|
||||
base := unsafeSliceInt64(collection, length)
|
||||
|
||||
minVec := archsimd.BroadcastInt64x2(int64(min))
|
||||
maxVec := archsimd.BroadcastInt64x2(int64(max))
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
|
||||
|
||||
clamped := v.Max(minVec).Min(maxVec)
|
||||
|
||||
// bearer:disable go_gosec_unsafe_unsafe
|
||||
clamped.Store((*[lanes]int64)(unsafe.Pointer(&result[i])))
|
||||
}
|
||||
|
||||
for ; i < length; i++ {
|
||||
val := collection[i]
|
||||
if val < min {
|
||||
val = min
|
||||
} else if val > max {
|
||||
val = max
|
||||
}
|
||||
result[i] = val
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampUint8x16 clamps each element in collection between min and max values using SSE SIMD
|
||||
// ClampUint8x16 clamps each element in collection between min and max values using AVX SIMD
|
||||
func ClampUint8x16[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -640,7 +602,7 @@ func ClampUint8x16[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampUint16x8 clamps each element in collection between min and max values using SSE SIMD
|
||||
// ClampUint16x8 clamps each element in collection between min and max values using AVX SIMD
|
||||
func ClampUint16x8[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -678,7 +640,7 @@ func ClampUint16x8[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampUint32x4 clamps each element in collection between min and max values using SSE SIMD
|
||||
// ClampUint32x4 clamps each element in collection between min and max values using AVX SIMD
|
||||
func ClampUint32x4[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -716,45 +678,7 @@ func ClampUint32x4[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampUint64x2 clamps each element in collection between min and max values using SSE SIMD and AVX-512 SIMD.
|
||||
func ClampUint64x2[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return collection
|
||||
}
|
||||
|
||||
result := make(Slice, length)
|
||||
const lanes = simdLanes2
|
||||
|
||||
base := unsafeSliceUint64(collection, length)
|
||||
|
||||
minVec := archsimd.BroadcastUint64x2(uint64(min))
|
||||
maxVec := archsimd.BroadcastUint64x2(uint64(max))
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
|
||||
|
||||
clamped := v.Max(minVec).Min(maxVec)
|
||||
|
||||
// bearer:disable go_gosec_unsafe_unsafe
|
||||
clamped.Store((*[lanes]uint64)(unsafe.Pointer(&result[i])))
|
||||
}
|
||||
|
||||
for ; i < length; i++ {
|
||||
val := collection[i]
|
||||
if val < min {
|
||||
val = min
|
||||
} else if val > max {
|
||||
val = max
|
||||
}
|
||||
result[i] = val
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampFloat32x4 clamps each element in collection between min and max values using SSE SIMD
|
||||
// ClampFloat32x4 clamps each element in collection between min and max values using AVX SIMD
|
||||
func ClampFloat32x4[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -792,7 +716,7 @@ func ClampFloat32x4[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampFloat64x2 clamps each element in collection between min and max values using SSE SIMD
|
||||
// ClampFloat64x2 clamps each element in collection between min and max values using AVX SIMD
|
||||
func ClampFloat64x2[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -830,7 +754,7 @@ func ClampFloat64x2[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice
|
||||
return result
|
||||
}
|
||||
|
||||
// MinInt8x16 finds the minimum value in a collection of int8 using SSE SIMD
|
||||
// MinInt8x16 finds the minimum value in a collection of int8 using AVX SIMD
|
||||
func MinInt8x16[T ~int8](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -877,7 +801,7 @@ func MinInt8x16[T ~int8](collection []T) T {
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinInt16x8 finds the minimum value in a collection of int16 using SSE SIMD
|
||||
// MinInt16x8 finds the minimum value in a collection of int16 using AVX SIMD
|
||||
func MinInt16x8[T ~int16](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -921,7 +845,7 @@ func MinInt16x8[T ~int16](collection []T) T {
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinInt32x4 finds the minimum value in a collection of int32 using SSE SIMD
|
||||
// MinInt32x4 finds the minimum value in a collection of int32 using AVX SIMD
|
||||
func MinInt32x4[T ~int32](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -965,51 +889,7 @@ func MinInt32x4[T ~int32](collection []T) T {
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinInt64x2 finds the minimum value in a collection of int64 using SSE SIMD
|
||||
func MinInt64x2[T ~int64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
const lanes = simdLanes2
|
||||
base := unsafeSliceInt64(collection, length)
|
||||
|
||||
var minVec archsimd.Int64x2
|
||||
firstInitialized := false
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
|
||||
|
||||
if !firstInitialized {
|
||||
minVec = v
|
||||
firstInitialized = true
|
||||
} else {
|
||||
minVec = minVec.Min(v)
|
||||
}
|
||||
}
|
||||
|
||||
// Find minimum in the vector (only if we processed any vectors)
|
||||
var minVal int64
|
||||
if firstInitialized {
|
||||
var buf [lanes]int64
|
||||
minVec.Store(&buf)
|
||||
minVal = min(buf[0], buf[1])
|
||||
}
|
||||
|
||||
// Handle remaining elements
|
||||
for ; i < length; i++ {
|
||||
if !firstInitialized || collection[i] < T(minVal) {
|
||||
minVal = int64(collection[i])
|
||||
firstInitialized = true
|
||||
}
|
||||
}
|
||||
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinUint8x16 finds the minimum value in a collection of uint8 using SSE SIMD
|
||||
// MinUint8x16 finds the minimum value in a collection of uint8 using AVX SIMD
|
||||
func MinUint8x16[T ~uint8](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1056,7 +936,7 @@ func MinUint8x16[T ~uint8](collection []T) T {
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinUint16x8 finds the minimum value in a collection of uint16 using SSE SIMD
|
||||
// MinUint16x8 finds the minimum value in a collection of uint16 using AVX SIMD
|
||||
func MinUint16x8[T ~uint16](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1100,7 +980,7 @@ func MinUint16x8[T ~uint16](collection []T) T {
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinUint32x4 finds the minimum value in a collection of uint32 using SSE SIMD
|
||||
// MinUint32x4 finds the minimum value in a collection of uint32 using AVX SIMD
|
||||
func MinUint32x4[T ~uint32](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1144,51 +1024,7 @@ func MinUint32x4[T ~uint32](collection []T) T {
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinUint64x2 finds the minimum value in a collection of uint64 using SSE SIMD
|
||||
func MinUint64x2[T ~uint64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
const lanes = simdLanes2
|
||||
base := unsafeSliceUint64(collection, length)
|
||||
|
||||
var minVec archsimd.Uint64x2
|
||||
firstInitialized := false
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
|
||||
|
||||
if !firstInitialized {
|
||||
minVec = v
|
||||
firstInitialized = true
|
||||
} else {
|
||||
minVec = minVec.Min(v)
|
||||
}
|
||||
}
|
||||
|
||||
// Find minimum in the vector (only if we processed any vectors)
|
||||
var minVal uint64
|
||||
if firstInitialized {
|
||||
var buf [lanes]uint64
|
||||
minVec.Store(&buf)
|
||||
minVal = min(buf[0], buf[1])
|
||||
}
|
||||
|
||||
// Handle remaining elements
|
||||
for ; i < length; i++ {
|
||||
if !firstInitialized || collection[i] < T(minVal) {
|
||||
minVal = uint64(collection[i])
|
||||
firstInitialized = true
|
||||
}
|
||||
}
|
||||
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinFloat32x4 finds the minimum value in a collection of float32 using SSE SIMD
|
||||
// MinFloat32x4 finds the minimum value in a collection of float32 using AVX SIMD
|
||||
func MinFloat32x4[T ~float32](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1232,7 +1068,7 @@ func MinFloat32x4[T ~float32](collection []T) T {
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinFloat64x2 finds the minimum value in a collection of float64 using SSE SIMD
|
||||
// MinFloat64x2 finds the minimum value in a collection of float64 using AVX SIMD
|
||||
func MinFloat64x2[T ~float64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1276,7 +1112,7 @@ func MinFloat64x2[T ~float64](collection []T) T {
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MaxInt8x16 finds the maximum value in a collection of int8 using SSE SIMD
|
||||
// MaxInt8x16 finds the maximum value in a collection of int8 using AVX SIMD
|
||||
func MaxInt8x16[T ~int8](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1323,7 +1159,7 @@ func MaxInt8x16[T ~int8](collection []T) T {
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxInt16x8 finds the maximum value in a collection of int16 using SSE SIMD
|
||||
// MaxInt16x8 finds the maximum value in a collection of int16 using AVX SIMD
|
||||
func MaxInt16x8[T ~int16](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1367,7 +1203,7 @@ func MaxInt16x8[T ~int16](collection []T) T {
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxInt32x4 finds the maximum value in a collection of int32 using SSE SIMD
|
||||
// MaxInt32x4 finds the maximum value in a collection of int32 using AVX SIMD
|
||||
func MaxInt32x4[T ~int32](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1411,51 +1247,7 @@ func MaxInt32x4[T ~int32](collection []T) T {
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxInt64x2 finds the maximum value in a collection of int64 using SSE SIMD
|
||||
func MaxInt64x2[T ~int64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
const lanes = simdLanes2
|
||||
base := unsafeSliceInt64(collection, length)
|
||||
|
||||
var maxVec archsimd.Int64x2
|
||||
firstInitialized := false
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
|
||||
|
||||
if !firstInitialized {
|
||||
maxVec = v
|
||||
firstInitialized = true
|
||||
} else {
|
||||
maxVec = maxVec.Max(v)
|
||||
}
|
||||
}
|
||||
|
||||
// Find maximum in the vector (only if we processed any vectors)
|
||||
var maxVal int64
|
||||
if firstInitialized {
|
||||
var buf [lanes]int64
|
||||
maxVec.Store(&buf)
|
||||
maxVal = max(buf[0], buf[1])
|
||||
}
|
||||
|
||||
// Handle remaining elements
|
||||
for ; i < length; i++ {
|
||||
if !firstInitialized || collection[i] > T(maxVal) {
|
||||
maxVal = int64(collection[i])
|
||||
firstInitialized = true
|
||||
}
|
||||
}
|
||||
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxUint8x16 finds the maximum value in a collection of uint8 using SSE SIMD
|
||||
// MaxUint8x16 finds the maximum value in a collection of uint8 using AVX SIMD
|
||||
func MaxUint8x16[T ~uint8](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1502,7 +1294,7 @@ func MaxUint8x16[T ~uint8](collection []T) T {
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxUint16x8 finds the maximum value in a collection of uint16 using SSE SIMD
|
||||
// MaxUint16x8 finds the maximum value in a collection of uint16 using AVX SIMD
|
||||
func MaxUint16x8[T ~uint16](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1546,7 +1338,7 @@ func MaxUint16x8[T ~uint16](collection []T) T {
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxUint32x4 finds the maximum value in a collection of uint32 using SSE SIMD
|
||||
// MaxUint32x4 finds the maximum value in a collection of uint32 using AVX SIMD
|
||||
func MaxUint32x4[T ~uint32](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1590,51 +1382,7 @@ func MaxUint32x4[T ~uint32](collection []T) T {
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxUint64x2 finds the maximum value in a collection of uint64 using SSE SIMD
|
||||
func MaxUint64x2[T ~uint64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
const lanes = simdLanes2
|
||||
base := unsafeSliceUint64(collection, length)
|
||||
|
||||
var maxVec archsimd.Uint64x2
|
||||
firstInitialized := false
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
|
||||
|
||||
if !firstInitialized {
|
||||
maxVec = v
|
||||
firstInitialized = true
|
||||
} else {
|
||||
maxVec = maxVec.Max(v)
|
||||
}
|
||||
}
|
||||
|
||||
// Find maximum in the vector (only if we processed any vectors)
|
||||
var maxVal uint64
|
||||
if firstInitialized {
|
||||
var buf [lanes]uint64
|
||||
maxVec.Store(&buf)
|
||||
maxVal = max(buf[0], buf[1])
|
||||
}
|
||||
|
||||
// Handle remaining elements
|
||||
for ; i < length; i++ {
|
||||
if !firstInitialized || collection[i] > T(maxVal) {
|
||||
maxVal = uint64(collection[i])
|
||||
firstInitialized = true
|
||||
}
|
||||
}
|
||||
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxFloat32x4 finds the maximum value in a collection of float32 using SSE SIMD
|
||||
// MaxFloat32x4 finds the maximum value in a collection of float32 using AVX SIMD
|
||||
func MaxFloat32x4[T ~float32](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1678,7 +1426,7 @@ func MaxFloat32x4[T ~float32](collection []T) T {
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxFloat64x2 finds the maximum value in a collection of float64 using SSE SIMD
|
||||
// MaxFloat64x2 finds the maximum value in a collection of float64 using AVX SIMD
|
||||
func MaxFloat64x2[T ~float64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
@@ -1722,127 +1470,127 @@ func MaxFloat64x2[T ~float64](collection []T) T {
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// SSE (128-bit) SIMD sumBy functions - 16/8/4/2 lanes
|
||||
// AVX (128-bit) SIMD sumBy functions - 16/8/4/2 lanes
|
||||
// These implementations use lo.Map to apply the iteratee, then chain with SIMD sum functions.
|
||||
|
||||
// SumByInt8x16 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByInt8x16 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumInt8x16(mapped)
|
||||
}
|
||||
|
||||
// SumByInt16x8 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByInt16x8 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumInt16x8(mapped)
|
||||
}
|
||||
|
||||
// SumByInt32x4 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByInt32x4 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumInt32x4(mapped)
|
||||
}
|
||||
|
||||
// SumByInt64x2 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByInt64x2 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumInt64x2(mapped)
|
||||
}
|
||||
|
||||
// SumByUint8x16 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByUint8x16 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumUint8x16(mapped)
|
||||
}
|
||||
|
||||
// SumByUint16x8 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByUint16x8 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumUint16x8(mapped)
|
||||
}
|
||||
|
||||
// SumByUint32x4 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByUint32x4 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumUint32x4(mapped)
|
||||
}
|
||||
|
||||
// SumByUint64x2 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByUint64x2 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumUint64x2(mapped)
|
||||
}
|
||||
|
||||
// SumByFloat32x4 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByFloat32x4 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumFloat32x4(mapped)
|
||||
}
|
||||
|
||||
// SumByFloat64x2 sums the values extracted by iteratee from a slice using SSE SIMD.
|
||||
// SumByFloat64x2 sums the values extracted by iteratee from a slice using AVX SIMD.
|
||||
func SumByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return SumFloat64x2(mapped)
|
||||
}
|
||||
|
||||
// SSE (128-bit) SIMD meanBy functions - 16/8/4/2 lanes
|
||||
// AVX (128-bit) SIMD meanBy functions - 16/8/4/2 lanes
|
||||
// These implementations use lo.Map to apply the iteratee, then chain with SIMD mean functions.
|
||||
|
||||
// MeanByInt8x16 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByInt8x16 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanInt8x16(mapped)
|
||||
}
|
||||
|
||||
// MeanByInt16x8 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByInt16x8 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanInt16x8(mapped)
|
||||
}
|
||||
|
||||
// MeanByInt32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByInt32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanInt32x4(mapped)
|
||||
}
|
||||
|
||||
// MeanByInt64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByInt64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanInt64x2(mapped)
|
||||
}
|
||||
|
||||
// MeanByUint8x16 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByUint8x16 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanUint8x16(mapped)
|
||||
}
|
||||
|
||||
// MeanByUint16x8 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByUint16x8 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanUint16x8(mapped)
|
||||
}
|
||||
|
||||
// MeanByUint32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByUint32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanUint32x4(mapped)
|
||||
}
|
||||
|
||||
// MeanByUint64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByUint64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanUint64x2(mapped)
|
||||
}
|
||||
|
||||
// MeanByFloat32x4 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByFloat32x4 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanFloat32x4(mapped)
|
||||
}
|
||||
|
||||
// MeanByFloat64x2 calculates the mean of values extracted by iteratee from a slice using SSE SIMD.
|
||||
// MeanByFloat64x2 calculates the mean of values extracted by iteratee from a slice using AVX SIMD.
|
||||
func MeanByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R {
|
||||
mapped := lo.Map(collection, func(item T, _ int) R { return iteratee(item) })
|
||||
return MeanFloat64x2(mapped)
|
||||
@@ -566,6 +566,84 @@ func ClampInt32x16[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampInt64x2 clamps each element in collection between min and max values using AVX-512 SIMD.
|
||||
// Int64x2 Min/Max operations in archsimd require AVX-512 (VPMAXSQ/VPMINSQ).
|
||||
func ClampInt64x2[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return collection
|
||||
}
|
||||
|
||||
result := make(Slice, length)
|
||||
const lanes = simdLanes2
|
||||
|
||||
base := unsafeSliceInt64(collection, length)
|
||||
|
||||
minVec := archsimd.BroadcastInt64x2(int64(min))
|
||||
maxVec := archsimd.BroadcastInt64x2(int64(max))
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
|
||||
|
||||
clamped := v.Max(minVec).Min(maxVec)
|
||||
|
||||
// bearer:disable go_gosec_unsafe_unsafe
|
||||
clamped.Store((*[lanes]int64)(unsafe.Pointer(&result[i])))
|
||||
}
|
||||
|
||||
for ; i < length; i++ {
|
||||
val := collection[i]
|
||||
if val < min {
|
||||
val = min
|
||||
} else if val > max {
|
||||
val = max
|
||||
}
|
||||
result[i] = val
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampUint64x2 clamps each element in collection between min and max values using AVX-512 SIMD.
|
||||
// Uint64x2 Min/Max operations in archsimd require AVX-512.
|
||||
func ClampUint64x2[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return collection
|
||||
}
|
||||
|
||||
result := make(Slice, length)
|
||||
const lanes = simdLanes2
|
||||
|
||||
base := unsafeSliceUint64(collection, length)
|
||||
|
||||
minVec := archsimd.BroadcastUint64x2(uint64(min))
|
||||
maxVec := archsimd.BroadcastUint64x2(uint64(max))
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
|
||||
|
||||
clamped := v.Max(minVec).Min(maxVec)
|
||||
|
||||
// bearer:disable go_gosec_unsafe_unsafe
|
||||
clamped.Store((*[lanes]uint64)(unsafe.Pointer(&result[i])))
|
||||
}
|
||||
|
||||
for ; i < length; i++ {
|
||||
val := collection[i]
|
||||
if val < min {
|
||||
val = min
|
||||
} else if val > max {
|
||||
val = max
|
||||
}
|
||||
result[i] = val
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// ClampInt64x8 clamps each element in collection between min and max values using AVX-512 SIMD
|
||||
func ClampInt64x8[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice {
|
||||
length := uint(len(collection))
|
||||
@@ -991,6 +1069,96 @@ func MinInt32x16[T ~int32](collection []T) T {
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinInt64x2 finds the minimum value in a collection of int64 using AVX-512 SIMD.
|
||||
// Int64x2 Min operations in archsimd require AVX-512.
|
||||
func MinInt64x2[T ~int64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
const lanes = simdLanes2
|
||||
base := unsafeSliceInt64(collection, length)
|
||||
|
||||
var minVec archsimd.Int64x2
|
||||
firstInitialized := false
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
|
||||
|
||||
if !firstInitialized {
|
||||
minVec = v
|
||||
firstInitialized = true
|
||||
} else {
|
||||
minVec = minVec.Min(v)
|
||||
}
|
||||
}
|
||||
|
||||
// Find minimum in the vector (only if we processed any vectors)
|
||||
var minVal int64
|
||||
if firstInitialized {
|
||||
var buf [lanes]int64
|
||||
minVec.Store(&buf)
|
||||
minVal = min(buf[0], buf[1])
|
||||
}
|
||||
|
||||
// Handle remaining elements
|
||||
for ; i < length; i++ {
|
||||
if !firstInitialized || collection[i] < T(minVal) {
|
||||
minVal = int64(collection[i])
|
||||
firstInitialized = true
|
||||
}
|
||||
}
|
||||
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinUint64x2 finds the minimum value in a collection of uint64 using AVX-512 SIMD.
|
||||
// Uint64x2 Min operations in archsimd require AVX-512.
|
||||
func MinUint64x2[T ~uint64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
const lanes = simdLanes2
|
||||
base := unsafeSliceUint64(collection, length)
|
||||
|
||||
var minVec archsimd.Uint64x2
|
||||
firstInitialized := false
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
|
||||
|
||||
if !firstInitialized {
|
||||
minVec = v
|
||||
firstInitialized = true
|
||||
} else {
|
||||
minVec = minVec.Min(v)
|
||||
}
|
||||
}
|
||||
|
||||
// Find minimum in the vector (only if we processed any vectors)
|
||||
var minVal uint64
|
||||
if firstInitialized {
|
||||
var buf [lanes]uint64
|
||||
minVec.Store(&buf)
|
||||
minVal = min(buf[0], buf[1])
|
||||
}
|
||||
|
||||
// Handle remaining elements
|
||||
for ; i < length; i++ {
|
||||
if !firstInitialized || collection[i] < T(minVal) {
|
||||
minVal = uint64(collection[i])
|
||||
firstInitialized = true
|
||||
}
|
||||
}
|
||||
|
||||
return T(minVal)
|
||||
}
|
||||
|
||||
// MinInt64x8 finds the minimum value in a collection of int64 using AVX-512 SIMD
|
||||
func MinInt64x8[T ~int64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
@@ -1478,6 +1646,96 @@ func MaxInt32x16[T ~int32](collection []T) T {
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxInt64x2 finds the maximum value in a collection of int64 using AVX-512 SIMD.
|
||||
// Int64x2 Max operations in archsimd require AVX-512.
|
||||
func MaxInt64x2[T ~int64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
const lanes = simdLanes2
|
||||
base := unsafeSliceInt64(collection, length)
|
||||
|
||||
var maxVec archsimd.Int64x2
|
||||
firstInitialized := false
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadInt64x2Slice(base[i : i+lanes])
|
||||
|
||||
if !firstInitialized {
|
||||
maxVec = v
|
||||
firstInitialized = true
|
||||
} else {
|
||||
maxVec = maxVec.Max(v)
|
||||
}
|
||||
}
|
||||
|
||||
// Find maximum in the vector (only if we processed any vectors)
|
||||
var maxVal int64
|
||||
if firstInitialized {
|
||||
var buf [lanes]int64
|
||||
maxVec.Store(&buf)
|
||||
maxVal = max(buf[0], buf[1])
|
||||
}
|
||||
|
||||
// Handle remaining elements
|
||||
for ; i < length; i++ {
|
||||
if !firstInitialized || collection[i] > T(maxVal) {
|
||||
maxVal = int64(collection[i])
|
||||
firstInitialized = true
|
||||
}
|
||||
}
|
||||
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxUint64x2 finds the maximum value in a collection of uint64 using AVX-512 SIMD.
|
||||
// Uint64x2 Max operations in archsimd require AVX-512.
|
||||
func MaxUint64x2[T ~uint64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
if length == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
const lanes = simdLanes2
|
||||
base := unsafeSliceUint64(collection, length)
|
||||
|
||||
var maxVec archsimd.Uint64x2
|
||||
firstInitialized := false
|
||||
|
||||
i := uint(0)
|
||||
for ; i+lanes <= length; i += lanes {
|
||||
v := archsimd.LoadUint64x2Slice(base[i : i+lanes])
|
||||
|
||||
if !firstInitialized {
|
||||
maxVec = v
|
||||
firstInitialized = true
|
||||
} else {
|
||||
maxVec = maxVec.Max(v)
|
||||
}
|
||||
}
|
||||
|
||||
// Find maximum in the vector (only if we processed any vectors)
|
||||
var maxVal uint64
|
||||
if firstInitialized {
|
||||
var buf [lanes]uint64
|
||||
maxVec.Store(&buf)
|
||||
maxVal = max(buf[0], buf[1])
|
||||
}
|
||||
|
||||
// Handle remaining elements
|
||||
for ; i < length; i++ {
|
||||
if !firstInitialized || collection[i] > T(maxVal) {
|
||||
maxVal = uint64(collection[i])
|
||||
firstInitialized = true
|
||||
}
|
||||
}
|
||||
|
||||
return T(maxVal)
|
||||
}
|
||||
|
||||
// MaxInt64x8 finds the maximum value in a collection of int64 using AVX-512 SIMD
|
||||
func MaxInt64x8[T ~int64](collection []T) T {
|
||||
length := uint(len(collection))
|
||||
|
||||
@@ -819,6 +819,55 @@ func TestClampInt32x16(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestClampInt64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int64
|
||||
min int64
|
||||
max int64
|
||||
}{
|
||||
{"empty", []int64{}, -100, 100},
|
||||
{"single", []int64{42}, -10, 10},
|
||||
{"small", []int64{1, 2, 3, 4, 5}, 2, 4},
|
||||
{"exactly 2", []int64{-100, 200}, -50, 50},
|
||||
{"large", make([]int64, 1000), -50, 50},
|
||||
{"all below min", []int64{-1000, -2000, -3000}, -500, 100},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Int64()
|
||||
}
|
||||
}
|
||||
|
||||
got := ClampInt64x2(tc.input, tc.min, tc.max)
|
||||
|
||||
if len(got) != len(tc.input) {
|
||||
t.Errorf("ClampInt64x2() returned length %d, want %d", len(got), len(tc.input))
|
||||
}
|
||||
|
||||
for i, v := range got {
|
||||
if v < tc.min || v > tc.max {
|
||||
t.Errorf("ClampInt64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
|
||||
}
|
||||
original := tc.input[i]
|
||||
expected := original
|
||||
if expected < tc.min {
|
||||
expected = tc.min
|
||||
} else if expected > tc.max {
|
||||
expected = tc.max
|
||||
}
|
||||
if v != expected {
|
||||
t.Errorf("ClampInt64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClampInt64x8(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
@@ -1018,6 +1067,55 @@ func TestClampUint32x16(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestClampUint64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint64
|
||||
min uint64
|
||||
max uint64
|
||||
}{
|
||||
{"empty", []uint64{}, 100, 1000},
|
||||
{"single", []uint64{42}, 10, 100},
|
||||
{"small", []uint64{1, 2, 3, 4, 5}, 2, 4},
|
||||
{"exactly 2", []uint64{50, 2000}, 100, 1000},
|
||||
{"large", make([]uint64, 1000), 500, 5000},
|
||||
{"all below min", []uint64{1, 2, 3}, 10, 100},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Uint64()
|
||||
}
|
||||
}
|
||||
|
||||
got := ClampUint64x2(tc.input, tc.min, tc.max)
|
||||
|
||||
if len(got) != len(tc.input) {
|
||||
t.Errorf("ClampUint64x2() returned length %d, want %d", len(got), len(tc.input))
|
||||
}
|
||||
|
||||
for i, v := range got {
|
||||
if v < tc.min || v > tc.max {
|
||||
t.Errorf("ClampUint64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
|
||||
}
|
||||
original := tc.input[i]
|
||||
expected := original
|
||||
if expected < tc.min {
|
||||
expected = tc.min
|
||||
} else if expected > tc.max {
|
||||
expected = tc.max
|
||||
}
|
||||
if v != expected {
|
||||
t.Errorf("ClampUint64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClampUint64x8(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
@@ -1292,6 +1390,38 @@ func TestMinInt32x16(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinInt64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int64
|
||||
}{
|
||||
{"empty", []int64{}},
|
||||
{"single", []int64{42}},
|
||||
{"small", []int64{1, 2, 3, 4, 5}},
|
||||
{"exactly 2", []int64{1, 2}},
|
||||
{"large", make([]int64, 1000)},
|
||||
{"negative", []int64{-1, -2, -3, 4, 5}},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Int64()
|
||||
}
|
||||
}
|
||||
|
||||
got := MinInt64x2(tc.input)
|
||||
want := lo.Min(tc.input)
|
||||
|
||||
if got != want {
|
||||
t.Errorf("MinInt64x2() = %v, want %v", got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinInt64x8(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
@@ -1419,6 +1549,37 @@ func TestMinUint32x16(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinUint64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint64
|
||||
}{
|
||||
{"empty", []uint64{}},
|
||||
{"single", []uint64{42}},
|
||||
{"small", []uint64{1, 2, 3, 4, 5}},
|
||||
{"exactly 2", []uint64{1, 2}},
|
||||
{"large", make([]uint64, 1000)},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Uint64()
|
||||
}
|
||||
}
|
||||
|
||||
got := MinUint64x2(tc.input)
|
||||
want := lo.Min(tc.input)
|
||||
|
||||
if got != want {
|
||||
t.Errorf("MinUint64x2() = %v, want %v", got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinUint64x8(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
@@ -1625,6 +1786,38 @@ func TestMaxInt32x16(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxInt64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int64
|
||||
}{
|
||||
{"empty", []int64{}},
|
||||
{"single", []int64{42}},
|
||||
{"small", []int64{1, 2, 3, 4, 5}},
|
||||
{"exactly 2", []int64{1, 2}},
|
||||
{"large", make([]int64, 1000)},
|
||||
{"negative", []int64{-1, -2, -3, 4, 5}},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Int64()
|
||||
}
|
||||
}
|
||||
|
||||
got := MaxInt64x2(tc.input)
|
||||
want := lo.Max(tc.input)
|
||||
|
||||
if got != want {
|
||||
t.Errorf("MaxInt64x2() = %v, want %v", got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxInt64x8(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
@@ -1752,6 +1945,37 @@ func TestMaxUint32x16(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxUint64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint64
|
||||
}{
|
||||
{"empty", []uint64{}},
|
||||
{"single", []uint64{42}},
|
||||
{"small", []uint64{1, 2, 3, 4, 5}},
|
||||
{"exactly 2", []uint64{1, 2}},
|
||||
{"large", make([]uint64, 1000)},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Uint64()
|
||||
}
|
||||
}
|
||||
|
||||
got := MaxUint64x2(tc.input)
|
||||
want := lo.Max(tc.input)
|
||||
|
||||
if got != want {
|
||||
t.Errorf("MaxUint64x2() = %v, want %v", got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxUint64x8(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
)
|
||||
|
||||
func TestSumInt8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int8
|
||||
@@ -42,6 +43,7 @@ func TestSumInt8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumInt16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int16
|
||||
@@ -74,6 +76,7 @@ func TestSumInt16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumInt32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int32
|
||||
@@ -105,6 +108,7 @@ func TestSumInt32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumInt64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int64
|
||||
@@ -136,6 +140,7 @@ func TestSumInt64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumUint8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint8
|
||||
@@ -167,6 +172,7 @@ func TestSumUint8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumUint16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint16
|
||||
@@ -198,6 +204,7 @@ func TestSumUint16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumUint32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint32
|
||||
@@ -228,6 +235,7 @@ func TestSumUint32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumUint64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint64
|
||||
@@ -258,6 +266,7 @@ func TestSumUint64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumFloat32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float32
|
||||
@@ -291,6 +300,7 @@ func TestSumFloat32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumFloat64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float64
|
||||
@@ -323,7 +333,8 @@ func TestSumFloat64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test type aliases work correctly
|
||||
func TestSSETypeAlias(t *testing.T) {
|
||||
func TestAVXTypeAlias(t *testing.T) {
|
||||
requireAVX(t)
|
||||
input := []myInt8{1, 2, 3, 4, 5}
|
||||
got := SumInt8x16(input)
|
||||
want := lo.Sum(input)
|
||||
@@ -334,6 +345,7 @@ func TestSSETypeAlias(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestClampInt8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int8
|
||||
@@ -385,6 +397,7 @@ func TestClampInt8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestClampInt16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int16
|
||||
@@ -434,6 +447,7 @@ func TestClampInt16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestClampInt32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int32
|
||||
@@ -481,56 +495,8 @@ func TestClampInt32x4(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestClampInt64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int64
|
||||
min int64
|
||||
max int64
|
||||
}{
|
||||
{"empty", []int64{}, -100, 100},
|
||||
{"single", []int64{42}, -10, 10},
|
||||
{"small", []int64{1, 2, 3, 4, 5}, 2, 4},
|
||||
{"exactly 2", []int64{-100, 200}, -50, 50},
|
||||
{"large", make([]int64, 1000), -50, 50},
|
||||
{"all below min", []int64{-1000, -2000, -3000}, -500, 100},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Int64()
|
||||
}
|
||||
}
|
||||
|
||||
got := ClampInt64x2(tc.input, tc.min, tc.max)
|
||||
|
||||
if len(got) != len(tc.input) {
|
||||
t.Errorf("ClampInt64x2() returned length %d, want %d", len(got), len(tc.input))
|
||||
}
|
||||
|
||||
for i, v := range got {
|
||||
if v < tc.min || v > tc.max {
|
||||
t.Errorf("ClampInt64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
|
||||
}
|
||||
original := tc.input[i]
|
||||
expected := original
|
||||
if expected < tc.min {
|
||||
expected = tc.min
|
||||
} else if expected > tc.max {
|
||||
expected = tc.max
|
||||
}
|
||||
if v != expected {
|
||||
t.Errorf("ClampInt64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClampUint8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint8
|
||||
@@ -581,6 +547,7 @@ func TestClampUint8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestClampUint16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint16
|
||||
@@ -630,6 +597,7 @@ func TestClampUint16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestClampUint32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint32
|
||||
@@ -677,56 +645,8 @@ func TestClampUint32x4(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestClampUint64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint64
|
||||
min uint64
|
||||
max uint64
|
||||
}{
|
||||
{"empty", []uint64{}, 100, 1000},
|
||||
{"single", []uint64{42}, 10, 100},
|
||||
{"small", []uint64{1, 2, 3, 4, 5}, 2, 4},
|
||||
{"exactly 2", []uint64{50, 2000}, 100, 1000},
|
||||
{"large", make([]uint64, 1000), 500, 5000},
|
||||
{"all below min", []uint64{1, 2, 3}, 10, 100},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Uint64()
|
||||
}
|
||||
}
|
||||
|
||||
got := ClampUint64x2(tc.input, tc.min, tc.max)
|
||||
|
||||
if len(got) != len(tc.input) {
|
||||
t.Errorf("ClampUint64x2() returned length %d, want %d", len(got), len(tc.input))
|
||||
}
|
||||
|
||||
for i, v := range got {
|
||||
if v < tc.min || v > tc.max {
|
||||
t.Errorf("ClampUint64x2()[%d] = %v, outside range [%v, %v]", i, v, tc.min, tc.max)
|
||||
}
|
||||
original := tc.input[i]
|
||||
expected := original
|
||||
if expected < tc.min {
|
||||
expected = tc.min
|
||||
} else if expected > tc.max {
|
||||
expected = tc.max
|
||||
}
|
||||
if v != expected {
|
||||
t.Errorf("ClampUint64x2()[%d] = %v, want %v (original: %v)", i, v, expected, original)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClampFloat32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float32
|
||||
@@ -778,6 +698,7 @@ func TestClampFloat32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestClampFloat64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float64
|
||||
@@ -829,7 +750,8 @@ func TestClampFloat64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test type aliases work correctly
|
||||
func TestSSEClampTypeAlias(t *testing.T) {
|
||||
func TestAVXClampTypeAlias(t *testing.T) {
|
||||
requireAVX(t)
|
||||
input := []myInt8{-5, 0, 10, 15, 20}
|
||||
min := myInt8(0)
|
||||
max := myInt8(10)
|
||||
@@ -853,6 +775,7 @@ func TestSSEClampTypeAlias(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanInt8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int8
|
||||
@@ -884,6 +807,7 @@ func TestMeanInt8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanInt16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int16
|
||||
@@ -915,6 +839,7 @@ func TestMeanInt16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanInt32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int32
|
||||
@@ -946,6 +871,7 @@ func TestMeanInt32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanInt64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int64
|
||||
@@ -977,6 +903,7 @@ func TestMeanInt64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanUint8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint8
|
||||
@@ -1008,6 +935,7 @@ func TestMeanUint8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanUint16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint16
|
||||
@@ -1039,6 +967,7 @@ func TestMeanUint16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanUint32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint32
|
||||
@@ -1069,6 +998,7 @@ func TestMeanUint32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanUint64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint64
|
||||
@@ -1099,6 +1029,7 @@ func TestMeanUint64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanFloat32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float32
|
||||
@@ -1132,6 +1063,7 @@ func TestMeanFloat32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanFloat64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float64
|
||||
@@ -1164,7 +1096,8 @@ func TestMeanFloat64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test type aliases work correctly
|
||||
func TestSSEMeanTypeAlias(t *testing.T) {
|
||||
func TestAVXMeanTypeAlias(t *testing.T) {
|
||||
requireAVX(t)
|
||||
input := []myInt8{1, 2, 3, 4, 5}
|
||||
got := MeanInt8x16(input)
|
||||
want := lo.Mean(input)
|
||||
@@ -1175,6 +1108,7 @@ func TestSSEMeanTypeAlias(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMinInt8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int8
|
||||
@@ -1206,6 +1140,7 @@ func TestMinInt8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMinInt16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int16
|
||||
@@ -1237,6 +1172,7 @@ func TestMinInt16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMinInt32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int32
|
||||
@@ -1267,39 +1203,8 @@ func TestMinInt32x4(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinInt64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int64
|
||||
}{
|
||||
{"empty", []int64{}},
|
||||
{"single", []int64{42}},
|
||||
{"small", []int64{1, 2, 3, 4, 5}},
|
||||
{"exactly 2", []int64{1, 2}},
|
||||
{"large", make([]int64, 1000)},
|
||||
{"negative", []int64{-1, -2, -3, 4, 5}},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Int64()
|
||||
}
|
||||
}
|
||||
|
||||
got := MinInt64x2(tc.input)
|
||||
want := lo.Min(tc.input)
|
||||
|
||||
if got != want {
|
||||
t.Errorf("MinInt64x2() = %v, want %v", got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinUint8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint8
|
||||
@@ -1331,6 +1236,7 @@ func TestMinUint8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMinUint16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint16
|
||||
@@ -1362,6 +1268,7 @@ func TestMinUint16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMinUint32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint32
|
||||
@@ -1391,38 +1298,8 @@ func TestMinUint32x4(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinUint64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint64
|
||||
}{
|
||||
{"empty", []uint64{}},
|
||||
{"single", []uint64{42}},
|
||||
{"small", []uint64{1, 2, 3, 4, 5}},
|
||||
{"exactly 2", []uint64{1, 2}},
|
||||
{"large", make([]uint64, 1000)},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Uint64()
|
||||
}
|
||||
}
|
||||
|
||||
got := MinUint64x2(tc.input)
|
||||
want := lo.Min(tc.input)
|
||||
|
||||
if got != want {
|
||||
t.Errorf("MinUint64x2() = %v, want %v", got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinFloat32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float32
|
||||
@@ -1456,6 +1333,7 @@ func TestMinFloat32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMinFloat64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float64
|
||||
@@ -1488,7 +1366,8 @@ func TestMinFloat64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test type aliases work correctly
|
||||
func TestSSEMinTypeAlias(t *testing.T) {
|
||||
func TestAVXMinTypeAlias(t *testing.T) {
|
||||
requireAVX(t)
|
||||
input := []myInt8{5, 2, 8, 1, 9}
|
||||
got := MinInt8x16(input)
|
||||
want := myInt8(1)
|
||||
@@ -1499,6 +1378,7 @@ func TestSSEMinTypeAlias(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMaxInt8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int8
|
||||
@@ -1530,6 +1410,7 @@ func TestMaxInt8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMaxInt16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int16
|
||||
@@ -1561,6 +1442,7 @@ func TestMaxInt16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMaxInt32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int32
|
||||
@@ -1591,39 +1473,8 @@ func TestMaxInt32x4(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxInt64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []int64
|
||||
}{
|
||||
{"empty", []int64{}},
|
||||
{"single", []int64{42}},
|
||||
{"small", []int64{1, 2, 3, 4, 5}},
|
||||
{"exactly 2", []int64{1, 2}},
|
||||
{"large", make([]int64, 1000)},
|
||||
{"negative", []int64{-1, -2, -3, 4, 5}},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Int64()
|
||||
}
|
||||
}
|
||||
|
||||
got := MaxInt64x2(tc.input)
|
||||
want := lo.Max(tc.input)
|
||||
|
||||
if got != want {
|
||||
t.Errorf("MaxInt64x2() = %v, want %v", got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxUint8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint8
|
||||
@@ -1655,6 +1506,7 @@ func TestMaxUint8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMaxUint16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint16
|
||||
@@ -1686,6 +1538,7 @@ func TestMaxUint16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMaxUint32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint32
|
||||
@@ -1715,38 +1568,8 @@ func TestMaxUint32x4(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxUint64x2(t *testing.T) {
|
||||
requireAVX512(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []uint64
|
||||
}{
|
||||
{"empty", []uint64{}},
|
||||
{"single", []uint64{42}},
|
||||
{"small", []uint64{1, 2, 3, 4, 5}},
|
||||
{"exactly 2", []uint64{1, 2}},
|
||||
{"large", make([]uint64, 1000)},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if len(tc.input) > 0 && tc.input[0] == 0 && len(tc.input) > 6 {
|
||||
for i := range tc.input {
|
||||
tc.input[i] = rand.Uint64()
|
||||
}
|
||||
}
|
||||
|
||||
got := MaxUint64x2(tc.input)
|
||||
want := lo.Max(tc.input)
|
||||
|
||||
if got != want {
|
||||
t.Errorf("MaxUint64x2() = %v, want %v", got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxFloat32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float32
|
||||
@@ -1780,6 +1603,7 @@ func TestMaxFloat32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMaxFloat64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []float64
|
||||
@@ -1812,7 +1636,8 @@ func TestMaxFloat64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test type aliases work correctly
|
||||
func TestSSEMaxTypeAlias(t *testing.T) {
|
||||
func TestAVXMaxTypeAlias(t *testing.T) {
|
||||
requireAVX(t)
|
||||
input := []myInt8{5, 2, 8, 1, 9}
|
||||
got := MaxInt8x16(input)
|
||||
want := myInt8(9)
|
||||
@@ -1831,6 +1656,7 @@ type item struct {
|
||||
}
|
||||
|
||||
func TestSumByInt8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []item
|
||||
@@ -1863,6 +1689,7 @@ func TestSumByInt8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumByInt16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemInt16 struct {
|
||||
Value int16
|
||||
}
|
||||
@@ -1898,6 +1725,7 @@ func TestSumByInt16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumByInt32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemInt32 struct {
|
||||
Value int32
|
||||
}
|
||||
@@ -1933,6 +1761,7 @@ func TestSumByInt32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumByInt64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemInt64 struct {
|
||||
Value int64
|
||||
}
|
||||
@@ -1968,6 +1797,7 @@ func TestSumByInt64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumByUint8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemUint8 struct {
|
||||
Value uint8
|
||||
}
|
||||
@@ -2003,6 +1833,7 @@ func TestSumByUint8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumByUint16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemUint16 struct {
|
||||
Value uint16
|
||||
}
|
||||
@@ -2038,6 +1869,7 @@ func TestSumByUint16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumByUint32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemUint32 struct {
|
||||
Value uint32
|
||||
}
|
||||
@@ -2072,6 +1904,7 @@ func TestSumByUint32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumByUint64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemUint64 struct {
|
||||
Value uint64
|
||||
}
|
||||
@@ -2106,6 +1939,7 @@ func TestSumByUint64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumByFloat32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemFloat32 struct {
|
||||
Value float32
|
||||
}
|
||||
@@ -2143,6 +1977,7 @@ func TestSumByFloat32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSumByFloat64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemFloat64 struct {
|
||||
Value float64
|
||||
}
|
||||
@@ -2179,7 +2014,8 @@ func TestSumByFloat64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test type alias works correctly for SumBy
|
||||
func TestSSESumByTypeAlias(t *testing.T) {
|
||||
func TestAVXSumByTypeAlias(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type myItem struct {
|
||||
Value myInt8
|
||||
}
|
||||
@@ -2196,6 +2032,7 @@ func TestSSESumByTypeAlias(t *testing.T) {
|
||||
// MeanBy tests
|
||||
|
||||
func TestMeanByInt8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
testCases := []struct {
|
||||
name string
|
||||
input []item
|
||||
@@ -2227,6 +2064,7 @@ func TestMeanByInt8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanByInt16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemInt16 struct {
|
||||
Value int16
|
||||
}
|
||||
@@ -2262,6 +2100,7 @@ func TestMeanByInt16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanByInt32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemInt32 struct {
|
||||
Value int32
|
||||
}
|
||||
@@ -2297,6 +2136,7 @@ func TestMeanByInt32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanByInt64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemInt64 struct {
|
||||
Value int64
|
||||
}
|
||||
@@ -2332,6 +2172,7 @@ func TestMeanByInt64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanByUint8x16(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemUint8 struct {
|
||||
Value uint8
|
||||
}
|
||||
@@ -2367,6 +2208,7 @@ func TestMeanByUint8x16(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanByUint16x8(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemUint16 struct {
|
||||
Value uint16
|
||||
}
|
||||
@@ -2402,6 +2244,7 @@ func TestMeanByUint16x8(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanByUint32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemUint32 struct {
|
||||
Value uint32
|
||||
}
|
||||
@@ -2436,6 +2279,7 @@ func TestMeanByUint32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanByUint64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemUint64 struct {
|
||||
Value uint64
|
||||
}
|
||||
@@ -2470,6 +2314,7 @@ func TestMeanByUint64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanByFloat32x4(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemFloat32 struct {
|
||||
Value float32
|
||||
}
|
||||
@@ -2507,6 +2352,7 @@ func TestMeanByFloat32x4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMeanByFloat64x2(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type itemFloat64 struct {
|
||||
Value float64
|
||||
}
|
||||
@@ -2543,7 +2389,8 @@ func TestMeanByFloat64x2(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test type alias works correctly for MeanBy
|
||||
func TestSSEMeanByTypeAlias(t *testing.T) {
|
||||
func TestAVXMeanByTypeAlias(t *testing.T) {
|
||||
requireAVX(t)
|
||||
type myItem struct {
|
||||
Value myInt8
|
||||
}
|
||||
+33
-17
@@ -13,15 +13,15 @@ import (
|
||||
|
||||
// Benchmark suite for SIMD math operations compared to core lo package fallbacks.
|
||||
// These benchmarks measure the performance of Sum, Mean, Min, and Max operations
|
||||
// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes.
|
||||
// across different SIMD implementations (AVX, AVX2, AVX512) and data sizes.
|
||||
|
||||
// Benchmark sizes to demonstrate performance characteristics at different scales
|
||||
var benchmarkSizes = []struct {
|
||||
name string
|
||||
size int
|
||||
}{
|
||||
{"small", 8}, // Smaller than SSE width (16 lanes for int8)
|
||||
{"medium", 128}, // Between SSE (16) and AVX2 (32) width for int8
|
||||
{"small", 8}, // Smaller than AVX width (16 lanes for int8)
|
||||
{"medium", 128}, // Between AVX (16) and AVX2 (32) width for int8
|
||||
{"large", 1024}, // Well above SIMD register widths
|
||||
{"xlarge", 8192}, // Large dataset for real-world performance
|
||||
}
|
||||
@@ -128,7 +128,8 @@ func BenchmarkSumInt8(b *testing.B) {
|
||||
_ = lo.Sum(data)
|
||||
}
|
||||
})
|
||||
b.Run("SSE-x16", func(b *testing.B) {
|
||||
b.Run("AVX-x16", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = SumInt8x16(data)
|
||||
@@ -162,7 +163,8 @@ func BenchmarkSumInt16(b *testing.B) {
|
||||
_ = lo.Sum(data)
|
||||
}
|
||||
})
|
||||
b.Run("SSE-x8", func(b *testing.B) {
|
||||
b.Run("AVX-x8", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = SumInt16x8(data)
|
||||
@@ -196,7 +198,8 @@ func BenchmarkSumInt32(b *testing.B) {
|
||||
_ = lo.Sum(data)
|
||||
}
|
||||
})
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX-x4", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = SumInt32x4(data)
|
||||
@@ -230,7 +233,8 @@ func BenchmarkSumInt64(b *testing.B) {
|
||||
_ = lo.Sum(data)
|
||||
}
|
||||
})
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX-x2", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = SumInt64x2(data)
|
||||
@@ -264,7 +268,8 @@ func BenchmarkSumFloat32(b *testing.B) {
|
||||
_ = lo.Sum(data)
|
||||
}
|
||||
})
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX-x4", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = SumFloat32x4(data)
|
||||
@@ -298,7 +303,8 @@ func BenchmarkSumFloat64(b *testing.B) {
|
||||
_ = lo.Sum(data)
|
||||
}
|
||||
})
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX-x2", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = SumFloat64x2(data)
|
||||
@@ -336,7 +342,8 @@ func BenchmarkMeanInt32(b *testing.B) {
|
||||
_ = lo.Mean(data)
|
||||
}
|
||||
})
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX-x4", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = MeanInt32x4(data)
|
||||
@@ -370,7 +377,8 @@ func BenchmarkMeanFloat64(b *testing.B) {
|
||||
_ = lo.Mean(data)
|
||||
}
|
||||
})
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX-x2", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = MeanFloat64x2(data)
|
||||
@@ -402,7 +410,8 @@ func BenchmarkMinInt32(b *testing.B) {
|
||||
for _, bs := range benchmarkSizes {
|
||||
b.Run(bs.name, func(b *testing.B) {
|
||||
data := generateInt32(bs.size)
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX-x4", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = MinInt32x4(data)
|
||||
@@ -430,7 +439,8 @@ func BenchmarkMinFloat64(b *testing.B) {
|
||||
for _, bs := range benchmarkSizes {
|
||||
b.Run(bs.name, func(b *testing.B) {
|
||||
data := generateFloat64(bs.size)
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX-x2", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = MinFloat64x2(data)
|
||||
@@ -462,7 +472,8 @@ func BenchmarkMaxInt32(b *testing.B) {
|
||||
for _, bs := range benchmarkSizes {
|
||||
b.Run(bs.name, func(b *testing.B) {
|
||||
data := generateInt32(bs.size)
|
||||
b.Run("SSE-x4", func(b *testing.B) {
|
||||
b.Run("AVX-x4", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = MaxInt32x4(data)
|
||||
@@ -490,7 +501,8 @@ func BenchmarkMaxFloat64(b *testing.B) {
|
||||
for _, bs := range benchmarkSizes {
|
||||
b.Run(bs.name, func(b *testing.B) {
|
||||
data := generateFloat64(bs.size)
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX-x2", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = MaxFloat64x2(data)
|
||||
@@ -528,13 +540,16 @@ func BenchmarkSumInt8ByWidth(b *testing.B) {
|
||||
fn func() int8
|
||||
}{
|
||||
{"Fallback-lo", func() int8 { return lo.Sum(data) }},
|
||||
{"SSE-x16", func() int8 { return SumInt8x16(data) }},
|
||||
{"AVX-x16", func() int8 { return SumInt8x16(data) }},
|
||||
{"AVX2-x32", func() int8 { return SumInt8x32(data) }},
|
||||
{"AVX512-x64", func() int8 { return SumInt8x64(data) }},
|
||||
}
|
||||
|
||||
for _, bm := range benchmarks {
|
||||
b.Run(bm.name, func(b *testing.B) {
|
||||
if bm.name == "AVX-x16" {
|
||||
requireAVX(b)
|
||||
}
|
||||
if bm.name == "AVX2-x32" {
|
||||
requireAVX2(b)
|
||||
}
|
||||
@@ -578,7 +593,8 @@ func BenchmarkSumInt64SteadyState(b *testing.B) {
|
||||
_ = lo.Sum(data)
|
||||
}
|
||||
})
|
||||
b.Run("SSE-x2", func(b *testing.B) {
|
||||
b.Run("AVX-x2", func(b *testing.B) {
|
||||
requireAVX(b)
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = SumInt64x2(data)
|
||||
|
||||
+12
-10
@@ -24,13 +24,15 @@ func init() {
|
||||
}
|
||||
|
||||
// Type aliases for testing
|
||||
type myInt8 int8
|
||||
type myInt16 int16
|
||||
type myInt32 int32
|
||||
type myInt64 int64
|
||||
type myUint8 uint8
|
||||
type myUint16 uint16
|
||||
type myUint32 uint32
|
||||
type myUint64 uint64
|
||||
type myFloat32 float32
|
||||
type myFloat64 float64
|
||||
type (
|
||||
myInt8 int8
|
||||
myInt16 int16
|
||||
myInt32 int32
|
||||
myInt64 int64
|
||||
myUint8 uint8
|
||||
myUint16 uint16
|
||||
myUint32 uint32
|
||||
myUint64 uint64
|
||||
myFloat32 float32
|
||||
myFloat64 float64
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user