mirror of
https://github.com/samber/lo.git
synced 2026-04-22 15:37:14 +08:00
Experiments: adding SIMD helpers (#801)
* feat(exp,simd): adding SumAxB helpers * feat(exp,simd): adding MeanAxB and ClampAxB helpers * feat(exp,simd): adding MinAxB and MaxAxB helpers * refactor(exp,simd): group perf helper category + architecture * feat(exp,simd): adding ContainsAxB helpers * perf(exp,simd): cast to unsafe slice once * feat(exp,simd): call the right SIMD helper based on local architecture * chore: internal dependency linking * Update exp/simd/math.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * style: fix linter * style: fix linter * chore: enable simd in makefile * chore(ci): add simd package to test runs * chore(ci): add simd package to test runs only for go 1.26 * fix(simd): fix overflow * fix(simd): fix overflow and apply the same behavior than lo.Mean * doc(exp,simd): adding initial doc * refactor(simd): move intersect_avx2 and intersect_sse code into intersect_avx512 * fix(simd): call SSE fallback instead of lo.Sum for default helpers * feat(simd): cache simd features on package init to avoid repeated checks * perf(exp,simd): precompute length + improve code quality * perf(exp,simd): faster iteration for min/max value * test(exp,simd): adding benchmarks * test(exp,simd): adding benchmarks results * test(exp,simd): adding benchmarks results * doc(exp,simd): adding warning for overflows in SIMD operations * feat(exp,simd): adding more dispatch helpers * feat(exp,simd): adding SumBy variants * feat(exp,simd): adding MeanBy variants * fix(exp,simd): faster clamp * 💄 * doc(exp,simd): adding SumBy + MeanBy * fix(exp,simd): faster SIMD operations * chore(ci): enable the benchmarks temporary * chore(ci): display cpu architecture before running tests * chore(ci): github actions are hidding some useful stuffs * chore(ci): no SIMD VM available at Github during the weekend ??? 
* test(exp,simd): larger epsilon * oops * perf(exp,simd): faster iterations * doc(exp,simd): report last version of benchmarks * 💄 --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,34 @@
|
|||||||
|
name: Tests (SIMD)
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
paths:
|
||||||
|
- 'exp/simd/**'
|
||||||
|
# pull_request:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test-simd:
|
||||||
|
# GitHub hosted runners run on several architectures.
|
||||||
|
# Using Ubicloud ensures we run on AVX512.
|
||||||
|
runs-on: ubicloud-standard-2
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
go:
|
||||||
|
- "stable"
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Set up Go
|
||||||
|
uses: actions/setup-go@v6
|
||||||
|
with:
|
||||||
|
go-version: ${{ matrix.go }}
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
run: make test
|
||||||
|
|
||||||
|
- name: Benchmark
|
||||||
|
run: |
|
||||||
|
cd exp/simd/
|
||||||
|
GOEXPERIMENT=simd go test -run=^Benchmark -benchmem -bench=. ./...
|
||||||
@@ -1,9 +1,14 @@
|
|||||||
|
# Only build/test/lint exp/simd when Go version is >= 1.26 (requires goexperiment.simd)
|
||||||
|
GO_VERSION := $(shell go version 2>/dev/null | sed -n 's/.*go\([0-9]*\)\.\([0-9]*\).*/\1.\2/p')
|
||||||
|
GO_SIMD_SUPPORT := $(shell ver="$(GO_VERSION)"; [ -n "$$ver" ] && [ "$$(printf '%s\n1.26\n' "$$ver" | sort -V | tail -1)" = "$$ver" ] && echo yes)
|
||||||
|
|
||||||
build:
|
build:
|
||||||
go build -v ./...
|
go build -v ./...
|
||||||
|
@if [ -n "$(GO_SIMD_SUPPORT)" ]; then cd ./exp/simd && GOEXPERIMENT=simd go build -v ./; fi
|
||||||
|
|
||||||
test:
|
test:
|
||||||
go test -race ./...
|
go test -race ./...
|
||||||
|
@if [ -n "$(GO_SIMD_SUPPORT)" ]; then cd ./exp/simd && GOEXPERIMENT=simd go test -race ./; fi
|
||||||
watch-test:
|
watch-test:
|
||||||
reflex -t 50ms -s -- sh -c 'gotest -race ./...'
|
reflex -t 50ms -s -- sh -c 'gotest -race ./...'
|
||||||
|
|
||||||
@@ -32,9 +37,11 @@ tools:
|
|||||||
|
|
||||||
lint:
|
lint:
|
||||||
golangci-lint run --timeout 60s --max-same-issues 50 ./...
|
golangci-lint run --timeout 60s --max-same-issues 50 ./...
|
||||||
|
@if [ -n "$(GO_SIMD_SUPPORT)" ]; then cd ./exp/simd && golangci-lint run --timeout 60s --max-same-issues 50 ./; fi
|
||||||
# mdsf verify --debug --log-level warn docs/
|
# mdsf verify --debug --log-level warn docs/
|
||||||
lint-fix:
|
lint-fix:
|
||||||
golangci-lint run --timeout 60s --max-same-issues 50 --fix ./...
|
golangci-lint run --timeout 60s --max-same-issues 50 --fix ./...
|
||||||
|
@if [ -n "$(GO_SIMD_SUPPORT)" ]; then cd ./exp/simd && golangci-lint run --timeout 60s --max-same-issues 50 --fix ./; fi
|
||||||
# mdsf format --debug --log-level warn docs/
|
# mdsf format --debug --log-level warn docs/
|
||||||
|
|
||||||
audit:
|
audit:
|
||||||
|
|||||||
@@ -0,0 +1,82 @@
|
|||||||
|
---
|
||||||
|
name: Clamp
|
||||||
|
slug: clamp
|
||||||
|
sourceRef: exp/simd/math_sse.go#L424
|
||||||
|
category: exp
|
||||||
|
subCategory: simd
|
||||||
|
similarHelpers:
|
||||||
|
- exp#simd#clamp
|
||||||
|
position: 40
|
||||||
|
signatures:
|
||||||
|
- "func ClampInt8x16[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt8x32[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt8x64[T ~int8, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt16x8[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt16x16[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt16x32[T ~int16, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt32x4[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt32x8[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt32x16[T ~int32, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt64x2[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt64x4[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampInt64x8[T ~int64, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint8x16[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint8x32[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint8x64[T ~uint8, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint16x8[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint16x16[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint16x32[T ~uint16, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint32x4[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint32x8[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint32x16[T ~uint32, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint64x2[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint64x4[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampUint64x8[T ~uint64, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampFloat32x4[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampFloat32x8[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampFloat32x16[T ~float32, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampFloat64x2[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampFloat64x4[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
- "func ClampFloat64x8[T ~float64, Slice ~[]T](collection Slice, min, max T) Slice"
|
||||||
|
---
|
||||||
|
|
||||||
|
Clamps each element in a collection between min and max values using SIMD instructions. The suffix (x2, x4, x8, x16, x32, x64) indicates the number of lanes processed simultaneously.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **Go 1.26+** with `GOEXPERIMENT=simd`
|
||||||
|
- **amd64** architecture only
|
||||||
|
|
||||||
|
### CPU compatibility
|
||||||
|
|
||||||
|
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||||
|
| ------------ | ----- | -------------- | ------------------------------ |
|
||||||
|
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||||
|
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||||
|
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||||
|
|
||||||
|
> **Note**: Choose the variant matching your CPU's capabilities. Higher lane counts provide better performance but require newer CPU support.
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX2 variant (32 lanes at once) - Intel Haswell+ / AMD Excavator+
|
||||||
|
result := simd.ClampInt8x32([]int8{1, 5, 10, 15, 20}, 5, 15)
|
||||||
|
// []int8{5, 5, 10, 15, 15}
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX-512 variant (16 lanes at once) - Intel Skylake-X+
|
||||||
|
result := simd.ClampFloat32x16([]float32{0.5, 1.5, 2.5, 3.5}, 1.0, 3.0)
|
||||||
|
// []float32{1.0, 1.5, 2.5, 3.0}
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using SSE variant (8 lanes at once) - works on all amd64
|
||||||
|
result := simd.ClampInt16x8([]int16{100, 150, 200, 250}, 120, 220)
|
||||||
|
// []int16{120, 150, 200, 220}
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Empty collection returns empty collection
|
||||||
|
result := simd.ClampUint32x4([]uint32{}, 10, 100)
|
||||||
|
// []uint32{}
|
||||||
|
```
|
||||||
@@ -0,0 +1,88 @@
|
|||||||
|
---
|
||||||
|
name: Contains
|
||||||
|
slug: contains
|
||||||
|
sourceRef: exp/simd/intersect_sse.go#L11
|
||||||
|
category: exp
|
||||||
|
subCategory: simd
|
||||||
|
similarHelpers:
|
||||||
|
- exp#simd#contains
|
||||||
|
position: 0
|
||||||
|
signatures:
|
||||||
|
- "func ContainsInt8x16[T ~int8](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt8x32[T ~int8](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt8x64[T ~int8](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt16x8[T ~int16](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt16x16[T ~int16](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt16x32[T ~int16](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt32x4[T ~int32](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt32x8[T ~int32](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt32x16[T ~int32](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt64x2[T ~int64](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt64x4[T ~int64](collection []T, target T) bool"
|
||||||
|
- "func ContainsInt64x8[T ~int64](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint8x16[T ~uint8](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint8x32[T ~uint8](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint8x64[T ~uint8](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint16x8[T ~uint16](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint16x16[T ~uint16](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint16x32[T ~uint16](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint32x4[T ~uint32](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint32x8[T ~uint32](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint32x16[T ~uint32](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint64x2[T ~uint64](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint64x4[T ~uint64](collection []T, target T) bool"
|
||||||
|
- "func ContainsUint64x8[T ~uint64](collection []T, target T) bool"
|
||||||
|
- "func ContainsFloat32x4[T ~float32](collection []T, target T) bool"
|
||||||
|
- "func ContainsFloat32x8[T ~float32](collection []T, target T) bool"
|
||||||
|
- "func ContainsFloat32x16[T ~float32](collection []T, target T) bool"
|
||||||
|
- "func ContainsFloat64x2[T ~float64](collection []T, target T) bool"
|
||||||
|
- "func ContainsFloat64x4[T ~float64](collection []T, target T) bool"
|
||||||
|
- "func ContainsFloat64x8[T ~float64](collection []T, target T) bool"
|
||||||
|
---
|
||||||
|
|
||||||
|
Checks if a target value is present in a collection using SIMD instructions. The suffix (x2, x4, x8, x16, x32, x64) indicates the number of lanes processed simultaneously.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **Go 1.26+** with `GOEXPERIMENT=simd`
|
||||||
|
- **amd64** architecture only
|
||||||
|
|
||||||
|
### CPU compatibility
|
||||||
|
|
||||||
|
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||||
|
| ------------ | ----- | -------------- | ------------------------------ |
|
||||||
|
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||||
|
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||||
|
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||||
|
|
||||||
|
> **Note**: Choose the variant matching your CPU's capabilities. Higher lane counts provide better performance but require newer CPU support.
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX2 variant (32 lanes at once) - Intel Haswell+ / AMD Excavator+
|
||||||
|
found := simd.ContainsInt8x32([]int8{1, 2, 3, 4, 5}, 3)
|
||||||
|
// true
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using SSE variant (2 lanes at once) - works on all amd64
|
||||||
|
found := simd.ContainsInt64x2([]int64{1000000, 2000000, 3000000}, 2000000)
|
||||||
|
// true
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX-512 variant (64 lanes at once) - Intel Skylake-X+
|
||||||
|
found := simd.ContainsUint8x64([]uint8{10, 20, 30, 40, 50}, 30)
|
||||||
|
// true
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Float32 with AVX2 (8 lanes at once)
|
||||||
|
found := simd.ContainsFloat32x8([]float32{1.1, 2.2, 3.3, 4.4}, 3.3)
|
||||||
|
// true
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Empty collection returns false
|
||||||
|
found := simd.ContainsInt16x16([]int16{}, 5)
|
||||||
|
// false
|
||||||
|
```
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
---
|
||||||
|
name: Max
|
||||||
|
slug: max
|
||||||
|
sourceRef: exp/simd/math_sse.go#L1328
|
||||||
|
category: exp
|
||||||
|
subCategory: simd
|
||||||
|
similarHelpers:
|
||||||
|
- exp#simd#max
|
||||||
|
position: 30
|
||||||
|
signatures:
|
||||||
|
- "func MaxInt8x16[T ~int8](collection []T) T"
|
||||||
|
- "func MaxInt8x32[T ~int8](collection []T) T"
|
||||||
|
- "func MaxInt8x64[T ~int8](collection []T) T"
|
||||||
|
- "func MaxInt16x8[T ~int16](collection []T) T"
|
||||||
|
- "func MaxInt16x16[T ~int16](collection []T) T"
|
||||||
|
- "func MaxInt16x32[T ~int16](collection []T) T"
|
||||||
|
- "func MaxInt32x4[T ~int32](collection []T) T"
|
||||||
|
- "func MaxInt32x8[T ~int32](collection []T) T"
|
||||||
|
- "func MaxInt32x16[T ~int32](collection []T) T"
|
||||||
|
- "func MaxInt64x2[T ~int64](collection []T) T"
|
||||||
|
- "func MaxInt64x4[T ~int64](collection []T) T"
|
||||||
|
- "func MaxInt64x8[T ~int64](collection []T) T"
|
||||||
|
- "func MaxUint8x16[T ~uint8](collection []T) T"
|
||||||
|
- "func MaxUint8x32[T ~uint8](collection []T) T"
|
||||||
|
- "func MaxUint8x64[T ~uint8](collection []T) T"
|
||||||
|
- "func MaxUint16x8[T ~uint16](collection []T) T"
|
||||||
|
- "func MaxUint16x16[T ~uint16](collection []T) T"
|
||||||
|
- "func MaxUint16x32[T ~uint16](collection []T) T"
|
||||||
|
- "func MaxUint32x4[T ~uint32](collection []T) T"
|
||||||
|
- "func MaxUint32x8[T ~uint32](collection []T) T"
|
||||||
|
- "func MaxUint32x16[T ~uint32](collection []T) T"
|
||||||
|
- "func MaxUint64x2[T ~uint64](collection []T) T"
|
||||||
|
- "func MaxUint64x4[T ~uint64](collection []T) T"
|
||||||
|
- "func MaxUint64x8[T ~uint64](collection []T) T"
|
||||||
|
- "func MaxFloat32x4[T ~float32](collection []T) T"
|
||||||
|
- "func MaxFloat32x8[T ~float32](collection []T) T"
|
||||||
|
- "func MaxFloat32x16[T ~float32](collection []T) T"
|
||||||
|
- "func MaxFloat64x2[T ~float64](collection []T) T"
|
||||||
|
- "func MaxFloat64x4[T ~float64](collection []T) T"
|
||||||
|
- "func MaxFloat64x8[T ~float64](collection []T) T"
|
||||||
|
---
|
||||||
|
|
||||||
|
Finds the maximum value in a collection using SIMD instructions. The suffix (x2, x4, x8, x16, x32, x64) indicates the number of lanes processed simultaneously.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **Go 1.26+** with `GOEXPERIMENT=simd`
|
||||||
|
- **amd64** architecture only
|
||||||
|
|
||||||
|
### CPU compatibility
|
||||||
|
|
||||||
|
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||||
|
| ------------ | ----- | -------------- | ------------------------------ |
|
||||||
|
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||||
|
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||||
|
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||||
|
|
||||||
|
> **Note**: Choose the variant matching your CPU's capabilities. Higher lane counts provide better performance but require newer CPU support.
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX2 variant (32 lanes at once) - Intel Haswell+ / AMD Excavator+
|
||||||
|
max := simd.MaxInt8x32([]int8{5, 2, 8, 1, 9})
|
||||||
|
// 9
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX-512 variant (16 lanes at once) - Intel Skylake-X+
|
||||||
|
max := simd.MaxFloat32x16([]float32{3.5, 1.2, 4.8, 2.1})
|
||||||
|
// 4.8
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using SSE variant (4 lanes at once) - works on all amd64
|
||||||
|
max := simd.MaxInt32x4([]int32{100, 50, 200, 75})
|
||||||
|
// 200
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Empty collection returns 0
|
||||||
|
max := simd.MaxUint16x8([]uint16{})
|
||||||
|
// 0
|
||||||
|
```
|
||||||
@@ -0,0 +1,83 @@
|
|||||||
|
---
|
||||||
|
name: Mean
|
||||||
|
slug: mean
|
||||||
|
sourceRef: exp/simd/math_sse.go#L333
|
||||||
|
category: exp
|
||||||
|
subCategory: simd
|
||||||
|
similarHelpers:
|
||||||
|
- exp#simd#mean
|
||||||
|
- exp#simd#meanby
|
||||||
|
position: 10
|
||||||
|
signatures:
|
||||||
|
- "func MeanInt8x16[T ~int8](collection []T) T"
|
||||||
|
- "func MeanInt8x32[T ~int8](collection []T) T"
|
||||||
|
- "func MeanInt8x64[T ~int8](collection []T) T"
|
||||||
|
- "func MeanInt16x8[T ~int16](collection []T) T"
|
||||||
|
- "func MeanInt16x16[T ~int16](collection []T) T"
|
||||||
|
- "func MeanInt16x32[T ~int16](collection []T) T"
|
||||||
|
- "func MeanInt32x4[T ~int32](collection []T) T"
|
||||||
|
- "func MeanInt32x8[T ~int32](collection []T) T"
|
||||||
|
- "func MeanInt32x16[T ~int32](collection []T) T"
|
||||||
|
- "func MeanInt64x2[T ~int64](collection []T) T"
|
||||||
|
- "func MeanInt64x4[T ~int64](collection []T) T"
|
||||||
|
- "func MeanInt64x8[T ~int64](collection []T) T"
|
||||||
|
- "func MeanUint8x16[T ~uint8](collection []T) T"
|
||||||
|
- "func MeanUint8x32[T ~uint8](collection []T) T"
|
||||||
|
- "func MeanUint8x64[T ~uint8](collection []T) T"
|
||||||
|
- "func MeanUint16x8[T ~uint16](collection []T) T"
|
||||||
|
- "func MeanUint16x16[T ~uint16](collection []T) T"
|
||||||
|
- "func MeanUint16x32[T ~uint16](collection []T) T"
|
||||||
|
- "func MeanUint32x4[T ~uint32](collection []T) T"
|
||||||
|
- "func MeanUint32x8[T ~uint32](collection []T) T"
|
||||||
|
- "func MeanUint32x16[T ~uint32](collection []T) T"
|
||||||
|
- "func MeanUint64x2[T ~uint64](collection []T) T"
|
||||||
|
- "func MeanUint64x4[T ~uint64](collection []T) T"
|
||||||
|
- "func MeanUint64x8[T ~uint64](collection []T) T"
|
||||||
|
- "func MeanFloat32x4[T ~float32](collection []T) T"
|
||||||
|
- "func MeanFloat32x8[T ~float32](collection []T) T"
|
||||||
|
- "func MeanFloat32x16[T ~float32](collection []T) T"
|
||||||
|
- "func MeanFloat64x2[T ~float64](collection []T) T"
|
||||||
|
- "func MeanFloat64x4[T ~float64](collection []T) T"
|
||||||
|
- "func MeanFloat64x8[T ~float64](collection []T) T"
|
||||||
|
---
|
||||||
|
|
||||||
|
Calculates the arithmetic mean of a collection using SIMD instructions. The suffix (x2, x4, x8, x16, x32, x64) indicates the number of lanes processed simultaneously.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **Go 1.26+** with `GOEXPERIMENT=simd`
|
||||||
|
- **amd64** architecture only
|
||||||
|
|
||||||
|
### CPU compatibility
|
||||||
|
|
||||||
|
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||||
|
| ------------ | ----- | -------------- | ------------------------------ |
|
||||||
|
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||||
|
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||||
|
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||||
|
|
||||||
|
> **Note**: Choose the variant matching your CPU's capabilities. Higher lane counts provide better performance but require newer CPU support.
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX2 variant (32 lanes at once) - Intel Haswell+ / AMD Excavator+
|
||||||
|
mean := simd.MeanInt8x32([]int8{1, 2, 3, 4, 5})
|
||||||
|
// 3
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX-512 variant (16 lanes at once) - Intel Skylake-X+
|
||||||
|
mean := simd.MeanFloat32x16([]float32{1.0, 2.0, 3.0, 4.0})
|
||||||
|
// 2.5
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using SSE variant (8 lanes at once) - works on all amd64
|
||||||
|
mean := simd.MeanInt16x8([]int16{10, 20, 30, 40})
|
||||||
|
// 25
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Empty collection returns 0
|
||||||
|
mean := simd.MeanUint32x4([]uint32{})
|
||||||
|
// 0
|
||||||
|
```
|
||||||
@@ -0,0 +1,138 @@
|
|||||||
|
---
|
||||||
|
name: MeanBy
|
||||||
|
slug: meanby
|
||||||
|
sourceRef: exp/simd/math.go#L1006
|
||||||
|
category: exp
|
||||||
|
subCategory: simd
|
||||||
|
similarHelpers:
|
||||||
|
- exp#simd#mean
|
||||||
|
- exp#simd#sumby
|
||||||
|
position: 30
|
||||||
|
signatures:
|
||||||
|
- "func MeanByInt8[T any, R ~int8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt16[T any, R ~int16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt32[T any, R ~int32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt64[T any, R ~int64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint8[T any, R ~uint8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint16[T any, R ~uint16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint32[T any, R ~uint32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint64[T any, R ~uint64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByFloat32[T any, R ~float32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByFloat64[T any, R ~float64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt8x32[T any, R ~int8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt8x64[T any, R ~int8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt16x16[T any, R ~int16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt16x32[T any, R ~int16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt32x8[T any, R ~int32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt32x16[T any, R ~int32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt64x4[T any, R ~int64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByInt64x8[T any, R ~int64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint8x32[T any, R ~uint8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint8x64[T any, R ~uint8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint16x16[T any, R ~uint16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint16x32[T any, R ~uint16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint32x8[T any, R ~uint32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint32x16[T any, R ~uint32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint64x4[T any, R ~uint64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByUint64x8[T any, R ~uint64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByFloat32x8[T any, R ~float32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByFloat32x16[T any, R ~float32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByFloat64x4[T any, R ~float64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func MeanByFloat64x8[T any, R ~float64](collection []T, iteratee func(item T) R) R"
|
||||||
|
---
|
||||||
|
|
||||||
|
MeanBy transforms a collection using an iteratee function and calculates the arithmetic mean of the result using SIMD instructions. The automatic dispatch functions (e.g., `MeanByInt8`) will select the best SIMD variant based on CPU capabilities. The specific variants (e.g., `MeanByInt8x32`) use a fixed SIMD instruction set regardless of CPU capabilities.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **Go 1.26+** with `GOEXPERIMENT=simd`
|
||||||
|
- **amd64** architecture only
|
||||||
|
|
||||||
|
### CPU compatibility
|
||||||
|
|
||||||
|
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||||
|
| ------------ | ----- | -------------- | ------------------------------ |
|
||||||
|
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||||
|
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||||
|
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||||
|
|
||||||
|
> **Note**: The automatic dispatch functions (e.g., `MeanByInt8`) will use the best available SIMD variant for the current CPU. Use specific variants (e.g., `MeanByInt8x32`) only if you know your target CPU supports that instruction set.
|
||||||
|
|
||||||
|
```go
|
||||||
|
type Person struct {
|
||||||
|
Name string
|
||||||
|
Age int8
|
||||||
|
}
|
||||||
|
|
||||||
|
people := []Person{
|
||||||
|
{Name: "Alice", Age: 20},
|
||||||
|
{Name: "Bob", Age: 30},
|
||||||
|
{Name: "Charlie", Age: 40},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Automatic dispatch - uses best available SIMD
|
||||||
|
mean := simd.MeanByInt8(people, func(p Person) int8 {
|
||||||
|
return p.Age
|
||||||
|
})
|
||||||
|
// 30
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
type Product struct {
|
||||||
|
Name string
|
||||||
|
Price float32
|
||||||
|
}
|
||||||
|
|
||||||
|
products := []Product{
|
||||||
|
{Name: "Widget", Price: 10.50},
|
||||||
|
{Name: "Gadget", Price: 20.00},
|
||||||
|
{Name: "Tool", Price: 15.75},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mean price using specific AVX2 variant
|
||||||
|
mean := simd.MeanByFloat32x8(products, func(p Product) float32 {
|
||||||
|
return p.Price
|
||||||
|
})
|
||||||
|
// 15.4167
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
type Metric struct {
|
||||||
|
Value uint16
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics := []Metric{
|
||||||
|
{Value: 100},
|
||||||
|
{Value: 200},
|
||||||
|
{Value: 300},
|
||||||
|
{Value: 400},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Using SSE variant - works on all amd64
|
||||||
|
mean := simd.MeanByUint16x8(metrics, func(m Metric) uint16 {
|
||||||
|
return m.Value
|
||||||
|
})
|
||||||
|
// 250
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Empty collection returns 0
|
||||||
|
type Item struct {
|
||||||
|
Count int64
|
||||||
|
}
|
||||||
|
|
||||||
|
mean := simd.MeanByInt64([]Item{}, func(i Item) int64 {
|
||||||
|
return i.Count
|
||||||
|
})
|
||||||
|
// 0
|
||||||
|
```
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
---
|
||||||
|
name: Min
|
||||||
|
slug: min
|
||||||
|
sourceRef: exp/simd/math_sse.go#L834
|
||||||
|
category: exp
|
||||||
|
subCategory: simd
|
||||||
|
similarHelpers:
|
||||||
|
- exp#simd#min
|
||||||
|
position: 20
|
||||||
|
signatures:
|
||||||
|
- "func MinInt8x16[T ~int8](collection []T) T"
|
||||||
|
- "func MinInt8x32[T ~int8](collection []T) T"
|
||||||
|
- "func MinInt8x64[T ~int8](collection []T) T"
|
||||||
|
- "func MinInt16x8[T ~int16](collection []T) T"
|
||||||
|
- "func MinInt16x16[T ~int16](collection []T) T"
|
||||||
|
- "func MinInt16x32[T ~int16](collection []T) T"
|
||||||
|
- "func MinInt32x4[T ~int32](collection []T) T"
|
||||||
|
- "func MinInt32x8[T ~int32](collection []T) T"
|
||||||
|
- "func MinInt32x16[T ~int32](collection []T) T"
|
||||||
|
- "func MinInt64x2[T ~int64](collection []T) T"
|
||||||
|
- "func MinInt64x4[T ~int64](collection []T) T"
|
||||||
|
- "func MinInt64x8[T ~int64](collection []T) T"
|
||||||
|
- "func MinUint8x16[T ~uint8](collection []T) T"
|
||||||
|
- "func MinUint8x32[T ~uint8](collection []T) T"
|
||||||
|
- "func MinUint8x64[T ~uint8](collection []T) T"
|
||||||
|
- "func MinUint16x8[T ~uint16](collection []T) T"
|
||||||
|
- "func MinUint16x16[T ~uint16](collection []T) T"
|
||||||
|
- "func MinUint16x32[T ~uint16](collection []T) T"
|
||||||
|
- "func MinUint32x4[T ~uint32](collection []T) T"
|
||||||
|
- "func MinUint32x8[T ~uint32](collection []T) T"
|
||||||
|
- "func MinUint32x16[T ~uint32](collection []T) T"
|
||||||
|
- "func MinUint64x2[T ~uint64](collection []T) T"
|
||||||
|
- "func MinUint64x4[T ~uint64](collection []T) T"
|
||||||
|
- "func MinUint64x8[T ~uint64](collection []T) T"
|
||||||
|
- "func MinFloat32x4[T ~float32](collection []T) T"
|
||||||
|
- "func MinFloat32x8[T ~float32](collection []T) T"
|
||||||
|
- "func MinFloat32x16[T ~float32](collection []T) T"
|
||||||
|
- "func MinFloat64x2[T ~float64](collection []T) T"
|
||||||
|
- "func MinFloat64x4[T ~float64](collection []T) T"
|
||||||
|
- "func MinFloat64x8[T ~float64](collection []T) T"
|
||||||
|
---
|
||||||
|
|
||||||
|
Finds the minimum value in a collection using SIMD instructions. The suffix (x2, x4, x8, x16, x32, x64) indicates the number of lanes processed simultaneously.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **Go 1.26+** with `GOEXPERIMENT=simd`
|
||||||
|
- **amd64** architecture only
|
||||||
|
|
||||||
|
### CPU compatibility
|
||||||
|
|
||||||
|
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||||
|
| ------------ | ----- | -------------- | ------------------------------ |
|
||||||
|
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||||
|
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||||
|
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||||
|
|
||||||
|
> **Note**: Choose the variant matching your CPU's capabilities. Higher lane counts provide better performance but require newer CPU support.
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX2 variant (32 lanes at once) - Intel Haswell+ / AMD Excavator+
|
||||||
|
min := simd.MinInt8x32([]int8{5, 2, 8, 1, 9})
|
||||||
|
// 1
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX-512 variant (16 lanes at once) - Intel Skylake-X+
|
||||||
|
min := simd.MinFloat32x16([]float32{3.5, 1.2, 4.8, 2.1})
|
||||||
|
// 1.2
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using SSE variant (4 lanes at once) - works on all amd64
|
||||||
|
min := simd.MinInt32x4([]int32{100, 50, 200, 75})
|
||||||
|
// 50
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Empty collection returns 0
|
||||||
|
min := simd.MinUint16x8([]uint16{})
|
||||||
|
// 0
|
||||||
|
```
|
||||||
@@ -0,0 +1,83 @@
|
|||||||
|
---
|
||||||
|
name: Sum
|
||||||
|
slug: sum
|
||||||
|
sourceRef: exp/simd/math_sse.go#L13
|
||||||
|
category: exp
|
||||||
|
subCategory: simd
|
||||||
|
similarHelpers:
|
||||||
|
- exp#simd#mean
|
||||||
|
- exp#simd#sumby
|
||||||
|
position: 0
|
||||||
|
signatures:
|
||||||
|
- "func SumInt8x16[T ~int8](collection []T) T"
|
||||||
|
- "func SumInt8x32[T ~int8](collection []T) T"
|
||||||
|
- "func SumInt8x64[T ~int8](collection []T) T"
|
||||||
|
- "func SumInt16x8[T ~int16](collection []T) T"
|
||||||
|
- "func SumInt16x16[T ~int16](collection []T) T"
|
||||||
|
- "func SumInt16x32[T ~int16](collection []T) T"
|
||||||
|
- "func SumInt32x4[T ~int32](collection []T) T"
|
||||||
|
- "func SumInt32x8[T ~int32](collection []T) T"
|
||||||
|
- "func SumInt32x16[T ~int32](collection []T) T"
|
||||||
|
- "func SumInt64x2[T ~int64](collection []T) T"
|
||||||
|
- "func SumInt64x4[T ~int64](collection []T) T"
|
||||||
|
- "func SumInt64x8[T ~int64](collection []T) T"
|
||||||
|
- "func SumUint8x16[T ~uint8](collection []T) T"
|
||||||
|
- "func SumUint8x32[T ~uint8](collection []T) T"
|
||||||
|
- "func SumUint8x64[T ~uint8](collection []T) T"
|
||||||
|
- "func SumUint16x8[T ~uint16](collection []T) T"
|
||||||
|
- "func SumUint16x16[T ~uint16](collection []T) T"
|
||||||
|
- "func SumUint16x32[T ~uint16](collection []T) T"
|
||||||
|
- "func SumUint32x4[T ~uint32](collection []T) T"
|
||||||
|
- "func SumUint32x8[T ~uint32](collection []T) T"
|
||||||
|
- "func SumUint32x16[T ~uint32](collection []T) T"
|
||||||
|
- "func SumUint64x2[T ~uint64](collection []T) T"
|
||||||
|
- "func SumUint64x4[T ~uint64](collection []T) T"
|
||||||
|
- "func SumUint64x8[T ~uint64](collection []T) T"
|
||||||
|
- "func SumFloat32x4[T ~float32](collection []T) T"
|
||||||
|
- "func SumFloat32x8[T ~float32](collection []T) T"
|
||||||
|
- "func SumFloat32x16[T ~float32](collection []T) T"
|
||||||
|
- "func SumFloat64x2[T ~float64](collection []T) T"
|
||||||
|
- "func SumFloat64x4[T ~float64](collection []T) T"
|
||||||
|
- "func SumFloat64x8[T ~float64](collection []T) T"
|
||||||
|
---
|
||||||
|
|
||||||
|
Sums the values in a collection using SIMD instructions. The suffix (x2, x4, x8, x16, x32, x64) indicates the number of lanes processed simultaneously.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **Go 1.26+** with `GOEXPERIMENT=simd`
|
||||||
|
- **amd64** architecture only
|
||||||
|
|
||||||
|
### CPU compatibility
|
||||||
|
|
||||||
|
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||||
|
| ------------ | ----- | -------------- | ------------------------------ |
|
||||||
|
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||||
|
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||||
|
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||||
|
|
||||||
|
> **Note**: Choose the variant matching your CPU's capabilities. Higher lane counts provide better performance but require newer CPU support.
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX2 variant (32 lanes at once) - Intel Haswell+ / AMD Excavator+
|
||||||
|
sum := simd.SumInt8x32([]int8{1, 2, 3, 4, 5})
|
||||||
|
// 15
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using AVX-512 variant (16 lanes at once) - Intel Skylake-X+
|
||||||
|
sum := simd.SumFloat32x16([]float32{1.1, 2.2, 3.3, 4.4})
|
||||||
|
// 11
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Using SSE variant (4 lanes at once) - works on all amd64
|
||||||
|
sum := simd.SumInt32x4([]int32{1000000, 2000000, 3000000})
|
||||||
|
// 6000000
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Empty collection returns 0
|
||||||
|
sum := simd.SumUint16x16([]uint16{})
|
||||||
|
// 0
|
||||||
|
```
|
||||||
@@ -0,0 +1,139 @@
|
|||||||
|
---
|
||||||
|
name: SumBy
|
||||||
|
slug: sumby
|
||||||
|
sourceRef: exp/simd/math.go#L841
|
||||||
|
category: exp
|
||||||
|
subCategory: simd
|
||||||
|
similarHelpers:
|
||||||
|
- exp#simd#sum
|
||||||
|
- exp#simd#meanby
|
||||||
|
position: 20
|
||||||
|
signatures:
|
||||||
|
- "func SumByInt8[T any, R ~int8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt16[T any, R ~int16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt32[T any, R ~int32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt64[T any, R ~int64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint8[T any, R ~uint8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint16[T any, R ~uint16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint32[T any, R ~uint32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint64[T any, R ~uint64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByFloat32[T any, R ~float32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByFloat64[T any, R ~float64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt8x16[T any, R ~int8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt8x32[T any, R ~int8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt8x64[T any, R ~int8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt16x8[T any, R ~int16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt16x16[T any, R ~int16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt16x32[T any, R ~int16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt32x4[T any, R ~int32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt32x8[T any, R ~int32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt32x16[T any, R ~int32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt64x2[T any, R ~int64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt64x4[T any, R ~int64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByInt64x8[T any, R ~int64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint8x16[T any, R ~uint8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint8x32[T any, R ~uint8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint8x64[T any, R ~uint8](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint16x8[T any, R ~uint16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint16x16[T any, R ~uint16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint16x32[T any, R ~uint16](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint32x4[T any, R ~uint32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint32x8[T any, R ~uint32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint32x16[T any, R ~uint32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint64x2[T any, R ~uint64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint64x4[T any, R ~uint64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByUint64x8[T any, R ~uint64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByFloat32x4[T any, R ~float32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByFloat32x8[T any, R ~float32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByFloat32x16[T any, R ~float32](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByFloat64x2[T any, R ~float64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByFloat64x4[T any, R ~float64](collection []T, iteratee func(item T) R) R"
|
||||||
|
- "func SumByFloat64x8[T any, R ~float64](collection []T, iteratee func(item T) R) R"
|
||||||
|
---
|
||||||
|
|
||||||
|
SumBy transforms a collection using an iteratee function and sums the results using SIMD instructions. The automatic dispatch functions (e.g., `SumByInt8`) select the best SIMD variant based on CPU capabilities. The specific variants (e.g., `SumByInt8x32`) use a fixed SIMD instruction set regardless of CPU capabilities.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **Go 1.26+** with `GOEXPERIMENT=simd`
|
||||||
|
- **amd64** architecture only
|
||||||
|
|
||||||
|
### CPU compatibility
|
||||||
|
|
||||||
|
| SIMD variant | Lanes | Required flags | Typical CPUs |
|
||||||
|
| ------------ | ----- | -------------- | ------------------------------ |
|
||||||
|
| SSE (xN) | 2-16 | `sse2` | All amd64 |
|
||||||
|
| AVX2 (xN) | 4-32 | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||||
|
| AVX-512 (xN) | 8-64 | `avx512f` | Intel Skylake-X+, some Xeons |
|
||||||
|
|
||||||
|
> **Note**: The automatic dispatch functions (e.g., `SumByInt8`) will use the best available SIMD variant for the current CPU. Use specific variants (e.g., `SumByInt8x32`) only if you know your target CPU supports that instruction set.
|
||||||
|
|
||||||
|
```go
|
||||||
|
type Person struct {
|
||||||
|
Name string
|
||||||
|
Age int8
|
||||||
|
}
|
||||||
|
|
||||||
|
people := []Person{
|
||||||
|
{Name: "Alice", Age: 25},
|
||||||
|
{Name: "Bob", Age: 30},
|
||||||
|
{Name: "Charlie", Age: 35},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Automatic dispatch - uses best available SIMD
|
||||||
|
sum := simd.SumByInt8(people, func(p Person) int8 {
|
||||||
|
return p.Age
|
||||||
|
})
|
||||||
|
// 90
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
type Product struct {
|
||||||
|
Name string
|
||||||
|
Price float32
|
||||||
|
Stock int32
|
||||||
|
}
|
||||||
|
|
||||||
|
products := []Product{
|
||||||
|
{Name: "Widget", Price: 10.50, Stock: 5},
|
||||||
|
{Name: "Gadget", Price: 20.00, Stock: 3},
|
||||||
|
{Name: "Tool", Price: 15.75, Stock: 2},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sum stock value using specific AVX2 variant
|
||||||
|
sum := simd.SumByFloat32x8(products, func(p Product) float32 {
|
||||||
|
return p.Price * float32(p.Stock)
|
||||||
|
})
|
||||||
|
// 144
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
type Metric struct {
|
||||||
|
Value uint16
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics := []Metric{
|
||||||
|
{Value: 100},
|
||||||
|
{Value: 200},
|
||||||
|
{Value: 300},
|
||||||
|
{Value: 400},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Using SSE variant - works on all amd64
|
||||||
|
sum := simd.SumByUint16x8(metrics, func(m Metric) uint16 {
|
||||||
|
return m.Value
|
||||||
|
})
|
||||||
|
// 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Empty collection returns 0
|
||||||
|
type Item struct {
|
||||||
|
Count int64
|
||||||
|
}
|
||||||
|
|
||||||
|
sum := simd.SumByInt64([]Item{}, func(i Item) int64 {
|
||||||
|
return i.Count
|
||||||
|
})
|
||||||
|
// 0
|
||||||
|
```
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"label": "💨 SIMD",
|
||||||
|
"position": 5,
|
||||||
|
"link": {
|
||||||
|
"type": "generated-index",
|
||||||
|
"description": "SIMD operations"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,44 @@
|
|||||||
|
---
|
||||||
|
title: SIMD operations
|
||||||
|
description: High-performance slice operations using SSE, AVX2 and AVX512 SIMD when built with Go 1.26+ and GOEXPERIMENT=simd on amd64.
|
||||||
|
sidebar_position: 0
|
||||||
|
hide_table_of_contents: true
|
||||||
|
---
|
||||||
|
|
||||||
|
:::warning Help improve this documentation
|
||||||
|
This documentation is still new and evolving. If you spot any mistakes, unclear explanations, or missing details, please [open an issue](https://github.com/samber/lo/issues).
|
||||||
|
|
||||||
|
Your feedback helps us improve!
|
||||||
|
:::
|
||||||
|
|
||||||
|
#
|
||||||
|
## SIMD helpers
|
||||||
|
|
||||||
|
This page lists all slice operations available in the `exp/simd` sub-package. These helpers use **SSE** (128-bit), **AVX2** (256-bit) or **AVX512** (512-bit) SIMD instructions when built for amd64 with Go 1.26+ and the `GOEXPERIMENT=simd` flag.
|
||||||
|
|
||||||
|
## Performance
|
||||||
|
|
||||||
|
Benchmarks show that running SIMD operations on small datasets is slower:
|
||||||
|
|
||||||
|
```txt
|
||||||
|
BenchmarkSumInt8/small/Fallback-lo-4 203616572 5.875 ns/op
|
||||||
|
BenchmarkSumInt8/small/SSE-x16-4 100000000 12.04 ns/op
|
||||||
|
BenchmarkSumInt8/small/AVX2-x32-4 64041816 17.93 ns/op
|
||||||
|
BenchmarkSumInt8/small/AVX512-x64-4 26947528 44.75 ns/op
|
||||||
|
```
|
||||||
|
|
||||||
|
But SIMD is much faster on large datasets:
|
||||||
|
|
||||||
|
```txt
|
||||||
|
BenchmarkSumInt8/xlarge/Fallback-lo-4 247677 4860 ns/op
|
||||||
|
BenchmarkSumInt8/xlarge/SSE-x16-4 3851040 311.4 ns/op
|
||||||
|
BenchmarkSumInt8/xlarge/AVX2-x32-4 7100002 169.2 ns/op
|
||||||
|
BenchmarkSumInt8/xlarge/AVX512-x64-4 10107534 118.1 ns/op
|
||||||
|
```
|
||||||
|
|
||||||
|
import HelperList from '@site/plugins/helpers-pages/components/HelperList';
|
||||||
|
|
||||||
|
<HelperList
|
||||||
|
category="exp"
|
||||||
|
subCategory="simd"
|
||||||
|
/>
|
||||||
@@ -0,0 +1,446 @@
|
|||||||
|
# Benchmark
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Benchmarks show that running SIMD operations on small datasets is slower:
|
||||||
|
|
||||||
|
```txt
|
||||||
|
BenchmarkSumInt8/small/Fallback-lo-2 248740710 5.218 ns/op
|
||||||
|
BenchmarkSumInt8/small/SSE-x16-2 126181464 9.485 ns/op
|
||||||
|
BenchmarkSumInt8/small/AVX2-x32-2 73059427 14.44 ns/op
|
||||||
|
BenchmarkSumInt8/small/AVX512-x64-2 49913169 24.41 ns/op
|
||||||
|
```
|
||||||
|
|
||||||
|
But SIMD is much faster on large datasets:
|
||||||
|
|
||||||
|
```txt
|
||||||
|
BenchmarkSumInt8/xlarge/Fallback-lo-2 273898 4383 ns/op
|
||||||
|
BenchmarkSumInt8/xlarge/SSE-x16-2 6928408 173.1 ns/op
|
||||||
|
BenchmarkSumInt8/xlarge/AVX2-x32-2 12639586 94.09 ns/op
|
||||||
|
BenchmarkSumInt8/xlarge/AVX512-x64-2 13509693 89.67 ns/op
|
||||||
|
```
|
||||||
|
|
||||||
|
## Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export GOEXPERIMENT=simd
|
||||||
|
cd exp/simd/
|
||||||
|
go test -run='^$' -bench=. -benchmem ./...
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# list the SIMD instruction sets supported by the CPU (see the "flags" field)
|
||||||
|
cat /proc/cpuinfo
|
||||||
|
```
|
||||||
|
|
||||||
|
## Result
|
||||||
|
|
||||||
|
```
|
||||||
|
archsimd.X86: AVX=true AVX2=true AVX512=true
|
||||||
|
goos: linux
|
||||||
|
goarch: amd64
|
||||||
|
pkg: github.com/samber/lo/exp/simd
|
||||||
|
cpu: AMD EPYC 9454P 48-Core Processor
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
PASS
|
||||||
|
ok github.com/samber/lo/exp/simd 596.213s
|
||||||
|
```
|
||||||
|
|
||||||
|
| Benchmark | Iterations | Time/op | Bytes/op | Allocs/op |
|
||||||
|
| ---------------------------------------------- | ---------- | ----------- | -------- | ----------- |
|
||||||
|
| BenchmarkContainsInt8/tiny/SSE-x16-2 | 312359204 | 3.625 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/tiny/AVX2-x32-2 | 277194441 | 4.531 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/tiny/AVX512-x64-2 | 336853209 | 3.401 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/small/SSE-x16-2 | 449132103 | 2.670 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/small/AVX2-x32-2 | 148648339 | 8.332 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/small/AVX512-x64-2 | 143124861 | 7.982 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/medium/SSE-x16-2 | 276816714 | 4.302 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/medium/AVX2-x32-2 | 345774957 | 3.529 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/medium/AVX512-x64-2 | 449868722 | 2.669 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/large/SSE-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/large/AVX2-x32-2 | 172934200 | 6.941 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/large/AVX512-x64-2 | 280992625 | 4.384 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/xlarge/SSE-x16-2 | 187189599 | 6.203 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/xlarge/AVX2-x32-2 | 274289563 | 4.042 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/xlarge/AVX512-x64-2 | 375048555 | 2.953 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/massive/SSE-x16-2 | 86434948 | 14.02 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/massive/AVX2-x32-2 | 153742346 | 8.012 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8/massive/AVX512-x64-2 | 259404483 | 5.214 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/tiny/SSE-x8-2 | 270309470 | 4.315 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/tiny/AVX2-x16-2 | 264874646 | 4.281 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/tiny/AVX512-x32-2 | 328810479 | 3.593 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/small/SSE-x8-2 | 374742561 | 3.206 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/small/AVX2-x16-2 | 449838870 | 2.678 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/small/AVX512-x32-2 | 143845734 | 8.484 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/medium/SSE-x8-2 | 185415590 | 6.448 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/medium/AVX2-x16-2 | 273780868 | 4.268 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/medium/AVX512-x32-2 | 350067484 | 3.431 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/large/SSE-x8-2 | 61109778 | 19.66 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/large/AVX2-x16-2 | 100000000 | 10.74 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/large/AVX512-x32-2 | 182886646 | 6.575 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/xlarge/SSE-x8-2 | 15220682 | 71.53 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/xlarge/AVX2-x16-2 | 31876572 | 37.57 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/xlarge/AVX512-x32-2 | 61992217 | 19.55 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/massive/SSE-x8-2 | 4372000 | 262.8 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/massive/AVX2-x16-2 | 9019658 | 131.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt16/massive/AVX512-x32-2 | 16568430 | 74.25 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/tiny/SSE-x4-2 | 499209442 | 2.406 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/tiny/AVX2-x8-2 | 350479609 | 3.433 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/tiny/AVX512-x16-2 | 280918554 | 4.309 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/small/SSE-x4-2 | 299561596 | 4.028 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/small/AVX2-x8-2 | 374064310 | 3.205 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/small/AVX512-x16-2 | 499219765 | 2.418 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/medium/SSE-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/medium/AVX2-x8-2 | 187391635 | 6.403 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/medium/AVX512-x16-2 | 307955800 | 3.875 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/large/SSE-x4-2 | 33256420 | 36.05 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/large/AVX2-x8-2 | 62421526 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/large/AVX512-x16-2 | 100000000 | 10.36 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/xlarge/SSE-x4-2 | 8328856 | 144.9 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/xlarge/AVX2-x8-2 | 17039037 | 71.14 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/xlarge/AVX512-x16-2 | 28740241 | 41.77 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/massive/SSE-x4-2 | 3525885 | 332.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/massive/AVX2-x8-2 | 7318027 | 164.5 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt32/massive/AVX512-x16-2 | 12181366 | 99.08 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/tiny/SSE-x2-2 | 409014308 | 2.934 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/tiny/AVX2-x4-2 | 449210791 | 2.667 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/tiny/AVX512-x8-2 | 280998146 | 4.293 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/small/SSE-x2-2 | 195631429 | 6.172 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/small/AVX2-x4-2 | 281272394 | 4.308 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/small/AVX512-x8-2 | 408933924 | 3.044 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/medium/SSE-x2-2 | 63006909 | 18.94 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/medium/AVX2-x4-2 | 100000000 | 10.67 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/medium/AVX512-x8-2 | 197411126 | 6.016 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/large/SSE-x2-2 | 17098578 | 70.57 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/large/AVX2-x4-2 | 32558013 | 37.07 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/large/AVX512-x8-2 | 57629485 | 20.94 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/xlarge/SSE-x2-2 | 4286155 | 281.8 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/xlarge/AVX2-x4-2 | 8344772 | 143.8 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/xlarge/AVX512-x8-2 | 14428276 | 83.14 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/massive/SSE-x2-2 | 1000000 | 1012 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/massive/AVX2-x4-2 | 2350525 | 510.6 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64/massive/AVX512-x8-2 | 3773523 | 318.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/tiny/SSE-x16-2 | 338880315 | 3.332 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/tiny/AVX2-x32-2 | 320784217 | 3.559 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/tiny/AVX512-x64-2 | 341599854 | 3.331 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/small/SSE-x16-2 | 449579424 | 2.670 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/small/AVX2-x32-2 | 140368142 | 8.648 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/small/AVX512-x64-2 | 146828888 | 8.182 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/medium/SSE-x16-2 | 374443974 | 3.472 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/medium/AVX2-x32-2 | 449271607 | 2.672 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/medium/AVX512-x64-2 | 598525731 | 2.018 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/large/SSE-x16-2 | 254828565 | 4.956 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/large/AVX2-x32-2 | 407777484 | 2.938 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/large/AVX512-x64-2 | 443472316 | 2.666 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/xlarge/SSE-x16-2 | 162196827 | 7.867 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/xlarge/AVX2-x32-2 | 268324950 | 4.518 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/xlarge/AVX512-x64-2 | 400437789 | 2.952 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/massive/SSE-x16-2 | 214548872 | 5.640 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/massive/AVX2-x32-2 | 348431553 | 3.391 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint8/massive/AVX512-x64-2 | 459781908 | 2.455 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/tiny/SSE-x8-2 | 276271912 | 4.297 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/tiny/AVX2-x16-2 | 281145528 | 4.270 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/tiny/AVX512-x32-2 | 315343911 | 3.667 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/small/SSE-x8-2 | 374632351 | 3.204 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/small/AVX2-x16-2 | 449355727 | 2.670 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/small/AVX512-x32-2 | 138088146 | 8.395 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/medium/SSE-x8-2 | 187276191 | 6.582 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/medium/AVX2-x16-2 | 281107980 | 4.306 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/medium/AVX512-x32-2 | 358850328 | 3.516 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/large/SSE-x8-2 | 59025931 | 19.98 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/large/AVX2-x16-2 | 100000000 | 10.68 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/large/AVX512-x32-2 | 179631354 | 6.569 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/xlarge/SSE-x8-2 | 16576267 | 71.63 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/xlarge/AVX2-x16-2 | 32578981 | 36.96 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/xlarge/AVX512-x32-2 | 61464870 | 19.44 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/massive/SSE-x8-2 | 2153736 | 557.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/massive/AVX2-x16-2 | 4225728 | 281.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint16/massive/AVX512-x32-2 | 7829936 | 145.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/tiny/SSE-x4-2 | 499390296 | 2.403 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/tiny/AVX2-x8-2 | 362964080 | 3.342 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/tiny/AVX512-x16-2 | 281063364 | 4.268 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/small/SSE-x4-2 | 293867554 | 4.004 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/small/AVX2-x8-2 | 374510434 | 3.203 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/small/AVX512-x16-2 | 499714206 | 2.402 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/medium/SSE-x4-2 | 100000000 | 10.42 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/medium/AVX2-x8-2 | 187258657 | 6.405 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/medium/AVX512-x16-2 | 312999210 | 3.881 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/large/SSE-x4-2 | 33298366 | 36.02 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/large/AVX2-x8-2 | 62409421 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/large/AVX512-x16-2 | 100000000 | 10.10 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/xlarge/SSE-x4-2 | 7948898 | 143.6 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/xlarge/AVX2-x8-2 | 17021738 | 70.49 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/xlarge/AVX512-x16-2 | 28742320 | 41.77 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/massive/SSE-x4-2 | 1595774 | 751.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/massive/AVX2-x8-2 | 3094242 | 381.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint32/massive/AVX512-x16-2 | 5080051 | 238.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/tiny/SSE-x2-2 | 374760351 | 3.203 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/tiny/AVX2-x4-2 | 498763054 | 2.419 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/tiny/AVX512-x8-2 | 319635274 | 3.582 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/small/SSE-x2-2 | 187032452 | 6.447 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/small/AVX2-x4-2 | 299546244 | 4.009 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/small/AVX512-x8-2 | 373937659 | 3.207 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/medium/SSE-x2-2 | 62413118 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/medium/AVX2-x4-2 | 113978791 | 10.42 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/medium/AVX512-x8-2 | 186965330 | 6.484 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/large/SSE-x2-2 | 17005768 | 70.57 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/large/AVX2-x4-2 | 33286495 | 36.69 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/large/AVX512-x8-2 | 61486065 | 19.93 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/xlarge/SSE-x2-2 | 4154370 | 280.8 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/xlarge/AVX2-x4-2 | 8371358 | 148.2 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/xlarge/AVX512-x8-2 | 14193795 | 72.36 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/massive/SSE-x2-2 | 1773937 | 676.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/massive/AVX2-x4-2 | 3500168 | 343.0 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsUint64/massive/AVX512-x8-2 | 7097266 | 249.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/tiny/SSE-x4-2 | 410522160 | 2.675 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/tiny/AVX2-x8-2 | 308565882 | 3.814 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/tiny/AVX512-x16-2 | 315331897 | 3.755 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/small/SSE-x4-2 | 278219434 | 4.642 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/small/AVX2-x8-2 | 362945481 | 3.287 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/small/AVX512-x16-2 | 408523153 | 2.941 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/medium/SSE-x4-2 | 100000000 | 10.77 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/medium/AVX2-x8-2 | 186186376 | 6.409 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/medium/AVX512-x16-2 | 264255108 | 4.619 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/large/SSE-x4-2 | 33028701 | 36.27 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/large/AVX2-x8-2 | 62465360 | 19.53 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/large/AVX512-x16-2 | 108213310 | 10.95 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/xlarge/SSE-x4-2 | 8359381 | 143.6 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/xlarge/AVX2-x8-2 | 17042701 | 70.46 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/xlarge/AVX512-x16-2 | 31806921 | 37.13 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/massive/SSE-x4-2 | 1000000 | 1100 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/massive/AVX2-x8-2 | 2164672 | 554.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat32/massive/AVX512-x16-2 | 4201453 | 293.9 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/tiny/SSE-x2-2 | 362183925 | 3.223 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/tiny/AVX2-x4-2 | 449021466 | 2.687 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/tiny/AVX512-x8-2 | 320176149 | 3.820 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/small/SSE-x2-2 | 187139116 | 6.415 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/small/AVX2-x4-2 | 280722585 | 4.300 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/small/AVX512-x8-2 | 335670502 | 3.472 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/medium/SSE-x2-2 | 62343927 | 19.23 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/medium/AVX2-x4-2 | 112332902 | 10.69 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/medium/AVX512-x8-2 | 179610780 | 6.741 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/large/SSE-x2-2 | 16996959 | 70.51 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/large/AVX2-x4-2 | 33017950 | 36.29 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/large/AVX512-x8-2 | 60322328 | 19.73 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/xlarge/SSE-x2-2 | 4141281 | 282.9 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/xlarge/AVX2-x4-2 | 7856590 | 145.0 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/xlarge/AVX512-x8-2 | 16623739 | 72.06 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/massive/SSE-x2-2 | 541202 | 2195 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/massive/AVX2-x4-2 | 1000000 | 1158 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsFloat64/massive/AVX512-x8-2 | 2115301 | 560.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsWorstCase/SSE-x4-2 | 7651734 | 145.6 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsWorstCase/AVX2-x8-2 | 14921599 | 70.49 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsWorstCase/AVX512-x16-2 | 28708478 | 41.38 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsBestCase/SSE-x4-2 | 534237578 | 2.136 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsBestCase/AVX2-x8-2 | 561252645 | 2.159 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsBestCase/AVX512-x16-2 | 560396454 | 2.137 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/tiny/SSE-x4-2 | 499649139 | 2.401 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/tiny/AVX2-x8-2 | 329743240 | 3.421 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/tiny/AVX512-x16-2 | 280516392 | 4.276 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/small/SSE-x4-2 | 299373171 | 4.006 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/small/AVX2-x8-2 | 374407988 | 3.267 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/small/AVX512-x16-2 | 486948346 | 2.424 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/medium/SSE-x4-2 | 100000000 | 10.41 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/medium/AVX2-x8-2 | 182899621 | 6.412 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/medium/AVX512-x16-2 | 311969776 | 3.829 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/large/SSE-x4-2 | 33309816 | 36.04 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/large/AVX2-x8-2 | 59912676 | 19.74 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/large/AVX512-x16-2 | 100000000 | 10.65 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/xlarge/SSE-x4-2 | 8346818 | 143.7 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/xlarge/AVX2-x8-2 | 16980399 | 70.54 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/xlarge/AVX512-x16-2 | 28676455 | 42.94 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/massive/SSE-x4-2 | 1000000 | 1151 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/massive/AVX2-x8-2 | 2161594 | 555.2 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsNegative/massive/AVX512-x16-2 | 3549094 | 350.5 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8ByWidth/SSE-x16-2 | 331533141 | 3.222 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8ByWidth/AVX2-x32-2 | 408741681 | 3.193 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt8ByWidth/AVX512-x64-2 | 365382873 | 3.241 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64SteadyState/SSE-x2-2 | 5722603 | 211.5 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64SteadyState/AVX2-x4-2 | 11711869 | 103.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkContainsInt64SteadyState/AVX512-x8-2 | 19671033 | 61.36 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/small/Fallback-lo-2 | 248740710 | 5.218 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/small/SSE-x16-2 | 126181464 | 9.485 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/small/AVX2-x32-2 | 73059427 | 14.44 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/small/AVX512-x64-2 | 49913169 | 24.41 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/medium/Fallback-lo-2 | 17278075 | 69.96 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/medium/SSE-x16-2 | 100000000 | 10.58 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/medium/AVX2-x32-2 | 91620999 | 13.10 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/medium/AVX512-x64-2 | 54082130 | 22.20 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/large/Fallback-lo-2 | 2006178 | 576.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/large/SSE-x16-2 | 41836690 | 27.82 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/large/AVX2-x32-2 | 51735399 | 23.04 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/large/AVX512-x64-2 | 40861586 | 29.40 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/xlarge/Fallback-lo-2 | 273898 | 4383 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/xlarge/SSE-x16-2 | 6928408 | 173.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/xlarge/AVX2-x32-2 | 12639586 | 94.09 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8/xlarge/AVX512-x64-2 | 13509693 | 89.67 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/small/Fallback-lo-2 | 249444103 | 5.012 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/small/SSE-x8-2 | 244927230 | 5.052 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/small/AVX2-x16-2 | 122088517 | 9.715 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/small/AVX512-x32-2 | 54098370 | 22.00 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/medium/Fallback-lo-2 | 15782683 | 72.54 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/medium/SSE-x8-2 | 100000000 | 10.51 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/medium/AVX2-x16-2 | 100000000 | 10.75 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/medium/AVX512-x32-2 | 56147455 | 21.38 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/large/Fallback-lo-2 | 2173214 | 598.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/large/SSE-x8-2 | 26319481 | 44.73 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/large/AVX2-x16-2 | 40459519 | 27.91 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/large/AVX512-x32-2 | 39359752 | 31.28 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/xlarge/Fallback-lo-2 | 273932 | 4382 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/xlarge/SSE-x8-2 | 3557265 | 331.2 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/xlarge/AVX2-x16-2 | 6930166 | 173.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt16/xlarge/AVX512-x32-2 | 12100244 | 97.01 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/small/Fallback-lo-2 | 249566539 | 4.808 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/small/SSE-x4-2 | 259250019 | 4.581 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/small/AVX2-x8-2 | 232858933 | 5.404 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/small/AVX512-x16-2 | 100000000 | 11.18 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/medium/Fallback-lo-2 | 17274441 | 72.28 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/medium/SSE-x4-2 | 58400258 | 20.56 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/medium/AVX2-x8-2 | 110851756 | 10.67 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/medium/AVX512-x16-2 | 106593603 | 11.25 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/large/Fallback-lo-2 | 2171817 | 551.8 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/large/SSE-x4-2 | 8270253 | 146.0 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/large/AVX2-x8-2 | 22234518 | 46.06 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/large/AVX512-x16-2 | 37448763 | 32.31 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/xlarge/Fallback-lo-2 | 273699 | 4559 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/xlarge/SSE-x4-2 | 1000000 | 1102 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/xlarge/AVX2-x8-2 | 3586887 | 332.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt32/xlarge/AVX512-x16-2 | 7214437 | 170.5 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/small/Fallback-lo-2 | 417473124 | 2.886 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/small/SSE-x2-2 | 287521756 | 4.169 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/small/AVX2-x4-2 | 277783513 | 4.311 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/small/AVX512-x8-2 | 172823103 | 6.993 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/medium/Fallback-lo-2 | 34022653 | 35.27 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/medium/SSE-x2-2 | 49241248 | 24.05 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/medium/AVX2-x4-2 | 78897342 | 14.58 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/medium/AVX512-x8-2 | 84361297 | 14.03 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/large/Fallback-lo-2 | 3680988 | 282.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/large/SSE-x2-2 | 6293607 | 170.7 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/large/AVX2-x4-2 | 12739849 | 91.28 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/large/AVX512-x8-2 | 25508130 | 46.30 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/xlarge/Fallback-lo-2 | 546321 | 2283 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/xlarge/SSE-x2-2 | 877434 | 1289 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/xlarge/AVX2-x4-2 | 1845892 | 650.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64/xlarge/AVX512-x8-2 | 2148355 | 550.8 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/small/Fallback-lo-2 | 411100770 | 2.951 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/small/SSE-x4-2 | 264013596 | 4.572 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/small/AVX2-x8-2 | 174478266 | 6.911 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/small/AVX512-x16-2 | 61182673 | 19.78 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/medium/Fallback-lo-2 | 33815070 | 35.68 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/medium/SSE-x4-2 | 58238188 | 20.66 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/medium/AVX2-x8-2 | 91316544 | 13.26 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/medium/AVX512-x16-2 | 80046624 | 15.08 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/large/Fallback-lo-2 | 4304168 | 278.7 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/large/SSE-x4-2 | 6198957 | 184.8 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/large/AVX2-x8-2 | 12260169 | 86.60 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/large/AVX512-x16-2 | 22147112 | 45.34 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/xlarge/Fallback-lo-2 | 546901 | 2193 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/xlarge/SSE-x4-2 | 736503 | 1622 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/xlarge/AVX2-x8-2 | 1493887 | 810.5 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat32/xlarge/AVX512-x16-2 | 2959298 | 393.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/small/Fallback-lo-2 | 410778070 | 3.043 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/small/SSE-x2-2 | 254156008 | 4.714 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/small/AVX2-x4-2 | 227604434 | 5.323 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/small/AVX512-x8-2 | 170099748 | 7.115 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/medium/Fallback-lo-2 | 33646345 | 35.78 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/medium/SSE-x2-2 | 32931152 | 34.92 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/medium/AVX2-x4-2 | 75389446 | 16.79 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/medium/AVX512-x8-2 | 89826181 | 13.33 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/large/Fallback-lo-2 | 4293837 | 302.8 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/large/SSE-x2-2 | 3146601 | 381.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/large/AVX2-x4-2 | 6373876 | 184.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/large/AVX512-x8-2 | 13464712 | 88.96 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/xlarge/Fallback-lo-2 | 545764 | 2193 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/xlarge/SSE-x2-2 | 368846 | 3390 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/xlarge/AVX2-x4-2 | 709940 | 1613 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumFloat64/xlarge/AVX512-x8-2 | 1480214 | 808.6 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/small/Fallback-lo-2 | 411529147 | 3.043 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/small/SSE-x4-2 | 204428401 | 5.872 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/small/AVX2-x8-2 | 187573928 | 6.214 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/small/AVX512-x16-2 | 98346700 | 12.12 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/medium/Fallback-lo-2 | 33481442 | 35.72 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/medium/SSE-x4-2 | 52042394 | 22.12 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/medium/AVX2-x8-2 | 96288541 | 13.44 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/medium/AVX512-x16-2 | 100995780 | 11.90 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/large/Fallback-lo-2 | 4296570 | 289.9 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/large/SSE-x4-2 | 7743022 | 146.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/large/AVX2-x8-2 | 24355988 | 46.26 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/large/AVX512-x16-2 | 37322655 | 32.89 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/xlarge/Fallback-lo-2 | 547008 | 2193 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/xlarge/SSE-x4-2 | 1087246 | 1112 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/xlarge/AVX2-x8-2 | 1386868 | 761.9 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanInt32/xlarge/AVX512-x16-2 | 7166142 | 170.7 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/small/Fallback-lo-2 | 349760005 | 3.449 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/small/SSE-x2-2 | 189674538 | 6.293 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/small/AVX2-x4-2 | 159228600 | 7.531 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/small/AVX512-x8-2 | 110196433 | 10.89 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/medium/Fallback-lo-2 | 32968618 | 36.17 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/medium/SSE-x2-2 | 30863817 | 37.69 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/medium/AVX2-x4-2 | 62428772 | 19.66 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/medium/AVX512-x8-2 | 77140984 | 15.54 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/large/Fallback-lo-2 | 4281057 | 280.6 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/large/SSE-x2-2 | 3057349 | 389.4 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/large/AVX2-x4-2 | 6509438 | 185.9 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/large/AVX512-x8-2 | 12668032 | 93.50 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/xlarge/Fallback-lo-2 | 545898 | 2288 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/xlarge/SSE-x2-2 | 367671 | 4048 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/xlarge/AVX2-x4-2 | 739941 | 1621 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMeanFloat64/xlarge/AVX512-x8-2 | 1434867 | 811.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/small/SSE-x4-2 | 312338268 | 3.860 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/small/AVX2-x8-2 | 238034872 | 5.042 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/small/AVX512-x16-2 | 152600943 | 6.661 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/medium/SSE-x4-2 | 61051266 | 19.73 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/medium/AVX2-x8-2 | 91792144 | 13.11 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/medium/AVX512-x16-2 | 99994540 | 12.18 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/large/SSE-x4-2 | 8604774 | 140.5 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/large/AVX2-x8-2 | 15581037 | 77.56 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/large/AVX512-x16-2 | 30512421 | 40.24 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/xlarge/SSE-x4-2 | 1000000 | 1110 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/xlarge/AVX2-x8-2 | 2158272 | 557.2 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinInt32/xlarge/AVX512-x16-2 | 4253668 | 282.6 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/small/SSE-x2-2 | 264129410 | 4.544 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/small/AVX2-x4-2 | 299587609 | 4.008 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/small/AVX512-x8-2 | 100000000 | 10.05 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/medium/SSE-x2-2 | 32778514 | 36.93 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/medium/AVX2-x4-2 | 53356347 | 20.30 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/medium/AVX512-x8-2 | 74832976 | 16.21 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/large/SSE-x2-2 | 3863326 | 300.0 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/large/AVX2-x4-2 | 7670576 | 146.5 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/large/AVX512-x8-2 | 14017984 | 78.21 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/xlarge/SSE-x2-2 | 492739 | 2195 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/xlarge/AVX2-x4-2 | 1000000 | 1103 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMinFloat64/xlarge/AVX512-x8-2 | 2145290 | 560.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/small/SSE-x4-2 | 306585705 | 3.860 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/small/AVX2-x8-2 | 237347997 | 5.086 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/small/AVX512-x16-2 | 201433966 | 6.130 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/medium/SSE-x4-2 | 60759631 | 19.92 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/medium/AVX2-x8-2 | 90934662 | 13.13 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/medium/AVX512-x16-2 | 98517944 | 12.18 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/large/SSE-x4-2 | 8590542 | 139.6 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/large/AVX2-x8-2 | 15770372 | 77.69 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/large/AVX512-x16-2 | 30197324 | 39.32 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/xlarge/SSE-x4-2 | 1000000 | 1104 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/xlarge/AVX2-x8-2 | 2152038 | 562.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxInt32/xlarge/AVX512-x16-2 | 3917990 | 296.7 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/small/SSE-x2-2 | 249617162 | 4.816 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/small/AVX2-x4-2 | 207017514 | 5.855 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/small/AVX512-x8-2 | 66520290 | 17.74 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/medium/SSE-x2-2 | 32307492 | 36.92 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/medium/AVX2-x4-2 | 57306838 | 20.77 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/medium/AVX512-x8-2 | 56911946 | 21.12 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/large/SSE-x2-2 | 4259366 | 287.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/large/AVX2-x4-2 | 7905420 | 148.9 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/large/AVX512-x8-2 | 14100686 | 83.43 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/xlarge/SSE-x2-2 | 545378 | 2243 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/xlarge/AVX2-x4-2 | 1000000 | 1113 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkMaxFloat64/xlarge/AVX512-x8-2 | 2119741 | 565.7 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8ByWidth/Fallback-lo-2 | 896775 | 1335 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8ByWidth/SSE-x16-2 | 12557700 | 94.52 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8ByWidth/AVX2-x32-2 | 18702537 | 55.03 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt8ByWidth/AVX512-x64-2 | 21342572 | 56.10 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64SteadyState/Fallback-lo-2 | 513738 | 2195 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64SteadyState/SSE-x2-2 | 928376 | 1296 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64SteadyState/AVX2-x4-2 | 1836968 | 888.1 ns/op | 0 B/op | 0 allocs/op |
|
||||||
|
| BenchmarkSumInt64SteadyState/AVX512-x8-2 | 2141715 | 551.3 ns/op | 0 B/op | 0 allocs/op |
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
# SIMD experiment (Go 1.26+)
|
||||||
|
|
||||||
|
This package requires **Go 1.26 or later**, built with `GOEXPERIMENT=simd`, and only compiles on **amd64**.
|
||||||
|
|
||||||
|
See [benchmarks](./BENCHMARK.md).
|
||||||
|
|
||||||
|
## CPU compatibility (avoiding SIGILL)
|
||||||
|
|
||||||
|
If you see **SIGILL: illegal instruction** when running tests, the CPU or VM does not support the SIMD instructions used by that code.
|
||||||
|
|
||||||
|
### Check support on Linux
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List SIMD-related flags
|
||||||
|
grep -E 'avx|sse' /proc/cpuinfo
|
||||||
|
|
||||||
|
# Or with lscpu
|
||||||
|
lscpu | grep -i avx
|
||||||
|
```
|
||||||
|
|
||||||
|
**Rough mapping:**
|
||||||
|
|
||||||
|
| Tests / code | Required flag(s) | Typical CPUs |
|
||||||
|
| ----------------- | -------------------------- | ----------------------------------------------------------------------- |
|
||||||
|
| SSE (128-bit) | `sse2` (baseline on amd64) | All amd64 |
|
||||||
|
| AVX2 (256-bit) | `avx2` | Intel Haswell+, AMD Excavator+ |
|
||||||
|
| AVX-512 (512-bit) | `avx512f` | Intel Skylake-X+, some Xeons; many AMD/consumer CPUs do **not** have it |
|
||||||
|
|
||||||
|
### What the tests do
|
||||||
|
|
||||||
|
- **AVX2 tests** call `requireAVX2(t)` and are **skipped** if the CPU does not support AVX2 (no SIGILL).
|
||||||
|
- **AVX-512 tests** (when enabled) should call `requireAVX512(t)` and skip when AVX-512 is not available.
|
||||||
|
|
||||||
|
So on a machine without AVX2, AVX2 tests will show as skipped instead of crashing.
|
||||||
|
|
||||||
|
### Run only SSE tests
|
||||||
|
|
||||||
|
If your environment does not support AVX2/AVX-512, you can still run the SSE tests:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
GOEXPERIMENT=simd go test -run SSE ./...
|
||||||
|
```
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
//go:build go1.26 && goexperiment.simd && amd64
|
||||||
|
|
||||||
|
package simd
|
||||||
|
|
||||||
|
import "simd/archsimd"
|
||||||
|
|
||||||
|
// simdFeature represents the highest available SIMD instruction set
|
||||||
|
type simdFeature int
|
||||||
|
|
||||||
|
const (
|
||||||
|
simdFeatureNone simdFeature = iota
|
||||||
|
simdFeatureAVX
|
||||||
|
simdFeatureAVX2
|
||||||
|
simdFeatureAVX512
|
||||||
|
)
|
||||||
|
|
||||||
|
// currentSimdFeature is cached at package init to avoid repeated CPU feature checks
|
||||||
|
var currentSimdFeature simdFeature
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
if archsimd.X86.AVX512() {
|
||||||
|
currentSimdFeature = simdFeatureAVX512
|
||||||
|
} else if archsimd.X86.AVX2() {
|
||||||
|
currentSimdFeature = simdFeatureAVX2
|
||||||
|
} else if archsimd.X86.AVX() {
|
||||||
|
currentSimdFeature = simdFeatureAVX
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
//go:build go1.26 && goexperiment.simd && amd64
|
||||||
|
|
||||||
|
package simd
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"simd/archsimd"
|
||||||
|
)
|
||||||
|
|
||||||
|
// skipHelper is the minimal slice of the testing API needed by the CPU feature
// guards in this file. Both *testing.T and *testing.B satisfy it, so a single
// guard can be shared by tests and benchmarks.
type skipHelper interface {
	// Skipf skips the current test or benchmark, reporting the formatted reason.
	Skipf(format string, args ...any)
	// Helper marks the calling function as a test helper.
	Helper()
}
|
||||||
|
|
||||||
|
// How to check if your Linux CPU supports SIMD (avoids SIGILL):
|
||||||
|
//
|
||||||
|
// grep -E 'avx|sse' /proc/cpuinfo
|
||||||
|
//
|
||||||
|
// Or: lscpu | grep -i avx
|
||||||
|
//
|
||||||
|
// You need:
|
||||||
|
// - SSE tests (128-bit): sse2 (baseline on amd64), sse4.1/sse4.2 often used
|
||||||
|
// - AVX2 tests (256-bit): avx2 in flags
|
||||||
|
// - AVX-512 tests: avx512f (and often avx512bw, avx512vl)
|
||||||
|
//
|
||||||
|
// If your CPU lacks AVX2 or AVX-512, tests that use them will be skipped automatically.
|
||||||
|
|
||||||
|
// requireAVX2 skips the test/benchmark if the CPU does not support AVX2 (256-bit SIMD).
|
||||||
|
// Use at the start of each AVX2 test/benchmark to avoid SIGILL on older or non-x86 systems.
|
||||||
|
func requireAVX2(t skipHelper) {
|
||||||
|
t.Helper()
|
||||||
|
if !archsimd.X86.AVX2() {
|
||||||
|
t.Skipf("CPU does not support AVX2; skipping. Check compatibility: grep avx2 /proc/cpuinfo")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// requireAVX512 skips the test/benchmark if the CPU does not support AVX-512 Foundation.
|
||||||
|
// Use at the start of each AVX-512 test/benchmark to avoid SIGILL on CPUs without AVX-512.
|
||||||
|
func requireAVX512(t skipHelper) {
|
||||||
|
t.Helper()
|
||||||
|
if !archsimd.X86.AVX512() {
|
||||||
|
t.Skipf("CPU does not support AVX-512; skipping. Check compatibility: grep avx512 /proc/cpuinfo")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// PrintCPUFeatures prints detected x86 SIMD features (for debugging).
|
||||||
|
// Run: go test -run PrintCPUFeatures -v
|
||||||
|
func PrintCPUFeatures(t *testing.T) {
|
||||||
|
fmt.Fprintf(os.Stdout, "X86 HasAVX=%v HasAVX2=%v HasAVX512=%v\n",
|
||||||
|
archsimd.X86.AVX(), archsimd.X86.AVX2(), archsimd.X86.AVX512())
|
||||||
|
}
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
module github.com/samber/lo/exp/simd
|
||||||
|
|
||||||
|
go 1.26.0
|
||||||
|
|
||||||
|
require github.com/samber/lo v0.0.0
|
||||||
|
|
||||||
|
require golang.org/x/text v0.22.0 // indirect
|
||||||
|
|
||||||
|
replace github.com/samber/lo => ../../
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
|
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||||
|
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||||
|
go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A=
|
||||||
|
go.uber.org/goleak v1.2.1/go.mod h1:qlT2yGI9QafXHhZZLxlSuNsMw3FFLxBr+tBRlmO1xH4=
|
||||||
|
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
|
||||||
|
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
@@ -0,0 +1,969 @@
|
|||||||
|
//go:build go1.26 && goexperiment.simd && amd64
|
||||||
|
|
||||||
|
package simd
|
||||||
|
|
||||||
|
import (
|
||||||
|
"simd/archsimd"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ContainsInt8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsInt8x16[T ~int8](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes16
|
||||||
|
targetVec := archsimd.BroadcastInt8x16(int8(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt8(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt8x16Slice(s)
|
||||||
|
|
||||||
|
// Compare for equality; Equal returns a mask, ToBits() its bitmask.
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle remaining elements
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsInt16x8[T ~int16](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes8
|
||||||
|
targetVec := archsimd.BroadcastInt16x8(int16(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt16(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt16x8Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsInt32x4[T ~int32](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes4
|
||||||
|
targetVec := archsimd.BroadcastInt32x4(int32(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt32(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt32x4Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsInt64x2[T ~int64](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes2
|
||||||
|
targetVec := archsimd.BroadcastInt64x2(int64(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt64(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt64x2Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsUint8x16[T ~uint8](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes16
|
||||||
|
targetVec := archsimd.BroadcastUint8x16(uint8(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint8(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint8x16Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsUint16x8[T ~uint16](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes8
|
||||||
|
targetVec := archsimd.BroadcastUint16x8(uint16(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint16(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint16x8Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsUint32x4[T ~uint32](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes4
|
||||||
|
targetVec := archsimd.BroadcastUint32x4(uint32(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint32(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint32x4Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsUint64x2[T ~uint64](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes2
|
||||||
|
targetVec := archsimd.BroadcastUint64x2(uint64(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint64(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint64x2Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsFloat32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsFloat32x4[T ~float32](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes4
|
||||||
|
targetVec := archsimd.BroadcastFloat32x4(float32(target))
|
||||||
|
|
||||||
|
base := unsafeSliceFloat32(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadFloat32x4Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsFloat64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
|
||||||
|
func ContainsFloat64x2[T ~float64](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes2
|
||||||
|
targetVec := archsimd.BroadcastFloat64x2(float64(target))
|
||||||
|
|
||||||
|
base := unsafeSliceFloat64(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadFloat64x2Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt8x32 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsInt8x32[T ~int8](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes32
|
||||||
|
targetVec := archsimd.BroadcastInt8x32(int8(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt8(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt8x32Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt16x16 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsInt16x16[T ~int16](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes16
|
||||||
|
targetVec := archsimd.BroadcastInt16x16(int16(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt16(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt16x16Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt32x8 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsInt32x8[T ~int32](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes8
|
||||||
|
targetVec := archsimd.BroadcastInt32x8(int32(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt32(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt32x8Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt64x4 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsInt64x4[T ~int64](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes4
|
||||||
|
targetVec := archsimd.BroadcastInt64x4(int64(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt64(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt64x4Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint8x32 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsUint8x32[T ~uint8](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes32
|
||||||
|
targetVec := archsimd.BroadcastUint8x32(uint8(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint8(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint8x32Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint16x16 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsUint16x16[T ~uint16](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes16
|
||||||
|
targetVec := archsimd.BroadcastUint16x16(uint16(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint16(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint16x16Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint32x8 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsUint32x8[T ~uint32](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes8
|
||||||
|
targetVec := archsimd.BroadcastUint32x8(uint32(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint32(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint32x8Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint64x4 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsUint64x4[T ~uint64](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes4
|
||||||
|
targetVec := archsimd.BroadcastUint64x4(uint64(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint64(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint64x4Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsFloat32x8 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsFloat32x8[T ~float32](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes8
|
||||||
|
targetVec := archsimd.BroadcastFloat32x8(float32(target))
|
||||||
|
|
||||||
|
base := unsafeSliceFloat32(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadFloat32x8Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsFloat64x4 checks if collection contains target using AVX2 SIMD
|
||||||
|
func ContainsFloat64x4[T ~float64](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes4
|
||||||
|
targetVec := archsimd.BroadcastFloat64x4(float64(target))
|
||||||
|
|
||||||
|
base := unsafeSliceFloat64(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadFloat64x4Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt8x64 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsInt8x64[T ~int8](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes64
|
||||||
|
targetVec := archsimd.BroadcastInt8x64(int8(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt8(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt8x64Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt16x32 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsInt16x32[T ~int16](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes32
|
||||||
|
targetVec := archsimd.BroadcastInt16x32(int16(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt16(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt16x32Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt32x16 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsInt32x16[T ~int32](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes16
|
||||||
|
targetVec := archsimd.BroadcastInt32x16(int32(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt32(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt32x16Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsInt64x8 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsInt64x8[T ~int64](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes8
|
||||||
|
targetVec := archsimd.BroadcastInt64x8(int64(target))
|
||||||
|
|
||||||
|
base := unsafeSliceInt64(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadInt64x8Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint8x64 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsUint8x64[T ~uint8](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes64
|
||||||
|
targetVec := archsimd.BroadcastUint8x64(uint8(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint8(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint8x64Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint16x32 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsUint16x32[T ~uint16](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes32
|
||||||
|
targetVec := archsimd.BroadcastUint16x32(uint16(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint16(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint16x32Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint32x16 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsUint32x16[T ~uint32](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes16
|
||||||
|
targetVec := archsimd.BroadcastUint32x16(uint32(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint32(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint32x16Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsUint64x8 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsUint64x8[T ~uint64](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes8
|
||||||
|
targetVec := archsimd.BroadcastUint64x8(uint64(target))
|
||||||
|
|
||||||
|
base := unsafeSliceUint64(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadUint64x8Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsFloat32x16 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsFloat32x16[T ~float32](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes16
|
||||||
|
targetVec := archsimd.BroadcastFloat32x16(float32(target))
|
||||||
|
|
||||||
|
base := unsafeSliceFloat32(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadFloat32x16Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainsFloat64x8 checks if collection contains target using AVX-512 SIMD
|
||||||
|
func ContainsFloat64x8[T ~float64](collection []T, target T) bool {
|
||||||
|
length := uint(len(collection))
|
||||||
|
if length == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
const lanes = simdLanes8
|
||||||
|
targetVec := archsimd.BroadcastFloat64x8(float64(target))
|
||||||
|
|
||||||
|
base := unsafeSliceFloat64(collection, length)
|
||||||
|
|
||||||
|
i := uint(0)
|
||||||
|
for ; i+lanes <= length; i += lanes {
|
||||||
|
s := base[i : i+lanes]
|
||||||
|
v := archsimd.LoadFloat64x8Slice(s)
|
||||||
|
|
||||||
|
cmp := v.Equal(targetVec)
|
||||||
|
if cmp.ToBits() != 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ; i < length; i++ {
|
||||||
|
if collection[i] == target {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,557 @@
|
|||||||
|
//go:build go1.26 && goexperiment.simd && amd64
|
||||||
|
|
||||||
|
package simd
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Benchmark suite for SIMD Contains operations compared to core lo package fallbacks.
|
||||||
|
// These benchmarks measure the performance of element lookup operations
|
||||||
|
// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes.
|
||||||
|
|
||||||
|
// containsBenchmarkSizes lists the named input lengths every Contains
// benchmark iterates over. The per-entry notes relate each length to the int8
// vector widths (16, 32 and 64 lanes).
var containsBenchmarkSizes = []struct {
	name string
	size int
}{
	{name: "tiny", size: 4},       // shorter than one 16-lane int8 vector
	{name: "small", size: 16},     // exactly one 16-lane int8 vector
	{name: "medium", size: 64},    // exactly one 64-lane int8 vector
	{name: "large", size: 256},    // several full vectors at every int8 width
	{name: "xlarge", size: 1024},  // multiple of the 64-lane int8 width
	{name: "massive", size: 8192}, // very large dataset
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS INT8 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsInt8(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt8(bs.size)
|
||||||
|
target := int8(42)
|
||||||
|
|
||||||
|
b.Run("SSE-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt8x16 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt8x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x32", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt8x32 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt8x32(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x64", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt8x64(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS INT16 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsInt16(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt16(bs.size)
|
||||||
|
target := int16(42)
|
||||||
|
|
||||||
|
b.Run("SSE-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt16x8 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt16x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt16x16 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt16x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x32", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt16x32(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS INT32 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsInt32(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt32(bs.size)
|
||||||
|
target := int32(42)
|
||||||
|
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS INT64 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsInt64(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt64(bs.size)
|
||||||
|
target := int64(42)
|
||||||
|
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt64x2(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt64x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt64x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS UINT8 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsUint8(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateUint8(bs.size)
|
||||||
|
target := uint8(255)
|
||||||
|
|
||||||
|
b.Run("SSE-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsUint8x16 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint8x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x32", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsUint8x32 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint8x32(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x64", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint8x64(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS UINT16 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsUint16(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateUint16(bs.size)
|
||||||
|
target := uint16(42)
|
||||||
|
|
||||||
|
b.Run("SSE-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsUint16x8 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint16x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsUint16x16 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint16x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x32", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint16x32(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS UINT32 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsUint32(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateUint32(bs.size)
|
||||||
|
target := uint32(42)
|
||||||
|
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsUint32x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint32x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsUint32x8 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint32x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint32x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS UINT64 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsUint64(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateUint64(bs.size)
|
||||||
|
target := uint64(42)
|
||||||
|
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsUint64x2 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint64x2(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsUint64x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint64x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsUint64x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS FLOAT32 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsFloat32(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateFloat32(bs.size)
|
||||||
|
target := float32(42.5)
|
||||||
|
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsFloat32x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsFloat32x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsFloat32x8 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsFloat32x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsFloat32x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS FLOAT64 BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkContainsFloat64(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateFloat64(bs.size)
|
||||||
|
target := float64(42.5)
|
||||||
|
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsFloat64x2 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsFloat64x2(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsFloat64x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsFloat64x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsFloat64x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS WORST-CASE BENCHMARKS (target at end)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// These benchmarks test worst-case performance where target is at the very end
|
||||||
|
func BenchmarkContainsWorstCase(b *testing.B) {
|
||||||
|
size := 1024
|
||||||
|
data := make([]int32, size)
|
||||||
|
for i := range data {
|
||||||
|
data[i] = int32(i)
|
||||||
|
}
|
||||||
|
target := int32(size - 1) // Target at the very end
|
||||||
|
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x16 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS BEST-CASE BENCHMARKS (target at beginning)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// These benchmarks test best-case performance where target is at the beginning
|
||||||
|
func BenchmarkContainsBestCase(b *testing.B) {
|
||||||
|
size := 1024
|
||||||
|
data := make([]int32, size)
|
||||||
|
for i := range data {
|
||||||
|
data[i] = int32(i)
|
||||||
|
}
|
||||||
|
target := int32(0) // Target at the very beginning
|
||||||
|
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x16 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS NEGATIVE-CASE BENCHMARKS (target not present)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// These benchmarks test performance when target is not in the collection
|
||||||
|
func BenchmarkContainsNegative(b *testing.B) {
|
||||||
|
for _, bs := range containsBenchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt32(bs.size)
|
||||||
|
target := int32(999999) // Target that's unlikely to be in the data
|
||||||
|
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt32x8 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt32x16(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONTAINS LANE WIDTH COMPARISON BENCHMARK
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// This benchmark shows how performance scales with SIMD register width
|
||||||
|
func BenchmarkContainsInt8ByWidth(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
|
||||||
|
size := 4096
|
||||||
|
data := generateInt8(size)
|
||||||
|
target := int8(42)
|
||||||
|
|
||||||
|
benchmarks := []struct {
|
||||||
|
name string
|
||||||
|
fn func() bool
|
||||||
|
}{
|
||||||
|
{"SSE-x16", func() bool { return ContainsInt8x16(data, target) }},
|
||||||
|
{"AVX2-x32", func() bool { return ContainsInt8x32(data, target) }},
|
||||||
|
{"AVX512-x64", func() bool { return ContainsInt8x64(data, target) }},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, bm := range benchmarks {
|
||||||
|
b.Run(bm.name, func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = bm.fn()
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// STEADY STATE BENCHMARK
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// This benchmark demonstrates the steady-state performance after warmup
|
||||||
|
func BenchmarkContainsInt64SteadyState(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
|
||||||
|
size := 8192
|
||||||
|
data := generateInt64(size)
|
||||||
|
target := int64(42)
|
||||||
|
|
||||||
|
// Warmup phase
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
ContainsInt64x2(data, target)
|
||||||
|
ContainsInt64x4(data, target)
|
||||||
|
ContainsInt64x8(data, target)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.ResetTimer() // Reset timer to exclude warmup
|
||||||
|
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt64x2 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt64x2(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt64x4 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt64x4(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b) // ContainsInt64x8 is in intersect_avx512.go which uses AVX-512
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = ContainsInt64x8(data, target)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
+1163
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,601 @@
|
|||||||
|
//go:build go1.26 && goexperiment.simd && amd64
|
||||||
|
|
||||||
|
package simd
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math/rand"
|
||||||
|
"simd/archsimd"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/samber/lo"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Benchmark suite for SIMD math operations compared to core lo package fallbacks.
|
||||||
|
// These benchmarks measure the performance of Sum, Mean, Min, and Max operations
|
||||||
|
// across different SIMD implementations (SSE, AVX2, AVX512) and data sizes.
|
||||||
|
|
||||||
|
// benchmarkSizes lists the named input lengths every math benchmark is run
// against, to show performance characteristics at different scales. Lane
// counts in the comments are for int8 elements: 16 (SSE), 32 (AVX2) and
// 64 (AVX-512).
var benchmarkSizes = []struct {
	name string
	size int
}{
	{"small", 8},     // Smaller than SSE width (16 lanes for int8)
	{"medium", 128},  // A few full registers at every width for int8: 8x SSE, 4x AVX2, 2x AVX-512
	{"large", 1024},  // Well above SIMD register widths
	{"xlarge", 8192}, // Large dataset for real-world performance
}
|
||||||
|
|
||||||
|
// init seeds the global math/rand source from the wall clock.
//
// NOTE: this does NOT make benchmark inputs reproducible. The seed changes on
// every run, and since Go 1.24 the deprecated top-level rand.Seed is a no-op
// unless GODEBUG=randseednop=0 is set. This file only builds with go1.26+, so
// by default the call has no effect and the global source stays randomly
// seeded. For reproducible inputs, the generators would need a dedicated
// rand.New(rand.NewSource(fixedSeed)) instance instead.
func init() {
	rand.Seed(time.Now().UnixNano())
}
|
||||||
|
|
||||||
|
// Helper function to generate random test data
|
||||||
|
// benchDataGenerator is a generic function type with the same shape as the
// generate* helpers in this file: it takes a length n and returns a []T.
type benchDataGenerator[T any] func(n int) []T
|
||||||
|
|
||||||
|
// generateInt8 returns a slice of n pseudo-random int8 values taken from the
// global math/rand source, each in the range [-64, 62].
func generateInt8(n int) []int8 {
	values := make([]int8, n)
	for idx := 0; idx < n; idx++ {
		sample := rand.Intn(127) - 64
		values[idx] = int8(sample)
	}
	return values
}
|
||||||
|
|
||||||
|
// generateInt16 returns a slice of n pseudo-random int16 values taken from the
// global math/rand source, each in the range [-16384, 16382].
func generateInt16(n int) []int16 {
	values := make([]int16, n)
	for idx := 0; idx < n; idx++ {
		sample := rand.Intn(32767) - 16384
		values[idx] = int16(sample)
	}
	return values
}
|
||||||
|
|
||||||
|
// generateInt32 returns a slice of n pseudo-random int32 values taken from the
// global math/rand source, each in the range [-500, 499].
func generateInt32(n int) []int32 {
	values := make([]int32, n)
	for idx := 0; idx < n; idx++ {
		sample := rand.Intn(1000) - 500
		values[idx] = int32(sample)
	}
	return values
}
|
||||||
|
|
||||||
|
// generateInt64 returns a slice of n pseudo-random int64 values taken from the
// global math/rand source, each in the range [0, 9999].
func generateInt64(n int) []int64 {
	values := make([]int64, n)
	for idx := 0; idx < n; idx++ {
		sample := rand.Int63()
		values[idx] = sample % 10000
	}
	return values
}
|
||||||
|
|
||||||
|
// generateUint8 returns a slice of n pseudo-random uint8 values taken from the
// global math/rand source, covering the full range [0, 255].
func generateUint8(n int) []uint8 {
	values := make([]uint8, n)
	for idx := 0; idx < n; idx++ {
		sample := rand.Uint32() % 256
		values[idx] = uint8(sample)
	}
	return values
}
|
||||||
|
|
||||||
|
// generateUint16 returns a slice of n pseudo-random uint16 values taken from
// the global math/rand source, covering the full range [0, 65535].
func generateUint16(n int) []uint16 {
	values := make([]uint16, n)
	for idx := 0; idx < n; idx++ {
		sample := rand.Uint32() % 65536
		values[idx] = uint16(sample)
	}
	return values
}
|
||||||
|
|
||||||
|
// generateUint32 returns a slice of n pseudo-random uint32 values taken from
// the global math/rand source, each in the range [0, 9999].
func generateUint32(n int) []uint32 {
	values := make([]uint32, n)
	for idx := 0; idx < n; idx++ {
		sample := rand.Uint32()
		values[idx] = sample % 10000
	}
	return values
}
|
||||||
|
|
||||||
|
// generateUint64 returns a slice of n pseudo-random uint64 values taken from
// the global math/rand source, each in the range [0, 9999].
func generateUint64(n int) []uint64 {
	values := make([]uint64, n)
	for idx := 0; idx < n; idx++ {
		sample := rand.Uint64()
		values[idx] = sample % 10000
	}
	return values
}
|
||||||
|
|
||||||
|
// generateFloat32 returns a slice of n pseudo-random float32 values taken from
// the global math/rand source, computed as rand.Float32()*100 - 50 (roughly
// the interval [-50, 50)).
func generateFloat32(n int) []float32 {
	values := make([]float32, n)
	for idx := 0; idx < n; idx++ {
		values[idx] = rand.Float32()*100 - 50
	}
	return values
}
|
||||||
|
|
||||||
|
// generateFloat64 returns a slice of n pseudo-random float64 values taken from
// the global math/rand source, computed as rand.Float64()*100 - 50 (roughly
// the interval [-50, 50)).
func generateFloat64(n int) []float64 {
	values := make([]float64, n)
	for idx := 0; idx < n; idx++ {
		values[idx] = rand.Float64()*100 - 50
	}
	return values
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// SUM BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkSumInt8(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt8(bs.size)
|
||||||
|
b.Run("Fallback-lo", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = lo.Sum(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("SSE-x16", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt8x16(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x32", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt8x32(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x64", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt8x64(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkSumInt16(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt16(bs.size)
|
||||||
|
b.Run("Fallback-lo", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = lo.Sum(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("SSE-x8", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt16x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x16", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt16x16(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x32", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt16x32(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkSumInt32(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt32(bs.size)
|
||||||
|
b.Run("Fallback-lo", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = lo.Sum(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt32x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt32x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt32x16(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkSumInt64(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt64(bs.size)
|
||||||
|
b.Run("Fallback-lo", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = lo.Sum(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt64x2(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt64x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt64x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkSumFloat32(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateFloat32(bs.size)
|
||||||
|
b.Run("Fallback-lo", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = lo.Sum(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumFloat32x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumFloat32x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumFloat32x16(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkSumFloat64(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateFloat64(bs.size)
|
||||||
|
b.Run("Fallback-lo", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = lo.Sum(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumFloat64x2(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumFloat64x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumFloat64x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// MEAN BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkMeanInt32(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt32(bs.size)
|
||||||
|
b.Run("Fallback-lo", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = lo.Mean(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MeanInt32x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MeanInt32x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MeanInt32x16(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkMeanFloat64(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateFloat64(bs.size)
|
||||||
|
b.Run("Fallback-lo", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = lo.Mean(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MeanFloat64x2(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MeanFloat64x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MeanFloat64x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// MIN BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkMinInt32(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt32(bs.size)
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MinInt32x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MinInt32x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MinInt32x16(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkMinFloat64(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateFloat64(bs.size)
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MinFloat64x2(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MinFloat64x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MinFloat64x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// MAX BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
func BenchmarkMaxInt32(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateInt32(bs.size)
|
||||||
|
b.Run("SSE-x4", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MaxInt32x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x8", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MaxInt32x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x16", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MaxInt32x16(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkMaxFloat64(b *testing.B) {
|
||||||
|
for _, bs := range benchmarkSizes {
|
||||||
|
b.Run(bs.name, func(b *testing.B) {
|
||||||
|
data := generateFloat64(bs.size)
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MaxFloat64x2(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MaxFloat64x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = MaxFloat64x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// LANE WIDTH COMPARISON BENCHMARKS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// These benchmarks show how performance scales with SIMD register width
|
||||||
|
func BenchmarkSumInt8ByWidth(b *testing.B) {
|
||||||
|
size := 4096 // Large enough to see differences across implementations
|
||||||
|
data := generateInt8(size)
|
||||||
|
|
||||||
|
benchmarks := []struct {
|
||||||
|
name string
|
||||||
|
fn func() int8
|
||||||
|
}{
|
||||||
|
{"Fallback-lo", func() int8 { return lo.Sum(data) }},
|
||||||
|
{"SSE-x16", func() int8 { return SumInt8x16(data) }},
|
||||||
|
{"AVX2-x32", func() int8 { return SumInt8x32(data) }},
|
||||||
|
{"AVX512-x64", func() int8 { return SumInt8x64(data) }},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, bm := range benchmarks {
|
||||||
|
b.Run(bm.name, func(b *testing.B) {
|
||||||
|
if bm.name == "AVX2-x32" {
|
||||||
|
requireAVX2(b)
|
||||||
|
}
|
||||||
|
if bm.name == "AVX512-x64" {
|
||||||
|
requireAVX512(b)
|
||||||
|
}
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = bm.fn()
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// COMPARATIVE BENCHMARK WITH WARMUP
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// This benchmark demonstrates the steady-state performance after warmup
|
||||||
|
func BenchmarkSumInt64SteadyState(b *testing.B) {
|
||||||
|
size := 8192
|
||||||
|
data := generateInt64(size)
|
||||||
|
|
||||||
|
// Warmup phase to ensure JIT compilation if applicable
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
lo.Sum(data)
|
||||||
|
SumInt64x2(data)
|
||||||
|
if archsimd.X86.AVX2() {
|
||||||
|
SumInt64x4(data)
|
||||||
|
}
|
||||||
|
if archsimd.X86.AVX512() {
|
||||||
|
SumInt64x8(data)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
b.ResetTimer() // Reset timer to exclude warmup
|
||||||
|
|
||||||
|
b.Run("Fallback-lo", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = lo.Sum(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("SSE-x2", func(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt64x2(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX2-x4", func(b *testing.B) {
|
||||||
|
requireAVX2(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt64x4(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
b.Run("AVX512-x8", func(b *testing.B) {
|
||||||
|
requireAVX512(b)
|
||||||
|
b.ReportAllocs()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = SumInt64x8(data)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,12 @@
|
|||||||
|
package simd
|
||||||
|
|
||||||
|
// Lane-count constants kept in a file that, per its original note, exists to
// satisfy the build constraint for non-supported architectures. Only these
// typed uint constants are declared here.
const (
	simdLanes2  uint = 2
	simdLanes4  uint = 4
	simdLanes8  uint = 8
	simdLanes16 uint = 16
	simdLanes32 uint = 32
	simdLanes64 uint = 64
)
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
//go:build go1.26 && goexperiment.simd && amd64
|
||||||
|
|
||||||
|
package simd
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"simd/archsimd"
|
||||||
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
for _, arg := range os.Args {
|
||||||
|
if strings.HasPrefix(arg, "-test.bench=") {
|
||||||
|
bench := strings.TrimPrefix(arg, "-test.bench=")
|
||||||
|
if bench != "" && bench != "none" {
|
||||||
|
fmt.Fprintf(os.Stdout, "archsimd.X86: AVX=%v AVX2=%v AVX512=%v\n",
|
||||||
|
archsimd.X86.AVX(), archsimd.X86.AVX2(), archsimd.X86.AVX512())
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Named numeric types for testing. Each is a distinct defined type (not a
// true alias) whose underlying type is the builtin it is named after.
type (
	myInt8    int8
	myInt16   int16
	myInt32   int32
	myInt64   int64
	myUint8   uint8
	myUint16  uint16
	myUint32  uint32
	myUint64  uint64
	myFloat32 float32
	myFloat64 float64
)
|
||||||
@@ -0,0 +1,86 @@
|
|||||||
|
//go:build go1.26 && goexperiment.simd && amd64
|
||||||
|
|
||||||
|
package simd
|
||||||
|
|
||||||
|
import "unsafe"
|
||||||
|
|
||||||
|
// unsafeSliceInt8 reinterprets a []T (where T ~int8) as a []int8 without
// copying. The result shares collection's backing array and has length and
// capacity equal to length.
// This helper reduces code duplication and the risk of copy-paste errors.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceInt8[T ~int8](collection []T, length uint) []int8 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*int8)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
|
|
||||||
|
// unsafeSliceInt16 reinterprets a []T (where T ~int16) as a []int16 without
// copying. The result shares collection's backing array and has length and
// capacity equal to length.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceInt16[T ~int16](collection []T, length uint) []int16 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*int16)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
|
|
||||||
|
// unsafeSliceInt32 reinterprets a []T (where T ~int32) as a []int32 without
// copying. The result shares collection's backing array and has length and
// capacity equal to length.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceInt32[T ~int32](collection []T, length uint) []int32 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*int32)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
|
|
||||||
|
// unsafeSliceInt64 reinterprets a []T (where T ~int64) as a []int64 without
// copying. The result shares collection's backing array and has length and
// capacity equal to length.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceInt64[T ~int64](collection []T, length uint) []int64 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*int64)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
|
|
||||||
|
// unsafeSliceUint8 reinterprets a []T (where T ~uint8) as a []uint8 without
// copying. The result shares collection's backing array and has length and
// capacity equal to length.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceUint8[T ~uint8](collection []T, length uint) []uint8 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*uint8)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
|
|
||||||
|
// unsafeSliceUint16 reinterprets a []T (where T ~uint16) as a []uint16
// without copying. The result shares collection's backing array and has
// length and capacity equal to length.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceUint16[T ~uint16](collection []T, length uint) []uint16 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*uint16)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
|
|
||||||
|
// unsafeSliceUint32 reinterprets a []T (where T ~uint32) as a []uint32
// without copying. The result shares collection's backing array and has
// length and capacity equal to length.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceUint32[T ~uint32](collection []T, length uint) []uint32 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*uint32)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
|
|
||||||
|
// unsafeSliceUint64 reinterprets a []T (where T ~uint64) as a []uint64
// without copying. The result shares collection's backing array and has
// length and capacity equal to length.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceUint64[T ~uint64](collection []T, length uint) []uint64 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*uint64)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
|
|
||||||
|
// unsafeSliceFloat32 reinterprets a []T (where T ~float32) as a []float32
// without copying. The result shares collection's backing array and has
// length and capacity equal to length.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceFloat32[T ~float32](collection []T, length uint) []float32 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*float32)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
|
|
||||||
|
// unsafeSliceFloat64 reinterprets a []T (where T ~float64) as a []float64
// without copying. The result shares collection's backing array and has
// length and capacity equal to length.
//
// The caller must ensure length does not exceed len(collection); that bound
// is not checked here. An empty or nil collection yields a nil slice instead
// of panicking on the out-of-range &collection[0].
//
//go:nosplit
func unsafeSliceFloat64[T ~float64](collection []T, length uint) []float64 {
	// There is no first element to take the address of.
	if len(collection) == 0 {
		return nil
	}
	// bearer:disable go_gosec_unsafe_unsafe
	return unsafe.Slice((*float64)(unsafe.Pointer(&collection[0])), length)
}
|
||||||
Reference in New Issue
Block a user