Files
lo/exp/simd/intersect_avx512.go
T
Samuel Berthe 035f1b358a Experiments: adding SIMD helpers (#801)
* feat(exp,simd): adding SumAxB helpers

* feat(exp,simd): adding MeanAxB and ClampAxB helpers

* feat(exp,simd): adding MinAxB and MaxAxB helpers

* refactor(exp,simd): group perf helper category + architecture

* feat(exp,simd): adding ContainsAxB helpers

* perf(exp,simd): cast to unsafe slice once

* feat(exp,simd): call the right SIMD helper based on local architecture

* chore: internal dependency linking

* Update exp/simd/math.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* style: fix linter

* style: fix linter

* chore: enable simd in makefile

* chore(ci): add simd package to test runs

* chore(ci): add simd package to test runs only for go 1.26

* fix(simd): fix overflow

* fix(simd): fix overflow and apply the same behavior than lo.Mean

* doc(exp,simd): adding initial doc

* refactor(simd): move intersect_avx2 and intersect_sse code into intersect_avx512

* fix(simd): call SSE fallback instead of lo.Sum for default helpers

* feat(simd): cache simd features on package init to avoid repeated checks

* perf(exp,simd): precompute length + improve code quality

* perf(exp,simd): faster iteration for min/max value

* test(exp,simd): adding benchmarks

* test(exp,simd): adding benchmarks results

* test(exp,simd): adding benchmarks results

* doc(exp,simd): adding warning for overflows in SIMD operations

* feat(exp,simd): adding more dispatch helpers

* feat(exp,simd): adding SumBy variants

* feat(exp,simd): adding MeanBy variants

* fix(exp,simd): faster clamp

* 💄

* doc(exp,simd): adding SumBy + MeanBy

* fix(exp,simd): faster SIMD operations

* chore(ci): enable the benchmarks temporary

* chore(ci): display cpu architecture before running tests

* chore(ci): github actions are hidding some useful stuffs

* chore(ci): no SIMD VM available at Github during the weekend ???

* test(exp,simd): larger epsilon

* oops

* perf(exp,simd): faster iterations

* doc(exp,simd): report last version of benchmarks

* 💄

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-02-21 19:19:36 +01:00

970 lines
19 KiB
Go

//go:build go1.26 && goexperiment.simd && amd64
package simd
import (
"simd/archsimd"
)
// ContainsInt8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsInt8x16[T ~int8](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes16
targetVec := archsimd.BroadcastInt8x16(int8(target))
base := unsafeSliceInt8(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt8x16Slice(s)
// Compare for equality; Equal returns a mask, ToBits() its bitmask.
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
// Handle remaining elements
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsInt16x8[T ~int16](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes8
targetVec := archsimd.BroadcastInt16x8(int16(target))
base := unsafeSliceInt16(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt16x8Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsInt32x4[T ~int32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes4
targetVec := archsimd.BroadcastInt32x4(int32(target))
base := unsafeSliceInt32(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt32x4Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsInt64x2[T ~int64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes2
targetVec := archsimd.BroadcastInt64x2(int64(target))
base := unsafeSliceInt64(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt64x2Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint8x16 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsUint8x16[T ~uint8](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes16
targetVec := archsimd.BroadcastUint8x16(uint8(target))
base := unsafeSliceUint8(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint8x16Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint16x8 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsUint16x8[T ~uint16](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes8
targetVec := archsimd.BroadcastUint16x8(uint16(target))
base := unsafeSliceUint16(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint16x8Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsUint32x4[T ~uint32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes4
targetVec := archsimd.BroadcastUint32x4(uint32(target))
base := unsafeSliceUint32(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint32x4Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsUint64x2[T ~uint64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes2
targetVec := archsimd.BroadcastUint64x2(uint64(target))
base := unsafeSliceUint64(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint64x2Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsFloat32x4 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsFloat32x4[T ~float32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes4
targetVec := archsimd.BroadcastFloat32x4(float32(target))
base := unsafeSliceFloat32(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadFloat32x4Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsFloat64x2 checks if collection contains target using SSE SIMD and AVX512 SIMD
func ContainsFloat64x2[T ~float64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes2
targetVec := archsimd.BroadcastFloat64x2(float64(target))
base := unsafeSliceFloat64(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadFloat64x2Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt8x32 checks if collection contains target using AVX2 SIMD
func ContainsInt8x32[T ~int8](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes32
targetVec := archsimd.BroadcastInt8x32(int8(target))
base := unsafeSliceInt8(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt8x32Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt16x16 checks if collection contains target using AVX2 SIMD
func ContainsInt16x16[T ~int16](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes16
targetVec := archsimd.BroadcastInt16x16(int16(target))
base := unsafeSliceInt16(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt16x16Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt32x8 checks if collection contains target using AVX2 SIMD
func ContainsInt32x8[T ~int32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes8
targetVec := archsimd.BroadcastInt32x8(int32(target))
base := unsafeSliceInt32(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt32x8Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt64x4 checks if collection contains target using AVX2 SIMD
func ContainsInt64x4[T ~int64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes4
targetVec := archsimd.BroadcastInt64x4(int64(target))
base := unsafeSliceInt64(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt64x4Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint8x32 checks if collection contains target using AVX2 SIMD
func ContainsUint8x32[T ~uint8](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes32
targetVec := archsimd.BroadcastUint8x32(uint8(target))
base := unsafeSliceUint8(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint8x32Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint16x16 checks if collection contains target using AVX2 SIMD
func ContainsUint16x16[T ~uint16](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes16
targetVec := archsimd.BroadcastUint16x16(uint16(target))
base := unsafeSliceUint16(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint16x16Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint32x8 checks if collection contains target using AVX2 SIMD
func ContainsUint32x8[T ~uint32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes8
targetVec := archsimd.BroadcastUint32x8(uint32(target))
base := unsafeSliceUint32(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint32x8Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint64x4 checks if collection contains target using AVX2 SIMD
func ContainsUint64x4[T ~uint64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes4
targetVec := archsimd.BroadcastUint64x4(uint64(target))
base := unsafeSliceUint64(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint64x4Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsFloat32x8 checks if collection contains target using AVX2 SIMD
func ContainsFloat32x8[T ~float32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes8
targetVec := archsimd.BroadcastFloat32x8(float32(target))
base := unsafeSliceFloat32(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadFloat32x8Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsFloat64x4 checks if collection contains target using AVX2 SIMD
func ContainsFloat64x4[T ~float64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes4
targetVec := archsimd.BroadcastFloat64x4(float64(target))
base := unsafeSliceFloat64(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadFloat64x4Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt8x64 checks if collection contains target using AVX-512 SIMD
func ContainsInt8x64[T ~int8](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes64
targetVec := archsimd.BroadcastInt8x64(int8(target))
base := unsafeSliceInt8(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt8x64Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt16x32 checks if collection contains target using AVX-512 SIMD
func ContainsInt16x32[T ~int16](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes32
targetVec := archsimd.BroadcastInt16x32(int16(target))
base := unsafeSliceInt16(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt16x32Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt32x16 checks if collection contains target using AVX-512 SIMD
func ContainsInt32x16[T ~int32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes16
targetVec := archsimd.BroadcastInt32x16(int32(target))
base := unsafeSliceInt32(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt32x16Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsInt64x8 checks if collection contains target using AVX-512 SIMD
func ContainsInt64x8[T ~int64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes8
targetVec := archsimd.BroadcastInt64x8(int64(target))
base := unsafeSliceInt64(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadInt64x8Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint8x64 checks if collection contains target using AVX-512 SIMD
func ContainsUint8x64[T ~uint8](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes64
targetVec := archsimd.BroadcastUint8x64(uint8(target))
base := unsafeSliceUint8(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint8x64Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint16x32 checks if collection contains target using AVX-512 SIMD
func ContainsUint16x32[T ~uint16](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes32
targetVec := archsimd.BroadcastUint16x32(uint16(target))
base := unsafeSliceUint16(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint16x32Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint32x16 checks if collection contains target using AVX-512 SIMD
func ContainsUint32x16[T ~uint32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes16
targetVec := archsimd.BroadcastUint32x16(uint32(target))
base := unsafeSliceUint32(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint32x16Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsUint64x8 checks if collection contains target using AVX-512 SIMD
func ContainsUint64x8[T ~uint64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes8
targetVec := archsimd.BroadcastUint64x8(uint64(target))
base := unsafeSliceUint64(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadUint64x8Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsFloat32x16 checks if collection contains target using AVX-512 SIMD
func ContainsFloat32x16[T ~float32](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes16
targetVec := archsimd.BroadcastFloat32x16(float32(target))
base := unsafeSliceFloat32(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadFloat32x16Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}
// ContainsFloat64x8 checks if collection contains target using AVX-512 SIMD
func ContainsFloat64x8[T ~float64](collection []T, target T) bool {
length := uint(len(collection))
if length == 0 {
return false
}
const lanes = simdLanes8
targetVec := archsimd.BroadcastFloat64x8(float64(target))
base := unsafeSliceFloat64(collection, length)
i := uint(0)
for ; i+lanes <= length; i += lanes {
s := base[i : i+lanes]
v := archsimd.LoadFloat64x8Slice(s)
cmp := v.Equal(targetVec)
if cmp.ToBits() != 0 {
return true
}
}
for ; i < length; i++ {
if collection[i] == target {
return true
}
}
return false
}