mirror of
https://github.com/samber/lo.git
synced 2026-04-22 15:37:14 +08:00
perf: optimize Substring to work directly with strings instead of converting to runes (#822)
* perf: optimize Substring to work directly with strings instead of converting to runes
- Rewrite Substring to iterate over string bytes directly, avoiding full []rune conversion
- Improve performance for long strings by only processing necessary portions
- Add comprehensive test cases for Unicode handling, invalid UTF-8, and edge cases
- Add BenchmarkSubstring to measure performance improvements
- Improve documentation with detailed parameter descriptions
- Handle invalid UTF-8 sequences by converting to []rune when needed
Benchstat:
│ old.txt │ new.txt │
│ sec/op │ sec/op vs base │
Substring/{10_10}-4 558.85n ± 9% 39.75n ± 10% -92.89% (p=0.000 n=8)
Substring/{50_50}-4 783.10n ± 6% 85.15n ± 5% -89.13% (p=0.000 n=8)
Substring/{50_45}-4 773.30n ± 3% 126.5n ± 7% -83.65% (p=0.000 n=8)
Substring/{-50_50}-4 794.00n ± 2% 177.6n ± 7% -77.63% (p=0.000 n=8)
Substring/{-10_10}-4 542.85n ± 20% 41.82n ± 6% -92.30% (p=0.000 n=8)
geomean 680.4n 79.52n -88.31%
│ old.txt │ new.txt │
│ B/op │ B/op vs base │
Substring/{10_10}-4 432.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
Substring/{50_50}-4 480.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
Substring/{50_45}-4 464.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
Substring/{-50_50}-4 480.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
Substring/{-10_10}-4 432.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
│ old.txt │ new.txt │
│ allocs/op │ allocs/op vs base │
Substring/{10_10}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
Substring/{50_50}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
Substring/{50_45}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
Substring/{-50_50}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
Substring/{-10_10}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
* Enhance substring documentation with Unicode details
Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped. Operates on Unicode runes (characters) and is optimized for zero allocations.
---------
Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
This commit is contained in:
@@ -19,7 +19,7 @@ signatures:
|
|||||||
- "func Substring[T ~string](str T, offset int, length uint) T"
|
- "func Substring[T ~string](str T, offset int, length uint) T"
|
||||||
---
|
---
|
||||||
|
|
||||||
Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped.
|
Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped. Operates on Unicode runes (characters) and is optimized for zero allocations.
|
||||||
|
|
||||||
```go
|
```go
|
||||||
// Basic usage
|
// Basic usage
|
||||||
@@ -42,9 +42,9 @@ result = lo.Substring("hello", 1, 0)
|
|||||||
result = lo.Substring("hello", 10, 3)
|
result = lo.Substring("hello", 10, 3)
|
||||||
// result: ""
|
// result: ""
|
||||||
|
|
||||||
// With Unicode strings (byte-based)
|
// With Unicode strings (rune-aware)
|
||||||
result = lo.Substring("héllo", 1, 3)
|
result = lo.Substring("héllo", 1, 3)
|
||||||
// result: "él" (note: works with bytes, not runes)
|
// result: "él"
|
||||||
|
|
||||||
// Negative offset beyond the start of the string is clamped to the beginning
|
// Negative offset beyond the start of the string is clamped to the beginning
|
||||||
result = lo.Substring("hello", -10, 3)
|
result = lo.Substring("hello", -10, 3)
|
||||||
|
|||||||
@@ -7,10 +7,10 @@ import (
|
|||||||
"unicode"
|
"unicode"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/samber/lo/internal/xrand"
|
|
||||||
|
|
||||||
"golang.org/x/text/cases"
|
"golang.org/x/text/cases"
|
||||||
"golang.org/x/text/language"
|
"golang.org/x/text/language"
|
||||||
|
|
||||||
|
"github.com/samber/lo/internal/xrand"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
@@ -100,28 +100,94 @@ func nearestPowerOfTwo(capacity int) int {
|
|||||||
return n + 1
|
return n + 1
|
||||||
}
|
}
|
||||||
|
|
||||||
// Substring return part of a string.
|
// Substring extracts a substring from a string with Unicode character (rune) awareness.
|
||||||
|
// offset - starting position of the substring (can be positive, negative, or zero)
|
||||||
|
// length - number of characters to extract
|
||||||
|
// With positive offset, counting starts from the beginning of the string
|
||||||
|
// With negative offset, counting starts from the end of the string
|
||||||
// Play: https://go.dev/play/p/TQlxQi82Lu1
|
// Play: https://go.dev/play/p/TQlxQi82Lu1
|
||||||
func Substring[T ~string](str T, offset int, length uint) T {
|
func Substring[T ~string](str T, offset int, length uint) T {
|
||||||
rs := []rune(str)
|
str = substring(str, offset, length)
|
||||||
size := len(rs)
|
|
||||||
|
|
||||||
if offset < 0 {
|
// Validate UTF-8 and fix invalid sequences
|
||||||
offset = size + offset
|
if !utf8.ValidString(string(str)) {
|
||||||
if offset < 0 {
|
// Convert to []rune to replicate behavior with duplicated �
|
||||||
offset = 0
|
str = T([]rune(str))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove null bytes from result
|
||||||
|
return T(strings.ReplaceAll(string(str), "\x00", ""))
|
||||||
|
}
|
||||||
|
|
||||||
|
func substring[T ~string](str T, offset int, length uint) T {
|
||||||
|
switch {
|
||||||
|
// Empty length or offset beyond string bounds - return empty string
|
||||||
|
case length == 0, offset >= len(str):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
// Positive offset - count from the beginning
|
||||||
|
case offset > 0:
|
||||||
|
// Skip offset runes from the start
|
||||||
|
for i, r := range str {
|
||||||
|
if offset--; offset == 0 {
|
||||||
|
str = str[i+utf8.RuneLen(r):]
|
||||||
|
break
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if offset >= size {
|
// If couldn't skip enough runes - string is shorter than offset
|
||||||
return Empty[T]()
|
if offset != 0 {
|
||||||
}
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
if length > uint(size)-uint(offset) {
|
// If remaining string is shorter than or equal to length - return it entirely
|
||||||
length = uint(size - offset)
|
if uint(len(str)) <= length {
|
||||||
}
|
return str
|
||||||
|
}
|
||||||
|
|
||||||
return T(strings.ReplaceAll(string(rs[offset:offset+int(length)]), "\x00", ""))
|
// Otherwise proceed to trimming by length
|
||||||
|
fallthrough
|
||||||
|
|
||||||
|
// Zero offset or offset less than minus string length - start from beginning
|
||||||
|
case offset < -len(str), offset == 0:
|
||||||
|
// Count length runes from the start
|
||||||
|
for i := range str {
|
||||||
|
if length == 0 {
|
||||||
|
return str[:i]
|
||||||
|
}
|
||||||
|
length--
|
||||||
|
}
|
||||||
|
|
||||||
|
return str
|
||||||
|
|
||||||
|
// Negative offset - count from the end of string
|
||||||
|
default: // -len(str) < offset < 0
|
||||||
|
// Helper function to move backward through runes
|
||||||
|
backwardPos := func(end int, count uint) (start int) {
|
||||||
|
for {
|
||||||
|
_, i := utf8.DecodeLastRuneInString(string(str[:end]))
|
||||||
|
end -= i
|
||||||
|
|
||||||
|
if count--; count == 0 || end == 0 {
|
||||||
|
return end
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
offset := uint(-offset)
|
||||||
|
|
||||||
|
// If offset is less than or equal to length - take from position to end
|
||||||
|
if offset <= length {
|
||||||
|
start := backwardPos(len(str), offset)
|
||||||
|
return str[start:]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise calculate start and end positions
|
||||||
|
end := backwardPos(len(str), offset-length)
|
||||||
|
start := backwardPos(end, length)
|
||||||
|
|
||||||
|
return str[start:end]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ChunkString returns a slice of strings split into groups of length size. If the string can't be split evenly,
|
// ChunkString returns a slice of strings split into groups of length size. If the string can't be split evenly,
|
||||||
|
|||||||
+61
-2
@@ -1,8 +1,11 @@
|
|||||||
package lo
|
package lo
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
)
|
)
|
||||||
@@ -62,6 +65,7 @@ func TestSubstring(t *testing.T) {
|
|||||||
t.Parallel()
|
t.Parallel()
|
||||||
is := assert.New(t)
|
is := assert.New(t)
|
||||||
|
|
||||||
|
str0 := Substring("hello", 5, 10)
|
||||||
str1 := Substring("hello", 0, 0)
|
str1 := Substring("hello", 0, 0)
|
||||||
str2 := Substring("hello", 10, 2)
|
str2 := Substring("hello", 10, 2)
|
||||||
str3 := Substring("hello", -10, 2)
|
str3 := Substring("hello", -10, 2)
|
||||||
@@ -76,8 +80,25 @@ func TestSubstring(t *testing.T) {
|
|||||||
str12 := Substring("hello", -4, math.MaxUint)
|
str12 := Substring("hello", -4, math.MaxUint)
|
||||||
str13 := Substring("🏠🐶🐱", 0, 2)
|
str13 := Substring("🏠🐶🐱", 0, 2)
|
||||||
str14 := Substring("你好,世界", 0, 3)
|
str14 := Substring("你好,世界", 0, 3)
|
||||||
str15 := Substring("hello", 5, 1)
|
str15 := Substring("🏠🐶🐱", 1, 2)
|
||||||
|
str16 := Substring("🏠🐶🐱", -2, 2)
|
||||||
|
str17 := Substring("🏠🐶🐱", 3, 3)
|
||||||
|
str18 := Substring("🏠🐶🐱", 4, 3)
|
||||||
|
str19 := Substring("hello", 5, 1)
|
||||||
|
str20 := Substring("hello", -5, 5)
|
||||||
|
str21 := Substring("hello", -5, 4)
|
||||||
|
str22 := Substring("hello", -5, math.MaxUint)
|
||||||
|
str23 := Substring("\x00\x00\x00", 0, math.MaxUint)
|
||||||
|
str24 := Substring(string(utf8.RuneError), 0, math.MaxUint)
|
||||||
|
str25 := Substring("привет"[1:], 0, 6)
|
||||||
|
str26 := Substring("привет"[:2*5+1], 0, 6)
|
||||||
|
str27 := Substring("привет"[:2*5+1], -2, math.MaxUint)
|
||||||
|
str28 := Substring("🏠🐶🐱"[1:], 0, math.MaxUint)
|
||||||
|
str29 := Substring("🏠🐶🐱"[1:], 0, 2)
|
||||||
|
str30 := Substring("привет", 6, math.MaxUint)
|
||||||
|
str31 := Substring("привет", 6+1, math.MaxUint)
|
||||||
|
|
||||||
|
is.Empty(str0)
|
||||||
is.Empty(str1)
|
is.Empty(str1)
|
||||||
is.Empty(str2)
|
is.Empty(str2)
|
||||||
is.Equal("he", str3)
|
is.Equal("he", str3)
|
||||||
@@ -92,7 +113,45 @@ func TestSubstring(t *testing.T) {
|
|||||||
is.Equal("ello", str12)
|
is.Equal("ello", str12)
|
||||||
is.Equal("🏠🐶", str13)
|
is.Equal("🏠🐶", str13)
|
||||||
is.Equal("你好,", str14)
|
is.Equal("你好,", str14)
|
||||||
is.Empty(str15)
|
is.Equal("🐶🐱", str15)
|
||||||
|
is.Equal("🐶🐱", str16)
|
||||||
|
is.Empty(str17)
|
||||||
|
is.Empty(str18)
|
||||||
|
is.Empty(str19)
|
||||||
|
is.Equal("hello", str20)
|
||||||
|
is.Equal("hell", str21)
|
||||||
|
is.Equal("hello", str22)
|
||||||
|
is.Empty(str23)
|
||||||
|
is.Equal("�", str24)
|
||||||
|
is.Equal("�ривет", str25)
|
||||||
|
is.Equal("приве�", str26)
|
||||||
|
is.Equal("е�", str27)
|
||||||
|
is.Equal("���🐶🐱", str28)
|
||||||
|
is.Equal("��", str29)
|
||||||
|
is.Empty(str30)
|
||||||
|
is.Empty(str31)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkSubstring(b *testing.B) {
|
||||||
|
str := strings.Repeat("1", 100)
|
||||||
|
|
||||||
|
for _, test := range []struct {
|
||||||
|
offset int
|
||||||
|
length uint
|
||||||
|
}{
|
||||||
|
{10, 10},
|
||||||
|
{50, 50},
|
||||||
|
{50, 45},
|
||||||
|
{-50, 50},
|
||||||
|
{-10, 10},
|
||||||
|
} {
|
||||||
|
fmt.Println(test)
|
||||||
|
b.Run(fmt.Sprint(test), func(b *testing.B) {
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = Substring(str, test.offset, test.length)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRuneLength(t *testing.T) {
|
func TestRuneLength(t *testing.T) {
|
||||||
|
|||||||
Reference in New Issue
Block a user