mirror of
https://github.com/samber/lo.git
synced 2026-04-22 15:37:14 +08:00
perf: optimize Substring to work directly with strings instead of converting to runes (#822)
* perf: optimize Substring to work directly with strings instead of converting to runes
- Rewrite Substring to iterate over string bytes directly, avoiding full []rune conversion
- Improve performance for long strings by only processing necessary portions
- Add comprehensive test cases for Unicode handling, invalid UTF-8, and edge cases
- Add BenchmarkSubstring to measure performance improvements
- Improve documentation with detailed parameter descriptions
- Handle invalid UTF-8 sequences by converting to []rune when needed
Benchstat:
│ old.txt │ new.txt │
│ sec/op │ sec/op vs base │
Substring/{10_10}-4 558.85n ± 9% 39.75n ± 10% -92.89% (p=0.000 n=8)
Substring/{50_50}-4 783.10n ± 6% 85.15n ± 5% -89.13% (p=0.000 n=8)
Substring/{50_45}-4 773.30n ± 3% 126.5n ± 7% -83.65% (p=0.000 n=8)
Substring/{-50_50}-4 794.00n ± 2% 177.6n ± 7% -77.63% (p=0.000 n=8)
Substring/{-10_10}-4 542.85n ± 20% 41.82n ± 6% -92.30% (p=0.000 n=8)
geomean 680.4n 79.52n -88.31%
│ old.txt │ new.txt │
│ B/op │ B/op vs base │
Substring/{10_10}-4 432.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
Substring/{50_50}-4 480.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
Substring/{50_45}-4 464.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
Substring/{-50_50}-4 480.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
Substring/{-10_10}-4 432.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8)
│ old.txt │ new.txt │
│ allocs/op │ allocs/op vs base │
Substring/{10_10}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
Substring/{50_50}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
Substring/{50_45}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
Substring/{-50_50}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
Substring/{-10_10}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8)
* Enhance substring documentation with Unicode details
Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped. Operates on Unicode runes (characters) and is optimized for zero allocations.
---------
Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
This commit is contained in:
@@ -19,7 +19,7 @@ signatures:
|
||||
- "func Substring[T ~string](str T, offset int, length uint) T"
|
||||
---
|
||||
|
||||
Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped.
|
||||
Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped. Operates on Unicode runes (characters) and is optimized for zero allocations.
|
||||
|
||||
```go
|
||||
// Basic usage
|
||||
@@ -42,9 +42,9 @@ result = lo.Substring("hello", 1, 0)
|
||||
result = lo.Substring("hello", 10, 3)
|
||||
// result: ""
|
||||
|
||||
// With Unicode strings (byte-based)
|
||||
// With Unicode strings (rune-aware)
|
||||
result = lo.Substring("héllo", 1, 3)
|
||||
// result: "él" (note: works with bytes, not runes)
|
||||
// result: "éll"
|
||||
|
||||
// Negative offset with negative values clamped
|
||||
result = lo.Substring("hello", -10, 3)
|
||||
|
||||
@@ -7,10 +7,10 @@ import (
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/samber/lo/internal/xrand"
|
||||
|
||||
"golang.org/x/text/cases"
|
||||
"golang.org/x/text/language"
|
||||
|
||||
"github.com/samber/lo/internal/xrand"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -100,28 +100,94 @@ func nearestPowerOfTwo(capacity int) int {
|
||||
return n + 1
|
||||
}
|
||||
|
||||
// Substring return part of a string.
|
||||
// Substring extracts a substring from a string with Unicode character (rune) awareness.
|
||||
// offset - starting position of the substring (can be positive, negative, or zero)
|
||||
// length - number of characters to extract
|
||||
// With positive offset, counting starts from the beginning of the string
|
||||
// With negative offset, counting starts from the end of the string
|
||||
// Play: https://go.dev/play/p/TQlxQi82Lu1
|
||||
func Substring[T ~string](str T, offset int, length uint) T {
|
||||
rs := []rune(str)
|
||||
size := len(rs)
|
||||
str = substring(str, offset, length)
|
||||
|
||||
if offset < 0 {
|
||||
offset = size + offset
|
||||
if offset < 0 {
|
||||
offset = 0
|
||||
// Validate UTF-8 and fix invalid sequences
|
||||
if !utf8.ValidString(string(str)) {
|
||||
// Convert to []rune to replicate behavior with duplicated �
|
||||
str = T([]rune(str))
|
||||
}
|
||||
|
||||
// Remove null bytes from result
|
||||
return T(strings.ReplaceAll(string(str), "\x00", ""))
|
||||
}
|
||||
|
||||
// substring returns up to length runes of str starting at rune position
// offset, slicing str directly instead of converting it to []rune.
//
// A positive offset counts runes from the beginning of str and a negative
// offset counts runes from the end. An offset at or past the end yields an
// empty string; a negative offset that reaches before the beginning is clamped
// to the beginning, after which length runes are taken from there. The result
// is always a sub-slice of str: bytes that are not valid UTF-8 count as one
// rune each and are returned unmodified.
func substring[T ~string](str T, offset int, length uint) T {
	// head returns the first n runes of s, or all of s when it has fewer.
	head := func(s T, n uint) T {
		for i := range s {
			if n == 0 {
				return s[:i]
			}
			n--
		}
		return s
	}

	// stepBack walks backward from byte index end over at most n runes. It
	// returns the byte index reached and the number of runes still owed when
	// the beginning of str was hit (0 when all n runes were stepped over).
	stepBack := func(end int, n uint) (int, uint) {
		for n > 0 && end > 0 {
			_, width := utf8.DecodeLastRuneInString(string(str[:end]))
			end -= width
			n--
		}
		return end, n
	}

	switch {
	// Nothing requested, or the offset is certainly past the end: a string
	// never holds more runes than bytes.
	case length == 0, offset >= len(str):
		return ""

	// Start at the beginning. A negative offset larger than the byte length is
	// certainly larger than the rune count, so it clamps to the beginning.
	case offset == 0, offset < -len(str):
		return head(str, length)

	// Positive offset - skip offset runes from the start.
	case offset > 0:
		for i := range str {
			if offset == 0 {
				// i is the byte index of the first wanted rune. Using the
				// index reported by range (rather than adding utf8.RuneLen of
				// the previous rune) stays correct for invalid bytes, which
				// decode as U+FFFD but occupy a single byte.
				rest := str[i:]
				// If the remainder has no more bytes than length it cannot
				// have more runes either - return it entirely.
				if uint(len(rest)) <= length {
					return rest
				}
				return head(rest, length)
			}
			offset--
		}
		// The string has offset runes or fewer - nothing is left to return.
		return ""

	// Negative offset - count from the end of the string.
	default: // -len(str) <= offset < 0
		back := uint(-offset)

		// The tail starting back runes from the end holds at most back runes,
		// so when back <= length the whole tail is wanted.
		if back <= length {
			start, _ := stepBack(len(str), back)
			return str[start:]
		}

		// Locate the end of the window first (back-length runes from the end),
		// then its start (length more runes back). This visits only back runes
		// in total.
		end, owed := stepBack(len(str), back-length)
		if owed == 0 {
			start, short := stepBack(end, length)
			if short == 0 {
				return str[start:end]
			}
		}

		// The string has fewer than back runes: the offset clamps to the
		// beginning and length runes are taken from there.
		return head(str, length)
	}
}
|
||||
|
||||
// ChunkString returns a slice of strings split into groups of length size. If the string can't be split evenly,
|
||||
|
||||
+61
-2
@@ -1,8 +1,11 @@
|
||||
package lo
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
@@ -62,6 +65,7 @@ func TestSubstring(t *testing.T) {
|
||||
t.Parallel()
|
||||
is := assert.New(t)
|
||||
|
||||
str0 := Substring("hello", 5, 10)
|
||||
str1 := Substring("hello", 0, 0)
|
||||
str2 := Substring("hello", 10, 2)
|
||||
str3 := Substring("hello", -10, 2)
|
||||
@@ -76,8 +80,25 @@ func TestSubstring(t *testing.T) {
|
||||
str12 := Substring("hello", -4, math.MaxUint)
|
||||
str13 := Substring("🏠🐶🐱", 0, 2)
|
||||
str14 := Substring("你好,世界", 0, 3)
|
||||
str15 := Substring("hello", 5, 1)
|
||||
str15 := Substring("🏠🐶🐱", 1, 2)
|
||||
str16 := Substring("🏠🐶🐱", -2, 2)
|
||||
str17 := Substring("🏠🐶🐱", 3, 3)
|
||||
str18 := Substring("🏠🐶🐱", 4, 3)
|
||||
str19 := Substring("hello", 5, 1)
|
||||
str20 := Substring("hello", -5, 5)
|
||||
str21 := Substring("hello", -5, 4)
|
||||
str22 := Substring("hello", -5, math.MaxUint)
|
||||
str23 := Substring("\x00\x00\x00", 0, math.MaxUint)
|
||||
str24 := Substring(string(utf8.RuneError), 0, math.MaxUint)
|
||||
str25 := Substring("привет"[1:], 0, 6)
|
||||
str26 := Substring("привет"[:2*5+1], 0, 6)
|
||||
str27 := Substring("привет"[:2*5+1], -2, math.MaxUint)
|
||||
str28 := Substring("🏠🐶🐱"[1:], 0, math.MaxUint)
|
||||
str29 := Substring("🏠🐶🐱"[1:], 0, 2)
|
||||
str30 := Substring("привет", 6, math.MaxUint)
|
||||
str31 := Substring("привет", 6+1, math.MaxUint)
|
||||
|
||||
is.Empty(str0)
|
||||
is.Empty(str1)
|
||||
is.Empty(str2)
|
||||
is.Equal("he", str3)
|
||||
@@ -92,7 +113,45 @@ func TestSubstring(t *testing.T) {
|
||||
is.Equal("ello", str12)
|
||||
is.Equal("🏠🐶", str13)
|
||||
is.Equal("你好,", str14)
|
||||
is.Empty(str15)
|
||||
is.Equal("🐶🐱", str15)
|
||||
is.Equal("🐶🐱", str16)
|
||||
is.Empty(str17)
|
||||
is.Empty(str18)
|
||||
is.Empty(str19)
|
||||
is.Equal("hello", str20)
|
||||
is.Equal("hell", str21)
|
||||
is.Equal("hello", str22)
|
||||
is.Empty(str23)
|
||||
is.Equal("�", str24)
|
||||
is.Equal("�ривет", str25)
|
||||
is.Equal("приве�", str26)
|
||||
is.Equal("е�", str27)
|
||||
is.Equal("���🐶🐱", str28)
|
||||
is.Equal("��", str29)
|
||||
is.Empty(str30)
|
||||
is.Empty(str31)
|
||||
}
|
||||
|
||||
func BenchmarkSubstring(b *testing.B) {
|
||||
str := strings.Repeat("1", 100)
|
||||
|
||||
for _, test := range []struct {
|
||||
offset int
|
||||
length uint
|
||||
}{
|
||||
{10, 10},
|
||||
{50, 50},
|
||||
{50, 45},
|
||||
{-50, 50},
|
||||
{-10, 10},
|
||||
} {
|
||||
fmt.Println(test)
|
||||
b.Run(fmt.Sprint(test), func(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = Substring(str, test.offset, test.length)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuneLength(t *testing.T) {
|
||||
|
||||
Reference in New Issue
Block a user