perf: optimize Substring to work directly with strings instead of converting to runes (#822)

* perf: optimize Substring to work directly with strings instead of converting to runes

- Rewrite Substring to iterate over string bytes directly, avoiding full []rune conversion
- Improve performance for long strings by only processing necessary portions
- Add comprehensive test cases for Unicode handling, invalid UTF-8, and edge cases
- Add BenchmarkSubstring to measure performance improvements
- Improve documentation with detailed parameter descriptions
- Handle invalid UTF-8 sequences by converting to []rune when needed

Benchstat:

                   │    old.txt    │               new.txt               │
                   │    sec/op     │    sec/op     vs base               │
Substring/{10_10}-4    558.85n ±  9%   39.75n ± 10%  -92.89% (p=0.000 n=8)
Substring/{50_50}-4    783.10n ±  6%   85.15n ±  5%  -89.13% (p=0.000 n=8)
Substring/{50_45}-4    773.30n ±  3%   126.5n ±  7%  -83.65% (p=0.000 n=8)
Substring/{-50_50}-4   794.00n ±  2%   177.6n ±  7%  -77.63% (p=0.000 n=8)
Substring/{-10_10}-4   542.85n ± 20%   41.82n ±  6%  -92.30% (p=0.000 n=8)
geomean               680.4n         79.52n        -88.31%

                   │  old.txt   │               new.txt                │
                   │    B/op    │   B/op    vs base                    │
Substring/{10_10}-4    432.0 ± 0%   0.0 ± 0%  -100.00% (p=0.000 n=8)
Substring/{50_50}-4    480.0 ± 0%   0.0 ± 0%  -100.00% (p=0.000 n=8)
Substring/{50_45}-4    464.0 ± 0%   0.0 ± 0%  -100.00% (p=0.000 n=8)
Substring/{-50_50}-4   480.0 ± 0%   0.0 ± 0%  -100.00% (p=0.000 n=8)
Substring/{-10_10}-4   432.0 ± 0%   0.0 ± 0%  -100.00% (p=0.000 n=8)

                   │  old.txt   │                new.txt                 │
                   │ allocs/op  │ allocs/op   vs base                    │
Substring/{10_10}-4    2.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=8)
Substring/{50_50}-4    2.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=8)
Substring/{50_45}-4    2.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=8)
Substring/{-50_50}-4   2.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=8)
Substring/{-10_10}-4   2.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=8)

* Enhance substring documentation with Unicode details

Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped. Operates on Unicode runes (characters) and is optimized for zero allocations.

---------

Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
This commit is contained in:
d-enk
2026-02-28 00:19:20 +03:00
committed by GitHub
parent ac8295b68a
commit 68f827d9bf
3 changed files with 147 additions and 22 deletions
+3 -3
View File
@@ -19,7 +19,7 @@ signatures:
- "func Substring[T ~string](str T, offset int, length uint) T"
---
Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped.
Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped. Operates on Unicode runes (characters) and is optimized for zero allocations.
```go
// Basic usage
@@ -42,9 +42,9 @@ result = lo.Substring("hello", 1, 0)
result = lo.Substring("hello", 10, 3)
// result: ""
// With Unicode strings (byte-based)
// With Unicode strings (rune-aware)
result = lo.Substring("héllo", 1, 3)
// result: "él" (note: works with bytes, not runes)
// result: "él"
// Negative offset with negative values clamped
result = lo.Substring("hello", -10, 3)
+80 -14
View File
@@ -7,10 +7,10 @@ import (
"unicode"
"unicode/utf8"
"github.com/samber/lo/internal/xrand"
"golang.org/x/text/cases"
"golang.org/x/text/language"
"github.com/samber/lo/internal/xrand"
)
var (
@@ -100,28 +100,94 @@ func nearestPowerOfTwo(capacity int) int {
return n + 1
}
// Substring extracts a substring from a string with Unicode character (rune) awareness.
//
// offset is the starting position, counted in characters. A positive offset counts
// from the beginning of the string, a negative offset counts from the end. An offset
// that points before the start of the string is clamped to the start; an offset at
// or past the end yields an empty string.
//
// length is the maximum number of characters to extract; it is clamped to what is
// available after offset.
//
// Every byte that is not part of a valid UTF-8 sequence counts as one character and
// is returned as U+FFFD. NUL bytes are removed from the result.
// Play: https://go.dev/play/p/TQlxQi82Lu1
func Substring[T ~string](str T, offset int, length uint) T {
	str = substring(str, offset, length)

	// The []rune round trip turns every invalid byte into its own U+FFFD
	// (strings.ToValidUTF8 would collapse a run of invalid bytes into one).
	// It allocates, so it is only done when the result is not valid UTF-8.
	if !utf8.ValidString(string(str)) {
		str = T([]rune(str))
	}

	// Remove null bytes from result
	return T(strings.ReplaceAll(string(str), "\x00", ""))
}

// substring returns the requested window of str as a slice of the original
// string, without sanitizing invalid UTF-8 or NUL bytes. Offsets and lengths
// are counted in runes; an invalid byte counts as a single rune.
func substring[T ~string](str T, offset int, length uint) T {
	// Nothing requested, or the offset is past the end even when every rune is
	// a single byte (rune count never exceeds byte count).
	if length == 0 || offset >= len(str) {
		return ""
	}

	switch {
	// Positive offset - skip offset runes from the start.
	case offset > 0:
		tail := str
		for ; offset > 0 && len(tail) > 0; offset-- {
			// Use the decoded width rather than utf8.RuneLen: an invalid byte
			// decodes to U+FFFD with width 1, while RuneLen(U+FFFD) is 3.
			_, width := utf8.DecodeRuneInString(string(tail))
			tail = tail[width:]
		}
		// The string holds fewer runes than offset.
		if offset > 0 {
			return ""
		}
		str = tail

	// Negative offset that may still land inside the string - count from the end.
	// Offsets below -len(str) are always clamped and handled after the switch.
	case offset < 0 && offset >= -len(str):
		// stepBack walks backwards from byte index end over at most count runes.
		// It returns the byte index reached and how many runes could not be
		// consumed because the start of the string was hit first.
		stepBack := func(end int, count uint) (int, uint) {
			for count > 0 && end > 0 {
				_, width := utf8.DecodeLastRuneInString(string(str[:end]))
				end -= width
				count--
			}
			return end, count
		}

		back := uint(-offset)

		// The window reaches the end of the string: everything from start on.
		if back <= length {
			start, _ := stepBack(len(str), back)
			return str[start:]
		}

		// The window ends before the end of the string: locate its end first,
		// then its start. Both walks must consume all their runes, otherwise
		// the offset points before the start of the string.
		if end, short := stepBack(len(str), back-length); short == 0 {
			if start, short := stepBack(end, length); short == 0 {
				return str[start:end]
			}
		}
		// Offset clamped to the start: take the leading runes below.
	}

	// Take at most length runes from the start of str.
	if uint(len(str)) <= length {
		// Byte count already fits, so the rune count fits as well.
		return str
	}
	for i := range str {
		if length == 0 {
			return str[:i]
		}
		length--
	}
	return str
}
// ChunkString returns a slice of strings split into groups of length size. If the string can't be split evenly,
+61 -2
View File
@@ -1,8 +1,11 @@
package lo
import (
"fmt"
"math"
"strings"
"testing"
"unicode/utf8"
"github.com/stretchr/testify/assert"
)
@@ -62,6 +65,7 @@ func TestSubstring(t *testing.T) {
t.Parallel()
is := assert.New(t)
str0 := Substring("hello", 5, 10)
str1 := Substring("hello", 0, 0)
str2 := Substring("hello", 10, 2)
str3 := Substring("hello", -10, 2)
@@ -76,8 +80,25 @@ func TestSubstring(t *testing.T) {
str12 := Substring("hello", -4, math.MaxUint)
str13 := Substring("🏠🐶🐱", 0, 2)
str14 := Substring("你好,世界", 0, 3)
str15 := Substring("hello", 5, 1)
str15 := Substring("🏠🐶🐱", 1, 2)
str16 := Substring("🏠🐶🐱", -2, 2)
str17 := Substring("🏠🐶🐱", 3, 3)
str18 := Substring("🏠🐶🐱", 4, 3)
str19 := Substring("hello", 5, 1)
str20 := Substring("hello", -5, 5)
str21 := Substring("hello", -5, 4)
str22 := Substring("hello", -5, math.MaxUint)
str23 := Substring("\x00\x00\x00", 0, math.MaxUint)
str24 := Substring(string(utf8.RuneError), 0, math.MaxUint)
str25 := Substring("привет"[1:], 0, 6)
str26 := Substring("привет"[:2*5+1], 0, 6)
str27 := Substring("привет"[:2*5+1], -2, math.MaxUint)
str28 := Substring("🏠🐶🐱"[1:], 0, math.MaxUint)
str29 := Substring("🏠🐶🐱"[1:], 0, 2)
str30 := Substring("привет", 6, math.MaxUint)
str31 := Substring("привет", 6+1, math.MaxUint)
is.Empty(str0)
is.Empty(str1)
is.Empty(str2)
is.Equal("he", str3)
@@ -92,7 +113,45 @@ func TestSubstring(t *testing.T) {
is.Equal("ello", str12)
is.Equal("🏠🐶", str13)
is.Equal("你好,", str14)
is.Empty(str15)
is.Equal("🐶🐱", str15)
is.Equal("🐶🐱", str16)
is.Empty(str17)
is.Empty(str18)
is.Empty(str19)
is.Equal("hello", str20)
is.Equal("hell", str21)
is.Equal("hello", str22)
is.Empty(str23)
is.Equal("", str24)
is.Equal("ривет", str25)
is.Equal("приве", str26)
is.Equal("е", str27)
is.Equal("🐶🐱", str28)
is.Equal("", str29)
is.Empty(str30)
is.Empty(str31)
}
// BenchmarkSubstring measures Substring on a 100-byte ASCII string for a set of
// positive and negative offsets. Each {offset length} pair runs as its own
// sub-benchmark, named after the pair.
func BenchmarkSubstring(b *testing.B) {
	str := strings.Repeat("1", 100)

	for _, test := range []struct {
		offset int
		length uint
	}{
		{10, 10},
		{50, 50},
		{50, 45},
		{-50, 50},
		{-10, 10},
	} {
		// No stray printing here: the sub-benchmark name already identifies the case.
		b.Run(fmt.Sprint(test), func(b *testing.B) {
			for i := 0; i < b.N; i++ {
				_ = Substring(str, test.offset, test.length)
			}
		})
	}
}
func TestRuneLength(t *testing.T) {