perf: optimize Substring to work directly with strings instead of converting to runes (#822)

* perf: optimize Substring to work directly with strings instead of converting to runes - Rewrite Substring to iterate over string bytes directly, avoiding full []rune conversion - Improve performance for long strings by only processing necessary portions - Add comprehensive test cases for Unicode handling, invalid UTF-8, and edge cases - Add BenchmarkSubstring to measure performance improvements - Improve documentation with detailed parameter descriptions - Handle invalid UTF-8 sequences by converting to []rune when needed Benchstat: │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ Substring/{10_10}-4 558.85n ± 9% 39.75n ± 10% -92.89% (p=0.000 n=8) Substring/{50_50}-4 783.10n ± 6% 85.15n ± 5% -89.13% (p=0.000 n=8) Substring/{50_45}-4 773.30n ± 3% 126.5n ± 7% -83.65% (p=0.000 n=8) Substring/{-50_50}-4 794.00n ± 2% 177.6n ± 7% -77.63% (p=0.000 n=8) Substring/{-10_10}-4 542.85n ± 20% 41.82n ± 6% -92.30% (p=0.000 n=8) geomean 680.4n 79.52n -88.31% │ old.txt │ new.txt │ │ B/op │ B/op vs base │ Substring/{10_10}-4 432.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8) Substring/{50_50}-4 480.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8) Substring/{50_45}-4 464.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8) Substring/{-50_50}-4 480.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8) Substring/{-10_10}-4 432.0 ± 0% 0.0 ± 0% -100.00% (p=0.000 n=8) │ old.txt │ new.txt │ │ allocs/op │ allocs/op vs base │ Substring/{10_10}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8) Substring/{50_50}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8) Substring/{50_45}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8) Substring/{-50_50}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8) Substring/{-10_10}-4 2.000 ± 0% 0.000 ± 0% -100.00% (p=0.000 n=8) * Enhance substring documentation with Unicode details Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped. Operates on Unicode runes (characters) and is optimized for zero allocations. 
--------- Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
2026-04-22 15:37:14 +08:00 · 2026-02-28 00:19:20 +03:00
parent ac8295b68a
commit 68f827d9bf
3 changed files with 147 additions and 22 deletions
@@ -19,7 +19,7 @@ signatures:
  - "func Substring[T ~string](str T, offset int, length uint) T"
 ---

-Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped.
+Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped. Operates on Unicode runes (characters) and is optimized for zero allocations.

 ```go
 // Basic usage
@@ -42,9 +42,9 @@ result = lo.Substring("hello", 1, 0)
 result = lo.Substring("hello", 10, 3)
 // result: ""

-// With Unicode strings (byte-based)
+// With Unicode strings (rune-aware)
 result = lo.Substring("héllo", 1, 3)
-// result: "él" (note: works with bytes, not runes)
+// result: "éll"

 // Negative offset reaching before the start of the string is clamped to the beginning
 result = lo.Substring("hello", -10, 3)
@@ -7,10 +7,10 @@ import (
 	"unicode"
 	"unicode/utf8"

-	"github.com/samber/lo/internal/xrand"
-
 	"golang.org/x/text/cases"
 	"golang.org/x/text/language"
+
+	"github.com/samber/lo/internal/xrand"
 )

 var (
@@ -100,28 +100,94 @@ func nearestPowerOfTwo(capacity int) int {
 	return n + 1
 }

// Substring returns the part of str that starts at rune position offset and
// spans at most length runes. It is rune-aware: positions and lengths count
// Unicode code points, not bytes.
//
//   - offset >= 0 counts from the beginning of the string; an offset at or past
//     the end yields an empty string.
//   - offset < 0 counts from the end of the string; an offset reaching before
//     the beginning is clamped to the beginning.
//   - length is clamped to the number of runes available after offset.
//
// Each byte of an invalid UTF-8 sequence counts as one rune and is emitted as
// U+FFFD in the result. NUL bytes ("\x00") are removed from the result.
// Play: https://go.dev/play/p/TQlxQi82Lu1
func Substring[T ~string](str T, offset int, length uint) T {
	s := string(substring(str, offset, length))

	// Only invalid input pays for the []rune round trip, which turns every
	// invalid byte into its own U+FFFD (strings.ToValidUTF8 would collapse runs).
	if !utf8.ValidString(s) {
		s = string([]rune(s))
	}

	// strings.ReplaceAll returns its input unchanged when there is nothing to replace.
	return T(strings.ReplaceAll(s, "\x00", ""))
}

// substring slices str by rune position without converting it to []rune.
// The result shares the bytes of str and may still contain invalid UTF-8 or
// NUL bytes; Substring post-processes those.
func substring[T ~string](str T, offset int, length uint) T {
	s := string(str)

	// A rune occupies at least one byte, so an offset at or past the byte
	// length is necessarily past the last rune.
	if length == 0 || offset >= len(s) {
		return ""
	}

	switch {
	case offset < 0:
		// Negating the minimum int wraps to itself, and its uint conversion is
		// still the correct magnitude, so no special case is needed.
		back := uint(-offset)

		// Walk backward rune by rune; hitting the start of the string first
		// clamps the offset to the beginning.
		start := len(s)
		for n := back; n > 0 && start > 0; n-- {
			_, width := utf8.DecodeLastRuneInString(s[:start])
			start -= width
		}
		s = s[start:]

		// At most `back` runes remain, so no trimming is needed.
		if back <= length {
			return T(s)
		}

	case offset > 0:
		s = s[byteIndexOfRune(s, uint(offset)):]
	}

	return T(s[:byteIndexOfRune(s, length)])
}

// byteIndexOfRune returns the byte index at which the n-th rune (zero-based)
// of s begins, or len(s) when s holds n or fewer runes. Positions come from
// the range loop itself, so invalid bytes advance by their real width of one.
func byteIndexOfRune(s string, n uint) int {
	// A rune occupies at least one byte, so s cannot hold more than len(s) runes.
	if n >= uint(len(s)) {
		return len(s)
	}

	for i := range s {
		if n == 0 {
			return i
		}
		n--
	}

	return len(s)
}

 // ChunkString returns a slice of strings split into groups of length size. If the string can't be split evenly,
@@ -1,8 +1,11 @@
 package lo

 import (
+	"fmt"
 	"math"
+	"strings"
 	"testing"
+	"unicode/utf8"

 	"github.com/stretchr/testify/assert"
 )
@@ -62,6 +65,7 @@ func TestSubstring(t *testing.T) {
 	t.Parallel()
 	is := assert.New(t)

+	str0 := Substring("hello", 5, 10)
 	str1 := Substring("hello", 0, 0)
 	str2 := Substring("hello", 10, 2)
 	str3 := Substring("hello", -10, 2)
@@ -76,8 +80,25 @@ func TestSubstring(t *testing.T) {
 	str12 := Substring("hello", -4, math.MaxUint)
 	str13 := Substring("🏠🐶🐱", 0, 2)
 	str14 := Substring("你好，世界", 0, 3)
-	str15 := Substring("hello", 5, 1)
+	str15 := Substring("🏠🐶🐱", 1, 2)
+	str16 := Substring("🏠🐶🐱", -2, 2)
+	str17 := Substring("🏠🐶🐱", 3, 3)
+	str18 := Substring("🏠🐶🐱", 4, 3)
+	str19 := Substring("hello", 5, 1)
+	str20 := Substring("hello", -5, 5)
+	str21 := Substring("hello", -5, 4)
+	str22 := Substring("hello", -5, math.MaxUint)
+	str23 := Substring("\x00\x00\x00", 0, math.MaxUint)
+	str24 := Substring(string(utf8.RuneError), 0, math.MaxUint)
+	str25 := Substring("привет"[1:], 0, 6)
+	str26 := Substring("привет"[:2*5+1], 0, 6)
+	str27 := Substring("привет"[:2*5+1], -2, math.MaxUint)
+	str28 := Substring("🏠🐶🐱"[1:], 0, math.MaxUint)
+	str29 := Substring("🏠🐶🐱"[1:], 0, 2)
+	str30 := Substring("привет", 6, math.MaxUint)
+	str31 := Substring("привет", 6+1, math.MaxUint)

+	is.Empty(str0)
 	is.Empty(str1)
 	is.Empty(str2)
 	is.Equal("he", str3)
@@ -92,7 +113,45 @@ func TestSubstring(t *testing.T) {
 	is.Equal("ello", str12)
 	is.Equal("🏠🐶", str13)
 	is.Equal("你好，", str14)
-	is.Empty(str15)
+	is.Equal("🐶🐱", str15)
+	is.Equal("🐶🐱", str16)
+	is.Empty(str17)
+	is.Empty(str18)
+	is.Empty(str19)
+	is.Equal("hello", str20)
+	is.Equal("hell", str21)
+	is.Equal("hello", str22)
+	is.Empty(str23)
+	is.Equal("�", str24)
+	is.Equal("�ривет", str25)
+	is.Equal("приве�", str26)
+	is.Equal("е�", str27)
+	is.Equal("���🐶🐱", str28)
+	is.Equal("��", str29)
+	is.Empty(str30)
+	is.Empty(str31)
+}
+
+func BenchmarkSubstring(b *testing.B) {
+	str := strings.Repeat("1", 100)
+
+	for _, test := range []struct {
+		offset int
+		length uint
+	}{
+		{10, 10},
+		{50, 50},
+		{50, 45},
+		{-50, 50},
+		{-10, 10},
+	} {
+		fmt.Println(test)
+		b.Run(fmt.Sprint(test), func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				_ = Substring(str, test.offset, test.length)
+			}
+		})
+	}
 }

 func TestRuneLength(t *testing.T) {