diff --git a/docs/data/core-substring.md b/docs/data/core-substring.md
index 6231ae1..511e8c5 100644
--- a/docs/data/core-substring.md
+++ b/docs/data/core-substring.md
@@ -19,7 +19,7 @@ signatures:
   - "func Substring[T ~string](str T, offset int, length uint) T"
 ---
 
-Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped.
+Returns a substring starting at the given offset with the specified length. Supports negative offsets; out-of-bounds are clamped. Operates on Unicode runes (characters) and is optimized for zero allocations.
 
 ```go
 // Basic usage
@@ -42,9 +42,9 @@ result = lo.Substring("hello", 1, 0)
 result = lo.Substring("hello", 10, 3)
 // result: ""
 
-// With Unicode strings (byte-based)
+// With Unicode strings (rune-aware)
 result = lo.Substring("héllo", 1, 3)
-// result: "él" (note: works with bytes, not runes)
+// result: "éll"
 
 // Negative offset with negative values clamped
 result = lo.Substring("hello", -10, 3)
diff --git a/string.go b/string.go
index 3f2b49e..9b0fc6e 100644
--- a/string.go
+++ b/string.go
@@ -7,10 +7,10 @@ import (
 	"unicode"
 	"unicode/utf8"
 
-	"github.com/samber/lo/internal/xrand"
-
 	"golang.org/x/text/cases"
 	"golang.org/x/text/language"
+
+	"github.com/samber/lo/internal/xrand"
 )
 
 var (
@@ -100,28 +100,110 @@ func nearestPowerOfTwo(capacity int) int {
 	return n + 1
 }
 
-// Substring return part of a string.
+// Substring extracts a substring from a string with Unicode character (rune) awareness.
+// offset - starting position of the substring (can be positive, negative, or zero)
+// length - number of characters to extract
+// With positive offset, counting starts from the beginning of the string
+// With negative offset, counting starts from the end of the string
+// A negative offset reaching before the start of the string is clamped to the beginning
+// Invalid UTF-8 bytes in the result are replaced by U+FFFD and null bytes are removed
 // Play: https://go.dev/play/p/TQlxQi82Lu1
 func Substring[T ~string](str T, offset int, length uint) T {
-	rs := []rune(str)
-	size := len(rs)
-
-	if offset < 0 {
-		offset = size + offset
-		if offset < 0 {
-			offset = 0
-		}
-	}
-
-	if offset >= size {
-		return Empty[T]()
-	}
-
-	if length > uint(size)-uint(offset) {
-		length = uint(size - offset)
-	}
-
-	return T(strings.ReplaceAll(string(rs[offset:offset+int(length)]), "\x00", ""))
+	str = substring(str, offset, length)
+
+	// Validate UTF-8 and fix invalid sequences
+	if !utf8.ValidString(string(str)) {
+		// Convert to []rune to replicate behavior with duplicated �
+		str = T([]rune(str))
+	}
+
+	// Remove null bytes from result
+	return T(strings.ReplaceAll(string(str), "\x00", ""))
+}
+
+// substring returns the part of str selected by offset and length, both counted in runes,
+// without replacing invalid UTF-8 or removing null bytes.
+func substring[T ~string](str T, offset int, length uint) T {
+	switch {
+	// Empty length or offset beyond string bounds - return empty string
+	case length == 0, offset >= len(str):
+		return ""
+
+	// Positive offset - count from the beginning
+	case offset > 0:
+		// Skip offset runes from the start
+		for i := range str {
+			if offset--; offset == 0 {
+				// Advance by the decoded width: an invalid byte decodes to U+FFFD with
+				// width 1, while utf8.RuneLen(U+FFFD) is 3 and would overrun the string
+				_, width := utf8.DecodeRuneInString(string(str[i:]))
+				str = str[i+width:]
+				break
+			}
+		}
+
+		// If couldn't skip enough runes - string is shorter than offset
+		if offset != 0 {
+			return ""
+		}
+
+		return firstRunes(str, length)
+
+	// Zero offset or offset less than minus string length - start from beginning
+	case offset < -len(str), offset == 0:
+		return firstRunes(str, length)
+
+	// Negative offset - count from the end of string
+	default: // -len(str) <= offset < 0
+		// Helper function to move backward through runes; missing is the number of
+		// requested runes left over because the start of the string was reached
+		backwardPos := func(end int, count uint) (start int, missing uint) {
+			for count > 0 && end > 0 {
+				_, width := utf8.DecodeLastRuneInString(string(str[:end]))
+				end -= width
+				count--
+			}
+
+			return end, count
+		}
+
+		back := uint(-offset)
+
+		// If offset is less than or equal to length - take from position to end
+		if back <= length {
+			start, _ := backwardPos(len(str), back)
+			return str[start:]
+		}
+
+		// Otherwise calculate start and end positions
+		end, missing := backwardPos(len(str), back-length)
+		start, shortfall := backwardPos(end, length)
+
+		// String has fewer than -offset runes - clamp the start to the beginning
+		if missing != 0 || shortfall != 0 {
+			return firstRunes(str, length)
+		}
+
+		return str[start:end]
+	}
+}
+
+// firstRunes returns the first length runes of str, or str itself when it has fewer.
+func firstRunes[T ~string](str T, length uint) T {
+	// The byte length bounds the rune count, so short strings need no scan
+	if uint(len(str)) <= length {
+		return str
+	}
+
+	// Count length runes from the start
+	for i := range str {
+		if length == 0 {
+			return str[:i]
+		}
+		length--
+	}
+
+	return str
 }
 
 // ChunkString returns a slice of strings split into groups of length size. If the string can't be split evenly,
diff --git a/string_test.go b/string_test.go
index 5496b76..76dafac 100644
--- a/string_test.go
+++ b/string_test.go
@@ -1,8 +1,11 @@
 package lo
 
 import (
+	"fmt"
 	"math"
+	"strings"
 	"testing"
+	"unicode/utf8"
 
 	"github.com/stretchr/testify/assert"
 )
@@ -62,6 +65,7 @@ func TestSubstring(t *testing.T) {
 	t.Parallel()
 	is := assert.New(t)
 
+	str0 := Substring("hello", 5, 10)
 	str1 := Substring("hello", 0, 0)
 	str2 := Substring("hello", 10, 2)
 	str3 := Substring("hello", -10, 2)
@@ -76,8 +80,29 @@ func TestSubstring(t *testing.T) {
 	str12 := Substring("hello", -4, math.MaxUint)
 	str13 := Substring("🏠🐶🐱", 0, 2)
 	str14 := Substring("你好,世界", 0, 3)
-	str15 := Substring("hello", 5, 1)
+	str15 := Substring("🏠🐶🐱", 1, 2)
+	str16 := Substring("🏠🐶🐱", -2, 2)
+	str17 := Substring("🏠🐶🐱", 3, 3)
+	str18 := Substring("🏠🐶🐱", 4, 3)
+	str19 := Substring("hello", 5, 1)
+	str20 := Substring("hello", -5, 5)
+	str21 := Substring("hello", -5, 4)
+	str22 := Substring("hello", -5, math.MaxUint)
+	str23 := Substring("\x00\x00\x00", 0, math.MaxUint)
+	str24 := Substring(string(utf8.RuneError), 0, math.MaxUint)
+	str25 := Substring("привет"[1:], 0, 6)
+	str26 := Substring("привет"[:2*5+1], 0, 6)
+	str27 := Substring("привет"[:2*5+1], -2, math.MaxUint)
+	str28 := Substring("🏠🐶🐱"[1:], 0, math.MaxUint)
+	str29 := Substring("🏠🐶🐱"[1:], 0, 2)
+	str30 := Substring("привет", 6, math.MaxUint)
+	str31 := Substring("привет", 6+1, math.MaxUint)
+	str32 := Substring("🏠🐶🐱", -5, 2)
+	str33 := Substring("héllo", -6, 2)
+	str34 := Substring("\xffab", 1, 5)
+	str35 := Substring("\xffa", 1, 5)
 
+	is.Empty(str0)
 	is.Empty(str1)
 	is.Empty(str2)
 	is.Equal("he", str3)
@@ -92,7 +117,48 @@ func TestSubstring(t *testing.T) {
 	is.Equal("ello", str12)
 	is.Equal("🏠🐶", str13)
 	is.Equal("你好,", str14)
-	is.Empty(str15)
+	is.Equal("🐶🐱", str15)
+	is.Equal("🐶🐱", str16)
+	is.Empty(str17)
+	is.Empty(str18)
+	is.Empty(str19)
+	is.Equal("hello", str20)
+	is.Equal("hell", str21)
+	is.Equal("hello", str22)
+	is.Empty(str23)
+	is.Equal("�", str24)
+	is.Equal("�ривет", str25)
+	is.Equal("приве�", str26)
+	is.Equal("е�", str27)
+	is.Equal("���🐶🐱", str28)
+	is.Equal("��", str29)
+	is.Empty(str30)
+	is.Empty(str31)
+	is.Equal("🏠🐶", str32)
+	is.Equal("hé", str33)
+	is.Equal("ab", str34)
+	is.Equal("a", str35)
+}
+
+func BenchmarkSubstring(b *testing.B) {
+	str := strings.Repeat("1", 100)
+
+	for _, test := range []struct {
+		offset int
+		length uint
+	}{
+		{10, 10},
+		{50, 50},
+		{50, 45},
+		{-50, 50},
+		{-10, 10},
+	} {
+		b.Run(fmt.Sprint(test), func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				_ = Substring(str, test.offset, test.length)
+			}
+		})
+	}
 }
 
 func TestRuneLength(t *testing.T) {