Refactor "string truncate" (#32984)

2025-07-23 02:38:35 +00:00 · 2024-12-26 11:56:03 +08:00
parent 594edad213
commit 9bfa9f450d
26 changed files with 199 additions and 140 deletions
--- a/modules/util/truncate.go
+++ b/modules/util/truncate.go
@@ -14,31 +14,92 @@ const (
 	asciiEllipsis = "..."
 )

-func IsLikelySplitLeftPart(s string) bool {
+func IsLikelyEllipsisLeftPart(s string) bool {
 	return strings.HasSuffix(s, utf8Ellipsis) || strings.HasSuffix(s, asciiEllipsis)
 }

-// SplitStringAtByteN splits a string at byte n accounting for rune boundaries. (Combining characters are not accounted for.)
-func SplitStringAtByteN(input string, n int) (left, right string) {
-	if len(input) <= n {
-		return input, ""
-	}
+// EllipsisDisplayString returns a truncated short string for display purpose.
+// The limit is the approximate display width measured in ASCII columns (each non-ASCII rune, e.g. CJK/emoji, counts as 2 columns).
+// It appends "…" or "..." at the end of the truncated string.
+// It guarantees that the number of runes in the returned string doesn't exceed the limit.
+func EllipsisDisplayString(str string, limit int) string {
+	s, _, _, _ := ellipsisDisplayString(str, limit)
+	return s
+}

-	if !utf8.ValidString(input) {
-		if n-3 < 0 {
-			return input, ""
+// EllipsisDisplayStringX works like EllipsisDisplayString, but it also returns the remaining right part (prefixed with an ellipsis when the string is truncated).
+func EllipsisDisplayStringX(str string, limit int) (left, right string) {
+	left, offset, truncated, encounterInvalid := ellipsisDisplayString(str, limit)
+	if truncated {
+		right = str[offset:]
+		r, _ := utf8.DecodeRune(UnsafeStringToBytes(right))
+		encounterInvalid = encounterInvalid || r == utf8.RuneError
+		ellipsis := utf8Ellipsis
+		if encounterInvalid {
+			ellipsis = asciiEllipsis
 		}
-		return input[:n-3] + asciiEllipsis, asciiEllipsis + input[n-3:]
+		right = ellipsis + right
+	}
+	return left, right
+}
+
+func ellipsisDisplayString(str string, limit int) (res string, offset int, truncated, encounterInvalid bool) {
+	if len(str) <= limit {
+		return str, len(str), false, false
 	}

-	end := 0
-	for end <= n-3 {
-		_, size := utf8.DecodeRuneInString(input[end:])
-		if end+size > n-3 {
+	// To future maintainers: this logic must guarantee that the number of runes in the returned string doesn't exceed the limit,
+	// because the returned string will also be used as a database value: a UTF-8 VARCHAR(10) can store 10 rune characters.
+	// So each rune must be counted as at least 1 width; even special Unicode characters (zero-width, combining, etc.) should NEVER be counted as zero.
+	// NOTE(review): offset below advances by utf8.RuneLen(r), which is 3 for utf8.RuneError even when the range loop consumed a single invalid byte — verify with invalid UTF-8 input.
+	pos, used := 0, 0
+	for i, r := range str {
+		encounterInvalid = encounterInvalid || r == utf8.RuneError
+		pos = i
+		runeWidth := 1
+		if r >= 128 {
+			runeWidth = 2 // CJK/emoji chars are considered as 2-ASCII width
+		}
+		if used+runeWidth+3 > limit {
 			break
 		}
-		end += size
+		used += runeWidth
+		offset += utf8.RuneLen(r)
 	}

-	return input[:end] + utf8Ellipsis, utf8Ellipsis + input[end:]
+	// if at most 3 runes remain and they still fit within the limit, return the whole string: no ellipsis is needed
+	if len(str)-pos <= 12 {
+		var nextCnt, nextWidth int
+		for _, r := range str[pos:] {
+			if nextCnt >= 4 {
+				break
+			}
+			nextWidth++
+			if r >= 128 {
+				nextWidth++ // CJK/emoji chars are considered as 2-ASCII width
+			}
+			nextCnt++
+		}
+		if nextCnt <= 3 && used+nextWidth <= limit {
+			return str, len(str), false, false
+		}
+	}
+	if limit < 3 {
+		// if the limit is too small to hold an ellipsis (3-ASCII width), do not add one
+		return str[:offset], offset, true, false
+	}
+	ellipsis := utf8Ellipsis
+	if encounterInvalid {
+		ellipsis = asciiEllipsis
+	}
+	return str[:offset] + ellipsis, offset, true, encounterInvalid
+}
+
+// TruncateRunes returns the string truncated to at most limit runes.
+// It returns the input string if its rune count doesn't exceed the limit.
+func TruncateRunes(str string, limit int) string {
+	if utf8.RuneCountInString(str) < limit {
+		return str
+	}
+	return string([]rune(str)[:limit])
 }
--- a/modules/util/truncate_test.go
+++ b/modules/util/truncate_test.go
@@ -4,43 +4,94 @@
 package util

 import (
+	"fmt"
+	"strings"
 	"testing"

 	"github.com/stretchr/testify/assert"
 )

-func TestSplitString(t *testing.T) {
-	type testCase struct {
-		input    string
-		n        int
-		leftSub  string
-		ellipsis string
+func TestEllipsisString(t *testing.T) {
+	cases := []struct {
+		limit int
+
+		input, left, right string
+	}{
+		{limit: 0, input: "abcde", left: "", right: "…abcde"},
+		{limit: 1, input: "abcde", left: "", right: "…abcde"},
+		{limit: 2, input: "abcde", left: "", right: "…abcde"},
+		{limit: 3, input: "abcde", left: "…", right: "…abcde"},
+		{limit: 4, input: "abcde", left: "a…", right: "…bcde"},
+		{limit: 5, input: "abcde", left: "abcde", right: ""},
+		{limit: 6, input: "abcde", left: "abcde", right: ""},
+		{limit: 7, input: "abcde", left: "abcde", right: ""},
+
+		// a CJK char or emoji is considered as 2-ASCII width, the ellipsis is 3-ASCII width
+		{limit: 0, input: "测试文本", left: "", right: "…测试文本"},
+		{limit: 1, input: "测试文本", left: "", right: "…测试文本"},
+		{limit: 2, input: "测试文本", left: "", right: "…测试文本"},
+		{limit: 3, input: "测试文本", left: "…", right: "…测试文本"},
+		{limit: 4, input: "测试文本", left: "…", right: "…测试文本"},
+		{limit: 5, input: "测试文本", left: "测…", right: "…试文本"},
+		{limit: 6, input: "测试文本", left: "测…", right: "…试文本"},
+		{limit: 7, input: "测试文本", left: "测试…", right: "…文本"},
+		{limit: 8, input: "测试文本", left: "测试文本", right: ""},
+		{limit: 9, input: "测试文本", left: "测试文本", right: ""},
+	}
+	for _, c := range cases {
+		t.Run(fmt.Sprintf("%s(%d)", c.input, c.limit), func(t *testing.T) {
+			left, right := EllipsisDisplayStringX(c.input, c.limit)
+			assert.Equal(t, c.left, left, "left")
+			assert.Equal(t, c.right, right, "right")
+		})
 	}

-	test := func(tc []*testCase, f func(input string, n int) (left, right string)) {
-		for _, c := range tc {
-			l, r := f(c.input, c.n)
-			if c.ellipsis != "" {
-				assert.Equal(t, c.leftSub+c.ellipsis, l, "test split %q at %d, expected leftSub: %q", c.input, c.n, c.leftSub)
-				assert.Equal(t, c.ellipsis+c.input[len(c.leftSub):], r, "test split %s at %d, expected rightSub: %q", c.input, c.n, c.input[len(c.leftSub):])
-			} else {
-				assert.Equal(t, c.leftSub, l, "test split %q at %d, expected leftSub: %q", c.input, c.n, c.leftSub)
-				assert.Empty(t, r, "test split %q at %d, expected rightSub: %q", c.input, c.n, "")
-			}
+	t.Run("LongInput", func(t *testing.T) {
+		left, right := EllipsisDisplayStringX(strings.Repeat("abc", 240), 90)
+		assert.Equal(t, strings.Repeat("abc", 29)+"…", left)
+		assert.Equal(t, "…"+strings.Repeat("abc", 211), right)
+	})
+
+	t.Run("InvalidUtf8", func(t *testing.T) {
+		invalidCases := []struct {
+			limit       int
+			left, right string
+		}{
+			{limit: 0, left: "", right: "...\xef\x03\xfe\xef\x03\xfe"},
+			{limit: 1, left: "", right: "...\xef\x03\xfe\xef\x03\xfe"},
+			{limit: 2, left: "", right: "...\xef\x03\xfe\xef\x03\xfe"},
+			{limit: 3, left: "...", right: "...\xef\x03\xfe\xef\x03\xfe"},
+			{limit: 4, left: "...", right: "...\xef\x03\xfe\xef\x03\xfe"},
+			{limit: 5, left: "\xef\x03\xfe...", right: "...\xef\x03\xfe"},
+			{limit: 6, left: "\xef\x03\xfe\xef\x03\xfe", right: ""},
+			{limit: 7, left: "\xef\x03\xfe\xef\x03\xfe", right: ""},
 		}
-	}
+		for _, c := range invalidCases {
+			t.Run(fmt.Sprintf("%d", c.limit), func(t *testing.T) {
+				left, right := EllipsisDisplayStringX("\xef\x03\xfe\xef\x03\xfe", c.limit)
+				assert.Equal(t, c.left, left, "left")
+				assert.Equal(t, c.right, right, "right")
+			})
+		}
+	})

-	tc := []*testCase{
-		{"abc123xyz", 0, "", utf8Ellipsis},
-		{"abc123xyz", 1, "", utf8Ellipsis},
-		{"abc123xyz", 4, "a", utf8Ellipsis},
-		{"啊bc123xyz", 4, "", utf8Ellipsis},
-		{"啊bc123xyz", 6, "啊", utf8Ellipsis},
-		{"啊bc", 5, "啊bc", ""},
-		{"啊bc", 6, "啊bc", ""},
-		{"abc\xef\x03\xfe", 3, "", asciiEllipsis},
-		{"abc\xef\x03\xfe", 4, "a", asciiEllipsis},
-		{"\xef\x03", 1, "\xef\x03", ""},
-	}
-	test(tc, SplitStringAtByteN)
+	t.Run("IsLikelyEllipsisLeftPart", func(t *testing.T) {
+		assert.True(t, IsLikelyEllipsisLeftPart("abcde…"))
+		assert.True(t, IsLikelyEllipsisLeftPart("abcde..."))
+	})
+}
+
+func TestTruncateRunes(t *testing.T) {
+	assert.Equal(t, "", TruncateRunes("", 0))
+	assert.Equal(t, "", TruncateRunes("", 1))
+
+	assert.Equal(t, "", TruncateRunes("ab", 0))
+	assert.Equal(t, "a", TruncateRunes("ab", 1))
+	assert.Equal(t, "ab", TruncateRunes("ab", 2))
+	assert.Equal(t, "ab", TruncateRunes("ab", 3))
+
+	assert.Equal(t, "", TruncateRunes("测试", 0))
+	assert.Equal(t, "测", TruncateRunes("测试", 1))
+	assert.Equal(t, "测试", TruncateRunes("测试", 2))
+	assert.Equal(t, "测试", TruncateRunes("测试", 3))
 }