lib/logstorage/hash_tokenizer.go (93 additions, 44 deletions)
@@ -2,6 +2,7 @@ package logstorage

import (
"sync"
"unsafe"

"github.com/cespare/xxhash/v2"

@@ -65,50 +66,6 @@ func (t *hashTokenizer) reset() {
}
}

func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 {
if !isASCII(s) {
// Slow path - s contains unicode chars
return t.tokenizeStringUnicode(dst, s)
}

// Fast path for ASCII s
i := 0
for i < len(s) {
// Search for the next token.
start := len(s)
for i < len(s) {
if !isTokenChar(s[i]) {
i++
continue
}
start = i
i++
break
}
// Search for the end of the token.
end := len(s)
for i < len(s) {
if isTokenChar(s[i]) {
i++
continue
}
end = i
i++
break
}
if end <= start {
break
}

// Register the token.
token := s[start:end]
if h, ok := t.addToken(token); ok {
dst = append(dst, h)
}
}
return dst
}

func (t *hashTokenizer) tokenizeStringUnicode(dst []uint64, s string) []uint64 {
for len(s) > 0 {
// Search for the next token.
@@ -179,3 +136,95 @@ func putHashTokenizer(t *hashTokenizer) {
}

var hashTokenizerPool sync.Pool
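The diff shows only the pool variable and the signature of putHashTokenizer; the matching getter is not visible here. A minimal sketch of the implied sync.Pool pattern, with getHashTokenizer and newHashTokenizer as assumed names:

func getHashTokenizer() *hashTokenizer {
	// Assumed counterpart of putHashTokenizer above; newHashTokenizer is a
	// hypothetical constructor standing in for whatever the file actually uses.
	v := hashTokenizerPool.Get()
	if v == nil {
		return newHashTokenizer()
	}
	return v.(*hashTokenizer)
}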

func initAsciiTable() (table [256]byte) {
for i := '0'; i <= '9'; i++ {
table[i] = 1
}
for i := 'a'; i <= 'z'; i++ {
table[i] = 1
}
for i := 'A'; i <= 'Z'; i++ {
table[i] = 1
}
table['_'] = 1
return
}

func initUnicodeTable() (table [256]byte) {
for i := '0'; i <= '9'; i++ {
table[i] = 1
}
for i := 'a'; i <= 'z'; i++ {
table[i] = 1
}
for i := 'A'; i <= 'Z'; i++ {
table[i] = 1
}
table['_'] = 1
for i := 128; i <= 255; i++ {
table[i] = 1
}
return
}

var lookupTables [2][256]byte = func() [2][256]byte {
return [2][256]byte{
initAsciiTable(),
initUnicodeTable(),
}
}()
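To make the table-selection trick explicit: the high bit of each byte picks which table to consult, so every non-ASCII byte (0x80 to 0xFF) is classified as a token character without an extra branch. An illustrative helper (hypothetical name, assuming the same package as lookupTables; the real code inlines this lookup in tokenizeString below):

func isTokenByte(c byte) bool {
	unicodeFlag := (c & 0x80) >> 7 // 1 for bytes >= 0x80, 0 for ASCII
	return lookupTables[unicodeFlag][c] != 0
}

For example, isTokenByte('a') and isTokenByte(0xC3) are true, while isTokenByte(' ') is false.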

func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 {
i := 0
ptr := unsafe.Pointer(unsafe.StringData(s)) // base pointer to the string's bytes, enabling bounds-check-free indexing below
var curUnicodeFlag byte
for i < len(s) {
curUnicodeFlag = 0
// Search for the next token.
start := len(s)
for i < len(s) {
c := *(*byte)(unsafe.Add(ptr, uintptr(i)))
unicodeFlag := (c & 0x80) >> 7
curUnicodeFlag |= unicodeFlag
found := lookupTables[unicodeFlag][c] // the high bit selects the table; non-ASCII bytes are always token chars
if found == 0 {
i++
continue
}
start = i
i++
break
}
// Search for the end of the token.
end := len(s)
for i < len(s) {
c := *(*byte)(unsafe.Add(ptr, uintptr(i)))
unicodeFlag := (c & 0x80) >> 7
curUnicodeFlag |= unicodeFlag // keep accumulating: a token containing Unicode may begin with ASCII bytes
found := lookupTables[curUnicodeFlag][c]
if found != 0 {
i++
continue
}
end = i
i++
break
}
if end <= start {
break
}

// Register the token.
token := unsafe.String((*byte)(unsafe.Add(ptr, start)), end-start) // zero-copy view of s[start:end]
if curUnicodeFlag == 1 {
// If the current token contains non-ASCII bytes, delegate to tokenizeStringUnicode for rune-aware tokenization. Otherwise hash directly.
dst = t.tokenizeStringUnicode(dst, token)
continue
}
if h, ok := t.addToken(token); ok {
dst = append(dst, h)
}
}
return dst
}
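Putting the pieces together, a hedged usage sketch: the signatures come from this diff, getHashTokenizer is the assumed getter sketched earlier, and the deduplication behavior is inferred from addToken's (hash, ok) return shape rather than confirmed here.

func hashedTokens(s string) []uint64 {
	t := getHashTokenizer() // assumed pool getter
	defer putHashTokenizer(t)
	// One xxhash value per token that addToken accepts; addToken appears
	// to drop tokens already seen by this tokenizer instance.
	return t.tokenizeString(nil, s)
}

Under that assumption, hashedTokens("foo bar foo") would yield two hashes, one for "foo" and one for "bar".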