From b5189d50cb39f68c571b865e2d3420b33eeefa4b Mon Sep 17 00:00:00 2001 From: ahfuzhang Date: Tue, 28 Oct 2025 13:33:34 +0800 Subject: [PATCH 1/3] improve performance for token --- lib/logstorage/hash_tokenizer.go | 135 +++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/lib/logstorage/hash_tokenizer.go b/lib/logstorage/hash_tokenizer.go index e7a185157b..6c92cbc37b 100644 --- a/lib/logstorage/hash_tokenizer.go +++ b/lib/logstorage/hash_tokenizer.go @@ -2,6 +2,7 @@ package logstorage import ( "sync" + "unsafe" "github.com/cespare/xxhash/v2" @@ -65,50 +66,6 @@ func (t *hashTokenizer) reset() { } } -func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 { - if !isASCII(s) { - // Slow path - s contains unicode chars - return t.tokenizeStringUnicode(dst, s) - } - - // Fast path for ASCII s - i := 0 - for i < len(s) { - // Search for the next token. - start := len(s) - for i < len(s) { - if !isTokenChar(s[i]) { - i++ - continue - } - start = i - i++ - break - } - // Search for the end of the token. - end := len(s) - for i < len(s) { - if isTokenChar(s[i]) { - i++ - continue - } - end = i - i++ - break - } - if end <= start { - break - } - - // Register the token. - token := s[start:end] - if h, ok := t.addToken(token); ok { - dst = append(dst, h) - } - } - return dst -} - func (t *hashTokenizer) tokenizeStringUnicode(dst []uint64, s string) []uint64 { for len(s) > 0 { // Search for the next token. 
@@ -179,3 +136,93 @@ func putHashTokenizer(t *hashTokenizer) { } var hashTokenizerPool sync.Pool + +func initAsciiTable() (table [256]byte) { + for i := '0'; i <= '9'; i++ { + table[i] = 1 + } + for i := 'a'; i <= 'z'; i++ { + table[i] = 1 + } + for i := 'A'; i <= 'Z'; i++ { + table[i] = 1 + } + table['_'] = 1 + return +} + +func initUnicodeTable() (table [256]byte) { + for i := '0'; i <= '9'; i++ { + table[i] = 1 + } + for i := 'a'; i <= 'z'; i++ { + table[i] = 1 + } + for i := 'A'; i <= 'Z'; i++ { + table[i] = 1 + } + table['_'] = 1 + for i := 128; i <= 255; i++ { + table[i] = 1 + } + return +} + +var lookupTables [2][256]byte = func() [2][256]byte { + return [2][256]byte{ + initAsciiTable(), + initUnicodeTable(), + } +}() + +func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 { + i := 0 + ptr := unsafe.Pointer(unsafe.StringData(s)) + var curUnicodeFlag byte + for i < len(s) { + curUnicodeFlag = 0 + // Search for the next token. + start := len(s) + for i < len(s) { + c := *(*byte)(unsafe.Add(ptr, uintptr(i))) + unicodeFlag := (c & 0x80) >> 7 + curUnicodeFlag |= unicodeFlag + found := lookupTables[unicodeFlag][c] // search both ASCII and Unicode tables + if found == 0 { + i++ + continue + } + start = i + i++ + break + } + // Search for the end of the token. + end := len(s) + for i < len(s) { + c := *(*byte)(unsafe.Add(ptr, uintptr(i))) + found := lookupTables[curUnicodeFlag][c] + if found != 0 { + i++ + continue + } + end = i + i++ + break + } + if end <= start { + break + } + + // Register the token. 
+		token := unsafe.String((*byte)(unsafe.Add(ptr, start)), end-start)
+		if curUnicodeFlag == 1 {
+			// The token contains non-ASCII bytes, so fall back to tokenizeStringUnicode for just this short token.
+			dst = t.tokenizeStringUnicode(dst, token)
+			continue
+		}
+		if h, ok := t.addToken(token); ok {
+			dst = append(dst, h)
+		}
+	}
+	return dst
+}

From 3ab5509ac9fdd4fa7f750e79c3d62e305d961776 Mon Sep 17 00:00:00 2001
From: ahfuzhang
Date: Thu, 30 Oct 2025 08:56:20 +0800
Subject: [PATCH 2/3] bug fix

---
 lib/logstorage/hash_tokenizer.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/logstorage/hash_tokenizer.go b/lib/logstorage/hash_tokenizer.go
index 6c92cbc37b..16b34211a0 100644
--- a/lib/logstorage/hash_tokenizer.go
+++ b/lib/logstorage/hash_tokenizer.go
@@ -200,6 +200,8 @@ func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 {
 		end := len(s)
 		for i < len(s) {
 			c := *(*byte)(unsafe.Add(ptr, uintptr(i)))
+			unicodeFlag := (c & 0x80) >> 7
+			curUnicodeFlag |= unicodeFlag
 			found := lookupTables[curUnicodeFlag][c]
 			if found != 0 {
 				i++

From cc9648b6c5c0398da93e1f9be483a2e0c0e91eda Mon Sep 17 00:00:00 2001
From: ahfuzhang
Date: Thu, 30 Oct 2025 09:00:31 +0800
Subject: [PATCH 3/3] bug fix: a token may begin with ASCII bytes and continue
 with non-ASCII (Unicode) bytes.

---
 lib/logstorage/hash_tokenizer.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/logstorage/hash_tokenizer.go b/lib/logstorage/hash_tokenizer.go
index 16b34211a0..ad9308654a 100644
--- a/lib/logstorage/hash_tokenizer.go
+++ b/lib/logstorage/hash_tokenizer.go
@@ -201,7 +201,7 @@ func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 {
 		for i < len(s) {
 			c := *(*byte)(unsafe.Add(ptr, uintptr(i)))
 			unicodeFlag := (c & 0x80) >> 7
-			curUnicodeFlag |= unicodeFlag
+			curUnicodeFlag |= unicodeFlag // a token may start with ASCII bytes before its first non-ASCII byte
 			found := lookupTables[curUnicodeFlag][c]
 			if found != 0 {
 				i++