From b5189d50cb39f68c571b865e2d3420b33eeefa4b Mon Sep 17 00:00:00 2001 From: ahfuzhang Date: Tue, 28 Oct 2025 13:33:34 +0800 Subject: [PATCH 1/3] improve performance for token --- lib/logstorage/hash_tokenizer.go | 135 +++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/lib/logstorage/hash_tokenizer.go b/lib/logstorage/hash_tokenizer.go index e7a185157b..6c92cbc37b 100644 --- a/lib/logstorage/hash_tokenizer.go +++ b/lib/logstorage/hash_tokenizer.go @@ -2,6 +2,7 @@ package logstorage import ( "sync" + "unsafe" "github.com/cespare/xxhash/v2" @@ -65,50 +66,6 @@ func (t *hashTokenizer) reset() { } } -func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 { - if !isASCII(s) { - // Slow path - s contains unicode chars - return t.tokenizeStringUnicode(dst, s) - } - - // Fast path for ASCII s - i := 0 - for i < len(s) { - // Search for the next token. - start := len(s) - for i < len(s) { - if !isTokenChar(s[i]) { - i++ - continue - } - start = i - i++ - break - } - // Search for the end of the token. - end := len(s) - for i < len(s) { - if isTokenChar(s[i]) { - i++ - continue - } - end = i - i++ - break - } - if end <= start { - break - } - - // Register the token. - token := s[start:end] - if h, ok := t.addToken(token); ok { - dst = append(dst, h) - } - } - return dst -} - func (t *hashTokenizer) tokenizeStringUnicode(dst []uint64, s string) []uint64 { for len(s) > 0 { // Search for the next token. 
@@ -179,3 +136,93 @@ func putHashTokenizer(t *hashTokenizer) { } var hashTokenizerPool sync.Pool + +func initAsciiTable() (table [256]byte) { + for i := '0'; i <= '9'; i++ { + table[i] = 1 + } + for i := 'a'; i <= 'z'; i++ { + table[i] = 1 + } + for i := 'A'; i <= 'Z'; i++ { + table[i] = 1 + } + table['_'] = 1 + return +} + +func initUnicodeTable() (table [256]byte) { + for i := '0'; i <= '9'; i++ { + table[i] = 1 + } + for i := 'a'; i <= 'z'; i++ { + table[i] = 1 + } + for i := 'A'; i <= 'Z'; i++ { + table[i] = 1 + } + table['_'] = 1 + for i := 128; i <= 255; i++ { + table[i] = 1 + } + return +} + +var lookupTables [2][256]byte = func() [2][256]byte { + return [2][256]byte{ + initAsciiTable(), + initUnicodeTable(), + } +}() + +func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 { + i := 0 + ptr := unsafe.Pointer(unsafe.StringData(s)) + var curUnicodeFlag byte + for i < len(s) { + curUnicodeFlag = 0 + // Search for the next token. + start := len(s) + for i < len(s) { + c := *(*byte)(unsafe.Add(ptr, uintptr(i))) + unicodeFlag := (c & 0x80) >> 7 + curUnicodeFlag |= unicodeFlag + found := lookupTables[unicodeFlag][c] // search both ASCII and Unicode tables + if found == 0 { + i++ + continue + } + start = i + i++ + break + } + // Search for the end of the token. + end := len(s) + for i < len(s) { + c := *(*byte)(unsafe.Add(ptr, uintptr(i))) + found := lookupTables[curUnicodeFlag][c] + if found != 0 { + i++ + continue + } + end = i + i++ + break + } + if end <= start { + break + } + + // Register the token. 
+		token := unsafe.String((*byte)(unsafe.Add(ptr, start)), end-start)
+		if curUnicodeFlag == 1 {
+			// The token contains non-ASCII bytes, so fall back to tokenizeStringUnicode for just this short token.
+			dst = t.tokenizeStringUnicode(dst, token)
+			continue
+		}
+		if h, ok := t.addToken(token); ok {
+			dst = append(dst, h)
+		}
+	}
+	return dst
+}

From 3ab5509ac9fdd4fa7f750e79c3d62e305d961776 Mon Sep 17 00:00:00 2001
From: ahfuzhang
Date: Thu, 30 Oct 2025 08:56:20 +0800
Subject: [PATCH 2/3] bug fix

---
 lib/logstorage/hash_tokenizer.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/logstorage/hash_tokenizer.go b/lib/logstorage/hash_tokenizer.go
index 6c92cbc37b..16b34211a0 100644
--- a/lib/logstorage/hash_tokenizer.go
+++ b/lib/logstorage/hash_tokenizer.go
@@ -200,6 +200,8 @@ func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 {
 		end := len(s)
 		for i < len(s) {
 			c := *(*byte)(unsafe.Add(ptr, uintptr(i)))
+			unicodeFlag := (c & 0x80) >> 7
+			curUnicodeFlag |= unicodeFlag
 			found := lookupTables[curUnicodeFlag][c]
 			if found != 0 {
 				i++

From cc9648b6c5c0398da93e1f9be483a2e0c0e91eda Mon Sep 17 00:00:00 2001
From: ahfuzhang
Date: Thu, 30 Oct 2025 09:00:31 +0800
Subject: [PATCH 3/3] bug fix: a token may begin with ASCII bytes and continue
 with non-ASCII (Unicode) bytes.

---
 lib/logstorage/hash_tokenizer.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/logstorage/hash_tokenizer.go b/lib/logstorage/hash_tokenizer.go
index 16b34211a0..ad9308654a 100644
--- a/lib/logstorage/hash_tokenizer.go
+++ b/lib/logstorage/hash_tokenizer.go
@@ -201,7 +201,7 @@ func (t *hashTokenizer) tokenizeString(dst []uint64, s string) []uint64 {
 		for i < len(s) {
 			c := *(*byte)(unsafe.Add(ptr, uintptr(i)))
 			unicodeFlag := (c & 0x80) >> 7
-			curUnicodeFlag |= unicodeFlag
+			curUnicodeFlag |= unicodeFlag // a token may start with ASCII bytes before its first non-ASCII byte
 			found := lookupTables[curUnicodeFlag][c]
 			if found != 0 {
 				i++