From 24928ba0a779c7292237af1a644cb78e039811be Mon Sep 17 00:00:00 2001
From: Ivan Shalganov
Date: Wed, 22 Jan 2025 11:25:01 +0100
Subject: [PATCH 1/2] #34 add diacritic support

---
 parser.go         |  6 +++++-
 tokenizer.go      | 22 +++++++++++++++-------
 tokenizer_test.go | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/parser.go b/parser.go
index 003608e..75b9102 100644
--- a/parser.go
+++ b/parser.go
@@ -246,7 +246,11 @@ func (p *parsing) parseKeyword() bool {
 		var size int
 		p.ensureBytes(4)
 		r, size = utf8.DecodeRune(p.slice(p.pos, p.pos+4))
-		if unicode.IsLetter(r) || runeExists(p.t.kwMajorSymbols, r) || (start != -1 && runeExists(p.t.kwMinorSymbols, r)) {
+		if unicode.IsLetter(r) ||
+			(p.t.kwDiacriticMarkers && unicode.IsMark(r)) ||
+			runeExists(p.t.kwMajorSymbols, r) ||
+			(start != -1 && runeExists(p.t.kwMinorSymbols, r)) {
+
 			if start == -1 {
 				start = p.pos
 			}
diff --git a/tokenizer.go b/tokenizer.go
index 22092c6..b4b47b1 100644
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -129,13 +129,14 @@ type Tokenizer struct {
 	stopOnUnknown         bool
 	allowNumberUnderscore bool
 	// all defined custom tokens {key: [token1, token2, ...], ...}
-	tokens  map[TokenKey][]*tokenRef
-	index   map[byte][]*tokenRef
-	quotes  []*StringSettings
-	wSpaces []byte
-	kwMajorSymbols []rune
-	kwMinorSymbols []rune
-	pool    sync.Pool
+	tokens             map[TokenKey][]*tokenRef
+	index              map[byte][]*tokenRef
+	quotes             []*StringSettings
+	wSpaces            []byte
+	kwMajorSymbols     []rune
+	kwMinorSymbols     []rune
+	kwDiacriticMarkers bool
+	pool               sync.Pool
 }
 
 // New creates new tokenizer.
@@ -176,6 +177,13 @@ func (t *Tokenizer) AllowKeywordSymbols(majorSymbols []rune, minorSymbols []rune
 	return t
 }
 
+// AllowKeywordDiacriticMarkers enables support for diacritic marks in keywords.
+// See https://en.wikipedia.org/wiki/Diacritic
+func (t *Tokenizer) AllowKeywordDiacriticMarkers() *Tokenizer {
+	t.kwDiacriticMarkers = true
+	return t
+}
+
 // AllowKeywordUnderscore allows underscore symbol in keywords, like `one_two` or `_three`
 //
 // Deprecated: use AllowKeywordSymbols
diff --git a/tokenizer_test.go b/tokenizer_test.go
index 20e2d00..0f1a9a9 100644
--- a/tokenizer_test.go
+++ b/tokenizer_test.go
@@ -12,6 +12,10 @@ func TestTokenize(t *testing.T) {
 		value interface{}
 		token Token
 	}
+	type items struct {
+		value  string
+		tokens []Token
+	}
 	tokenizer := New()
 	condTokenKey := TokenKey(10)
 	wordTokenKey := TokenKey(11)
@@ -95,6 +99,38 @@ func TestTokenize(t *testing.T) {
 			require.Equal(t, v.value, stream.CurrentToken().ValueUnescapedString(), "value %s -> %s: %s", v.token.value, v.value, stream.CurrentToken().Value())
 		}
 	})
+
+	t.Run("diacritic", func(t *testing.T) {
+		diacritic := []items{
+			{"Pes pije čaj", []Token{
+				{key: TokenKeyword, value: []byte("Pes")},
+				{key: TokenKeyword, value: []byte("pije")},
+				{key: TokenKeyword, value: []byte("čaj")},
+			}},
+			{"L’élève visite Paris", []Token{
+				{key: TokenKeyword, value: []byte("L’élève")},
+				{key: TokenKeyword, value: []byte("visite")},
+				{key: TokenKeyword, value: []byte("Paris")},
+			}},
+			{"Dieses Haus ist schön", []Token{
+				{key: TokenKeyword, value: []byte("Dieses")},
+				{key: TokenKeyword, value: []byte("Haus")},
+				{key: TokenKeyword, value: []byte("ist")},
+				{key: TokenKeyword, value: []byte("schön")},
+			}},
+			{"Ёж оди́н дома", []Token{
+				{key: TokenKeyword, value: []byte("Ёж")},
+				{key: TokenKeyword, value: []byte("оди́н")},
+				{key: TokenKeyword, value: []byte("дома")},
+			}},
+		}
+		for _, v := range diacritic {
+			t.Run(v.value, func(t *testing.T) {
+				stream := tokenizer.ParseBytes([]byte(v.value))
+				require.Equal(t, v.tokens, stream.GetSnippet(0, 0))
+			})
+		}
+	})
 }
 
 func TestTokenizeEdgeCases(t *testing.T) {

From e25eb8ca4eafd0788b5e90af1270687ca4077797 Mon Sep 17 00:00:00 2001
From: Ivan Shalganov
Date: Thu, 13 Feb 2025 08:41:30 +0100
Subject: [PATCH 2/2] #34 add diacritic support

---
 stream.go         | 116 ++++++++++++++++++++++++-----------------------
 tokenizer_test.go |   7 ++-
 2 files changed, 68 insertions(+), 55 deletions(-)

diff --git a/stream.go b/stream.go
index 3b8d19f..5f292b9 100644
--- a/stream.go
+++ b/stream.go
@@ -259,7 +259,7 @@ func (s *Stream) PrevToken() *Token {
 }
 
 // NextToken returns next token from the stream.
-// If next token doesn't exist, the method returns TypeUndef token.
+// If the next token doesn't exist, the method returns a TypeUndef token.
 // Do not save a result (Token) into variables — the next token may be changed at any time.
 func (s *Stream) NextToken() *Token {
 	if s.current.next != nil {
@@ -269,8 +269,7 @@
 }
 
 // GoNextIfNextIs moves the stream pointer to the next token if the next token has specific token keys.
-// If keys matched pointer will be updated and the method returned true.
-// Otherwise, returned false.
+// If the next token matches one of the keys, the pointer is updated and the method returns true; otherwise it returns false.
 func (s *Stream) GoNextIfNextIs(key TokenKey, otherKeys ...TokenKey) bool {
 	if s.NextToken().Is(key, otherKeys...) {
 		s.GoNext()
@@ -280,68 +279,77 @@
 }
 
 // GetSnippet returns slice of tokens.
-// Slice generated from current token position and include tokens before and after current token.
+// The slice is built around the current token and includes up to `before` tokens before it and up to `after` tokens after it.
 func (s *Stream) GetSnippet(before, after int) []Token {
-	var segment []Token
-	if s.current == undefToken {
-		if s.prev != nil && before > s.prev.id-s.head.id {
-			before = s.prev.id - s.head.id
-		} else {
-			before = 0
-		}
-	} else if before > s.current.id-s.head.id {
-		before = s.current.id - s.head.id
+	if s.current == nil {
+		return nil
 	}
-	if after > s.len-before-1 {
-		after = s.len - before - 1
+	snippet := make([]Token, before+after+1)
+	start := 0
+	end := before + after
+	if s.current.prev == nil {
+		start = before // nothing precedes the current token
+	}
+	if s.current.next == nil {
+		end = before // nothing follows the current token
+	}
+	snippet[before] = Token{
+		id:     s.current.id,
+		key:    s.current.key,
+		value:  s.current.value,
+		line:   s.current.line,
+		offset: s.current.offset,
+		indent: s.current.indent,
+		string: s.current.string,
 	}
-	segment = make([]Token, before+after+1)
-	if len(segment) == 0 {
-		return segment
-	}
-	var ptr *Token
-	if s.next != nil {
-		ptr = s.next
-	} else if s.prev != nil {
-		ptr = s.prev
-	} else {
-		ptr = s.current
-	}
-	for p := ptr; p != nil; p, before = ptr.prev, before-1 {
-		segment[before] = Token{
-			id:     ptr.id,
-			key:    ptr.key,
-			value:  ptr.value,
-			line:   ptr.line,
-			offset: ptr.offset,
-			indent: ptr.indent,
-			string: ptr.string,
-		}
-		if before <= 0 {
-			break
+	if s.current.prev != nil && before > 0 {
+		ptr := s.current.prev
+		for i := 1; i <= before; i++ {
+			snippet[before-i] = Token{
+				id:     ptr.id,
+				key:    ptr.key,
+				value:  ptr.value,
+				line:   ptr.line,
+				offset: ptr.offset,
+				indent: ptr.indent,
+				string: ptr.string,
+			}
+			ptr = ptr.prev
+			if ptr == nil {
+				start = before - i
+				break
+			}
 		}
 	}
-	for p, i := ptr.next, 1; p != nil; p, i = p.next, i+1 {
-		segment[before+i] = Token{
-			id:     p.id,
-			key:    p.key,
-			value:  p.value,
-			line:   p.line,
-			offset: p.offset,
-			indent: p.indent,
-			string: p.string,
-		}
-		if i >= after {
-			break
+	if s.current.next != nil && after > 0 {
+		ptr := s.current.next
+		for i := 1; i <= after; i++ {
+			snippet[before+i] = Token{ // "before" is the offset of the current token within snippet
+				id:     ptr.id,
+				key:    ptr.key,
+				value:  ptr.value,
+				line:   ptr.line,
+				offset: ptr.offset,
+				indent: ptr.indent,
+				string: ptr.string,
+			}
+			ptr = ptr.next
+			if ptr == nil {
+				end = before + i // index of the last existing token; truncate the tail
+				break
+			}
 		}
 	}
-	return segment
+	if start == 0 && end == before+after {
+		return snippet
+	}
+	return snippet[start : end+1] // end is the index of the last valid token
 }
 
-// GetSnippetAsString returns tokens before and after current token as string.
+// GetSnippetAsString returns tokens before and after the current token as a string.
 // `maxStringLength` specifies max length of each token string.
 // Zero — unlimited token string length.
-// If string is greater than maxLength method removes some runes in the middle of the string.
+// If a token string is longer than maxStringLength, the method removes runes from the middle of it.
 func (s *Stream) GetSnippetAsString(before, after, maxStringLength int) string {
 	segments := s.GetSnippet(before, after)
 	str := make([]string, len(segments))
diff --git a/tokenizer_test.go b/tokenizer_test.go
index 3548565..60b682a 100644
--- a/tokenizer_test.go
+++ b/tokenizer_test.go
@@ -124,11 +124,16 @@ func TestTokenize(t *testing.T) {
 			{key: TokenKeyword, value: []byte("оди́н")},
 			{key: TokenKeyword, value: []byte("дома")},
 		}},
+		{"जब मैंने सुबह", []Token{
+			{key: TokenKeyword, value: []byte("जब")},
+			{key: TokenKeyword, value: []byte("मैंने")},
+			{key: TokenKeyword, value: []byte("सुबह")},
+		}},
 	}
 	for _, v := range diacritic {
 		t.Run(v.value, func(t *testing.T) {
 			stream := tokenizer.ParseBytes([]byte(v.value))
-			require.Equal(t, v.tokens, stream.GetSnippet(0, 0))
+			require.Equal(t, v.tokens, stream.GetSnippet(0, len(v.tokens)-1))
 		})
 	}
 })
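
A minimal usage sketch of the new option for reviewers. New(), AllowKeywordDiacriticMarkers, ParseBytes, CurrentToken, ValueUnescapedString, and GoNext all appear in the diffs above; the module import path and the IsValid iteration helper are assumptions and may need adjusting to the library's actual API.

package main

import (
	"fmt"

	"github.com/bzick/tokenizer" // assumed import path
)

func main() {
	// With kwDiacriticMarkers enabled, combining marks (Unicode category M,
	// e.g. U+0301 in "оди́н") are kept inside keywords instead of splitting them.
	parser := tokenizer.New().AllowKeywordDiacriticMarkers()
	stream := parser.ParseBytes([]byte("Ёж оди́н дома"))

	for stream.IsValid() { // IsValid is an assumed stream-iteration helper
		fmt.Println(stream.CurrentToken().ValueUnescapedString()) // Ёж, оди́н, дома
		stream.GoNext()
	}
}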
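
And a sketch of the boundary behavior the rewritten GetSnippet should exhibit, assuming the truncation fix suggested above: requesting more context than the stream holds returns a shorter slice rather than padding with zero-value tokens.

package main

import (
	"fmt"

	"github.com/bzick/tokenizer" // assumed import path
)

func main() {
	parser := tokenizer.New().AllowKeywordDiacriticMarkers()
	stream := parser.ParseBytes([]byte("Pes pije čaj")) // current token: "Pes"

	// Nothing precedes "Pes" and only two tokens follow it, so a request
	// for five tokens on each side should come back truncated to three.
	snippet := stream.GetSnippet(5, 5)
	fmt.Println(len(snippet)) // expected: 3
}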