Skip to content

WIP #34: add diacritics and signs #37

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,11 @@ func (p *parsing) parseKeyword() bool {
var size int
p.ensureBytes(4)
r, size = utf8.DecodeRune(p.slice(p.pos, p.pos+4))
if unicode.IsLetter(r) || runeExists(p.t.kwMajorSymbols, r) || (start != -1 && runeExists(p.t.kwMinorSymbols, r)) {
if unicode.IsLetter(r) ||
(p.t.kwDiacriticMarkers && unicode.IsMark(r)) ||
runeExists(p.t.kwMajorSymbols, r) ||
(start != -1 && runeExists(p.t.kwMinorSymbols, r)) {

if start == -1 {
start = p.pos
}
Expand Down
110 changes: 56 additions & 54 deletions stream.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ func (s *Stream) PrevToken() *Token {
}

// NextToken returns next token from the stream.
// If next token doesn't exist, the method returns TypeUndef token.
// If the next token doesn't exist, the method returns a TypeUndef token.
// Do not save a result (Token) into variables — the next token may be changed at any time.
func (s *Stream) NextToken() *Token {
if s.current.next != nil {
Expand All @@ -269,8 +269,7 @@ func (s *Stream) NextToken() *Token {
}

// GoNextIfNextIs moves the stream pointer to the next token if the next token has specific token keys.
// If keys matched pointer will be updated and the method returned true.
// Otherwise, returned false.
// If a key matches, the pointer is updated and the method returns true. Otherwise, it returns false.
func (s *Stream) GoNextIfNextIs(key TokenKey, otherKeys ...TokenKey) bool {
if s.NextToken().Is(key, otherKeys...) {
s.GoNext()
Expand All @@ -280,68 +279,71 @@ func (s *Stream) GoNextIfNextIs(key TokenKey, otherKeys ...TokenKey) bool {
}

// GetSnippet returns slice of tokens.
// Slice generated from current token position and include tokens before and after current token.
// The slice is generated from the current token position and includes the given number of tokens before and after the current token.
func (s *Stream) GetSnippet(before, after int) []Token {
var segment []Token
if s.current == undefToken {
if s.prev != nil && before > s.prev.id-s.head.id {
before = s.prev.id - s.head.id
} else {
before = 0
}
} else if before > s.current.id-s.head.id {
before = s.current.id - s.head.id
if s.current == nil {
return nil
}
if after > s.len-before-1 {
after = s.len - before - 1
snippet := make([]Token, before+after+1)
start := 0
end := before + after
snippet[before] = Token{
id: s.current.id,
key: s.current.key,
value: s.current.value,
line: s.current.line,
offset: s.current.offset,
indent: s.current.indent,
string: s.current.string,
}
segment = make([]Token, before+after+1)
if len(segment) == 0 {
return segment
}
var ptr *Token
if s.next != nil {
ptr = s.next
} else if s.prev != nil {
ptr = s.prev
} else {
ptr = s.current
}
for p := ptr; p != nil; p, before = ptr.prev, before-1 {
segment[before] = Token{
id: ptr.id,
key: ptr.key,
value: ptr.value,
line: ptr.line,
offset: ptr.offset,
indent: ptr.indent,
string: ptr.string,
}
if before <= 0 {
break
if s.current.prev != nil && before > 0 {
ptr := s.current.prev
for i := 1; i <= before; i++ {
snippet[before-i] = Token{
id: ptr.id,
key: ptr.key,
value: ptr.value,
line: ptr.line,
offset: ptr.offset,
indent: ptr.indent,
string: ptr.string,
}
ptr = ptr.prev
if ptr == nil {
start = before - i
break
}
}
}
for p, i := ptr.next, 1; p != nil; p, i = p.next, i+1 {
segment[before+i] = Token{
id: p.id,
key: p.key,
value: p.value,
line: p.line,
offset: p.offset,
indent: p.indent,
string: p.string,
}
if i >= after {
break
if s.current.next != nil && after > 0 {
ptr := s.current.next
for i := 1; i <= after; i++ {
snippet[before+i] = Token{ // before - is offset
id: ptr.id,
key: ptr.key,
value: ptr.value,
line: ptr.line,
offset: ptr.offset,
indent: ptr.indent,
string: ptr.string,
}
ptr = ptr.next
if ptr == nil {
end = -i
break
}
}
}
return segment
if start == 0 && end == before+after {
return snippet
}
return snippet[start:end]
}

// GetSnippetAsString returns tokens before and after current token as string.
// GetSnippetAsString returns tokens before and after a current token as string.
// `maxStringLength` specifies max length of each token string.
// Zero — unlimited token string length.
// If string is greater than maxLength method removes some runes in the middle of the string.
// If a string is greater than maxLength method removes some runes in the middle of the string.
func (s *Stream) GetSnippetAsString(before, after, maxStringLength int) string {
segments := s.GetSnippet(before, after)
str := make([]string, len(segments))
Expand Down
22 changes: 15 additions & 7 deletions tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,14 @@ type Tokenizer struct {
stopOnUnknown bool
allowNumberUnderscore bool
// all defined custom tokens {key: [token1, token2, ...], ...}
tokens map[TokenKey][]*tokenRef
index map[byte][]*tokenRef
quotes []*StringSettings
wSpaces []byte
kwMajorSymbols []rune
kwMinorSymbols []rune
pool sync.Pool
tokens map[TokenKey][]*tokenRef
index map[byte][]*tokenRef
quotes []*StringSettings
wSpaces []byte
kwMajorSymbols []rune
kwMinorSymbols []rune
kwDiacriticMarkers bool
pool sync.Pool
}

// New creates new tokenizer.
Expand Down Expand Up @@ -176,6 +177,13 @@ func (t *Tokenizer) AllowKeywordSymbols(majorSymbols []rune, minorSymbols []rune
return t
}

// AllowKeywordDiacriticMarkers enables the support for diacritic markers in keywords.
// When enabled, runes for which unicode.IsMark reports true (combining accents,
// vowel signs, and similar marks) are accepted as part of a keyword token during
// parsing. Returns the tokenizer to allow fluent chaining.
// See https://en.wikipedia.org/wiki/Diacritic
func (t *Tokenizer) AllowKeywordDiacriticMarkers() *Tokenizer {
	t.kwDiacriticMarkers = true
	return t
}

// AllowKeywordUnderscore allows underscore symbol in keywords, like `one_two` or `_three`
//
// Deprecated: use AllowKeywordSymbols
Expand Down
41 changes: 41 additions & 0 deletions tokenizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ func TestTokenize(t *testing.T) {
value interface{}
token Token
}
type items struct {
value string
tokens []Token
}
tokenizer := New()
condTokenKey := TokenKey(10)
wordTokenKey := TokenKey(11)
Expand Down Expand Up @@ -96,6 +100,43 @@ func TestTokenize(t *testing.T) {
require.Equal(t, v.value, stream.CurrentToken().ValueUnescapedString(), "value %s -> %s: %s", v.token.value, v.value, stream.CurrentToken().Value())
}
})

t.Run("diacritic", func(t *testing.T) {
diacritic := []items{
{"Pes pije čaj", []Token{
{key: TokenKeyword, value: []byte("Pes")},
{key: TokenKeyword, value: []byte("pije")},
{key: TokenKeyword, value: []byte("čaj")},
}},
{"L’élève visite Paris", []Token{
{key: TokenKeyword, value: []byte("L’élève")},
{key: TokenKeyword, value: []byte("visite")},
{key: TokenKeyword, value: []byte("Paris")},
}},
{"Dieses Haus ist schön", []Token{
{key: TokenKeyword, value: []byte("Dieses")},
{key: TokenKeyword, value: []byte("Haus")},
{key: TokenKeyword, value: []byte("ist")},
{key: TokenKeyword, value: []byte("schön")},
}},
{"Ёж оди́н дома", []Token{
{key: TokenKeyword, value: []byte("Ёж")},
{key: TokenKeyword, value: []byte("оди́н")},
{key: TokenKeyword, value: []byte("дома")},
}},
{"जब मैंने सुबह", []Token{
{key: TokenKeyword, value: []byte("जब")},
{key: TokenKeyword, value: []byte("मैंने")},
{key: TokenKeyword, value: []byte("सुबह")},
}},
}
for _, v := range diacritic {
t.Run(v.value, func(t *testing.T) {
stream := tokenizer.ParseBytes([]byte(v.value))
require.Equal(t, v.tokens, stream.GetSnippet(0, 1))
})
}
})
}

func TestTokenizeEdgeCases(t *testing.T) {
Expand Down
Loading