From 4fad36482fd52fcf5f13be455ca067ba33909a3f Mon Sep 17 00:00:00 2001
From: JanErik Keller
Date: Tue, 12 Sep 2017 16:32:55 +0200
Subject: [PATCH 1/3] replaced lexerStream with text/scanner.Scanner

Feature changes:
Don't escape single quotes in double quotes: use "abc's" instead of "abc\'s"
Added support for a new quote style `abc` with unescapable content
Added support for Unicode escapes such as '\u263a'
---
 lexerStream.go         |  39 -----
 parsing.go             | 366 +++++++++++++++++++----------------------
 parsingFailure_test.go |   6 +-
 parsing_test.go        |  43 ++++-
 4 files changed, 211 insertions(+), 243 deletions(-)
 delete mode 100644 lexerStream.go

diff --git a/lexerStream.go b/lexerStream.go
deleted file mode 100644
index b72e6bd..0000000
--- a/lexerStream.go
+++ /dev/null
@@ -1,39 +0,0 @@
-package govaluate
-
-type lexerStream struct {
-	source   []rune
-	position int
-	length   int
-}
-
-func newLexerStream(source string) *lexerStream {
-
-	var ret *lexerStream
-	var runes []rune
-
-	for _, character := range source {
-		runes = append(runes, character)
-	}
-
-	ret = new(lexerStream)
-	ret.source = runes
-	ret.length = len(runes)
-	return ret
-}
-
-func (this *lexerStream) readCharacter() rune {
-
-	var character rune
-
-	character = this.source[this.position]
-	this.position += 1
-	return character
-}
-
-func (this *lexerStream) rewind(amount int) {
-	this.position -= amount
-}
-
-func (this lexerStream) canRead() bool {
-	return this.position < this.length
-}
diff --git a/parsing.go b/parsing.go
index 40c7ed2..2d8b46a 100644
--- a/parsing.go
+++ b/parsing.go
@@ -7,6 +7,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"text/scanner"
 	"time"
 	"unicode"
 )
@@ -15,17 +16,19 @@ func parseTokens(expression string, functions map[string]ExpressionFunction) ([]
 
 	var ret []ExpressionToken
 	var token ExpressionToken
-	var stream *lexerStream
+	var stream scanner.Scanner
 	var state lexerState
 	var err error
 	var found bool
 
-	stream = newLexerStream(expression)
+	reader := strings.NewReader(expression)
+	stream.Init(reader)
+	stream.Mode = scanner.ScanIdents | scanner.ScanFloats | scanner.ScanStrings | scanner.ScanRawStrings | scanner.ScanComments | scanner.SkipComments
 	state = validLexerStates[0]
 
-	for stream.canRead() {
+	for stream.Peek() != scanner.EOF {
 
-		token, err, found = readToken(stream, state, functions)
+		token, err, found = readToken(&stream, state, functions)
 
 		if err != nil {
 			return ret, err
@@ -52,7 +55,7 @@ func parseTokens(expression string, functions map[string]ExpressionFunction) ([]
 	return ret, nil
 }
 
-func readToken(stream *lexerStream, state lexerState, functions map[string]ExpressionFunction) (ExpressionToken, error, bool) {
+func readToken(stream *scanner.Scanner, state lexerState, functions map[string]ExpressionFunction) (ExpressionToken, error, bool) {
 
 	var function ExpressionFunction
 	var ret ExpressionToken
@@ -60,7 +63,6 @@ func readToken(stream *lexerStream, state lexerState, functions map[string]Expre
 	var tokenTime time.Time
 	var tokenString string
 	var kind TokenKind
-	var character rune
 	var found bool
 	var completed bool
 	var err error
@@ -71,173 +73,161 @@ func readToken(stream *lexerStream, state lexerState, functions map[string]Expre
 	// bracket always means variable
 	// symbols are anything non-alphanumeric
 	// all others read into a buffer until they reach the end of the stream
-	for stream.canRead() {
-
-		character = stream.readCharacter()
-
-		if unicode.IsSpace(character) {
-			continue
+	kind = UNKNOWN
+	switch stream.Scan() {
+	case scanner.EOF:
+		break
+	case scanner.Float:
+		kind = NUMERIC
+		tokenValue, err = 
strconv.ParseFloat(stream.TokenText(), 64) + if err != nil { + errorMsg := fmt.Sprintf("Unable to parse numeric value '%v' to float64\n", stream.TokenText()) + return ExpressionToken{}, errors.New(errorMsg), false } - - kind = UNKNOWN - - // numeric constant - if isNumeric(character) { - - if stream.canRead() && character == '0' { - character = stream.readCharacter() - - if stream.canRead() && character == 'x' { - tokenString, _ = readUntilFalse(stream, false, true, true, isHexDigit) - tokenValueInt, err := strconv.ParseUint(tokenString, 16, 64) - - if err != nil { - errorMsg := fmt.Sprintf("Unable to parse hex value '%v' to uint64\n", tokenString) - return ExpressionToken{}, errors.New(errorMsg), false - } - - kind = NUMERIC - tokenValue = float64(tokenValueInt) - break - } else { - stream.rewind(1) - } - } - - tokenString = readTokenUntilFalse(stream, isNumeric) - tokenValue, err = strconv.ParseFloat(tokenString, 64) - - if err != nil { - errorMsg := fmt.Sprintf("Unable to parse numeric value '%v' to float64\n", tokenString) - return ExpressionToken{}, errors.New(errorMsg), false - } - kind = NUMERIC - break + case scanner.Int: + kind = NUMERIC + i, err := strconv.ParseInt(stream.TokenText(), 0, 64) + tokenValue = float64(i) + if err != nil { + errorMsg := fmt.Sprintf("Unable to parse numeric value '%v' to float64\n", stream.TokenText()) + return ExpressionToken{}, errors.New(errorMsg), false } - - // comma, separator - if character == ',' { - - tokenValue = "," - kind = SEPARATOR - break + case ',': + tokenValue = "," + kind = SEPARATOR + case '[': + tokenValue, completed = readUntilFalse(stream, true, isNotClosingBracket) + kind = VARIABLE + + if !completed { + return ExpressionToken{}, errors.New("Unclosed parameter bracket"), false } - // escaped variable - if character == '[' { - - tokenValue, completed = readUntilFalse(stream, true, false, true, isNotClosingBracket) - kind = VARIABLE - - if !completed { - return ExpressionToken{}, errors.New("Unclosed parameter bracket"), false - } - - // above method normally rewinds us to the closing bracket, which we want to skip. - stream.rewind(-1) - break - } + // above method normally rewinds us to the closing bracket, which we want to skip. + stream.Next() + break + case scanner.Ident: // regular variable - or function? - if unicode.IsLetter(character) { - - tokenString = readTokenUntilFalse(stream, isVariableName) - - tokenValue = tokenString - kind = VARIABLE - - // boolean? - if tokenValue == "true" { - - kind = BOOLEAN - tokenValue = true - } else { - if tokenValue == "false" { + tokenString = stream.TokenText() - kind = BOOLEAN - tokenValue = false - } - } + //Hack for crazy escapes in variable names + if stream.Peek() == '\\' { + s, _ := readUntilFalse(stream, true, isVariableName) + tokenString = tokenString + s + } - // textual operator? - if tokenValue == "in" || tokenValue == "IN" { + tokenValue = tokenString + kind = VARIABLE + + switch tokenValue { + // boolean? + case "true": + + kind = BOOLEAN + tokenValue = true + case "false": + kind = BOOLEAN + tokenValue = false + // textual operator? + case "in", "IN": + + // force lower case for consistency + tokenValue = "in" + kind = COMPARATOR - // force lower case for consistency - tokenValue = "in" - kind = COMPARATOR - } + default: // function? function, found = functions[tokenString] if found { kind = FUNCTION tokenValue = function + break } // accessor? 
- accessorIndex := strings.Index(tokenString, ".") - if accessorIndex > 0 { - - // check that it doesn't end with a hanging period - if tokenString[len(tokenString)-1] == '.' { - errorMsg := fmt.Sprintf("Hanging accessor on token '%s'", tokenString) - return ExpressionToken{}, errors.New(errorMsg), false - } - - kind = ACCESSOR - splits := strings.Split(tokenString, ".") - tokenValue = splits - - // check that none of them are unexported - for i := 1; i < len(splits); i++ { + if stream.Peek() == '.' { + + splits := []string{tokenString} + for stream.Peek() == '.' { + stream.Scan() + // check that it doesn't end with a hanging period + if stream.Scan() != scanner.Ident { + errorMsg := fmt.Sprintf("Hanging accessor on token '%s'", tokenString) + return ExpressionToken{}, errors.New(errorMsg), false + } - firstCharacter := getFirstRune(splits[i]) + tokenString = stream.TokenText() + //Hack for crazy escapes in variable names + if stream.Peek() == '\'' { + s, _ := readUntilFalse(stream, true, isVariableName) + tokenString = tokenString + s + } + // check that none of them are unexported + firstCharacter := getFirstRune(tokenString) if unicode.ToUpper(firstCharacter) != firstCharacter { - errorMsg := fmt.Sprintf("Unable to access unexported field '%s' in token '%s'", splits[i], tokenString) + errorMsg := fmt.Sprintf("Unable to access unexported field '%s' in token '%s'", tokenString, strings.Join(splits, ".")) return ExpressionToken{}, errors.New(errorMsg), false } + + splits = append(splits, tokenString) } + + kind = ACCESSOR + tokenValue = splits } - break } - if !isNotQuote(character) { - tokenValue, completed = readUntilFalse(stream, true, false, true, isNotQuote) - - if !completed { - return ExpressionToken{}, errors.New("Unclosed string literal"), false - } + case scanner.String, scanner.RawString, '\'': + tokenString = stream.TokenText() + if tokenString == "'" { + var tokenBuffer bytes.Buffer - // advance the stream one position, since reading until false assumes the terminator is a real token - stream.rewind(-1) + for c := stream.Next(); c != '\''; c = stream.Next() { + if c == '\\' { + c = stream.Next() + if c != '\'' { + tokenBuffer.WriteRune('\\') + } + } + if c == scanner.EOF || c == '\n' { + return ExpressionToken{}, fmt.Errorf("Unclosed string literal '%s", tokenBuffer.String()), false + } + tokenBuffer.WriteRune(c) - // check to see if this can be parsed as a time. - tokenTime, found = tryParseTime(tokenValue.(string)) - if found { - kind = TIME - tokenValue = tokenTime - } else { - kind = STRING } - break + + tokenString = "\"" + strings.Replace(tokenBuffer.String(), `"`, `\"`, -1) + "\"" } - if character == '(' { - tokenValue = character - kind = CLAUSE - break + tokenValue, err = strconv.Unquote(tokenString) + + if err != nil { + return ExpressionToken{}, err, false } - if character == ')' { - tokenValue = character - kind = CLAUSE_CLOSE - break + // check to see if this can be parsed as a time. 
+ tokenTime, found = tryParseTime(tokenValue.(string)) + if found { + kind = TIME + tokenValue = tokenTime + } else { + kind = STRING } + case '(': + tokenValue = '(' + kind = CLAUSE + case ')': + tokenValue = ')' + kind = CLAUSE_CLOSE + + default: // must be a known symbol - tokenString = readTokenUntilFalse(stream, isNotAlphanumeric) + tokenString = readTokenUntilFalse(stream, isOperation) tokenValue = tokenString // quick hack for the case where "-" can mean "prefixed negation" or "minus", which are used @@ -288,20 +278,31 @@ func readToken(stream *lexerStream, state lexerState, functions map[string]Expre return ret, nil, (kind != UNKNOWN) } -func readTokenUntilFalse(stream *lexerStream, condition func(rune) bool) string { +func readTokenUntilFalse(stream *scanner.Scanner, condition func(rune) bool) string { - var ret string + var tokenBuffer bytes.Buffer + var character rune - stream.rewind(1) - ret, _ = readUntilFalse(stream, false, true, true, condition) - return ret + tokenBuffer.WriteString(stream.TokenText()) + + for { + character = stream.Peek() + if character == scanner.EOF || !condition(character) { + break + } + stream.Next() + tokenBuffer.WriteString(string(character)) + + } + + return tokenBuffer.String() } /* - Returns the string that was read until the given [condition] was false, or whitespace was broken. - Returns false if the stream ended before whitespace was broken or condition was met. + Returns the string that was read until the given [condition] was false. + Returns false if the stream ended before condition was met. */ -func readUntilFalse(stream *lexerStream, includeWhitespace bool, breakWhitespace bool, allowEscaping bool, condition func(rune) bool) (string, bool) { +func readUntilFalse(stream *scanner.Scanner, allowEscaping bool, condition func(rune) bool) (string, bool) { var tokenBuffer bytes.Buffer var character rune @@ -309,36 +310,27 @@ func readUntilFalse(stream *lexerStream, includeWhitespace bool, breakWhitespace conditioned = false - for stream.canRead() { - - character = stream.readCharacter() - - // Use backslashes to escape anything - if allowEscaping && character == '\\' { - - character = stream.readCharacter() - tokenBuffer.WriteString(string(character)) - continue + for { + character = stream.Peek() + if character == scanner.EOF { + break } + if allowEscaping && character == '\\' { + stream.Next() - if unicode.IsSpace(character) { - - if breakWhitespace && tokenBuffer.Len() > 0 { - conditioned = true + if character == stream.Peek() { break } - if !includeWhitespace { - continue - } - } - - if condition(character) { - tokenBuffer.WriteString(string(character)) - } else { + } else if !condition(character) { conditioned = true - stream.rewind(1) break } + character = stream.Next() + + // Use backslashes to escape anything + + tokenBuffer.WriteRune(character) + } return tokenBuffer.String(), conditioned @@ -418,46 +410,20 @@ func isDigit(character rune) bool { return unicode.IsDigit(character) } -func isHexDigit(character rune) bool { - - character = unicode.ToLower(character) - - return unicode.IsDigit(character) || - character == 'a' || - character == 'b' || - character == 'c' || - character == 'd' || - character == 'e' || - character == 'f' -} - -func isNumeric(character rune) bool { - - return unicode.IsDigit(character) || character == '.' 
-} - -func isNotQuote(character rune) bool { - - return character != '\'' && character != '"' -} - -func isNotAlphanumeric(character rune) bool { - - return !(unicode.IsDigit(character) || - unicode.IsLetter(character) || - character == '(' || - character == ')' || - character == '[' || - character == ']' || // starting to feel like there needs to be an `isOperation` func (#59) - !isNotQuote(character)) +func isOperation(character rune) bool { + switch character { + case '=', '!', '<', '>', '~', '&', '|', '+', '-', '*', '/', '^', '%', ':', '?': + return true + default: + return false + } } func isVariableName(character rune) bool { return unicode.IsLetter(character) || unicode.IsDigit(character) || - character == '_' || - character == '.' + character == '_' } func isNotClosingBracket(character rune) bool { diff --git a/parsingFailure_test.go b/parsingFailure_test.go index d8a3184..c2e84d7 100644 --- a/parsingFailure_test.go +++ b/parsingFailure_test.go @@ -164,7 +164,7 @@ func TestParsingFailure(test *testing.T) { Name: "Multiple radix", Input: "127.0.0.1", - Expected: INVALID_NUMERIC, + Expected: INVALID_TOKEN_TRANSITION, }, ParsingFailureTest{ @@ -188,12 +188,12 @@ func TestParsingFailure(test *testing.T) { ParsingFailureTest{ Name: "Incomplete Hex", Input: "0x", - Expected: INVALID_TOKEN_TRANSITION, + Expected: INVALID_NUMERIC, }, ParsingFailureTest{ Name: "Invalid Hex literal", Input: "0x > 0", - Expected: INVALID_HEX, + Expected: INVALID_NUMERIC, }, ParsingFailureTest{ Name: "Hex float (Unsupported)", diff --git a/parsing_test.go b/parsing_test.go index d57b809..ed50429 100644 --- a/parsing_test.go +++ b/parsing_test.go @@ -19,6 +19,47 @@ type TokenParsingTest struct { Expected []ExpressionToken } +func TestXXX(test *testing.T) { + + var ret *EvaluableExpression + var err error + + ret = new(EvaluableExpression) + ret.QueryDateFormat = isoDateFormat + ret.inputExpression = "1 ?? 2" + + ret.tokens, err = parseTokens(ret.inputExpression, map[string]ExpressionFunction{"func": nil}) + if err != nil { + panic(err) + } + + err = checkBalance(ret.tokens) + if err != nil { + panic(err) + } + + err = checkExpressionSyntax(ret.tokens) + if err != nil { + panic(err) + } + + ret.tokens, err = optimizeTokens(ret.tokens) + if err != nil { + panic(err) + } + + ret.evaluationStages, err = planStages(ret.tokens) + if err != nil { + panic(err) + } + + ret.ChecksTypes = true + + if err != nil { + test.Fail() + } +} + func TestConstantParsing(test *testing.T) { tokenParsingTests := []TokenParsingTest{ @@ -1363,7 +1404,7 @@ func TestEscapedParameters(test *testing.T) { TokenParsingTest{ Name: "String literal uses backslash to escape", - Input: "\"foo\\'bar\"", + Input: "\"foo'bar\"", Expected: []ExpressionToken{ ExpressionToken{ Kind: STRING, From b853d558141324e8f2c5bcd0be1060b05e329b3c Mon Sep 17 00:00:00 2001 From: JanErik Keller Date: Tue, 12 Sep 2017 16:41:11 +0200 Subject: [PATCH 2/3] removed silly testfunction --- parsing_test.go | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/parsing_test.go b/parsing_test.go index ed50429..aef5991 100644 --- a/parsing_test.go +++ b/parsing_test.go @@ -19,47 +19,6 @@ type TokenParsingTest struct { Expected []ExpressionToken } -func TestXXX(test *testing.T) { - - var ret *EvaluableExpression - var err error - - ret = new(EvaluableExpression) - ret.QueryDateFormat = isoDateFormat - ret.inputExpression = "1 ?? 
2"
-
-	ret.tokens, err = parseTokens(ret.inputExpression, map[string]ExpressionFunction{"func": nil})
-	if err != nil {
-		panic(err)
-	}
-
-	err = checkBalance(ret.tokens)
-	if err != nil {
-		panic(err)
-	}
-
-	err = checkExpressionSyntax(ret.tokens)
-	if err != nil {
-		panic(err)
-	}
-
-	ret.tokens, err = optimizeTokens(ret.tokens)
-	if err != nil {
-		panic(err)
-	}
-
-	ret.evaluationStages, err = planStages(ret.tokens)
-	if err != nil {
-		panic(err)
-	}
-
-	ret.ChecksTypes = true
-
-	if err != nil {
-		test.Fail()
-	}
-}
-
 func TestConstantParsing(test *testing.T) {
 
 	tokenParsingTests := []TokenParsingTest{

From 4f0c118be1d09b706960b5a9c0cdb26227870c96 Mon Sep 17 00:00:00 2001
From: JanErik Keller
Date: Wed, 13 Sep 2017 09:38:28 +0200
Subject: [PATCH 3/3] fixed logging issue revealed by GOVALUATE_TORTURE_TEST

---
 parsing.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/parsing.go b/parsing.go
index 2d8b46a..5f8981e 100644
--- a/parsing.go
+++ b/parsing.go
@@ -24,6 +24,7 @@ func parseTokens(expression string, functions map[string]ExpressionFunction) ([]
 	reader := strings.NewReader(expression)
 	stream.Init(reader)
 	stream.Mode = scanner.ScanIdents | scanner.ScanFloats | scanner.ScanStrings | scanner.ScanRawStrings | scanner.ScanComments | scanner.SkipComments
+	stream.Error = func(*scanner.Scanner, string) {}
 
 	state = validLexerStates[0]
 	for stream.Peek() != scanner.EOF {
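
Reviewer note (not part of the patches): the heart of this series is the switch from the hand-rolled lexerStream to text/scanner. The following self-contained Go sketch mirrors the scanner configuration used by the new parseTokens, including the silenced Error handler added in PATCH 3/3; the sample expression and the printing loop are made up purely for illustration.

package main

import (
	"fmt"
	"strings"
	"text/scanner"
)

func main() {
	// Same scanner configuration as the new parseTokens: identifiers, numbers,
	// "..." and `...` literals arrive as single tokens, comments are skipped,
	// and the default error handler (which prints to stderr) is silenced so
	// malformed input only surfaces through the lexer's own error returns.
	var s scanner.Scanner
	s.Init(strings.NewReader(`foo.Bar >= 0x2A && name == "abc's" // trailing comment`))
	s.Mode = scanner.ScanIdents | scanner.ScanFloats | scanner.ScanStrings |
		scanner.ScanRawStrings | scanner.ScanComments | scanner.SkipComments
	s.Error = func(*scanner.Scanner, string) {}

	// Print every token text/scanner produces until the end of the input.
	for tok := s.Scan(); tok != scanner.EOF; tok = s.Scan() {
		fmt.Printf("%-10s %q\n", scanner.TokenString(tok), s.TokenText())
	}
}

Single-character symbols such as '&' and '>' still come out of the scanner one rune at a time; stitching them into operators like "&&" and ">=" is what the new isOperation/readTokenUntilFalse pair in parsing.go takes care of.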
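
The least obvious part of the new string handling is the single-quote branch of readToken: a '...' literal is rewritten into a Go-style "..." literal and strconv.Unquote then resolves the escapes. The sketch below is a simplified standalone illustration of that idea, not the code from parsing.go: the helper name unquoteLiteral is invented here, and the patch's extra rule of dropping a backslash before a single quote inside '...' is omitted. It shows why "abc's" needs no backslash, why backtick content is kept verbatim, and how '\u263a' turns into the ☺ rune.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// unquoteLiteral is a simplified, standalone take on what the patched readToken
// does with string tokens: a '...' literal is rewritten into a Go-style "..."
// literal (escaping any embedded double quotes), and strconv.Unquote then
// resolves escapes such as \u263a. "..." and `...` literals are handed to
// Unquote unchanged, which is why a single quote inside double quotes needs no
// backslash and backtick content stays verbatim.
func unquoteLiteral(literal string) (string, error) {
	if len(literal) >= 2 && strings.HasPrefix(literal, "'") && strings.HasSuffix(literal, "'") {
		inner := literal[1 : len(literal)-1]
		literal = `"` + strings.Replace(inner, `"`, `\"`, -1) + `"`
	}
	return strconv.Unquote(literal)
}

func main() {
	for _, lit := range []string{`"abc's"`, "`abc\\'s`", `'\u263a'`} {
		value, err := unquoteLiteral(lit)
		fmt.Printf("%-12s -> %q (err: %v)\n", lit, value, err)
	}
}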
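
At the public API level, the behaviour described in the PATCH 1/3 commit message would look roughly like the program below. It assumes a build of govaluate that includes this series; NewEvaluableExpression and Evaluate are the library's existing entry points, and the sample expressions are made up.

package main

import (
	"fmt"

	"github.com/Knetic/govaluate"
)

func main() {
	// Each input exercises one of the literal forms from the PATCH 1/3 commit
	// message. Evaluating a bare literal just returns its value; the last
	// input compares a \u escape against the literal rune it encodes.
	inputs := []string{
		`"abc's"`,         // single quote inside double quotes, no backslash needed
		"`abc\\'s`",       // backtick quote: content is kept verbatim, escapes and all
		`'\u263a' == '☺'`, // Unicode escape in a single-quoted literal
	}

	for _, input := range inputs {
		expr, err := govaluate.NewEvaluableExpression(input)
		if err != nil {
			fmt.Printf("%-20s parse error: %v\n", input, err)
			continue
		}
		result, err := expr.Evaluate(nil)
		fmt.Printf("%-20s => %v (err: %v)\n", input, result, err)
	}
}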