Skip to content

Commit 9fcd0ac

Browse files
authored
Merge pull request #131 from mvorisek/fix_perf2
Improve tokenizer regex matching
2 parents e53096c + fda3d6e commit 9fcd0ac

File tree

1 file changed

+47
-57
lines changed

1 file changed

+47
-57
lines changed

src/Tokenizer.php

+47-57
Original file line number | Diff line number | Diff line change
@@ -720,11 +720,13 @@ final class Tokenizer
720720

721721
// Regular expressions for tokenizing
722722

723-
private readonly string $regexBoundaries;
724-
private readonly string $regexReserved;
725-
private readonly string $regexReservedNewline;
726-
private readonly string $regexReservedToplevel;
727-
private readonly string $regexFunction;
723+
private readonly string $nextTokenRegexNumber;
724+
private readonly string $nextTokenRegexBoundaryCharacter;
725+
private readonly string $nextTokenRegexReservedToplevel;
726+
private readonly string $nextTokenRegexReservedNewline;
727+
private readonly string $nextTokenRegexReserved;
728+
private readonly string $nextTokenRegexFunction;
729+
private readonly string $nextTokenRegexNonReserved;
728730

729731
/**
730732
* Punctuation that can be used as a boundary between other tokens
@@ -769,25 +771,30 @@ public function __construct()
769771
return array_keys($valuesMap);
770772
};
771773

772-
// Set up regular expressions
773-
$this->regexBoundaries = '(' . implode(
774-
'|',
775-
$this->quoteRegex($this->boundaries),
776-
) . ')';
777-
$this->regexReserved = '(' . implode(
778-
'|',
779-
$this->quoteRegex($sortByLengthFx($this->reserved)),
780-
) . ')';
781-
$this->regexReservedToplevel = str_replace(' ', '\s+', '(' . implode(
782-
'|',
783-
$this->quoteRegex($sortByLengthFx($this->reservedToplevel)),
784-
) . ')');
785-
$this->regexReservedNewline = str_replace(' ', '\s+', '(' . implode(
786-
'|',
787-
$this->quoteRegex($sortByLengthFx($this->reservedNewline)),
788-
) . ')');
774+
$buildRegexFromListFx = static function ($values) use ($sortByLengthFx) {
775+
return '(?>' . implode(
776+
'|',
777+
array_map(
778+
static fn ($v) => preg_quote($v, '/'),
779+
$sortByLengthFx($values),
780+
),
781+
) . ')';
782+
};
789783

790-
$this->regexFunction = '(' . implode('|', $this->quoteRegex($sortByLengthFx($this->functions))) . ')';
784+
// Set up regular expressions
785+
$regexBoundaries = $buildRegexFromListFx($this->boundaries);
786+
$regexReserved = $buildRegexFromListFx($this->reserved);
787+
$regexReservedToplevel = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedToplevel));
788+
$regexReservedNewline = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedNewline));
789+
$regexFunction = $buildRegexFromListFx($this->functions);
790+
791+
$this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
792+
$this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
793+
$this->nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/';
794+
$this->nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/';
795+
$this->nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/';
796+
$this->nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/';
797+
$this->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
791798
}
792799

793800
/**
@@ -829,7 +836,6 @@ public function tokenize(string $string): Cursor
829836
*/
830837
private function createNextToken(string $string, string $upper, int $offset, Token|null $previous = null): Token
831838
{
832-
$matches = [];
833839
// Whitespace
834840
if (preg_match('/\G\s+/', $string, $matches, 0, $offset)) {
835841
return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]);
@@ -883,9 +889,9 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
883889
$value = $firstChar . $this->getNextQuotedString($string, $offset + 1);
884890
} else {
885891
// Non-quoted variable name
886-
preg_match('/\G(' . $firstChar . '[\w.$]+)/', $string, $matches, 0, $offset);
892+
preg_match('/\G[@:][\w.$]+/', $string, $matches, 0, $offset);
887893
if ($matches) {
888-
$value = $matches[1];
894+
$value = $matches[0];
889895
}
890896
}
891897

@@ -897,19 +903,19 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
897903
// Number (decimal, binary, or hex)
898904
if (
899905
preg_match(
900-
'/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/',
906+
$this->nextTokenRegexNumber,
901907
$string,
902908
$matches,
903909
0,
904910
$offset,
905911
)
906912
) {
907-
return new Token(Token::TOKEN_TYPE_NUMBER, $matches[1]);
913+
return new Token(Token::TOKEN_TYPE_NUMBER, $matches[0]);
908914
}
909915

910916
// Boundary Character (punctuation and symbols)
911-
if (preg_match('/\G(' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset)) {
912-
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]);
917+
if (preg_match($this->nextTokenRegexBoundaryCharacter, $string, $matches, 0, $offset)) {
918+
return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[0]);
913919
}
914920

915921
// A reserved word cannot be preceded by a '.'
@@ -918,7 +924,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
918924
// Top Level Reserved Word
919925
if (
920926
preg_match(
921-
'/\G(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/',
927+
$this->nextTokenRegexReservedToplevel,
922928
$upper,
923929
$matches,
924930
0,
@@ -927,14 +933,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
927933
) {
928934
return new Token(
929935
Token::TOKEN_TYPE_RESERVED_TOPLEVEL,
930-
substr($string, $offset, strlen($matches[1])),
936+
substr($string, $offset, strlen($matches[0])),
931937
);
932938
}
933939

934940
// Newline Reserved Word
935941
if (
936942
preg_match(
937-
'/\G(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/',
943+
$this->nextTokenRegexReservedNewline,
938944
$upper,
939945
$matches,
940946
0,
@@ -943,14 +949,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
943949
) {
944950
return new Token(
945951
Token::TOKEN_TYPE_RESERVED_NEWLINE,
946-
substr($string, $offset, strlen($matches[1])),
952+
substr($string, $offset, strlen($matches[0])),
947953
);
948954
}
949955

950956
// Other Reserved Word
951957
if (
952958
preg_match(
953-
'/\G(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/',
959+
$this->nextTokenRegexReserved,
954960
$upper,
955961
$matches,
956962
0,
@@ -959,40 +965,24 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
959965
) {
960966
return new Token(
961967
Token::TOKEN_TYPE_RESERVED,
962-
substr($string, $offset, strlen($matches[1])),
968+
substr($string, $offset, strlen($matches[0])),
963969
);
964970
}
965971
}
966972

967973
// A function must be succeeded by '('
968-
// this makes it so "count(" is considered a function, but "count" alone is not
969-
// function
970-
if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches, 0, $offset)) {
974+
// this makes it so "count(" is considered a function, but "count" alone is not a function
975+
if (preg_match($this->nextTokenRegexFunction, $upper, $matches, 0, $offset)) {
971976
return new Token(
972977
Token::TOKEN_TYPE_RESERVED,
973-
substr($string, $offset, strlen($matches[1]) - 1),
978+
substr($string, $offset, strlen($matches[0])),
974979
);
975980
}
976981

977982
// Non reserved word
978-
preg_match('/\G(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset);
979-
980-
return new Token(Token::TOKEN_TYPE_WORD, $matches[1]);
981-
}
983+
preg_match($this->nextTokenRegexNonReserved, $string, $matches, 0, $offset);
982984

983-
/**
984-
* Helper function for building regular expressions for reserved words and boundary characters
985-
*
986-
* @param string[] $strings The strings to be quoted
987-
*
988-
* @return string[] The quoted strings
989-
*/
990-
private function quoteRegex(array $strings): array
991-
{
992-
return array_map(
993-
static fn (string $string): string => preg_quote($string, '/'),
994-
$strings,
995-
);
985+
return new Token(Token::TOKEN_TYPE_WORD, $matches[0]);
996986
}
997987

998988
private function getNextQuotedString(string $string, int $offset): string

0 commit comments

Comments
 (0)