@@ -720,11 +720,13 @@ final class Tokenizer
720
720
721
721
// Regular expressions for tokenizing
722
722
723
- private readonly string $ regexBoundaries ;
724
- private readonly string $ regexReserved ;
725
- private readonly string $ regexReservedNewline ;
726
- private readonly string $ regexReservedToplevel ;
727
- private readonly string $ regexFunction ;
723
+ private readonly string $ nextTokenRegexNumber ;
724
+ private readonly string $ nextTokenRegexBoundaryCharacter ;
725
+ private readonly string $ nextTokenRegexReservedToplevel ;
726
+ private readonly string $ nextTokenRegexReservedNewline ;
727
+ private readonly string $ nextTokenRegexReserved ;
728
+ private readonly string $ nextTokenRegexFunction ;
729
+ private readonly string $ nextTokenRegexNonReserved ;
728
730
729
731
/**
730
732
* Punctuation that can be used as a boundary between other tokens
@@ -769,25 +771,30 @@ public function __construct()
769
771
return array_keys ($ valuesMap );
770
772
};
771
773
772
- // Set up regular expressions
773
- $ this ->regexBoundaries = '( ' . implode (
774
- '| ' ,
775
- $ this ->quoteRegex ($ this ->boundaries ),
776
- ) . ') ' ;
777
- $ this ->regexReserved = '( ' . implode (
778
- '| ' ,
779
- $ this ->quoteRegex ($ sortByLengthFx ($ this ->reserved )),
780
- ) . ') ' ;
781
- $ this ->regexReservedToplevel = str_replace (' ' , '\s+ ' , '( ' . implode (
782
- '| ' ,
783
- $ this ->quoteRegex ($ sortByLengthFx ($ this ->reservedToplevel )),
784
- ) . ') ' );
785
- $ this ->regexReservedNewline = str_replace (' ' , '\s+ ' , '( ' . implode (
786
- '| ' ,
787
- $ this ->quoteRegex ($ sortByLengthFx ($ this ->reservedNewline )),
788
- ) . ') ' );
774
+ $ buildRegexFromListFx = static function ($ values ) use ($ sortByLengthFx ) {
775
+ return '(?> ' . implode (
776
+ '| ' ,
777
+ array_map (
778
+ static fn ($ v ) => preg_quote ($ v , '/ ' ),
779
+ $ sortByLengthFx ($ values ),
780
+ ),
781
+ ) . ') ' ;
782
+ };
789
783
790
- $ this ->regexFunction = '( ' . implode ('| ' , $ this ->quoteRegex ($ sortByLengthFx ($ this ->functions ))) . ') ' ;
784
+ // Set up regular expressions
785
+ $ regexBoundaries = $ buildRegexFromListFx ($ this ->boundaries );
786
+ $ regexReserved = $ buildRegexFromListFx ($ this ->reserved );
787
+ $ regexReservedToplevel = str_replace (' ' , '\s+ ' , $ buildRegexFromListFx ($ this ->reservedToplevel ));
788
+ $ regexReservedNewline = str_replace (' ' , '\s+ ' , $ buildRegexFromListFx ($ this ->reservedNewline ));
789
+ $ regexFunction = $ buildRegexFromListFx ($ this ->functions );
790
+
791
+ $ this ->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|" \'`| ' . $ regexBoundaries . ')/ ' ;
792
+ $ this ->nextTokenRegexBoundaryCharacter = '/\G ' . $ regexBoundaries . '/ ' ;
793
+ $ this ->nextTokenRegexReservedToplevel = '/\G ' . $ regexReservedToplevel . '(?=$|\s| ' . $ regexBoundaries . ')/ ' ;
794
+ $ this ->nextTokenRegexReservedNewline = '/\G ' . $ regexReservedNewline . '(?=$|\s| ' . $ regexBoundaries . ')/ ' ;
795
+ $ this ->nextTokenRegexReserved = '/\G ' . $ regexReserved . '(?=$|\s| ' . $ regexBoundaries . ')/ ' ;
796
+ $ this ->nextTokenRegexFunction = '/\G ' . $ regexFunction . '(?=\s*\()/ ' ;
797
+ $ this ->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|[" \'`]| ' . $ regexBoundaries . ')/ ' ;
791
798
}
792
799
793
800
/**
@@ -829,7 +836,6 @@ public function tokenize(string $string): Cursor
829
836
*/
830
837
private function createNextToken (string $ string , string $ upper , int $ offset , Token |null $ previous = null ): Token
831
838
{
832
- $ matches = [];
833
839
// Whitespace
834
840
if (preg_match ('/\G\s+/ ' , $ string , $ matches , 0 , $ offset )) {
835
841
return new Token (Token::TOKEN_TYPE_WHITESPACE , $ matches [0 ]);
@@ -883,9 +889,9 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
883
889
$ value = $ firstChar . $ this ->getNextQuotedString ($ string , $ offset + 1 );
884
890
} else {
885
891
// Non-quoted variable name
886
- preg_match ('/\G( ' . $ firstChar . ' [ \w.$]+) / ' , $ string , $ matches , 0 , $ offset );
892
+ preg_match ('/\G[@:][ \w.$]+/ ' , $ string , $ matches , 0 , $ offset );
887
893
if ($ matches ) {
888
- $ value = $ matches [1 ];
894
+ $ value = $ matches [0 ];
889
895
}
890
896
}
891
897
@@ -897,19 +903,19 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
897
903
// Number (decimal, binary, or hex)
898
904
if (
899
905
preg_match (
900
- ' /\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|" \' `| ' . $ this ->regexBoundaries . ' )/ ' ,
906
+ $ this ->nextTokenRegexNumber ,
901
907
$ string ,
902
908
$ matches ,
903
909
0 ,
904
910
$ offset ,
905
911
)
906
912
) {
907
- return new Token (Token::TOKEN_TYPE_NUMBER , $ matches [1 ]);
913
+ return new Token (Token::TOKEN_TYPE_NUMBER , $ matches [0 ]);
908
914
}
909
915
910
916
// Boundary Character (punctuation and symbols)
911
- if (preg_match (' /\G( ' . $ this ->regexBoundaries . ' )/ ' , $ string , $ matches , 0 , $ offset )) {
912
- return new Token (Token::TOKEN_TYPE_BOUNDARY , $ matches [1 ]);
917
+ if (preg_match ($ this ->nextTokenRegexBoundaryCharacter , $ string , $ matches , 0 , $ offset )) {
918
+ return new Token (Token::TOKEN_TYPE_BOUNDARY , $ matches [0 ]);
913
919
}
914
920
915
921
// A reserved word cannot be preceded by a '.'
@@ -918,7 +924,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
918
924
// Top Level Reserved Word
919
925
if (
920
926
preg_match (
921
- ' /\G( ' . $ this ->regexReservedToplevel . ' )($|\s| ' . $ this -> regexBoundaries . ' )/ ' ,
927
+ $ this ->nextTokenRegexReservedToplevel ,
922
928
$ upper ,
923
929
$ matches ,
924
930
0 ,
@@ -927,14 +933,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
927
933
) {
928
934
return new Token (
929
935
Token::TOKEN_TYPE_RESERVED_TOPLEVEL ,
930
- substr ($ string , $ offset , strlen ($ matches [1 ])),
936
+ substr ($ string , $ offset , strlen ($ matches [0 ])),
931
937
);
932
938
}
933
939
934
940
// Newline Reserved Word
935
941
if (
936
942
preg_match (
937
- ' /\G( ' . $ this ->regexReservedNewline . ' )($|\s| ' . $ this -> regexBoundaries . ' )/ ' ,
943
+ $ this ->nextTokenRegexReservedNewline ,
938
944
$ upper ,
939
945
$ matches ,
940
946
0 ,
@@ -943,14 +949,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
943
949
) {
944
950
return new Token (
945
951
Token::TOKEN_TYPE_RESERVED_NEWLINE ,
946
- substr ($ string , $ offset , strlen ($ matches [1 ])),
952
+ substr ($ string , $ offset , strlen ($ matches [0 ])),
947
953
);
948
954
}
949
955
950
956
// Other Reserved Word
951
957
if (
952
958
preg_match (
953
- ' /\G( ' . $ this ->regexReserved . ' )($|\s| ' . $ this -> regexBoundaries . ' )/ ' ,
959
+ $ this ->nextTokenRegexReserved ,
954
960
$ upper ,
955
961
$ matches ,
956
962
0 ,
@@ -959,40 +965,24 @@ private function createNextToken(string $string, string $upper, int $offset, Tok
959
965
) {
960
966
return new Token (
961
967
Token::TOKEN_TYPE_RESERVED ,
962
- substr ($ string , $ offset , strlen ($ matches [1 ])),
968
+ substr ($ string , $ offset , strlen ($ matches [0 ])),
963
969
);
964
970
}
965
971
}
966
972
967
973
// A function must be succeeded by '('
968
- // this makes it so "count(" is considered a function, but "count" alone is not
969
- // function
970
- if (preg_match ('/\G( ' . $ this ->regexFunction . '[(]|\s|[)])/ ' , $ upper , $ matches , 0 , $ offset )) {
974
+ // this makes it so "count(" is considered a function, but "count" alone is not a function
975
+ if (preg_match ($ this ->nextTokenRegexFunction , $ upper , $ matches , 0 , $ offset )) {
971
976
return new Token (
972
977
Token::TOKEN_TYPE_RESERVED ,
973
- substr ($ string , $ offset , strlen ($ matches [1 ]) - 1 ),
978
+ substr ($ string , $ offset , strlen ($ matches [0 ]) ),
974
979
);
975
980
}
976
981
977
982
// Non reserved word
978
- preg_match ('/\G(.*?)($|\s|[" \'`]| ' . $ this ->regexBoundaries . ')/ ' , $ string , $ matches , 0 , $ offset );
979
-
980
- return new Token (Token::TOKEN_TYPE_WORD , $ matches [1 ]);
981
- }
983
+ preg_match ($ this ->nextTokenRegexNonReserved , $ string , $ matches , 0 , $ offset );
982
984
983
- /**
984
- * Helper function for building regular expressions for reserved words and boundary characters
985
- *
986
- * @param string[] $strings The strings to be quoted
987
- *
988
- * @return string[] The quoted strings
989
- */
990
- private function quoteRegex (array $ strings ): array
991
- {
992
- return array_map (
993
- static fn (string $ string ): string => preg_quote ($ string , '/ ' ),
994
- $ strings ,
995
- );
985
+ return new Token (Token::TOKEN_TYPE_WORD , $ matches [0 ]);
996
986
}
997
987
998
988
private function getNextQuotedString (string $ string , int $ offset ): string
0 commit comments