4
4
5
5
namespace Doctrine \SqlFormatter ;
6
6
7
- use function array_combine ;
8
- use function array_keys ;
9
7
use function array_map ;
10
- use function arsort ;
11
- use function assert ;
12
- use function implode ;
8
+ use function count ;
9
+ use function is_int ;
13
10
use function preg_match ;
14
11
use function preg_quote ;
12
+ use function reset ;
15
13
use function str_replace ;
14
+ use function str_starts_with ;
16
15
use function strlen ;
17
16
use function strpos ;
18
17
use function strtoupper ;
19
18
use function substr ;
19
+ use function usort ;
20
20
21
21
/** @internal */
22
22
final class Tokenizer
@@ -762,31 +762,12 @@ final class Tokenizer
762
762
*/
763
763
public function __construct ()
764
764
{
765
- // Sort list from longest word to shortest, 3x faster than usort
766
- $ sortByLengthFx = static function ($ values ) {
767
- $ valuesMap = array_combine ($ values , array_map (strlen (...), $ values ));
768
- assert ($ valuesMap !== false );
769
- arsort ($ valuesMap );
770
-
771
- return array_keys ($ valuesMap );
772
- };
773
-
774
- $ buildRegexFromListFx = static function ($ values ) use ($ sortByLengthFx ) {
775
- return '(?> ' . implode (
776
- '| ' ,
777
- array_map (
778
- static fn ($ v ) => preg_quote ($ v , '/ ' ),
779
- $ sortByLengthFx ($ values ),
780
- ),
781
- ) . ') ' ;
782
- };
783
-
784
765
// Set up regular expressions
785
- $ regexBoundaries = $ buildRegexFromListFx ($ this ->boundaries );
786
- $ regexReserved = $ buildRegexFromListFx ($ this ->reserved );
787
- $ regexReservedToplevel = str_replace (' ' , '\s+ ' , $ buildRegexFromListFx ($ this ->reservedToplevel ));
788
- $ regexReservedNewline = str_replace (' ' , '\s+ ' , $ buildRegexFromListFx ($ this ->reservedNewline ));
789
- $ regexFunction = $ buildRegexFromListFx ($ this ->functions );
766
+ $ regexBoundaries = $ this -> makeRegexFromList ($ this ->boundaries );
767
+ $ regexReserved = $ this -> makeRegexFromList ($ this ->reserved );
768
+ $ regexReservedToplevel = str_replace (' ' , '\s+ ' , $ this -> makeRegexFromList ($ this ->reservedToplevel ));
769
+ $ regexReservedNewline = str_replace (' ' , '\s+ ' , $ this -> makeRegexFromList ($ this ->reservedNewline ));
770
+ $ regexFunction = $ this -> makeRegexFromList ($ this ->functions );
790
771
791
772
$ this ->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|" \'`| ' . $ regexBoundaries . ')/ ' ;
792
773
$ this ->nextTokenRegexBoundaryCharacter = '/\G ' . $ regexBoundaries . '/ ' ;
@@ -797,6 +778,75 @@ public function __construct()
797
778
$ this ->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|[" \'`]| ' . $ regexBoundaries . ')/ ' ;
798
779
}
799
780
781
/**
 * Make regex from a list of values matching longest value first.
 *
 * Values that start with the same character are grouped under their longest
 * common prefix; the prefix is emitted once and the remainders are turned into
 * a nested atomic group by a recursive call. This way every alternative branch
 * is tried only once, see https://github.com/PCRE2Project/pcre2/issues/411 .
 *
 * Duplicate values are ignored.
 *
 * @param list<string> $values
 * @param bool         $sorted Whether $values is already in the order produced by the
 *                             sort below (passed as true by the recursive calls)
 *
 * @return string An atomic group "(?>...)" with every literal part quoted for the "/" delimiter
 */
private function makeRegexFromList(array $values, bool $sorted = false): string
{
    // Sort alphabetically, except that when one value is a prefix of the other
    // the longer one goes first so that it is tried before the shorter one.
    if (! $sorted) {
        usort($values, static function (string $a, string $b): int {
            return str_starts_with($a, $b) || str_starts_with($b, $a)
                ? strlen($b) <=> strlen($a)
                : $a <=> $b;
        });
    }

    /** @var array<int|string, list<string>> $valuesBySharedPrefix */
    $valuesBySharedPrefix = [];
    $items                = [];
    $prefix               = null;
    $previous             = null;

    foreach ($values as $v) {
        // Equal values are adjacent once sorted. Keeping both would produce a group
        // whose remainders are all '' and the recursion below would never terminate.
        if ($v === $previous) {
            continue;
        }

        $previous = $v;

        // A value starting with a different character closes the current group.
        if ($prefix !== null && ! str_starts_with($v, substr($prefix, 0, 1))) {
            $valuesBySharedPrefix[$prefix] = $items;
            $items                         = [];
            $prefix                        = null;
        }

        $items[] = $v;

        if ($prefix === null) {
            $prefix = $v;

            continue;
        }

        // Shrink the prefix until the new value shares it too. This terminates
        // because the first character is known to match (or the prefix is '').
        while (! str_starts_with($v, $prefix)) {
            $prefix = substr($prefix, 0, -1);
        }
    }

    if ($items !== []) {
        $valuesBySharedPrefix[$prefix] = $items;
    }

    $regex = '(?>';

    foreach ($valuesBySharedPrefix as $prefix => $items) {
        if ($regex !== '(?>') {
            $regex .= '|';
        }

        // PHP casts integer-like string keys (e.g. "1") to int; restore the string.
        if (is_int($prefix)) {
            $prefix = (string) $prefix;
        }

        $prefixLength = strlen($prefix);

        $regex .= preg_quote($prefix, '/');

        $regex .= count($items) === 1
            ? preg_quote(substr($items[0], $prefixLength), '/')
            : $this->makeRegexFromList(
                array_map(static fn (string $v): string => substr($v, $prefixLength), $items),
                true,
            );
    }

    return $regex . ')';
}

849
+
800
850
/**
801
851
* Takes a SQL string and breaks it into tokens.
802
852
* Each token is an associative array with type and value.
0 commit comments