Skip to content

Commit b426bc5

Browse files
authored
Merge pull request #132 from mvorisek/fix_perf_build_regexes_optimized
Build optimized regex from string list
2 parents 9fcd0ac + 41606cd commit b426bc5

File tree

1 file changed

+79
-29
lines changed

1 file changed

+79
-29
lines changed

src/Tokenizer.php

+79-29
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,19 @@
44

55
namespace Doctrine\SqlFormatter;
66

7-
use function array_combine;
8-
use function array_keys;
97
use function array_map;
10-
use function arsort;
11-
use function assert;
12-
use function implode;
8+
use function count;
9+
use function is_int;
1310
use function preg_match;
1411
use function preg_quote;
12+
use function reset;
1513
use function str_replace;
14+
use function str_starts_with;
1615
use function strlen;
1716
use function strpos;
1817
use function strtoupper;
1918
use function substr;
19+
use function usort;
2020

2121
/** @internal */
2222
final class Tokenizer
@@ -762,31 +762,12 @@ final class Tokenizer
762762
*/
763763
public function __construct()
764764
{
765-
// Sort list from longest word to shortest, 3x faster than usort
766-
$sortByLengthFx = static function ($values) {
767-
$valuesMap = array_combine($values, array_map(strlen(...), $values));
768-
assert($valuesMap !== false);
769-
arsort($valuesMap);
770-
771-
return array_keys($valuesMap);
772-
};
773-
774-
$buildRegexFromListFx = static function ($values) use ($sortByLengthFx) {
775-
return '(?>' . implode(
776-
'|',
777-
array_map(
778-
static fn ($v) => preg_quote($v, '/'),
779-
$sortByLengthFx($values),
780-
),
781-
) . ')';
782-
};
783-
784765
// Set up regular expressions
785-
$regexBoundaries = $buildRegexFromListFx($this->boundaries);
786-
$regexReserved = $buildRegexFromListFx($this->reserved);
787-
$regexReservedToplevel = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedToplevel));
788-
$regexReservedNewline = str_replace(' ', '\s+', $buildRegexFromListFx($this->reservedNewline));
789-
$regexFunction = $buildRegexFromListFx($this->functions);
766+
$regexBoundaries = $this->makeRegexFromList($this->boundaries);
767+
$regexReserved = $this->makeRegexFromList($this->reserved);
768+
$regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel));
769+
$regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline));
770+
$regexFunction = $this->makeRegexFromList($this->functions);
790771

791772
$this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
792773
$this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
@@ -797,6 +778,75 @@ public function __construct()
797778
$this->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
798779
}
799780

781+
/**
 * Make regex from a list of values matching longest value first.
 *
 * Values that start with the same character are merged into a single
 * alternative: their longest common prefix is emitted once, followed by a
 * nested group (built recursively) for the differing remainders.
 *
 * Optimized for speed by matching alternative branch only once
 * https://github.com/PCRE2Project/pcre2/issues/411 .
 *
 * @param list<string> $values values to match literally; repeated values are emitted once
 * @param bool         $sorted true when $values is already in the order produced by this
 *                             method's own sort (used by the recursive calls)
 *
 * @return string an atomic group `(?>...)` quoted for the `/` delimiter; `(?>)` for an empty list
 */
private function makeRegexFromList(array $values, bool $sorted = false): string
{
    // sort list alphabetically and from longest word to shortest
    if (! $sorted) {
        usort($values, static function (string $a, string $b): int {
            // When one value is a prefix of the other, the longer one must be tried first.
            // Otherwise compare byte-wise: `<=>` would compare numeric strings as numbers
            // ('9' < '10'), which contradicts the prefix rule and makes the order inconsistent.
            return str_starts_with($a, $b) || str_starts_with($b, $a)
                ? strlen($b) <=> strlen($a)
                : \strcmp($a, $b);
        });
    }

    /** @var array<int|string, list<string>> $valuesBySharedPrefix */
    $valuesBySharedPrefix = [];
    $items                = [];
    $prefix               = null;
    $previous             = null;

    foreach ($values as $v) {
        // Equal values are adjacent after sorting; keeping both would produce a group
        // of identical items that can never be reduced and would recurse forever.
        if ($v === $previous) {
            continue;
        }

        $previous = $v;

        // A different first character starts a new group: store the finished one.
        if ($prefix !== null && ! str_starts_with($v, substr($prefix, 0, 1))) {
            $valuesBySharedPrefix[$prefix] = $items;
            $items                         = [];
            $prefix                        = null;
        }

        $items[] = $v;

        if ($prefix === null) {
            $prefix = $v;
        } else {
            // Shrink the prefix until it is shared by the current value too.
            while (! str_starts_with($v, $prefix)) {
                $prefix = substr($prefix, 0, -1);
            }
        }
    }

    if ($items !== []) {
        $valuesBySharedPrefix[$prefix] = $items;
    }

    $regex = '(?>';

    foreach ($valuesBySharedPrefix as $prefix => $items) {
        if ($regex !== '(?>') {
            $regex .= '|';
        }

        // PHP converts integer-like string keys (e.g. "8") to int; restore the string.
        if (is_int($prefix)) {
            $prefix = (string) $prefix;
        }

        $regex .= preg_quote($prefix, '/');

        // A single value needs no nested group; otherwise recurse on the remainders,
        // which keep their relative order and therefore are still sorted.
        $regex .= count($items) === 1
            ? preg_quote(substr(reset($items), strlen($prefix)), '/')
            : $this->makeRegexFromList(array_map(static fn ($v) => substr($v, strlen($prefix)), $items), true);
    }

    return $regex . ')';
}
849+
800850
/**
801851
* Takes a SQL string and breaks it into tokens.
802852
* Each token is an associative array with type and value.

0 commit comments

Comments
 (0)