diff --git a/README.md b/README.md index e510a2b..f949d92 100644 --- a/README.md +++ b/README.md @@ -152,3 +152,34 @@ echo $builder->build(['☺', '☹']); \xe2\x98[\xb9\xba] [\u2639\u263a] ``` + +### Adding meta-characters + +Some individual characters can be used to represent arbitrary expressions in the input strings. The requirements are that: + + 1. Only single characters (as per the input encoding) can be used. For example, `?` is allowed but not `??`. + 2. The regular expression must be valid on its own. For example, `.*` is valid but not `+`. + +In the following example, we emulate Bash-style jokers by mapping `?` to `.` and `*` to `.*`. + +```php +$builder = new s9e\RegexpBuilder\Builder([ + 'meta' => ['?' => '.', '*' => '.*'] +]); +echo $builder->build(['foo?', 'bar*']); +``` +``` +(?:bar.*|foo.) +``` + +In the following example, we map `X` to `\d`. Note that sequences produced by meta-characters may appear in character classes if the result is valid. + +```php +$builder = new s9e\RegexpBuilder\Builder([ + 'meta' => ['X' => '\\d'] +]); +echo $builder->build(['a', 'b', 'X']); +``` +``` +[\dab] +``` diff --git a/composer.json b/composer.json index 3915ba6..2765e19 100644 --- a/composer.json +++ b/composer.json @@ -1,13 +1,14 @@ { "name": "s9e/regexp-builder", - "version": "1.3.1-dev", + "version": "1.4.0", "type": "library", - "description": "Single-purpose library that generates regular expressions that match a list of literals.", + "description": "Single-purpose library that generates regular expressions that match a list of strings.", "homepage": "https://github.com/s9e/RegexpBuilder/", "keywords": ["regexp"], "license": "MIT", "require": { - "php": ">=5.5.1", + "lib-pcre": ">=7.2", + "php": ">=5.5.1", "phpunit/phpunit": "<5.8" }, "autoload": { diff --git a/src/Builder.php b/src/Builder.php index 66a95e0..3a73156 100644 --- a/src/Builder.php +++ b/src/Builder.php @@ -24,6 +24,11 @@ class Builder */ protected $input; + /** + * @var MetaCharacters + */ + 
protected $meta; + /** * @var Runner */ @@ -43,11 +48,13 @@ public function __construct(array $config = []) 'delimiter' => '/', 'input' => 'Bytes', 'inputOptions' => [], + 'meta' => [], 'output' => 'Bytes', 'outputOptions' => [] ]; $this->setInput($config['input'], $config['inputOptions']); + $this->setMeta($config['meta']); $this->setSerializer($config['output'], $config['outputOptions'], $config['delimiter']); $this->setRunner(); } @@ -68,6 +75,7 @@ public function build(array $strings) $strings = $this->splitStrings($strings); usort($strings, __CLASS__ . '::compareStrings'); + $strings = $this->meta->replaceMeta($strings); $strings = $this->runner->run($strings); return $this->serializer->serializeStrings($strings); @@ -121,6 +129,21 @@ protected function setInput($inputType, array $inputOptions) $this->input = new $className($inputOptions); } + /** + * Set the MetaCharacters instance in $this->meta + * + * @param array $map + * @return void + */ + protected function setMeta(array $map) + { + $this->meta = new MetaCharacters($this->input); + foreach ($map as $char => $expr) + { + $this->meta->add($char, $expr); + } + } + /** * Set the Runner instance $in this->runner * @@ -152,7 +175,7 @@ protected function setSerializer($outputType, array $outputOptions, $delimiter) $output = new $className($outputOptions); $escaper = new Escaper($delimiter); - $this->serializer = new Serializer($output, $escaper); + $this->serializer = new Serializer($output, $this->meta, $escaper); } /** diff --git a/src/MetaCharacters.php b/src/MetaCharacters.php new file mode 100644 index 0000000..fda313b --- /dev/null +++ b/src/MetaCharacters.php @@ -0,0 +1,218 @@ +input = $input; + } + + /** + * Add a meta-character to the list + * + * @param string $char Meta-character + * @param string $expr Regular expression + * @return void + */ + public function add($char, $expr) + { + $split = $this->input->split($char); + if (count($split) !== 1) + { + throw new 
InvalidArgumentException('Meta-characters must be represented by exactly one character'); + } + if (@preg_match('(' . $expr . ')u', '') === false) + { + throw new InvalidArgumentException("Invalid expression '" . $expr . "'"); + } + + $inputValue = $split[0]; + $metaValue = $this->computeValue($expr); + + $this->exprs[$metaValue] = $expr; + $this->meta[$inputValue] = $metaValue; + } + + /** + * Get the expression associated with a meta value + * + * @param integer $metaValue + * @return string + */ + public function getExpression($metaValue) + { + if (!isset($this->exprs[$metaValue])) + { + throw new InvalidArgumentException('Invalid meta value ' . $metaValue); + } + + return $this->exprs[$metaValue]; + } + + /** + * Return whether a given value represents a single character + * + * @param integer $value + * @return bool + */ + public function isChar($value) + { + return ($value >= 0 || ($value & self::IS_CHAR)); + } + + /** + * Return whether a given value represents a quantifiable expression + * + * @param integer $value + * @return bool + */ + public function isQuantifiable($value) + { + return ($value >= 0 || ($value & self::IS_QUANTIFIABLE)); + } + + /** + * Replace values from meta-characters in a list of strings with their meta value + * + * @param array[] $strings + * @return array[] + */ + public function replaceMeta(array $strings) + { + foreach ($strings as &$string) + { + foreach ($string as &$value) + { + if (isset($this->meta[$value])) + { + $value = $this->meta[$value]; + } + } + } + + return $strings; + } + + /** + * Compute and return a value for given expression + * + * Values are meant to be a unique negative integer. The last 2 bits indicate whether the + * expression is quantifiable and/or represents a single character. 
+ * + * @param string $expr Regular expression + * @return integer + */ + protected function computeValue($expr) + { + $value = (1 + count($this->meta)) * -4; + if ($this->exprIsChar($expr)) + { + $value |= self::IS_CHAR; + } + if ($this->exprIsQuantifiable($expr)) + { + $value |= self::IS_QUANTIFIABLE; + } + + return $value; + } + + /** + * Test whether given expression represents a single character usable in a character class + * + * @param string $expr + * @return bool + */ + protected function exprIsChar($expr) + { + $regexps = [ + // Escaped literal or escape sequence such as \w but not \R + '(^\\\\[adefhnrstvwDHNSVW\\W]$)D', + + // Unicode properties such as \pL or \p{Lu} + '(^\\\\p(?:.|\\{[^}]+\\})$)Di', + + // An escape sequence such as \x1F or \x{2600} + '(^\\\\x(?:[0-9a-f]{2}|\\{[^}]+\\})$)Di' + ]; + + return $this->matchesAny($expr, $regexps); + } + + /** + * Test whether given expression is quantifiable + * + * @param string $expr + * @return bool + */ + protected function exprIsQuantifiable($expr) + { + $regexps = [ + // A dot or \R + '(^(?:\\.|\\\\R)$)D', + + // A character class + '(^\\[\\^?(?:([^\\\\\\]]|\\\\.)(?:-(?-1))?)++\\]$)D' + ]; + + return $this->matchesAny($expr, $regexps) || $this->exprIsChar($expr); + } + + /** + * Test whether given expression matches any of the given regexps + * + * @param string $expr + * @param string[] $regexps + * @return bool + */ + protected function matchesAny($expr, array $regexps) + { + foreach ($regexps as $regexp) + { + if (preg_match($regexp, $expr)) + { + return true; + } + } + + return false; + } +} \ No newline at end of file diff --git a/src/Serializer.php b/src/Serializer.php index 1ecfdc2..242305c 100644 --- a/src/Serializer.php +++ b/src/Serializer.php @@ -7,6 +7,7 @@ */ namespace s9e\RegexpBuilder; +use s9e\RegexpBuilder\MetaCharacters; use s9e\RegexpBuilder\Output\OutputInterface; class Serializer @@ -16,6 +17,11 @@ class Serializer */ protected $escaper; + /** + * @var MetaCharacters + */ + 
protected $meta; + /** * @var OutputInterface */ @@ -23,11 +29,13 @@ class Serializer /** * @param OutputInterface $output + * @param MetaCharacters $meta * @param Escaper $escaper */ - public function __construct(OutputInterface $output, Escaper $escaper) + public function __construct(OutputInterface $output, MetaCharacters $meta, Escaper $escaper) { $this->escaper = $escaper; + $this->meta = $meta; $this->output = $output; } @@ -39,11 +47,16 @@ public function __construct(OutputInterface $output, Escaper $escaper) */ public function serializeStrings(array $strings) { - $info = $this->analyzeStrings($strings); - $alternations = $this->buildAlternations($info); - $expr = implode('|', $alternations); + $info = $this->analyzeStrings($strings); + $alternations = array_map([$this, 'serializeString'], $info['strings']); + if (!empty($info['chars'])) + { + // Prepend the character class to the list of alternations + array_unshift($alternations, $this->serializeCharacterClass($info['chars'])); + } - if (count($alternations) > 1 || $this->isOneOptionalString($info)) + $expr = implode('|', $alternations); + if ($this->needsParentheses($info)) { $expr = '(?:' . $expr . 
')'; } @@ -65,7 +78,7 @@ public function serializeStrings(array $strings) */ protected function analyzeStrings(array $strings) { - $info = ['quantifier' => '']; + $info = ['alternationsCount' => 0, 'quantifier' => '']; if ($strings[0] === []) { $info['quantifier'] = '?'; @@ -75,48 +88,29 @@ protected function analyzeStrings(array $strings) $chars = $this->getChars($strings); if (count($chars) > 1) { + ++$info['alternationsCount']; $info['chars'] = array_values($chars); - $strings = array_diff_key($strings, $chars); + $strings = array_diff_key($strings, $chars); } - $info['strings'] = array_values($strings); + $info['strings'] = array_values($strings); + $info['alternationsCount'] += count($strings); return $info; } - /** - * Build the list of alternations based on given info - * - * @param array $info - * @return string[] - */ - protected function buildAlternations(array $info) - { - $alternations = []; - if (!empty($info['chars'])) - { - $alternations[] = $this->serializeCharacterClass($info['chars']); - } - foreach ($info['strings'] as $string) - { - $alternations[] = $this->serializeString($string); - } - - return $alternations; - } - /** * Return the portion of strings that are composed of a single character * * @param array[] - * @return array String key => codepoint + * @return array String key => value */ protected function getChars(array $strings) { $chars = []; foreach ($strings as $k => $string) { - if (count($string) === 1 && !is_array($string[0])) + if ($this->isChar($string)) { $chars[$k] = $string[0]; } @@ -156,15 +150,49 @@ protected function getRanges(array $values) } /** - * Test whether a string is optional and has more than one character + * Test whether given string represents a single character + * + * @param array $string + * @return bool + */ + protected function isChar(array $string) + { + return count($string) === 1 && !is_array($string[0]) && $this->meta->isChar($string[0]); + } + + /** + * Test whether an expression is quantifiable based 
on the strings info * * @param array $info * @return bool */ - protected function isOneOptionalString(array $info) + protected function isQuantifiable(array $info) + { + $strings = $info['strings']; + + return empty($strings) || $this->isSingleQuantifiableString($strings); + } + + /** + * Test whether a list of strings contains only one single quantifiable string + * + * @param array[] $strings + * @return bool + */ + protected function isSingleQuantifiableString(array $strings) { - // Test whether the first string has a quantifier and more than one element - return (!empty($info['quantifier']) && isset($info['strings'][0][1])); + return count($strings) === 1 && count($strings[0]) === 1 && $this->meta->isQuantifiable($strings[0][0]); + } + + /** + * Test whether an expression needs parentheses based on the strings info + * + * @param array $info + * @return bool + */ + protected function needsParentheses(array $info) + { + return ($info['alternationsCount'] > 1 || ($info['quantifier'] && !$this->isQuantifiable($info))); } /** @@ -178,14 +206,14 @@ protected function serializeCharacterClass(array $values) $expr = '['; foreach ($this->getRanges($values) as list($start, $end)) { - $expr .= $this->escaper->escapeCharacterClass($this->output->output($start)); + $expr .= $this->serializeCharacterClassUnit($start); if ($end > $start) { if ($end > $start + 1) { $expr .= '-'; } - $expr .= $this->escaper->escapeCharacterClass($this->output->output($end)); + $expr .= $this->serializeCharacterClassUnit($end); } } $expr .= ']'; @@ -193,6 +221,39 @@ protected function serializeCharacterClass(array $values) return $expr; } + /** + * Serialize a given value to be used in a character class + * + * @param integer $value + * @return string + */ + protected function serializeCharacterClassUnit($value) + { + return $this->serializeValue($value, 'escapeCharacterClass'); + } + + /** + * Serialize an element from a string + * + * @param array|integer $element + * @return string + */ + 
protected function serializeElement($element) + { + return (is_array($element)) ? $this->serializeStrings($element) : $this->serializeLiteral($element); + } + + /** + * Serialize a given value to be used as a literal + * + * @param integer $value + * @return string + */ + protected function serializeLiteral($value) + { + return $this->serializeValue($value, 'escapeLiteral'); + } + /** * Serialize a given string into a regular expression * @@ -201,12 +262,18 @@ protected function serializeCharacterClass(array $values) */ protected function serializeString(array $string) { - $expr = ''; - foreach ($string as $element) - { - $expr .= (is_array($element)) ? $this->serializeStrings($element) : $this->escaper->escapeLiteral($this->output->output($element)); - } + return implode('', array_map([$this, 'serializeElement'], $string)); + } - return $expr; + /** + * Serialize a given value + * + * @param integer $value + * @param string $escapeMethod + * @return string + */ + protected function serializeValue($value, $escapeMethod) + { + return ($value < 0) ? $this->meta->getExpression($value) : $this->escaper->$escapeMethod($this->output->output($value)); } } \ No newline at end of file diff --git a/tests/BuilderTest.php b/tests/BuilderTest.php index 649635a..cbb69f2 100644 --- a/tests/BuilderTest.php +++ b/tests/BuilderTest.php @@ -107,6 +107,31 @@ public function getBuilderTests() 'output' => 'JavaScript' ] ], + [ + ['x?'], + 'x.', + ['meta' => ['?' => '.']] + ], + [ + ['x', 'x?'], + 'x.?', + ['meta' => ['?' => '.']] + ], + [ + ['x?', 'xa', 'xb'], + 'x[\\dab]', + ['meta' => ['?' => '\\d']] + ], + [ + ['b', 'bX'], + 'b(?:xx)?', + ['meta' => ['X' => 'xx']] + ], + [ + ["\n", '.'], + '(?:\\n|.)', + ['meta' => ['.' 
=> '.'], 'output' => 'PHP'] + ], ]; } } \ No newline at end of file diff --git a/tests/MetaCharactersTest.php b/tests/MetaCharactersTest.php new file mode 100644 index 0000000..b98a0f0 --- /dev/null +++ b/tests/MetaCharactersTest.php @@ -0,0 +1,106 @@ + $expr) + { + $meta->add($char, $expr); + } + + return $meta; + } + + /** + * @testdox Using multiple chars as meta-character throws an exception + * @expectedException InvalidArgumentException + * @expectedExceptionMessage Meta-characters must be represented by exactly one character + */ + public function testMultipleCharsException() + { + $this->getMeta(['xx' => 'x']); + } + + /** + * @testdox Invalid expressions throw an exception + * @expectedException InvalidArgumentException + * @expectedExceptionMessage Invalid expression '+++' + */ + public function testInvalidExceptionException() + { + $this->getMeta(['x' => '+++']); + } + + /** + * @testdox getExpression() returns the original expression that matches the given meta value + */ + public function testGetExpression() + { + $meta = $this->getMeta(["\0" => 'foo', "\1" => 'bar']); + $strings = $meta->replaceMeta([[0, 1]]); + + $this->assertEquals('foo', $meta->getExpression($strings[0][0])); + $this->assertEquals('bar', $meta->getExpression($strings[0][1])); + } + + /** + * @testdox getExpression() throws an exception on unknown meta values + * @expectedException InvalidArgumentException + * @expectedExceptionMessage Invalid meta value -1 + */ + public function testGetExpressionException() + { + $this->getMeta([])->getExpression(-1); + } + + /** + * @testdox Meta-characters properties + * @dataProvider getPropertiesTests + */ + public function testProperties($properties, $expr) + { + $meta = $this->getMeta(["\0" => $expr]); + $strings = $meta->replaceMeta([[0]]); + + $map = [ + 'c' => 'isChar', + 'q' => 'isQuantifiable' + ]; + foreach ($map as $c => $methodName) + { + $assertMethod = (strpos($properties, $c) === false) ? 
'assertFalse' : 'assertTrue'; + $msg = $methodName . '(' . var_export($expr, true) . ')'; + $this->$assertMethod($meta->$methodName($strings[0][0]), $msg); + } + } + + public function getPropertiesTests() + { + return [ + ['cq', '\\w' ], + ['cq', '\\d' ], + ['cq', '\\x{2600}'], + ['cq', '\\pL' ], + ['cq', '\\p{^L}' ], + ['q', '.' ], + ['q', '\\R' ], + ['q', '[0-9]' ], + ['', '[0-9]+' ], + ['', '.*' ], + ['', 'xx' ], + ['', '^' ], + ['', '$' ], + ]; + } +} \ No newline at end of file diff --git a/tests/SerializerTest.php b/tests/SerializerTest.php index 02097ad..034c403 100644 --- a/tests/SerializerTest.php +++ b/tests/SerializerTest.php @@ -4,7 +4,9 @@ use PHPUnit_Framework_TestCase; use s9e\RegexpBuilder\Escaper; -use s9e\RegexpBuilder\Output\Bytes; +use s9e\RegexpBuilder\Input\Bytes as Input; +use s9e\RegexpBuilder\MetaCharacters; +use s9e\RegexpBuilder\Output\Bytes as Output; use s9e\RegexpBuilder\Serializer; /** @@ -17,7 +19,7 @@ class SerializerTest extends PHPUnit_Framework_TestCase */ public function test($original, $expected) { - $serializer = new Serializer(new Bytes, new Escaper); + $serializer = new Serializer(new Output, new MetaCharacters(new Input), new Escaper); $this->assertSame($expected, $serializer->serializeStrings($original, false)); }