Skip to content

Commit

Permalink
Added support for meta-characters
Browse files Browse the repository at this point in the history
  • Loading branch information
JoshyPHP committed Jan 5, 2018
1 parent 92309e7 commit 2272196
Show file tree
Hide file tree
Showing 8 changed files with 522 additions and 49 deletions.
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,34 @@ echo $builder->build(['☺', '☹']);
\xe2\x98[\xb9\xba]
[\u2639\u263a]
```

### Adding meta-characters

Some individual characters can be used to represent arbitrary expressions in the input strings. The requirements are that:

1. Only single characters (as per the input encoding) can be used. For example, `?` is allowed but not `??`.
2. The regular expression must be valid on its own. For example, `.*` is valid but not `+`.

In the following example, we emulate Bash-style wildcards by mapping `?` to `.` and `*` to `.*`.

```php
$builder = new s9e\RegexpBuilder\Builder([
'meta' => ['?' => '.', '*' => '.*']
]);
echo $builder->build(['foo?', 'bar*']);
```
```
(?:bar.*|foo.)
```

In the following example, we map `X` to `\d`. Note that sequences produced by meta-characters may appear in character classes if the result is valid.

```php
$builder = new s9e\RegexpBuilder\Builder([
'meta' => ['X' => '\\d']
]);
echo $builder->build(['a', 'b', 'X']);
```
```
[\dab]
```
7 changes: 4 additions & 3 deletions composer.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
{
"name": "s9e/regexp-builder",
"version": "1.3.1-dev",
"version": "1.4.0",
"type": "library",
"description": "Single-purpose library that generates regular expressions that match a list of literals.",
"description": "Single-purpose library that generates regular expressions that match a list of strings.",
"homepage": "https://github.com/s9e/RegexpBuilder/",
"keywords": ["regexp"],
"license": "MIT",
"require": {
"php": ">=5.5.1",
"lib-pcre": ">=7.2",
"php": ">=5.5.1",
"phpunit/phpunit": "<5.8"
},
"autoload": {
Expand Down
25 changes: 24 additions & 1 deletion src/Builder.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ class Builder
*/
protected $input;

/**
* @var MetaCharacters
*/
protected $meta;

/**
* @var Runner
*/
Expand All @@ -43,11 +48,13 @@ public function __construct(array $config = [])
'delimiter' => '/',
'input' => 'Bytes',
'inputOptions' => [],
'meta' => [],
'output' => 'Bytes',
'outputOptions' => []
];

$this->setInput($config['input'], $config['inputOptions']);
$this->setMeta($config['meta']);
$this->setSerializer($config['output'], $config['outputOptions'], $config['delimiter']);
$this->setRunner();
}
Expand All @@ -68,6 +75,7 @@ public function build(array $strings)

$strings = $this->splitStrings($strings);
usort($strings, __CLASS__ . '::compareStrings');
$strings = $this->meta->replaceMeta($strings);
$strings = $this->runner->run($strings);

return $this->serializer->serializeStrings($strings);
Expand Down Expand Up @@ -121,6 +129,21 @@ protected function setInput($inputType, array $inputOptions)
$this->input = new $className($inputOptions);
}

/**
* Set the MetaCharacters instance in $this->meta
*
* @param array $map
* @return void
*/
protected function setMeta(array $map)
{
	// Store the instance first, then populate it through a local alias
	$meta       = new MetaCharacters($this->input);
	$this->meta = $meta;

	// Register each meta-character (array key) with the expression it stands for (array value)
	foreach (array_keys($map) as $metaChar)
	{
		$meta->add($metaChar, $map[$metaChar]);
	}
}

/**
* Set the Runner instance in $this->runner
*
Expand Down Expand Up @@ -152,7 +175,7 @@ protected function setSerializer($outputType, array $outputOptions, $delimiter)
$output = new $className($outputOptions);
$escaper = new Escaper($delimiter);

$this->serializer = new Serializer($output, $escaper);
$this->serializer = new Serializer($output, $this->meta, $escaper);
}

/**
Expand Down
218 changes: 218 additions & 0 deletions src/MetaCharacters.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
<?php

/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder;

use InvalidArgumentException;
use s9e\RegexpBuilder\Input\InputInterface;

class MetaCharacters
{
	/**
	* @const Bit value that indicates whether a meta-character represents a single character
	*/
	const IS_CHAR = 1;

	/**
	* @const Bit value that indicates whether a meta-character represents a quantifiable expression
	*/
	const IS_QUANTIFIABLE = 2;

	/**
	* @var array Map of meta values and the expression they represent
	*/
	protected $exprs = [];

	/**
	* @var InputInterface Input whose split() method is used to split meta-characters
	*/
	protected $input;

	/**
	* @var array Map of meta-characters' input values and their meta value
	*/
	protected $meta = [];

	/**
	* @param InputInterface $input
	*/
	public function __construct(InputInterface $input)
	{
		$this->input = $input;
	}

	/**
	* Add a meta-character to the list
	*
	* Adding the same meta-character again replaces the expression it is mapped to.
	*
	* @param  string $char Meta-character
	* @param  string $expr Regular expression
	* @return void
	*
	* @throws InvalidArgumentException if $char does not split into exactly one value, or if
	*                                  $expr does not compile as a regular expression
	*/
	public function add($char, $expr)
	{
		$split = $this->input->split($char);
		if (count($split) !== 1)
		{
			throw new InvalidArgumentException('Meta-characters must be represented by exactly one character');
		}

		// Compile the expression against an empty subject. preg_match() emits a warning and
		// returns false on an invalid pattern; the warning is silenced because the failure is
		// reported through the exception below
		if (@preg_match('(' . $expr . ')u', '') === false)
		{
			throw new InvalidArgumentException("Invalid expression '" . $expr . "'");
		}

		$inputValue = $split[0];
		$metaValue  = $this->computeValue($expr);

		$this->exprs[$metaValue] = $expr;
		$this->meta[$inputValue] = $metaValue;
	}

	/**
	* Get the expression associated with a meta value
	*
	* @param  integer $metaValue
	* @return string
	*
	* @throws InvalidArgumentException if no expression is recorded for given value
	*/
	public function getExpression($metaValue)
	{
		if (!isset($this->exprs[$metaValue]))
		{
			throw new InvalidArgumentException('Invalid meta value ' . $metaValue);
		}

		return $this->exprs[$metaValue];
	}

	/**
	* Return whether a given value represents a single character
	*
	* Non-negative values are always treated as single characters. Negative values are meta
	* values and are tested for their IS_CHAR bit.
	*
	* @param  integer $value
	* @return bool
	*/
	public function isChar($value)
	{
		return ($value >= 0 || ($value & self::IS_CHAR));
	}

	/**
	* Return whether a given value represents a quantifiable expression
	*
	* Non-negative values are always treated as quantifiable. Negative values are meta values
	* and are tested for their IS_QUANTIFIABLE bit.
	*
	* @param  integer $value
	* @return bool
	*/
	public function isQuantifiable($value)
	{
		return ($value >= 0 || ($value & self::IS_QUANTIFIABLE));
	}

	/**
	* Replace values from meta-characters in a list of strings with their meta value
	*
	* @param  array[] $strings
	* @return array[]
	*/
	public function replaceMeta(array $strings)
	{
		foreach ($strings as &$string)
		{
			foreach ($string as &$value)
			{
				if (isset($this->meta[$value]))
				{
					$value = $this->meta[$value];
				}
			}
			// Break the reference so that the last element is not left bound to $value
			unset($value);
		}
		unset($string);

		return $strings;
	}

	/**
	* Compute and return a value for given expression
	*
	* Values are meant to be a unique negative integer. The last 2 bits indicate whether the
	* expression is quantifiable and/or represents a single character.
	*
	* @param  string  $expr Regular expression
	* @return integer
	*/
	protected function computeValue($expr)
	{
		// The sequence number is based on the number of recorded expressions rather than the
		// number of meta-characters. Redefining a meta-character overwrites its entry in
		// $this->meta without growing the array, which would cause the next meta-character to
		// reuse a value that is already taken. Every call to add() stores a new key in
		// $this->exprs, which keeps the values unique.
		//
		// The multiplier of -4 leaves the last 2 bits clear for the IS_* flags
		$value = (1 + count($this->exprs)) * -4;
		if ($this->exprIsChar($expr))
		{
			$value |= self::IS_CHAR;
		}
		if ($this->exprIsQuantifiable($expr))
		{
			$value |= self::IS_QUANTIFIABLE;
		}

		return $value;
	}

	/**
	* Test whether given expression represents a single character usable in a character class
	*
	* @param  string $expr
	* @return bool
	*/
	protected function exprIsChar($expr)
	{
		$regexps = [
			// Escaped literal or escape sequence such as \w but not \R
			'(^\\\\[adefhnrstvwDHNSVW\\W]$)D',

			// Unicode properties such as \pL or \p{Lu}, or their negation \PL or \P{Lu}
			'(^\\\\p(?:.|\\{[^}]+\\})$)Di',

			// An escape sequence such as \x1F or \x{2600}. Hexadecimal digits are matched in
			// both cases but the x itself must be lowercase because \X is not a character
			// escape; it matches an extended grapheme cluster
			'(^\\\\x(?:[0-9A-Fa-f]{2}|\\{[^}]+\\})$)D'
		];

		return $this->matchesAny($expr, $regexps);
	}

	/**
	* Test whether given expression is quantifiable
	*
	* An expression is considered quantifiable if it is a dot, \R, a character class, or
	* anything that exprIsChar() accepts.
	*
	* @param  string $expr
	* @return bool
	*/
	protected function exprIsQuantifiable($expr)
	{
		$regexps = [
			// A dot or \R
			'(^(?:\\.|\\\\R)$)D',

			// A character class
			'(^\\[\\^?(?:([^\\\\\\]]|\\\\.)(?:-(?-1))?)++\\]$)D'
		];

		return $this->matchesAny($expr, $regexps) || $this->exprIsChar($expr);
	}

	/**
	* Test whether given expression matches any of the given regexps
	*
	* @param  string   $expr
	* @param  string[] $regexps
	* @return bool
	*/
	protected function matchesAny($expr, array $regexps)
	{
		foreach ($regexps as $regexp)
		{
			if (preg_match($regexp, $expr))
			{
				return true;
			}
		}

		return false;
	}
}
Loading

0 comments on commit 2272196

Please sign in to comment.