|
| 1 | +<?php declare(strict_types=1); |
| 2 | + |
| 3 | +namespace GraphQL\Utils; |
| 4 | + |
/**
 * Computes the lexical distance between strings A and B.
 *
 * The "distance" between two strings is given by counting the minimum number
 * of edits needed to transform string A into string B. An edit can be an
 * insertion, deletion, or substitution of a single character, or a swap of two
 * adjacent characters.
 *
 * Includes a custom alteration from Damerau-Levenshtein to treat case changes
 * as a single edit which helps identify mis-cased values with an edit distance
 * of 1.
 *
 * This distance can be useful for detecting typos in input or for sorting
 * options by how close they are to the input.
 *
 * Unlike the native levenshtein() function that always returns int, LexicalDistance::measure() returns int|null.
 * It takes into account the threshold and returns null if the measured distance is bigger.
 */
class LexicalDistance
{
    private string $input;

    private string $inputLowerCase;

    /**
     * List of char codes in the lower-cased input string.
     *
     * @var array<int>
     */
    private array $inputArray;

    /**
     * @param string $input the reference string every option is measured against
     */
    public function __construct(string $input)
    {
        $this->input = $input;
        // Multibyte-aware case folding, so that it agrees with the code-point
        // based splitting done in stringToArray().
        $this->inputLowerCase = \mb_strtolower($input);
        $this->inputArray = self::stringToArray($this->inputLowerCase);
    }

    /**
     * Measures the edit distance between the input given to the constructor and $option.
     *
     * @param string $option the candidate string to compare with the input
     * @param float $threshold the largest distance that is still reported
     *
     * @return int|null 0 for identical strings, 1 when the strings differ only by case,
     *                  otherwise the computed distance, or null when it exceeds $threshold
     */
    public function measure(string $option, float $threshold): ?int
    {
        if ($this->input === $option) {
            return 0;
        }

        $optionLowerCase = \mb_strtolower($option);

        // Any case change counts as a single edit
        if ($this->inputLowerCase === $optionLowerCase) {
            return 1;
        }

        $a = self::stringToArray($optionLowerCase);
        $b = $this->inputArray;

        // Make $a the longer sequence, so that rows are as short as possible.
        if (\count($a) < \count($b)) {
            [$a, $b] = [$b, $a];
        }

        $aLength = \count($a);
        $bLength = \count($b);

        // The length difference alone is a lower bound for the distance.
        if ($aLength - $bLength > $threshold) {
            return null;
        }

        // Only three rows are kept and reused in rotation: the current row,
        // the one above it, and the one two above (needed for transpositions).
        /** @var array<array<int>> $rows */
        $rows = [\range(0, $bLength)];

        for ($i = 1; $i <= $aLength; ++$i) {
            $up = ($i - 1) % 3;
            $current = $i % 3;

            $rows[$current][0] = $i;
            $smallestCell = $i;

            for ($j = 1; $j <= $bLength; ++$j) {
                $cost = $a[$i - 1] === $b[$j - 1] ? 0 : 1;

                $currentCell = \min(
                    $rows[$up][$j] + 1, // delete
                    $rows[$current][$j - 1] + 1, // insert
                    $rows[$up][$j - 1] + $cost, // substitute
                );

                if ($i > 1 && $j > 1 && $a[$i - 1] === $b[$j - 2] && $a[$i - 2] === $b[$j - 1]) {
                    // transposition
                    $doubleDiagonalCell = $rows[($i - 2) % 3][$j - 2];
                    $currentCell = \min($currentCell, $doubleDiagonalCell + 1);
                }

                if ($currentCell < $smallestCell) {
                    $smallestCell = $currentCell;
                }

                $rows[$current][$j] = $currentCell;
            }

            // Early exit, since distance can't go smaller than smallest element of the previous row.
            if ($smallestCell > $threshold) {
                return null;
            }
        }

        $distance = $rows[$aLength % 3][$bLength];

        return $distance <= $threshold ? $distance : null;
    }

    /**
     * Returns a list of char codes in the given string.
     *
     * @return array<int>
     */
    private static function stringToArray(string $str): array
    {
        $array = [];
        foreach (\mb_str_split($str) as $char) {
            $array[] = \mb_ord($char);
        }

        return $array;
    }
}
0 commit comments