Skip to content

Commit 48c70a9

Browse files
iluuu1994 authored and jschroed91 committed
Fix issues with unicode characters - fixes #71 (#72)
* Fix issues with unicode characters
* Prevent empty elements when splitting string into words
1 parent 231250a commit 48c70a9

File tree

7 files changed

+55
-45
lines changed

7 files changed

+55
-45
lines changed

Diff for: lib/Caxy/HtmlDiff/AbstractDiff.php

+21-11
Original file line numberDiff line numberDiff line change
@@ -398,9 +398,9 @@ protected function getClosingTag($tag)
398398
*/
399399
protected function getStringBetween($str, $start, $end)
400400
{
401-
$expStr = explode($start, $str, 2);
401+
$expStr = mb_split($start, $str, 2);
402402
if (count($expStr) > 1) {
403-
$expStr = explode($end, $expStr[ 1 ]);
403+
$expStr = mb_split($end, $expStr[ 1 ]);
404404
if (count($expStr) > 1) {
405405
array_pop($expStr);
406406

@@ -461,7 +461,7 @@ protected function setNewWords(array $newWords)
461461
*/
462462
protected function isPartOfWord($text)
463463
{
464-
return ctype_alnum(str_replace($this->config->getSpecialCaseChars(), '', $text));
464+
return $this->ctypeAlphanumUnicode(str_replace($this->config->getSpecialCaseChars(), '', $text));
465465
}
466466

467467
/**
@@ -485,15 +485,15 @@ protected function convertHtmlToListOfWords($characterString)
485485

486486
$current_word = '<';
487487
$mode = 'tag';
488-
} elseif (preg_match("/\s/", $character)) {
488+
} elseif (preg_match("/\s/u", $character)) {
489489
if ($current_word !== '') {
490490
$words[] = $current_word;
491491
}
492-
$current_word = $keepNewLines ? $character : preg_replace('/\s+/S', ' ', $character);
492+
$current_word = $keepNewLines ? $character : preg_replace('/\s+/Su', ' ', $character);
493493
$mode = 'whitespace';
494494
} else {
495495
if (
496-
(ctype_alnum($character) && (strlen($current_word) == 0 || $this->isPartOfWord($current_word))) ||
496+
(($this->ctypeAlphanumUnicode($character)) && (mb_strlen($current_word) == 0 || $this->isPartOfWord($current_word))) ||
497497
(in_array($character, $this->config->getSpecialCaseChars()) && isset($characterString[$i + 1]) && $this->isPartOfWord($characterString[$i + 1]))
498498
) {
499499
$current_word .= $character;
@@ -509,7 +509,7 @@ protected function convertHtmlToListOfWords($characterString)
509509
$words[] = $current_word;
510510
$current_word = '';
511511

512-
if (!preg_match('[^\s]', $character)) {
512+
if (!preg_match('[^\s]u', $character)) {
513513
$mode = 'whitespace';
514514
} else {
515515
$mode = 'character';
@@ -525,9 +525,9 @@ protected function convertHtmlToListOfWords($characterString)
525525
}
526526
$current_word = '<';
527527
$mode = 'tag';
528-
} elseif (preg_match("/\s/", $character)) {
528+
} elseif (preg_match("/\s/u", $character)) {
529529
$current_word .= $character;
530-
if (!$keepNewLines) $current_word = preg_replace('/\s+/S', ' ', $current_word);
530+
if (!$keepNewLines) $current_word = preg_replace('/\s+/Su', ' ', $current_word);
531531
} else {
532532
if ($current_word != '') {
533533
$words[] = $current_word;
@@ -574,7 +574,7 @@ protected function isEndOfTag($val)
574574
*/
575575
protected function isWhiteSpace($value)
576576
{
577-
return !preg_match('[^\s]', $value);
577+
return !preg_match('[^\s]u', $value);
578578
}
579579

580580
/**
@@ -585,6 +585,16 @@ protected function isWhiteSpace($value)
585585
protected function explode($value)
586586
{
587587
// as suggested by @onassar
588-
return preg_split('//u', $value);
588+
return preg_split('//u', $value, -1, PREG_SPLIT_NO_EMPTY);
589+
}
590+
591+
/**
592+
* @param string $str
593+
*
594+
* @return bool
595+
*/
596+
protected function ctypeAlphanumUnicode($str)
597+
{
598+
return preg_match("/^[a-zA-Z0-9\pL]+$/u", $str);
589599
}
590600
}

Diff for: lib/Caxy/HtmlDiff/HtmlDiff.php

+14-14
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ protected function createIsolatedDiffTagPlaceholders(&$words)
158158
foreach ($words as $index => $word) {
159159
$openIsolatedDiffTag = $this->isOpeningIsolatedDiffTag($word, $currentIsolatedDiffTag);
160160
if ($openIsolatedDiffTag) {
161-
if ($this->isSelfClosingTag($word) || stripos($word, '<img') !== false) {
161+
if ($this->isSelfClosingTag($word) || mb_stripos($word, '<img') !== false) {
162162
if ($openIsolatedDiffTags === 0) {
163163
$isolatedDiffTagIndices[] = array(
164164
'start' => $index,
@@ -205,7 +205,7 @@ protected function isOpeningIsolatedDiffTag($item, $currentIsolatedDiffTag = nul
205205
$tagsToMatch = $currentIsolatedDiffTag !== null
206206
? array($currentIsolatedDiffTag => $this->config->getIsolatedDiffTagPlaceholder($currentIsolatedDiffTag))
207207
: $this->config->getIsolatedDiffTags();
208-
$pattern = '#<%s(\s+[^>]*)?>#iU';
208+
$pattern = '#<%s(\s+[^>]*)?>#iUu';
209209
foreach ($tagsToMatch as $key => $value) {
210210
if (preg_match(sprintf($pattern, $key), $item)) {
211211
return $key;
@@ -217,7 +217,7 @@ protected function isOpeningIsolatedDiffTag($item, $currentIsolatedDiffTag = nul
217217

218218
protected function isSelfClosingTag($text)
219219
{
220-
return (bool) preg_match('/<[^>]+\/\s*>/', $text);
220+
return (bool) preg_match('/<[^>]+\/\s*>/u', $text);
221221
}
222222

223223
/**
@@ -231,7 +231,7 @@ protected function isClosingIsolatedDiffTag($item, $currentIsolatedDiffTag = nul
231231
$tagsToMatch = $currentIsolatedDiffTag !== null
232232
? array($currentIsolatedDiffTag => $this->config->getIsolatedDiffTagPlaceholder($currentIsolatedDiffTag))
233233
: $this->config->getIsolatedDiffTags();
234-
$pattern = '#</%s(\s+[^>]*)?>#iU';
234+
$pattern = '#</%s(\s+[^>]*)?>#iUu';
235235
foreach ($tagsToMatch as $key => $value) {
236236
if (preg_match(sprintf($pattern, $key), $item)) {
237237
return $key;
@@ -354,7 +354,7 @@ protected function diffElements($oldText, $newText, $stripWrappingTags = true)
354354
$wrapEnd = '';
355355

356356
if ($stripWrappingTags) {
357-
$pattern = '/(^<[^>]+>)|(<\/[^>]+>$)/i';
357+
$pattern = '/(^<[^>]+>)|(<\/[^>]+>$)/iu';
358358
$matches = array();
359359

360360
if (preg_match_all($pattern, $newText, $matches)) {
@@ -441,7 +441,7 @@ protected function processEqualOperation($operation)
441441
protected function getAttributeFromTag($text, $attribute)
442442
{
443443
$matches = array();
444-
if (preg_match(sprintf('/<[^>]*\b%s\s*=\s*([\'"])(.*)\1[^>]*>/i', $attribute), $text, $matches)) {
444+
if (preg_match(sprintf('/<[^>]*\b%s\s*=\s*([\'"])(.*)\1[^>]*>/iu', $attribute), $text, $matches)) {
445445
return htmlspecialchars_decode($matches[2]);
446446
}
447447

@@ -567,15 +567,15 @@ protected function insertTag($tag, $cssClass, &$words)
567567
}
568568
}
569569
}
570-
if (count($words) == 0 && strlen($specialCaseTagInjection) == 0) {
570+
if (count($words) == 0 && mb_strlen($specialCaseTagInjection) == 0) {
571571
break;
572572
}
573573
if ($specialCaseTagInjectionIsBefore) {
574574
$this->content .= $specialCaseTagInjection.implode('', $this->extractConsecutiveWords($words, 'tag'));
575575
} else {
576576
$workTag = $this->extractConsecutiveWords($words, 'tag');
577577
if (isset($workTag[ 0 ]) && $this->isOpeningTag($workTag[ 0 ]) && !$this->isClosingTag($workTag[ 0 ])) {
578-
if (strpos($workTag[ 0 ], 'class=')) {
578+
if (mb_strpos($workTag[ 0 ], 'class=')) {
579579
$workTag[ 0 ] = str_replace('class="', 'class="diffmod ', $workTag[ 0 ]);
580580
$workTag[ 0 ] = str_replace("class='", 'class="diffmod ', $workTag[ 0 ]);
581581
} else {
@@ -584,7 +584,7 @@ protected function insertTag($tag, $cssClass, &$words)
584584
}
585585

586586
$appendContent = implode('', $workTag).$specialCaseTagInjection;
587-
if (isset($workTag[0]) && false !== stripos($workTag[0], '<img')) {
587+
if (isset($workTag[0]) && false !== mb_stripos($workTag[0], '<img')) {
588588
$appendContent = $this->wrapText($appendContent, $tag, $cssClass);
589589
}
590590
$this->content .= $appendContent;
@@ -673,7 +673,7 @@ protected function isTag($item)
673673
*/
674674
protected function isOpeningTag($item)
675675
{
676-
return preg_match('#<[^>]+>\\s*#iU', $item);
676+
return preg_match('#<[^>]+>\\s*#iUu', $item);
677677
}
678678

679679
/**
@@ -683,7 +683,7 @@ protected function isOpeningTag($item)
683683
*/
684684
protected function isClosingTag($item)
685685
{
686-
return preg_match('#</[^>]+>\\s*#iU', $item);
686+
return preg_match('#</[^>]+>\\s*#iUu', $item);
687687
}
688688

689689
/**
@@ -769,10 +769,10 @@ protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endI
769769
*/
770770
protected function stripTagAttributes($word)
771771
{
772-
$space = strpos($word, ' ', 1);
772+
$space = mb_strpos($word, ' ', 1);
773773

774774
if ($space) {
775-
return '<' . substr($word, 1, $space) . '>';
775+
return '<' . mb_substr($word, 1, $space) . '>';
776776
}
777777

778778
return trim($word, '<>');
@@ -850,7 +850,7 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
850850
protected function isOnlyWhitespace($str)
851851
{
852852
// Slightly faster then using preg_match
853-
return $str !== '' && (strlen(trim($str)) === 0);
853+
return $str !== '' && (mb_strlen(trim($str)) === 0);
854854
}
855855

856856
/**

Diff for: lib/Caxy/HtmlDiff/HtmlDiffConfig.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ public function setIsolatedDiffTags($isolatedDiffTags)
345345
public function addIsolatedDiffTag($tag, $placeholder = null)
346346
{
347347
if (null === $placeholder) {
348-
$placeholder = sprintf('[[REPLACE_%s]]', strtoupper($tag));
348+
$placeholder = sprintf('[[REPLACE_%s]]', mb_strtoupper($tag));
349349
}
350350

351351
if ($this->isIsolatedDiffTag($tag) && $this->isolatedDiffTags[$tag] !== $placeholder) {

Diff for: lib/Caxy/HtmlDiff/ListDiff.php

+6-6
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ protected function buildDiffList($words)
233233
$list[] = $word;
234234
}
235235
} else {
236-
$listType = substr($word, 1, 2);
236+
$listType = mb_substr($word, 1, 2);
237237
$listStartTag = $word;
238238
}
239239

@@ -254,7 +254,7 @@ protected function buildDiffList($words)
254254
if ($openListItems === 0) {
255255
// New top-level list item
256256
$currentListItem = array();
257-
$listItemType = substr($word, 1, 2);
257+
$listItemType = mb_substr($word, 1, 2);
258258
$listItemStart = $word;
259259
} else {
260260
$currentListItem[] = $word;
@@ -290,27 +290,27 @@ protected function isOpeningListTag($word, $type = null)
290290
{
291291
$filter = $type !== null ? array('<'.$type) : array('<ul', '<ol', '<dl');
292292

293-
return in_array(substr($word, 0, 3), $filter);
293+
return in_array(mb_substr($word, 0, 3), $filter);
294294
}
295295

296296
protected function isClosingListTag($word, $type = null)
297297
{
298298
$filter = $type !== null ? array('</'.$type) : array('</ul', '</ol', '</dl');
299299

300-
return in_array(substr($word, 0, 4), $filter);
300+
return in_array(mb_substr($word, 0, 4), $filter);
301301
}
302302

303303
protected function isOpeningListItemTag($word, $type = null)
304304
{
305305
$filter = $type !== null ? array('<'.$type) : array('<li', '<dd', '<dt');
306306

307-
return in_array(substr($word, 0, 3), $filter);
307+
return in_array(mb_substr($word, 0, 3), $filter);
308308
}
309309

310310
protected function isClosingListItemTag($word, $type = null)
311311
{
312312
$filter = $type !== null ? array('</'.$type) : array('</li', '</dd', '</dt');
313313

314-
return in_array(substr($word, 0, 4), $filter);
314+
return in_array(mb_substr($word, 0, 4), $filter);
315315
}
316316
}

Diff for: lib/Caxy/HtmlDiff/Preprocessor.php

+8-8
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,19 @@ class Preprocessor
77
public static function diffCommonPrefix($old, $new)
88
{
99
// Quick check for common null cases.
10-
if (strlen($old) == 0 || strlen($new) == 0 || substr($old, 0, 1) != substr($new, 0, 1)) {
10+
if (mb_strlen($old) == 0 || mb_strlen($new) == 0 || mb_substr($old, 0, 1) != mb_substr($new, 0, 1)) {
1111
return 0;
1212
}
1313

1414
// Binary Search
1515
$pointerMin = 0;
16-
$pointerMax = min(strlen($old), strlen($new));
16+
$pointerMax = min(mb_strlen($old), mb_strlen($new));
1717
$pointerMid = $pointerMax;
1818
$pointerStart = 0;
1919
while ($pointerMin < $pointerMid) {
2020
$cmp = substr_compare(
2121
$old,
22-
substr($new, $pointerStart, $pointerMid - $pointerStart),
22+
mb_substr($new, $pointerStart, $pointerMid - $pointerStart),
2323
$pointerStart,
2424
$pointerMid - $pointerStart
2525
);
@@ -37,19 +37,19 @@ public static function diffCommonPrefix($old, $new)
3737
public static function diffCommonSuffix($old, $new)
3838
{
3939
// Quick check for common null cases.
40-
if (strlen($old) == 0 || strlen($new) == 0 || substr($old, strlen($old) - 1, 1) != substr($new, strlen($new) - 1, 1)) {
40+
if (mb_strlen($old) == 0 || mb_strlen($new) == 0 || mb_substr($old, mb_strlen($old) - 1, 1) != mb_substr($new, mb_strlen($new) - 1, 1)) {
4141
return 0;
4242
}
4343

4444
// Binary Search
4545
$pointerMin = 0;
46-
$pointerMax = min(strlen($old), strlen($new));
46+
$pointerMax = min(mb_strlen($old), mb_strlen($new));
4747
$pointerMid = $pointerMax;
4848
$pointerEnd = 0;
49-
$oldLen = strlen($old);
50-
$newLen = strlen($new);
49+
$oldLen = mb_strlen($old);
50+
$newLen = mb_strlen($new);
5151
while ($pointerMin < $pointerMid) {
52-
if (substr($old, $oldLen - $pointerMid, $pointerMid - $pointerEnd) == substr($new, $newLen - $pointerMid, $pointerMid - $pointerEnd)) {
52+
if (mb_substr($old, $oldLen - $pointerMid, $pointerMid - $pointerEnd) == mb_substr($new, $newLen - $pointerMid, $pointerMid - $pointerEnd)) {
5353
$pointerMin = $pointerMid;
5454
$pointerEnd = $pointerMin;
5555
} else {

Diff for: lib/Caxy/HtmlDiff/Strategy/ListItemMatchStrategy.php

+4-4
Original file line numberDiff line numberDiff line change
@@ -63,20 +63,20 @@ public function isMatch($a, $b)
6363
// Check common prefix/ suffix length
6464
$aCleaned = trim($aStripped);
6565
$bCleaned = trim($bStripped);
66-
if (strlen($aCleaned) === 0 || strlen($bCleaned) === 0) {
66+
if (mb_strlen($aCleaned) === 0 || mb_strlen($bCleaned) === 0) {
6767
$aCleaned = $a;
6868
$bCleaned = $b;
6969
}
70-
if (strlen($aCleaned) === 0 || strlen($bCleaned) === 0) {
70+
if (mb_strlen($aCleaned) === 0 || mb_strlen($bCleaned) === 0) {
7171
return false;
7272
}
7373
$prefixIndex = Preprocessor::diffCommonPrefix($aCleaned, $bCleaned);
7474
$suffixIndex = Preprocessor::diffCommonSuffix($aCleaned, $bCleaned);
7575

7676
// Use shorter string, and see how much of it is leftover
77-
$len = min(strlen($aCleaned), strlen($bCleaned));
77+
$len = min(mb_strlen($aCleaned), mb_strlen($bCleaned));
7878
$remaining = $len - ($prefixIndex + $suffixIndex);
79-
$strLengthPercent = $len / max(strlen($a), strlen($b));
79+
$strLengthPercent = $len / max(mb_strlen($a), mb_strlen($b));
8080

8181
if ($remaining === 0 && $strLengthPercent > $this->lengthRatioThreshold) {
8282
return true;

Diff for: lib/Caxy/HtmlDiff/Table/TableDiff.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ protected function htmlFromNode($node)
733733
protected function setInnerHtml($node, $html)
734734
{
735735
// DOMDocument::loadHTML does not allow empty strings.
736-
if (strlen(trim($html)) === 0) {
736+
if (mb_strlen(trim($html)) === 0) {
737737
$html = '<span class="empty"></span>';
738738
}
739739

0 commit comments

Comments
 (0)