Skip to content

Commit 8792955

Browse files
authored
Fix multibyte character handling in some RegexFilter regexes (#306)
1 parent 1f3bdec commit 8792955

File tree

2 files changed

+6
-6
lines changed

2 files changed

+6
-6
lines changed

src/Transformers/RegexFilter.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class RegexFilter implements Transformer
5959
*
6060
* @var literal-string
6161
*/
62-
public const EXTRA_CHARACTERS = '/([^\w\s])(?=[^\w\s]*\1)/';
62+
public const EXTRA_CHARACTERS = '/([^\w\s])(?=[^\w\s]*\1)/u';
6363

6464
/**
6565
* Matches consecutively repeated words.
@@ -73,7 +73,7 @@ class RegexFilter implements Transformer
7373
*
7474
* @var literal-string
7575
*/
76-
public const EXTRA_WHITESPACE = '/\s(?=\s+)/';
76+
public const EXTRA_WHITESPACE = '/\s(?=\s+)/u';
7777

7878
/**
7979
* A pattern to match unicode emojis.
@@ -87,14 +87,14 @@ class RegexFilter implements Transformer
8787
*
8888
* @var literal-string
8989
*/
90-
public const MENTION = '/(@\w+)/';
90+
public const MENTION = '/(@\w+)/u';
9191

9292
/**
9393
* A pattern to match Twitter-style hashtags (ex. #MachineLearning).
9494
*
9595
* @var literal-string
9696
*/
97-
public const HASHTAG = '/(#\w+)/';
97+
public const HASHTAG = '/(#\w+)/u';
9898

9999
/**
100100
* A list of regular expression patterns used to filter the text columns of the dataset.

tests/Transformers/RegexFilterTest.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ protected function setUp() : void
3333
['Too weird to live, support@rubixml.com too rare to die https://rubixml.com'],
3434
['A man who procrastinates in @his choosing will inevitably have his choice made for him by #circumstance'],
3535
['The quick quick brown fox jumped over the lazy man sitting at a bus stop drinking a can of Cola cola'],
36-
['Diese äpfel Äpfel schmecken sehr gut'],
36+
['Diese äpfel Äpfel schmecken sehr gut'],
3737
]);
3838

3939
$this->transformer = new RegexFilter([
@@ -68,7 +68,7 @@ public function transform() : void
6868
['Too weird to live, too rare to die '],
6969
['A man who procrastinates in choosing will inevitably have his choice made for him by '],
7070
['The quick brown fox jumped over the lazy man sitting at a bus stop drinking a can of cola'],
71-
['Diese Äpfel schmecken sehr gut'],
71+
['Diese Äpfel schmecken sehr gut'],
7272
];
7373

7474
$this->assertEquals($expected, $this->dataset->samples());

0 commit comments

Comments
 (0)