Skip to content

Commit 84bcd48

Browse files
committed
[PHP 8.4][Intl] Add grapheme_str_split
Add a polyfill for the `grapheme_str_split` function added in PHP 8.4. Requires PHP 7.3, because the polyfill is based on the `\X` regex escape, which only works properly on PCRE2, and PCRE2 [only comes with PHP 7.3+](https://php.watch/versions/7.3/pcre2). Further, there are some cases in which the polyfill cannot split complex characters (such as two consecutive country flag Emojis). This is now fixed in [PCRE2Project/pcre2#410](https://github.com/PCRE2Project/pcre2/issues/410). However, this fix will likely only make it to PHP 8.4. References: - [RFC: Grapheme cluster for `str_split` function: `grapheme_str_split`](https://wiki.php.net/rfc/grapheme_str_split) - [PHP.Watch: PHP 8.4: New `grapheme_str_split` function](https://php.watch/versions/8.4/grapheme_str_split)
1 parent c5ce28b commit 84bcd48

File tree

9 files changed

+134
-0
lines changed

9 files changed

+134
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ Polyfills are provided for:
7272
- the `Deprecated` attribute introduced in PHP 8.4;
7373
- the `mb_trim`, `mb_ltrim` and `mb_rtrim` functions introduced in PHP 8.4;
7474
- the `CURL_HTTP_VERSION_3` and `CURL_HTTP_VERSION_3ONLY` constants introduced in PHP 8.4;
75+
- the `grapheme_str_split` function introduced in PHP 8.4;
7576

7677
It is strongly recommended to upgrade your PHP version and/or install the missing
7778
extensions whenever possible. This polyfill should be used only when there is no

src/Intl/Grapheme/Grapheme.php

+32
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
* - grapheme_strrpos - Find position (in grapheme units) of last occurrence of a string
2727
* - grapheme_strstr - Returns part of haystack string from the first occurrence of needle to the end of haystack
2828
* - grapheme_substr - Return part of a string
29+
* - grapheme_str_split - Splits a string into an array of individual graphemes or chunks of graphemes
2930
*
3031
* @author Nicolas Grekas <[email protected]>
3132
*
@@ -191,6 +192,37 @@ public static function grapheme_strstr($s, $needle, $beforeNeedle = false)
191192
return mb_strstr($s, $needle, $beforeNeedle, 'UTF-8');
192193
}
193194

195+
/**
 * Splits a string into an array of grapheme clusters, optionally grouped in chunks.
 *
 * @param string $s   The string to split; it is matched with the "u" (UTF-8) modifier
 * @param int    $len The number of grapheme clusters per returned chunk
 *
 * @return string[]|false The list of chunks, an empty array for an empty string, or
 *                        false when no grapheme cluster can be matched; on PHP < 8.0
 *                        false is also returned when $len is out of range
 *
 * @throws \ValueError On PHP >= 8.0 when $len is lower than 1 or greater than 1073741823
 */
public static function grapheme_str_split($s, $len = 1)
{
    // 0 has to be rejected too: the message promises "greater than 0" and
    // array_chunk() below does not accept a chunk length of 0.
    if (1 > $len || 1073741823 < $len) {
        if (80000 > \PHP_VERSION_ID) {
            return false;
        }

        throw new \ValueError('grapheme_str_split(): Argument #2 ($length) must be greater than 0 and less than or equal to 1073741823.');
    }

    if ('' === $s) {
        return [];
    }

    if (!preg_match_all('/('.SYMFONY_GRAPHEME_CLUSTER_RX.')/u', $s, $matches)) {
        return false;
    }

    if (1 === $len) {
        return $matches[0];
    }

    $chunks = array_chunk($matches[0], $len);

    // Join each group of graphemes back into one string; assigning by key
    // avoids leaving a dangling reference behind.
    foreach ($chunks as $index => $graphemes) {
        $chunks[$index] = implode('', $graphemes);
    }

    return $chunks;
}
225+
194226
private static function grapheme_position($s, $needle, $offset, $mode)
195227
{
196228
$needle = (string) $needle;

src/Intl/Grapheme/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ This component provides a partial, native PHP implementation of the
2121
- [`grapheme_strstr`](https://php.net/grapheme_strstr): Returns part of haystack string from
2222
the first occurrence of needle to the end of haystack
2323
- [`grapheme_substr`](https://php.net/grapheme_substr): Return part of a string
24+
- [`grapheme_str_split`](https://php.net/grapheme_str_split): Splits a string into an array of individual graphemes or chunks of graphemes
2425

2526
More information can be found in the
2627
[main Polyfill README](https://github.com/symfony/polyfill/blob/main/README.md).

src/Intl/Grapheme/bootstrap80.php

+3
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,6 @@ function grapheme_strstr(?string $haystack, ?string $needle, ?bool $beforeNeedle
4848
if (!function_exists('grapheme_substr')) {
    /**
     * Global grapheme_substr() fallback: casts the nullable string and offset
     * arguments and delegates to the Grapheme polyfill class.
     */
    function grapheme_substr(?string $string, ?int $offset, ?int $length = null): string|false
    {
        return p\Grapheme::grapheme_substr((string) $string, (int) $offset, $length);
    }
}
51+
if (!function_exists('grapheme_str_split')) {
    /**
     * Global grapheme_str_split() fallback delegating to the Grapheme polyfill class.
     */
    function grapheme_str_split(string $string, int $length = 1): array|false
    {
        return p\Grapheme::grapheme_str_split($string, $length);
    }
}

src/Php84/Php84.php

+30
Original file line numberDiff line numberDiff line change
@@ -169,4 +169,34 @@ private static function mb_internal_trim(string $regex, string $string, ?string
169169

170170
return mb_convert_encoding($string, $encoding, 'UTF-8');
171171
}
172+
173+
/**
 * Splits a string into an array of grapheme clusters, optionally grouped in chunks.
 *
 * @param string $string The string to split; it is matched with the "u" (UTF-8) modifier
 * @param int    $length The number of grapheme clusters per returned chunk
 *
 * @return string[]|false The list of chunks, an empty array for an empty string, or
 *                        false when no grapheme cluster can be matched
 *
 * @throws \ValueError When $length is lower than 1 or greater than 1073741823
 */
public static function grapheme_str_split(string $string, int $length)
{
    // 0 has to be rejected too: the message promises "greater than 0" and
    // array_chunk() below does not accept a chunk length of 0.
    if (1 > $length || 1073741823 < $length) {
        throw new \ValueError('grapheme_str_split(): Argument #2 ($length) must be greater than 0 and less than or equal to 1073741823.');
    }

    if ('' === $string) {
        return [];
    }

    // "\X" is only used from PCRE 8.32 (PCRE1 line) or 10.39 (PCRE2 line) onwards;
    // older libraries get the hand-written grapheme cluster pattern instead.
    $pcreVersion = (float) \PCRE_VERSION;
    $hasUsableX = $pcreVersion < 10 ? $pcreVersion >= 8.32 : $pcreVersion >= 10.39;

    $regex = $hasUsableX
        ? '\X'
        : '(?:\r\n|(?:[ -~\x{200C}\x{200D}]|[ᆨ-ᇹ]+|[ᄀ-ᅟ]*(?:[가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기까깨꺄꺠꺼께껴꼐꼬꽈꽤꾀꾜꾸꿔꿰뀌뀨끄끠끼나내냐냬너네녀녜노놔놰뇌뇨누눠눼뉘뉴느늬니다대댜댸더데뎌뎨도돠돼되됴두둬뒈뒤듀드듸디따때땨떄떠떼뗘뗴또똬뙈뙤뚀뚜뚸뛔뛰뜌뜨띄띠라래랴럐러레려례로롸뢔뢰료루뤄뤠뤼류르릐리마매먀먜머메며몌모뫄뫠뫼묘무뭐뭬뮈뮤므믜미바배뱌뱨버베벼볘보봐봬뵈뵤부붜붸뷔뷰브븨비빠빼뺘뺴뻐뻬뼈뼤뽀뽜뽸뾔뾰뿌뿨쀄쀠쀼쁘쁴삐사새샤섀서세셔셰소솨쇄쇠쇼수숴쉐쉬슈스싀시싸쌔쌰썌써쎄쎠쎼쏘쏴쐐쐬쑈쑤쒀쒜쒸쓔쓰씌씨아애야얘어에여예오와왜외요우워웨위유으의이자재쟈쟤저제져졔조좌좨죄죠주줘줴쥐쥬즈즤지짜째쨔쨰쩌쩨쪄쪠쪼쫘쫴쬐쬬쭈쭤쮀쮜쮸쯔쯰찌차채챠챼처체쳐쳬초촤쵀최쵸추춰췌취츄츠츼치카캐캬컈커케켜켸코콰쾌쾨쿄쿠쿼퀘퀴큐크킈키타태탸턔터테텨톄토톼퇘퇴툐투퉈퉤튀튜트틔티파패퍄퍠퍼페펴폐포퐈퐤푀표푸풔풰퓌퓨프픠피하해햐햬허헤혀혜호화홰회효후훠훼휘휴흐희히]?[ᅠ-ᆢ]+|[가-힣])[ᆨ-ᇹ]*|[ᄀ-ᅟ]+|[^\p{Cc}\p{Cf}\p{Zl}\p{Zp}])[\p{Mn}\p{Me}\x{09BE}\x{09D7}\x{0B3E}\x{0B57}\x{0BBE}\x{0BD7}\x{0CC2}\x{0CD5}\x{0CD6}\x{0D3E}\x{0D57}\x{0DCF}\x{0DDF}\x{200C}\x{200D}\x{1D165}\x{1D16E}-\x{1D172}]*|[\p{Cc}\p{Cf}\p{Zl}\p{Zp}])';

    if (!preg_match_all('/'.$regex.'/u', $string, $matches)) {
        return false;
    }

    if (1 === $length) {
        return $matches[0];
    }

    $chunks = array_chunk($matches[0], $length);

    // Join each group of graphemes back into one string; assigning by key
    // avoids leaving a dangling reference behind.
    foreach ($chunks as $index => $graphemes) {
        $chunks[$index] = implode('', $graphemes);
    }

    return $chunks;
}
172202
}

src/Php84/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ This component provides features added to PHP 8.4 core:
77
- [`array_find`, `array_find_key`, `array_any` and `array_all`](https://wiki.php.net/rfc/array_find)
88
- [`Deprecated`](https://wiki.php.net/rfc/deprecated_attribute)
99
- `CURL_HTTP_VERSION_3` and `CURL_HTTP_VERSION_3ONLY` constants
10+
- [`grapheme_str_split`](https://wiki.php.net/rfc/grapheme_str_split)
1011

1112
More information can be found in the
1213
[main Polyfill README](https://github.com/symfony/polyfill/blob/main/README.md).

src/Php84/bootstrap.php

+4
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,7 @@ function mb_ltrim(string $string, ?string $characters = null, ?string $encoding
6060
// Global mb_rtrim() fallback delegating to the Php84 polyfill class.
function mb_rtrim(string $string, ?string $characters = null, ?string $encoding = null): string
{
    return p\Php84::mb_rtrim($string, $characters, $encoding);
}
6161
}
6262
}
63+
64+
// Only declared when grapheme_substr() is available but grapheme_str_split() is not.
if (!function_exists('grapheme_str_split') && function_exists('grapheme_substr')) {
    /**
     * Global grapheme_str_split() fallback delegating to the Php84 polyfill class.
     */
    function grapheme_str_split(string $string, int $length = 1)
    {
        return p\Php84::grapheme_str_split($string, $length);
    }
}

tests/Intl/Grapheme/GraphemeTest.php

+31
Original file line numberDiff line numberDiff line change
@@ -209,4 +209,35 @@ public function testGraphemeStrstr()
209209
$this->assertSame('국어', grapheme_strstr('한국어', ''));
210210
$this->assertSame('ÉJÀ', grapheme_stristr('DÉJÀ', 'é'));
211211
}
212+
213+
/**
 * Checks that grapheme_str_split() yields exactly the expected chunks for a data set.
 *
 * @dataProvider graphemeStrSplitDataProvider
 */
public function testGraphemeStrSplit(string $string, int $length, array $expectedValues)
{
    $actualValues = grapheme_str_split($string, $length);

    $this->assertSame($expectedValues, $actualValues);
}
220+
221+
/**
 * Provides [string, chunk length, expected chunks] data sets for grapheme_str_split().
 *
 * @return array<int, array{0: string, 1: int, 2: string[]}>
 */
public static function graphemeStrSplitDataProvider(): array
{
    $cases = [
        ['', 1, []],
        ['PHP', 1, ['P', 'H', 'P']],
        ['你好', 1, ['你', '好']],
        ['අයේෂ්', 1, ['අ', 'යේ', 'ෂ්']],
        ['สวัสดี', 2, ['สวั', 'สดี']],
    ];

    // The ZWJ emoji sequence is only listed behind the PHP 7.3 guard, so it is
    // not also asserted unconditionally on older versions.
    if (70300 <= PHP_VERSION_ID) {
        $cases[] = ['土下座🙇‍♀を', 1, ['土', '下', '座', '🙇‍♀', 'を']];
    }

    // Fixed in https://github.com/PCRE2Project/pcre2/issues/410
    // Compare major and minor as one version: a newer major qualifies on its
    // own, and 10.x qualifies from minor 45 (threshold kept from the original check).
    if (defined('PCRE_VERSION_MAJOR') && (PCRE_VERSION_MAJOR > 10 || (10 === PCRE_VERSION_MAJOR && PCRE_VERSION_MINOR > 44))) {
        $cases[] = ['👭🏻👰🏿‍♂️', 2, ['👭🏻', '👰🏿‍♂️']];
    }

    return $cases;
}
212243
}

tests/Php84/Php84Test.php

+31
Original file line numberDiff line numberDiff line change
@@ -319,4 +319,35 @@ public static function mbRTrimProvider(): iterable
319319

320320
yield ["foo\n", "foo\n", 'o'];
321321
}
322+
323+
/**
 * Checks that grapheme_str_split() yields exactly the expected chunks for a data set.
 *
 * @dataProvider graphemeStrSplitDataProvider
 */
public function testGraphemeStrSplit(string $string, int $length, array $expectedValues)
{
    $actualValues = grapheme_str_split($string, $length);

    $this->assertSame($expectedValues, $actualValues);
}
330+
331+
/**
 * Provides [string, chunk length, expected chunks] data sets for grapheme_str_split().
 *
 * @return array<int, array{0: string, 1: int, 2: string[]}>
 */
public static function graphemeStrSplitDataProvider(): array
{
    $cases = [
        ['', 1, []],
        ['PHP', 1, ['P', 'H', 'P']],
        ['你好', 1, ['你', '好']],
        ['අයේෂ්', 1, ['අ', 'යේ', 'ෂ්']],
        ['สวัสดี', 2, ['สวั', 'สดี']],
    ];

    // The ZWJ emoji sequence is only listed behind the PHP 7.3 guard, so it is
    // not also asserted unconditionally on older versions.
    if (70300 <= PHP_VERSION_ID) {
        $cases[] = ['土下座🙇‍♀を', 1, ['土', '下', '座', '🙇‍♀', 'を']];
    }

    // Fixed in https://github.com/PCRE2Project/pcre2/issues/410
    // Compare major and minor as one version: a newer major qualifies on its
    // own, and 10.x qualifies from minor 45 (threshold kept from the original check).
    if (defined('PCRE_VERSION_MAJOR') && (10 < PCRE_VERSION_MAJOR || (10 === PCRE_VERSION_MAJOR && 44 < PCRE_VERSION_MINOR))) {
        $cases[] = ['👭🏻👰🏿‍♂️', 2, ['👭🏻', '👰🏿‍♂️']];
    }

    return $cases;
}
322353
}

0 commit comments

Comments
 (0)