Skip to content

Commit bea3537

Browse files
committed
stdlib: Add classifying functions
When making scanners there might be a need to classify codepoints if the specification follows the Unicode standard. The following: is_whitespace/1, is_ID_start/1, is_ID_continue/1, category/1 is (at least) needed to scan and parse for example markdown. This commit is backward incompatible if the *undocumented* unicode_util:[is_]whitespace(..) functions have been used. unicode_util:whitespace() have been changed to unicode_util:pattern_whitespace() string module uses it and documents that is uses the `pattern_whitespace` property. So the name of the function have been changed, this is because `pattern_whitespace` is not a subset of `whitespace` characters, as was discovered during testing. is_whitespace(Char) have been changed to use the Unicode `whitespace` property. See Unicode Standard Annex #31 and #44.
1 parent 0a24434 commit bea3537

File tree

7 files changed

+356
-21
lines changed

7 files changed

+356
-21
lines changed

lib/stdlib/src/erl_stdlib_errors.erl

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,15 @@ format_unicode_error(characters_to_nfkc_list, [_]) ->
428428
format_unicode_error(characters_to_nfkd_binary, [_]) ->
429429
[bad_char_data];
430430
format_unicode_error(characters_to_nfkd_list, [_]) ->
431-
[bad_char_data].
431+
[bad_char_data];
432+
format_unicode_error(category, [_]) ->
433+
[bad_char];
434+
format_unicode_error(is_whitespace, [_]) ->
435+
[bad_char];
436+
format_unicode_error(is_ID_start, [_]) ->
437+
[bad_char];
438+
format_unicode_error(is_ID_continue, [_]) ->
439+
[bad_char].
432440

433441
unicode_char_data(Chars) ->
434442
try unicode:characters_to_binary(Chars) of
@@ -1121,6 +1129,8 @@ expand_error(bad_boolean) ->
11211129
<<"not a boolean value">>;
11221130
expand_error(bad_binary_list) ->
11231131
<<"not a flat list of binaries">>;
1132+
expand_error(bad_char) ->
1133+
<<"not a valid character">>;
11241134
expand_error(bad_char_data) ->
11251135
<<"not valid character data (an iodata term)">>;
11261136
expand_error(bad_binary_pattern) ->

lib/stdlib/src/string.erl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ pad(CD, Length, both, Char) when is_integer(Length) ->
477477
-spec trim(String) -> unicode:chardata() when
478478
String :: unicode:chardata().
479479
trim(Str) ->
480-
trim(Str, both, unicode_util:whitespace()).
480+
trim(Str, both, unicode_util:pattern_whitespace()).
481481

482482
-doc """
483483
Equivalent to [`trim(String, Dir, Whitespace})`](`trim/3`) where
@@ -490,7 +490,7 @@ as Pattern_White_Space in
490490
String :: unicode:chardata(),
491491
Dir :: direction() | 'both'.
492492
trim(Str, Dir) ->
493-
trim(Str, Dir, unicode_util:whitespace()).
493+
trim(Str, Dir, unicode_util:pattern_whitespace()).
494494

495495
-doc """
496496
Returns a string, where leading or trailing, or both, `Characters` have been

lib/stdlib/src/unicode.erl

Lines changed: 177 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
%%
2222
-module(unicode).
2323
-moduledoc """
24-
Functions for converting Unicode characters.
24+
Functions for converting and classifying Unicode characters.
2525

2626
This module contains functions for converting between different character
2727
representations. It converts between ISO Latin-1 characters and Unicode
@@ -71,9 +71,12 @@ normalization can be found in the
7171
characters_to_nfkc_list/1, characters_to_nfkc_binary/1
7272
]).
7373

74+
-export([is_whitespace/1, is_ID_start/1, is_ID_continue/1, category/1]).
75+
7476
-export_type([chardata/0, charlist/0, encoding/0, external_chardata/0,
7577
external_charlist/0, latin1_char/0, latin1_chardata/0,
76-
latin1_charlist/0, latin1_binary/0, unicode_binary/0]).
78+
latin1_charlist/0, latin1_binary/0, unicode_binary/0,
79+
category/0]).
7780

7881
-type encoding() :: 'latin1' | 'unicode' | 'utf8'
7982
| 'utf16' | {'utf16', endian()}
@@ -108,6 +111,38 @@ than UTF-8 (that is, UTF-16 or UTF-32).
108111
latin1_binary() |
109112
latin1_charlist(),
110113
latin1_binary() | nil()).
114+
-doc "Character category".
115+
-type category() ::
116+
{letter,uppercase} |
117+
{letter,lowercase} |
118+
{letter,titlecase} |
119+
{letter,modifier} |
120+
{letter,other} |
121+
{mark,non_spacing} |
122+
{mark,spacing_combining} |
123+
{mark,enclosing} |
124+
{number,decimal} |
125+
{number,letter} |
126+
{number,other} |
127+
{separator,space} |
128+
{separator,line} |
129+
{separator,paragraph} |
130+
{other,control} |
131+
{other,format} |
132+
{other,surrogate} |
133+
{other,private} |
134+
{other,not_assigned} |
135+
{punctuation,connector} |
136+
{punctuation,dash} |
137+
{punctuation,open} |
138+
{punctuation,close} |
139+
{punctuation,initial} |
140+
{punctuation,final} |
141+
{punctuation,other} |
142+
{symbol,math} |
143+
{symbol,currency} |
144+
{symbol,modifier} |
145+
{symbol,other}.
111146

112147
%% We must inline these functions so that the stacktrace points to
113148
%% the correct function.
@@ -122,6 +157,8 @@ than UTF-8 (that is, UTF-16 or UTF-32).
122157

123158
-export([bin_is_7bit/1, characters_to_binary/2, characters_to_list/2]).
124159

160+
-define(IS_CP(CP), is_integer(CP, 0, 16#10FFFF)).
161+
125162
-doc false.
126163
-spec bin_is_7bit(Binary) -> boolean() when
127164
Binary :: binary().
@@ -681,13 +718,149 @@ characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 ->
681718
characters_to_nfkc_binary(CD, _, Row, Acc) ->
682719
characters_to_nfkc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
683720

721+
-doc """
722+
Returns true if `Char` is a whitespace.
723+
724+
Whitespace is defined in
725+
[Unicode Standard Annex #44](http://unicode.org/reports/tr44/).
726+
727+
```erlang
728+
1> unicode:is_whitespace($\s).
729+
true
730+
2> unicode:is_whitespace($😊).
731+
false
732+
```
733+
""".
734+
-doc(#{since => ~"@OTP-19858@"}).
735+
-spec is_whitespace(char()) -> boolean().
736+
is_whitespace(X)
737+
when X =:= 9; X =:= 10; X =:= 11; X =:= 12; X =:= 13; X =:= 32;
738+
X =:= 133; X =:= 160 ->
739+
true;
740+
is_whitespace(Char) when is_integer(Char, 0, 5000) -> %% Arbitrary limit without whitespace
741+
false;
742+
is_whitespace(Char) when ?IS_CP(Char) ->
743+
unicode_util:is_whitespace(Char);
744+
is_whitespace(Term) ->
745+
badarg_with_info([Term]).
746+
747+
748+
-doc """
749+
Returns true if `Char` is an identifier start.
750+
751+
Identifier start is defined by the ID_Start property in
752+
[Unicode Standard Annex #31](http://unicode.org/reports/tr31/).
753+
754+
```erlang
755+
1> unicode:is_ID_start($a).
756+
true
757+
2> unicode:is_ID_start($_).
758+
false
759+
3> unicode:is_ID_start($-).
760+
false
761+
```
762+
""".
763+
-doc(#{since => ~"@OTP-19858@"}).
764+
-spec is_ID_start(char()) -> boolean().
765+
is_ID_start(X) %% ASCII optimizations
766+
when X =:= 65; X =:= 66; X =:= 67; X =:= 68; X =:= 69; X =:= 70; X =:= 71;
767+
X =:= 72; X =:= 73; X =:= 74; X =:= 75; X =:= 76; X =:= 77; X =:= 78;
768+
X =:= 79; X =:= 80; X =:= 81; X =:= 82; X =:= 83; X =:= 84; X =:= 85;
769+
X =:= 86; X =:= 87; X =:= 88; X =:= 89; X =:= 90; X =:= 97; X =:= 98;
770+
X =:= 99; X =:= 100; X =:= 101; X =:= 102; X =:= 103; X =:= 104; X =:= 105;
771+
X =:= 106; X =:= 107; X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112;
772+
X =:= 113; X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119;
773+
X =:= 120; X =:= 121; X =:= 122 ->
774+
true;
775+
is_ID_start(Char) when is_integer(Char, 0, 127) ->
776+
false;
777+
is_ID_start(Char) when ?IS_CP(Char) ->
778+
case unicode_util:category(Char) of
779+
{number,letter} -> true;
780+
{letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char);
781+
{letter,_} -> true;
782+
{_,_} -> unicode_util:is_other_id_start(Char)
783+
end;
784+
is_ID_start(Term) ->
785+
badarg_with_info([Term]).
786+
787+
788+
-doc """
789+
Returns true if `Char` is an identifier continuation.
790+
791+
Identifier continuation is defined by the ID_Continue property in
792+
[Unicode Standard Annex #31](http://unicode.org/reports/tr31/).
793+
794+
```erlang
795+
1> unicode:is_ID_continue($a).
796+
true
797+
2> unicode:is_ID_continue($_).
798+
true
799+
3> unicode:is_ID_continue($-).
800+
false
801+
```
802+
""".
803+
-doc(#{since => ~"@OTP-19858@"}).
804+
-spec is_ID_continue(char()) -> boolean().
805+
is_ID_continue(X)
806+
when X =:= 48; X =:= 49; X =:= 50; X =:= 51; X =:= 52; X =:= 53; X =:= 54;
807+
X =:= 55; X =:= 56; X =:= 57; X =:= 65; X =:= 66; X =:= 67; X =:= 68;
808+
X =:= 69; X =:= 70; X =:= 71; X =:= 72; X =:= 73; X =:= 74; X =:= 75;
809+
X =:= 76; X =:= 77; X =:= 78; X =:= 79; X =:= 80; X =:= 81; X =:= 82;
810+
X =:= 83; X =:= 84; X =:= 85; X =:= 86; X =:= 87; X =:= 88; X =:= 89;
811+
X =:= 90; X =:= 95; X =:= 97; X =:= 98; X =:= 99; X =:= 100; X =:= 101;
812+
X =:= 102; X =:= 103; X =:= 104; X =:= 105; X =:= 106; X =:= 107;
813+
X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112; X =:= 113;
814+
X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119;
815+
X =:= 120; X =:= 121; X =:= 122 ->
816+
true;
817+
is_ID_continue(Char) when is_integer(Char, 0, 127) ->
818+
false;
819+
is_ID_continue(Char) when ?IS_CP(Char) ->
820+
case unicode_util:category(Char) of
821+
{punctuation, connector} -> true;
822+
{mark,non_spacing} -> true;
823+
{mark,spacing_combining} -> true;
824+
{number,other} -> unicode_util:is_other_id_continue(Char);
825+
{number,_} -> true;
826+
{letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char);
827+
{letter,_} -> true;
828+
{_,_} -> unicode_util:is_other_id_start(Char) orelse
829+
unicode_util:is_other_id_continue(Char)
830+
end;
831+
is_ID_continue(Term) ->
832+
badarg_with_info([Term]).
833+
834+
-doc """
835+
Returns the `Char` category.
836+
837+
```erlang
838+
1> unicode:category($a).
839+
{letter,lowercase}
840+
2> unicode:category($Ä).
841+
{letter,uppercase}
842+
3> unicode:category($😊).
843+
{symbol,other}
844+
3> unicode:category($€).
845+
{symbol,currency}
846+
3> unicode:category($[).
847+
{punctuation,open}
848+
```
849+
""".
850+
-doc(#{since => ~"@OTP-19858@"}).
851+
-spec category(char()) -> category().
852+
category(Char) when ?IS_CP(Char) ->
853+
unicode_util:category(Char);
854+
category(Term) ->
855+
badarg_with_info([Term]).
856+
857+
%% internals
858+
684859
acc_to_binary(Acc) ->
685860
list_to_binary(lists:reverse(Acc)).
686861
prepend_row_to_acc(Row, Acc) ->
687862
[characters_to_binary(lists:reverse(Row))|Acc].
688863

689-
%% internals
690-
691864
-doc false.
692865
characters_to_list_int(ML, Encoding) ->
693866
try

lib/stdlib/test/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,9 @@ release_tests_spec: make_emakefile
197197
$(ERL_FILES) $(COVERFILE) $(EXTRA_FILES) "$(RELSYSDIR)"
198198
chmod -R u+w "$(RELSYSDIR)"
199199
@tar cf - *_SUITE_data property_test | (cd "$(RELSYSDIR)"; tar xf -)
200+
$(INSTALL_DIR) "$(RELSYSDIR)/unicode_SUITE_data"
201+
$(INSTALL_DATA) ../uc_spec/PropList.txt ../uc_spec/DerivedCoreProperties.txt \
202+
"$(RELSYSDIR)/unicode_SUITE_data"
200203
$(INSTALL_DIR) "$(RELSYSDIR)/stdlib_SUITE_data"
201204
$(INSTALL_DATA) $(ERL_TOP)/make/otp_version_tickets "$(RELSYSDIR)/stdlib_SUITE_data"
202205

0 commit comments

Comments
 (0)