2121% %
2222-module (unicode ).
2323-moduledoc """
24- Functions for converting Unicode characters.
24+ Functions for converting and classifying Unicode characters.
2525
2626This module contains functions for converting between different character
2727representations. It converts between ISO Latin-1 characters and Unicode
@@ -71,9 +71,12 @@ normalization can be found in the
7171 characters_to_nfkc_list /1 , characters_to_nfkc_binary /1
7272 ]).
7373
74+ -export ([is_whitespace /1 , is_ID_start /1 , is_ID_continue /1 , category /1 ]).
75+
7476-export_type ([chardata / 0 , charlist / 0 , encoding / 0 , external_chardata / 0 ,
7577 external_charlist / 0 , latin1_char / 0 , latin1_chardata / 0 ,
76- latin1_charlist / 0 , latin1_binary / 0 , unicode_binary / 0 ]).
78+ latin1_charlist / 0 , latin1_binary / 0 , unicode_binary / 0 ,
79+ category / 0 ]).
7780
7881-type encoding () :: 'latin1' | 'unicode' | 'utf8'
7982 | 'utf16' | {'utf16' , endian ()}
@@ -108,6 +111,38 @@ than UTF-8 (that is, UTF-16 or UTF-32).
108111 latin1_binary () |
109112 latin1_charlist (),
110113 latin1_binary () | nil ()).
114+ -doc " Character category" .
115+ -type category () ::
116+ {letter ,uppercase } |
117+ {letter ,lowercase } |
118+ {letter ,titlecase } |
119+ {letter ,modifier } |
120+ {letter ,other } |
121+ {mark ,non_spacing } |
122+ {mark ,spacing_combining } |
123+ {mark ,enclosing } |
124+ {number ,decimal } |
125+ {number ,letter } |
126+ {number ,other } |
127+ {separator ,space } |
128+ {separator ,line } |
129+ {separator ,paragraph } |
130+ {other ,control } |
131+ {other ,format } |
132+ {other ,surrogate } |
133+ {other ,private } |
134+ {other ,not_assigned } |
135+ {punctuation ,connector } |
136+ {punctuation ,dash } |
137+ {punctuation ,open } |
138+ {punctuation ,close } |
139+ {punctuation ,initial } |
140+ {punctuation ,final } |
141+ {punctuation ,other } |
142+ {symbol ,math } |
143+ {symbol ,currency } |
144+ {symbol ,modifier } |
145+ {symbol ,other }.
111146
112147% % We must inline these functions so that the stacktrace points to
113148% % the correct function.
@@ -122,6 +157,8 @@ than UTF-8 (that is, UTF-16 or UTF-32).
122157
123158-export ([bin_is_7bit /1 , characters_to_binary /2 , characters_to_list /2 ]).
124159
160+ -define (IS_CP (CP ), is_integer (CP , 0 , 16#10FFFF )).
161+
125162-doc false .
126163-spec bin_is_7bit (Binary ) -> boolean () when
127164 Binary :: binary ().
@@ -681,13 +718,149 @@ characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 ->
681718characters_to_nfkc_binary (CD , _ , Row , Acc ) ->
682719 characters_to_nfkc_binary (CD , ? GC_N , [], prepend_row_to_acc (Row , Acc )).
683720
721+ -doc """
722+ Returns true if `Char` is a whitespace.
723+
724+ Whitespace is defined in
725+ [Unicode Standard Annex #44](http://unicode.org/reports/tr44/).
726+
727+ ```erlang
728+ 1> unicode:is_whitespace($\s).
729+ true
730+ 2> unicode:is_whitespace($😊).
731+ false
732+ ```
733+ """ .
734+ -doc (#{since => ~ " @OTP-19858@" }).
735+ -spec is_whitespace (char ()) -> boolean ().
736+ is_whitespace (X )
737+ when X =:= 9 ; X =:= 10 ; X =:= 11 ; X =:= 12 ; X =:= 13 ; X =:= 32 ;
738+ X =:= 133 ; X =:= 160 ->
739+ true ;
740+ is_whitespace (Char ) when is_integer (Char , 0 , 5000 ) -> % % Arbitrary limit without whitespace
741+ false ;
742+ is_whitespace (Char ) when ? IS_CP (Char ) ->
743+ unicode_util :is_whitespace (Char );
744+ is_whitespace (Term ) ->
745+ badarg_with_info ([Term ]).
746+
747+
748+ -doc """
749+ Returns true if `Char` is an identifier start.
750+
751+ Identifier start is defined by the ID_Start property in
752+ [Unicode Standard Annex #31](http://unicode.org/reports/tr31/).
753+
754+ ```erlang
755+ 1> unicode:is_ID_start($a).
756+ true
757+ 2> unicode:is_ID_start($_).
758+ false
759+ 3> unicode:is_ID_start($-).
760+ false
761+ ```
762+ """ .
763+ -doc (#{since => ~ " @OTP-19858@" }).
764+ -spec is_ID_start (char ()) -> boolean ().
765+ is_ID_start (X ) % % ASCII optimizations
766+ when X =:= 65 ; X =:= 66 ; X =:= 67 ; X =:= 68 ; X =:= 69 ; X =:= 70 ; X =:= 71 ;
767+ X =:= 72 ; X =:= 73 ; X =:= 74 ; X =:= 75 ; X =:= 76 ; X =:= 77 ; X =:= 78 ;
768+ X =:= 79 ; X =:= 80 ; X =:= 81 ; X =:= 82 ; X =:= 83 ; X =:= 84 ; X =:= 85 ;
769+ X =:= 86 ; X =:= 87 ; X =:= 88 ; X =:= 89 ; X =:= 90 ; X =:= 97 ; X =:= 98 ;
770+ X =:= 99 ; X =:= 100 ; X =:= 101 ; X =:= 102 ; X =:= 103 ; X =:= 104 ; X =:= 105 ;
771+ X =:= 106 ; X =:= 107 ; X =:= 108 ; X =:= 109 ; X =:= 110 ; X =:= 111 ; X =:= 112 ;
772+ X =:= 113 ; X =:= 114 ; X =:= 115 ; X =:= 116 ; X =:= 117 ; X =:= 118 ; X =:= 119 ;
773+ X =:= 120 ; X =:= 121 ; X =:= 122 ->
774+ true ;
775+ is_ID_start (Char ) when is_integer (Char , 0 , 127 ) ->
776+ false ;
777+ is_ID_start (Char ) when ? IS_CP (Char ) ->
778+ case unicode_util :category (Char ) of
779+ {number ,letter } -> true ;
780+ {letter ,modifier } -> unicode_util :is_letter_not_pattern_syntax (Char );
781+ {letter ,_ } -> true ;
782+ {_ ,_ } -> unicode_util :is_other_id_start (Char )
783+ end ;
784+ is_ID_start (Term ) ->
785+ badarg_with_info ([Term ]).
786+
787+
788+ -doc """
789+ Returns true if `Char` is an identifier continuation.
790+
791+ Identifier continuation is defined by the ID_Continue property in
792+ [Unicode Standard Annex #31](http://unicode.org/reports/tr31/).
793+
794+ ```erlang
795+ 1> unicode:is_ID_continue($a).
796+ true
797+ 2> unicode:is_ID_continue($_).
798+ true
799+ 3> unicode:is_ID_continue($-).
800+ false
801+ ```
802+ """ .
803+ -doc (#{since => ~ " @OTP-19858@" }).
804+ -spec is_ID_continue (char ()) -> boolean ().
805+ is_ID_continue (X )
806+ when X =:= 48 ; X =:= 49 ; X =:= 50 ; X =:= 51 ; X =:= 52 ; X =:= 53 ; X =:= 54 ;
807+ X =:= 55 ; X =:= 56 ; X =:= 57 ; X =:= 65 ; X =:= 66 ; X =:= 67 ; X =:= 68 ;
808+ X =:= 69 ; X =:= 70 ; X =:= 71 ; X =:= 72 ; X =:= 73 ; X =:= 74 ; X =:= 75 ;
809+ X =:= 76 ; X =:= 77 ; X =:= 78 ; X =:= 79 ; X =:= 80 ; X =:= 81 ; X =:= 82 ;
810+ X =:= 83 ; X =:= 84 ; X =:= 85 ; X =:= 86 ; X =:= 87 ; X =:= 88 ; X =:= 89 ;
811+ X =:= 90 ; X =:= 95 ; X =:= 97 ; X =:= 98 ; X =:= 99 ; X =:= 100 ; X =:= 101 ;
812+ X =:= 102 ; X =:= 103 ; X =:= 104 ; X =:= 105 ; X =:= 106 ; X =:= 107 ;
813+ X =:= 108 ; X =:= 109 ; X =:= 110 ; X =:= 111 ; X =:= 112 ; X =:= 113 ;
814+ X =:= 114 ; X =:= 115 ; X =:= 116 ; X =:= 117 ; X =:= 118 ; X =:= 119 ;
815+ X =:= 120 ; X =:= 121 ; X =:= 122 ->
816+ true ;
817+ is_ID_continue (Char ) when is_integer (Char , 0 , 127 ) ->
818+ false ;
819+ is_ID_continue (Char ) when ? IS_CP (Char ) ->
820+ case unicode_util :category (Char ) of
821+ {punctuation , connector } -> true ;
822+ {mark ,non_spacing } -> true ;
823+ {mark ,spacing_combining } -> true ;
824+ {number ,other } -> unicode_util :is_other_id_continue (Char );
825+ {number ,_ } -> true ;
826+ {letter ,modifier } -> unicode_util :is_letter_not_pattern_syntax (Char );
827+ {letter ,_ } -> true ;
828+ {_ ,_ } -> unicode_util :is_other_id_start (Char ) orelse
829+ unicode_util :is_other_id_continue (Char )
830+ end ;
831+ is_ID_continue (Term ) ->
832+ badarg_with_info ([Term ]).
833+
834+ -doc """
835+ Returns the `Char` category.
836+
837+ ```erlang
838+ 1> unicode:category($a).
839+ {letter,lowercase}
840+ 2> unicode:category($Ä).
841+ {letter,uppercase}
842+ 3> unicode:category($😊).
843+ {symbol,other}
844+ 3> unicode:category($€).
845+ {symbol,currency}
846+ 3> unicode:category($[).
847+ {punctuation,open}
848+ ```
849+ """ .
850+ -doc (#{since => ~ " @OTP-19858@" }).
851+ -spec category (char ()) -> category ().
852+ category (Char ) when ? IS_CP (Char ) ->
853+ unicode_util :category (Char );
854+ category (Term ) ->
855+ badarg_with_info ([Term ]).
856+
857+ % % internals
858+
684859acc_to_binary (Acc ) ->
685860 list_to_binary (lists :reverse (Acc )).
686861prepend_row_to_acc (Row , Acc ) ->
687862 [characters_to_binary (lists :reverse (Row ))|Acc ].
688863
689- % % internals
690-
691864-doc false .
692865characters_to_list_int (ML , Encoding ) ->
693866 try
0 commit comments