9
9
import os
10
10
import subprocess
11
11
12
# Exclusive upper bound used when iterating over codepoint values
# (0x10FFFF + 1 == 0x110000).
NUM_CODEPOINTS = 0x10FFFF + 1
13
+
13
14
14
15
def to_ranges (iter ):
15
16
current = None
@@ -23,19 +24,25 @@ def to_ranges(iter):
23
24
if current is not None :
24
25
yield tuple (current )
25
26
27
+
26
28
def get_escaped(codepoints):
    """Yield the values of the codepoints that fall in an escaped category.

    Args:
        codepoints: iterable of objects exposing ``value`` (int) and
            ``class_`` (category string, or a falsy value meaning
            "unassigned", which is treated as ``"Cn"``).

    Yields:
        ``c.value`` for every codepoint whose category is one of
        Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs, except for the ASCII space.
    """
    # Built once per call rather than re-splitting the string for every
    # codepoint; a set also gives constant-time membership tests.
    escaped_classes = frozenset("Cc Cf Cs Co Cn Zl Zp Zs".split())
    # The plain space is in Zs but is deliberately never reported.
    space = ord(" ")
    for c in codepoints:
        if (c.class_ or "Cn") in escaped_classes and c.value != space:
            yield c.value
30
34
35
+
31
36
def get_file(f):
    """Open the local copy of `f`, downloading it with curl if it is missing.

    Args:
        f: URL of the file. Its final path component is used as the local
            file name, looked up in the current working directory.

    Returns:
        A text-mode file object opened for reading. The caller is
        responsible for closing it.

    Raises:
        subprocess.CalledProcessError: if curl exits with a non-zero status.
        FileNotFoundError: if the file is still missing after the download.
    """
    local_name = os.path.basename(f)
    try:
        # Explicit encoding so reading does not depend on the platform locale.
        return open(local_name, encoding="utf-8")
    except FileNotFoundError:
        # --fail makes curl exit non-zero on an HTTP error instead of saving
        # the server's error page under the target name, where it would be
        # picked up as if it were the real data on this and later runs.
        subprocess.run(["curl", "--fail", "-O", f], check=True)
        return open(local_name, encoding="utf-8")
37
42
38
# Lightweight record for one codepoint: `value` holds the codepoint value and
# `class_` its category (the trailing underscore avoids the `class` keyword).
Codepoint = namedtuple(typename="Codepoint", field_names="value class_")
45
+
39
46
40
47
def get_codepoints (f ):
41
48
r = csv .reader (f , delimiter = ";" )
@@ -66,13 +73,14 @@ def get_codepoints(f):
66
73
for c in range (prev_codepoint + 1 , NUM_CODEPOINTS ):
67
74
yield Codepoint (c , None )
68
75
76
+
69
77
def compress_singletons (singletons ):
70
- uppers = [] # (upper, # items in lowers)
78
+ uppers = [] # (upper, # items in lowers)
71
79
lowers = []
72
80
73
81
for i in singletons :
74
82
upper = i >> 8
75
- lower = i & 0xff
83
+ lower = i & 0xFF
76
84
if len (uppers ) == 0 or uppers [- 1 ][0 ] != upper :
77
85
uppers .append ((upper , 1 ))
78
86
else :
@@ -82,10 +90,11 @@ def compress_singletons(singletons):
82
90
83
91
return uppers , lowers
84
92
93
+
85
94
def compress_normal (normal ):
86
95
# lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
87
96
# lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
88
- compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
97
+ compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
89
98
90
99
prev_start = 0
91
100
for start , count in normal :
@@ -95,21 +104,22 @@ def compress_normal(normal):
95
104
96
105
assert truelen < 0x8000 and falselen < 0x8000
97
106
entry = []
98
- if truelen > 0x7f :
107
+ if truelen > 0x7F :
99
108
entry .append (0x80 | (truelen >> 8 ))
100
- entry .append (truelen & 0xff )
109
+ entry .append (truelen & 0xFF )
101
110
else :
102
- entry .append (truelen & 0x7f )
103
- if falselen > 0x7f :
111
+ entry .append (truelen & 0x7F )
112
+ if falselen > 0x7F :
104
113
entry .append (0x80 | (falselen >> 8 ))
105
- entry .append (falselen & 0xff )
114
+ entry .append (falselen & 0xFF )
106
115
else :
107
- entry .append (falselen & 0x7f )
116
+ entry .append (falselen & 0x7F )
108
117
109
118
compressed .append (entry )
110
119
111
120
return compressed
112
121
122
+
113
123
def print_singletons (uppers , lowers , uppersname , lowersname ):
114
124
print ("#[rustfmt::skip]" )
115
125
print ("const {}: &[(u8, u8)] = &[" .format (uppersname ))
@@ -119,22 +129,26 @@ def print_singletons(uppers, lowers, uppersname, lowersname):
119
129
print ("#[rustfmt::skip]" )
120
130
print ("const {}: &[u8] = &[" .format (lowersname ))
121
131
for i in range (0 , len (lowers ), 8 ):
122
- print (" {}" .format (" " .join ("{:#04x}," .format (x ) for x in lowers [i :i + 8 ])))
132
+ print (
133
+ " {}" .format (" " .join ("{:#04x}," .format (x ) for x in lowers [i : i + 8 ]))
134
+ )
123
135
print ("];" )
124
136
137
+
125
138
def print_normal(normal, normalname):
    """Print `normal` to stdout as a Rust `&[u8]` constant.

    Args:
        normal: iterable of entries, each an iterable of integers. Every
            entry becomes one output line of comma-terminated hex literals.
        normalname: identifier to use for the emitted constant.
    """
    print("#[rustfmt::skip]")
    declaration = "const {}: &[u8] = &[".format(normalname)
    print(declaration)
    for entry in normal:
        # "{:#04x}" renders each value as 0x-prefixed, zero-padded hex.
        cells = ["{:#04x},".format(byte) for byte in entry]
        row = " ".join(cells)
        print(" {}".format(row))
    print("];")
131
144
145
+
132
146
def main ():
133
147
file = get_file ("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt" )
134
148
135
149
codepoints = get_codepoints (file )
136
150
137
- CUTOFF = 0x10000
151
+ CUTOFF = 0x10000
138
152
singletons0 = []
139
153
singletons1 = []
140
154
normal0 = []
@@ -234,10 +248,11 @@ def main():
234
248
}\
235
249
""" )
236
250
print ()
237
- print_singletons (singletons0u , singletons0l , 'SINGLETONS0U' , 'SINGLETONS0L' )
238
- print_singletons (singletons1u , singletons1l , 'SINGLETONS1U' , 'SINGLETONS1L' )
239
- print_normal (normal0 , 'NORMAL0' )
240
- print_normal (normal1 , 'NORMAL1' )
251
+ print_singletons (singletons0u , singletons0l , "SINGLETONS0U" , "SINGLETONS0L" )
252
+ print_singletons (singletons1u , singletons1l , "SINGLETONS1U" , "SINGLETONS1L" )
253
+ print_normal (normal0 , "NORMAL0" )
254
+ print_normal (normal1 , "NORMAL1" )
255
+
241
256
242
# Run the generator only when executed as a script, not when imported.
# The compared literal must be exactly "__main__" (no surrounding whitespace),
# otherwise the condition is never true and main() is never called.
if __name__ == "__main__":
    main()
0 commit comments