Skip to content

Commit 43cd99b

Browse files
authored
Unicode: Update to UCD 16 (PCRE2Project#503)
Updates the Unicode data files to Unicode 16, adjusts the tests, and adapts the scripts used to parse the UCD to minor formatting differences in UCD 16. `GenerateTest26.py` and `GenerateCommon.py` each use a regexp to extract properties from the `ScriptExtensions.txt` file. Previously, all property lines had one space after the space-separated list of scripts. In UCD 16, this list is right-padded with spaces, which throws off the parser. This commit adjusts the regexps to ignore the padding spaces.
1 parent 32f03ad commit 43cd99b

24 files changed

+14220
-6457
lines changed

maint/GenerateCommon.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def reorder_scripts():
232232

233233
extended_script_abbrevs = set()
234234
with open("Unicode.tables/ScriptExtensions.txt") as f:
235-
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
235+
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+[A-Za-z]) +#')
236236

237237
for line in f:
238238
match_obj = names_re.match(line)

maint/GenerateTest26.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def gen_script_tests():
5555
script_data = [None] * len(script_names)
5656
char_data = [None] * 0x110000
5757

58-
property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
58+
property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+[A-Za-z]) +#")
5959
prev_name = ""
6060
script_idx = -1
6161

maint/Unicode.tables/BidiMirroring.txt

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
# BidiMirroring-15.0.0.txt
2-
# Date: 2022-05-03, 18:47:00 GMT [KW, RP]
3-
# © 2022 Unicode®, Inc.
4-
# For terms of use, see https://www.unicode.org/terms_of_use.html
1+
# BidiMirroring-16.0.0.txt
2+
# Date: 2024-01-30
3+
# © 2024 Unicode®, Inc.
4+
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
5+
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
56
#
67
# Unicode Character Database
78
# For documentation, see https://www.unicode.org/reports/tr44/
@@ -15,7 +16,7 @@
1516
# value, for which there is another Unicode character that typically has a glyph
1617
# that is the mirror image of the original character's glyph.
1718
#
18-
# The repertoire covered by the file is Unicode 15.0.0.
19+
# The repertoire covered by the file is Unicode 16.0.0.
1920
#
2021
# The file contains a list of lines with mappings from one code point
2122
# to another one for character-based mirroring.
@@ -44,7 +45,8 @@
4445
#
4546
# This file was originally created by Markus Scherer.
4647
# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler,
47-
# and for subsequent versions by Ken Whistler, Laurentiu Iancu, and Roozbeh Pournader.
48+
# and for subsequent versions by Ken Whistler, Laurentiu Iancu, Roozbeh Pournader,
49+
# and Robin Leroy.
4850
#
4951
# Historical and Compatibility Information:
5052
#
@@ -542,6 +544,7 @@ FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
542544
# 225F; QUESTIONED EQUAL TO
543545
# 2260; NOT EQUAL TO
544546
# 2262; NOT IDENTICAL TO
547+
# 226D; NOT EQUIVALENT TO
545548
# 228C; MULTISET
546549
# 22A7; MODELS
547550
# 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE

maint/Unicode.tables/CaseFolding.txt

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
# CaseFolding-15.0.0.txt
2-
# Date: 2022-02-02, 23:35:35 GMT
3-
# © 2022 Unicode®, Inc.
1+
# CaseFolding-16.0.0.txt
2+
# Date: 2024-04-30, 21:48:11 GMT
3+
# © 2024 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
5-
# For terms of use, see https://www.unicode.org/terms_of_use.html
5+
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
66
#
77
# Unicode Character Database
88
# For documentation, see https://www.unicode.org/reports/tr44/
@@ -603,6 +603,7 @@
603603
1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN
604604
1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT
605605
1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK
606+
1C89; C; 1C8A; # CYRILLIC CAPITAL LETTER TJE
606607
1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN
607608
1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN
608609
1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN
@@ -929,6 +930,7 @@
929930
1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
930931
1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
931932
1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
933+
1FD3; S; 0390; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
932934
1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
933935
1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
934936
1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY
@@ -937,6 +939,7 @@
937939
1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA
938940
1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
939941
1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
942+
1FE3; S; 03B0; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
940943
1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI
941944
1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
942945
1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
@@ -1238,9 +1241,13 @@ A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK
12381241
A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK
12391242
A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
12401243
A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
1244+
A7CB; C; 0264; # LATIN CAPITAL LETTER RAMS HORN
1245+
A7CC; C; A7CD; # LATIN CAPITAL LETTER S WITH DIAGONAL STROKE
12411246
A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G
12421247
A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S
12431248
A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S
1249+
A7DA; C; A7DB; # LATIN CAPITAL LETTER LAMBDA
1250+
A7DC; C; 019B; # LATIN CAPITAL LETTER LAMBDA WITH STROKE
12441251
A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H
12451252
AB70; C; 13A0; # CHEROKEE SMALL LETTER A
12461253
AB71; C; 13A1; # CHEROKEE SMALL LETTER E
@@ -1328,6 +1335,7 @@ FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL
13281335
FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI
13291336
FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL
13301337
FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T
1338+
FB05; S; FB06; # LATIN SMALL LIGATURE LONG S T
13311339
FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST
13321340
FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW
13331341
FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH
@@ -1522,6 +1530,28 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
15221530
10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS
15231531
10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN
15241532
10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US
1533+
10D50; C; 10D70; # GARAY CAPITAL LETTER A
1534+
10D51; C; 10D71; # GARAY CAPITAL LETTER CA
1535+
10D52; C; 10D72; # GARAY CAPITAL LETTER MA
1536+
10D53; C; 10D73; # GARAY CAPITAL LETTER KA
1537+
10D54; C; 10D74; # GARAY CAPITAL LETTER BA
1538+
10D55; C; 10D75; # GARAY CAPITAL LETTER JA
1539+
10D56; C; 10D76; # GARAY CAPITAL LETTER SA
1540+
10D57; C; 10D77; # GARAY CAPITAL LETTER WA
1541+
10D58; C; 10D78; # GARAY CAPITAL LETTER LA
1542+
10D59; C; 10D79; # GARAY CAPITAL LETTER GA
1543+
10D5A; C; 10D7A; # GARAY CAPITAL LETTER DA
1544+
10D5B; C; 10D7B; # GARAY CAPITAL LETTER XA
1545+
10D5C; C; 10D7C; # GARAY CAPITAL LETTER YA
1546+
10D5D; C; 10D7D; # GARAY CAPITAL LETTER TA
1547+
10D5E; C; 10D7E; # GARAY CAPITAL LETTER RA
1548+
10D5F; C; 10D7F; # GARAY CAPITAL LETTER NYA
1549+
10D60; C; 10D80; # GARAY CAPITAL LETTER FA
1550+
10D61; C; 10D81; # GARAY CAPITAL LETTER NA
1551+
10D62; C; 10D82; # GARAY CAPITAL LETTER PA
1552+
10D63; C; 10D83; # GARAY CAPITAL LETTER HA
1553+
10D64; C; 10D84; # GARAY CAPITAL LETTER OLD KA
1554+
10D65; C; 10D85; # GARAY CAPITAL LETTER OLD NA
15251555
118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA
15261556
118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A
15271557
118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI

0 commit comments

Comments
 (0)