From baaec014a83ea7d06ae30a8125b0a3d033950524 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 17 Mar 2025 12:15:28 -0600 Subject: [PATCH 1/8] mktables: White-space, comment only Add comments, and rewrap comment lines to fit 80 columns --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 16 +++++++++++----- lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 16 insertions(+), 10 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 50dfd4b65bf6..45d8e19b5a61 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index a10ec339bdb1..def5557ee912 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -15179,11 +15179,11 @@ END # Perl tailors the WordBreak property so that \b{wb} doesn't split # adjacent spaces into separate words. Unicode 11.0 moved in that - # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO - # BREAK SPACE as breaking, so we retained the original Perl customization. 
- # To do this, in the Perl copy of WB, simply replace the mappings of - # horizontal space characters that otherwise would map to the default or - # the 11.0 'WSegSpace' to instead map to our tailoring. + # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) + # NO_BREAK SPACE as breaking, so we retained the original Perl + # customization. To do this, in the Perl copy of WB, simply replace the + # mappings of horizontal space characters that otherwise would map to the + # default or the 11.0 'WSegSpace' to instead map to our tailoring. my $perl_wb = property_ref('_Perl_WB'); my $default = $perl_wb->default_map; for my $range ($Blank->ranges) { @@ -19779,12 +19779,18 @@ my @input_file_objects = ( Input_file->new('IdStatus.txt', v13.0.0, Pre_Handler => \&setup_IdStatus, Property => 'Identifier_Status', + + # Part of UTS 39, so must be downloaded separately from + # unicode.org UCD => 0, ), Input_file->new('IdType.txt', v13.0.0, Pre_Handler => \&setup_IdType, Each_Line_Handler => \&filter_IdType_line, Property => 'Identifier_Type', + + # Part of UTS 39, so must be downloaded separately from + # unicode.org UCD => 0, ), Input_file->new('confusables.txt', v15.0.0, diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index fe9034ecda6c..6e10f7c9d678 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables +# 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 
0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 0b7b686598ea..67ab51c0b779 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 38e727c60c90..35f999d83faf 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 
c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index e013651e107a..ea9ac37538cc 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From 9f8e1570c068ff77184a1156577582d730401eae Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 12:48:39 -0600 Subject: [PATCH 2/8] mktables: Handle new property NFKC_Simple_Casefold Unicode 15.1 introduces this new property, which needs the same special handling as plain NFKC_Casefold does. 
--- charclass_invlists.inc | 2 +- lib/unicore/mktables | 9 ++++++--- lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 45d8e19b5a61..2a7774924189 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables + * 403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index def5557ee912..80d10faaab60 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -10049,10 +10049,13 @@ sub finish_property_setup($file) { # file directly (it was documented in 5.12 and 5.14 as being thusly # usable), keep it from being adjusted. (range_size_1 is # used to force the traditional format.) 
- if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { - $nfkc_cf->set_to_output_map($EXTERNAL_MAP); - $nfkc_cf->set_range_size_1(1); + foreach my $property (qw(NFKC_Casefold NFKC_Simple_Casefold)) { + if (defined (my $cf = property_ref($property))) { + $cf->set_to_output_map($EXTERNAL_MAP); + $cf->set_range_size_1(1); + } } + if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { $bmg->set_to_output_map($EXTERNAL_MAP); $bmg->set_range_size_1(1); diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 6e10f7c9d678..adbe957ddfdb 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables +# 403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 67ab51c0b779..6c93c65c4b83 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables + * 
403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 35f999d83faf..c4ad7b23b3bb 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables + * 403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index ea9ac37538cc..4574501ccbfb 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 22d2c7df3fbb5b806dfcc96abf3e68adad24e5af405aaee0af2fbe7df8961fb1 lib/unicore/mktables + * 403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables * 
55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From c522b7e735437f8a8a4154801675a614f598b362 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 12:55:43 -0600 Subject: [PATCH 3/8] mktables: Prepare for new Unicode 15.1 \b{Lb} rules Unicode 15.1 introduces new line breaking rules for Indic languages, via a new property Indic_Conjunct_Break. mktables works in conjunction with regen/mk_invlists.pl to construct tables and DFAs for handling these. This commit prepares mktables to do its part for Unicode versions that have these new rules. --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 40 +++++++++++++++++++++++++++++++++++++ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 45 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 2a7774924189..91f3a49f4d09 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables + * 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 
c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 80d10faaab60..9b5bf268ec45 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -15228,6 +15228,46 @@ END } } + # In Unicode 15.1, the InCB property was added, which causes us to have to + # split GCB into subclasses that match various subclasses of InCB + my $perl_gcb = property_ref('_Perl_GCB'); + my $incb = property_ref('InCB'); + if (defined $perl_gcb && defined $incb) { + + # For each class in GCB ... + foreach my $gcb_table ($perl_gcb->tables) { + my $gcb_name = $gcb_table->name; + + # ... we see if it has any code points that are in the three + # classes of interest in INCB. + foreach my $incb_table ($incb->table('Consonant'), + $incb->table('Extend'), + $incb->table('Linker')) + { + my $intersection = $gcb_table & $incb_table; + + # If the intersection is empty, then nothing need be done. + next unless $intersection->ranges; + + # Likewise if the intersection doesn't subtract anything, + # nothing need be done. + next if $gcb_table->matches_identically_to($intersection); + + # Otherwise, construct a new table consisting of the + # intersection, removing its entries from the existing GCB + # table. The name of the new table is the combination of the + # GCB and InCB table names + my $incb_name = $incb_table->name; + my $combined_name = "${gcb_name}_$incb_name"; + + foreach my $range ($intersection->ranges) { + $perl_gcb->replace_map($range->start, $range->end, + $combined_name); + } + } + } + } + # Create a version of the LineBreak property with the mappings that are # omitted in the default algorithm remapped to what # http://www.unicode.org/reports/tr14 says they should be. 
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index adbe957ddfdb..055076009c8e 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables +# 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 6c93c65c4b83..5d573b8949ee 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables + * 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index c4ad7b23b3bb..899f31625790 100644 --- a/regexp_constants.h +++ 
b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables + * 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index 4574501ccbfb..862be9268c09 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 403d5535830f577f6fd20bd477dd1d91ee6e6dd7eeb37d7550affca96d2281f0 lib/unicore/mktables + * 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From ca2e9b7d10ebf6fe0f31713477438cbd16cc28bc Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 12:54:47 -0600 Subject: [PATCH 4/8] mktables: Ignore missings entries in two files These 
files are changed in 15.1 to have @missings lines, whereas they didn't before. This leads to some warning messages, so turn off looking at them, as we do for a number of other files. --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 2 ++ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 91f3a49f4d09..0a6744330894 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables + * 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 9b5bf268ec45..483bee648a4f 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -19821,6 +19821,7 @@ my @input_file_objects = ( ), Input_file->new('IdStatus.txt', v13.0.0, Pre_Handler => \&setup_IdStatus, + Has_Missings_Defaults => $IGNORED, Property => 'Identifier_Status', # Part of UTS 39, so must be downloaded separately from @@ -19829,6 +19830,7 @@ my @input_file_objects = ( ), Input_file->new('IdType.txt', v13.0.0, Pre_Handler => \&setup_IdType, + Has_Missings_Defaults => $IGNORED, Each_Line_Handler => \&filter_IdType_line, 
Property => 'Identifier_Type', diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 055076009c8e..dc5bfb4c18d1 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables +# 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 5d573b8949ee..d09697d355ac 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables + * 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 899f31625790..c213fb78e9f0 
100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables + * 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index 862be9268c09..920277df2a10 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 036fe608a33647e8f9ccad32b298415afb4524fba02aaa1c5977711c97006900 lib/unicore/mktables + * 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From e7730ad6ebb55d42450ad7bc01c62d2d61e16ba7 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 13:22:13 -0600 Subject: [PATCH 5/8] mktables: Prepare to 
handle 15.1 \b{lb} Quote rules Unicode 15.1 changes the rules for line breaking with regards to Quotation marks. This prepares for that. --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 14 ++++++++++++++ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 19 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 0a6744330894..18682c560683 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables + * 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 483bee648a4f..0440dc8d8552 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -15367,6 +15367,20 @@ END } } } + elsif ($v_version ge 15.1.0 && $value eq standardize('Quotation')) { + + # Unicode 15.1 splits LB=QU initial quotes and final quotes, and + # regular quotes + for my $i ($range->start .. 
$range->end) { + my $gc_val = $gc->value_of($i); + if ($gc_val eq 'Pi') { + $perl_lb->replace_map($i, $i, "Initial_Quote"); + } + elsif ($gc_val eq 'Pf') { + $perl_lb->replace_map($i, $i, "Final_Quote"); + } + } + } } # This property is a modification of the scx property diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index dc5bfb4c18d1..f0a5ec572b0b 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables +# 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index d09697d355ac..a4b63638e4bc 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables + * 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 
0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index c213fb78e9f0..a72c9b58ec53 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables + * 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index 920277df2a10..9b5d23839b8c 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 9582c6075d11a1f57b806d54b9dd17063bbc66f9b19c1439e656e4dc155b13af lib/unicore/mktables + * 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 
regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From b30f4ccdf65da8e349f499c000bb1d5f438864af Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 13:25:28 -0600 Subject: [PATCH 6/8] mktables: Prepare to handle 15.1 \b{lb} dotted circle rule Unicode 15.1 adds new line breaking rules that depend on the dotted circle. This creates a table for that so that mk_invlists.pl doesn't have to have exception code for handling it. --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 5 +++++ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 10 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 18682c560683..384e1c60e18f 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables + * a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 0440dc8d8552..11d7ae3bd0a9 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -15383,6 +15383,11 @@ END } } + # This is an Alphabetic, but it doesn't need to be split off, because no + # current rule 
involving Alphabetics requires not including this. + $perl_lb->replace_map(0x25CC, 0x25CC, "Dotted_Circle") + if $v_version ge 15.1.0; + # This property is a modification of the scx property my $perl_scx = Property->new('_Perl_SCX', Fate => $INTERNAL_ONLY, diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index f0a5ec572b0b..993418f55955 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables +# a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index a4b63638e4bc..ede52e6b2cd5 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables + * a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 
0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index a72c9b58ec53..7a3c85ff0a83 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables + * a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index 9b5d23839b8c..b9cec99e11c5 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 273236becd6f795425739144aded86309011c3604d5b309d3952b55065bd7f3b lib/unicore/mktables + * a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 
regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From 51f594d608d6488866f83cd6c56c8bbb95632115 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 14:31:23 -0600 Subject: [PATCH 7/8] mktables: Handle Unicode 16.0 Unikemet.txt file This is handled by ignoring it for now, and letting mktables know that the properties it contains are empty. This file, new in 16.0, gives extra information about Egyptian Hieroglyphics newly encoded in 16.0. It is intended only for scholars of these ancient symbols. mktables normally handles new properties automatically, but this file is in a completely different format than previous ones, so mktables would have to be adapted to understand that. That might not be too hard, given that mktables has infrastructure to handle other outliers that have come along over the years from Unicode. But, by ignoring this file, we create empty tables which generate errors in other places in perl. These are real bugs that ought to be fixed, and will be before 16.0 is incorporated into blead. And how many Egyptologists are there in the world, much less how many use the latest Perl? So the perldelta will say that 16.0's support doesn't include these, which are mostly provisional anyway.
--- charclass_invlists.inc | 2 +- lib/unicore/mktables | 21 +++++++++++++++++++++ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 384e1c60e18f..b8aabccb4b14 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables + * 803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 11d7ae3bd0a9..43833bf79744 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -871,6 +871,15 @@ push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' if $v_version ge v6.2.0; +# These properties of Egyptian hieroglyphs are not yet handled by Perl. Their +# intended audience is only specialist Egyptologists +push @tables_that_may_be_empty, qw(kEH_Cat kEH_Desc kEH_HG kEH_IFAO + kEH_JSesh + kEH_NoMirror kEH_NoMirror=Yes + kEH_NoMirror=No + kEH_NoRotate kEH_NoRotate=Yes) + if $v_version ge v16.0.0; + # The lists below are hashes, so the key is the item in the list, and the # value is the reason why it is in the list. 
This makes generation of # documentation easier. @@ -19869,6 +19878,18 @@ my @input_file_objects = ( Skip => $Unused_Skip, UCD => 0, ), + Input_file->new('Unikemet.txt', v16.0.0, + # For Egyptian Hieroglyphs; is in an alien format to the + # other files Unicode furnishes. + Skip => $Unused_Skip, + UCD => 0, + ), + Input_file->new('DoNotEmit.txt', v16.0.0, + # Advice about characters that are unwise to create; not + # any properties, though we could create some. + Skip => $Unused_Skip, + UCD => 0, + ), ); # End of all the preliminaries. diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 993418f55955..55335779aee6 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables +# 803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index ede52e6b2cd5..d53e111f7649 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 
a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables + * 803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 7a3c85ff0a83..7b9f8f6843e9 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables + * 803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index b9cec99e11c5..aeff12113f19 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * a1534d11ec6bef256d1829e9fd74c2202ed7f1475ae3c7659d2566e92c7e0f02 lib/unicore/mktables + * 
803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From de01c61cc3f76721f9167da99219f49dac1ee271 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 13:50:26 -0600 Subject: [PATCH 8/8] mktables: Support new Unicode 16.0 properties ID_Compat_Math_foo These new properties are automatically handled, but there is a problem. They have no short form names. Files are written for them based on their names, and those files are not distinguishable on a DOS 8.3 file system. The solution here is to manually override the automatically generated file names with distinguishable ones. --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 14 +++++++++++++- lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 18 insertions(+), 6 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index b8aabccb4b14..4a3866dafcdf 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables + * 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 
0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 43833bf79744..75a3e201320e 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -8820,7 +8820,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # filesystem to distinguish between, this is used to manually give short # names for the directory name immediately under $match_tables that the # match tables for this property should be placed in. - main::set_access('match_subdir', \%match_subdir, 'r'); + main::set_access('match_subdir', \%match_subdir, 'r', 's'); my %has_dependency; # A boolean that gives whether some table somewhere is defined as the @@ -10072,6 +10072,14 @@ sub finish_property_setup($file) { property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); + # These two properties have no short names and the file names for them + # clash in DOS 8.3. Work around this by creating shorter file names that + # work + my $IDCMStart = property_ref("ID_Compat_Math_Start"); + $IDCMStart->set_match_subdir("IDCMStart") if defined $IDCMStart; + my $IDCMCont= property_ref("ID_Compat_Math_Continue"); + $IDCMCont->set_match_subdir("IDCMContinue") if defined $IDCMCont; + # The rest of this sub is for properties that need the Multi_Default class # to create objects for defaults. As of v15.0, this is no longer needed. @@ -13748,6 +13756,10 @@ END next if $range->start == 0x1D7CE; # This whole range was added in 3.1 next if $range->end == 0x19DA && $v_version eq v5.2.0; next if $range->end - $range->start < 9 && $v_version le 4.0.0; + + # 2 sequential series of 10 each were added in 16.0 + next if $range->start == 0x116D0 && $range->end == 0x116E3; + Carp::my_carp("Range $range unexpectedly doesn't contain 10" . " decimal digits. Code in regcomp.c assumes it does," . 
" and will have to be fixed. Proceeding anyway."); diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 55335779aee6..e2b540304542 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables +# 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index d53e111f7649..80a19bbb870d 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables + * 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 
7b9f8f6843e9..0ae73255508d 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables + * 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index aeff12113f19..a2ed6d97d10c 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 803999c789a4780e22aa278e3dfa95162e48a0d329aa993cda26b99116672651 lib/unicore/mktables + * 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl