diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 50dfd4b65bf6..4a3866dafcdf 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index a10ec339bdb1..75a3e201320e 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -871,6 +871,15 @@ push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' if $v_version ge v6.2.0; +# These properties of Egyptian hieroglyphs are not yet handled by Perl. Their +# intended audience is only specialist Egyptologists +push @tables_that_may_be_empty, qw(kEH_Cat kEH_Desc kEH_HG kEH_IFAO + kEH_JSesh + kEH_NoMirror kEH_NoMirror=Yes + kEH_NoMirror=No + kEH_NoRotate kEH_NoRotate=Yes) + if $v_version ge v16.0.0; + # The lists below are hashes, so the key is the item in the list, and the # value is the reason why it is in the list. This makes generation of # documentation easier. 
@@ -8811,7 +8820,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # filesystem to distinguish between, this is used to manually give short # names for the directory name immediately under $match_tables that the # match tables for this property should be placed in. - main::set_access('match_subdir', \%match_subdir, 'r'); + main::set_access('match_subdir', \%match_subdir, 'r', 's'); my %has_dependency; # A boolean that gives whether some table somewhere is defined as the @@ -10049,10 +10058,13 @@ sub finish_property_setup($file) { # file directly (it was documented in 5.12 and 5.14 as being thusly # usable), keep it from being adjusted. (range_size_1 is # used to force the traditional format.) - if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { - $nfkc_cf->set_to_output_map($EXTERNAL_MAP); - $nfkc_cf->set_range_size_1(1); + foreach my $property (qw(NFKC_Casefold NFKC_Simple_Casefold)) { + if (defined (my $cf = property_ref($property))) { + $cf->set_to_output_map($EXTERNAL_MAP); + $cf->set_range_size_1(1); + } } + if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { $bmg->set_to_output_map($EXTERNAL_MAP); $bmg->set_range_size_1(1); @@ -10060,6 +10072,14 @@ sub finish_property_setup($file) { property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); + # These two properties have no short names and the file names for them + # clash in DOS 8.3. Work around this by creating shorter file names that + # work + my $IDCMStart = property_ref("ID_Compat_Math_Start"); + $IDCMStart->set_match_subdir("IDCMStart") if defined $IDCMStart; + my $IDCMCont= property_ref("ID_Compat_Math_Continue"); + $IDCMCont->set_match_subdir("IDCMContinue") if defined $IDCMCont; + # The rest of this sub is for properties that need the Multi_Default class # to create objects for defaults. As of v15.0, this is no longer needed. 
@@ -13736,6 +13756,10 @@ END next if $range->start == 0x1D7CE; # This whole range was added in 3.1 next if $range->end == 0x19DA && $v_version eq v5.2.0; next if $range->end - $range->start < 9 && $v_version le 4.0.0; + + # 2 sequential series of 10 each were added in 16.0 + next if $range->start == 0x116D0 && $range->end == 0x116E3; + Carp::my_carp("Range $range unexpectedly doesn't contain 10" . " decimal digits. Code in regcomp.c assumes it does," . " and will have to be fixed. Proceeding anyway."); @@ -15179,11 +15203,11 @@ END # Perl tailors the WordBreak property so that \b{wb} doesn't split # adjacent spaces into separate words. Unicode 11.0 moved in that - # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO - # BREAK SPACE as breaking, so we retained the original Perl customization. - # To do this, in the Perl copy of WB, simply replace the mappings of - # horizontal space characters that otherwise would map to the default or - # the 11.0 'WSegSpace' to instead map to our tailoring. + # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) + # NO-BREAK SPACE as breaking, so we retained the original Perl + # customization. To do this, in the Perl copy of WB, simply replace the + # mappings of horizontal space characters that otherwise would map to the + # default or the 11.0 'WSegSpace' to instead map to our tailoring. my $perl_wb = property_ref('_Perl_WB'); my $default = $perl_wb->default_map; for my $range ($Blank->ranges) { @@ -15225,6 +15249,46 @@ END } } + # In Unicode 15.1, the InCB property was added, which causes us to have to + # split GCB into subclasses that match various subclasses of InCB + my $perl_gcb = property_ref('_Perl_GCB'); + my $incb = property_ref('InCB'); + if (defined $perl_gcb && defined $incb) { + + # For each class in GCB ... + foreach my $gcb_table ($perl_gcb->tables) { + my $gcb_name = $gcb_table->name; + + # ... we see if it has any code points that are in the three + # classes of interest in InCB.
+ foreach my $incb_table ($incb->table('Consonant'), + $incb->table('Extend'), + $incb->table('Linker')) + { + my $intersection = $gcb_table & $incb_table; + + # If the intersection is empty, then nothing need be done. + next unless $intersection->ranges; + + # Likewise, if the intersection is the whole GCB table, there + # is nothing to split off. + next if $gcb_table->matches_identically_to($intersection); + + # Otherwise, construct a new table consisting of the + # intersection, removing its entries from the existing GCB + # table. The name of the new table is the combination of the + # GCB and InCB table names. + my $incb_name = $incb_table->name; + my $combined_name = "${gcb_name}_$incb_name"; + + foreach my $range ($intersection->ranges) { + $perl_gcb->replace_map($range->start, $range->end, + $combined_name); + } + } + } + } + # Create a version of the LineBreak property with the mappings that are # omitted in the default algorithm remapped to what # http://www.unicode.org/reports/tr14 says they should be. @@ -15324,8 +15388,27 @@ END } } } + elsif ($v_version ge 15.1.0 && $value eq standardize('Quotation')) { + + # Unicode 15.1 splits LB=QU into initial quotes, final quotes, and + # regular quotes + for my $i ($range->start .. $range->end) { + my $gc_val = $gc->value_of($i); + if ($gc_val eq 'Pi') { + $perl_lb->replace_map($i, $i, "Initial_Quote"); + } + elsif ($gc_val eq 'Pf') { + $perl_lb->replace_map($i, $i, "Final_Quote"); + } + } + } } + # This is an Alphabetic, but it doesn't need to be split off, because no + # current rule involving Alphabetics requires not including this.
+ $perl_lb->replace_map(0x25CC, 0x25CC, "Dotted_Circle") + if $v_version ge 15.1.0; + # This property is a modification of the scx property my $perl_scx = Property->new('_Perl_SCX', Fate => $INTERNAL_ONLY, @@ -19778,13 +19861,21 @@ my @input_file_objects = ( ), Input_file->new('IdStatus.txt', v13.0.0, Pre_Handler => \&setup_IdStatus, + Has_Missings_Defaults => $IGNORED, Property => 'Identifier_Status', + + # Part of UTS 39, so must be downloaded separately from + # unicode.org UCD => 0, ), Input_file->new('IdType.txt', v13.0.0, Pre_Handler => \&setup_IdType, + Has_Missings_Defaults => $IGNORED, Each_Line_Handler => \&filter_IdType_line, Property => 'Identifier_Type', + + # Part of UTS 39, so must be downloaded separately from + # unicode.org UCD => 0, ), Input_file->new('confusables.txt', v15.0.0, @@ -19799,6 +19890,18 @@ my @input_file_objects = ( Skip => $Unused_Skip, UCD => 0, ), + Input_file->new('Unikemet.txt', v16.0.0, + # For Egyptian Hieroglyphs; is in an alien format to the + # other files Unicode furnishes. + Skip => $Unused_Skip, + UCD => 0, + ), + Input_file->new('DoNotEmit.txt', v16.0.0, + # Advice about characters that are unwise to create; not + # any properties, though we could create some. + Skip => $Unused_Skip, + UCD => 0, + ), ); # End of all the preliminaries. 
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index fe9034ecda6c..e2b540304542 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables +# 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 0b7b686598ea..80a19bbb870d 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 38e727c60c90..0ae73255508d 100644 --- a/regexp_constants.h +++ 
b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index e013651e107a..a2ed6d97d10c 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 52af47e14e5de40ccba986bd59c6d8c863de93e3d231f7ecca62feb9c2b2990a lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl