-
Notifications
You must be signed in to change notification settings - Fork 568
/
Copy pathmktables
20633 lines (17844 loc) · 860 KB
/
mktables
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl -w
# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
# Any files created or read by this program should be listed in 'mktables.lst'
# Use -makelist to regenerate it.
# There was an attempt when this was first rewritten to make it 5.8
# compatible, but that has now been abandoned, and newer constructs are used
# as convenient.
# NOTE: this script can run quite slowly in older/slower systems.
# It can also consume a lot of memory (128 MB or more), you may need
# to raise your process resource limits (e.g. in bash, "ulimit -a"
# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
# Time, as returned by time(), at which this script started.  The variable is
# declared outside the BEGIN block so that it stays visible to the rest of the
# file, and assigned inside it so that the value is recorded while the file is
# still being compiled.
my $start_time;
BEGIN { # Get the time the script started running; do it at compilation to
# get it as close as possible
$start_time= time;
}
require 5.010_001;
use strict;
use warnings;
use builtin qw(refaddr);
use Carp;
use Config;
use File::Find;
use File::Path;
use File::Spec;
use Text::Tabs;
use re "/aa";
use feature 'state';
use feature 'signatures';
no warnings qw( experimental::builtin );
# Master switch for the debugging and tracing code in this file.  The empty
# prototype together with the constant body makes this a constant function,
# so tests such as 'if DEBUG' can be resolved when the code is compiled.
sub DEBUG () { 0 } # Set to 0 for production; 1 for development
# When debugging, flush the currently selected output handle after every
# write.
$| = 1 if DEBUG;
# True if the ccflags recorded in %Config for the perl running this script
# include -DDEBUGGING.
my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
# Returns true when "A" does not have the ordinal 65, i.e. when the platform's
# native character set is not ASCII-based (EBCDIC, for example).
sub NON_ASCII_PLATFORM { ord("A") != 65 }
# When a new version of Unicode is published, unfortunately the algorithms for
# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
# manually. The changes may or may not be backward compatible with older
# releases. The code is in regen/mk_invlist.pl and regexec.c. Make the
# changes, then come back here and set the variable below to what version the
# code is expecting. If a newer version of Unicode is being compiled than
# expected, a warning will be generated. If an older version is being
# compiled, any bounds tests that fail in the generated test file (-maketest
# option) will be marked as TODO.
# The Unicode release, written as a v-string, that the hand-maintained bounds
# code is currently expecting.
my $version_of_mk_invlist_bounds = v15.0.0;
##########################################################################
#
# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
# from the Unicode database files (lib/unicore/.../*.txt). It also generates
# a pod file and .t files, depending on option parameters.
#
# The structure of this file is:
# First these introductory comments; then
# code needed for everywhere, such as debugging stuff; then
# code to handle input parameters; then
# data structures likely to be of external interest (some of which depend on
# the input parameters, so follow them); then
# more data structures and subroutine and package (class) definitions; then
# the small actual loop to process the input files and finish up; then
# a __DATA__ section, for the .t tests
#
# This program works on all releases of Unicode so far. The outputs have been
# scrutinized most intently for release 5.1. The others have been checked for
# somewhat more than just sanity. It can handle all non-provisional Unicode
# character properties in those releases.
#
# This program is mostly about Unicode character (or code point) properties.
# A property describes some attribute or quality of a code point, like if it
# is lowercase or not, its name, what version of Unicode it was first defined
# in, or what its uppercase equivalent is. Unicode deals with these disparate
# possibilities by making all properties into mappings from each code point
# into some corresponding value. In the case of it being lowercase or not,
# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
# property maps each Unicode code point to a single value, called a "property
# value". (Some more recently defined properties, map a code point to a set
# of values.)
#
# When using a property in a regular expression, what is desired isn't the
# mapping of the code point to its property's value, but the reverse (or the
# mathematical "inverse relation"): starting with the property value, "Does a
# code point map to it?" These are written in a "compound" form:
# \p{property=value}, e.g., \p{category=punctuation}. This program generates
# files containing the lists of code points that map to each such regular
# expression property value, one file per list
#
# There is also a single form shortcut that Perl adds for many of the commonly
# used properties. This happens for all binary properties, plus script,
# general_category, and block properties.
#
# Thus the outputs of this program are files. There are map files, mostly in
# the 'To' directory; and there are list files for use in regular expression
# matching, all in subdirectories of the 'lib' directory, with each
# subdirectory being named for the property that the lists in it are for.
# Bookkeeping, test, and documentation files are also generated.
# Names of the two kinds of output directories mentioned in the comments
# above: one for the match (list) files, and one for the map files.
my $matches_directory = 'lib'; # Where match (\p{}) files go.
my $map_directory = 'To'; # Where map files go.
# DATA STRUCTURES
#
# The major data structures of this program are Property, of course, but also
# Table. There are two kinds of tables, very similar to each other.
# "Match_Table" is the data structure giving the list of code points that have
# a particular property value, mentioned above. There is also a "Map_Table"
# data structure which gives the property's mapping from code point to value.
# There are two structures because the match tables need to be combined in
# various ways, such as constructing unions, intersections, complements, etc.,
# and the map ones don't. And there would be problems, perhaps subtle, if
# a map table were inadvertently operated on in some of those ways.
# The use of separate classes with operations defined on one but not the other
# prevents accidentally confusing the two.
#
# At the heart of each table's data structure is a "Range_List", which is just
# an ordered list of "Ranges", plus ancillary information, and methods to
# operate on them. A Range is a compact way to store property information.
# Each range has a starting code point, an ending code point, and a value that
# is meant to apply to all the code points between the two end points,
# inclusive. For a map table, this value is the property value for those
# code points. Two such ranges could be written like this:
# 0x41 .. 0x5A, 'Upper',
# 0x61 .. 0x7A, 'Lower'
#
# Each range also has a type used as a convenience to classify the values.
# Most ranges in this program will be Type 0, or normal, but there are some
# ranges that have a non-zero type. These are used only in map tables, and
# are for mappings that don't fit into the normal scheme of things. Mappings
# that require a hash entry to communicate with utf8.c are one example;
# another example is mappings for charnames.pm to use which indicate a name
# that is algorithmically determinable from its code point (and the reverse).
# These are used to significantly compact these tables, instead of listing
# each one of the tens of thousands individually.
#
# In a match table, the value of a range is irrelevant (and hence the type as
# well, which will always be 0), and arbitrarily set to the empty string.
# Using the example above, there would be two match tables for those two
# entries, one named Upper would contain the 0x41..0x5A range, and the other
# named Lower would contain 0x61..0x7A.
#
# Actually, there are two types of range lists, "Range_Map" is the one
# associated with map tables, and "Range_List" with match tables.
# Again, this is so that methods can be defined on one and not the others so
# as to prevent operating on them in incorrect ways.
#
# Eventually, most tables are written out to files to be read by Unicode::UCD.
# All tables could in theory be written, but some are suppressed because there
# is no current practical use for them. It is easy to change which get
# written by changing various lists that are near the top of the actual code
# in this file. The table data structures contain enough ancillary
# information to allow them to be treated as separate entities for writing,
# such as the path to each one's file. There is a heading in each map table
# that gives the format of its entries, and what the map is for all the code
# points missing from it. (This allows tables to be more compact.)
#
# The Property data structure contains one or more tables. All properties
# contain a map table (except the $perl property which is a
# pseudo-property containing only match tables), and any properties that
# are usable in regular expression matches also contain various matching
# tables, one for each value the property can have. A binary property can
# have two values, True and False (or Y and N, which are preferred by Unicode
# terminology). Thus each of these properties will have a map table that
# takes every code point and maps it to Y or N (but having ranges cuts the
# number of entries in that table way down), and two match tables, one
# which has a list of all the code points that map to Y, and one for all the
# code points that map to N. (For each binary property, a third table is also
# generated for the pseudo Perl property. It contains the identical code
# points as the Y table, but can be written in regular expressions, not in the
# compound form, but in a "single" form like \p{IsUppercase}.) Many
# properties are binary, but some properties have several possible values,
# some have many, and properties like Name have a different value for every
# named code point. Those will not, unless the controlling lists are changed,
# have their match tables written out. But all the ones which can be used in
# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
# a property would have either its map table or its match tables written but
# not both. Again, what gets written is controlled by lists which can easily
# be changed. Starting in 5.14, advantage was taken of this, and all the map
# tables needed to reconstruct the Unicode db are now written out, while
# suppressing the Unicode .txt files that contain the data. Our tables are
# much more compact than the .txt files, so a significant space savings was
# achieved. Also, tables are not written out that are trivially derivable
# from tables that do get written. So, there typically is no file containing
# the code points not matched by a binary property (the table for \P{} versus
# lowercase \p{}), since you just need to invert the True table to get the
# False table.
# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
# how many match tables there are and the content of the maps. This 'Type' is
# different than a range 'Type', so don't get confused by the two concepts
# having the same name.
#
# For information about the Unicode properties, see Unicode's UAX44 document:
# NOTE(review): this is a plain-http URL.  Before switching it to https,
# check where the string is used; it may be copied into generated files.
my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
# As stated earlier, this program will work on any release of Unicode so far.
# Most obvious problems in earlier data have NOT been corrected except when
# necessary to make Perl or this program work reasonably, and to keep out
# potential security issues. For example, no folding information was given in
# early releases, so this program substitutes lower case instead, just so that
# a regular expression with the /i option will do something that actually
# gives the right results in many cases. There are also a couple other
# corrections for version 1.1.5, commented at the point they are made. An
# example of a correction that wasn't made (but could be) is this statement
# from DerivedAge.txt: "The supplementary private use code points and the
# non-character code points were assigned in version 2.0, but not specifically
# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
# further down in these introductory comments.
#
# This program works on all non-provisional properties as of the current
# Unicode release, though the files for some are suppressed for various
# reasons. You can change which are output by changing lists in this program.
#
# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
# loose matching rules (from Unicode TR18):
#
# The recommended names for UCD properties and property values are in
# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
# [PropValue]. There are both abbreviated names and longer, more
# descriptive names. It is strongly recommended that both names be
# recognized, and that loose matching of property names be used,
# whereby the case distinctions, whitespace, hyphens, and underbar
# are ignored.
#
# The program still allows Fuzzy to override its determination of if loose
# matching should be used, but it isn't currently used, as it is no longer
# needed; the calculations it makes are good enough.
#
# SUMMARY OF HOW IT WORKS:
#
# Process arguments
#
# A list is constructed containing each input file that is to be processed
#
# Each file on the list is processed in a loop, using the associated handler
# code for each:
# The PropertyAliases.txt and PropValueAliases.txt files are processed
# first. These files name the properties and property values.
# Objects are created of all the property and property value names
# that the rest of the input should expect, including all synonyms.
# The other input files give mappings from properties to property
# values. That is, they list code points and say what the mapping
# is under the given property. Some files give the mappings for
# just one property; and some for many. This program goes through
# each file and populates the properties and their map tables from
# them. Some properties are listed in more than one file, and
# Unicode has set up a precedence as to which has priority if there
# is a conflict. Thus the order of processing matters, and this
# program handles the conflict possibility by processing the
# overriding input files last, so that if necessary they replace
# earlier values.
# After this is all done, the program creates the property mappings not
# furnished by Unicode, but derivable from what it does give.
# The tables of code points that match each property value in each
# property that is accessible by regular expressions are created.
# The Perl-defined properties are created and populated. Many of these
# require data determined from the earlier steps
# Any Perl-defined synonyms are created, and name clashes between Perl
# and Unicode are reconciled and warned about.
# All the properties are written to files
# Any other files are written, and final warnings issued.
#
# For clarity, a number of operators have been overloaded to work on tables:
# ~ means invert (take all characters not in the set). The more
# conventional '!' is not used because of the possibility of confusing
# it with the actual boolean operation.
# + means union
# - means subtraction
# & means intersection
# The precedence of these is the order listed. Parentheses should be
# copiously used. These are not a general scheme. The operations aren't
# defined for a number of things, deliberately, to avoid getting into trouble.
# Operations are done on references and affect the underlying structures, so
# that the copy constructors for them have been overloaded to not return a new
# clone, but the input object itself.
#
# The bool operator is deliberately not overloaded to avoid confusion with
# "should it mean if the object merely exists, or also is non-empty?".
#
# WHY CERTAIN DESIGN DECISIONS WERE MADE
#
# This program needs to be able to run under miniperl. Therefore, it uses a
# minimum of other modules, and hence implements some things itself that could
# be gotten from CPAN
#
# This program uses inputs published by the Unicode Consortium. These can
# change incompatibly between releases without the Perl maintainers realizing
# it. Therefore this program is now designed to try to flag these. It looks
# at the directories where the inputs are, and flags any unrecognized files.
# It keeps track of all the properties in the files it handles, and flags any
# that it doesn't know how to handle. It also flags any input lines that
# don't match the expected syntax, among other checks.
#
# It is also designed so if a new input file matches one of the known
# templates, one hopefully just needs to add it to a list to have it
# processed.
#
# As mentioned earlier, some properties are given in more than one file. In
# particular, the files in the extracted directory are supposedly just
# reformattings of the others. But they contain information not easily
# derivable from the other files, including results for Unihan (which isn't
# usually available to this program) and for unassigned code points. They
# also have historically had errors or been incomplete. In an attempt to
# create the best possible data, this program thus processes them first to
# glean information missing from the other files; then processes those other
# files to override any errors in the extracted ones. Much of the design was
# driven by this need to store things and then possibly override them.
#
# It tries to keep fatal errors to a minimum, to generate something usable for
# testing purposes. It always looks for files that could be inputs, and will
# warn about any that it doesn't know how to handle (the -q option suppresses
# the warning).
#
# Why is there more than one type of range?
# This simplified things. There are some very specialized code points that
# have to be handled specially for output, such as Hangul syllable names.
# By creating a range type (done late in the development process), it
# allowed this to be stored with the range, and overridden by other input.
# Originally these were stored in another data structure, and it became a
# mess trying to decide if a second file that was for the same property was
# overriding the earlier one or not.
#
# Why are there two kinds of tables, match and map?
# (And there is a base class shared by the two as well.) As stated above,
# they actually are for different things. Development proceeded much more
# smoothly when I (khw) realized the distinction. Map tables are used to
# give the property value for every code point (actually every code point
# that doesn't map to a default value). Match tables are used for regular
# expression matches, and are essentially the inverse mapping. Separating
# the two allows more specialized methods, and error checks so that one
# can't just take the intersection of two map tables, for example, as that
# is nonsensical.
#
# What about 'fate' and 'status'. The concept of a table's fate was created
# late when it became clear that something more was needed. The difference
# between this and 'status' is unclear, and could be improved if someone
# wanted to spend the effort.
#
# DEBUGGING
#
# This program is written so it will run under miniperl. Occasionally changes
# will cause an error where the backtrace doesn't work well under miniperl.
# To diagnose the problem, you can instead run it under regular perl, if you
# have one compiled.
#
# There is a good trace facility. To enable it, first sub DEBUG must be set
# to return true. Then a line like
#
# local $to_trace = 1 if main::DEBUG;
#
# can be added to enable tracing in its lexical scope (plus dynamic) or until
# you insert another line:
#
# local $to_trace = 0 if main::DEBUG;
#
# To actually trace, use a line like "trace $a, @b, %c, ...;"
#
# Some of the more complex subroutines already have trace statements in them.
# Permanent trace statements should be like:
#
# trace ... if main::DEBUG && $to_trace;
#
# main::stack_trace() will display what its name implies
#
# If there is just one or a few files that you're debugging, you can easily
# cause most everything else to be skipped. Change the line
#
# my $debug_skip = 0;
#
# to 1, and every file whose object is in @input_file_objects and doesn't have
# a 'non_skip => 1,' in its constructor will be skipped. However, skipping
# Jamo.txt or UnicodeData.txt will likely cause fatal errors.
#
# To compare the output tables, it may be useful to specify the -annotate
# flag. (As of this writing, this can't be done on a clean workspace, due to
# requirements in Text::Tabs used in this option; so first run mktables
# without this option.) This option adds comment lines to each table, one for
# each non-algorithmically named character giving, currently its code point,
# name, and graphic representation if printable (and you have a font that
# knows about it). This makes it easier to see what the particular code
# points are in each output table. Non-named code points are annotated with a
# description of their status, and contiguous ones with the same description
# will be output as a range rather than individually. Algorithmically named
# characters are also output as ranges, except when there are just a few
# contiguous ones.
#
# FUTURE ISSUES
#
# The program would break if Unicode were to change its names so that
# differences in interior white space, underscores, or dashes were significant
# within property and property value names.
#
# It might be easier to use the xml versions of the UCD if this program ever
# would need heavy revision, and the ability to handle old versions was not
# required. Also, it turns out to be risky to rely on this, as in early 2024,
# Unicode decided to drop the xml version. It was news to many that this was
# not considered to be an official product that needs to be maintained going
# forward. Someone acceptable to the Unicode management volunteered to take
# over from the retiring volunteer, and so it continues, but beware.
#
# There is the potential for name collisions, in that Perl has chosen names
# that Unicode could decide it also likes. There have been such collisions in
# the past, with mostly Perl deciding to adopt the Unicode definition of the
# name. However in the 5.2 Unicode beta testing, there were a number of such
# collisions, which were withdrawn before the final release, because of Perl's
# and others' protests. These all involved new properties which began with
# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
# Unicode document, so they are unlikely to be used by Unicode for another
# purpose. However, they might try something beginning with 'In', or use any
# of the other Perl-defined properties. This program will warn you of name
# collisions, and refuse to generate tables with them, but manual intervention
# will be required in this event. One scheme that could be implemented, if
# necessary, would be to have this program generate another file, or add a
# field to mktables.lst that gives the date of first definition of a property.
# Each new release of Unicode would use that file as a basis for the next
# iteration. And the Perl synonym addition code could sort based on the age
# of the property, so older properties get priority, and newer ones that clash
# would be refused; hence existing code would not be impacted, and some other
# synonym would have to be used for the new property. This is ugly, and
# manual intervention would certainly be easier to do in the short run; let's
# hope it never comes to this.
#
# A NOTE ON UNIHAN
#
# This program can generate tables from the Unihan database. But that DB
# isn't normally available, so it is marked as optional. Prior to version
# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database
# was split into 8 different files, all beginning with the letters 'Unihan'.
# If you plunk those files down into the directory mktables ($0) is in, this
# program will read them and automatically create tables for the properties
# from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
# plus any you add to the @cjk_properties array and the @cjk_property_values
# array, being sure to add necessary '# @missings' lines to the latter. For
# Unicode versions earlier than 5.2, most of the Unihan properties are not
# listed at all in PropertyAliases nor PropValueAliases. This program assumes
# for these early releases that you want the properties that are specified in
# the 5.2 release.
#
# You may need to adjust the entries to suit your purposes. setup_unihan(),
# and filter_unihan_line() are the functions where this is done. This program
# already does some adjusting to make the lines look more like the rest of the
# Unicode DB; You can see what that is in filter_unihan_line()
#
# There is a bug in the 3.2 data file in which some values for the
# kPrimaryNumeric property have commas and an unexpected comment. A filter
# could be added to correct these; or for a particular installation, the
# Unihan.txt file could be edited to fix them.
#
# HOW TO ADD A FILE TO BE PROCESSED
#
# A new file from Unicode needs to have an object constructed for it in
# @input_file_objects, probably at the end or at the end of the extracted
# ones. The program should warn you if its name will clash with others on
# restrictive file systems, like DOS. If so, figure out a better name, and
# add lines to the README.perl file giving that. If the file is a character
# property, it should be in the format that Unicode has implicitly
# standardized for such files for the more recently introduced ones.
# If so, the Input_file constructor for @input_file_objects can just be the
# file name and release it first appeared in. If not, then it should be
# possible to construct an each_line_handler() to massage the line into the
# standardized form.
#
# For non-character properties, more code will be needed. You can look at
# the existing entries for clues.
#
# UNICODE VERSIONS NOTES
#
# The Unicode UCD has had a number of errors in it over the versions. And
# these remain, by policy, in the standard for that version. Therefore it is
# risky to correct them, because code may be expecting the error. So this
# program doesn't generally make changes, unless the error breaks the Perl
# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
# for U+1105, which causes real problems for the algorithms for Jamo
# calculations, so it is changed here.
#
# But it isn't so clear cut as to what to do about concepts that are
# introduced in a later release; should they extend back to earlier releases
# where the concept just didn't exist? It was easier to do this than to not,
# so that's what was done. For example, the default value for code points not
# in the files for various properties was probably undefined until changed by
# some version. No_Block for blocks is such an example. This program will
# assign No_Block even in Unicode versions that didn't have it. This has the
# benefit that code being written doesn't have to special case earlier
# versions; and the detriment that it doesn't match the Standard precisely for
# the affected versions.
#
# Here are some observations about some of the issues in early versions:
#
# Prior to version 3.0, there were 3-character decompositions. These are not
# handled by Unicode::Normalize, nor will it compile when presented a version
# that has them. However, you can trivially get it to compile by simply
# ignoring those decompositions, by changing the croak to a carp. At the time
# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
# dist/Unicode-Normalize/mkheader) reads
#
# croak("Weird Canonical Decomposition of U+$h");
#
# Simply comment it out. It will compile, but will not know about any three
# character decompositions.
# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
# that the reason is that the CJK block starting at 4E00 was removed from
# PropList, and was not put back in until 3.1.0. The Perl extension (the
# single property name \p{alpha}) has the correct values. But the compound
# form is simply not generated until 3.1, as it can be argued that prior to
# this release, this was not an official property. The comments for
# filter_old_style_proplist() give more details.
#
# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
# reason is that 3.2 introduced U+205F=medium math space, which was not
# classed as white space, but Perl figured out that it should have been. 4.0
# reclassified it correctly.
#
# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
# became 202, and ATBL was left with no code points, as all the ones that
# mapped to 202 stayed mapped to 202. Thus if your program used the numeric
# name for the class, it would not have been affected, but if it used the
# mnemonic, it would have been.
#
# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code
# points which eventually came to have this script property value, instead
# mapped to "Unknown". But in the next release all these code points were
# moved to \p{sc=common} instead.
# The tests furnished by Unicode for testing WordBreak and SentenceBreak
# generate errors in 5.0 and earlier.
#
# The default for missing code points for BidiClass is complicated. Starting
# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
# tries to do the best it can for earlier releases. It is done in
# process_PropertyAliases()
#
# In version 2.1.2, the entry in UnicodeData.txt:
# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
# should instead be
# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
# Without this change, there are casing problems for this character.
#
# Search for $string_compare_versions to see how to compare changes to
# properties between Unicode versions
#
##############################################################################
my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
# and errors
# NOTE(review): presumably the maximum width, in columns, of the lines this
# program generates; confirm against its uses.
my $MAX_LINE_WIDTH = 78;
# Debugging aid to skip most files so as to not be distracted by them when
# concentrating on the ones being debugged. Add
# non_skip => 1,
# to the constructor for those files you want processed when you set this.
# Files with a first version number of 0 are special: they are always
# processed regardless of the state of this flag. Generally, Jamo.txt and
# UnicodeData.txt must not be skipped if you want this program to not die
# before normal completion.
my $debug_skip = 0;
# Set to 1 to write out the Unicode-deprecated tables.  Normally these are
# suppressed.
my $write_Unicode_deprecated_tables = 0;
# Set to 1 to enable tracing.  This is a package variable ('our') rather than
# a lexical one so that it can be overridden temporarily with 'local', as in
# 'local $to_trace = 1 if main::DEBUG;'.
our $to_trace = 0;
{ # Closure for trace: debugging aid

    # True to prefix each message with the name of the subroutine that
    # called trace()
    my $print_caller = 1;

    # trace(@items)
    #
    # Debugging aid that prints its arguments to STDERR on a single line,
    # preceded by the line number of the call and, if $print_caller is set,
    # the name of the calling subroutine.  It does nothing unless the global
    # $to_trace is true.
    #
    # Each argument is formatted like this:
    #   * ARRAY and HASH references are expanded by simple_dumper();
    #   * any other reference is stringified;
    #   * undef is shown as $UNDEF, and the empty string as "";
    #   * one trailing newline, if any, is removed.
    # A blank is placed between adjacent items unless one is already there.
    #
    # Returns nothing.
    sub trace {
        return unless $to_trace;        # Do nothing if global flag not set

        # Work on a copy, as the formatting loop below modifies the elements
        my @input = @_;

        local $DB::trace = 0;
        $DB::trace = 0;     # Quiet 'used only once' message

        # Loop looking up the stack to get the first non-trace caller.  On
        # exit, $line_number is the line on which the outermost trace frame
        # was called, and $caller_name is the package-less name of the
        # subroutine containing that call, or "" if the stack ran out.
        my $line_number;
        my $caller_line;
        my $caller_name;
        my $i = 0;
        do {
            $line_number = $caller_line;
            (undef, undef, $caller_line, my $caller) = caller $i++;

            # caller() returns nothing once we are off the top of the stack
            $caller_name = $caller // "";

            # Get rid of pkg.  The greedy match removes everything through
            # the final '::', so no package prefix can remain afterwards.
            $caller_name =~ s/.*:://;
        } until ($caller_name ne 'trace');

        # If the stack was empty, we were called from the top level
        $caller_name = 'main' if $caller_name eq "";

        my $output = "";
        foreach my $string (@input) {
            if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
                $output .= simple_dumper($string);
                next;
            }

            $string = "$string" if ref $string;
            $string = $UNDEF unless defined $string;
            chomp $string;
            $string = '""' if $string eq "";

            # $string cannot be empty at this point, so only the characters
            # on either side of the join decide whether a blank is needed.
            $output .= " " if $output ne ""
                           && substr($output, -1, 1) ne " "
                           && substr($string, 0, 1) ne " ";
            $output .= $string;
        }

        print STDERR sprintf "%4d: ", $line_number if defined $line_number;
        print STDERR "$caller_name: " if $print_caller;
        print STDERR $output, "\n";
        return;
    }
}
sub stack_trace() {
    # Emits, through trace(), one "called from ..." line for each frame of
    # the call stack above the caller of this function.

    # Tracing is turned on for the duration of this call when the DEBUG
    # constant is true
    local $to_trace = 1 if main::DEBUG;

    # Line from which this function itself was called; it gets paired with
    # the first subroutine name found below
    my $reported_line = (caller(0))[2];

    # Walk up the stack until it runs out.  Each frame supplies the name to
    # print now and the line number to print with the following frame's name.
    for (my $depth = 1; ; $depth++) {
        my (undef, undef, $frame_line, $frame_sub) = caller($depth);
        last unless defined $frame_sub;

        trace("called from $frame_sub() at line $reported_line");
        $reported_line = $frame_line;
    }
}
# This is for a rarely used development feature that allows you to compare two
# versions of the Unicode standard without having to deal with changes caused
# by the code points introduced in the later version. You probably also want
# to use the -annotate option when using this. Run this program on a unicore
# containing the starting release you want to compare. Save that output
# structure. Then, switching to a unicore with the ending release, change the
# "" in the $string_compare_versions definition just below to a string
# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
# to the starting release. This program will then compile, but throw away all
# code points introduced after the starting release. Finally use a diff tool
# to compare the two directory structures. They include only the code points
# common to both releases, and you can see the changes caused just by the
# underlying release semantic changes. For versions earlier than 3.2, you
# must copy a version of DAge.txt into the directory.
# The "" below is the string to replace with the dotted number of the starting
# release.  While it stays "", or whenever DEBUG is false, the && chain
# short-circuits and both of these variables are false.
my $string_compare_versions = DEBUG && "";
# When a release has been given, this is that release with each dotted
# component packed into one unsigned char ("C*")
my $compare_versions = DEBUG
&& $string_compare_versions
&& pack "C*", split /\./, $string_compare_versions;
sub uniques {
    # Returns the input list with every value after its first occurrence
    # dropped; the order of first occurrences is kept.  The technique is
    # from "Perl Best Practices: Encapsulated Cleverness", p. 455 in the
    # first edition.

    # Turning off overloading arguably breaks encapsulation, if the goal is
    # to permit multiple distinct objects to stringify to the same value and
    # be interchangeable.  In this program, though, no two objects stringify
    # identically, and every list passed here consists of objects or of
    # strings, so correctness is unaffected, and it gives a couple of percent
    # speedup.
    no overloading;

    my %already_seen;
    my @distinct;
    foreach my $candidate (@_) {
        next if $already_seen{$candidate}++;
        push @distinct, $candidate;
    }

    return @distinct;
}
$0 = File::Spec->canonpath($0);

# Defaults for what the argument processing further down can change.  A '?'
# starts a comment that poses the yes/no question the variable answers.

my $make_test_script = 0;       # ? Is a test script to be output
my $make_norm_test_script = 0;  # ? Is a normalization test script to be
                                # output
my $write_unchanged_files = 0;  # ? Are the output files to be updated even
                                # when they don't appear to have changed
my $use_directory = q{};        # ? Is there somewhere to chdir to
my $pod_directory;              # input directory to store the pod file.
my $pod_file = 'perluniprops';
my $t_path;                     # Path to the .t test file

# File in which the names of the input and output files are stored.  It
# speeds up the build: the main body of the program isn't executed when
# nothing on the list has changed since the previous build
my $file_list = 'mktables.lst';

# ? Is $file_list to be written.  This is set so that the list is always
# made, so that a release manager preparing a release won't have to do
# special things
my $make_list = 1;

my $glob_list = 0;              # ? Are unknown .txt files to be included in
                                # the input

# ? Is the number of code points in each range to be included in the output
my $output_range_counts = $debugging_build;

my $annotate = 0;               # ? Are character names to be in the output

# Verbosity levels; 0 is quiet
my $NORMAL_VERBOSITY = 1;
my $PROGRESS = 2;
my $VERBOSE = 3;

my $verbosity = $NORMAL_VERBOSITY;

# This gets stored in mktables.lst, so that calling this program with
# different options causes regeneration even if the files otherwise look like
# they're up-to-date.
my $command_line_arguments = join(q{ }, @ARGV);
# Process arguments
#
# Each pass takes one switch, plus any values that go with it, off the front
# of @ARGV.  A switch that is unknown, or is one of -C, -L, -P or -T without
# its value, ends up in the final 'else', which croaks with the usage
# message.
while (@ARGV) {
    my $arg = shift @ARGV;
    if ($arg eq '-v') {
        $verbosity = $VERBOSE;
    }
    elsif ($arg eq '-p') {
        $verbosity = $PROGRESS;
        $| = 1;     # Flush buffers as we go.
    }
    elsif ($arg eq '-q') {
        $verbosity = 0;
    }
    elsif ($arg eq '-w') {
        # update the files even if they haven't changed
        $write_unchanged_files = 1;
    }
    elsif ($arg eq '-check') {
        # Takes two values; the run is skipped, successfully, unless they
        # are the same string
        my $this = shift @ARGV;
        my $ok = shift @ARGV;
        if ($this ne $ok) {
            print "Skipping as check params are not the same.\n";
            exit(0);
        }
    }
    elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
        -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
    }
    elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
    {
        $make_test_script = 1;
    }
    elsif ($arg eq '-makenormtest')
    {
        $make_norm_test_script = 1;
    }
    elsif ($arg eq '-makelist') {
        $make_list = 1;
    }
    elsif ($arg eq '-C' && defined ($use_directory = shift)) {
        -d $use_directory or croak "Unknown directory '$use_directory'";
    }
    elsif ($arg eq '-L' && @ARGV) {
        # Existence not tested until have chdir'd.  Insisting on a following
        # argument means that a -L given last gets the usage message, as a
        # dangling -C, -P or -T does, instead of quietly leaving $file_list
        # undefined.
        $file_list = shift @ARGV;
    }
    elsif ($arg eq '-globlist') {
        $glob_list = 1;
    }
    elsif ($arg eq '-c') {
        $output_range_counts = ! $output_range_counts;
    }
    elsif ($arg eq '-annotate') {
        # Annotating also forces the range counts and debugging build on
        $annotate = 1;
        $debugging_build = 1;
        $output_range_counts = 1;
    }
    else {
        my $with_c = 'with';
        $with_c .= 'out' if $output_range_counts;   # Complements the state
        croak <<END;
usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
[ -T test_file_path ] [-globlist] [-makelist] [-maketest]
[-check A B ]
-c : Output comments $with_c number of code points in ranges
-q : Quiet Mode: Only output serious warnings.
-p : Set verbosity level to normal plus show progress.
-v : Set Verbosity level high: Show progress and non-serious
warnings
-w : Write files regardless
-C dir : Change to this directory before proceeding. All relative paths
except those specified by the -P and -T options will be done
with respect to this directory.
-P dir : Output $pod_file file to directory 'dir'.
-T path : Create a test script as 'path'; overrides -maketest
-L filelist : Use alternate 'filelist' instead of standard one
-globlist : Take as input all non-Test *.txt files in current and sub
directories
-maketest : Make test script 'TestProp.pl' in current (or -C directory),
overrides -T
-makelist : Rewrite the file list $file_list based on current setup
-annotate : Output an annotation for each character in the table files;
useful for debugging mktables, looking at diffs; but is slow
and memory intensive
-check A B : Executes $0 only if A and B are the same
END
    }
}
# Holds the time of the most-recently changed file; the build can be skipped
# if none have changed.  It starts out as the modification time of this
# program, which has to be looked up before the chdir!
my $most_recent = (stat $0)[9];

# Change directories now, because need to read 'version' early.
if ($use_directory) {

    # The pod directory and test file path are the two paths that have to
    # keep designating the same places across the chdir.  So any of them
    # that is relative is first made absolute ...
    foreach my $path_ref (\$pod_directory, \$t_path) {
        next unless $$path_ref;
        next if File::Spec->file_name_is_absolute($$path_ref);
        $$path_ref = File::Spec->rel2abs($$path_ref);
    }

    chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";

    # ... and then each absolute one is re-expressed relative to the
    # directory just entered.
    foreach my $path_ref (\$pod_directory, \$t_path) {
        next unless $$path_ref;
        next unless File::Spec->file_name_is_absolute($$path_ref);
        $$path_ref = File::Spec->abs2rel($$path_ref);
    }
}
# Get Unicode version into regular and v-string.  This is done now because
# various tables below get populated based on it.  These tables are populated
# here to be near the top of the file, and so easily seeable by those needing
# to modify things.
open my $VERSION, "<", "version"
                    or croak "$0: can't open required file 'version': $!\n";
my $string_version = <$VERSION>;
close $VERSION;

# An empty file yields undef here, and a first line without digits is just as
# useless.  Either would pack into a v-string that compares lower than every
# real release, which would silently send the version tests below the wrong
# way, so give up now instead.
croak "$0: required file 'version' doesn't contain a version number\n"
                unless defined $string_version && $string_version =~ /\d/;
chomp $string_version;

# One unsigned char per dotted component, so that the string comparison
# operators can compare it against v-string literals
my $v_version = pack "C*", split /\./, $string_version;        # v string

# What to call the version: when comparing releases, it is the release being
# compared against, annotated with the release whose rules are in use
my $unicode_version = ($compare_versions)
                      ? (  "$string_compare_versions (using "
                         . "$string_version rules)")
                      : $string_version;
# Each entry is the complete name of a property, or of a property with one
# of its values, that is known to match no code points in some versions of
# Unicode.  That may change in the future, so they should be matchable, and
# hence an empty file is generated for each.  Which ones are listed depends
# on the Unicode version being compiled.
my @tables_that_may_be_empty;

if ($v_version lt v6.3.0) {
    push @tables_that_may_be_empty, 'Joining_Type=Left_Joining';
}
if ($v_version le v4.0.1) {
    push @tables_that_may_be_empty, 'Script=Common';
}
if ($v_version lt v2.0.0) {
    push @tables_that_may_be_empty, 'Title';
}
if ($v_version ge v4.1.0) {
    push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana';
}
if ($v_version ge v6.0.0) {
    push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana';
}
if ($v_version ge v6.1.0) {
    push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend';
}
if ($v_version ge v6.2.0) {
    push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133';
}

# Perl doesn't yet handle these properties of Egyptian hieroglyphs, whose
# intended audience is only specialist Egyptologists
if ($v_version ge v16.0.0) {
    push @tables_that_may_be_empty, 'kEH_Cat',
                                    'kEH_Desc',
                                    'kEH_HG',
                                    'kEH_IFAO',
                                    'kEH_JSesh',
                                    'kEH_NoMirror',
                                    'kEH_NoMirror=Yes',
                                    'kEH_NoMirror=No',
                                    'kEH_NoRotate',
                                    'kEH_NoRotate=Yes';
}
# The lists that follow are kept as hashes: each key is an item on the list,
# and its value is the reason the item is there.  Having the reason at hand
# makes generating the documentation easier.

my %why_suppressed;  # No file generated for these.

# No file is generated for an extraneous property that is empty.  This is
# arguable.  A property generally becomes extraneous because a newer version
# of Unicode no longer uses it.  Were a file without code points generated,
# programs that used to work on that property would still execute without
# errors; it just would never match (or would always match, with \P{}).  That
# means their logic is now likely wrong.  I (khw) think it's better to find
# this out by getting an error message.  Just move them to the table above to
# change this behavior
my %why_suppress_if_empty_warn_if_not = (

    # The only property ever to have been officially removed from the
    # Standard.  The database never contained any code points for it.
    Special_Case_Condition => 'Obsolete',

    # Apparently never official, though some versions of the old-style
    # PropList.txt had code points for it
    Non_Break => 'Obsolete',
);
# This one would normally belong in the warn table just above, but the
# change happened a long time before this program was written, so a warning
# about it is moot.
push @tables_that_may_be_empty, 'Canonical_Combining_Class=Attached_Below_Left'
                                                    if $v_version gt v3.2.0;

# Obsoleted
push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=E_Base',
                                'Grapheme_Cluster_Break=E_Base_GAZ',
                                'Grapheme_Cluster_Break=E_Modifier',
                                'Grapheme_Cluster_Break=Glue_After_Zwj',
                                'Word_Break=E_Base',
                                'Word_Break=E_Base_GAZ',
                                'Word_Break=E_Modifier',
                                'Word_Break=Glue_After_Zwj'
                                                    if $v_version ge v11.0.0;
# The values that the to_output_map() method in the Map_Table package deals
# in.  Besides these, 0 means don't output.
my ($EXTERNAL_MAP, $INTERNAL_MAP, $OUTPUT_ADJUSTED) = (1, 2, 3);

# Overrides of the computed values that say whether to write the map tables
# of these properties.  By default, enum map tables do get written, so that
# the Unicode .txt files can be removed, while all the data needed to compute
# any property value for any code point stays available in a more compact
# form.
my %global_to_output_map = (

    # UCD.pm needs this, but its existence isn't to be publicized, so as to
    # not get stuck supporting it if things change.  Being a STRING property,
    # it would normally be listed in the pod, but INTERNAL_MAP suppresses
    # that.
    'Unicode_1_Name' => $INTERNAL_MAP,

    # Suppress, as easily computed from Age
    'Present_In' => 0,

    # Suppress, as Blocks.txt is retained, but needed for non-ASCII
    'Block' => ((NON_ASCII_PLATFORM) ? 1 : 0),

    # Suppress, as the mapping can be found instead from the
    # Perl_Decomposition_Mapping file
    'Decomposition_Type' => 0,
);
# Unicode defines several types of obsolete properties, and these have to be
# hand-edited for every new Unicode release.
#   %why_deprecated     Generates a deprecated warning message if used.
#   %why_stabilized     Documentation only
#   %why_obsolete       Documentation only
my (%why_deprecated, %why_stabilized, %why_obsolete);
{ # Closure
my $simple = 'Perl uses the more complete version';
my $unihan = 'Unihan properties are by default not enabled in the Perl core.';
my $other_properties = 'other properties';
my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
%why_deprecated = (
'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
'Jamo_Short_Name' => $contributory,
'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
'Other_Alphabetic' => $contributory,
'Other_Default_Ignorable_Code_Point' => $contributory,
'Other_Grapheme_Extend' => $contributory,
'Other_ID_Continue' => $contributory,
'Other_ID_Start' => $contributory,
'Other_Lowercase' => $contributory,
'Other_Math' => $contributory,
'Other_Uppercase' => $contributory,
'Expands_On_NFC' => $why_no_expand,
'Expands_On_NFD' => $why_no_expand,
'Expands_On_NFKC' => $why_no_expand,
'Expands_On_NFKD' => $why_no_expand,
);
%why_suppressed = (
# There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
# contains the same information, but without the algorithmically
# determinable Hangul syllables'. This file is not published, so its
# existence is not noted in the comment.
'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
# Don't suppress ISO_Comment, as otherwise special handling is needed
# to differentiate between it and gc=c, which can be written as 'isc',
# which is the same characters as ISO_Comment's short name.
'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",