3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # Needs 'no overloading' to run faster on miniperl. Code commented out at the
8 # subroutine objaddr can be used instead to work as far back (untested) as
19 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
21 ##########################################################################
23 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
24 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
25 # a pod file and a .t file
27 # The structure of this file is:
28 # First these introductory comments; then
29 # code needed for everywhere, such as debugging stuff; then
30 # code to handle input parameters; then
31 # data structures likely to be of external interest (some of which depend on
32 # the input parameters, so follows them; then
33 # more data structures and subroutine and package (class) definitions; then
34 # the small actual loop to process the input files and finish up; then
35 # a __DATA__ section, for the .t tests
37 # This program works on all releases of Unicode through at least 5.2. The
38 # outputs have been scrutinized most intently for release 5.1. The others
39 # have been checked for somewhat more than just sanity. It can handle all
40 # existing Unicode character properties in those releases.
42 # This program needs to be able to run under miniperl. Therefore, it uses a
43 # minimum of other modules, and hence implements some things itself that could
46 # This program uses inputs published by the Unicode Consortium. These can
47 # change incompatibly between releases without the Perl maintainers realizing
48 # it. Therefore this program is now designed to try to flag these. It looks
49 # at the directories where the inputs are, and flags any unrecognized files.
50 # It keeps track of all the properties in the files it handles, and flags any
51 # that it doesn't know how to handle. It also flags any input lines that
52 # don't match the expected syntax, among other checks.
53 # It is also designed so if a new input file matches one of the known
54 # templates, one hopefully just needs to add it to a list to have it
57 # It tries to keep fatal errors to a minimum, to generate something usable for
58 # testing purposes. It always looks for files that could be inputs, and will
59 # warn about any that it doesn't know how to handle (the -q option suppresses
62 # This program is mostly about Unicode character (or code point) properties.
63 # A property describes some attribute or quality of a code point, like if it
64 # is lowercase or not, its name, what version of Unicode it was first defined
65 # in, or what its uppercase equivalent is. Unicode deals with these disparate
66 # possibilities by making all properties into mappings from each code point
67 # into some corresponding value. In the case of it being lowercase or not,
68 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
69 # property maps each Unicode code point to a single value, called a "property
70 # value". (Hence each Unicode property is a true mathematical function with
71 # exactly one value per code point.)
73 # When using a property in a regular expression, what is desired isn't the
74 # mapping of the code point to its property's value, but the reverse (or the
75 # mathematical "inverse relation"): starting with the property value, "Does a
76 # code point map to it?" These are written in a "compound" form:
77 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
78 # files containing the lists of code points that map to each such regular
79 # expression property value, one file per list
81 # There is also a single form shortcut that Perl adds for many of the commonly
82 # used properties. This happens for all binary properties, plus script,
83 # general_category, and block properties.
85 # Thus the outputs of this program are files. There are map files, mostly in
86 # the 'To' directory; and there are list files for use in regular expression
87 # matching, all in subdirectories of the 'lib' directory, with each
88 # subdirectory being named for the property that the lists in it are for.
89 # Bookkeeping, test, and documentation files are also generated.
91 my $matches_directory = 'lib'; # Where match (\p{}) files go.
92 my $map_directory = 'To'; # Where map files go.
96 # The major data structures of this program are Property, of course, but also
97 # Table. There are two kinds of tables, very similar to each other.
98 # "Match_Table" is the data structure giving the list of code points that have
99 # a particular property value, mentioned above. There is also a "Map_Table"
100 # data structure which gives the property's mapping from code point to value.
101 # There are two structures because the match tables need to be combined in
102 # various ways, such as constructing unions, intersections, complements, etc.,
103 # and the map ones don't. And there would be problems, perhaps subtle, if
104 # a map table were inadvertently operated on in some of those ways.
105 # The use of separate classes with operations defined on one but not the other
106 # prevents accidentally confusing the two.
108 # At the heart of each table's data structure is a "Range_List", which is just
109 # an ordered list of "Ranges", plus ancillary information, and methods to
110 # operate on them. A Range is a compact way to store property information.
111 # Each range has a starting code point, an ending code point, and a value that
112 # is meant to apply to all the code points between the two end points,
113 # inclusive. For a map table, this value is the property value for those
114 # code points. Two such ranges could be written like this:
115 # 0x41 .. 0x5A, 'Upper',
116 # 0x61 .. 0x7A, 'Lower'
118 # Each range also has a type used as a convenience to classify the values.
119 # Most ranges in this program will be Type 0, or normal, but there are some
120 # ranges that have a non-zero type. These are used only in map tables, and
121 # are for mappings that don't fit into the normal scheme of things. Mappings
122 # that require a hash entry to communicate with utf8.c are one example;
123 # another example is mappings for charnames.pm to use which indicate a name
124 # that is algorithmically determinable from its code point (and vice-versa).
125 # These are used to significantly compact these tables, instead of listing
126 # each one of the tens of thousands individually.
128 # In a match table, the value of a range is irrelevant (and hence the type as
129 # well, which will always be 0), and arbitrarily set to the null string.
130 # Using the example above, there would be two match tables for those two
131 # entries, one named Upper would contain the 0x41..0x5A range, and the other
132 # named Lower would contain 0x61..0x7A.
134 # Actually, there are two types of range lists, "Range_Map" is the one
135 # associated with map tables, and "Range_List" with match tables.
136 # Again, this is so that methods can be defined on one and not the other so as
137 # to prevent operating on them in incorrect ways.
139 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
140 # in the perl core. All tables could in theory be written, but some are
141 # suppressed because there is no current practical use for them. It is easy
142 # to change which get written by changing various lists that are near the top
143 # of the actual code in this file. The table data structures contain enough
144 # ancillary information to allow them to be treated as separate entities for
145 # writing, such as the path to each one's file. There is a heading in each
146 # map table that gives the format of its entries, and what the map is for all
147 # the code points missing from it. (This allows tables to be more compact.)
149 # The Property data structure contains one or more tables. All properties
150 # contain a map table (except the $perl property which is a
151 # pseudo-property containing only match tables), and any properties that
152 # are usable in regular expression matches also contain various matching
153 # tables, one for each value the property can have. A binary property can
154 # have two values, True and False (or Y and N, which are preferred by Unicode
155 # terminology). Thus each of these properties will have a map table that
156 # takes every code point and maps it to Y or N (but having ranges cuts the
157 # number of entries in that table way down), and two match tables, one
158 # which has a list of all the code points that map to Y, and one for all the
159 # code points that map to N. (For each of these, a third table is also
160 # generated for the pseudo Perl property. It contains the identical code
161 # points as the Y table, but can be written, not in the compound form, but in
162 # a "single" form like \p{IsUppercase}.) Many properties are binary, but some
163 # properties have several possible values, some have many, and properties like
164 # Name have a different value for every named code point. Those will not,
165 # unless the controlling lists are changed, have their match tables written
166 # out. But all the ones which can be used in regular expression \p{} and \P{}
167 # constructs will. Generally a property will have either its map table or its
168 # match tables written but not both. Again, what gets written is controlled
169 # by lists which can easily be changed.
171 # For information about the Unicode properties, see Unicode's UAX44 document:
173 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
175 # As stated earlier, this program will work on any release of Unicode so far.
176 # Most obvious problems in earlier data have NOT been corrected except when
177 # necessary to make Perl or this program work reasonably. For example, no
178 # folding information was given in early releases, so this program uses the
179 # substitute of lower case, just so that a regular expression with the /i
180 # option will do something that actually gives the right results in many
181 # cases. There are also a couple other corrections for version 1.1.5,
182 # commented at the point they are made. As an example of corrections that
183 # weren't made (but could be) is this statement from DerivedAge.txt: "The
184 # supplementary private use code points and the non-character code points were
185 # assigned in version 2.0, but not specifically listed in the UCD until
186 # versions 3.0 and 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0)
187 # More information on Unicode version glitches is further down in these
188 # introductory comments.
190 # This program works on all properties as of 5.2, though the files for some
191 # are suppressed from apparent lack of demand for. You can change which are
192 # output by changing lists in this program.
194 # The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
195 # loose matchings rules (from Unicode TR18):
197 # The recommended names for UCD properties and property values are in
198 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
199 # [PropValue]. There are both abbreviated names and longer, more
200 # descriptive names. It is strongly recommended that both names be
201 # recognized, and that loose matching of property names be used,
202 # whereby the case distinctions, whitespace, hyphens, and underbar
204 # The program still allows Fuzzy to override its determination of if loose
205 # matching should be used, but it isn't currently used, as it is no longer
206 # needed; the calculations it makes are good enough.
208 # SUMMARY OF HOW IT WORKS:
212 # A list is constructed containing each input file that is to be processed
214 # Each file on the list is processed in a loop, using the associated handler
216 # The PropertyAliases.txt and PropValueAliases.txt files are processed
217 # first. These files name the properties and property values.
218 # Objects are created of all the property and property value names
219 # that the rest of the input should expect, including all synonyms.
220 # The other input files give mappings from properties to property
221 # values. That is, they list code points and say what the mapping
222 # is under the given property. Some files give the mappings for
223 # just one property; and some for many. This program goes through
224 # each file and populates the properties from them. Some properties
225 # are listed in more than one file, and Unicode has set up a
226 # precedence as to which has priority if there is a conflict. Thus
227 # the order of processing matters, and this program handles the
228 # conflict possibility by processing the overriding input files
229 # last, so that if necessary they replace earlier values.
230 # After this is all done, the program creates the property mappings not
231 # furnished by Unicode, but derivable from what it does give.
232 # The tables of code points that match each property value in each
233 # property that is accessible by regular expressions are created.
234 # The Perl-defined properties are created and populated. Many of these
235 # require data determined from the earlier steps
236 # Any Perl-defined synonyms are created, and name clashes between Perl
237 # and Unicode are reconciled.
238 # All the properties are written to files
239 # Any other files are written, and final warnings issued.
241 # As mentioned above, some properties are given in more than one file. In
242 # particular, the files in the extracted directory are supposedly just
243 # reformattings of the others. But they contain information not easily
244 # derivable from the other files, including results for Unihan, which this
245 # program doesn't ordinarily look at, and for unassigned code points. They
246 # also have historically had errors or been incomplete. In an attempt to
247 # create the best possible data, this program thus processes them first to
248 # glean information missing from the other files; then processes those other
249 # files to override any errors in the extracted ones.
251 # For clarity, a number of operators have been overloaded to work on tables:
252 # ~ means invert (take all characters not in the set). The more
253 # conventional '!' is not used because of the possibility of confusing
254 # it with the actual boolean operation.
256 # - means subtraction
257 # & means intersection
258 # The precedence of these is the order listed. Parentheses should be
259 # copiously used. These are not a general scheme. The operations aren't
260 # defined for a number of things, deliberately, to avoid getting into trouble.
261 # Operations are done on references and affect the underlying structures, so
262 # that the copy constructors for them have been overloaded to not return a new
263 # clone, but the input object itself.
265 # The bool operator is deliberately not overloaded to avoid confusion with
266 # "should it mean if the object merely exists, or also is non-empty?".
269 # WHY CERTAIN DESIGN DECISIONS WERE MADE
271 # XXX These comments need more work.
273 # Why have files written out for binary 'N' matches?
274 # For binary properties, if you know the mapping for either Y or N; the
275 # other is trivial to construct, so could be done at Perl run-time instead
276 # of having a file for it. That is, if someone types in \p{foo: N}, Perl
277 # could translate that to \P{foo: Y} and not need a file. The problem is
278 # communicating to Perl that a given property is binary. Perl can't figure
279 # it out from looking at the N (or No), as some non-binary properties have
280 # these as property values.
282 # There are several types of properties, based on what form their values can
283 # take on. These are described in more detail below in the DATA STRUCTURES
284 # section of these comments, but for now, you should know that there are
285 # string properties, whose values are strings of one or more code points (such
286 # as the Uppercase_mapping property); every other property maps to some other
287 # form, like true or false, or a number, or a name, etc. The reason there are
288 # two directories for map files is because of the way utf8.c works. It
289 # expects that any files there are string properties, that is that the
290 # mappings are each to one code point, with mappings in multiple code points
291 # handled specially in an extra hash data structure. Digit.pl is a table that
292 # is written there for historical reasons, even though it doesn't fit that
293 # mold. Thus it can't currently be looked at by the Perl core.
295 # There are no match tables generated for matches of the null string. These
296 # would like like \p{JSN=}. Perhaps something like them could be added if
297 # necessary. The JSN does have a real code point U+110B that maps to the null
298 # string, but it is a contributory property, and therefore not output by
303 # XXX Add more stuff here. use perl instead of miniperl to find problems with
308 # The program would break if Unicode were to change its names so that
309 # interior white space, underscores, or dashes differences were significant
310 # within property and property value names.
312 # It might be easier to use the xml versions of the UCD if this program ever
313 # would need heavy revision, and the ability to handle old versions was not
316 # There is the potential for name collisions, in that Perl has chosen names
317 # that Unicode could decide it also likes. There have been such collisions in
318 # the past, with mostly Perl deciding to adopt the Unicode definition of the
319 # name. However in the 5.2 Unicode beta testing, there were a number of such
320 # collisions, which were withdrawn before the final release, because of Perl's
321 # and other's protests. These all involved new properties which began with
322 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
323 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
324 # Unicode document, so they are unlikely to be used by Unicode for another
325 # purpose. However, they might try something beginning with 'In', or use any
326 # of the other Perl-defined properties. This program will warn you of name
327 # collisions, and refuse to generate tables with them, but manual intervention
328 # will be required in this event. One scheme that could be implemented, if
329 # necessary, would be to have this program generate another file, or add a
330 # field to mktables.lst that gives the date of first definition of a property.
331 # Each new release of Unicode would use that file as a basis for the next
332 # iteration. And the Perl synonym addition code could sort based on the age
333 # of the property, so older properties get priority, and newer ones that clash
334 # would be refused; hence existing code would not be impacted, and some other
335 # synonym would have to be used for the new property. This is ugly, and
336 # manual intervention would certainly be easier to do in the short run; lets
337 # hope it never comes to this.
341 # This program can generate tables from the Unihan database. But it doesn't
342 # by default, letting the CPAN module Unicode::Unihan handle them. Prior to
343 # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
344 # database was split into 8 different files, all beginning with the letters
345 # 'Unihan'. This program will read those file(s) if present, but it needs to
346 # know which of the many properties in the file(s) should have tables created
347 # for them. It will create tables for any properties listed in
348 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
349 # @cjk_properties array and the @cjk_property_values array. Thus, if a
350 # property you want is not in those files of the release you are building
351 # against, you must add it to those two arrays. Starting in 4.0, the
352 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
353 # is present in the directory, a table will be generated for that property.
354 # In 5.2, several more properties were added. For your convenience, the two
355 # arrays are initialized with all the 5.2 listed properties that are also in
356 # earlier releases. But these are commented out. You can just uncomment the
357 # ones you want, or use them as a template for adding entries for other
360 # You may need to adjust the entries to suit your purposes. setup_unihan(),
361 # and filter_unihan_line() are the functions where this is done. This program
362 # already does some adjusting to make the lines look more like the rest of the
363 # Unicode DB; You can see what that is in filter_unihan_line()
365 # There is a bug in the 3.2 data file in which some values for the
366 # kPrimaryNumeric property have commas and an unexpected comment. A filter
367 # could be added for these; or for a particular installation, the Unihan.txt
368 # file could be edited to fix them.
373 # Unicode Versions Notes
375 # alpha's numbers halve in 2.1.9, answer cjk block at 4E00 were removed from PropList; not changed, could add gc Letter, put back in in 3.1.0
376 # Some versions of 2.1.x Jamo.txt have the wrong value for 1105, which causes
377 # real problems for the algorithms for Jamo calculations, so it is changed
379 # White space vs Space. in 3.2 perl has +205F=medium math space, fixed in 4.0, and ok in 3.1.1 because not there in unicode. synonym introduced in 4.1
380 # ATBL = 202. 202 changed to ATB, and all code points stayed there. So if you were useing ATBL you were out of luck.
381 # Hrkt Katakana_Or_Hiragana came in 4.01, before was Unknown.
383 # The default for missing code points for BidiClass is complicated. Starting
384 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
385 # tries to do the best it can for earlier releases. It is done in
386 # process_PropertyAliases()
388 ##############################################################################
390 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
392 my $MAX_LINE_WIDTH = 78;
394 # Debugging aid to skip most files so as to not be distracted by them when
395 # concentrating on the ones being debugged. Add
397 # to the constructor for those files you want processed when you set this.
398 # Files with a first version number of 0 are special: they are always
399 # processed regardless of the state of this flag.
402 # Set to 1 to enable tracing.
405 { # Closure for trace: debugging aid
406 my $print_caller = 1; # ? Include calling subroutine name
407 my $main_with_colon = 'main::';
408 my $main_colon_length = length($main_with_colon);
411 return unless $to_trace; # Do nothing if global flag not set
415 local $DB::trace = 0;
416 $DB::trace = 0; # Quiet 'used only once' message
420 # Loop looking up the stack to get the first non-trace caller
425 $line_number = $caller_line;
426 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
427 $caller = $main_with_colon unless defined $caller;
429 $caller_name = $caller;
432 $caller_name =~ s/.*:://;
433 if (substr($caller_name, 0, $main_colon_length)
436 $caller_name = substr($caller_name, $main_colon_length);
439 } until ($caller_name ne 'trace');
441 # If the stack was empty, we were called from the top level
442 $caller_name = 'main' if ($caller_name eq ""
443 || $caller_name eq 'trace');
446 foreach my $string (@input) {
447 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
448 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
449 $output .= simple_dumper($string);
452 $string = "$string" if ref $string;
453 $string = $UNDEF unless defined $string;
455 $string = '""' if $string eq "";
456 $output .= " " if $output ne ""
458 && substr($output, -1, 1) ne " "
459 && substr($string, 0, 1) ne " ";
464 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
465 print STDERR "$caller_name: " if $print_caller;
466 print STDERR $output, "\n";
471 # This is for a rarely used development feature that allows you to compare two
472 # versions of the Unicode standard without having to deal with changes caused
473 # by the code points introduced in the later verson. Change the 0 to a SINGLE
474 # dotted Unicode release number (e.g. 2.1). Only code points introduced in
475 # that release and earlier will be used; later ones are thrown away. You use
476 # the version number of the earliest one you want to compare; then run this
477 # program on directory structures containing each release, and compare the
478 # outputs. These outputs will therefore include only the code points common
479 # to both releases, and you can see the changes caused just by the underlying
480 # release semantic changes. For versions earlier than 3.2, you must copy a
481 # version of DAge.txt into the directory.
482 my $string_compare_versions = DEBUG && 0; # e.g., v2.1;
483 my $compare_versions = DEBUG
484 && $string_compare_versions
485 && pack "C*", split /\./, $string_compare_versions;
488 # Returns non-duplicated input values. From "Perl Best Practices:
489 # Encapsulated Cleverness". p. 455 in first edition.
492 return grep { ! $seen{$_}++ } @_;
495 $0 = File::Spec->canonpath($0);
497 my $make_test_script = 0; # ? Should we output a test script
498 my $write_unchanged_files = 0; # ? Should we update the output files even if
499 # we don't think they have changed
500 my $use_directory = ""; # ? Should we chdir somewhere.
501 my $pod_directory; # input directory to store the pod file.
502 my $pod_file = 'perluniprops';
503 my $t_path; # Path to the .t test file
504 my $file_list = 'mktables.lst'; # File to store input and output file names.
505 # This is used to speed up the build, by not
506 # executing the main body of the program if
507 # nothing on the list has changed since the
509 my $make_list = 1; # ? Should we write $file_list. Set to always
510 # make a list so that when the pumpking is
511 # preparing a release, s/he won't have to do
513 my $glob_list = 0; # ? Should we try to include unknown .txt files
515 my $output_range_counts = 1; # ? Should we include the number of code points
516 # in ranges in the output
517 # Verbosity levels; 0 is quiet
518 my $NORMAL_VERBOSITY = 1;
522 my $verbosity = $NORMAL_VERBOSITY;
526 my $arg = shift @ARGV;
528 $verbosity = $VERBOSE;
530 elsif ($arg eq '-p') {
531 $verbosity = $PROGRESS;
532 $| = 1; # Flush buffers as we go.
534 elsif ($arg eq '-q') {
537 elsif ($arg eq '-w') {
538 $write_unchanged_files = 1; # update the files even if havent changed
540 elsif ($arg eq '-check') {
541 my $this = shift @ARGV;
542 my $ok = shift @ARGV;
544 print "Skipping as check params are not the same.\n";
548 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
549 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
551 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
553 $make_test_script = 1;
555 elsif ($arg eq '-makelist') {
558 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
559 -d $use_directory or croak "Unknown directory '$use_directory'";
561 elsif ($arg eq '-L') {
563 # Existence not tested until have chdir'd
566 elsif ($arg eq '-globlist') {
569 elsif ($arg eq '-c') {
570 $output_range_counts = ! $output_range_counts
574 $with_c .= 'out' if $output_range_counts; # Complements the state
576 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
577 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
579 -c : Output comments $with_c number of code points in ranges
580 -q : Quiet Mode: Only output serious warnings.
581 -p : Set verbosity level to normal plus show progress.
582 -v : Set Verbosity level high: Show progress and non-serious
584 -w : Write files regardless
585 -C dir : Change to this directory before proceeding. All relative paths
586 except those specified by the -P and -T options will be done
587 with respect to this directory.
588 -P dir : Output $pod_file file to directory 'dir'.
589 -T path : Create a test script as 'path'; overrides -maketest
590 -L filelist : Use alternate 'filelist' instead of standard one
591 -globlist : Take as input all non-Test *.txt files in current and sub
593 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
595 -makelist : Rewrite the file list $file_list based on current setup
596 -check A B : Executes $0 only if A and B are the same
601 # Stores the most-recently changed file. If none have changed, can skip the
603 my $youngest = -M $0; # Do this before the chdir!
605 # Change directories now, because need to read 'version' early.
606 if ($use_directory) {
607 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
608 $pod_directory = File::Spec->rel2abs($pod_directory);
610 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
611 $t_path = File::Spec->rel2abs($t_path);
613 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
614 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
615 $pod_directory = File::Spec->abs2rel($pod_directory);
617 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
618 $t_path = File::Spec->abs2rel($t_path);
622 # Get Unicode version into regular and v-string. This is done now because
623 # various tables below get populated based on it. These tables are populated
624 # here to be near the top of the file, and so easily seeable by those needing
626 open my $VERSION, "<", "version"
627 or croak "$0: can't open required file 'version': $!\n";
628 my $string_version = <$VERSION>;
630 chomp $string_version;
631 my $v_version = pack "C*", split /\./, $string_version; # v string
633 # The following are the complete names of properties with property values that
634 # are known to not match any code points in some versions of Unicode, but that
635 # may change in the future so they should be matchable, hence an empty file is
636 # generated for them.
637 my @tables_that_may_be_empty = (
638 'Joining_Type=Left_Joining',
640 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
641 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
642 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
643 if $v_version ge v4.1.0;
645 # The lists below are hashes, so the key is the item in the list, and the
646 # value is the reason why it is in the list. This makes generation of
647 # documentation easier.
649 my %why_suppressed; # No file generated for these.
651 # Files aren't generated for empty extraneous properties. This is arguable.
652 # Extraneous properties generally come about because a property is no longer
653 # used in a newer version of Unicode. If we generated a file without code
654 # points, programs that used to work on that property will still execute
655 # without errors. It just won't ever match (or will always match, with \P{}).
656 # This means that the logic is now likely wrong. I (khw) think its better to
657 # find this out by getting an error message. Just move them to the table
658 # above to change this behavior
659 my %why_suppress_if_empty_warn_if_not = (
661 # It is the only property that has ever officially been removed from the
662 # Standard. The database never contained any code points for it.
663 'Special_Case_Condition' => 'Obsolete',
665 # Apparently never official, but there were code points in some versions of
666 # old-style PropList.txt
667 'Non_Break' => 'Obsolete',
670 # These would normally go in the warn table just above, but they were changed
671 # a long time before this program was written, so warnings about them are
673 if ($v_version gt v3.2.0) {
674 push @tables_that_may_be_empty,
675 'Canonical_Combining_Class=Attached_Below_Left'
678 # These are listed in the Property aliases file in 5.2, but Unihan is ignored
679 # unless explicitly added.
680 if ($v_version ge v5.2.0) {
681 my $unihan = 'Unihan; remove from list if using Unihan';
682 foreach my $table qw (
686 kCompatibilityVariant
700 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
704 # Properties that this program ignores.
705 my @unimplemented_properties = (
706 'Unicode_Radical_Stroke' # Remove if changing to handle this one.
709 # There are several types of obsolete properties defined by Unicode. These
710 # must be hand-edited for every new Unicode release.
711 my %why_deprecated; # Generates a deprecated warning message if used.
712 my %why_stabilized; # Documentation only
713 my %why_obsolete; # Documentation only
716 my $simple = 'Perl uses the more complete version of this property';
717 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
719 my $other_properties = 'other properties';
720 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
721 my $why_no_expand = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
724 'Grapheme_Link' => 'Deprecated by Unicode. Use ccc=vr (Canonical_Combining_Class=Virama) instead',
725 'Jamo_Short_Name' => $contributory,
726 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
727 'Other_Alphabetic' => $contributory,
728 'Other_Default_Ignorable_Code_Point' => $contributory,
729 'Other_Grapheme_Extend' => $contributory,
730 'Other_ID_Continue' => $contributory,
731 'Other_ID_Start' => $contributory,
732 'Other_Lowercase' => $contributory,
733 'Other_Math' => $contributory,
734 'Other_Uppercase' => $contributory,
738 # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
739 # contains the same information, but without the algorithmically
740 # determinable Hangul syllables'. This file is not published, so it's
741 # existence is not noted in the comment.
742 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
744 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
745 'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo. If there is no later name for a code point, then this one is used instead in charnames",
747 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
748 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
749 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
750 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
752 'Name' => "Accessible via 'use charnames;'",
753 'Name_Alias' => "Accessible via 'use charnames;'",
755 # These are sort of jumping the gun; deprecation is proposed for
756 # Unicode version 6.0, but they have never been exposed by Perl, and
757 # likely are soon to be deprecated, so best not to expose them.
758 FC_NFKC_Closure => 'Use NFKC_Casefold instead',
759 Expands_On_NFC => $why_no_expand,
760 Expands_On_NFD => $why_no_expand,
761 Expands_On_NFKC => $why_no_expand,
762 Expands_On_NFKD => $why_no_expand,
765 # The following are suppressed because they were made contributory or
766 # deprecated by Unicode before Perl ever thought about supporting them.
767 foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
768 $why_suppressed{$property} = $why_deprecated{$property};
771 # Customize the message for all the 'Other_' properties
772 foreach my $property (keys %why_deprecated) {
773 next if (my $main_property = $property) !~ s/^Other_//;
774 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
778 if ($v_version ge 4.0.0) {
779 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
781 if ($v_version ge 5.2.0) {
782 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
785 # Probably obsolete forever
786 if ($v_version ge v4.1.0) {
787 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common"';
790 # This program can create files for enumerated-like properties, such as
791 # 'Numeric_Type'. This file would be the same format as for a string
792 # property, with a mapping from code point to its value, so you could look up,
793 # for example, the script a code point is in. But no one so far wants this
794 # mapping, or they have found another way to get it since this is a new
795 # feature. So no file is generated except if it is in this list.
796 my @output_mapped_properties = split "\n", <<END;
799 # If you are using the Unihan database, you need to add the properties that
800 # you want to extract from it to this table. For your convenience, the
801 # properties in the 5.2 PropertyAliases.txt file are listed, commented out
802 my @cjk_properties = split "\n", <<'END';
803 #cjkAccountingNumeric; kAccountingNumeric
804 #cjkOtherNumeric; kOtherNumeric
805 #cjkPrimaryNumeric; kPrimaryNumeric
806 #cjkCompatibilityVariant; kCompatibilityVariant
808 #cjkIRG_GSource; kIRG_GSource
809 #cjkIRG_HSource; kIRG_HSource
810 #cjkIRG_JSource; kIRG_JSource
811 #cjkIRG_KPSource; kIRG_KPSource
812 #cjkIRG_KSource; kIRG_KSource
813 #cjkIRG_TSource; kIRG_TSource
814 #cjkIRG_USource; kIRG_USource
815 #cjkIRG_VSource; kIRG_VSource
816 #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
819 # Similarly for the property values. For your convenience, the lines in the
820 # 5.2 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
822 my @cjk_property_values = split "\n", <<'END';
823 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
824 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
825 ## @missing: 0000..10FFFF; cjkIICore; <none>
826 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
827 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
828 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
829 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
830 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
831 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
832 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
833 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
834 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
835 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
836 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
839 # The input files don't list every code point. Those not listed are to be
840 # defaulted to some value. Below are hard-coded what those values are for
841 # non-binary properties as of 5.1. Starting in 5.0, there are
842 # machine-parsable comment lines in the files the give the defaults; so this
843 # list shouldn't have to be extended. The claim is that all missing entries
844 # for binary properties will default to 'N'. Unicode tried to change that in
845 # 5.2, but the beta period produced enough protest that they backed off.
847 # The defaults for the fields that appear in UnicodeData.txt in this hash must
848 # be in the form that it expects. The others may be synonyms.
849 my $CODE_POINT = '<code point>';
850 my %default_mapping = (
852 # Bidi_Class => Complicated; set in code
853 Bidi_Mirroring_Glyph => "",
855 Canonical_Combining_Class => 0,
856 Case_Folding => $CODE_POINT,
857 Decomposition_Mapping => $CODE_POINT,
858 Decomposition_Type => 'None',
859 East_Asian_Width => "Neutral",
860 FC_NFKC_Closure => $CODE_POINT,
861 General_Category => 'Cn',
862 Grapheme_Cluster_Break => 'Other',
863 Hangul_Syllable_Type => 'NA',
865 Jamo_Short_Name => "",
866 Joining_Group => "No_Joining_Group",
867 # Joining_Type => Complicated; set in code
868 kIICore => 'N', # Is converted to binary
869 #Line_Break => Complicated; set in code
870 Lowercase_Mapping => $CODE_POINT,
877 Numeric_Type => 'None',
878 Numeric_Value => 'NaN',
879 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
880 Sentence_Break => 'Other',
881 Simple_Case_Folding => $CODE_POINT,
882 Simple_Lowercase_Mapping => $CODE_POINT,
883 Simple_Titlecase_Mapping => $CODE_POINT,
884 Simple_Uppercase_Mapping => $CODE_POINT,
885 Titlecase_Mapping => $CODE_POINT,
886 Unicode_1_Name => "",
887 Unicode_Radical_Stroke => "",
888 Uppercase_Mapping => $CODE_POINT,
889 Word_Break => 'Other',
892 # Below are files that Unicode furnishes, but this program ignores, and why
893 my %ignored_files = (
894 'CJKRadicals.txt' => 'Unihan data',
895 'Index.txt' => 'An index, not actual data',
896 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
897 'NamesList.txt' => 'Just adds commentary',
898 'NormalizationCorrections.txt' => 'Data is already in other files.',
899 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
900 'ReadMe.txt' => 'Just comments',
901 'README.TXT' => 'Just comments',
902 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
905 ################ End of externally interesting definitions ###############
908 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
909 # This file is machine-generated by $0 from the Unicode
910 # database, Version $string_version. Any changes made here will be lost!
913 my $INTERNAL_ONLY=<<"EOF";
915 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
916 # This file is for internal use by the Perl program only. The format and even
917 # the name or existence of this file are subject to change without notice.
918 # Don't use it directly.
921 my $DEVELOPMENT_ONLY=<<"EOF";
922 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
923 # This file contains information artificially constrained to code points
924 # present in Unicode release $string_compare_versions.
925 # IT CANNOT BE RELIED ON. It is for use during development only and should
926 # not be used for production.
930 my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
931 my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
932 my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
934 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
935 # two must be 10; if there are 5, the first must not be a 0. Written this way
936 # to decrease backtracking
938 qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
940 # This matches the beginning of the line in the Unicode db files that give the
941 # defaults for code points not listed (i.e., missing) in the file. The code
942 # depends on this ending with a semi-colon, so it can assume it is a valid
943 # field when the line is split() by semi-colons
944 my $missing_defaults_prefix =
945 qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
947 # Property types. Unicode has more types, but these are sufficient for our
949 my $UNKNOWN = -1; # initialized to illegal value
950 my $NON_STRING = 1; # Either binary or enum
952 my $ENUM = 3; # Include catalog
953 my $STRING = 4; # Anything else: string or misc
955 # Some input files have lines that give default values for code points not
956 # contained in the file. Sometimes these should be ignored.
957 my $NO_DEFAULTS = 0; # Must evaluate to false
961 # Range types. Each range has a type. Most ranges are type 0, for normal,
962 # and will appear in the main body of the tables in the output files, but
963 # there are other types of ranges as well, listed below, that are specially
964 # handled. There are pseudo-types as well that will never be stored as a
965 # type, but will affect the calculation of the type.
967 # 0 is for normal, non-specials
968 my $MULTI_CP = 1; # Sequence of more than code point
969 my $HANGUL_SYLLABLE = 2;
970 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
971 my $NULL = 4; # The map is to the null string; utf8.c can't
972 # handle these, nor is there an accepted syntax
973 # for them in \p{} constructs
974 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
975 # otherwise be $MULTI_CP type are instead type 0
977 # process_generic_property_file() can accept certain overrides in its input.
978 # Each of these must begin AND end with $CMD_DELIM.
979 my $CMD_DELIM = "\a";
980 my $REPLACE_CMD = 'replace'; # Override the Replace
981 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
986 # Values for the Replace argument to add_range.
987 # $NO # Don't replace; add only the code points not
989 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
990 # the comments at the subroutine definition.
991 my $UNCONDITIONALLY = 2; # Replace without conditions.
992 my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
995 # Flags to give property statuses. The phrases are to remind maintainers that
996 # if the flag is changed, the indefinite article referring to it in the
997 # documentation may need to be as well.
999 my $SUPPRESSED = 'z'; # The character should never actually be seen, since
1001 my $PLACEHOLDER = 'P'; # Implies no pod entry generated
1002 my $DEPRECATED = 'D';
1003 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1004 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1005 my $DISCOURAGED = 'X';
1006 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1007 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1009 my $a_bold_stricter = "a 'B<$STRICTER>'";
1010 my $A_bold_stricter = "A 'B<$STRICTER>'";
1011 my $STABILIZED = 'S';
1012 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1013 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1015 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1016 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1018 my %status_past_participles = (
1019 $DISCOURAGED => 'discouraged',
1020 $SUPPRESSED => 'should never be generated',
1021 $STABILIZED => 'stabilized',
1022 $OBSOLETE => 'obsolete',
1023 $DEPRECATED => 'deprecated',
1026 # The format of the values of the map tables:
1027 my $BINARY_FORMAT = 'b';
1028 my $DECIMAL_FORMAT = 'd';
1029 my $FLOAT_FORMAT = 'f';
1030 my $INTEGER_FORMAT = 'i';
1031 my $HEX_FORMAT = 'x';
1032 my $RATIONAL_FORMAT = 'r';
1033 my $STRING_FORMAT = 's';
1035 my %map_table_formats = (
1036 $BINARY_FORMAT => 'binary',
1037 $DECIMAL_FORMAT => 'single decimal digit',
1038 $FLOAT_FORMAT => 'floating point number',
1039 $INTEGER_FORMAT => 'integer',
1040 $HEX_FORMAT => 'positive hex whole number; a code point',
1041 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1042 $STRING_FORMAT => 'arbitrary string',
1045 # Unicode didn't put such derived files in a separate directory at first.
1046 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1047 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1048 my $AUXILIARY = 'auxiliary';
1050 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1051 my %loose_to_file_of; # loosely maps table names to their respective
1053 my %stricter_to_file_of; # same; but for stricter mapping.
1054 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1055 # their rational equivalent
1056 my %loose_property_name_of; # Loosely maps property names to standard form
1058 # These constants names and values were taken from the Unicode standard,
1059 # version 5.1, section 3.12. They are used in conjunction with Hangul
1069 my $NCount = $VCount * $TCount;
1071 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1072 # with the above published constants.
1074 my %Jamo_L; # Leading consonants
1075 my %Jamo_V; # Vowels
1076 my %Jamo_T; # Trailing consonants
1078 my @backslash_X_tests; # List of tests read in for testing \X
1079 my @unhandled_properties; # Will contain a list of properties found in
1080 # the input that we didn't process.
1081 my @match_properties; # Properties that have match tables, to be
1083 my @map_properties; # Properties that get map files written
1084 my @named_sequences; # NamedSequences.txt contents.
1085 my %potential_files; # Generated list of all .txt files in the directory
1086 # structure so we can warn if something is being
1088 my @files_actually_output; # List of files we generated.
1089 my @more_Names; # Some code point names are compound; this is used
1090 # to store the extra components of them.
1091 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1092 # the minimum before we consider it equivalent to a
1093 # candidate rational
1094 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1096 # These store references to certain commonly used property objects
1101 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1102 my $has_In_conflicts = 0;
1103 my $has_Is_conflicts = 0;
1105 sub internal_file_to_platform ($) {
1106 # Convert our file paths which have '/' separators to those of the
1110 return undef unless defined $file;
1112 return File::Spec->join(split '/', $file);
1115 sub file_exists ($) { # platform independent '-e'. This program internally
1116 # uses slash as a path separator.
1118 return 0 if ! defined $file;
1119 return -e internal_file_to_platform($file);
1123 # Returns the address of the blessed input object.
1124 # It doesn't check for blessedness because that would do a string eval
1125 # every call, and the program is structured so that this is never called
1126 # for a non-blessed object.
1128 no overloading; # If overloaded, numifying below won't work.
1130 # Numifying a ref gives its address.
1134 # Commented code below should work on Perl 5.8.
1135 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1136 ## the native perl version of it (which is what would operate under miniperl)
1137 ## is extremely slow, as it does a string eval every call.
1138 #my $has_fast_scalar_util = $
\18 !~ /miniperl/
1139 # && defined eval "require Scalar::Util";
1142 # # Returns the address of the blessed input object. Uses the XS version if
1143 # # available. It doesn't check for blessedness because that would do a
1144 # # string eval every call, and the program is structured so that this is
1145 # # never called for a non-blessed object.
1147 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1149 # # Check at least that is a ref.
1150 # my $pkg = ref($_[0]) or return undef;
1152 # # Change to a fake package to defeat any overloaded stringify
1153 # bless $_[0], 'main::Fake';
1155 # # Numifying a ref gives its address.
1156 # my $addr = 0 + $_[0];
1158 # # Return to original class
1159 # bless $_[0], $pkg;
1166 return $a if $a >= $b;
1173 return $a if $a <= $b;
1177 sub clarify_number ($) {
1178 # This returns the input number with underscores inserted every 3 digits
1179 # in large (5 digits or more) numbers. Input must be entirely digits, not
1183 my $pos = length($number) - 3;
1184 return $number if $pos <= 1;
1186 substr($number, $pos, 0) = '_';
1195 # These routines give a uniform treatment of messages in this program. They
1196 # are placed in the Carp package to cause the stack trace to not include them,
1197 # although an alternative would be to use another package and set @CARP_NOT
1200 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1202 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1203 # and overload trying to load Scalar:Util under miniperl. See
1204 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1205 undef $overload::VERSION;
1208 my $message = shift || "";
1209 my $nofold = shift || 0;
1212 $message = main::join_lines($message);
1213 $message =~ s/^$0: *//; # Remove initial program name
1214 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1215 $message = "\n$0: $message;";
1217 # Fold the message with program name, semi-colon end punctuation
1218 # (which looks good with the message that carp appends to it), and a
1219 # hanging indent for continuation lines.
1220 $message = main::simple_fold($message, "", 4) unless $nofold;
1221 $message =~ s/\n$//; # Remove the trailing nl so what carp
1222 # appends is to the same line
1225 return $message if defined wantarray; # If a caller just wants the msg
1232 # This is called when it is clear that the problem is caused by a bug in
1235 my $message = shift;
1236 $message =~ s/^$0: *//;
1237 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1242 sub carp_too_few_args {
1244 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1248 my $args_ref = shift;
1251 my_carp_bug("Need at least $count arguments to "
1253 . ". Instead got: '"
1254 . join ', ', @$args_ref
1255 . "'. No action taken.");
1259 sub carp_extra_args {
1260 my $args_ref = shift;
1261 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1263 unless (ref $args_ref) {
1264 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1267 my ($package, $file, $line) = caller;
1268 my $subroutine = (caller 1)[3];
1271 if (ref $args_ref eq 'HASH') {
1272 foreach my $key (keys %$args_ref) {
1273 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1275 $list = join ', ', each %{$args_ref};
1277 elsif (ref $args_ref eq 'ARRAY') {
1278 foreach my $arg (@$args_ref) {
1279 $arg = $UNDEF unless defined $arg;
1281 $list = join ', ', @$args_ref;
1284 my_carp_bug("Can't cope with ref "
1286 . " . argument to 'carp_extra_args'. Not checking arguments.");
1290 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1298 # This program uses the inside-out method for objects, as recommended in
1299 # "Perl Best Practices". This closure aids in generating those. There
1300 # are two routines. setup_package() is called once per package to set
1301 # things up, and then set_access() is called for each hash representing a
1302 # field in the object. These routines arrange for the object to be
1303 # properly destroyed when no longer used, and for standard accessor
1304 # functions to be generated. If you need more complex accessors, just
1305 # write your own and leave those accesses out of the call to set_access().
1306 # More details below.
1308 my %constructor_fields; # fields that are to be used in constructors; see
1311 # The values of this hash will be the package names as keys to other
1312 # hashes containing the name of each field in the package as keys, and
1313 # references to their respective hashes as values.
1317 # Sets up the package, creating standard DESTROY and dump methods
1318 # (unless already defined). The dump method is used in debugging by
1320 # The optional parameters are:
1321 # a) a reference to a hash, that gets populated by later
1322 # set_access() calls with one of the accesses being
1323 # 'constructor'. The caller can then refer to this, but it is
1324 # not otherwise used by these two routines.
1325 # b) a reference to a callback routine to call during destruction
1326 # of the object, before any fields are actually destroyed
1329 my $constructor_ref = delete $args{'Constructor_Fields'};
1330 my $destroy_callback = delete $args{'Destroy_Callback'};
1331 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1334 my $package = (caller)[0];
1336 $package_fields{$package} = \%fields;
1337 $constructor_fields{$package} = $constructor_ref;
1339 unless ($package->can('DESTROY')) {
1340 my $destroy_name = "${package}::DESTROY";
1343 # Use typeglob to give the anonymous subroutine the name we want
1344 *$destroy_name = sub {
1346 my $addr = main::objaddr($self);
1348 $self->$destroy_callback if $destroy_callback;
1349 foreach my $field (keys %{$package_fields{$package}}) {
1350 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1351 delete $package_fields{$package}{$field}{$addr};
1357 unless ($package->can('dump')) {
1358 my $dump_name = "${package}::dump";
1362 return dump_inside_out($self, $package_fields{$package}, @_);
1369 # Arrange for the input field to be garbage collected when no longer
1370 # needed. Also, creates standard accessor functions for the field
1371 # based on the optional parameters-- none if none of these parameters:
1372 # 'addable' creates an 'add_NAME()' accessor function.
1373 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1375 # 'settable' creates a 'set_NAME()' accessor function.
1376 # 'constructor' doesn't create an accessor function, but adds the
1377 # field to the hash that was previously passed to
1379 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1380 # 'add' etc. all mean 'addable'.
1381 # The read accessor function will work on both array and scalar
1382 # values. If another accessor in the parameter list is 'a', the read
1383 # access assumes an array. You can also force it to be array access
1384 # by specifying 'readable_array' instead of 'readable'
1386 # A sort-of 'protected' access can be set-up by preceding the addable,
1387 # readable or settable with some initial portion of 'protected_' (but,
1388 # the underscore is required), like 'p_a', 'pro_set', etc. The
1389 # "protection" is only by convention. All that happens is that the
1390 # accessor functions' names begin with an underscore. So instead of
1391 # calling set_foo, the call is _set_foo. (Real protection could be
1392 # accomplished by having a new subroutine, end_package called at the
1393 # end of each package, and then storing the __LINE__ ranges and
1394 # checking them on every accessor. But that is way overkill.)
1396 # We create anonymous subroutines as the accessors and then use
1397 # typeglobs to assign them to the proper package and name
1399 my $name = shift; # Name of the field
1400 my $field = shift; # Reference to the inside-out hash containing the
1403 my $package = (caller)[0];
1405 if (! exists $package_fields{$package}) {
1406 croak "$0: Must call 'setup_package' before 'set_access'";
1409 # Stash the field so DESTROY can get it.
1410 $package_fields{$package}{$name} = $field;
1412 # Remaining arguments are the accessors. For each...
1413 foreach my $access (@_) {
1414 my $access = lc $access;
1418 # Match the input as far as it goes.
1419 if ($access =~ /^(p[^_]*)_/) {
1421 if (substr('protected_', 0, length $protected)
1425 # Add 1 for the underscore not included in $protected
1426 $access = substr($access, length($protected) + 1);
1434 if (substr('addable', 0, length $access) eq $access) {
1435 my $subname = "${package}::${protected}add_$name";
1438 # add_ accessor. Don't add if already there, which we
1439 # determine using 'eq' for scalars and '==' otherwise.
1442 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1445 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1447 return if grep { $value == $_ }
1448 @{$field->{main::objaddr $self}};
1451 return if grep { $value eq $_ }
1452 @{$field->{main::objaddr $self}};
1454 push @{$field->{main::objaddr $self}}, $value;
1458 elsif (substr('constructor', 0, length $access) eq $access) {
1460 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1463 $constructor_fields{$package}{$name} = $field;
1466 elsif (substr('readable_array', 0, length $access) eq $access) {
1468 # Here has read access. If one of the other parameters for
1469 # access is array, or this one specifies array (by being more
1470 # than just 'readable_'), then create a subroutine that
1471 # assumes the data is an array. Otherwise just a scalar
1472 my $subname = "${package}::${protected}$name";
1473 if (grep { /^a/i } @_
1474 or length($access) > length('readable_'))
1479 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1480 my $addr = main::objaddr $_[0];
1481 if (ref $field->{$addr} ne 'ARRAY') {
1482 my $type = ref $field->{$addr};
1483 $type = 'scalar' unless $type;
1484 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1487 return scalar @{$field->{$addr}} unless wantarray;
1489 # Make a copy; had problems with caller modifying the
1490 # original otherwise
1491 my @return = @{$field->{$addr}};
1497 # Here not an array value, a simpler function.
1501 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1502 return $field->{main::objaddr $_[0]};
1506 elsif (substr('settable', 0, length $access) eq $access) {
1507 my $subname = "${package}::${protected}set_$name";
1512 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1513 Carp::carp_extra_args(\@_) if @_ > 2;
1515 # $self is $_[0]; $value is $_[1]
1516 $field->{main::objaddr $_[0]} = $_[1];
1521 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1530 # All input files use this object, which stores various attributes about them,
1531 # and provides for convenient, uniform handling. The run method wraps the
1532 # processing. It handles all the bookkeeping of opening, reading, and closing
1533 # the file, returning only significant input lines.
1535 # Each object gets a handler which processes the body of the file, and is
1536 # called by run(). Most should use the generic, default handler, which has
1537 # code scrubbed to handle things you might not expect. A handler should
1538 # basically be a while(next_line()) {...} loop.
1540 # You can also set up handlers to
1541 # 1) call before the first line is read for pre processing
1542 # 2) call to adjust each line of the input before the main handler gets them
1543 # 3) call upon EOF before the main handler exits its loop
1544 # 4) call at the end for post processing
1546 # $_ is used to store the input line, and is to be filtered by the
1547 # each_line_handler()s. So, if the format of the line is not in the desired
1548 # format for the main handler, these are used to do that adjusting. They can
1549 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1550 # so the $_ output of one is used as the input to the next. None of the other
1551 # handlers are stackable, but could easily be changed to be so.
1553 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1554 # which insert the parameters as lines to be processed before the next input
1555 # file line is read. This allows the EOF handler to flush buffers, for
1556 # example. The difference between the two routines is that the lines inserted
1557 # by insert_lines() are subjected to the each_line_handler()s. (So if you
1558 # called it from such a handler, you would get infinite recursion.) Lines
1559 # inserted by insert_adjusted_lines() go directly to the main handler without
1560 # any adjustments. If the post-processing handler calls any of these, there
1561 # will be no effect. Some error checking for these conditions could be added,
1562 # but it hasn't been done.
1564 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1565 # to prevent further processing of the line. This routine will output the
1566 # message as a warning once, and then keep a count of the lines that have the
1567 # same message, and output that count at the end of the file's processing.
1568 # This keeps the number of messages down to a manageable amount.
1570 # get_missings() should be called to retrieve any @missing input lines.
1571 # Messages will be raised if this isn't done if the options aren't to ignore
1574 sub trace { return main::trace(@_); }
1577 # Keep track of fields that are to be put into the constructor.
1578 my %constructor_fields;
1580 main::setup_package(Constructor_Fields => \%constructor_fields);
1582 my %file; # Input file name, required
1583 main::set_access('file', \%file, qw{ c r });
1585 my %first_released; # Unicode version file was first released in, required
1586 main::set_access('first_released', \%first_released, qw{ c r });
1588 my %handler; # Subroutine to process the input file, defaults to
1589 # 'process_generic_property_file'
1590 main::set_access('handler', \%handler, qw{ c });
1593 # name of property this file is for. defaults to none, meaning not
1594 # applicable, or is otherwise determinable, for example, from each line.
1595 main::set_access('property', \%property, qw{ c });
1598 # If this is true, the file is optional. If not present, no warning is
1599 # output. If it is present, the string given by this parameter is
1600 # evaluated, and if false the file is not processed.
1601 main::set_access('optional', \%optional, 'c', 'r');
1604 # This is used for debugging, to skip processing of all but a few input
1605 # files. Add 'non_skip => 1' to the constructor for those files you want
1606 # processed when you set the $debug_skip global.
1607 main::set_access('non_skip', \%non_skip, 'c');
1610 # This is used to skip processing of this input file semi-permanently.
1611 # It is used for files that we aren't planning to process anytime soon,
1612 # but want to allow to be in the directory and not raise a message that we
1613 # are not handling. Mostly for test files. This is in contrast to the
1614 # non_skip element, which is supposed to be used very temporarily for
1615 # debugging. Sets 'optional' to 1
1616 main::set_access('skip', \%skip, 'c');
1618 my %each_line_handler;
1619 # list of subroutines to look at and filter each non-comment line in the
1620 # file. defaults to none. The subroutines are called in order, each is
1621 # to adjust $_ for the next one, and the final one adjusts it for
1623 main::set_access('each_line_handler', \%each_line_handler, 'c');
1625 my %has_missings_defaults;
1626 # ? Are there lines in the file giving default values for code points
1627 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
1628 # the norm, but IGNORED means it has such lines, but the handler doesn't
1629 # use them. Having these three states allows us to catch changes to the
1630 # UCD that this program should track
1631 main::set_access('has_missings_defaults',
1632 \%has_missings_defaults, qw{ c r });
1635 # Subroutine to call before doing anything else in the file. If undef, no
1636 # such handler is called.
1637 main::set_access('pre_handler', \%pre_handler, qw{ c });
1640 # Subroutine to call upon getting an EOF on the input file, but before
1641 # that is returned to the main handler. This is to allow buffers to be
1642 # flushed. The handler is expected to call insert_lines() or
1643 # insert_adjusted() with the buffered material
1644 main::set_access('eof_handler', \%eof_handler, qw{ c r });
1647 # Subroutine to call after all the lines of the file are read in and
1648 # processed. If undef, no such handler is called.
1649 main::set_access('post_handler', \%post_handler, qw{ c });
1651 my %progress_message;
1652 # Message to print to display progress in lieu of the standard one
1653 main::set_access('progress_message', \%progress_message, qw{ c });
1656 # cache open file handle, internal. Is undef if file hasn't been
1657 # processed at all, empty if has;
1658 main::set_access('handle', \%handle);
1661 # cache of lines added virtually to the file, internal
1662 main::set_access('added_lines', \%added_lines);
1665 # cache of errors found, internal
1666 main::set_access('errors', \%errors);
1669 # storage of '@missing' defaults lines
1670 main::set_access('missings', \%missings);
1675 my $self = bless \do{ my $anonymous_scalar }, $class;
1676 my $addr = main::objaddr($self);
1679 $handler{$addr} = \&main::process_generic_property_file;
1680 $non_skip{$addr} = 0;
1682 $has_missings_defaults{$addr} = $NO_DEFAULTS;
1683 $handle{$addr} = undef;
1684 $added_lines{$addr} = [ ];
1685 $each_line_handler{$addr} = [ ];
1686 $errors{$addr} = { };
1687 $missings{$addr} = [ ];
1689 # Two positional parameters.
1690 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1691 $file{$addr} = main::internal_file_to_platform(shift);
1692 $first_released{$addr} = shift;
1694 # The rest of the arguments are key => value pairs
1695 # %constructor_fields has been set up earlier to list all possible
1696 # ones. Either set or push, depending on how the default has been set
1699 foreach my $key (keys %args) {
1700 my $argument = $args{$key};
1702 # Note that the fields are the lower case of the constructor keys
1703 my $hash = $constructor_fields{lc $key};
1704 if (! defined $hash) {
1705 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
1708 if (ref $hash->{$addr} eq 'ARRAY') {
1709 if (ref $argument eq 'ARRAY') {
1710 foreach my $argument (@{$argument}) {
1711 next if ! defined $argument;
1712 push @{$hash->{$addr}}, $argument;
1716 push @{$hash->{$addr}}, $argument if defined $argument;
1720 $hash->{$addr} = $argument;
1725 # If the file has a property for it, it means that the property is not
1726 # listed in the file's entries. So add a handler to the list of line
1727 # handlers to insert the property name into the lines, to provide a
1728 # uniform interface to the final processing subroutine.
1729 # the final code doesn't have to worry about that.
1730 if ($property{$addr}) {
1731 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
1734 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
1735 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
1738 $optional{$addr} = 1 if $skip{$addr};
1746 qw("") => "_operator_stringify",
1747 "." => \&main::_operator_dot,
1750 sub _operator_stringify {
1753 return __PACKAGE__ . " object for " . $self->file;
1756 # flag to make sure extracted files are processed early
1757 my $seen_non_extracted_non_age = 0;
1760 # Process the input object $self. This opens and closes the file and
1761 # calls all the handlers for it. Currently, this can only be called
1762 # once per file, as it destroy's the EOF handler
1765 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1767 my $addr = main::objaddr $self;
1769 my $file = $file{$addr};
1771 # Don't process if not expecting this file (because released later
1772 # than this Unicode version), and isn't there. This means if someone
1773 # copies it into an earlier version's directory, we will go ahead and
1775 return if $first_released{$addr} gt $v_version && ! -e $file;
1777 # If in debugging mode and this file doesn't have the non-skip
1778 # flag set, and isn't one of the critical files, skip it.
1780 && $first_released{$addr} ne v0
1781 && ! $non_skip{$addr})
1783 print "Skipping $file in debugging\n" if $verbosity;
1787 # File could be optional
1788 if ($optional{$addr}) {
1789 return unless -e $file;
1790 my $result = eval $optional{$addr};
1791 if (! defined $result) {
1792 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
1797 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
1803 if (! defined $file || ! -e $file) {
1805 # If the file doesn't exist, see if have internal data for it
1806 # (based on first_released being 0).
1807 if ($first_released{$addr} eq v0) {
1808 $handle{$addr} = 'pretend_is_open';
1811 if (! $optional{$addr} # File could be optional
1812 && $v_version ge $first_released{$addr})
1814 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
1821 # Here, the file exists. Some platforms may change the case of
1823 if ($seen_non_extracted_non_age) {
1824 if ($file =~ /$EXTRACTED/i) {
1825 Carp::my_carp_bug(join_lines(<<END
1826 $file should be processed just after the 'Prop...Alias' files, and before
1827 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
1828 have subtle problems
1833 elsif ($EXTRACTED_DIR
1834 && $first_released{$addr} ne v0
1835 && $file !~ /$EXTRACTED/i
1836 && lc($file) ne 'dage.txt')
1838 # We don't set this (by the 'if' above) if we have no
1839 # extracted directory, so if running on an early version,
1840 # this test won't work. Not worth worrying about.
1841 $seen_non_extracted_non_age = 1;
1844 # And mark the file as having being processed, and warn if it
1845 # isn't a file we are expecting. As we process the files,
1846 # they are deleted from the hash, so any that remain at the
1847 # end of the program are files that we didn't process.
1848 my $fkey = File::Spec->rel2abs($file);
1849 my $expecting = delete $potential_files{$fkey};
1850 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
1851 Carp::my_carp("Was not expecting '$file'.") if
1853 && ! defined $handle{$addr};
1855 # Having deleted from expected files, we can quit if not to do
1856 # anything. Don't print progress unless really want verbosity
1858 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
1862 # Open the file, converting the slashes used in this program
1863 # into the proper form for the OS
1865 if (not open $file_handle, "<", $file) {
1866 Carp::my_carp("Can't open $file. Skipping: $!");
1869 $handle{$addr} = $file_handle; # Cache the open file handle
1872 if ($verbosity >= $PROGRESS) {
1873 if ($progress_message{$addr}) {
1874 print "$progress_message{$addr}\n";
1877 # If using a virtual file, say so.
1878 print "Processing ", (-e $file)
1880 : "substitute $file",
1886 # Call any special handler for before the file.
1887 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
1889 # Then the main handler
1890 &{$handler{$addr}}($self);
1892 # Then any special post-file handler.
1893 &{$post_handler{$addr}}($self) if $post_handler{$addr};
1895 # If any errors have been accumulated, output the counts (as the first
1896 # error message in each class was output when it was encountered).
1897 if ($errors{$addr}) {
1900 foreach my $error (keys %{$errors{$addr}}) {
1901 $total += $errors{$addr}->{$error};
1902 delete $errors{$addr}->{$error};
1907 = "A total of $total lines had errors in $file. ";
1909 $message .= ($types == 1)
1910 ? '(Only the first one was displayed.)'
1911 : '(Only the first of each type was displayed.)';
1912 Carp::my_carp($message);
1916 if (@{$missings{$addr}}) {
1917 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
1920 # If a real file handle, close it.
1921 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
1923 $handle{$addr} = ""; # Uses empty to indicate that has already seen
1924 # the file, as opposed to undef
1929 # Sets $_ to be the next logical input line, if any. Returns non-zero
1930 # if such a line exists. 'logical' means that any lines that have
1931 # been added via insert_lines() will be returned in $_ before the file
1935 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1937 my $addr = main::objaddr $self;
1939 # Here the file is open (or if the handle is not a ref, is an open
1940 # 'virtual' file). Get the next line; any inserted lines get priority
1941 # over the file itself.
1945 while (1) { # Loop until find non-comment, non-empty line
1946 #local $to_trace = 1 if main::DEBUG;
1947 my $inserted_ref = shift @{$added_lines{$addr}};
1948 if (defined $inserted_ref) {
1949 ($adjusted, $_) = @{$inserted_ref};
1950 trace $adjusted, $_ if main::DEBUG && $to_trace;
1951 return 1 if $adjusted;
1954 last if ! ref $handle{$addr}; # Don't read unless is real file
1955 last if ! defined ($_ = readline $handle{$addr});
1958 trace $_ if main::DEBUG && $to_trace;
1960 # See if this line is the comment line that defines what property
1961 # value that code points that are not listed in the file should
1962 # have. The format or existence of these lines is not guaranteed
1963 # by Unicode since they are comments, but the documentation says
1964 # that this was added for machine-readability, so probably won't
1965 # change. This works starting in Unicode Version 5.0. They look
1968 # @missing: 0000..10FFFF; Not_Reordered
1969 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
1970 # @missing: 0000..10FFFF; ; NaN
1972 # Save the line for a later get_missings() call.
1973 if (/$missing_defaults_prefix/) {
1974 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
1975 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
1977 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
1978 my @defaults = split /\s* ; \s*/x, $_;
1980 # The first field is the @missing, which ends in a
1981 # semi-colon, so can safely shift.
1984 # Some of these lines may have empty field placeholders
1985 # which get in the way. An example is:
1986 # @missing: 0000..10FFFF; ; NaN
1987 # Remove them. Process starting from the top so the
1988 # splice doesn't affect things still to be looked at.
1989 for (my $i = @defaults - 1; $i >= 0; $i--) {
1990 next if $defaults[$i] ne "";
1991 splice @defaults, $i, 1;
1994 # What's left should be just the property (maybe) and the
1995 # default. Having only one element means it doesn't have
1999 if (@defaults >= 1) {
2000 if (@defaults == 1) {
2001 $default = $defaults[0];
2004 $property = $defaults[0];
2005 $default = $defaults[1];
2011 || ($default =~ /^</
2012 && $default !~ /^<code *point>$/i
2013 && $default !~ /^<none>$/i))
2015 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2019 # If the property is missing from the line, it should
2020 # be the one for the whole file
2021 $property = $property{$addr} if ! defined $property;
2023 # Change <none> to the null string, which is what it
2024 # really means. If the default is the code point
2025 # itself, set it to <code point>, which is what
2026 # Unicode uses (but sometimes they've forgotten the
2028 if ($default =~ /^<none>$/i) {
2031 elsif ($default =~ /^<code *point>$/i) {
2032 $default = $CODE_POINT;
2035 # Store them as a sub-arrays with both components.
2036 push @{$missings{$addr}}, [ $default, $property ];
2040 # There is nothing for the caller to process on this comment
2045 # Remove comments and trailing space, and skip this line if the
2051 # Call any handlers for this line, and skip further processing of
2052 # the line if the handler sets the line to null.
2053 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2058 # Here the line is ok. return success.
2060 } # End of looping through lines.
2062 # If there is an EOF handler, call it (only once) and if it generates
2063 # more lines to process go back in the loop to handle them.
2064 if ($eof_handler{$addr}) {
2065 &{$eof_handler{$addr}}($self);
2066 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2067 goto LINE if $added_lines{$addr};
2070 # Return failure -- no more lines.
2075 # Not currently used, not fully tested.
2077 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2078 # # record. Not callable from an each_line_handler(), nor does it call
2079 # # an each_line_handler() on the line.
2082 # my $addr = main::objaddr $self;
2084 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
2085 # my ($adjusted, $line) = @{$inserted_ref};
2086 # next if $adjusted;
2088 # # Remove comments and trailing space, and return a non-empty
2091 # $line =~ s/\s+$//;
2092 # return $line if $line ne "";
2095 # return if ! ref $handle{$addr}; # Don't read unless is real file
2096 # while (1) { # Loop until find non-comment, non-empty line
2097 # local $to_trace = 1 if main::DEBUG;
2098 # trace $_ if main::DEBUG && $to_trace;
2099 # return if ! defined (my $line = readline $handle{$addr});
2101 # push @{$added_lines{$addr}}, [ 0, $line ];
2104 # $line =~ s/\s+$//;
2105 # return $line if $line ne "";
2113 # Lines can be inserted so that it looks like they were in the input
2114 # file at the place it was when this routine is called. See also
2115 # insert_adjusted_lines(). Lines inserted via this routine go through
2116 # any each_line_handler()
2120 # Each inserted line is an array, with the first element being 0 to
2121 # indicate that this line hasn't been adjusted, and needs to be
2123 push @{$added_lines{main::objaddr $self}}, map { [ 0, $_ ] } @_;
2127 sub insert_adjusted_lines {
2128 # Lines can be inserted so that it looks like they were in the input
2129 # file at the place it was when this routine is called. See also
2130 # insert_lines(). Lines inserted via this routine are already fully
2131 # adjusted, ready to be processed; each_line_handler()s handlers will
2132 # not be called. This means this is not a completely general
2133 # facility, as only the last each_line_handler on the stack should
2134 # call this. It could be made more general, by passing to each of the
2135 # line_handlers their position on the stack, which they would pass on
2136 # to this routine, and that would replace the boolean first element in
2137 # the anonymous array pushed here, so that the next_line routine could
2138 # use that to call only those handlers whose index is after it on the
2139 # stack. But this is overkill for what is needed now.
2142 trace $_[0] if main::DEBUG && $to_trace;
2144 # Each inserted line is an array, with the first element being 1 to
2145 # indicate that this line has been adjusted
2146 push @{$added_lines{main::objaddr $self}}, map { [ 1, $_ ] } @_;
2151 # Returns the stored up @missings lines' values, and clears the list.
2152 # The values are in an array, consisting of the default in the first
2153 # element, and the property in the 2nd. However, since these lines
2154 # can be stacked up, the return is an array of all these arrays.
2157 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2159 my $addr = main::objaddr $self;
2161 # If not accepting a list return, just return the first one.
2162 return shift @{$missings{$addr}} unless wantarray;
2164 my @return = @{$missings{$addr}};
2165 undef @{$missings{$addr}};
2169 sub _insert_property_into_line {
2170 # Add a property field to $_, if this file requires it.
2172 my $property = $property{main::objaddr shift};
2173 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2175 $_ =~ s/(;|$)/; $property$1/;
2180 # Output consistent error messages, using either a generic one, or the
2181 # one given by the optional parameter. To avoid gazillions of the
2182 # same message in case the syntax of a file is way off, this routine
2183 # only outputs the first instance of each message, incrementing a
2184 # count so the totals can be output at the end of the file.
2187 my $message = shift;
2188 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2190 my $addr = main::objaddr $self;
2192 $message = 'Unexpected line' unless $message;
2194 # No trailing punctuation so as to fit with our addenda.
2195 $message =~ s/[.:;,]$//;
2197 # If haven't seen this exact message before, output it now. Otherwise
2198 # increment the count of how many times it has occurred
2199 unless ($errors{$addr}->{$message}) {
2200 Carp::my_carp("$message in '$_' in "
2201 . $file{main::objaddr $self}
2202 . " at line $.. Skipping this line;");
2203 $errors{$addr}->{$message} = 1;
2206 $errors{$addr}->{$message}++;
2209 # Clear the line to prevent any further (meaningful) processing of it.
2216 package Multi_Default;
2218 # Certain properties in early versions of Unicode had more than one possible
2219 # default for code points missing from the files. In these cases, one
2220 # default applies to everything left over after all the others are applied,
2221 # and for each of the others, there is a description of which class of code
2222 # points applies to it. This object helps implement this by storing the
2223 # defaults, and for all but that final default, an eval string that generates
2224 # the class that it applies to.
2229 main::setup_package();
2232 # The defaults structure for the classes
2233 main::set_access('class_defaults', \%class_defaults);
2236 # The default that applies to everything left over.
2237 main::set_access('other_default', \%other_default, 'r');
2241 # The constructor is called with default => eval pairs, terminated by
2242 # the left-over default. e.g.
2243 # Multi_Default->new(
2244 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2246 # 'R' => 'some other expression that evaluates to code points',
2254 my $self = bless \do{my $anonymous_scalar}, $class;
2255 my $addr = main::objaddr($self);
2258 my $default = shift;
2260 $class_defaults{$addr}->{$default} = $eval;
2263 $other_default{$addr} = shift;
2268 sub get_next_defaults {
2269 # Iterates and returns the next class of defaults.
2271 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2273 my $addr = main::objaddr $self;
2275 return each %{$class_defaults{$addr}};
2281 # An alias is one of the names that a table goes by. This class defines them
2282 # including some attributes. Everything is currently setup in the
2288 main::setup_package();
2291 main::set_access('name', \%name, 'r');
2294 # Determined by the constructor code if this name should match loosely or
2295 # not. The constructor parameters can override this, but it isn't fully
2296 # implemented, as should have ability to override Unicode one's via
2297 # something like a set_loose_match()
2298 main::set_access('loose_match', \%loose_match, 'r');
2301 # Some aliases should not get their own entries because they are covered
2302 # by a wild-card, and some we want to discourage use of. Binary
2303 main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2306 # Aliases have a status, like deprecated, or even suppressed (which means
2307 # they don't appear in documentation). Enum
2308 main::set_access('status', \%status, 'r');
2311 # Similarly, some aliases should not be considered as usable ones for
2312 # external use, such as file names, or we don't want documentation to
2313 # recommend them. Boolean
2314 main::set_access('externally_ok', \%externally_ok, 'r');
2319 my $self = bless \do { my $anonymous_scalar }, $class;
2320 my $addr = main::objaddr($self);
2322 $name{$addr} = shift;
2323 $loose_match{$addr} = shift;
2324 $make_pod_entry{$addr} = shift;
2325 $externally_ok{$addr} = shift;
2326 $status{$addr} = shift;
2328 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2330 # Null names are never ok externally
2331 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2339 # A range is the basic unit for storing code points, and is described in the
2340 # comments at the beginning of the program. Each range has a starting code
2341 # point; an ending code point (not less than the starting one); a value
2342 # that applies to every code point in between the two end-points, inclusive;
2343 # and an enum type that applies to the value. The type is for the user's
2344 # convenience, and has no meaning here, except that a non-zero type is
2345 # considered to not obey the normal Unicode rules for having standard forms.
2347 # The same structure is used for both map and match tables, even though in the
2348 # latter, the value (and hence type) is irrelevant and could be used as a
2349 # comment. In map tables, the value is what all the code points in the range
2350 # map to. Type 0 values have the standardized version of the value stored as
2351 # well, so as to not have to recalculate it a lot.
2353 sub trace { return main::trace(@_); }
2357 main::setup_package();
2360 main::set_access('start', \%start, 'r', 's');
2363 main::set_access('end', \%end, 'r', 's');
2366 main::set_access('value', \%value, 'r');
2369 main::set_access('type', \%type, 'r');
2372 # The value in internal standard form. Defined only if the type is 0.
2373 main::set_access('standard_form', \%standard_form);
2375 # Note that if these fields change, the dump() method should as well
2378 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2381 my $self = bless \do { my $anonymous_scalar }, $class;
2382 my $addr = main::objaddr($self);
2384 $start{$addr} = shift;
2385 $end{$addr} = shift;
2389 my $value = delete $args{'Value'}; # Can be 0
2390 $value = "" unless defined $value;
2391 $value{$addr} = $value;
2393 $type{$addr} = delete $args{'Type'} || 0;
2395 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2397 if (! $type{$addr}) {
2398 $standard_form{$addr} = main::standardize($value);
2406 qw("") => "_operator_stringify",
2407 "." => \&main::_operator_dot,
2410 sub _operator_stringify {
2412 my $addr = main::objaddr $self;
2414 # Output it like '0041..0065 (value)'
2415 my $return = sprintf("%04X", $start{$addr})
2417 . sprintf("%04X", $end{$addr});
2418 my $value = $value{$addr};
2419 my $type = $type{$addr};
2421 $return .= "$value";
2422 $return .= ", Type=$type" if $type != 0;
2429 # The standard form is the value itself if the standard form is
2430 # undefined (that is if the value is special)
2433 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2435 my $addr = main::objaddr $self;
2437 return $standard_form{$addr} if defined $standard_form{$addr};
2438 return $value{$addr};
2442 # Human, not machine readable. For machine readable, comment out this
2443 # entire routine and let the standard one take effect.
2446 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2448 my $addr = main::objaddr $self;
2450 my $return = $indent
2451 . sprintf("%04X", $start{$addr})
2453 . sprintf("%04X", $end{$addr})
2454 . " '$value{$addr}';";
2455 if (! defined $standard_form{$addr}) {
2456 $return .= "(type=$type{$addr})";
2458 elsif ($standard_form{$addr} ne $value{$addr}) {
2459 $return .= "(standard '$standard_form{$addr}')";
2465 package _Range_List_Base;
2467 # Base class for range lists. A range list is simply an ordered list of
2468 # ranges, so that the ranges with the lowest starting numbers are first in it.
2470 # When a new range is added that is adjacent to an existing range that has the
2471 # same value and type, it merges with it to form a larger range.
2473 # Ranges generally do not overlap, except that there can be multiple entries
2474 # of single code point ranges. This is because of NameAliases.txt.
2476 # In this program, there is a standard value such that if two different
2477 # values, have the same standard value, they are considered equivalent. This
2478 # value was chosen so that it gives correct results on Unicode data
2480 # There are a number of methods to manipulate range lists, and some operators
2481 # are overloaded to handle them.
2483 # Because of the slowness of pure Perl objaddr() on miniperl, and measurements
2484 # showing this package was using a lot of real time calculating that, the code
2485 # was changed to only calculate it once per call stack. This is done by
2486 # consistently using the package variable $addr in routines, and only calling
2487 # objaddr() if it isn't defined, and setting that to be local, so that callees
2488 # will have it already. It would be a good thing to change this. XXX
2490 sub trace { return main::trace(@_); }
2496 main::setup_package();
2499 # The list of ranges
2500 main::set_access('ranges', \%ranges, 'readable_array');
2503 # The highest code point in the list. This was originally a method, but
2504 # actual measurements said it was used a lot.
2505 main::set_access('max', \%max, 'r');
2507 my %each_range_iterator;
2508 # Iterator position for each_range()
2509 main::set_access('each_range_iterator', \%each_range_iterator);
2512 # Name of parent this is attached to, if any. Solely for better error
2514 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2516 my %_search_ranges_cache;
2517 # A cache of the previous result from _search_ranges(), for better
2519 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2525 # Optional initialization data for the range list.
2526 my $initialize = delete $args{'Initialize'};
2530 # Use _union() to initialize. _union() returns an object of this
2531 # class, which means that it will call this constructor recursively.
2532 # But it won't have this $initialize parameter so that it won't
2533 # infinitely loop on this.
2534 return _union($class, $initialize, %args) if defined $initialize;
2536 $self = bless \do { my $anonymous_scalar }, $class;
2537 local $addr = main::objaddr($self);
2539 # Optional parent object, only for debug info.
2540 $owner_name_of{$addr} = delete $args{'Owner'};
2541 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2543 # Stringify, in case it is an object.
2544 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2546 # This is used only for error messages, and so a colon is added
2547 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2549 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2551 # Max is initialized to a negative value that isn't adjacent to 0,
2555 $_search_ranges_cache{$addr} = 0;
2556 $ranges{$addr} = [];
2563 qw("") => "_operator_stringify",
2564 "." => \&main::_operator_dot,
2567 sub _operator_stringify {
2569 local $addr = main::objaddr($self) if !defined $addr;
2571 return "Range_List attached to '$owner_name_of{$addr}'"
2572 if $owner_name_of{$addr};
2573 return "anonymous Range_List " . \$self;
2577 # Returns the union of the input code points. It can be called as
2578 # either a constructor or a method. If called as a method, the result
2579 # will be a new() instance of the calling object, containing the union
2580 # of that object with the other parameter's code points; if called as
2581 # a constructor, the first parameter gives the class the new object
2582 # should be, and the second parameter gives the code points to go into
2584 # In either case, there are two parameters looked at by this routine;
2585 # any additional parameters are passed to the new() constructor.
2587 # The code points can come in the form of some object that contains
2588 # ranges, and has a conventionally named method to access them; or
2589 # they can be an array of individual code points (as integers); or
2590 # just a single code point.
2592 # If they are ranges, this routine doesn't make any effort to preserve
2593 # the range values of one input over the other. Therefore this base
2594 # class should not allow _union to be called from other than
2595 # initialization code, so as to prevent two tables from being added
2596 # together where the range values matter. The general form of this
2597 # routine therefore belongs in a derived class, but it was moved here
2598 # to avoid duplication of code. The failure to overload this in this
2599 # class keeps it safe.
2603 my @args; # Arguments to pass to the constructor
2607 # If a method call, will start the union with the object itself, and
2608 # the class of the new object will be the same as self.
2615 # Add the other required parameter.
2617 # Rest of parameters are passed on to the constructor
2619 # Accumulate all records from both lists.
2621 for my $arg (@args) {
2622 #local $to_trace = 0 if main::DEBUG;
2623 trace "argument = $arg" if main::DEBUG && $to_trace;
2624 if (! defined $arg) {
2626 if (defined $self) {
2627 $message .= $owner_name_of{main::objaddr $self};
2629 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
2632 $arg = [ $arg ] if ! ref $arg;
2633 my $type = ref $arg;
2634 if ($type eq 'ARRAY') {
2635 foreach my $element (@$arg) {
2636 push @records, Range->new($element, $element);
2639 elsif ($arg->isa('Range')) {
2640 push @records, $arg;
2642 elsif ($arg->can('ranges')) {
2643 push @records, $arg->ranges;
2647 if (defined $self) {
2648 $message .= $owner_name_of{main::objaddr $self};
2650 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
2655 # Sort with the range containing the lowest ordinal first, but if
2656 # two ranges start at the same code point, sort with the bigger range
2657 # of the two first, because it takes fewer cycles.
2658 @records = sort { ($a->start <=> $b->start)
2660 # if b is shorter than a, b->end will be
2661 # less than a->end, and we want to select
2662 # a, so want to return -1
2663 ($b->end <=> $a->end)
2666 my $new = $class->new(@_);
2668 # Fold in records so long as they add new information.
2669 for my $set (@records) {
2670 my $start = $set->start;
2671 my $end = $set->end;
2672 my $value = $set->value;
2673 if ($start > $new->max) {
2674 $new->_add_delete('+', $start, $end, $value);
2676 elsif ($end > $new->max) {
2677 $new->_add_delete('+', $new->max +1, $end, $value);
2684 sub range_count { # Return the number of ranges in the range list
2686 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2688 local $addr = main::objaddr($self) if ! defined $addr;
2690 return scalar @{$ranges{$addr}};
2694 # Returns the minimum code point currently in the range list, or if
2695 # the range list is empty, 2 beyond the max possible. This is a
2696 # method because used so rarely, that not worth saving between calls,
2697 # and having to worry about changing it as ranges are added and
2701 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2703 local $addr = main::objaddr($self) if ! defined $addr;
2705 # If the range list is empty, return a large value that isn't adjacent
2706 # to any that could be in the range list, for simpler tests
2707 return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2708 return $ranges{$addr}->[0]->start;
2712 # Boolean: Is argument in the range list? If so returns $i such that:
2713 # range[$i]->end < $codepoint <= range[$i+1]->end
2714 # which is one beyond what you want; this is so that the 0th range
2715 # doesn't return false
2717 my $codepoint = shift;
2718 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2720 local $addr = main::objaddr $self if ! defined $addr;
2722 my $i = $self->_search_ranges($codepoint);
2723 return 0 unless defined $i;
2725 # The search returns $i, such that
2726 # range[$i-1]->end < $codepoint <= range[$i]->end
2727 # So is in the table if and only iff it is at least the start position
2729 return 0 if $ranges{$addr}->[$i]->start > $codepoint;
2734 # Returns the value associated with the code point, undef if none
2737 my $codepoint = shift;
2738 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2740 local $addr = main::objaddr $self if ! defined $addr;
2742 my $i = $self->contains($codepoint);
2745 # contains() returns 1 beyond where we should look
2746 return $ranges{$addr}->[$i-1]->value;
2749 sub _search_ranges {
2750 # Find the range in the list which contains a code point, or where it
2751 # should go if were to add it. That is, it returns $i, such that:
2752 # range[$i-1]->end < $codepoint <= range[$i]->end
2753 # Returns undef if no such $i is possible (e.g. at end of table), or
2754 # if there is an error.
2757 my $code_point = shift;
2758 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2760 local $addr = main::objaddr $self if ! defined $addr;
2762 return if $code_point > $max{$addr};
2763 my $r = $ranges{$addr}; # The current list of ranges
2764 my $range_list_size = scalar @$r;
2767 use integer; # want integer division
2769 # Use the cached result as the starting guess for this one, because,
2770 # an experiment on 5.1 showed that 90% of the time the cache was the
2771 # same as the result on the next call (and 7% it was one less).
2772 $i = $_search_ranges_cache{$addr};
2773 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
2774 # from an intervening deletion
2775 #local $to_trace = 1 if main::DEBUG;
2776 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
2777 return $i if $code_point <= $r->[$i]->end
2778 && ($i == 0 || $r->[$i-1]->end < $code_point);
2780 # Here the cache doesn't yield the correct $i. Try adding 1.
2781 if ($i < $range_list_size - 1
2782 && $r->[$i]->end < $code_point &&
2783 $code_point <= $r->[$i+1]->end)
2786 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
2787 $_search_ranges_cache{$addr} = $i;
2791 # Here, adding 1 also didn't work. We do a binary search to
2792 # find the correct position, starting with current $i
2794 my $upper = $range_list_size - 1;
2796 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
2798 if ($code_point <= $r->[$i]->end) {
2800 # Here we have met the upper constraint. We can quit if we
2801 # also meet the lower one.
2802 last if $i == 0 || $r->[$i-1]->end < $code_point;
2804 $upper = $i; # Still too high.
2809 # Here, $r[$i]->end < $code_point, so look higher up.
2813 # Split search domain in half to try again.
2814 my $temp = ($upper + $lower) / 2;
2816 # No point in continuing unless $i changes for next time
2820 # We can't reach the highest element because of the averaging.
2821 # So if one below the upper edge, force it there and try one
2823 if ($i == $range_list_size - 2) {
2825 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
2826 $i = $range_list_size - 1;
2828 # Change $lower as well so if fails next time through,
2829 # taking the average will yield the same $i, and we will
2830 # quit with the error message just below.
2834 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
2838 } # End of while loop
2840 if (main::DEBUG && $to_trace) {
2841 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
2842 trace "i= [ $i ]", $r->[$i];
2843 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
2846 # Here we have found the offset. Cache it as a starting point for the
2848 $_search_ranges_cache{$addr} = $i;
2853 # Add, replace or delete ranges to or from a list. The $type
2854 # parameter gives which:
2855 # '+' => insert or replace a range, returning a list of any changed
2857 # '-' => delete a range, returning a list of any deleted ranges.
2859 # The next three parameters give respectively the start, end, and
2860 # value associated with the range. 'value' should be null unless the
2863 # The range list is kept sorted so that the range with the lowest
2864 # starting position is first in the list, and generally, adjacent
2865 # ranges with the same values are merged into single larger one (see
2866 # exceptions below).
2868 # There are more parameters, all are key => value pairs:
2869 # Type gives the type of the value. It is only valid for '+'.
2870 # All ranges have types; if this parameter is omitted, 0 is
2871 # assumed. Ranges with type 0 are assumed to obey the
2872 # Unicode rules for casing, etc; ranges with other types are
2873 # not. Otherwise, the type is arbitrary, for the caller's
2874 # convenience, and looked at only by this routine to keep
2875 # adjacent ranges of different types from being merged into
2876 # a single larger range, and when Replace =>
2877 # $IF_NOT_EQUIVALENT is specified (see just below).
2878 # Replace determines what to do if the range list already contains
2879 # ranges which coincide with all or portions of the input
2880 # range. It is only valid for '+':
2881 # => $NO means that the new value is not to replace
2882 # any existing ones, but any empty gaps of the
2883 # range list coinciding with the input range
2884 # will be filled in with the new value.
2885 # => $UNCONDITIONALLY means to replace the existing values with
2886 # this one unconditionally. However, if the
2887 # new and old values are identical, the
2888 # replacement is skipped to save cycles
2889 # => $IF_NOT_EQUIVALENT means to replace the existing values
2890 # with this one if they are not equivalent.
2891 # Ranges are equivalent if their types are the
2892 # same, and they are the same string, or if
2893 # both are type 0 ranges, if their Unicode
2894 # standard forms are identical. In this last
2895 # case, the routine chooses the more "modern"
2896 # one to use. This is because some of the
2897 # older files are formatted with values that
2898 # are, for example, ALL CAPs, whereas the
2899 # derived files have a more modern style,
2900 # which looks better. By looking for this
2901 # style when the pre-existing and replacement
2902 # standard forms are the same, we can move to
2904 # => $MULTIPLE means that if this range duplicates an
2905 # existing one, but has a different value,
2906 # don't replace the existing one, but insert
2907 # this, one so that the same range can occur
2909 # => anything else is the same as => $IF_NOT_EQUIVALENT
2911 # "same value" means identical for type-0 ranges, and it means having
2912 # the same standard forms for non-type-0 ranges.
2914 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
2917 my $operation = shift; # '+' for add/replace; '-' for delete;
2924 $value = "" if not defined $value; # warning: $value can be "0"
2926 my $replace = delete $args{'Replace'};
2927 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
2929 my $type = delete $args{'Type'};
2930 $type = 0 unless defined $type;
2932 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2934 local $addr = main::objaddr($self) if ! defined $addr;
2936 if ($operation ne '+' && $operation ne '-') {
2937 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
2940 unless (defined $start && defined $end) {
2941 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
2944 unless ($end >= $start) {
2945 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
2948 #local $to_trace = 1 if main::DEBUG;
2950 if ($operation eq '-') {
2951 if ($replace != $IF_NOT_EQUIVALENT) {
2952 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
2953 $replace = $IF_NOT_EQUIVALENT;
2956 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
2960 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
2965 my $r = $ranges{$addr}; # The current list of ranges
2966 my $range_list_size = scalar @$r; # And its size
2967 my $max = $max{$addr}; # The current high code point in
2968 # the list of ranges
2970 # Do a special case requiring fewer machine cycles when the new range
2971 # starts after the current highest point. The Unicode input data is
2972 # structured so this is common.
2973 if ($start > $max) {
2975 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
2976 return if $operation eq '-'; # Deleting a non-existing range is a
2979 # If the new range doesn't logically extend the current final one
2980 # in the range list, create a new range at the end of the range
2981 # list. (max cleverly is initialized to a negative number not
2982 # adjacent to 0 if the range list is empty, so even adding a range
2983 # to an empty range list starting at 0 will have this 'if'
2985 if ($start > $max + 1 # non-adjacent means can't extend.
2986 || @{$r}[-1]->value ne $value # values differ, can't extend.
2987 || @{$r}[-1]->type != $type # types differ, can't extend.
2989 push @$r, Range->new($start, $end,
2995 # Here, the new range starts just after the current highest in
2996 # the range list, and they have the same type and value.
2997 # Extend the current range to incorporate the new one.
2998 @{$r}[-1]->set_end($end);
3001 # This becomes the new maximum.
3006 #local $to_trace = 0 if main::DEBUG;
3008 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3010 # Here, the input range isn't after the whole rest of the range list.
3011 # Most likely 'splice' will be needed. The rest of the routine finds
3012 # the needed splice parameters, and if necessary, does the splice.
3013 # First, find the offset parameter needed by the splice function for
3014 # the input range. Note that the input range may span multiple
3015 # existing ones, but we'll worry about that later. For now, just find
3016 # the beginning. If the input range is to be inserted starting in a
3017 # position not currently in the range list, it must (obviously) come
3018 # just after the range below it, and just before the range above it.
3019 # Slightly less obviously, it will occupy the position currently
3020 # occupied by the range that is to come after it. More formally, we
3021 # are looking for the position, $i, in the array of ranges, such that:
3023 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3025 # (The ordered relationships within existing ranges are also shown in
3026 # the equation above). However, if the start of the input range is
3027 # within an existing range, the splice offset should point to that
3028 # existing range's position in the list; that is $i satisfies a
3029 # somewhat different equation, namely:
3031 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3033 # More briefly, $start can come before or after r[$i]->start, and at
3034 # this point, we don't know which it will be. However, these
3035 # two equations share these constraints:
3037 # r[$i-1]->end < $start <= r[$i]->end
3039 # And that is good enough to find $i.
3041 my $i = $self->_search_ranges($start);
3043 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3047 # The search function returns $i such that:
3049 # r[$i-1]->end < $start <= r[$i]->end
3051 # That means that $i points to the first range in the range list
3052 # that could possibly be affected by this operation. We still don't
3053 # know if the start of the input range is within r[$i], or if it
3054 # points to empty space between r[$i-1] and r[$i].
3055 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3057 # Special case the insertion of data that is not to replace any
3059 if ($replace == $NO) { # If $NO, has to be operation '+'
3060 #local $to_trace = 1 if main::DEBUG;
3061 trace "Doesn't replace" if main::DEBUG && $to_trace;
3063 # Here, the new range is to take effect only on those code points
3064 # that aren't already in an existing range. This can be done by
3065 # looking through the existing range list and finding the gaps in
3066 # the ranges that this new range affects, and then calling this
3067 # function recursively on each of those gaps, leaving untouched
3068 # anything already in the list. Gather up a list of the changed
3069 # gaps first so that changes to the internal state as new ranges
3070 # are added won't be a problem.
3073 # First, if the starting point of the input range is outside an
3074 # existing one, there is a gap from there to the beginning of the
3075 # existing range -- add a span to fill the part that this new
3077 if ($start < $r->[$i]->start) {
3078 push @gap_list, Range->new($start,
3080 $r->[$i]->start - 1),
3082 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3085 # Then look through the range list for other gaps until we reach
3086 # the highest range affected by the input one.
3088 for ($j = $i+1; $j < $range_list_size; $j++) {
3089 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3090 last if $end < $r->[$j]->start;
3092 # If there is a gap between when this range starts and the
3093 # previous one ends, add a span to fill it. Note that just
3094 # because there are two ranges doesn't mean there is a
3095 # non-zero gap between them. It could be that they have
3096 # different values or types
3097 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3099 Range->new($r->[$j-1]->end + 1,
3100 $r->[$j]->start - 1,
3102 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3106 # Here, we have either found an existing range in the range list,
3107 # beyond the area affected by the input one, or we fell off the
3108 # end of the loop because the input range affects the whole rest
3109 # of the range list. In either case, $j is 1 higher than the
3110 # highest affected range. If $j == $i, it means that there are no
3111 # affected ranges, that the entire insertion is in the gap between
3112 # r[$i-1], and r[$i], which we already have taken care of before
3114 # On the other hand, if there are affected ranges, it might be
3115 # that there is a gap that needs filling after the final such
3116 # range to the end of the input range
3117 if ($r->[$j-1]->end < $end) {
3118 push @gap_list, Range->new(main::max($start,
3119 $r->[$j-1]->end + 1),
3122 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3125 # Call recursively to fill in all the gaps.
3126 foreach my $gap (@gap_list) {
3127 $self->_add_delete($operation,
3137 # Here, we have taken care of the case where $replace is $NO, which
3138 # means that whatever action we now take is done unconditionally. It
3139 # still could be that this call will result in a no-op, if duplicates
3140 # aren't allowed, and we are inserting a range that merely duplicates
3141 # data already in the range list; or also if deleting a non-existent
3143 # $i still points to the first potential affected range. Now find the
3144 # highest range affected, which will determine the length parameter to
3145 # splice. (The input range can span multiple existing ones.) While
3146 # we are looking through the range list, see also if this is an
3147 # insertion that will change the values of at least one of the
3148 # affected ranges. We don't need to do this check unless this is an
3149 # insertion of non-multiples, and also since this is a boolean, we
3150 # don't need to do it if have already determined that it will make a
3151 # change; just unconditionally change them. $cdm is created to be 1
3152 # if either of these is true. (The 'c' in the name comes from below)
3153 my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
3154 my $j; # This will point to the highest affected range
3156 # For non-zero types, the standard form is the value itself;
3157 my $standard_form = ($type) ? $value : main::standardize($value);
3159 for ($j = $i; $j < $range_list_size; $j++) {
3160 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3162 # If find a range that it doesn't overlap into, we can stop
3164 last if $end < $r->[$j]->start;
3166 # Here, overlaps the range at $j. If the value's don't match,
3167 # and this is supposedly an insertion, it becomes a change
3168 # instead. This is what the 'c' stands for in $cdm.
3170 if ($r->[$j]->standard_form ne $standard_form) {
3175 # Here, the two values are essentially the same. If the
3176 # two are actually identical, replacing wouldn't change
3177 # anything so skip it.
3178 my $pre_existing = $r->[$j]->value;
3179 if ($pre_existing ne $value) {
3181 # Here the new and old standardized values are the
3182 # same, but the non-standardized values aren't. If
3183 # replacing unconditionally, then replace
3184 if( $replace == $UNCONDITIONALLY) {
3189 # Here, are replacing conditionally. Decide to
3190 # replace or not based on which appears to look
3191 # the "nicest". If one is mixed case and the
3192 # other isn't, choose the mixed case one.
3193 my $new_mixed = $value =~ /[A-Z]/
3194 && $value =~ /[a-z]/;
3195 my $old_mixed = $pre_existing =~ /[A-Z]/
3196 && $pre_existing =~ /[a-z]/;
3198 if ($old_mixed != $new_mixed) {
3199 $cdm = 1 if $new_mixed;
3200 if (main::DEBUG && $to_trace) {
3202 trace "Replacing $pre_existing with $value";
3205 trace "Retaining $pre_existing over $value";
3211 # Here casing wasn't different between the two.
3212 # If one has hyphens or underscores and the
3213 # other doesn't, choose the one with the
3215 my $new_punct = $value =~ /[-_]/;
3216 my $old_punct = $pre_existing =~ /[-_]/;
3218 if ($old_punct != $new_punct) {
3219 $cdm = 1 if $new_punct;
3220 if (main::DEBUG && $to_trace) {
3222 trace "Replacing $pre_existing with $value";
3225 trace "Retaining $pre_existing over $value";
3228 } # else existing one is just as "good";
3229 # retain it to save cycles.
3235 } # End of loop looking for highest affected range.
3237 # Here, $j points to one beyond the highest range that this insertion
3238 # affects (hence to beyond the range list if that range is the final
3239 # one in the range list).
3241 # The splice length is all the affected ranges. Get it before
3242 # subtracting, for efficiency, so we don't have to later add 1.
3243 my $length = $j - $i;
3245 $j--; # $j now points to the highest affected range.
3246 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3248 # If inserting a multiple record, this is where it goes, after all the
3249 # existing ones for this range. This implies an insertion, and no
3250 # change to any existing ranges. Note that $j can be -1 if this new
3251 # range doesn't actually duplicate any existing, and comes at the
3252 # beginning of the list, in which case we can handle it like any other
3253 # insertion, and is easier to do so.
3254 if ($replace == $MULTIPLE && $j >= 0) {
3256 # This restriction could be remedied with a little extra work, but
3257 # it won't hopefully ever be necessary
3258 if ($r->[$j]->start != $r->[$j]->end) {
3259 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point. No action taken.");
3263 # Don't add an exact duplicate, as it isn't really a multiple
3264 return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
3266 trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
3267 my @return = splice @$r,
3274 if (main::DEBUG && $to_trace) {
3275 trace "After splice:";
3276 trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
3277 trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
3278 trace "j =[", $j, "]", $r->[$j] if $j >= 0;
3279 trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
3280 trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
3281 trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
3286 # Here, have taken care of $NO and $MULTIPLE replaces.
3287 # $j points to the highest affected range. But it can be < $i or even
3288 # -1. These happen only if the insertion is entirely in the gap
3289 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3290 # above exited first time through with $end < $r->[$i]->start. (And
3291 # then we subtracted one from j) This implies also that $start <
3292 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3293 # $start, so the entire input range is in the gap.
3296 # Here the entire input range is in the gap before $i.
3298 if (main::DEBUG && $to_trace) {
3300 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3303 trace "Entire range is before $r->[$i]";
3306 return if $operation ne '+'; # Deletion of a non-existent range is
3311 # Here the entire input range is not in the gap before $i. There
3312 # is an affected one, and $j points to the highest such one.
3314 # At this point, here is the situation:
3315 # This is not an insertion of a multiple, nor of tentative ($NO)
3317 # $i points to the first element in the current range list that
3318 # may be affected by this operation. In fact, we know
3319 # that the range at $i is affected because we are in
3320 # the else branch of this 'if'
3321 # $j points to the highest affected range.
3323 # r[$i-1]->end < $start <= r[$i]->end
3325 # r[$i-1]->end < $start <= $end <= r[$j]->end
3328 # $cdm is a boolean which is set true if and only if this is a
3329 # change or deletion (multiple was handled above). In
3330 # other words, it could be renamed to be just $cd.
3332 # We now have enough information to decide if this call is a no-op
3333 # or not. It is a no-op if it is a deletion of a non-existent
3334 # range, or an insertion of already existing data.
3336 if (main::DEBUG && $to_trace && ! $cdm
3338 && $start >= $r->[$i]->start)
3342 return if ! $cdm # change or delete => not no-op
3343 && $i == $j # more than one affected range => not no-op
3345 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3346 # Further, $start and/or $end is >= r[$i]->start
3347 # The test below hence guarantees that
3348 # r[$i]->start < $start <= $end <= r[$i]->end
3349 # This means the input range is contained entirely in
3350 # the one at $i, so is a no-op
3351 && $start >= $r->[$i]->start;
3354 # Here, we know that some action will have to be taken. We have
3355 # calculated the offset and length (though adjustments may be needed)
3356 # for the splice. Now start constructing the replacement list.
3358 my $splice_start = $i;
3363 # See if should extend any adjacent ranges.
3364 if ($operation eq '-') { # Don't extend deletions
3365 $extends_below = $extends_above = 0;
3367 else { # Here, should extend any adjacent ranges. See if there are
3369 $extends_below = ($i > 0
3370 # can't extend unless adjacent
3371 && $r->[$i-1]->end == $start -1
3372 # can't extend unless are same standard value
3373 && $r->[$i-1]->standard_form eq $standard_form
3374 # can't extend unless share type
3375 && $r->[$i-1]->type == $type);
3376 $extends_above = ($j+1 < $range_list_size
3377 && $r->[$j+1]->start == $end +1
3378 && $r->[$j+1]->standard_form eq $standard_form
3379 && $r->[$j-1]->type == $type);
3381 if ($extends_below && $extends_above) { # Adds to both
3382 $splice_start--; # start replace at element below
3383 $length += 2; # will replace on both sides
3384 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3386 # The result will fill in any gap, replacing both sides, and
3387 # create one large range.
3388 @replacement = Range->new($r->[$i-1]->start,
3395 # Here we know that the result won't just be the conglomeration of
3396 # a new range with both its adjacent neighbors. But it could
3397 # extend one of them.
3399 if ($extends_below) {
3401 # Here the new element adds to the one below, but not to the
3402 # one above. If inserting, and only to that one range, can
3403 # just change its ending to include the new one.
3404 if ($length == 0 && ! $cdm) {
3405 $r->[$i-1]->set_end($end);
3406 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3410 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3411 $splice_start--; # start replace at element below
3412 $length++; # will replace the element below
3413 $start = $r->[$i-1]->start;
3416 elsif ($extends_above) {
3418 # Here the new element adds to the one above, but not below.
3419 # Mirror the code above
3420 if ($length == 0 && ! $cdm) {
3421 $r->[$j+1]->set_start($start);
3422 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3426 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3427 $length++; # will replace the element above
3428 $end = $r->[$j+1]->end;
3432 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3434 # Finally, here we know there will have to be a splice.
3435 # If the change or delete affects only the highest portion of the
3436 # first affected range, the range will have to be split. The
3437 # splice will remove the whole range, but will replace it by a new
3438 # range containing just the unaffected part. So, in this case,
3439 # add to the replacement list just this unaffected portion.
3440 if (! $extends_below
3441 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3444 Range->new($r->[$i]->start,
3446 Value => $r->[$i]->value,
3447 Type => $r->[$i]->type);
3450 # In the case of an insert or change, but not a delete, we have to
3451 # put in the new stuff; this comes next.
3452 if ($operation eq '+') {
3453 push @replacement, Range->new($start,
3459 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3460 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3462 # And finally, if we're changing or deleting only a portion of the
3463 # highest affected range, it must be split, as the lowest one was.
3464 if (! $extends_above
3465 && $j >= 0 # Remember that j can be -1 if before first
3467 && $end >= $r->[$j]->start
3468 && $end < $r->[$j]->end)
3471 Range->new($end + 1,
3473 Value => $r->[$j]->value,
3474 Type => $r->[$j]->type);
3478 # And do the splice, as calculated above
3479 if (main::DEBUG && $to_trace) {
3480 trace "replacing $length element(s) at $i with ";
3481 foreach my $replacement (@replacement) {
3482 trace " $replacement";
3484 trace "Before splice:";
3485 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3486 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3487 trace "i =[", $i, "]", $r->[$i];
3488 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3489 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3492 my @return = splice @$r, $splice_start, $length, @replacement;
3494 if (main::DEBUG && $to_trace) {
3495 trace "After splice:";
3496 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3497 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3498 trace "i =[", $i, "]", $r->[$i];
3499 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3500 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3501 trace "removed @return";
3504 # An actual deletion could have changed the maximum in the list.
3505 # There was no deletion if the splice didn't return something, but
3506 # otherwise recalculate it. This is done too rarely to worry about
3508 if ($operation eq '-' && @return) {
3509 $max{$addr} = $r->[-1]->end;
3514 sub reset_each_range { # reset the iterator for each_range();
3516 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3518 local $addr = main::objaddr $self if ! defined $addr;
3520 undef $each_range_iterator{$addr};
3525 # Iterate over each range in a range list. Results are undefined if
3526 # the range list is changed during the iteration.
3529 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3531 local $addr = main::objaddr($self) if ! defined $addr;
3533 return if $self->is_empty;
3535 $each_range_iterator{$addr} = -1
3536 if ! defined $each_range_iterator{$addr};
3537 $each_range_iterator{$addr}++;
3538 return $ranges{$addr}->[$each_range_iterator{$addr}]
3539 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3540 undef $each_range_iterator{$addr};
3544 sub count { # Returns count of code points in range list
3546 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3548 local $addr = main::objaddr($self) if ! defined $addr;
3551 foreach my $range (@{$ranges{$addr}}) {
3552 $count += $range->end - $range->start + 1;
3557 sub delete_range { # Delete a range
3562 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3564 return $self->_add_delete('-', $start, $end, "");
3567 sub is_empty { # Returns boolean as to if a range list is empty
3569 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3571 local $addr = main::objaddr($self) if ! defined $addr;
3572 return scalar @{$ranges{$addr}} == 0;
3576 # Quickly returns a scalar suitable for separating tables into
3577 # buckets, i.e. it is a hash function of the contents of a table, so
3578 # there are relatively few conflicts.
3581 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3583 local $addr = main::objaddr($self) if ! defined $addr;
3585 # These are quickly computable. Return looks like 'min..max;count'
3586 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3588 } # End closure for _Range_List_Base
3591 use base '_Range_List_Base';
3593 # A Range_List is a range list for match tables; i.e. the range values are
3594 # not significant. Thus a number of operations can be safely added to it,
3595 # such as inversion, intersection. Note that union is also an unsafe
3596 # operation when range values are cared about, and that method is in the base
3597 # class, not here. But things are set up so that that method is callable only
3598 # during initialization. Only in this derived class, is there an operation
3599 # that combines two tables. A Range_Map can thus be used to initialize a
3600 # Range_List, and its mappings will be in the list, but are not significant to
3603 sub trace { return main::trace(@_); }
3609 '+' => sub { my $self = shift;
3612 return $self->_union($other)
3614 '&' => sub { my $self = shift;
3617 return $self->_intersect($other, 0);
3624 # Returns a new Range_List that gives all code points not in $self.
3628 my $new = Range_List->new;
3630 # Go through each range in the table, finding the gaps between them
3631 my $max = -1; # Set so no gap before range beginning at 0
3632 for my $range ($self->ranges) {
3633 my $start = $range->start;
3634 my $end = $range->end;
3636 # If there is a gap before this range, the inverse will contain
3638 if ($start > $max + 1) {
3639 $new->add_range($max + 1, $start - 1);
3644 # And finally, add the gap from the end of the table to the max
3645 # possible code point
3646 if ($max < $LAST_UNICODE_CODEPOINT) {
3647 $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3653 # Returns a new Range_List with the argument deleted from it. The
3654 # argument can be a single code point, a range, or something that has
3655 # a range, with the _range_list() method on it returning them
3659 my $reversed = shift;
3660 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3663 Carp::my_carp_bug("Can't cope with a "
3665 . " being the second parameter in a '-'. Subtraction ignored.");
3669 my $new = Range_List->new(Initialize => $self);
3671 if (! ref $other) { # Single code point
3672 $new->delete_range($other, $other);
3674 elsif ($other->isa('Range')) {
3675 $new->delete_range($other->start, $other->end);
3677 elsif ($other->can('_range_list')) {
3678 foreach my $range ($other->_range_list->ranges) {
3679 $new->delete_range($range->start, $range->end);
3683 Carp::my_carp_bug("Can't cope with a "
3685 . " argument to '-'. Subtraction ignored."
3694 # Returns either a boolean giving whether the two inputs' range lists
3695 # intersect (overlap), or a new Range_List containing the intersection
3696 # of the two lists. The optional final parameter being true indicates
3697 # to do the check instead of the intersection.
3699 my $a_object = shift;
3700 my $b_object = shift;
3701 my $check_if_overlapping = shift;
3702 $check_if_overlapping = 0 unless defined $check_if_overlapping;
3703 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3705 if (! defined $b_object) {
3707 $message .= $a_object->_owner_name_of if defined $a_object;
3708 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
3712 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
3713 # Thus the intersection could be much more simply be written:
3714 # return ~(~$a_object + ~$b_object);
3715 # But, this is slower, and when taking the inverse of a large
3716 # range_size_1 table, back when such tables were always stored that
3717 # way, it became prohibitively slow, hence the code was changed to the
3720 if ($b_object->isa('Range')) {
3721 $b_object = Range_List->new(Initialize => $b_object,
3722 Owner => $a_object->_owner_name_of);
3724 $b_object = $b_object->_range_list if $b_object->can('_range_list');
3726 my @a_ranges = $a_object->ranges;
3727 my @b_ranges = $b_object->ranges;
3729 #local $to_trace = 1 if main::DEBUG;
3730 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
3732 # Start with the first range in each list
3734 my $range_a = $a_ranges[$a_i];
3736 my $range_b = $b_ranges[$b_i];
3738 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
3739 if ! $check_if_overlapping;
3741 # If either list is empty, there is no intersection and no overlap
3742 if (! defined $range_a || ! defined $range_b) {
3743 return $check_if_overlapping ? 0 : $new;
3745 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3747 # Otherwise, must calculate the intersection/overlap. Start with the
3748 # very first code point in each list
3749 my $a = $range_a->start;
3750 my $b = $range_b->start;
3752 # Loop through all the ranges of each list; in each iteration, $a and
3753 # $b are the current code points in their respective lists
3756 # If $a and $b are the same code point, ...
3759 # it means the lists overlap. If just checking for overlap
3760 # know the answer now,
3761 return 1 if $check_if_overlapping;
3763 # The intersection includes this code point plus anything else
3764 # common to both current ranges.
3766 my $end = main::min($range_a->end, $range_b->end);
3767 if (! $check_if_overlapping) {
3768 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
3769 $new->add_range($start, $end);
3772 # Skip ahead to the end of the current intersect
3775 # If the current intersect ends at the end of either range (as
3776 # it must for at least one of them), the next possible one
3777 # will be the beginning code point in it's list's next range.
3778 if ($a == $range_a->end) {
3779 $range_a = $a_ranges[++$a_i];
3780 last unless defined $range_a;
3781 $a = $range_a->start;
3783 if ($b == $range_b->end) {
3784 $range_b = $b_ranges[++$b_i];
3785 last unless defined $range_b;
3786 $b = $range_b->start;
3789 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3793 # Not equal, but if the range containing $a encompasses $b,
3794 # change $a to be the middle of the range where it does equal
3795 # $b, so the next iteration will get the intersection
3796 if ($range_a->end >= $b) {
3801 # Here, the current range containing $a is entirely below
3802 # $b. Go try to find a range that could contain $b.
3803 $a_i = $a_object->_search_ranges($b);
3805 # If no range found, quit.
3806 last unless defined $a_i;
3808 # The search returns $a_i, such that
3809 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
3810 # Set $a to the beginning of this new range, and repeat.
3811 $range_a = $a_ranges[$a_i];
3812 $a = $range_a->start;
3815 else { # Here, $b < $a.
3817 # Mirror image code to the leg just above
3818 if ($range_b->end >= $a) {
3822 $b_i = $b_object->_search_ranges($a);
3823 last unless defined $b_i;
3824 $range_b = $b_ranges[$b_i];
3825 $b = $range_b->start;
3828 } # End of looping through ranges.
3830 # Intersection fully computed, or now know that there is no overlap
3831 return $check_if_overlapping ? 0 : $new;
3835 # Returns boolean giving whether the two arguments overlap somewhere
3839 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3841 return $self->_intersect($other, 1);
3845 # Add a range to the list.
3850 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3852 return $self->_add_delete('+', $start, $end, "");
3855 my $non_ASCII = (ord('A') != 65); # Assumes test on same platform
3857 sub is_code_point_usable {
3858 # This used only for making the test script. See if the input
3859 # proposed trial code point is one that Perl will handle. If second
3860 # parameter is 0, it won't select some code points for various
3861 # reasons, noted below.
3864 my $try_hard = shift;
3865 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3867 return 0 if $code < 0; # Never use a negative
3869 # For non-ASCII, we shun the characters that don't have Perl encoding-
3870 # independent symbols for them. 'A' is such a symbol, so is "\n".
3871 return $try_hard if $non_ASCII
3874 || ($code >= 0x0E && $code <= 0x1F)
3875 || ($code >= 0x01 && $code <= 0x06)
3878 # shun null. I'm (khw) not sure why this was done, but NULL would be
3879 # the character very frequently used.
3880 return $try_hard if $code == 0x0000;
3882 return 0 if $try_hard; # XXX Temporary until fix utf8.c
3884 # shun non-character code points.
3885 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
3886 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
3888 return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range
3889 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
3894 sub get_valid_code_point {
3895 # Return a code point that's part of the range list. Returns nothing
3896 # if the table is empty or we can't find a suitable code point. This
3897 # used only for making the test script.
3900 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3902 my $addr = main::objaddr($self);
3904 # On first pass, don't choose less desirable code points; if no good
3905 # one is found, repeat, allowing a less desirable one to be selected.
3906 for my $try_hard (0, 1) {
3908 # Look through all the ranges for a usable code point.
3909 for my $set ($self->ranges) {
3911 # Try the edge cases first, starting with the end point of the
3913 my $end = $set->end;
3914 return $end if is_code_point_usable($end, $try_hard);
3916 # End point didn't, work. Start at the beginning and try
3917 # every one until find one that does work.
3918 for my $trial ($set->start .. $end - 1) {
3919 return $trial if is_code_point_usable($trial, $try_hard);
3923 return (); # If none found, give up.
3926 sub get_invalid_code_point {
3927 # Return a code point that's not part of the table. Returns nothing
3928 # if the table covers all code points or a suitable code point can't
3929 # be found. This used only for making the test script.
3932 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3934 # Just find a valid code point of the inverse, if any.
3935 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
3937 } # end closure for Range_List
3940 use base '_Range_List_Base';
3942 # A Range_Map is a range list in which the range values (called maps) are
3943 # significant, and hence shouldn't be manipulated by our other code, which
3944 # could be ambiguous or lose things. For example, in taking the union of two
3945 # lists, which share code points, but which have differing values, which one
3946 # has precedence in the union?
3947 # It turns out that these operations aren't really necessary for map tables,
3948 # and so this class was created to make sure they aren't accidentally
3954 # Add a range containing a mapping value to the list
3957 # Rest of parameters passed on
3959 return $self->_add_delete('+', @_);
3963 # Adds entry to a range list which can duplicate an existing entry
3966 my $code_point = shift;
3968 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3970 return $self->add_map($code_point, $code_point,
3971 $value, Replace => $MULTIPLE);
3973 } # End of closure for package Range_Map
3975 package _Base_Table;
3977 # A table is the basic data structure that gets written out into a file for
3978 # use by the Perl core. This is the abstract base class implementing the
3979 # common elements from the derived ones. A list of the methods to be
3980 # furnished by an implementing class is just after the constructor.
3982 sub standardize { return main::standardize($_[0]); }
3983 sub trace { return main::trace(@_); }
3987 main::setup_package();
3990 # Object containing the ranges of the table.
3991 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
3994 # The full table name.
3995 main::set_access('full_name', \%full_name, 'r');
3998 # The table name, almost always shorter
3999 main::set_access('name', \%name, 'r');
4002 # The shortest of all the aliases for this table, with underscores removed
4003 main::set_access('short_name', \%short_name);
4005 my %nominal_short_name_length;
4006 # The length of short_name before removing underscores
4007 main::set_access('nominal_short_name_length',
4008 \%nominal_short_name_length);
4011 # The complete name, including property.
4012 main::set_access('complete_name', \%complete_name, 'r');
4015 # Parent property this table is attached to.
4016 main::set_access('property', \%property, 'r');
4019 # Ordered list of aliases of the table's name. The first ones in the list
4020 # are output first in comments
4021 main::set_access('aliases', \%aliases, 'readable_array');
4024 # A comment associated with the table for human readers of the files
4025 main::set_access('comment', \%comment, 's');
4028 # A comment giving a short description of the table's meaning for human
4029 # readers of the files.
4030 main::set_access('description', \%description, 'readable_array');
4033 # A comment giving a short note about the table for human readers of the
4035 main::set_access('note', \%note, 'readable_array');
4038 # Boolean; if set means any file that contains this table is marked as for
4039 # internal-only use.
4040 main::set_access('internal_only', \%internal_only);
4042 my %find_table_from_alias;
4043 # The parent property passes this pointer to a hash which this class adds
4044 # all its aliases to, so that the parent can quickly take an alias and
4046 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4049 # After this table is made equivalent to another one; we shouldn't go
4050 # changing the contents because that could mean it's no longer equivalent
4051 main::set_access('locked', \%locked, 'r');
4054 # This gives the final path to the file containing the table. Each
4055 # directory in the path is an element in the array
4056 main::set_access('file_path', \%file_path, 'readable_array');
4059 # What is the table's status, normal, $OBSOLETE, etc. Enum
4060 main::set_access('status', \%status, 'r');
4063 # A comment about its being obsolete, or whatever non normal status it has
4064 main::set_access('status_info', \%status_info, 'r');
4067 # Is the table to be output with each range only a single code point?
4068 # This is done to avoid breaking existing code that may have come to rely
4069 # on this behavior in previous versions of this program.)
4070 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4073 # A boolean set iff this table is a Perl extension to the Unicode
4075 main::set_access('perl_extension', \%perl_extension, 'r');
4078 # All arguments are key => value pairs, which you can see below, most
4079 # of which match fields documented above. Otherwise: Pod_Entry,
4080 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4081 # documented in the Alias package
4083 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4087 my $self = bless \do { my $anonymous_scalar }, $class;
4088 my $addr = main::objaddr($self);
4092 $name{$addr} = delete $args{'Name'};
4093 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4094 $full_name{$addr} = delete $args{'Full_Name'};
4095 my $complete_name = $complete_name{$addr}
4096 = delete $args{'Complete_Name'};
4097 $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
4098 $property{$addr} = delete $args{'_Property'};
4099 $range_list{$addr} = delete $args{'_Range_List'};
4100 $status{$addr} = delete $args{'Status'} || $NORMAL;
4101 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4102 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4104 my $description = delete $args{'Description'};
4105 my $externally_ok = delete $args{'Externally_Ok'};
4106 my $loose_match = delete $args{'Fuzzy'};
4107 my $note = delete $args{'Note'};
4108 my $make_pod_entry = delete $args{'Pod_Entry'};
4109 my $perl_extension = delete $args{'Perl_Extension'};
4111 # Shouldn't have any left over
4112 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4114 # Can't use || above because conceivably the name could be 0, and
4115 # can't use // operator in case this program gets used in Perl 5.8
4116 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4118 $aliases{$addr} = [ ];
4119 $comment{$addr} = [ ];
4120 $description{$addr} = [ ];
4122 $file_path{$addr} = [ ];
4123 $locked{$addr} = "";
4125 push @{$description{$addr}}, $description if $description;
4126 push @{$note{$addr}}, $note if $note;
4128 if ($status{$addr} eq $PLACEHOLDER) {
4130 # A placeholder table doesn't get documented, is a perl extension,
4131 # and quite likely will be empty
4132 $make_pod_entry = 0 if ! defined $make_pod_entry;
4133 $perl_extension = 1 if ! defined $perl_extension;
4134 push @tables_that_may_be_empty, $complete_name{$addr};
4136 elsif (! $status{$addr}) {
4138 # If hasn't set its status already, see if it is on one of the
4139 # lists of properties or tables that have particular statuses; if
4140 # not, is normal. The lists are prioritized so the most serious
4141 # ones are checked first
4142 if (exists $why_suppressed{$complete_name}) {
4143 $status{$addr} = $SUPPRESSED;
4145 elsif (exists $why_deprecated{$complete_name}) {
4146 $status{$addr} = $DEPRECATED;
4148 elsif (exists $why_stabilized{$complete_name}) {
4149 $status{$addr} = $STABILIZED;
4151 elsif (exists $why_obsolete{$complete_name}) {
4152 $status{$addr} = $OBSOLETE;
4155 # Existence above doesn't necessarily mean there is a message
4156 # associated with it. Use the most serious message.
4157 if ($status{$addr}) {
4158 if ($why_suppressed{$complete_name}) {
4160 = $why_suppressed{$complete_name};
4162 elsif ($why_deprecated{$complete_name}) {
4164 = $why_deprecated{$complete_name};
4166 elsif ($why_stabilized{$complete_name}) {
4168 = $why_stabilized{$complete_name};
4170 elsif ($why_obsolete{$complete_name}) {
4172 = $why_obsolete{$complete_name};
4177 $perl_extension{$addr} = $perl_extension || 0;
4179 # By convention what typically gets printed only or first is what's
4180 # first in the list, so put the full name there for good output
4181 # clarity. Other routines rely on the full name being first on the
4183 $self->add_alias($full_name{$addr},
4184 Externally_Ok => $externally_ok,
4185 Fuzzy => $loose_match,
4186 Pod_Entry => $make_pod_entry,
4187 Status => $status{$addr},
4190 # Then comes the other name, if meaningfully different.
4191 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4192 $self->add_alias($name{$addr},
4193 Externally_Ok => $externally_ok,
4194 Fuzzy => $loose_match,
4195 Pod_Entry => $make_pod_entry,
4196 Status => $status{$addr},
4203 # Here are the methods that are required to be defined by any derived
4209 # append_to_body and pre_body are called in the write() method
4210 # to add stuff after the main body of the table, but before
4211 # its close; and to prepend stuff before the beginning of the
4216 Carp::my_carp_bug( __LINE__
4217 . ": Must create method '$sub()' for "
4225 "." => \&main::_operator_dot,
4226 '!=' => \&main::_operator_not_equal,
4227 '==' => \&main::_operator_equal,
4231 # Returns the array of ranges associated with this table.
4233 return $range_list{main::objaddr shift}->ranges;
4237 # Add a synonym for this table.
4239 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4242 my $name = shift; # The name to add.
4243 my $pointer = shift; # What the alias hash should point to. For
4244 # map tables, this is the parent property;
4245 # for match tables, it is the table itself.
4248 my $loose_match = delete $args{'Fuzzy'};
4250 my $make_pod_entry = delete $args{'Pod_Entry'};
4251 $make_pod_entry = $YES unless defined $make_pod_entry;
4253 my $externally_ok = delete $args{'Externally_Ok'};
4254 $externally_ok = 1 unless defined $externally_ok;
4256 my $status = delete $args{'Status'};
4257 $status = $NORMAL unless defined $status;
4259 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4261 # Capitalize the first letter of the alias unless it is one of the CJK
4262 # ones which specifically begins with a lower 'k'. Do this because
4263 # Unicode has varied whether they capitalize first letters or not, and
4264 # have later changed their minds and capitalized them, but not the
4265 # other way around. So do it always and avoid changes from release to
4267 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4269 my $addr = main::objaddr $self;
4271 # Figure out if should be loosely matched if not already specified.
4272 if (! defined $loose_match) {
4274 # Is a loose_match if isn't null, and doesn't begin with an
4275 # underscore and isn't just a number
4277 && substr($name, 0, 1) ne '_'
4278 && $name !~ qr{^[0-9_.+-/]+$})
4287 # If this alias has already been defined, do nothing.
4288 return if defined $find_table_from_alias{$addr}->{$name};
4290 # That includes if it is standardly equivalent to an existing alias,
4291 # in which case, add this name to the list, so won't have to search
4293 my $standard_name = main::standardize($name);
4294 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4295 $find_table_from_alias{$addr}->{$name}
4296 = $find_table_from_alias{$addr}->{$standard_name};
4300 # Set the index hash for this alias for future quick reference.
4301 $find_table_from_alias{$addr}->{$name} = $pointer;
4302 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4303 local $to_trace = 0 if main::DEBUG;
4304 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4305 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4308 # Put the new alias at the end of the list of aliases unless the final
4309 # element begins with an underscore (meaning it is for internal perl
4310 # use) or is all numeric, in which case, put the new one before that
4311 # one. This floats any all-numeric or underscore-beginning aliases to
4312 # the end. This is done so that they are listed last in output lists,
4313 # to encourage the user to use a better name (either more descriptive
4314 # or not an internal-only one) instead. This ordering is relied on
4315 # implicitly elsewhere in this program, like in short_name()
4316 my $list = $aliases{$addr};
4317 my $insert_position = (@$list == 0
4318 || (substr($list->[-1]->name, 0, 1) ne '_'
4319 && $list->[-1]->name =~ /\D/))
4325 Alias->new($name, $loose_match, $make_pod_entry,
4326 $externally_ok, $status);
4328 # This name may be shorter than any existing ones, so clear the cache
4329 # of the shortest, so will have to be recalculated.
4330 undef $short_name{main::objaddr $self};
4335 # Returns a name suitable for use as the base part of a file name.
4336 # That is, shorter wins. It can return undef if there is no suitable
4337 # name. The name has all non-essential underscores removed.
4339 # The optional second parameter is a reference to a scalar in which
4340 # this routine will store the length the returned name had before the
4341 # underscores were removed, or undef if the return is undef.
4343 # The shortest name can change if new aliases are added. So using
4344 # this should be deferred until after all these are added. The code
4345 # that does that should clear this one's cache.
4346 # Any name with alphabetics is preferred over an all numeric one, even
4350 my $nominal_length_ptr = shift;
4351 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4353 my $addr = main::objaddr $self;
4355 # For efficiency, don't recalculate, but this means that adding new
4356 # aliases could change what the shortest is, so the code that does
4357 # that needs to undef this.
4358 if (defined $short_name{$addr}) {
4359 if ($nominal_length_ptr) {
4360 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4362 return $short_name{$addr};
4365 # Look at each alias
4366 foreach my $alias ($self->aliases()) {
4368 # Don't use an alias that isn't ok to use for an external name.
4369 next if ! $alias->externally_ok;
4371 my $name = main::Standardize($alias->name);
4372 trace $self, $name if main::DEBUG && $to_trace;
4374 # Take the first one, or a shorter one that isn't numeric. This
4375 # relies on numeric aliases always being last in the array
4376 # returned by aliases(). Any alpha one will have precedence.
4377 if (! defined $short_name{$addr}
4379 && length($name) < length($short_name{$addr})))
4381 # Remove interior underscores.
4382 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4384 $nominal_short_name_length{$addr} = length $name;
4388 # If no suitable external name return undef
4389 if (! defined $short_name{$addr}) {
4390 $$nominal_length_ptr = undef if $nominal_length_ptr;
4394 # Don't allow a null external name.
4395 if ($short_name{$addr} eq "") {
4396 $short_name{$addr} = '_';
4397 $nominal_short_name_length{$addr} = 1;
4400 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4402 if ($nominal_length_ptr) {
4403 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4405 return $short_name{$addr};
4409 # Returns the external name that this table should be known by. This
4410 # is usually the short_name, but not if the short_name is undefined.
4413 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4415 my $short = $self->short_name;
4416 return $short if defined $short;
4421 sub add_description { # Adds the parameter as a short description.
4424 my $description = shift;
4426 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4428 push @{$description{main::objaddr $self}}, $description;
4433 sub add_note { # Adds the parameter as a short note.
4438 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4440 push @{$note{main::objaddr $self}}, $note;
4445 sub add_comment { # Adds the parameter as a comment.
4448 my $comment = shift;
4449 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4452 push @{$comment{main::objaddr $self}}, $comment;
4458 # Return the current comment for this table. If called in list
4459 # context, returns the array of comments. In scalar, returns a string
4460 # of each element joined together with a period ending each.
4463 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4465 my @list = @{$comment{main::objaddr $self}};
4466 return @list if wantarray;
4468 foreach my $sentence (@list) {
4469 $return .= '. ' if $return;
4470 $return .= $sentence;
4473 $return .= '.' if $return;
4478 # Initialize the table with the argument which is any valid
4479 # initialization for range lists.
4482 my $initialization = shift;
4483 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4485 # Replace the current range list with a new one of the same exact
4487 my $class = ref $range_list{main::objaddr $self};
4488 $range_list{main::objaddr $self} = $class->new(Owner => $self,
4489 Initialize => $initialization);
4495 # The header that is output for the table in the file it is written
4499 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4502 $return .= $DEVELOPMENT_ONLY if $compare_versions;
4504 $return .= $INTERNAL_ONLY if $internal_only{main::objaddr $self};
4509 # Write a representation of the table to its file.
4512 my $tab_stops = shift; # The number of tab stops over to put any
4514 my $suppress_value = shift; # Optional, if the value associated with
4515 # a range equals this one, don't write
4517 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4519 my $addr = main::objaddr($self);
4521 # Start with the header
4522 my @OUT = $self->header;
4525 push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4528 # Then any pre-body stuff.
4529 my $pre_body = $self->pre_body;
4530 push @OUT, $pre_body, "\n" if $pre_body;
4532 # The main body looks like a 'here' document
4533 push @OUT, "return <<'END';\n";
4535 if ($range_list{$addr}->is_empty) {
4537 # This is a kludge for empty tables to silence a warning in
4538 # utf8.c, which can't really deal with empty tables, but it can
4539 # deal with a table that matches nothing, as the inverse of 'Any'
4541 push @OUT, "!utf8::IsAny\n";
4544 my $range_size_1 = $range_size_1{$addr};
4546 # Output each range as part of the here document.
4547 for my $set ($range_list{$addr}->ranges) {
4548 my $start = $set->start;
4549 my $end = $set->end;
4550 my $value = $set->value;
4552 # Don't output ranges whose value is the one to suppress
4553 next if defined $suppress_value && $value eq $suppress_value;
4555 # If has or wants a single point range output
4556 if ($start == $end || $range_size_1) {
4557 for my $i ($start .. $end) {
4558 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
4562 push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
4564 # Add a comment with the size of the range, if requested.
4565 # Expand Tabs to make sure they all start in the same
4566 # column, and then unexpand to use mostly tabs.
4567 if (! $output_range_counts) {
4571 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4572 my $count = main::clarify_number($end - $start + 1);
4575 my $width = $tab_stops * 8 - 1;
4576 $OUT[-1] = sprintf("%-*s # [%s]\n",
4580 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4583 } # End of loop through all the table's ranges
4586 # Add anything that goes after the main body, but within the here
4588 my $append_to_body = $self->append_to_body;
4589 push @OUT, $append_to_body if $append_to_body;
4591 # And finish the here document.
4594 # All these files have a .pl suffix
4595 $file_path{$addr}->[-1] .= '.pl';
4597 main::write($file_path{$addr}, \@OUT);
4601 sub set_status { # Set the table's status
4603 my $status = shift; # The status enum value
4604 my $info = shift; # Any message associated with it.
4605 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4607 my $addr = main::objaddr($self);
4609 $status{$addr} = $status;
4610 $status_info{$addr} = $info;
4615 # Don't allow changes to the table from now on. This stores a stack
4616 # trace of where it was called, so that later attempts to modify it
4617 # can immediately show where it got locked.
4620 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4622 my $addr = main::objaddr $self;
4624 $locked{$addr} = "";
4626 my $line = (caller(0))[2];
4629 # Accumulate the stack trace
4631 my ($pkg, $file, $caller_line, $caller) = caller $i++;
4633 last unless defined $caller;
4635 $locked{$addr} .= " called from $caller() at line $line\n";
4636 $line = $caller_line;
4638 $locked{$addr} .= " called from main at line $line\n";
4643 sub carp_if_locked {
4644 # Return whether a table is locked or not, and, by the way, complain
4648 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4650 my $addr = main::objaddr $self;
4652 return 0 if ! $locked{$addr};
4653 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
4657 sub set_file_path { # Set the final directory path for this table
4659 # Rest of parameters passed on
4661 @{$file_path{main::objaddr $self}} = @_;
4665 # Accessors for the range list stored in this table. First for
4684 return $range_list{main::objaddr $self}->$sub(@_);
4688 # Then for ones that should fail if locked
4698 return if $self->carp_if_locked;
4699 return $range_list{main::objaddr $self}->$sub(@_);
4706 use base '_Base_Table';
4708 # A Map Table is a table that contains the mappings from code points to
4709 # values. There are two weird cases:
4710 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
4711 # are written in the table's file at the end of the table nonetheless. It
4712 # requires specially constructed code to handle these; utf8.c can not read
4713 # these in, so they should not go in $map_directory. As of this writing,
4714 # the only case that these happen is for named sequences used in
4715 # charnames.pm. But this code doesn't enforce any syntax on these, so
4716 # something else could come along that uses it.
4717 # 2) Specials are anything that doesn't fit syntactically into the body of the
4718 # table. The ranges for these have a map type of non-zero. The code below
4719 # knows about and handles each possible type. In most cases, these are
4720 # written as part of the header.
4722 # A map table deliberately can't be manipulated at will unlike match tables.
4723 # This is because of the ambiguities having to do with what to do with
4724 # overlapping code points. And there just isn't a need for those things;
4725 # what one wants to do is just query, add, replace, or delete mappings, plus
4726 # write the final result.
4727 # However, there is a method to get the list of possible ranges that aren't in
4728 # this table to use for defaulting missing code point mappings. And,
4729 # map_add_or_replace_non_nulls() does allow one to add another table to this
4730 # one, but it is clearly very specialized, and defined that the other's
4731 # non-null values replace this one's if there is any overlap.
4733 sub trace { return main::trace(@_); }
4737 main::setup_package();
4740 # Many input files omit some entries; this gives what the mapping for the
4741 # missing entries should be
4742 main::set_access('default_map', \%default_map, 'r');
4744 my %anomalous_entries;
4745 # Things that go in the body of the table which don't fit the normal
4746 # scheme of things, like having a range. Not much can be done with these
4747 # once there except to output them. This was created to handle named
4749 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
4750 main::set_access('anomalous_entries', # Append singular, read plural
4751 \%anomalous_entries,
4755 # The format of the entries of the table. This is calculated from the
4756 # data in the table (or passed in the constructor). This is an enum e.g.,
4758 main::set_access('format', \%format);
4761 # This is a string, solely for documentation, indicating how one can get
4762 # access to this property via the Perl core.
4763 main::set_access('core_access', \%core_access, 'r', 's');
4766 # Boolean set when non-zero map-type ranges are added to this table,
4767 # which happens in only a few tables. This is purely for performance, to
4768 # avoid having to search through every table upon output, so if all the
4769 # non-zero maps got deleted before output, this would remain set, and the
4770 # only penalty would be performance. Currently, most map tables that get
4771 # output have specials in them, so this doesn't help that much anyway.
4772 main::set_access('has_specials', \%has_specials);
4775 # Boolean as to whether or not to write out this map table
4776 main::set_access('to_output_map', \%to_output_map, 's');
4785 # Optional initialization data for the table.
4786 my $initialize = delete $args{'Initialize'};
4788 my $core_access = delete $args{'Core_Access'};
4789 my $default_map = delete $args{'Default_Map'};
4790 my $format = delete $args{'Format'};
4791 my $property = delete $args{'_Property'};
4792 my $full_name = delete $args{'Full_Name'};
4793 # Rest of parameters passed on
4795 my $range_list = Range_Map->new(Owner => $property);
4797 my $self = $class->SUPER::new(
4799 Complete_Name => $full_name,
4800 Full_Name => $full_name,
4801 _Property => $property,
4802 _Range_List => $range_list,
4805 my $addr = main::objaddr $self;
4807 $anomalous_entries{$addr} = [];
4808 $core_access{$addr} = $core_access;
4809 $default_map{$addr} = $default_map;
4810 $format{$addr} = $format;
4812 $self->initialize($initialize) if defined $initialize;
4819 qw("") => "_operator_stringify",
4822 sub _operator_stringify {
4825 my $name = $self->property->full_name;
4826 $name = '""' if $name eq "";
4827 return "Map table for Property '$name'";
4831 # Add a synonym for this table (which means the property itself)
4834 # Rest of parameters passed on.
4836 $self->SUPER::add_alias($name, $self->property, @_);
4841 # Add a range of code points to the list of specially-handled code
4842 # points. $MULTI_CP is assumed if the type of special is not passed
4851 my $type = delete $args{'Type'} || 0;
4852 # Rest of parameters passed on
4854 # Can't change the table if locked.
4855 return if $self->carp_if_locked;
4857 my $addr = main::objaddr $self;
4859 $has_specials{$addr} = 1 if $type;
4861 $self->_range_list->add_map($lower, $upper,
4868 sub append_to_body {
4869 # Adds to the written HERE document of the table's body any anomalous
4870 # entries in the table..
4873 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4875 my $addr = main::objaddr $self;
4877 return "" unless @{$anomalous_entries{$addr}};
4878 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
4881 sub map_add_or_replace_non_nulls {
4882 # This adds the mappings in the table $other to $self. Non-null
4883 # mappings from $other override those in $self. It essentially merges
4884 # the two tables, with the second having priority except for null
4889 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4891 return if $self->carp_if_locked;
4893 if (! $other->isa(__PACKAGE__)) {
4894 Carp::my_carp_bug("$other should be a "
4902 my $addr = main::objaddr $self;
4903 my $other_addr = main::objaddr $other;
4905 local $to_trace = 0 if main::DEBUG;
4907 my $self_range_list = $self->_range_list;
4908 my $other_range_list = $other->_range_list;
4909 foreach my $range ($other_range_list->ranges) {
4910 my $value = $range->value;
4911 next if $value eq "";
4912 $self_range_list->_add_delete('+',
4916 Type => $range->type,
4917 Replace => $UNCONDITIONALLY);
4920 # Copy the specials information from the other table to $self
4921 if ($has_specials{$other_addr}) {
4922 $has_specials{$addr} = 1;
4928 sub set_default_map {
4929 # Define what code points that are missing from the input files should
4934 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4936 my $addr = main::objaddr $self;
4938 # Convert the input to the standard equivalent, if any (won't have any
4939 # for $STRING properties)
4940 my $standard = $self->_find_table_from_alias->{$map};
4941 $map = $standard->name if defined $standard;
4943 # Warn if there already is a non-equivalent default map for this
4944 # property. Note that a default map can be a ref, which means that
4945 # what it actually means is delayed until later in the program, and it
4946 # IS permissible to override it here without a message.
4947 my $default_map = $default_map{$addr};
4948 if (defined $default_map
4949 && ! ref($default_map)
4950 && $default_map ne $map
4951 && main::Standardize($map) ne $default_map)
4953 my $property = $self->property;
4954 my $map_table = $property->table($map);
4955 my $default_table = $property->table($default_map);
4956 if (defined $map_table
4957 && defined $default_table
4958 && $map_table != $default_table)
4960 Carp::my_carp("Changing the default mapping for "
4962 . " from $default_map to $map'");
4966 $default_map{$addr} = $map;
4968 # Don't also create any missing table for this map at this point,
4969 # because if we did, it could get done before the main table add is
4970 # done for PropValueAliases.txt; instead the caller will have to make
4971 # sure it exists, if desired.
4976 # Returns boolean: should we write this map table?
4979 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4981 my $addr = main::objaddr $self;
4983 # If overridden, use that
4984 return $to_output_map{$addr} if defined $to_output_map{$addr};
4986 my $full_name = $self->full_name;
4988 # If table says to output, do so; if says to suppress it, do do.
4989 return 1 if grep { $_ eq $full_name } @output_mapped_properties;
4990 return 0 if $self->status eq $SUPPRESSED;
4992 my $type = $self->property->type;
4994 # Don't want to output binary map tables even for debugging.
4995 return 0 if $type == $BINARY;
4997 # But do want to output string ones.
4998 return 1 if $type == $STRING;
5000 # Otherwise is an $ENUM, don't output it
5005 # Returns a Range_List that is gaps of the current table. That is,
5009 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5011 my $current = Range_List->new(Initialize => $self->_range_list,
5012 Owner => $self->property);
5016 sub set_final_comment {
5017 # Just before output, create the comment that heads the file
5018 # containing this table.
5021 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5023 # No sense generating a comment if aren't going to write it out.
5024 return if ! $self->to_output_map;
5026 my $addr = main::objaddr $self;
5028 my $property = $self->property;
5030 # Get all the possible names for this property. Don't use any that
5031 # aren't ok for use in a file name, etc. This is perhaps causing that
5032 # flag to do double duty, and may have to be changed in the future to
5033 # have our own flag for just this purpose; but it works now to exclude
5034 # Perl generated synonyms from the lists for properties, where the
5035 # name is always the proper Unicode one.
5036 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5038 my $count = $self->count;
5039 my $default_map = $default_map{$addr};
5041 # The ranges that map to the default aren't output, so subtract that
5042 # to get those actually output. A property with matching tables
5043 # already has the information calculated.
5044 if ($property->type != $STRING) {
5045 $count -= $property->table($default_map)->count;
5047 elsif (defined $default_map) {
5049 # But for $STRING properties, must calculate now. Subtract the
5050 # count from each range that maps to the default.
5051 foreach my $range ($self->_range_list->ranges) {
5052 if ($range->value eq $default_map) {
5053 $count -= $range->end +1 - $range->start;
5059 # Get a string version of $count with underscores in large numbers,
5061 my $string_count = main::clarify_number($count);
5063 my $code_points = ($count == 1)
5064 ? 'single code point'
5065 : "$string_count code points";
5070 if (@property_aliases <= 1) {
5071 $mapping = 'mapping';
5072 $these_mappings = 'this mapping';
5076 $mapping = 'synonymous mappings';
5077 $these_mappings = 'these mappings';
5081 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5082 $cp = "any code point in Unicode Version $string_version";
5086 if ($default_map eq "") {
5087 $map_to = 'the null string';
5089 elsif ($default_map eq $CODE_POINT) {
5093 $map_to = "'$default_map'";
5096 $cp = "the single code point";
5099 $cp = "one of the $code_points";
5101 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5106 my $status = $self->status;
5108 my $warn = uc $status_past_participles{$status};
5111 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5112 All property or property=value combinations contained in this file are $warn.
5113 See $unicode_reference_url for what this means.
5117 $comment .= "This file returns the $mapping:\n";
5119 for my $i (0 .. @property_aliases - 1) {
5120 $comment .= sprintf("%-8s%s\n",
5122 $property_aliases[$i]->name . '(cp)'
5126 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5128 my $access = $core_access{$addr};
5130 $comment .= "accessible through the Perl core via $access.";
5133 $comment .= "not accessible through the Perl core directly.";
5136 # And append any commentary already set from the actual property.
5137 $comment .= "\n\n" . $self->comment if $self->comment;
5138 if ($self->description) {
5139 $comment .= "\n\n" . join " ", $self->description;
5142 $comment .= "\n\n" . join " ", $self->note;
5146 if (! $self->perl_extension) {
5149 For information about what this property really means, see:
5150 $unicode_reference_url
5154 if ($count) { # Format differs for empty table
5155 $comment.= "\nThe format of the ";
5156 if ($self->range_size_1) {
5158 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5159 is in hex; MAPPING is what CODE_POINT maps to.
5164 # There are tables which end up only having one element per
5165 # range, but it is not worth keeping track of for making just
5166 # this comment a little better.
5168 non-comment portions of the main body of lines of this file is:
5169 START\\tSTOP\\tMAPPING where START is the starting code point of the
5170 range, in hex; STOP is the ending point, or if omitted, the range has just one
5171 code point; MAPPING is what each code point between START and STOP maps to.
5173 if ($output_range_counts) {
5175 Numbers in comments in [brackets] indicate how many code points are in the
5176 range (omitted when the range is a single code point or if the mapping is to
5182 $self->set_comment(main::join_lines($comment));
5186 my %swash_keys; # Makes sure don't duplicate swash names.
5189 # Returns the string that should be output in the file before the main
5190 # body of this table. This includes some hash entries identifying the
5191 # format of the body, and what the single value should be for all
5192 # ranges missing from it. It also includes any code points which have
5193 # map_types that don't go in the main table.
5196 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5198 my $addr = main::objaddr $self;
5200 my $name = $self->property->swash_name;
5202 if (defined $swash_keys{$name}) {
5203 Carp::my_carp(join_lines(<<END
5204 Already created a swash name '$name' for $swash_keys{$name}. This means that
5205 the same name desired for $self shouldn't be used. Bad News. This must be
5206 fixed before production use, but proceeding anyway
5210 $swash_keys{$name} = "$self";
5212 my $default_map = $default_map{$addr};
5215 if ($has_specials{$addr}) {
5217 # Here, some maps with non-zero type have been added to the table.
5218 # Go through the table and handle each of them. None will appear
5219 # in the body of the table, so delete each one as we go. The
5220 # code point count has already been calculated, so ok to delete
5223 my @multi_code_point_maps;
5224 my $has_hangul_syllables = 0;
5226 # The key is the base name of the code point, and the value is an
5227 # array giving all the ranges that use this base name. Each range
5228 # is actually a hash giving the 'low' and 'high' values of it.
5229 my %names_ending_in_code_point;
5231 # Inverse mapping. The list of ranges that have these kinds of
5232 # names. Each element contains the low, high, and base names in a
5234 my @code_points_ending_in_code_point;
5236 my $range_map = $self->_range_list;
5237 foreach my $range ($range_map->ranges) {
5238 next unless $range->type != 0;
5239 my $low = $range->start;
5240 my $high = $range->end;
5241 my $map = $range->value;
5242 my $type = $range->type;
5244 # No need to output the range if it maps to the default. And
5245 # the write method won't output it either, so no need to
5246 # delete it to keep it from being output, and is faster to
5247 # skip than to delete anyway.
5248 next if $map eq $default_map;
5250 # Delete the range to keep write() from trying to output it
5251 $range_map->delete_range($low, $high);
5253 # Switch based on the map type...
5254 if ($type == $HANGUL_SYLLABLE) {
5256 # These are entirely algorithmically determinable based on
5257 # some constants furnished by Unicode; for now, just set a
5258 # flag to indicate that have them. Below we will output
5259 # the code that does the algorithm.
5260 $has_hangul_syllables = 1;
5262 elsif ($type == $CP_IN_NAME) {
5264 # If the name ends in the code point it represents, are
5265 # also algorithmically determinable, but need information
5266 # about the map to do so. Both the map and its inverse
5267 # are stored in data structures output in the file.
5268 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5269 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5271 push @code_points_ending_in_code_point, { low => $low,
5276 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5278 # Multi-code point maps and null string maps have an entry
5279 # for each code point in the range. They use the same
5281 for my $code_point ($low .. $high) {
5283 # The pack() below can't cope with surrogates.
5284 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5285 Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created");
5289 # Generate the hash entries for these in the form that
5290 # utf8.c understands.
5292 foreach my $to (split " ", $map) {
5293 if ($to !~ /^$code_point_re$/) {
5294 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
5297 $tostr .= sprintf "\\x{%s}", $to;
5300 # I (khw) have never waded through this line to
5301 # understand it well enough to comment it.
5302 my $utf8 = sprintf(qq["%s" => "$tostr",],
5303 join("", map { sprintf "\\x%02X", $_ }
5304 unpack("U0C*", pack("U", $code_point))));
5306 # Add a comment so that a human reader can more easily
5307 # see what's going on.
5308 push @multi_code_point_maps,
5309 sprintf("%-45s # U+%04X => %s", $utf8,
5315 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Using type 0 instead");
5316 $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0);
5318 } # End of loop through all ranges
5320 # Here have gone through the whole file. If actually generated
5321 # anything for each map type, add its respective header and
5323 if (@multi_code_point_maps) {
5326 # Some code points require special handling because their mappings are each to
5327 # multiple code points. These do not appear in the main body, but are defined
5328 # in the hash below.
5330 # The key: UTF-8 _bytes_, the value: UTF-8 (speed hack)
5331 %utf8::ToSpec$name = (
5333 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5336 if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5338 # Convert these structures to output format.
5339 my $code_points_ending_in_code_point =
5340 main::simple_dumper(\@code_points_ending_in_code_point,
5342 my $names = main::simple_dumper(\%names_ending_in_code_point,
5345 # Do the same with the Hangul names,
5351 if ($has_hangul_syllables) {
5353 # Construct a regular expression of all the possible
5354 # combinations of the Hangul syllables.
5355 my @L_re; # Leading consonants
5356 for my $i ($LBase .. $LBase + $LCount - 1) {
5357 push @L_re, $Jamo{$i}
5359 my @V_re; # Middle vowels
5360 for my $i ($VBase .. $VBase + $VCount - 1) {
5361 push @V_re, $Jamo{$i}
5363 my @T_re; # Trailing consonants
5364 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5365 push @T_re, $Jamo{$i}
5368 # The whole re is made up of the L V T combination.
5370 . join ('|', sort @L_re)
5372 . join ('|', sort @V_re)
5374 . join ('|', sort @T_re)
5377 # These hashes needed by the algorithm were generated
5378 # during reading of the Jamo.txt file
5379 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5380 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5381 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5382 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5387 # To achieve significant memory savings when this file is read in,
5388 # algorithmically derivable code points are omitted from the main body below.
5389 # Instead, the following routines can be used to translate between name and
5390 # code point and vice versa
5394 # Matches legal code point. 4-6 hex numbers, If there are 6, the
5395 # first two must be '10'; if there are 5, the first must not be a '0'.
5396 my \$code_point_re = qr/$code_point_re/;
5398 # In the following hash, the keys are the bases of names which includes
5399 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
5400 # of each key is another hash which is used to get the low and high ends
5401 # for each range of code points that apply to the name
5402 my %names_ending_in_code_point = (
5406 # And the following array gives the inverse mapping from code points to
5407 # names. Lowest code points are first
5408 my \@code_points_ending_in_code_point = (
5409 $code_points_ending_in_code_point
5412 # Earlier releases didn't have Jamos. No sense outputting
5413 # them unless will be used.
5414 if ($has_hangul_syllables) {
5417 # Convert from code point to Jamo short name for use in composing Hangul
5423 # Leading consonant (can be null)
5433 # Optional trailing consonant
5438 # Computed re that splits up a Hangul name into LVT or LV syllables
5439 my \$syllable_re = qr/$jamo_re/;
5441 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
5442 my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
5444 # These constants names and values were taken from the Unicode standard,
5445 # version 5.1, section 3.12. They are used in conjunction with Hangul
5447 my \$SBase = 0xAC00;
5448 my \$LBase = 0x1100;
5449 my \$VBase = 0x1161;
5450 my \$TBase = 0x11A7;
5451 my \$SCount = 11172;
5455 my \$NCount = \$VCount * \$TCount;
5457 } # End of has Jamos
5459 $pre_body .= << 'END';
5461 sub name_to_code_point_special {
5464 # Returns undef if not one of the specially handled names; otherwise
5465 # returns the code point equivalent to the input name
5467 if ($has_hangul_syllables) {
5468 $pre_body .= << 'END';
5470 if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
5471 $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
5472 return if $name !~ qr/^$syllable_re$/;
5473 my $L = $Jamo_L{$1};
5474 my $V = $Jamo_V{$2};
5475 my $T = (defined $3) ? $Jamo_T{$3} : 0;
5476 return ($L * $VCount + $V) * $TCount + $T + $SBase;
5480 $pre_body .= << 'END';
5482 # Name must end in '-code_point' for this to handle.
5483 if ($name !~ /^ (.*) - ($code_point_re) $/x) {
5488 my $code_point = CORE::hex $2;
5490 # Name must be one of the ones which has the code point in it.
5491 return if ! $names_ending_in_code_point{$base};
5493 # Look through the list of ranges that apply to this name to see if
5494 # the code point is in one of them.
5495 for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
5496 return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
5497 next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
5499 # Here, the code point is in the range.
5503 # Here, looked like the name had a code point number in it, but
5504 # did not match one of the valid ones.
5508 sub code_point_to_name_special {
5509 my $code_point = shift;
5511 # Returns the name of a code point if algorithmically determinable;
5514 if ($has_hangul_syllables) {
5515 $pre_body .= << 'END';
5517 # If in the Hangul range, calculate the name based on Unicode's
5519 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
5521 my $SIndex = $code_point - $SBase;
5522 my $L = $LBase + $SIndex / $NCount;
5523 my $V = $VBase + ($SIndex % $NCount) / $TCount;
5524 my $T = $TBase + $SIndex % $TCount;
5525 $name = "$HANGUL_SYLLABLE $Jamo{$L}$Jamo{$V}";
5526 $name .= $Jamo{$T} if $T != $TBase;
5531 $pre_body .= << 'END';
5533 # Look through list of these code points for one in range.
5534 foreach my $hash (@code_points_ending_in_code_point) {
5535 return if $code_point < $hash->{'low'};
5536 if ($code_point <= $hash->{'high'}) {
5537 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
5540 return; # None found
5545 } # End of has hangul or code point in name maps.
5546 } # End of has specials
5548 # Calculate the format of the table if not already done.
5549 my $format = $format{$addr};
5550 my $property = $self->property;
5551 my $type = $property->type;
5552 if (! defined $format) {
5553 if ($type == $BINARY) {
5555 # Don't bother checking the values, because we elsewhere
5556 # verify that a binary table has only 2 values.
5557 $format = $BINARY_FORMAT;
5560 my @ranges = $self->_range_list->ranges;
5562 # default an empty table based on its type and default map
5565 # But it turns out that the only one we can say is a
5566 # non-string (besides binary, handled above) is when the
5567 # table is a string and the default map is to a code point
5568 if ($type == $STRING && $default_map eq $CODE_POINT) {
5569 $format = $HEX_FORMAT;
5572 $format = $STRING_FORMAT;
5577 # Start with the most restrictive format, and as we find
5578 # something that doesn't fit with that, change to the next
5579 # most restrictive, and so on.
5580 $format = $DECIMAL_FORMAT;
5581 foreach my $range (@ranges) {
5582 my $map = $range->value;
5583 if ($map ne $default_map) {
5584 last if $format eq $STRING_FORMAT; # already at
5587 $format = $INTEGER_FORMAT
5588 if $format eq $DECIMAL_FORMAT
5589 && $map !~ / ^ [0-9] $ /x;
5590 $format = $FLOAT_FORMAT
5591 if $format eq $INTEGER_FORMAT
5592 && $map !~ / ^ -? [0-9]+ $ /x;
5593 $format = $RATIONAL_FORMAT
5594 if $format eq $FLOAT_FORMAT
5595 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
5596 $format = $HEX_FORMAT
5597 if $format eq $RATIONAL_FORMAT
5598 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
5599 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
5600 && $map =~ /[^0-9A-F]/;
5605 } # end of calculating format
5608 # The name this swash is to be known by, with the format of the mappings in
5609 # the main body of the table, and what all code points missing from this file
5611 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
5613 my $missing = $default_map;
5614 if ($missing eq $CODE_POINT
5615 && $format ne $HEX_FORMAT
5616 && ! defined $format{$addr}) # Is expected if was manually set
5618 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
5620 $format{$addr} = $format;
5621 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$missing';";
5622 if ($missing eq $CODE_POINT) {
5623 $return .= ' # code point maps to itself';
5625 elsif ($missing eq "") {
5626 $return .= ' # code point maps to the null string';
5630 $return .= $pre_body;
5636 # Write the table to the file.
5639 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5641 my $addr = main::objaddr $self;
5643 return $self->SUPER::write(
5644 ($self->property == $block)
5645 ? 7 # block file needs more tab stops
5647 $default_map{$addr}); # don't write defaulteds
5650 # Accessors for the underlying list that should fail if locked.
5660 return if $self->carp_if_locked;
5661 return $self->_range_list->$sub(@_);
5664 } # End closure for Map_Table
5666 package Match_Table;
5667 use base '_Base_Table';
5669 # A Match table is one which is a list of all the code points that have
5670 # the same property and property value, for use in \p{property=value}
5671 # constructs in regular expressions. It adds very little data to the base
5672 # structure, but many methods, as these lists can be combined in many ways to
5674 # There are only a few concepts added:
5675 # 1) Equivalents and Relatedness.
5676 # Two tables can match the identical code points, but have different names.
5677 # This always happens when there is a perl single form extension
5678 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
5679 # tables are set to be related, with the Perl extension being a child, and
5680 # the Unicode property being the parent.
5682 # It may be that two tables match the identical code points and we don't
5683 # know if they are related or not. This happens most frequently when the
5684 # Block and Script properties have the exact range. But note that a
5685 # revision to Unicode could add new code points to the script, which would
5686 # now have to be in a different block (as the block was filled, or there
5687 # would have been 'Unknown' script code points in it and they wouldn't have
5688 # been identical). So we can't rely on any two properties from Unicode
5689 # always matching the same code points from release to release, and thus
5690 # these tables are considered coincidentally equivalent--not related. When
5691 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
5692 # 'leader', and the others are 'equivalents'. This concept is useful
5693 # to minimize the number of tables written out. Only one file is used for
5694 # any identical set of code points, with entries in Heavy.pl mapping all
5695 # the involved tables to it.
5697 # Related tables will always be identical; we set them up to be so. Thus
5698 # if the Unicode one is deprecated, the Perl one will be too. Not so for
5699 # unrelated tables. Relatedness makes generating the documentation easier.
5701 # 2) Conflicting. It may be that there will eventually be name clashes, with
5702 # the same name meaning different things. For a while, there actually were
5703 # conflicts, but they have so far been resolved by changing Perl's or
5704 # Unicode's definitions to match the other, but when this code was written,
5705 # it wasn't clear that that was what was going to happen. (Unicode changed
5706 # because of protests during their beta period.) Name clashes are warned
5707 # about during compilation, and the documentation. The generated tables
5708 # are sane, free of name clashes, because the code suppresses the Perl
5709 # version. But manual intervention to decide what the actual behavior
5710 # should be may be required should this happen. The introductory comments
5711 # have more to say about this.
5713 sub standardize { return main::standardize($_[0]); }
5714 sub trace { return main::trace(@_); }
5719 main::setup_package();
5722 # The leader table of this one; initially $self.
5723 main::set_access('leader', \%leader, 'r');
5726 # An array of any tables that have this one as their leader
5727 main::set_access('equivalents', \%equivalents, 'readable_array');
5730 # The parent table to this one, initially $self. This allows us to
5731 # distinguish between equivalent tables that are related, and those which
5732 # may not be, but share the same output file because they match the exact
5733 # same set of code points in the current Unicode release.
5734 main::set_access('parent', \%parent, 'r');
5737 # An array of any tables that have this one as their parent
5738 main::set_access('children', \%children, 'readable_array');
5741 # Array of any tables that would have the same name as this one with
5742 # a different meaning. This is used for the generated documentation.
5743 main::set_access('conflicting', \%conflicting, 'readable_array');
5746 # Set in the constructor for tables that are expected to match all code
5748 main::set_access('matches_all', \%matches_all, 'r');
5755 # The property for which this table is a listing of property values.
5756 my $property = delete $args{'_Property'};
5758 my $name = delete $args{'Name'};
5759 my $full_name = delete $args{'Full_Name'};
5760 $full_name = $name if ! defined $full_name;
5763 my $initialize = delete $args{'Initialize'};
5764 my $matches_all = delete $args{'Matches_All'} || 0;
5765 # Rest of parameters passed on.
5767 my $range_list = Range_List->new(Initialize => $initialize,
5768 Owner => $property);
5770 my $complete = $full_name;
5771 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
5772 # but this helps debug if it
5774 # The complete name for a match table includes it's property in a
5775 # compound form 'property=table', except if the property is the
5776 # pseudo-property, perl, in which case it is just the single form,
5777 # 'table' (If you change the '=' must also change the ':' in lots of
5778 # places in this program that assume an equal sign)
5779 $complete = $property->full_name . "=$complete" if $property != $perl;
5782 my $self = $class->SUPER::new(%args,
5784 Complete_Name => $complete,
5785 Full_Name => $full_name,
5786 _Property => $property,
5787 _Range_List => $range_list,
5789 my $addr = main::objaddr $self;
5791 $conflicting{$addr} = [ ];
5792 $equivalents{$addr} = [ ];
5793 $children{$addr} = [ ];
5794 $matches_all{$addr} = $matches_all;
5795 $leader{$addr} = $self;
5796 $parent{$addr} = $self;
5801 # See this program's beginning comment block about overloading these.
5804 qw("") => "_operator_stringify",
5808 return if $self->carp_if_locked;
5816 return $self->_range_list + $other;
5822 return $self->_range_list & $other;
5828 return if $self->carp_if_locked;
5830 my $addr = main::objaddr $self;
5834 # Change the range list of this table to be the
5836 $self->_set_range_list($self->_range_list
5839 else { # $other is just a simple value
5840 $self->add_range($other, $other);
5844 '-' => sub { my $self = shift;
5846 my $reversed = shift;
5849 Carp::my_carp_bug("Can't cope with a "
5851 . " being the first parameter in a '-'. Subtraction ignored.");
5855 return $self->_range_list - $other;
5857 '~' => sub { my $self = shift;
5858 return ~ $self->_range_list;
5862 sub _operator_stringify {
5865 my $name = $self->complete_name;
5866 return "Table '$name'";
5870 # Add a synonym for this table. See the comments in the base class
5874 # Rest of parameters passed on.
5876 $self->SUPER::add_alias($name, $self, @_);
5880 sub add_conflicting {
5881 # Add the name of some other object to the list of ones that name
5882 # clash with this match table.
5885 my $conflicting_name = shift; # The name of the conflicting object
5886 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
5887 my $conflicting_object = shift; # Optional, the conflicting object
5888 # itself. This is used to
5889 # disambiguate the text if the input
5890 # name is identical to any of the
5891 # aliases $self is known by.
5892 # Sometimes the conflicting object is
5893 # merely hypothetical, so this has to
5894 # be an optional parameter.
5895 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5897 my $addr = main::objaddr $self;
5899 # Check if the conflicting name is exactly the same as any existing
5900 # alias in this table (as long as there is a real object there to
5901 # disambiguate with).
5902 if (defined $conflicting_object) {
5903 foreach my $alias ($self->aliases) {
5904 if ($alias->name eq $conflicting_name) {
5906 # Here, there is an exact match. This results in
5907 # ambiguous comments, so disambiguate by changing the
5908 # conflicting name to its object's complete equivalent.
5909 $conflicting_name = $conflicting_object->complete_name;
5915 # Convert to the \p{...} final name
5916 $conflicting_name = "\\$p" . "{$conflicting_name}";
5919 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
5921 push @{$conflicting{$addr}}, $conflicting_name;
5926 sub is_equivalent_to {
5927 # Return boolean of whether or not the other object is a table of this
5928 # type and has been marked equivalent to this one.
5932 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5934 return 0 if ! defined $other; # Can happen for incomplete early
5936 unless ($other->isa(__PACKAGE__)) {
5937 my $ref_other = ref $other;
5938 my $ref_self = ref $self;
5939 Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
5943 # Two tables are equivalent if they have the same leader.
5944 return $leader{main::objaddr $self}
5945 == $leader{main::objaddr $other};
5949 sub matches_identically_to {
5950 # Return a boolean as to whether or not two tables match identical
5951 # sets of code points.
5955 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5957 unless ($other->isa(__PACKAGE__)) {
5958 my $ref_other = ref $other;
5959 my $ref_self = ref $self;
5960 Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
5964 # These are ordered in increasing real time to figure out (at least
5965 # until a patch changes that and doesn't change this)
5966 return 0 if $self->max != $other->max;
5967 return 0 if $self->min != $other->min;
5968 return 0 if $self->range_count != $other->range_count;
5969 return 0 if $self->count != $other->count;
5971 # Here they could be identical because all the tests above passed.
5972 # The loop below is somewhat simpler since we know they have the same
5973 # number of elements. Compare range by range, until reach the end or
5974 # find something that differs.
5975 my @a_ranges = $self->_range_list->ranges;
5976 my @b_ranges = $other->_range_list->ranges;
5977 for my $i (0 .. @a_ranges - 1) {
5978 my $a = $a_ranges[$i];
5979 my $b = $b_ranges[$i];
5980 trace "self $a; other $b" if main::DEBUG && $to_trace;
5981 return 0 if $a->start != $b->start || $a->end != $b->end;
5986 sub set_equivalent_to {
5987 # Set $self equivalent to the parameter table.
5988 # The required Related => 'x' parameter is a boolean indicating
5989 # whether these tables are related or not. If related, $other becomes
5990 # the 'parent' of $self; if unrelated it becomes the 'leader'
5992 # Related tables share all characteristics except names; equivalents
5993 # not quite so many.
5994 # If they are related, one must be a perl extension. This is because
5995 # we can't guarantee that Unicode won't change one or the other in a
5996 # later release even if they are idential now.
6002 my $related = delete $args{'Related'};
6004 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6006 return if ! defined $other; # Keep on going; happens in some early
6009 if (! defined $related) {
6010 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6014 # If already are equivalent, no need to re-do it; if subroutine
6015 # returns null, it found an error, also do nothing
6016 my $are_equivalent = $self->is_equivalent_to($other);
6017 return if ! defined $are_equivalent || $are_equivalent;
6019 my $current_leader = ($related)
6020 ? $parent{main::objaddr $self}
6021 : $leader{main::objaddr $self};
6024 ! $other->perl_extension
6025 && ! $current_leader->perl_extension)
6027 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6031 my $leader = main::objaddr $current_leader;
6032 my $other_addr = main::objaddr $other;
6034 # Any tables that are equivalent to or children of this table must now
6035 # instead be equivalent to or (children) to the new leader (parent),
6036 # still equivalent. The equivalency includes their matches_all info,
6037 # and for related tables, their status
6038 # All related tables are of necessity equivalent, but the converse
6039 # isn't necessarily true
6040 my $status = $other->status;
6041 my $status_info = $other->status_info;
6042 my $matches_all = $matches_all{other_addr};
6043 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6044 next if $table == $other;
6045 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6047 my $table_addr = main::objaddr $table;
6048 $leader{$table_addr} = $other;
6049 $matches_all{$table_addr} = $matches_all;
6050 $self->_set_range_list($other->_range_list);
6051 push @{$equivalents{$other_addr}}, $table;
6053 $parent{$table_addr} = $other;
6054 push @{$children{$other_addr}}, $table;
6055 $table->set_status($status, $status_info);
6059 # Now that we've declared these to be equivalent, any changes to one
6060 # of the tables would invalidate that equivalency.
6066 sub add_range { # Add a range to the list for this table.
6068 # Rest of parameters passed on
6070 return if $self->carp_if_locked;
6071 return $self->_range_list->add_range(@_);
6074 sub pre_body { # Does nothing for match tables.
6078 sub append_to_body { # Does nothing for match tables.
6084 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6086 return $self->SUPER::write(2); # 2 tab stops
6089 sub set_final_comment {
6090 # This creates a comment for the file that is to hold the match table
6091 # $self. It is somewhat convoluted to make the English read nicely,
6092 # but, heh, it's just a comment.
6093 # This should be called only with the leader match table of all the
6094 # ones that share the same file. It lists all such tables, ordered so
6095 # that related ones are together.
6097 my $leader = shift; # Should only be called on the leader table of
6098 # an equivalent group
6099 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6101 my $addr = main::objaddr $leader;
6103 if ($leader{$addr} != $leader) {
6104 Carp::my_carp_bug(<<END
6105 set_final_comment() must be called on a leader table, which $leader is not.
6106 It is equivalent to $leader{$addr}. No comment created
6112 # Get the number of code points matched by each of the tables in this
6113 # file, and add underscores for clarity.
6114 my $count = $leader->count;
6115 my $string_count = main::clarify_number($count);
6117 my $loose_count = 0; # how many aliases loosely matched
6118 my $compound_name = ""; # ? Are any names compound?, and if so, an
6120 my $properties_with_compound_names = 0; # count of these
6123 my %flags; # The status flags used in the file
6124 my $total_entries = 0; # number of entries written in the comment
6125 my $matches_comment = ""; # The portion of the comment about the
6127 my @global_comments; # List of all the tables' comments that are
6128 # there before this routine was called.
6130 # Get list of all the parent tables that are equivalent to this one
6131 # (including itself).
6132 my @parents = grep { $parent{main::objaddr $_} == $_ }
6133 main::uniques($leader, @{$equivalents{$addr}});
6134 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6137 for my $parent (@parents) {
6139 my $property = $parent->property;
6141 # Special case 'N' tables in properties with two match tables when
6142 # the other is a 'Y' one. These are likely to be binary tables,
6143 # but not necessarily. In either case, \P{} will match the
6144 # complement of \p{}, and so if something is a synonym of \p, the
6145 # complement of that something will be the synonym of \P. This
6146 # would be true of any property with just two match tables, not
6147 # just those whose values are Y and N; but that would require a
6148 # little extra work, and there are none such so far in Unicode.
6149 my $perl_p = 'p'; # which is it? \p{} or \P{}
6150 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6152 if (scalar $property->tables == 2
6153 && $parent == $property->table('N')
6154 && defined (my $yes = $property->table('Y')))
6156 my $yes_addr = main::objaddr $yes;
6158 = grep { $_->property == $perl }
6161 $parent{$yes_addr}->children);
6163 # But these synonyms are \P{} ,not \p{}
6167 my @description; # Will hold the table description
6168 my @note; # Will hold the table notes.
6169 my @conflicting; # Will hold the table conflicts.
6171 # Look at the parent, any yes synonyms, and all the children
6172 for my $table ($parent,
6174 @{$children{main::objaddr $parent}})
6176 my $table_addr = main::objaddr $table;
6177 my $table_property = $table->property;
6179 # Tables are separated by a blank line to create a grouping.
6180 $matches_comment .= "\n" if $matches_comment;
6182 # The table is named based on the property and value
6183 # combination it is for, like script=greek. But there may be
6184 # a number of synonyms for each side, like 'sc' for 'script',
6185 # and 'grek' for 'greek'. Any combination of these is a valid
6186 # name for this table. In this case, there are three more,
6187 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6188 # listing all possible combinations in the comment, we make
6189 # sure that each synonym occurs at least once, and add
6190 # commentary that the other combinations are possible.
6191 my @property_aliases = $table_property->aliases;
6192 my @table_aliases = $table->aliases;
6194 Carp::my_carp_bug("$table doesn't have any names. Proceeding anyway.") unless @table_aliases;
6196 # The alias lists above are already ordered in the order we
6197 # want to output them. To ensure that each synonym is listed,
6198 # we must use the max of the two numbers.
6199 my $listed_combos = main::max(scalar @table_aliases,
6200 scalar @property_aliases);
6201 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6203 my $property_had_compound_name = 0;
6205 for my $i (0 .. $listed_combos - 1) {
6208 # The current alias for the property is the next one on
6209 # the list, or if beyond the end, start over. Similarly
6210 # for the table (\p{prop=table})
6211 my $property_alias = $property_aliases
6212 [$i % @property_aliases]->name;
6213 my $table_alias_object = $table_aliases
6214 [$i % @table_aliases];
6215 my $table_alias = $table_alias_object->name;
6216 my $loose_match = $table_alias_object->loose_match;
6218 if ($table_alias !~ /\D/) { # Clarify large numbers.
6219 $table_alias = main::clarify_number($table_alias)
6222 # Add a comment for this alias combination
6223 my $current_match_comment;
6224 if ($table_property == $perl) {
6225 $current_match_comment = "\\$perl_p"
6229 $current_match_comment
6230 = "\\p{$property_alias=$table_alias}";
6231 $property_had_compound_name = 1;
6234 # Flag any abnormal status for this table.
6235 my $flag = $property->status
6237 || $table_alias_object->status;
6239 if ($flag ne $PLACEHOLDER) {
6240 $flags{$flag} = $status_past_participles{$flag};
6242 $flags{$flag} = <<END;
6243 a placeholder because it is not in Version $string_version of Unicode, but is
6244 needed by the Perl core to work gracefully. Because it is not in this version
6245 of Unicode, it will not be listed in $pod_file.pod
6252 # Pretty up the comment. Note the \b; it says don't make
6253 # this line a continuation.
6254 $matches_comment .= sprintf("\b%-1s%-s%s\n",
6257 $current_match_comment);
6258 } # End of generating the entries for this table.
6260 # Save these for output after this group of related tables.
6261 push @description, $table->description;
6262 push @note, $table->note;
6263 push @conflicting, $table->conflicting;
6265 # And this for output after all the tables.
6266 push @global_comments, $table->comment;
6268 # Compute an alternate compound name using the final property
6269 # synonym and the first table synonym with a colon instead of
6270 # the equal sign used elsewhere.
6271 if ($property_had_compound_name) {
6272 $properties_with_compound_names ++;
6273 if (! $compound_name || @property_aliases > 1) {
6274 $compound_name = $property_aliases[-1]->name
6276 . $table_aliases[0]->name;
6279 } # End of looping through all children of this table
6281 # Here have assembled in $matches_comment all the related tables
6282 # to the current parent (preceded by the same info for all the
6283 # previous parents). Put out information that applies to all of
6284 # the current family.
6287 # But output the conflicting information now, as it applies to
6289 my $conflicting = join ", ", @conflicting;
6291 $matches_comment .= <<END;
6293 Note that contrary to what you might expect, the above is NOT the same as
6295 $matches_comment .= "any of: " if @conflicting > 1;
6296 $matches_comment .= "$conflicting\n";
6300 $matches_comment .= "\n Meaning: "
6301 . join('; ', @description)
6305 $matches_comment .= "\n Note: "
6306 . join("\n ", @note)
6309 } # End of looping through all tables
6317 $code_points = 'single code point';
6321 $code_points = "$string_count code points";
6326 if ($total_entries <= 1) {
6329 $any_of_these = 'this'
6332 $synonyms = " any of the following regular expression constructs";
6333 $entries = 'entries';
6334 $any_of_these = 'any of these'
6338 if ($has_unrelated) {
6340 This file is for tables that are not necessarily related: To conserve
6341 resources, every table that matches the identical set of code points in this
6342 version of Unicode uses this file. Each one is listed in a separate group
6343 below. It could be that the tables will match the same set of code points in
6344 other Unicode releases, or it could be purely coincidence that they happen to
6345 be the same in Unicode $string_version, and hence may not in other versions.
6351 foreach my $flag (sort keys %flags) {
6353 '$flag' below means that this form is $flags{$flag}.
6355 next if $flag eq $PLACEHOLDER;
6356 $comment .= "Consult $pod_file.pod\n";
6362 This file returns the $code_points in Unicode Version $string_version that
6366 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
6367 including if adding or subtracting white space, underscore, and hyphen
6368 characters matters or doesn't matter, and other permissible syntactic
6369 variants. Upper/lower case distinctions never matter.
6372 if ($compound_name) {
6375 A colon can be substituted for the equals sign, and
6377 if ($properties_with_compound_names > 1) {
6379 within each group above,
6382 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6384 # Note the \b below, it says don't make that line a continuation.
6386 anything to the left of the equals (or colon) can be combined with anything to
6387 the right. Thus, for example,
6393 # And append any comment(s) from the actual tables. They are all
6394 # gathered here, so may not read all that well.
6395 if (@global_comments) {
6396 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
6399 if ($count) { # The format differs if no code points, and needs no
6400 # explanation in that case
6403 The format of the lines of this file is:
6406 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6407 STOP is the ending point, or if omitted, the range has just one code point.
6409 if ($output_range_counts) {
6411 Numbers in comments in [brackets] indicate how many code points are in the
6417 $leader->set_comment(main::join_lines($comment));
6421 # Accessors for the underlying list
6423 get_valid_code_point
6424 get_invalid_code_point
6432 return $self->_range_list->$sub(@_);
6435 } # End closure for Match_Table
6439 # The Property class represents a Unicode property, or the $perl
6440 # pseudo-property. It contains a map table initialized empty at construction
6441 # time, and for properties accessible through regular expressions, various
6442 # match tables, created through the add_match_table() method, and referenced
6443 # by the table('NAME') or tables() methods, the latter returning a list of all
6444 # of the match tables. Otherwise table operations implicitly are for the map
6447 # Most of the data in the property is actually about its map table, so it
6448 # mostly just uses that table's accessors for most methods. The two could
6449 # have been combined into one object, but for clarity because of their
6450 # differing semantics, they have been kept separate. It could be argued that
6451 # the 'file' and 'directory' fields should be kept with the map table.
6453 # Each property has a type. This can be set in the constructor, or in the
6454 # set_type accessor, but mostly it is figured out by the data. Every property
6455 # starts with unknown type, overridden by a parameter to the constructor, or
6456 # as match tables are added, or ranges added to the map table, the data is
6457 # inspected, and the type changed. After the table is mostly or entirely
6458 # filled, compute_type() should be called to finalize they analysis.
6460 # There are very few operations defined. One can safely remove a range from
6461 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
6462 # table to this one, replacing any in the intersection of the two.
6464 sub standardize { return main::standardize($_[0]); }
6465 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
6469 # This hash will contain as keys, all the aliases of all properties, and
6470 # as values, pointers to their respective property objects. This allows
6471 # quick look-up of a property from any of its names.
6472 my %alias_to_property_of;
6474 sub dump_alias_to_property_of {
6477 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
6482 # This is a package subroutine, not called as a method.
6483 # If the single parameter is a literal '*' it returns a list of all
6484 # defined properties.
6485 # Otherwise, the single parameter is a name, and it returns a pointer
6486 # to the corresponding property object, or undef if none.
6488 # Properties can have several different names. The 'standard' form of
6489 # each of them is stored in %alias_to_property_of as they are defined.
6490 # But it's possible that this subroutine will be called with some
6491 # variant, so if the initial lookup fails, it is repeated with the
6492 # standarized form of the input name. If found, besides returning the
6493 # result, the input name is added to the list so future calls won't
6494 # have to do the conversion again.
6498 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6500 if (! defined $name) {
6501 Carp::my_carp_bug("Undefined input property. No action taken.");
6505 return main::uniques(values %alias_to_property_of) if $name eq '*';
6507 # Return cached result if have it.
6508 my $result = $alias_to_property_of{$name};
6509 return $result if defined $result;
6511 # Convert the input to standard form.
6512 my $standard_name = standardize($name);
6514 $result = $alias_to_property_of{$standard_name};
6515 return unless defined $result; # Don't cache undefs
6517 # Cache the result before returning it.
6518 $alias_to_property_of{$name} = $result;
6523 main::setup_package();
6526 # A pointer to the map table object for this property
6527 main::set_access('map', \%map);
6530 # The property's full name. This is a duplicate of the copy kept in the
6531 # map table, but is needed because stringify needs it during
6532 # construction of the map table, and then would have a chicken before egg
6534 main::set_access('full_name', \%full_name, 'r');
6537 # This hash will contain as keys, all the aliases of any match tables
6538 # attached to this property, and as values, the pointers to their
6539 # respective tables. This allows quick look-up of a table from any of its
6541 main::set_access('table_ref', \%table_ref);
6544 # The type of the property, $ENUM, $BINARY, etc
6545 main::set_access('type', \%type, 'r');
6548 # The filename where the map table will go (if actually written).
6549 # Normally defaulted, but can be overridden.
6550 main::set_access('file', \%file, 'r', 's');
6553 # The directory where the map table will go (if actually written).
6554 # Normally defaulted, but can be overridden.
6555 main::set_access('directory', \%directory, 's');
6557 my %pseudo_map_type;
6558 # This is used to affect the calculation of the map types for all the
6559 # ranges in the table. It should be set to one of the values that signify
6560 # to alter the calculation.
6561 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
6563 my %has_only_code_point_maps;
6564 # A boolean used to help in computing the type of data in the map table.
6565 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
6568 # A list of the first few distinct mappings this property has. This is
6569 # used to disambiguate between binary and enum property types, so don't
6570 # have to keep more than three.
6571 main::set_access('unique_maps', \%unique_maps);
6574 # The only required parameter is the positionally first, name. All
6575 # other parameters are key => value pairs. See the documentation just
6576 # above for the meanings of the ones not passed directly on to the map
6577 # table constructor.
6580 my $name = shift || "";
6582 my $self = property_ref($name);
6583 if (defined $self) {
6584 my $options_string = join ", ", @_;
6585 $options_string = ". Ignoring options $options_string" if $options_string;
6586 Carp::my_carp("$self is already in use. Using existing one$options_string;");
6592 $self = bless \do { my $anonymous_scalar }, $class;
6593 my $addr = main::objaddr $self;
6595 $directory{$addr} = delete $args{'Directory'};
6596 $file{$addr} = delete $args{'File'};
6597 $full_name{$addr} = delete $args{'Full_Name'} || $name;
6598 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
6599 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
6600 # Rest of parameters passed on.
6602 $has_only_code_point_maps{$addr} = 1;
6603 $table_ref{$addr} = { };
6604 $unique_maps{$addr} = { };
6606 $map{$addr} = Map_Table->new($name,
6607 Full_Name => $full_name{$addr},
6608 _Alias_Hash => \%alias_to_property_of,
6614 # See this program's beginning comment block about overloading the copy
6615 # constructor. Few operations are defined on properties, but a couple are
6616 # useful. It is safe to take the inverse of a property, and to remove a
6617 # single code point from it.
6620 qw("") => "_operator_stringify",
6621 "." => \&main::_operator_dot,
6622 '==' => \&main::_operator_equal,
6623 '!=' => \&main::_operator_not_equal,
6624 '=' => sub { return shift },
6625 '-=' => "_minus_and_equal",
6628 sub _operator_stringify {
6629 return "Property '" . shift->full_name . "'";
6632 sub _minus_and_equal {
6633 # Remove a single code point from the map table of a property.
6637 my $reversed = shift;
6638 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6641 Carp::my_carp_bug("Can't cope with a "
6643 . " argument to '-='. Subtraction ignored.");
6646 elsif ($reversed) { # Shouldnt happen in a -=, but just in case
6647 Carp::my_carp_bug("Can't cope with a "
6649 . " being the first parameter in a '-='. Subtraction ignored.");
6653 $map{main::objaddr $self}->delete_range($other, $other);
6658 sub add_match_table {
6659 # Add a new match table for this property, with name given by the
6660 # parameter. It returns a pointer to the table.
6666 my $addr = main::objaddr $self;
6668 my $table = $table_ref{$addr}{$name};
6669 my $standard_name = main::standardize($name);
6671 || (defined ($table = $table_ref{$addr}{$standard_name})))
6673 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
6674 $table_ref{$addr}{$name} = $table;
6679 # See if this is a perl extension, if not passed in.
6680 my $perl_extension = delete $args{'Perl_Extension'};
6682 = $self->perl_extension if ! defined $perl_extension;
6684 $table = Match_Table->new(
6686 Perl_Extension => $perl_extension,
6687 _Alias_Hash => $table_ref{$addr},
6690 # gets property's status by default
6691 Status => $self->status,
6692 _Status_Info => $self->status_info,
6694 Internal_Only_Warning => 1); # Override any
6696 return unless defined $table;
6699 # Save the names for quick look up
6700 $table_ref{$addr}{$standard_name} = $table;
6701 $table_ref{$addr}{$name} = $table;
6703 # Perhaps we can figure out the type of this property based on the
6704 # fact of adding this match table. First, string properties don't
6705 # have match tables; second, a binary property can't have 3 match
6707 if ($type{$addr} == $UNKNOWN) {
6708 $type{$addr} = $NON_STRING;
6710 elsif ($type{$addr} == $STRING) {
6711 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
6712 $type{$addr} = $NON_STRING;
6714 elsif ($type{$addr} != $ENUM) {
6715 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
6716 && $type{$addr} == $BINARY)
6718 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
6719 $type{$addr} = $ENUM;
6727 # Return a pointer to the match table (with name given by the
6728 # parameter) associated with this property; undef if none.
6732 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6734 my $addr = main::objaddr $self;
6736 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
6738 # If quick look-up failed, try again using the standard form of the
6739 # input name. If that succeeds, cache the result before returning so
6740 # won't have to standardize this input name again.
6741 my $standard_name = main::standardize($name);
6742 return unless defined $table_ref{$addr}{$standard_name};
6744 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
6745 return $table_ref{$addr}{$name};
6749 # Return a list of pointers to all the match tables attached to this
6752 return main::uniques(values %{$table_ref{main::objaddr shift}});
6756 # Returns the directory the map table for this property should be
6757 # output in. If a specific directory has been specified, that has
6758 # priority; 'undef' is returned if the type isn't defined;
6759 # or $map_directory for everything else.
6761 my $addr = main::objaddr shift;
6763 return $directory{$addr} if defined $directory{$addr};
6764 return undef if $type{$addr} == $UNKNOWN;
6765 return $map_directory;
6769 # Return the name that is used to both:
6770 # 1) Name the file that the map table is written to.
6771 # 2) The name of swash related stuff inside that file.
6772 # The reason for this is that the Perl core historically has used
6773 # certain names that aren't the same as the Unicode property names.
6774 # To continue using these, $file is hard-coded in this file for those,
6775 # but otherwise the standard name is used. This is different from the
6776 # external_name, so that the rest of the files, like in lib can use
6777 # the standard name always, without regard to historical precedent.
6780 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6782 my $addr = main::objaddr $self;
6784 return $file{$addr} if defined $file{$addr};
6785 return $map{$addr}->external_name;
6788 sub to_create_match_tables {
6789 # Returns a boolean as to whether or not match tables should be
6790 # created for this property.
6793 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6795 # The whole point of this pseudo property is match tables.
6796 return 1 if $self == $perl;
6798 my $addr = main::objaddr $self;
6800 # Don't generate tables of code points that match the property values
6801 # of a string property. Such a list would most likely have many
6802 # property values, each with just one or very few code points mapping
6804 return 0 if $type{$addr} == $STRING;
6806 # Don't generate anything for unimplemented properties.
6807 return 0 if grep { $self->complete_name eq $_ }
6808 @unimplemented_properties;
6813 sub property_add_or_replace_non_nulls {
6814 # This adds the mappings in the property $other to $self. Non-null
6815 # mappings from $other override those in $self. It essentially merges
6816 # the two properties, with the second having priority except for null
6821 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6823 if (! $other->isa(__PACKAGE__)) {
6824 Carp::my_carp_bug("$other should be a "
6832 return $map{main::objaddr $self}->
6833 map_add_or_replace_non_nulls($map{main::objaddr $other});
6837 # Set the type of the property. Mostly this is figured out by the
6838 # data in the table. But this is used to set it explicitly. The
6839 # reason it is not a standard accessor is that when setting a binary
6840 # property, we need to make sure that all the true/false aliases are
6841 # present, as they were omitted in early Unicode releases.
6845 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6847 if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
6848 Carp::my_carp("Unrecognized type '$type'. Type not set");
6852 $type{main::objaddr $self} = $type;
6853 return if $type != $BINARY;
6855 my $yes = $self->table('Y');
6856 $yes = $self->table('Yes') if ! defined $yes;
6857 $yes = $self->add_match_table('Y') if ! defined $yes;
6858 $yes->add_alias('Yes');
6859 $yes->add_alias('T');
6860 $yes->add_alias('True');
6862 my $no = $self->table('N');
6863 $no = $self->table('No') if ! defined $no;
6864 $no = $self->add_match_table('N') if ! defined $no;
6865 $no->add_alias('No');
6866 $no->add_alias('F');
6867 $no->add_alias('False');
6872 # Add a map to the property's map table. This also keeps
6873 # track of the maps so that the property type can be determined from
6877 my $start = shift; # First code point in range
6878 my $end = shift; # Final code point in range
6879 my $map = shift; # What the range maps to.
6880 # Rest of parameters passed on.
6882 my $addr = main::objaddr $self;
6884 # If haven't the type of the property, gather information to figure it
6886 if ($type{$addr} == $UNKNOWN) {
6888 # If the map contains an interior blank or dash, or most other
6889 # nonword characters, it will be a string property. This
6890 # heuristic may actually miss some string properties. If so, they
6891 # may need to have explicit set_types called for them. This
6892 # happens in the Unihan properties.
6893 if ($map =~ / (?<= . ) [ -] (?= . ) /x
6894 || $map =~ / [^\w.\/\ -] /x)
6896 $self->set_type($STRING);
6898 # $unique_maps is used for disambiguating between ENUM and
6899 # BINARY later; since we know the property is not going to be
6900 # one of those, no point in keeping the data around
6901 undef $unique_maps{$addr};
6905 # Not necessarily a string. The final decision has to be
6906 # deferred until all the data are in. We keep track of if all
6907 # the values are code points for that eventual decision.
6908 $has_only_code_point_maps{$addr} &=
6909 $map =~ / ^ $code_point_re $/x;
6911 # For the purposes of disambiguating between binary and other
6912 # enumerations at the end, we keep track of the first three
6913 # distinct property values. Once we get to three, we know
6914 # it's not going to be binary, so no need to track more.
6915 if (scalar keys %{$unique_maps{$addr}} < 3) {
6916 $unique_maps{$addr}{main::standardize($map)} = 1;
6921 # Add the mapping by calling our map table's method
6922 return $map{$addr}->add_map($start, $end, $map, @_);
6926 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
6927 # should be called after the property is mostly filled with its maps.
6928 # We have been keeping track of what the property values have been,
6929 # and now have the necessary information to figure out the type.
6932 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6934 my $addr = main::objaddr($self);
6936 my $type = $type{$addr};
6938 # If already have figured these out, no need to do so again, but we do
6939 # a double check on ENUMS to make sure that a string property hasn't
6940 # improperly been classified as an ENUM, so continue on with those.
6941 return if $type == $STRING || $type == $BINARY;
6943 # If every map is to a code point, is a string property.
6944 if ($type == $UNKNOWN
6945 && ($has_only_code_point_maps{$addr}
6946 || (defined $map{$addr}->default_map
6947 && $map{$addr}->default_map eq "")))
6949 $self->set_type($STRING);
6953 # Otherwise, it is to some sort of enumeration. (The case where
6954 # it is a Unicode miscellaneous property, and treated like a
6955 # string in this program is handled in add_map()). Distinguish
6956 # between binary and some other enumeration type. Of course, if
6957 # there are more than two values, it's not binary. But more
6958 # subtle is the test that the default mapping is defined means it
6959 # isn't binary. This in fact may change in the future if Unicode
6960 # changes the way its data is structured. But so far, no binary
6961 # properties ever have @missing lines for them, so the default map
6962 # isn't defined for them. The few properties that are two-valued
6963 # and aren't considered binary have the default map defined
6964 # starting in Unicode 5.0, when the @missing lines appeared; and
6965 # this program has special code to put in a default map for them
6966 # for earlier than 5.0 releases.
6968 || scalar keys %{$unique_maps{$addr}} > 2
6969 || defined $self->default_map)
6971 my $tables = $self->tables;
6972 my $count = $self->count;
6973 if ($verbosity && $count > 500 && $tables/$count > .1) {
6974 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
6976 $self->set_type($ENUM);
6979 $self->set_type($BINARY);
6982 undef $unique_maps{$addr}; # Garbage collect
6986 # Most of the accessors for a property actually apply to its map table.
6987 # Setup up accessor functions for those, referring to %map
7034 # 'property' above is for symmetry, so that one can take
7035 # the property of a property and get itself, and so don't
7036 # have to distinguish between properties and tables in
7043 return $map{main::objaddr $self}->$sub(@_);
7053 # Returns lines of the input joined together, so that they can be folded
7055 # This causes continuation lines to be joined together into one long line
7056 # for folding. A continuation line is any line that doesn't begin with a
7057 # space or "\b" (the latter is stripped from the output). This is so
7058 # lines can be be in a HERE document so as to fit nicely in the terminal
7059 # width, but be joined together in one long line, and then folded with
7060 # indents, '#' prefixes, etc, properly handled.
7061 # A blank separates the joined lines except if there is a break; an extra
7062 # blank is inserted after a period ending a line.
7064 # Intialize the return with the first line.
7065 my ($return, @lines) = split "\n", shift;
7067 # If the first line is null, it was an empty line, add the \n back in
7068 $return = "\n" if $return eq "";
7070 # Now join the remainder of the physical lines.
7071 for my $line (@lines) {
7073 # An empty line means wanted a blank line, so add two \n's to get that
7074 # effect, and go to the next line.
7075 if (length $line == 0) {
7080 # Look at the last character of what we have so far.
7081 my $previous_char = substr($return, -1, 1);
7083 # And at the next char to be output.
7084 my $next_char = substr($line, 0, 1);
7086 if ($previous_char ne "\n") {
7088 # Here didn't end wth a nl. If the next char a blank or \b, it
7089 # means that here there is a break anyway. So add a nl to the
7091 if ($next_char eq " " || $next_char eq "\b") {
7092 $previous_char = "\n";
7093 $return .= $previous_char;
7096 # Add an extra space after periods.
7097 $return .= " " if $previous_char eq '.';
7100 # Here $previous_char is still the latest character to be output. If
7101 # it isn't a nl, it means that the next line is to be a continuation
7102 # line, with a blank inserted between them.
7103 $return .= " " if $previous_char ne "\n";
7106 substr($line, 0, 1) = "" if $next_char eq "\b";
7108 # And append this next line.
7115 sub simple_fold($;$$$) {
7116 # Returns a string of the input (string or an array of strings) folded
7117 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7119 # This is tailored for the kind of text written by this program,
7120 # especially the pod file, which can have very long names with
7121 # underscores in the middle, or words like AbcDefgHij.... We allow
7122 # breaking in the middle of such constructs if the line won't fit
7123 # otherwise. The break in such cases will come either just after an
7124 # underscore, or just before one of the Capital letters.
7126 local $to_trace = 0 if main::DEBUG;
7129 my $prefix = shift; # Optional string to prepend to each output
7131 $prefix = "" unless defined $prefix;
7133 my $hanging_indent = shift; # Optional number of spaces to indent
7134 # continuation lines
7135 $hanging_indent = 0 unless $hanging_indent;
7137 my $right_margin = shift; # Optional number of spaces to narrow the
7139 $right_margin = 0 unless defined $right_margin;
7141 # Call carp with the 'nofold' option to avoid it from trying to call us
7143 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7145 # The space available doesn't include what's automatically prepended
7146 # to each line, or what's reserved on the right.
7147 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7148 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7150 if (DEBUG && $hanging_indent >= $max) {
7151 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7152 $hanging_indent = 0;
7155 # First, split into the current physical lines.
7157 if (ref $line) { # Better be an array, because not bothering to
7159 foreach my $line (@{$line}) {
7160 push @line, split /\n/, $line;
7164 @line = split /\n/, $line;
7167 #local $to_trace = 1 if main::DEBUG;
7168 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7170 # Look at each current physical line.
7171 for (my $i = 0; $i < @line; $i++) {
7172 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7173 #local $to_trace = 1 if main::DEBUG;
7174 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7176 # Remove prefix, because will be added back anyway, don't want
7178 $line[$i] =~ s/^$prefix//;
7180 # Remove trailing space
7181 $line[$i] =~ s/\s+\Z//;
7183 # If the line is too long, fold it.
7184 if (length $line[$i] > $max) {
7187 # Here needs to fold. Save the leading space in the line for
7189 $line[$i] =~ /^ ( \s* )/x;
7190 my $leading_space = $1;
7191 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7193 # If character at final permissible position is white space,
7194 # fold there, which will delete that white space
7195 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7196 $remainder = substr($line[$i], $max);
7197 $line[$i] = substr($line[$i], 0, $max - 1);
7201 # Otherwise fold at an acceptable break char closest to
7202 # the max length. Look at just the maximal initial
7203 # segment of the line
7204 my $segment = substr($line[$i], 0, $max - 1);
7206 /^ ( .{$hanging_indent} # Don't look before the
7208 \ * # Don't look in leading
7209 # blanks past the indent
7210 [^ ] .* # Find the right-most
7211 (?: # acceptable break:
7212 [ \s = ] # space or equal
7213 | - (?! [.0-9] ) # or non-unary minus.
7214 ) # $1 includes the character
7217 # Split into the initial part that fits, and remaining
7219 $remainder = substr($line[$i], length $1);
7221 trace $line[$i] if DEBUG && $to_trace;
7222 trace $remainder if DEBUG && $to_trace;
7225 # If didn't find a good breaking spot, see if there is a
7226 # not-so-good breaking spot. These are just after
7227 # underscores or where the case changes from lower to
7228 # upper. Use \a as a soft hyphen, but give up
7229 # and don't break the line if there is actually a \a
7230 # already in the input. We use an ascii character for the
7231 # soft-hyphen to avoid any attempt by miniperl to try to
7232 # access the files that this program is creating.
7233 elsif ($segment !~ /\a/
7234 && ($segment =~ s/_/_\a/g
7235 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7237 # Here were able to find at least one place to insert
7238 # our substitute soft hyphen. Find the right-most one
7239 # and replace it by a real hyphen.
7240 trace $segment if DEBUG && $to_trace;
7242 rindex($segment, "\a"),
7245 # Then remove the soft hyphen substitutes.
7246 $segment =~ s/\a//g;
7247 trace $segment if DEBUG && $to_trace;
7249 # And split into the initial part that fits, and
7250 # remainder of the line
7251 my $pos = rindex($segment, '-');
7252 $remainder = substr($line[$i], $pos);
7253 trace $remainder if DEBUG && $to_trace;
7254 $line[$i] = substr($segment, 0, $pos + 1);
7258 # Here we know if we can fold or not. If we can, $remainder
7259 # is what remains to be processed in the next iteration.
7260 if (defined $remainder) {
7261 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7263 # Insert the folded remainder of the line as a new element
7264 # of the array. (It may still be too long, but we will
7265 # deal with that next time through the loop.) Omit any
7266 # leading space in the remainder.
7267 $remainder =~ s/^\s+//;
7268 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7270 # But then indent by whichever is larger of:
7271 # 1) the leading space on the input line;
7272 # 2) the hanging indent.
7273 # This preserves indentation in the original line.
7274 my $lead = ($leading_space)
7275 ? length $leading_space
7277 $lead = max($lead, $hanging_indent);
7278 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7282 # Ready to output the line. Get rid of any trailing space
7283 # And prefix by the required $prefix passed in.
7284 $line[$i] =~ s/\s+$//;
7285 $line[$i] = "$prefix$line[$i]\n";
7286 } # End of looping through all the lines.
7288 return join "", @line;
7291 sub property_ref { # Returns a reference to a property object.
7292 return Property::property_ref(@_);
7295 sub force_unlink ($) {
7296 my $filename = shift;
7297 return unless file_exists($filename);
7298 return if CORE::unlink($filename);
7300 # We might need write permission
7301 chmod 0777, $filename;
7302 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
7307 # Given a filename and a reference to an array of lines, write the lines
7309 # Filename can be given as an arrayref of directory names
7312 my $lines_ref = shift;
7313 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7315 if (! defined $lines_ref) {
7316 Carp::my_carp("Missing lines to write parameter for $file. Writing skipped;");
7320 # Get into a single string if an array, and get rid of, in Unix terms, any
7322 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7323 $file = File::Spec->canonpath($file);
7325 # If has directories, make sure that they all exist
7326 (undef, my $directories, undef) = File::Spec->splitpath($file);
7327 File::Path::mkpath($directories) if $directories && ! -d $directories;
7329 push @files_actually_output, $file;
7333 $text = join "", @$lines_ref;
7337 Carp::my_carp("Output file '$file' is empty; writing it anyway;");
7340 force_unlink ($file);
7343 if (not open $OUT, ">", $file) {
7344 Carp::my_carp("can't open $file for output. Skipping this file: $!");
7347 print "$file written.\n" if $verbosity >= $VERBOSE;
7355 sub Standardize($) {
7356 # This converts the input name string into a standardized equivalent to
7360 unless (defined $name) {
7361 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
7365 # Remove any leading or trailing white space
7369 # Convert interior white space and hypens into underscores.
7370 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7372 # Capitalize the letter following an underscore, and convert a sequence of
7373 # multiple underscores to a single one
7374 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7376 # And capitalize the first letter, but not for the special cjk ones.
7377 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7381 sub standardize ($) {
7382 # Returns a lower-cased standardized name, without underscores. This form
7383 # is chosen so that it can distinguish between any real versus superficial
7384 # Unicode name differences. It relies on the fact that Unicode doesn't
7385 # have interior underscores, white space, nor dashes in any
7386 # stricter-matched name. It should not be used on Unicode code point
7387 # names (the Name property), as they mostly, but not always follow these
7390 my $name = Standardize(shift);
7391 return if !defined $name;
7393 $name =~ s/ (?<= .) _ (?= . ) //xg;
7399 my $indent_increment = " " x 2;
7402 $main::simple_dumper_nesting = 0;
7405 # Like Simple Data::Dumper. Good enough for our needs. We can't use
7406 # the real thing as we have to run under miniperl.
7408 # It is designed so that on input it is at the beginning of a line,
7409 # and the final thing output in any call is a trailing ",\n".
7413 $indent = "" if ! defined $indent;
7415 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7417 # nesting level is localized, so that as the call stack pops, it goes
7418 # back to the prior value.
7419 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
7420 undef %already_output if $main::simple_dumper_nesting == 0;
7421 $main::simple_dumper_nesting++;
7422 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
7424 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7426 # Determine the indent for recursive calls.
7427 my $next_indent = $indent . $indent_increment;
7432 # Dump of scalar: just output it in quotes if not a number. To do
7433 # so we must escape certain characters, and therefore need to
7434 # operate on a copy to avoid changing the original
7436 $copy = $UNDEF unless defined $copy;
7438 # Quote non-numbers (numbers also have optional leading '-' and
7440 if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
7442 # Escape apostrophe and backslash
7443 $copy =~ s/ ( ['\\] ) /\\$1/xg;
7446 $output = "$indent$copy,\n";
7450 # Keep track of cycles in the input, and refuse to infinitely loop
7451 if (defined $already_output{main::objaddr $item}) {
7452 return "${indent}ALREADY OUTPUT: $item\n";
7454 $already_output{main::objaddr $item} = $item;
7456 if (ref $item eq 'ARRAY') {
7459 if ($main::simple_dumper_nesting > 1) {
7461 $using_brackets = 1;
7464 $using_brackets = 0;
7467 # If the array is empty, put the closing bracket on the same
7468 # line. Otherwise, recursively add each array element
7474 for (my $i = 0; $i < @$item; $i++) {
7476 # Indent array elements one level
7477 $output .= &simple_dumper($item->[$i], $next_indent);
7478 $output =~ s/\n$//; # Remove trailing nl so as to
7479 $output .= " # [$i]\n"; # add a comment giving the
7482 $output .= $indent; # Indent closing ']' to orig level
7484 $output .= ']' if $using_brackets;
7487 elsif (ref $item eq 'HASH') {
7492 # No surrounding braces at top level
7494 if ($main::simple_dumper_nesting > 1) {
7497 $body_indent = $next_indent;
7498 $next_indent .= $indent_increment;
7503 $body_indent = $indent;
7507 # Output hashes sorted alphabetically instead of apparently
7508 # random. Use caseless alphabetic sort
7509 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
7511 if ($is_first_line) {
7515 $output .= "$body_indent";
7518 # The key must be a scalar, but this recursive call quotes
7520 $output .= &simple_dumper($key);
7522 # And change the trailing comma and nl to the hash fat
7523 # comma for clarity, and so the value can be on the same
7525 $output =~ s/,\n$/ => /;
7527 # Recursively call to get the value's dump.
7528 my $next = &simple_dumper($item->{$key}, $next_indent);
7530 # If the value is all on one line, remove its indent, so
7531 # will follow the => immediately. If it takes more than
7532 # one line, start it on a new line.
7533 if ($next !~ /\n.*\n/) {
7542 $output .= "$indent},\n" if $using_braces;
7544 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
7545 $output = $indent . ref($item) . "\n";
7546 # XXX see if blessed
7548 elsif ($item->can('dump')) {
7550 # By convention in this program, objects furnish a 'dump'
7551 # method. Since not doing any output at this level, just pass
7552 # on the input indent
7553 $output = $item->dump($indent);
7556 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
7563 sub dump_inside_out {
7564 # Dump inside-out hashes in an object's state by converting them to a
7565 # regular hash and then calling simple_dumper on that.
7568 my $fields_ref = shift;
7569 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7571 my $addr = main::objaddr $object;
7574 foreach my $key (keys %$fields_ref) {
7575 $hash{$key} = $fields_ref->{$key}{$addr};
7578 return simple_dumper(\%hash, @_);
7582 # Overloaded '.' method that is common to all packages. It uses the
7583 # package's stringify method.
7587 my $reversed = shift;
7588 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7590 $other = "" unless defined $other;
7592 foreach my $which (\$self, \$other) {
7593 next unless ref $$which;
7594 if ($$which->can('_operator_stringify')) {
7595 $$which = $$which->_operator_stringify;
7598 my $ref = ref $$which;
7599 my $addr = main::objaddr $$which;
7600 $$which = "$ref ($addr)";
7608 sub _operator_equal {
7609 # Generic overloaded '==' routine. To be equal, they must be the exact
7615 return 0 unless defined $other;
7616 return 0 unless ref $other;
7617 return main::objaddr $self == main::objaddr $other;
7620 sub _operator_not_equal {
7624 return ! _operator_equal($self, $other);
7627 sub process_PropertyAliases($) {
7628 # This reads in the PropertyAliases.txt file, which contains almost all
7629 # the character properties in Unicode and their equivalent aliases:
7630 # scf ; Simple_Case_Folding ; sfc
7632 # Field 0 is the preferred short name for the property.
7633 # Field 1 is the full name.
7634 # Any succeeding ones are other accepted names.
7637 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7639 # This whole file was non-existent in early releases, so use our own
7641 $file->insert_lines(get_old_property_aliases())
7642 if ! -e 'PropertyAliases.txt';
7644 # Add any cjk properties that may have been defined.
7645 $file->insert_lines(@cjk_properties);
7647 while ($file->next_line) {
7649 my @data = split /\s*;\s*/;
7651 my $full = $data[1];
7653 my $this = Property->new($data[0], Full_Name => $full);
7655 # Start looking for more aliases after these two.
7656 for my $i (2 .. @data - 1) {
7657 $this->add_alias($data[$i]);
7664 sub finish_property_setup {
7665 # Finishes setting up after PropertyAliases.
7668 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7670 # This entry was missing from this file in earlier Unicode versions
7671 if (-e 'Jamo.txt') {
7672 my $jsn = property_ref('JSN');
7673 if (! defined $jsn) {
7674 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
7678 # This entry is still missing as of 5.2, perhaps because no short name for
7680 if (-e 'NameAliases.txt') {
7681 my $aliases = property_ref('Name_Alias');
7682 if (! defined $aliases) {
7683 $aliases = Property->new('Name_Alias');
7687 # These are used so much, that we set globals for them.
7688 $gc = property_ref('General_Category');
7689 $block = property_ref('Block');
7691 # Perl adds this alias.
7692 $gc->add_alias('Category');
7694 # For backwards compatibility, these property files have particular names.
7695 my $upper = property_ref('Uppercase_Mapping');
7696 $upper->set_core_access('uc()');
7697 $upper->set_file('Upper'); # This is what utf8.c calls it
7699 my $lower = property_ref('Lowercase_Mapping');
7700 $lower->set_core_access('lc()');
7701 $lower->set_file('Lower');
7703 my $title = property_ref('Titlecase_Mapping');
7704 $title->set_core_access('ucfirst()');
7705 $title->set_file('Title');
7707 my $fold = property_ref('Case_Folding');
7708 $fold->set_file('Fold') if defined $fold;
7710 # utf8.c can't currently cope with non range-size-1 for these, and even if
7711 # it were changed to do so, someone else may be using them, expecting the
7713 foreach my $property (qw {
7720 property_ref($property)->set_range_size_1(1);
7723 # These two properties aren't actually used in the core, but unfortunately
7724 # the names just above that are in the core interfere with these, so
7725 # choose different names. These aren't a problem unless the map tables
7726 # for these files get written out.
7727 my $lowercase = property_ref('Lowercase');
7728 $lowercase->set_file('IsLower') if defined $lowercase;
7729 my $uppercase = property_ref('Uppercase');
7730 $uppercase->set_file('IsUpper') if defined $uppercase;
7732 # Set up the hard-coded default mappings, but only on properties defined
7734 foreach my $property (keys %default_mapping) {
7735 my $property_object = property_ref($property);
7736 next if ! defined $property_object;
7737 my $default_map = $default_mapping{$property};
7738 $property_object->set_default_map($default_map);
7740 # A map of <code point> implies the property is string.
7741 if ($property_object->type == $UNKNOWN
7742 && $default_map eq $CODE_POINT)
7744 $property_object->set_type($STRING);
7748 # The following use the Multi_Default class to create objects for
7751 # Bidi class has a complicated default, but the derived file takes care of
7752 # the complications, leaving just 'L'.
7753 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
7754 property_ref('Bidi_Class')->set_default_map('L');
7759 # The derived file was introduced in 3.1.1. The values below are
7760 # taken from table 3-8, TUS 3.0
7762 'my $default = Range_List->new;
7763 $default->add_range(0x0590, 0x05FF);
7764 $default->add_range(0xFB1D, 0xFB4F);'
7767 # The defaults apply only to unassigned characters
7768 $default_R .= '$gc->table("Cn") & $default;';
7770 if ($v_version lt v3.0.0) {
7771 $default = Multi_Default->new(R => $default_R, 'L');
7775 # AL apparently not introduced until 3.0: TUS 2.x references are
7776 # not on-line to check it out
7778 'my $default = Range_List->new;
7779 $default->add_range(0x0600, 0x07BF);
7780 $default->add_range(0xFB50, 0xFDFF);
7781 $default->add_range(0xFE70, 0xFEFF);'
7784 # Non-character code points introduced in this release; aren't AL
7785 if ($v_version ge 3.1.0) {
7786 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
7788 $default_AL .= '$gc->table("Cn") & $default';
7789 $default = Multi_Default->new(AL => $default_AL,
7793 property_ref('Bidi_Class')->set_default_map($default);
7796 # Joining type has a complicated default, but the derived file takes care
7797 # of the complications, leaving just 'U' (or Non_Joining), except the file
7799 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
7800 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
7801 property_ref('Joining_Type')->set_default_map('Non_Joining');
7805 # Otherwise, there are not one, but two possibilities for the
7806 # missing defaults: T and U.
7807 # The missing defaults that evaluate to T are given by:
7808 # T = Mn + Cf - ZWNJ - ZWJ
7809 # where Mn and Cf are the general category values. In other words,
7810 # any non-spacing mark or any format control character, except
7811 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
7812 # WIDTH JOINER (joining type C).
7813 my $default = Multi_Default->new(
7814 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
7816 property_ref('Joining_Type')->set_default_map($default);
7820 # Line break has a complicated default in early releases. It is 'Unknown'
7821 # for non-assigned code points; 'AL' for assigned.
7822 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
7823 my $lb = property_ref('Line_Break');
7824 if ($v_version gt 3.2.0) {
7825 $lb->set_default_map('Unknown');
7828 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
7830 $lb->set_default_map($default);
7833 # If has the URS property, make sure that the standard aliases are in
7834 # it, since not in the input tables in some versions.
7835 my $urs = property_ref('Unicode_Radical_Stroke');
7837 $urs->add_alias('cjkRSUnicode');
7838 $urs->add_alias('kRSUnicode');
7844 sub get_old_property_aliases() {
7845 # Returns what would be in PropertyAliases.txt if it existed in very old
7846 # versions of Unicode. It was derived from the one in 3.2, and pared
7847 # down based on the data that was actually in the older releases.
7848 # An attempt was made to use the existence of files to mean inclusion or
7849 # not of various aliases, but if this was not sufficient, using version
7850 # numbers was resorted to.
7854 # These are to be used in all versions (though some are constructed by
7855 # this program if missing)
7856 push @return, split /\n/, <<'END';
7858 Bidi_M ; Bidi_Mirrored
7860 ccc ; Canonical_Combining_Class
7861 dm ; Decomposition_Mapping
7862 dt ; Decomposition_Type
7863 gc ; General_Category
7865 lc ; Lowercase_Mapping
7867 na1 ; Unicode_1_Name
7870 sfc ; Simple_Case_Folding
7871 slc ; Simple_Lowercase_Mapping
7872 stc ; Simple_Titlecase_Mapping
7873 suc ; Simple_Uppercase_Mapping
7874 tc ; Titlecase_Mapping
7875 uc ; Uppercase_Mapping
7878 if (-e 'Blocks.txt') {
7879 push @return, "blk ; Block\n";
7881 if (-e 'ArabicShaping.txt') {
7882 push @return, split /\n/, <<'END';
7887 if (-e 'PropList.txt') {
7889 # This first set is in the original old-style proplist.
7890 push @return, split /\n/, <<'END';
7892 Bidi_C ; Bidi_Control
7900 Join_C ; Join_Control
7902 QMark ; Quotation_Mark
7903 Term ; Terminal_Punctuation
7904 WSpace ; White_Space
7906 # The next sets were added later
7907 if ($v_version ge v3.0.0) {
7908 push @return, split /\n/, <<'END';
7913 if ($v_version ge v3.0.1) {
7914 push @return, split /\n/, <<'END';
7915 NChar ; Noncharacter_Code_Point
7918 # The next sets were added in the new-style
7919 if ($v_version ge v3.1.0) {
7920 push @return, split /\n/, <<'END';
7921 OAlpha ; Other_Alphabetic
7922 OLower ; Other_Lowercase
7924 OUpper ; Other_Uppercase
7927 if ($v_version ge v3.1.1) {
7928 push @return, "AHex ; ASCII_Hex_Digit\n";
7931 if (-e 'EastAsianWidth.txt') {
7932 push @return, "ea ; East_Asian_Width\n";
7934 if (-e 'CompositionExclusions.txt') {
7935 push @return, "CE ; Composition_Exclusion\n";
7937 if (-e 'LineBreak.txt') {
7938 push @return, "lb ; Line_Break\n";
7940 if (-e 'BidiMirroring.txt') {
7941 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
7943 if (-e 'Scripts.txt') {
7944 push @return, "sc ; Script\n";
7946 if (-e 'DNormalizationProps.txt') {
7947 push @return, split /\n/, <<'END';
7948 Comp_Ex ; Full_Composition_Exclusion
7949 FC_NFKC ; FC_NFKC_Closure
7950 NFC_QC ; NFC_Quick_Check
7951 NFD_QC ; NFD_Quick_Check
7952 NFKC_QC ; NFKC_Quick_Check
7953 NFKD_QC ; NFKD_Quick_Check
7954 XO_NFC ; Expands_On_NFC
7955 XO_NFD ; Expands_On_NFD
7956 XO_NFKC ; Expands_On_NFKC
7957 XO_NFKD ; Expands_On_NFKD
7960 if (-e 'DCoreProperties.txt') {
7961 push @return, split /\n/, <<'END';
7966 # These can also appear in some versions of PropList.txt
7967 push @return, "Lower ; Lowercase\n"
7968 unless grep { $_ =~ /^Lower\b/} @return;
7969 push @return, "Upper ; Uppercase\n"
7970 unless grep { $_ =~ /^Upper\b/} @return;
7973 # This flag requires the DAge.txt file to be copied into the directory.
7974 if (DEBUG && $compare_versions) {
7975 push @return, 'age ; Age';
7981 sub process_PropValueAliases {
7982 # This file contains values that properties look like:
7983 # bc ; AL ; Arabic_Letter
7984 # blk; n/a ; Greek_And_Coptic ; Greek
7986 # Field 0 is the property.
7987 # Field 1 is the short name of a property value or 'n/a' if no
7988 # short name exists;
7989 # Field 2 is the full property value name;
7990 # Any other fields are more synonyms for the property value.
7991 # Purely numeric property values are omitted from the file; as are some
7992 # others, fewer and fewer in later releases
7994 # Entries for the ccc property have an extra field before the
7996 # ccc; 0; NR ; Not_Reordered
7997 # It is the numeric value that the names are synonyms for.
7999 # There are comment entries for values missing from this file:
8000 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8001 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8004 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8006 # This whole file was non-existent in early releases, so use our own
8007 # internal one if necessary.
8008 if (! -e 'PropValueAliases.txt') {
8009 $file->insert_lines(get_old_property_value_aliases());
8012 # Add any explicit cjk values
8013 $file->insert_lines(@cjk_property_values);
8015 # This line is used only for testing the code that checks for name
8016 # conflicts. There is a script Inherited, and when this line is executed
8017 # it causes there to be a name conflict with the 'Inherited' that this
8018 # program generates for this block property value
8019 #$file->insert_lines('blk; n/a; Herited');
8022 # Process each line of the file ...
8023 while ($file->next_line) {
8025 my ($property, @data) = split /\s*;\s*/;
8027 # The full name for the ccc property value is in field 2 of the
8028 # remaining ones; field 1 for all other properties. Swap ccc fields 1
8029 # and 2. (Rightmost splice removes field 2, returning it; left splice
8030 # inserts that into field 1, thus shifting former field 1 to field 2.)
8031 splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
8033 # If there is no short name, use the full one in element 1
8034 $data[0] = $data[1] if $data[0] eq "n/a";
8036 # Earlier releases had the pseudo property 'qc' that should expand to
8037 # the ones that replace it below.
8038 if ($property eq 'qc') {
8039 if (lc $data[0] eq 'y') {
8040 $file->insert_lines('NFC_QC; Y ; Yes',
8046 elsif (lc $data[0] eq 'n') {
8047 $file->insert_lines('NFC_QC; N ; No',
8053 elsif (lc $data[0] eq 'm') {
8054 $file->insert_lines('NFC_QC; M ; Maybe',
8055 'NFKC_QC; M ; Maybe',
8059 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8064 # The first field is the short name, 2nd is the full one.
8065 my $property_object = property_ref($property);
8066 my $table = $property_object->add_match_table($data[0],
8067 Full_Name => $data[1]);
8069 # Start looking for more aliases after these two.
8070 for my $i (2 .. @data - 1) {
8071 $table->add_alias($data[$i]);
8073 } # End of looping through the file
8075 # As noted in the comments early in the program, it generates tables for
8076 # the default values for all releases, even those for which the concept
8077 # didn't exist at the time. Here we add those if missing.
8078 my $age = property_ref('age');
8079 if (defined $age && ! defined $age->table('Unassigned')) {
8080 $age->add_match_table('Unassigned');
8082 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8083 && ! defined $block->table('No_Block');
8086 # Now set the default mappings of the properties from the file. This is
8087 # done after the loop because a number of properties have only @missings
8088 # entries in the file, and may not show up until the end.
8089 my @defaults = $file->get_missings;
8090 foreach my $default_ref (@defaults) {
8091 my $default = $default_ref->[0];
8092 my $property = property_ref($default_ref->[1]);
8093 $property->set_default_map($default);
8098 sub get_old_property_value_aliases () {
8099 # Returns what would be in PropValueAliases.txt if it existed in very old
8100 # versions of Unicode. It was derived from the one in 3.2, and pared
8101 # down. An attempt was made to use the existence of files to mean
8102 # inclusion or not of various aliases, but if this was not sufficient,
8103 # using version numbers was resorted to.
8105 my @return = split /\n/, <<'END';
8106 bc ; AN ; Arabic_Number
8107 bc ; B ; Paragraph_Separator
8108 bc ; CS ; Common_Separator
8109 bc ; EN ; European_Number
8110 bc ; ES ; European_Separator
8111 bc ; ET ; European_Terminator
8112 bc ; L ; Left_To_Right
8113 bc ; ON ; Other_Neutral
8114 bc ; R ; Right_To_Left
8115 bc ; WS ; White_Space
8117 # The standard combining classes are very much different in v1, so only use
8118 # ones that look right (not checked thoroughly)
8119 ccc; 0; NR ; Not_Reordered
8120 ccc; 1; OV ; Overlay
8122 ccc; 8; KV ; Kana_Voicing
8124 ccc; 202; ATBL ; Attached_Below_Left
8125 ccc; 216; ATAR ; Attached_Above_Right
8126 ccc; 218; BL ; Below_Left
8128 ccc; 222; BR ; Below_Right
8130 ccc; 228; AL ; Above_Left
8132 ccc; 232; AR ; Above_Right
8133 ccc; 234; DA ; Double_Above
8135 dt ; can ; canonical
8149 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
8151 gc ; Cn ; Unassigned
8152 gc ; Co ; Private_Use
8153 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
8154 gc ; LC ; Cased_Letter # Ll | Lt | Lu
8155 gc ; Ll ; Lowercase_Letter
8156 gc ; Lm ; Modifier_Letter
8157 gc ; Lo ; Other_Letter
8158 gc ; Lu ; Uppercase_Letter
8159 gc ; M ; Mark # Mc | Me | Mn
8160 gc ; Mc ; Spacing_Mark
8161 gc ; Mn ; Nonspacing_Mark
8162 gc ; N ; Number # Nd | Nl | No
8163 gc ; Nd ; Decimal_Number
8164 gc ; No ; Other_Number
8165 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
8166 gc ; Pd ; Dash_Punctuation
8167 gc ; Pe ; Close_Punctuation
8168 gc ; Po ; Other_Punctuation
8169 gc ; Ps ; Open_Punctuation
8170 gc ; S ; Symbol # Sc | Sk | Sm | So
8171 gc ; Sc ; Currency_Symbol
8172 gc ; Sm ; Math_Symbol
8173 gc ; So ; Other_Symbol
8174 gc ; Z ; Separator # Zl | Zp | Zs
8175 gc ; Zl ; Line_Separator
8176 gc ; Zp ; Paragraph_Separator
8177 gc ; Zs ; Space_Separator
8185 if (-e 'ArabicShaping.txt') {
8186 push @return, split /\n/, <<'END';
8193 jg ; n/a ; NO_JOINING_GROUP
8201 jt ; C ; Join_Causing
8202 jt ; D ; Dual_Joining
8203 jt ; L ; Left_Joining
8204 jt ; R ; Right_Joining
8205 jt ; U ; Non_Joining
8206 jt ; T ; Transparent
8208 if ($v_version ge v3.0.0) {
8209 push @return, split /\n/, <<'END';
8213 jg ; n/a ; DALATH_RISH
8216 jg ; n/a ; FINAL_SEMKATH
8219 jg ; n/a ; HAMZA_ON_HEH_GOAL
8226 jg ; n/a ; KNOTTED_HEH
8233 jg ; n/a ; REVERSED_PE
8237 jg ; n/a ; SWASH_KAF
8239 jg ; n/a ; TEH_MARBUTA
8242 jg ; n/a ; YEH_BARREE
8243 jg ; n/a ; YEH_WITH_TAIL
8252 if (-e 'EastAsianWidth.txt') {
8253 push @return, split /\n/, <<'END';
8263 if (-e 'LineBreak.txt') {
8264 push @return, split /\n/, <<'END';
8266 lb ; AL ; Alphabetic
8267 lb ; B2 ; Break_Both
8268 lb ; BA ; Break_After
8269 lb ; BB ; Break_Before
8270 lb ; BK ; Mandatory_Break
8271 lb ; CB ; Contingent_Break
8272 lb ; CL ; Close_Punctuation
8273 lb ; CM ; Combining_Mark
8274 lb ; CR ; Carriage_Return
8275 lb ; EX ; Exclamation
8278 lb ; ID ; Ideographic
8279 lb ; IN ; Inseperable
8280 lb ; IS ; Infix_Numeric
8282 lb ; NS ; Nonstarter
8284 lb ; OP ; Open_Punctuation
8285 lb ; PO ; Postfix_Numeric
8286 lb ; PR ; Prefix_Numeric
8288 lb ; SA ; Complex_Context
8291 lb ; SY ; Break_Symbols
8297 if (-e 'DNormalizationProps.txt') {
8298 push @return, split /\n/, <<'END';
8305 if (-e 'Scripts.txt') {
8306 push @return, split /\n/, <<'END';
8308 sc ; Armn ; Armenian
8310 sc ; Bopo ; Bopomofo
8311 sc ; Cans ; Canadian_Aboriginal
8312 sc ; Cher ; Cherokee
8313 sc ; Cyrl ; Cyrillic
8314 sc ; Deva ; Devanagari
8316 sc ; Ethi ; Ethiopic
8317 sc ; Geor ; Georgian
8320 sc ; Gujr ; Gujarati
8321 sc ; Guru ; Gurmukhi
8325 sc ; Hira ; Hiragana
8326 sc ; Ital ; Old_Italic
8327 sc ; Kana ; Katakana
8332 sc ; Mlym ; Malayalam
8333 sc ; Mong ; Mongolian
8337 sc ; Qaai ; Inherited
8351 if ($v_version ge v2.0.0) {
8352 push @return, split /\n/, <<'END';
8356 dt ; vert ; vertical
8361 gc ; Lt ; Titlecase_Letter
8362 gc ; Me ; Enclosing_Mark
8363 gc ; Nl ; Letter_Number
8364 gc ; Pc ; Connector_Punctuation
8365 gc ; Sk ; Modifier_Symbol
8368 if ($v_version ge v2.1.2) {
8369 push @return, "bc ; S ; Segment_Separator\n";
8371 if ($v_version ge v2.1.5) {
8372 push @return, split /\n/, <<'END';
8373 gc ; Pf ; Final_Punctuation
8374 gc ; Pi ; Initial_Punctuation
8377 if ($v_version ge v2.1.8) {
8378 push @return, "ccc; 240; IS ; Iota_Subscript\n";
8381 if ($v_version ge v3.0.0) {
8382 push @return, split /\n/, <<'END';
8383 bc ; AL ; Arabic_Letter
8384 bc ; BN ; Boundary_Neutral
8385 bc ; LRE ; Left_To_Right_Embedding
8386 bc ; LRO ; Left_To_Right_Override
8387 bc ; NSM ; Nonspacing_Mark
8388 bc ; PDF ; Pop_Directional_Format
8389 bc ; RLE ; Right_To_Left_Embedding
8390 bc ; RLO ; Right_To_Left_Override
8392 ccc; 233; DB ; Double_Below
8396 if ($v_version ge v3.1.0) {
8397 push @return, "ccc; 226; R ; Right\n";
8404 # This is used to store the range list of all the code points usable when
8405 # the little used $compare_versions feature is enabled.
8406 my $compare_versions_range_list;
8408 sub process_generic_property_file {
8409 # This processes a file containing property mappings and puts them
8410 # into internal map tables. It should be used to handle any property
8411 # files that have mappings from a code point or range thereof to
8412 # something else. This means almost all the UCD .txt files.
8413 # each_line_handlers() should be set to adjust the lines of these
8414 # files, if necessary, to what this routine understands:
8419 # the fields are: "codepoint range ; property; map"
8421 # meaning the codepoints in the range all have the value 'map' under
8423 # Beginning and trailing white space in each field are not signficant.
8424 # Note there is not a trailing semi-colon in the above. A trailing
8425 # semi-colon means the map is a null-string. An omitted map, as
8426 # opposed to a null-string, is assumed to be 'Y', based on Unicode
8427 # table syntax. (This could have been hidden from this routine by
8428 # doing it in the $file object, but that would require parsing of the
8429 # line there, so would have to parse it twice, or change the interface
8430 # to pass this an array. So not done.)
8432 # The map field may begin with a sequence of commands that apply to
8433 # this range. Each such command begins and ends with $CMD_DELIM.
8434 # These are used to indicate, for example, that the mapping for a
8435 # range has a non-default type.
8437 # This loops through the file, calling it's next_line() method, and
8438 # then taking the map and adding it to the property's table.
8439 # Complications arise because any number of properties can be in the
8440 # file, in any order, interspersed in any way. The first time a
8441 # property is seen, it gets information about that property and
8442 # caches it for quick retrieval later. It also normalizes the maps
8443 # so that only one of many synonym is stored. The Unicode input files
8444 # do use some multiple synonyms.
8447 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8449 my %property_info; # To keep track of what properties
8450 # have already had entries in the
8451 # current file, and info about each,
8452 # so don't have to recompute.
8453 my $property_name; # property currently being worked on
8454 my $property_type; # and its type
8455 my $previous_property_name = ""; # name from last time through loop
8456 my $property_object; # pointer to the current property's
8458 my $property_addr; # the address of that object
8459 my $default_map; # the string that code points missing
8460 # from the file map to
8461 my $default_table; # For non-string properties, a
8462 # reference to the match table that
8463 # will contain the list of code
8464 # points that map to $default_map.
8466 # Get the next real non-comment line
8468 while ($file->next_line) {
8470 # Default replacement type; means that if parts of the range have
8471 # already been stored in our tables, the new map overrides them if
8472 # they differ more than cosmetically
8473 my $replace = $IF_NOT_EQUIVALENT;
8474 my $map_type; # Default type for the map of this range
8476 #local $to_trace = 1 if main::DEBUG;
8477 trace $_ if main::DEBUG && $to_trace;
8479 # Split the line into components
8480 my ($range, $property_name, $map, @remainder)
8481 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
8483 # If more or less on the line than we are expecting, warn and skip
8486 $file->carp_bad_line('Extra fields');
8489 elsif ( ! defined $property_name) {
8490 $file->carp_bad_line('Missing property');
8494 # Examine the range.
8495 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
8497 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
8501 my $high = (defined $2) ? hex $2 : $low;
8503 # For the very specialized case of comparing two Unicode
8505 if (DEBUG && $compare_versions) {
8506 if ($property_name eq 'Age') {
8508 # Only allow code points at least as old as the version
8510 my $age = pack "C*", split(/\./, $map); # v string
8511 next LINE if $age gt $compare_versions;
8515 # Again, we throw out code points younger than those of
8516 # the specified version. By now, the Age property is
8517 # populated. We use the intersection of each input range
8518 # with this property to find what code points in it are
8519 # valid. To do the intersection, we have to convert the
8520 # Age property map to a Range_list. We only have to do
8522 if (! defined $compare_versions_range_list) {
8523 my $age = property_ref('Age');
8524 if (! -e 'DAge.txt') {
8525 croak "Need to have 'DAge.txt' file to do version comparison";
8527 elsif ($age->count == 0) {
8528 croak "The 'Age' table is empty, but its file exists";
8530 $compare_versions_range_list
8531 = Range_List->new(Initialize => $age);
8534 # An undefined map is always 'Y'
8535 $map = 'Y' if ! defined $map;
8537 # Calculate the intersection of the input range with the
8538 # code points that are known in the specified version
8539 my @ranges = ($compare_versions_range_list
8540 & Range->new($low, $high))->ranges;
8542 # If the intersection is empty, throw away this range
8543 next LINE unless @ranges;
8545 # Only examine the first range this time through the loop.
8546 my $this_range = shift @ranges;
8548 # Put any remaining ranges in the queue to be processed
8549 # later. Note that there is unnecessary work here, as we
8550 # will do the intersection again for each of these ranges
8551 # during some future iteration of the LINE loop, but this
8552 # code is not used in production. The later intersections
8553 # are guaranteed to not splinter, so this will not become
8555 my $line = join ';', $property_name, $map;
8556 foreach my $range (@ranges) {
8557 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
8563 # And process the first range, like any other.
8564 $low = $this_range->start;
8565 $high = $this_range->end;
8567 } # End of $compare_versions
8569 # If changing to a new property, get the things constant per
8571 if ($previous_property_name ne $property_name) {
8573 $property_object = property_ref($property_name);
8574 if (! defined $property_object) {
8575 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
8578 $property_addr = main::objaddr($property_object);
8580 # Defer changing names until have a line that is acceptable
8581 # (the 'next' statement above means is unacceptable)
8582 $previous_property_name = $property_name;
8584 # If not the first time for this property, retrieve info about
8586 if (defined ($property_info{$property_addr}{'type'})) {
8587 $property_type = $property_info{$property_addr}{'type'};
8588 $default_map = $property_info{$property_addr}{'default'};
8590 = $property_info{$property_addr}{'pseudo_map_type'};
8592 = $property_info{$property_addr}{'default_table'};
8596 # Here, is the first time for this property. Set up the
8598 $property_type = $property_info{$property_addr}{'type'}
8599 = $property_object->type;
8601 = $property_info{$property_addr}{'pseudo_map_type'}
8602 = $property_object->pseudo_map_type;
8604 # The Unicode files are set up so that if the map is not
8605 # defined, it is a binary property
8606 if (! defined $map && $property_type != $BINARY) {
8607 if ($property_type != $UNKNOWN
8608 && $property_type != $NON_STRING)
8610 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
8613 $property_object->set_type($BINARY);
8615 = $property_info{$property_addr}{'type'}
8620 # Get any @missings default for this property. This
8621 # should precede the first entry for the property in the
8622 # input file, and is located in a comment that has been
8623 # stored by the Input_file class until we access it here.
8624 # It's possible that there is more than one such line
8625 # waiting for us; collect them all, and parse
8626 my @missings_list = $file->get_missings
8627 if $file->has_missings_defaults;
8628 foreach my $default_ref (@missings_list) {
8629 my $default = $default_ref->[0];
8630 my $addr = objaddr property_ref($default_ref->[1]);
8632 # For string properties, the default is just what the
8633 # file says, but non-string properties should already
8634 # have set up a table for the default property value;
8635 # use the table for these, so can resolve synonyms
8636 # later to a single standard one.
8637 if ($property_type == $STRING
8638 || $property_type == $UNKNOWN)
8640 $property_info{$addr}{'missings'} = $default;
8643 $property_info{$addr}{'missings'}
8644 = $property_object->table($default);
8648 # Finished storing all the @missings defaults in the input
8649 # file so far. Get the one for the current property.
8650 my $missings = $property_info{$property_addr}{'missings'};
8652 # But we likely have separately stored what the default
8653 # should be. (This is to accommodate versions of the
8654 # standard where the @missings lines are absent or
8655 # incomplete.) Hopefully the two will match. But check
8657 $default_map = $property_object->default_map;
8659 # If the map is a ref, it means that the default won't be
8660 # processed until later, so undef it, so next few lines
8661 # will redefine it to something that nothing will match
8662 undef $default_map if ref $default_map;
8664 # Create a $default_map if don't have one; maybe a dummy
8665 # that won't match anything.
8666 if (! defined $default_map) {
8668 # Use any @missings line in the file.
8669 if (defined $missings) {
8670 if (ref $missings) {
8671 $default_map = $missings->full_name;
8672 $default_table = $missings;
8675 $default_map = $missings;
8678 # And store it with the property for outside use.
8679 $property_object->set_default_map($default_map);
8683 # Neither an @missings nor a default map. Create
8684 # a dummy one, so won't have to test definedness
8686 $default_map = '_Perl This will never be in a file
8691 # Here, we have $default_map defined, possibly in terms of
8692 # $missings, but maybe not, and possibly is a dummy one.
8693 if (defined $missings) {
8695 # Make sure there is no conflict between the two.
8696 # $missings has priority.
8697 if (ref $missings) {
8699 = $property_object->table($default_map);
8700 if (! defined $default_table
8701 || $default_table != $missings)
8703 if (! defined $default_table) {
8704 $default_table = $UNDEF;
8706 $file->carp_bad_line(<<END
8707 The \@missings line for $property_name in $file says that missings default to
8708 $missings, but we expect it to be $default_table. $missings used.
8711 $default_table = $missings;
8712 $default_map = $missings->full_name;
8714 $property_info{$property_addr}{'default_table'}
8717 elsif ($default_map ne $missings) {
8718 $file->carp_bad_line(<<END
8719 The \@missings line for $property_name in $file says that missings default to
8720 $missings, but we expect it to be $default_map. $missings used.
8723 $default_map = $missings;
8727 $property_info{$property_addr}{'default'}
8730 # If haven't done so already, find the table corresponding
8731 # to this map for non-string properties.
8732 if (! defined $default_table
8733 && $property_type != $STRING
8734 && $property_type != $UNKNOWN)
8736 $default_table = $property_info{$property_addr}
8738 = $property_object->table($default_map);
8740 } # End of is first time for this property
8741 } # End of switching properties.
8743 # Ready to process the line.
8744 # The Unicode files are set up so that if the map is not defined,
8745 # it is a binary property with value 'Y'
8746 if (! defined $map) {
8751 # If the map begins with a special command to us (enclosed in
8752 # delimiters), extract the command(s).
8753 if (substr($map, 0, 1) eq $CMD_DELIM) {
8754 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
8756 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
8759 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
8763 $file->carp_bad_line("Unknown command line: '$1'");
8770 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
8773 # Here, we have a map to a particular code point, and the
8774 # default map is to a code point itself. If the range
8775 # includes the particular code point, change that portion of
8776 # the range to the default. This makes sure that in the final
8777 # table only the non-defaults are listed.
8778 my $decimal_map = hex $map;
8779 if ($low <= $decimal_map && $decimal_map <= $high) {
8781 # If the range includes stuff before or after the map
8782 # we're changing, split it and process the split-off parts
8784 if ($low < $decimal_map) {
8785 $file->insert_adjusted_lines(
8786 sprintf("%04X..%04X; %s; %s",
8792 if ($high > $decimal_map) {
8793 $file->insert_adjusted_lines(
8794 sprintf("%04X..%04X; %s; %s",
8800 $low = $high = $decimal_map;
8805 # If we can tell that this is a synonym for the default map, use
8806 # the default one instead.
8807 if ($property_type != $STRING
8808 && $property_type != $UNKNOWN)
8810 my $table = $property_object->table($map);
8811 if (defined $table && $table == $default_table) {
8812 $map = $default_map;
8816 # And figure out the map type if not known.
8817 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
8818 if ($map eq "") { # Nulls are always $NULL map type
8820 } # Otherwise, non-strings, and those that don't allow
8821 # $MULTI_CP, and those that aren't multiple code points are
8824 (($property_type != $STRING && $property_type != $UNKNOWN)
8825 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
8826 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
8831 $map_type = $MULTI_CP;
8835 $property_object->add_map($low, $high,
8838 Replace => $replace);
8839 } # End of loop through file's lines
8845 # XXX Unused until revise charnames;
8846 #sub check_and_handle_compound_name {
8847 # This looks at Name properties for parenthesized components and splits
8848 # them off. Thus it finds FF as an equivalent to Form Feed.
8849 # my $code_point = shift;
8851 # if ($name =~ /^ ( .*? ) ( \s* ) \( ( [^)]* ) \) (.*) $/x) {
8852 # #local $to_trace = 1 if main::DEBUG;
8853 # trace $1, $2, $3, $4 if main::DEBUG && $to_trace;
8854 # push @more_Names, "$code_point; $1";
8855 # push @more_Names, "$code_point; $3";
8856 # Carp::my_carp_bug("Expecting blank space before left parenthesis in '$_'. Proceeding and assuming it was there;") if $2 ne " ";
8857 # Carp::my_carp_bug("Not expecting anything after the right parenthesis in '$_'. Proceeding and ignoring that;") if $4 ne "";
8862 { # Closure for UnicodeData.txt handling
8864 # This file was the first one in the UCD; its design leads to some
8865 # awkwardness in processing. Here is a sample line:
8866 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
8867 # The fields in order are:
8868 my $i = 0; # The code point is in field 0, and is shifted off.
8869 my $NAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
8870 my $CATEGORY = $i++; # category (e.g. "Lu")
8871 my $CCC = $i++; # Canonical combining class (e.g. "230")
8872 my $BIDI = $i++; # directional class (e.g. "L")
8873 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
8874 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
8875 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
8876 # Dual-use in this program; see below
8877 my $NUMERIC = $i++; # numeric value
8878 my $MIRRORED = $i++; # ? mirrored
8879 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
8880 my $COMMENT = $i++; # iso comment
8881 my $UPPER = $i++; # simple uppercase mapping
8882 my $LOWER = $i++; # simple lowercase mapping
8883 my $TITLE = $i++; # simple titlecase mapping
8884 my $input_field_count = $i;
8886 # This routine in addition outputs these extra fields:
8887 my $DECOMP_TYPE = $i++; # Decomposition type
8888 my $DECOMP_MAP = $i++; # Must be last; another decomposition mapping
8889 my $last_field = $i - 1;
8891 # All these are read into an array for each line, with the indices defined
8892 # above. The empty fields in the example line above indicate that the
8893 # value is defaulted. The handler called for each line of the input
8894 # changes these to their defaults.
8896 # Here are the official names of the properties, in a parallel array:
8898 $field_names[$BIDI] = 'Bidi_Class';
8899 $field_names[$CATEGORY] = 'General_Category';
8900 $field_names[$CCC] = 'Canonical_Combining_Class';
8901 $field_names[$COMMENT] = 'ISO_Comment';
8902 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
8903 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
8904 $field_names[$LOWER] = 'Simple_Lowercase_Mapping';
8905 $field_names[$MIRRORED] = 'Bidi_Mirrored';
8906 $field_names[$NAME] = 'Name';
8907 $field_names[$NUMERIC] = 'Numeric_Value';
8908 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
8909 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
8910 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
8911 $field_names[$TITLE] = 'Simple_Titlecase_Mapping';
8912 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
8913 $field_names[$UPPER] = 'Simple_Uppercase_Mapping';
8915 # Some of these need a little more explanation. The $PERL_DECIMAL_DIGIT
8916 # field does not lead to an official Unicode property, but is used in
8917 # calculating the Numeric_Type. Perl however, creates a file from this
8918 # field, so a Perl property is created from it. Similarly, the Other
8919 # Digit field is used only for calculating the Numeric_Type, and so it can
8920 # be safely re-used as the place to store the value for Numeric_Type;
8921 # hence it is referred to as $NUMERIC_TYPE_OTHER_DIGIT. The input field
8922 # named $PERL_DECOMPOSITION is a combination of both the decomposition
8923 # mapping and its type. Perl creates a file containing exactly this
8924 # field, so it is used for that. The two properties are separated into
8925 # two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
8927 # This file is processed like most in this program. Control is passed to
8928 # process_generic_property_file() which calls filter_UnicodeData_line()
8929 # for each input line. This filter converts the input into line(s) that
8930 # process_generic_property_file() understands. There is also a setup
8931 # routine called before any of the file is processed, and a handler for
8932 # EOF processing, all in this closure.
8934 # A huge speed-up occurred at the cost of some added complexity when these
8935 # routines were altered to buffer the outputs into ranges. Almost all the
8936 # lines of the input file apply to just one code point, and for most
8937 # properties, the map for the next code point up is the same as the
8938 # current one. So instead of creating a line for each property for each
8939 # input line, filter_UnicodeData_line() remembers what the previous map
8940 # of a property was, and doesn't generate a line to pass on until it has
8941 # to, as when the map changes; and that passed-on line encompasses the
8942 # whole contiguous range of code points that have the same map for that
8943 # property. This means a slight amount of extra setup, and having to
8944 # flush these buffers on EOF, testing if the maps have changed, plus
8945 # remembering state information in the closure. But it means a lot less
8946 # real time in not having to change the data base for each property on
8949 # Another complication is that there are already a few ranges designated
8950 # in the input. There are two lines for each, with the same maps except
8951 # the code point and name on each line. This was actually the hardest
8952 # thing to design around. The code points in those ranges may actually
8953 # have real maps not given by these two lines. These maps will either
8954 # be algorthimically determinable, or in the extracted files furnished
8955 # with the UCD. In the event of conflicts between these extracted files,
8956 # and this one, Unicode says that this one prevails. But it shouldn't
8957 # prevail for conflicts that occur in these ranges. The data from the
8958 # extracted files prevails in those cases. So, this program is structured
8959 # so that those files are processed first, storing maps. Then the other
8960 # files are processed, generally overwriting what the extracted files
8961 # stored. But just the range lines in this input file are processed
8962 # without overwriting. This is accomplished by adding a special string to
8963 # the lines output to tell process_generic_property_file() to turn off the
8964 # overwriting for just this one line.
8965 # A similar mechanism is used to tell it that the map is of a non-default
8968 sub setup_UnicodeData { # Called before any lines of the input are read
8970 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8972 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
8973 Directory => File::Spec->curdir(),
8974 File => 'Decomposition',
8975 Format => $STRING_FORMAT,
8976 Internal_Only_Warning => 1,
8977 Perl_Extension => 1,
8978 Default_Map => $CODE_POINT,
8980 # This is a specially formatted table
8981 # explicitly for normalize.pm, which
8982 # is expecting a particular format,
8983 # which means that mappings containing
8984 # multiple code points are in the main
8986 Map_Type => $COMPUTE_NO_MULTI_CP,
8989 $Perl_decomp->add_comment(join_lines(<<END
8990 This mapping is a combination of the Unicode 'Decomposition_Type' and
8991 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
8992 identical to the official Unicode 'Decomposition_Mapping' property except for
8994 1) It omits the algorithmically determinable Hangul syllable decompositions,
8995 which normalize.pm handles algorithmically.
8996 2) It contains the decomposition type as well. Non-canonical decompositions
8997 begin with a word in angle brackets, like <super>, which denotes the
8998 compatible decomposition type. If the map does not begin with the <angle
8999 brackets>, the decomposition is canonical.
9003 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9005 Perl_Extension => 1,
9006 File => 'Digit', # Trad. location
9007 Directory => $map_directory,
9011 $Decimal_Digit->add_comment(join_lines(<<END
9012 This file gives the mapping of all code points which represent a single
9013 decimal digit [0-9] to their respective digits. For example, the code point
9014 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9015 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9020 # This property is not used for generating anything else, and is
9021 # usually not output. By making it last in the list, we can just
9022 # change the high end of the loop downwards to avoid the work of
9023 # generating a table that is just going to get thrown away.
9024 if (! property_ref('Decomposition_Mapping')->to_output_map) {
9030 my $first_time = 1; # ? Is this the first line of the file
9031 my $in_range = 0; # ? Are we in one of the file's ranges
9032 my $previous_cp; # hex code point of previous line
9033 my $decimal_previous_cp = -1; # And its decimal equivalent
9034 my @start; # For each field, the current starting
9035 # code point in hex for the range
9036 # being accumulated.
9037 my @fields; # The input fields;
9038 my @previous_fields; # And those from the previous call
9040 sub filter_UnicodeData_line {
9041 # Handle a single input line from UnicodeData.txt; see comments above
9042 # Conceptually this takes a single line from the file containing N
9043 # properties, and converts it into N lines with one property per line,
9044 # which is what the final handler expects. But there are
9045 # complications due to the quirkiness of the input file, and to save
9046 # time, it accumulates ranges where the property values don't change
9047 # and only emits lines when necessary. This is about an order of
9048 # magnitude fewer lines emitted.
9051 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9053 # $_ contains the input line.
9054 # -1 in split means retain trailing null fields
9055 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9057 #local $to_trace = 1 if main::DEBUG;
9058 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9059 if (@fields > $input_field_count) {
9060 $file->carp_bad_line('Extra fields');
9065 my $decimal_cp = hex $cp;
9067 # We have to output all the buffered ranges when the next code point
9068 # is not exactly one after the previous one, which means there is a
9069 # gap in the ranges.
9070 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9072 # The decomposition mapping field requires special handling. It looks
9075 # <compat> 0032 0020
9078 # The decomposition type is enclosed in <brackets>; if missing, it
9079 # means the type is canonical. There are two decomposition mapping
9080 # tables: the one for use by Perl's normalize.pm has a special format
9081 # which is this field intact; the other, for general use is of
9082 # standard format. In either case we have to find the decomposition
9083 # type. Empty fields have None as their type, and map to the code
9085 if ($fields[$PERL_DECOMPOSITION] eq "") {
9086 $fields[$DECOMP_TYPE] = 'None';
9087 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9090 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9091 =~ / < ( .+? ) > \s* ( .+ ) /x;
9092 if (! defined $fields[$DECOMP_TYPE]) {
9093 $fields[$DECOMP_TYPE] = 'Canonical';
9094 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9097 $fields[$DECOMP_MAP] = $map;
9101 # The 3 numeric fields also require special handling. The 2 digit
9102 # fields must be either empty or match the number field. This means
9103 # that if it is empty, they must be as well, and the numeric type is
9104 # None, and the numeric value is 'Nan'.
9105 # The decimal digit field must be empty or match the other digit
9106 # field. If the decimal digit field is non-empty, the code point is
9107 # a decimal digit, and the other two fields will have the same value.
9108 # If it is empty, but the other digit field is non-empty, the code
9109 # point is an 'other digit', and the number field will have the same
9110 # value as the other digit field. If the other digit field is empty,
9111 # but the number field is non-empty, the code point is a generic
9113 if ($fields[$NUMERIC] eq "") {
9114 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9115 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9117 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
9119 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9120 $fields[$NUMERIC] = 'NaN';
9123 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9124 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9125 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9126 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9128 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9129 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9130 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9133 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9135 # Rationals require extra effort.
9136 register_fraction($fields[$NUMERIC])
9137 if $fields[$NUMERIC] =~ qr{/};
9141 # For the properties that have empty fields in the file, and which
9142 # mean something different from empty, change them to that default.
9143 # Certain fields just haven't been empty so far in any Unicode
9144 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9145 # $CATEGORY. This leaves just the two fields, and so we hard-code in
9146 # the defaults; which are verly unlikely to ever change.
9147 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9148 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9150 # UAX44 says that if title is empty, it is the same as whatever upper
9152 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9154 # There are a few pairs of lines like:
9155 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9156 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9157 # that define ranges. These should be processed after the fields are
9158 # adjusted above, as they may override some of them; but mostly what
9159 # is left is to possibly adjust the $NAME field. The names of all the
9160 # paired lines start with a '<', but this is also true of '<control>,
9161 # which isn't one of these special ones.
9162 if ($fields[$NAME] eq '<control>') {
9164 # Some code points in this file have the pseudo-name
9165 # '<control>', but the official name for such ones is the null
9167 $fields[$NAME] = "";
9169 # We had better not be in between range lines.
9171 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
9175 elsif (substr($fields[$NAME], 0, 1) ne '<') {
9177 # Here is a non-range line. We had better not be in between range
9180 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
9183 # XXX until charnames catches up.
9184 # if ($fields[$NAME] =~ s/- $cp $//x) {
9186 # # These are code points whose names end in their code points,
9187 # # which means the names are algorithmically derivable from the
9188 # # code points. To shorten the output Name file, the algorithm
9189 # # for deriving these is placed in the file instead of each
9190 # # code point, so they have map type $CP_IN_NAME
9191 # $fields[$NAME] = $CMD_DELIM
9199 # Some official names are really two alternate names with one in
9200 # parentheses. What we do here is use the full official one for
9201 # the standard property (stored just above), but for the charnames
9202 # table, we add two more entries, one for each of the alternate
9205 #check_and_handle_compound_name($cp, $fields[$NAME]);
9206 #check_and_handle_compound_name($cp, $unicode_1_name);
9207 # XXX until charnames catches up.
9209 elsif ($fields[$NAME] =~ /^<(.+), First>$/) {
9210 $fields[$NAME] = $1;
9212 # Here we are at the beginning of a range pair.
9214 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$NAME]'. Trying anyway");
9218 # Because the properties in the range do not overwrite any already
9219 # in the db, we must flush the buffers of what's already there, so
9220 # they get handled in the normal scheme.
9224 elsif ($fields[$NAME] !~ s/^<(.+), Last>$/$1/) {
9225 $file->carp_bad_line("Unexpected name starting with '<' $fields[$NAME]. Ignoring this line.");
9229 else { # Here, we are at the last line of a range pair.
9232 $file->carp_bad_line("Unexpected end of range $fields[$NAME] when not in one. Ignoring this line.");
9238 # Check that the input is valid: that the closing of the range is
9239 # the same as the beginning.
9240 foreach my $i (0 .. $last_field) {
9241 next if $fields[$i] eq $previous_fields[$i];
9242 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
9245 # The processing differs depending on the type of range,
9246 # determined by its $NAME
9247 if ($fields[$NAME] =~ /^Hangul Syllable/) {
9249 # Check that the data looks right.
9250 if ($decimal_previous_cp != $SBase) {
9251 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
9253 if ($decimal_cp != $SBase + $SCount - 1) {
9254 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
9257 # The Hangul syllable range has a somewhat complicated name
9258 # generation algorithm. Each code point in it has a canonical
9259 # decomposition also computable by an algorithm. The
9260 # perl decomposition map table built from these is used only
9261 # by normalize.pm, which has the algorithm built in it, so the
9262 # decomposition maps are not needed, and are large, so are
9263 # omitted from it. If the full decomposition map table is to
9264 # be output, the decompositions are generated for it, in the
9265 # EOF handling code for this input file.
9267 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9269 # This range is stored in our internal structure with its
9270 # own map type, different from all others.
9271 $previous_fields[$NAME] = $CMD_DELIM
9278 elsif ($fields[$NAME] =~ /^CJK/) {
9280 # The name for these contains the code point itself, and all
9281 # are defined to have the same base name, regardless of what
9282 # is in the file. They are stored in our internal structure
9283 # with a map type of $CP_IN_NAME
9284 $previous_fields[$NAME] = $CMD_DELIM
9289 . 'CJK UNIFIED IDEOGRAPH';
9292 elsif ($fields[$CATEGORY] eq 'Co'
9293 || $fields[$CATEGORY] eq 'Cs')
9295 # The names of all the code points in these ranges are set to
9296 # null, as there are no names for the private use and
9297 # surrogate code points.
9299 $previous_fields[$NAME] = "";
9302 $file->carp_bad_line("Unexpected code point range $fields[$NAME] because category is $fields[$CATEGORY]. Attempting to process it.");
9305 # The first line of the range caused everything else to be output,
9306 # and then its values were stored as the beginning values for the
9307 # next set of ranges, which this one ends. Now, for each value,
9308 # add a command to tell the handler that these values should not
9309 # replace any existing ones in our database.
9310 foreach my $i (0 .. $last_field) {
9311 $previous_fields[$i] = $CMD_DELIM
9316 . $previous_fields[$i];
9319 # And change things so it looks like the entire range has been
9320 # gone through with this being the final part of it. Adding the
9321 # command above to each field will cause this range to be flushed
9322 # during the next iteration, as it guaranteed that the stored
9323 # field won't match whatever value the next one has.
9325 $decimal_previous_cp = $decimal_cp;
9327 # We are now set up for the next iteration; so skip the remaining
9328 # code in this subroutine that does the same thing, but doesn't
9329 # know about these ranges.
9334 # On the very first line, we fake it so the code below thinks there is
9335 # nothing to output, and initialize so that when it does get output it
9336 # uses the first line's values for the lowest part of the range.
9337 # (One could avoid this by using peek(), but then one would need to
9338 # know the adjustments done above and do the same ones in the setup
9339 # routine; not worth it)
9342 @previous_fields = @fields;
9343 @start = ($cp) x scalar @fields;
9344 $decimal_previous_cp = $decimal_cp - 1;
9347 # For each field, output the stored up ranges that this code point
9348 # doesn't fit in. Earlier we figured out if all ranges should be
9349 # terminated because of changing the replace or map type styles, or if
9350 # there is a gap between this new code point and the previous one, and
9351 # that is stored in $force_output. But even if those aren't true, we
9352 # need to output the range if this new code point's value for the
9353 # given property doesn't match the stored range's.
9354 #local $to_trace = 1 if main::DEBUG;
9355 foreach my $i (0 .. $last_field) {
9356 my $field = $fields[$i];
9357 if ($force_output || $field ne $previous_fields[$i]) {
9359 # Flush the buffer of stored values.
9360 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9362 # Start a new range with this code point and its value
9364 $previous_fields[$i] = $field;
9368 # Set the values for the next time.
9370 $decimal_previous_cp = $decimal_cp;
9372 # The input line has generated whatever adjusted lines are needed, and
9373 # should not be looked at further.
9378 sub EOF_UnicodeData {
9379 # Called upon EOF to flush the buffers, and create the Hangul
9380 # decomposition mappings if needed.
9383 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9385 # Flush the buffers.
9386 foreach my $i (1 .. $last_field) {
9387 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9390 if (-e 'Jamo.txt') {
9392 # The algorithm is published by Unicode, based on values in
9393 # Jamo.txt, (which should have been processed before this
9394 # subroutine), and the results left in %Jamo
9396 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
9400 # If the full decomposition map table is being output, insert
9401 # into it the Hangul syllable mappings. This is to avoid having
9402 # to publish a subroutine in it to compute them. (which would
9403 # essentially be this code.) This uses the algorithm published by
9405 if (property_ref('Decomposition_Mapping')->to_output_map) {
9406 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
9408 my $SIndex = $S - $SBase;
9409 my $L = $LBase + $SIndex / $NCount;
9410 my $V = $VBase + ($SIndex % $NCount) / $TCount;
9411 my $T = $TBase + $SIndex % $TCount;
9413 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
9414 my $decomposition = sprintf("%04X %04X", $L, $V);
9415 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
9416 $file->insert_adjusted_lines(
9417 sprintf("%04X; Decomposition_Mapping; %s",
9428 # Fix UCD lines in version 1. This is probably overkill, but this
9429 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
9430 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
9431 # removed. This program retains them
9432 # 2) didn't include ranges, which it should have, and which are now
9433 # added in @corrected_lines below. It was hand populated by
9434 # taking the data from Version 2, verified by analyzing
9436 # 3) There is a syntax error in the entry for U+09F8 which could
9437 # cause problems for utf8_heavy, and so is changed. It's
9438 # numeric value was simply a minus sign, without any number.
9439 # (Eventually Unicode changed the code point to non-numeric.)
9440 # 4) The decomposition types often don't match later versions
9441 # exactly, and the whole syntax of that field is different; so
9442 # the syntax is changed as well as the types to their later
9443 # terminology. Otherwise normalize.pm would be very unhappy
9444 # 5) Many ccc classes are different. These are left intact.
9445 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
9446 # fields. These are unchanged because it doesn't really cause
9447 # problems for Perl.
9448 # 7) A number of code points, such as controls, don't have their
9449 # Unicode Version 1 Names in this file. These are unchanged.
9451 my @corrected_lines = split /\n/, <<'END';
9452 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9453 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9454 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
9455 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
9456 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
9457 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9461 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9463 #local $to_trace = 1 if main::DEBUG;
9464 trace $_ if main::DEBUG && $to_trace;
9466 # -1 => retain trailing null fields
9467 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9469 # At the first place that is wrong in the input, insert all the
9470 # corrections, replacing the wrong line.
9471 if ($code_point eq '4E00') {
9472 my @copy = @corrected_lines;
9474 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9476 $file->insert_lines(@copy);
9480 if ($fields[$NUMERIC] eq '-') {
9481 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
9484 if ($fields[$PERL_DECOMPOSITION] ne "") {
9486 # Several entries have this change to superscript 2 or 3 in the
9487 # middle. Convert these to the modern version, which is to use
9488 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
9489 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
9490 # 'HHHH HHHH 00B3 HHHH'.
9491 # It turns out that all of these that don't have another
9492 # decomposition defined at the beginning of the line have the
9493 # <square> decomposition in later releases.
9494 if ($code_point ne '00B2' && $code_point ne '00B3') {
9495 if ($fields[$PERL_DECOMPOSITION]
9496 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
9498 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
9499 $fields[$PERL_DECOMPOSITION] = '<square> '
9500 . $fields[$PERL_DECOMPOSITION];
9505 # If is like '<+circled> 0052 <-circled>', convert to
9507 $fields[$PERL_DECOMPOSITION] =~
9508 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
9510 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
9511 $fields[$PERL_DECOMPOSITION] =~
9512 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
9513 or $fields[$PERL_DECOMPOSITION] =~
9514 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
9515 or $fields[$PERL_DECOMPOSITION] =~
9516 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
9517 or $fields[$PERL_DECOMPOSITION] =~
9518 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
9520 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
9521 $fields[$PERL_DECOMPOSITION] =~
9522 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
9524 # Change names to modern form.
9525 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
9526 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
9527 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
9528 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
9530 # One entry has weird braces
9531 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
9534 $_ = join ';', $code_point, @fields;
9535 trace $_ if main::DEBUG && $to_trace;
9539 sub filter_v2_1_5_ucd {
9540 # A dozen entries in this 2.1.5 file had the mirrored and numeric
9541 # columns swapped; These all had mirrored be 'N'. So if the numeric
9542 # column appears to be N, swap it back.
9544 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9545 if ($fields[$NUMERIC] eq 'N') {
9546 $fields[$NUMERIC] = $fields[$MIRRORED];
9547 $fields[$MIRRORED] = 'N';
9548 $_ = join ';', $code_point, @fields;
9552 } # End closure for UnicodeData
9554 sub process_GCB_test {
9557 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9559 while ($file->next_line) {
9560 push @backslash_X_tests, $_;
9566 sub process_NamedSequences {
9567 # NamedSequences.txt entries are just added to an array. Because these
9568 # don't look like the other tables, they have their own handler.
9570 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
9572 # This just adds the sequence to an array for later handling
9574 return; # XXX Until charnames catches up
9576 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9578 while ($file->next_line) {
9579 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
9581 $file->carp_bad_line(
9582 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
9585 push @named_sequences, "$sequence\t\t$name";
9594 sub filter_early_ea_lb {
9595 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
9596 # third field be the name of the code point, which can be ignored in
9597 # most cases. But it can be meaningful if it marks a range:
9598 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
9599 # 3400;W;<CJK Ideograph Extension A, First>
9601 # We need to see the First in the example above to know it's a range.
9602 # They did not use the later range syntaxes. This routine changes it
9603 # to use the modern syntax.
9604 # $1 is the Input_file object.
9606 my @fields = split /\s*;\s*/;
9607 if ($fields[2] =~ /^<.*, First>/) {
9608 $first_range = $fields[0];
9611 elsif ($fields[2] =~ /^<.*, Last>/) {
9612 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
9616 $_ = "$fields[0]; $fields[1]";
9623 sub filter_old_style_arabic_shaping {
9624 # Early versions used a different term for the later one.
9626 my @fields = split /\s*;\s*/;
9627 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
9628 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
9629 $_ = join ';', @fields;
9633 sub filter_arabic_shaping_line {
9634 # ArabicShaping.txt has entries that look like:
9636 # The field containing 'TEH' is not used. The next field is Joining_Type
9637 # and the last is Joining_Group
9638 # This generates two lines to pass on, one for each property on the input
9642 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9644 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9647 $file->carp_bad_line('Extra fields');
9652 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
9653 $_ = "$fields[0]; Joining_Type; $fields[2]";
9658 sub setup_special_casing {
9659 # SpecialCasing.txt contains the non-simple case change mappings. The
9660 # simple ones are in UnicodeData.txt, and should already have been read
9662 # This routine initializes the full mappings to the simple, then as each
9663 # line is processed, it overrides the simple ones.
9666 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9668 # For each of the case change mappings...
9669 foreach my $case ('lc', 'tc', 'uc') {
9671 # The simple version's name in each mapping merely has an 's' in front
9673 my $simple = property_ref('s' . $case);
9674 unless (defined $simple && ! $simple->is_empty) {
9675 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
9678 # Initialize the full case mappings with the simple ones.
9679 property_ref($case)->initialize($simple);
9685 sub filter_special_casing_line {
9686 # Change the format of $_ from SpecialCasing.txt into something that the
9687 # generic handler understands. Each input line contains three case
9688 # mappings. This will generate three lines to pass to the generic handler
9689 # for each of those.
9691 # The input syntax (after stripping comments and trailing white space is
9692 # like one of the following (with the final two being entries that we
9694 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
9695 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
9696 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
9697 # Note the trailing semi-colon, unlike many of the input files. That
9698 # means that there will be an extra null field generated by the split
9701 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9703 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9705 # field #4 is when this mapping is conditional. If any of these get
9706 # implemented, it would be by hard-coding in the casing functions in the
9707 # Perl core, not through tables. But if there is a new condition we don't
9708 # know about, output a warning. We know about all the conditions through
9710 if ($fields[4] ne "") {
9711 my @conditions = split ' ', $fields[4];
9712 if ($conditions[0] ne 'tr' # We know that these languages have
9713 # conditions, and some are multiple
9714 && $conditions[0] ne 'az'
9715 && $conditions[0] ne 'lt'
9717 # And, we know about a single condition Final_Sigma, but
9719 && ($v_version gt v5.2.0
9720 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
9722 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
9724 elsif ($conditions[0] ne 'Final_Sigma') {
9726 # Don't print out a message for Final_Sigma, because we have
9727 # hard-coded handling for it. (But the standard could change
9728 # what the rule should be, but it wouldn't show up here
9731 print "# SKIPPING Special Casing: $_\n"
9732 if $verbosity >= $VERBOSE;
9737 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
9738 $file->carp_bad_line('Extra fields');
9743 $_ = "$fields[0]; lc; $fields[1]";
9744 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
9745 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
9750 sub filter_old_style_case_folding {
9751 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
9752 # and later style. Different letters were used in the earlier.
9755 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9757 my @fields = split /\s*;\s*/;
9758 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
9761 elsif ($fields[1] eq 'L') {
9762 $fields[1] = 'C'; # L => C always
9764 elsif ($fields[1] eq 'E') {
9765 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
9773 $file->carp_bad_line("Expecting L or E in second field");
9777 $_ = join("; ", @fields) . ';';
9781 { # Closure for case folding
9783 # Create the map for simple only if are going to output it, for otherwise
9784 # it takes no part in anything we do.
9785 my $to_output_simple;
9787 # These are experimental, perhaps will need these to pass to regcomp.c to
9788 # handle the cases where for example the Kelvin sign character folds to k,
9789 # and in regcomp, we need to know which of the characters can have a
9790 # non-latin1 char fold to it, so it doesn't do the optimizations it might
9792 my @latin1_singly_folded;
9795 sub setup_case_folding($) {
9796 # Read in the case foldings in CaseFolding.txt. This handles both
9797 # simple and full case folding.
9800 = property_ref('Simple_Case_Folding')->to_output_map;
9805 sub filter_case_folding_line {
9806 # Called for each line in CaseFolding.txt
9807 # Input lines look like:
9808 # 0041; C; 0061; # LATIN CAPITAL LETTER A
9809 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
9810 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
9812 # 'C' means that folding is the same for both simple and full
9813 # 'F' that it is only for full folding
9814 # 'S' that it is only for simple folding
9815 # 'T' is locale-dependent, and ignored
9816 # 'I' is a type of 'F' used in some early releases.
9817 # Note the trailing semi-colon, unlike many of the input files. That
9818 # means that there will be an extra null field generated by the split
9819 # below, which we ignore and hence is not an error.
9822 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9824 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
9825 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
9826 $file->carp_bad_line('Extra fields');
9831 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
9836 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
9837 # I are all full foldings
9838 if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
9839 $_ = "$range; Case_Folding; $map";
9844 $file->carp_bad_line('Expecting C F I S or T in second field');
9849 # C and S are simple foldings, but simple case folding is not needed
9850 # unless we explicitly want its map table output.
9851 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
9852 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
9855 # Experimental, see comment above
9856 if ($type ne 'S' && hex($range) >= 256) { # assumes range is 1 point
9857 my @folded = split ' ', $map;
9858 if (hex $folded[0] < 256 && @folded == 1) {
9859 push @latin1_singly_folded, hex $folded[0];
9861 foreach my $folded (@folded) {
9862 push @latin1_folded, hex $folded if hex $folded < 256;
9870 # Experimental, see comment above
9873 #local $to_trace = 1 if main::DEBUG;
9874 @latin1_singly_folded = uniques(@latin1_singly_folded);
9875 @latin1_folded = uniques(@latin1_folded);
9876 trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
9877 trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
9880 } # End case fold closure
9882 sub filter_jamo_line {
9883 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
9884 # from this file that is used in generating the Name property for Jamo
9885 # code points. But, it also is used to convert early versions' syntax
9886 # into the modern form. Here are two examples:
9887 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
9888 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
9890 # The input is $_, the output is $_ filtered.
9892 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9894 # Let the caller handle unexpected input. In earlier versions, there was
9895 # a third field which is supposed to be a comment, but did not have a '#'
9897 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
9899 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
9902 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
9903 $fields[1] = 'R' if $fields[0] eq '1105';
9905 # Add to structure so can generate Names from it.
9906 my $cp = hex $fields[0];
9907 my $short_name = $fields[1];
9908 $Jamo{$cp} = $short_name;
9909 if ($cp <= $LBase + $LCount) {
9910 $Jamo_L{$short_name} = $cp - $LBase;
9912 elsif ($cp <= $VBase + $VCount) {
9913 $Jamo_V{$short_name} = $cp - $VBase;
9915 elsif ($cp <= $TBase + $TCount) {
9916 $Jamo_T{$short_name} = $cp - $TBase;
9919 Carp::my_carp_bug("Unexpected Jamo code point in $_");
9923 # Reassemble using just the first two fields to look like a typical
9924 # property file line
9925 $_ = "$fields[0]; $fields[1]";
9930 sub register_fraction($) {
9931 # This registers the input rational number so that it can be passed on to
9932 # utf8_heavy.pl, both in rational and floating forms.
9934 my $rational = shift;
9936 my $float = eval $rational;
9937 $nv_floating_to_rational{$float} = $rational;
9941 sub filter_numeric_value_line {
9942 # DNumValues contains lines of a different syntax than the typical
9944 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
9946 # This routine transforms $_ containing the anomalous syntax to the
9947 # typical, by filtering out the extra columns, and convert early version
9948 # decimal numbers to strings that look like rational numbers.
9951 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9953 # Starting in 5.1, there is a rational field. Just use that, omitting the
9954 # extra columns. Otherwise convert the decimal number in the second field
9955 # to a rational, and omit extraneous columns.
9956 my @fields = split /\s*;\s*/, $_, -1;
9959 if ($v_version ge v5.1.0) {
9961 $file->carp_bad_line('Not 4 semi-colon separated fields');
9965 $rational = $fields[3];
9966 $_ = join '; ', @fields[ 0, 3 ];
9970 # Here, is an older Unicode file, which has decimal numbers instead of
9971 # rationals in it. Use the fraction to calculate the denominator and
9972 # convert to rational.
9974 if (@fields != 2 && @fields != 3) {
9975 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
9980 my $codepoints = $fields[0];
9981 my $decimal = $fields[1];
9982 if ($decimal =~ s/\.0+$//) {
9984 # Anything ending with a decimal followed by nothing but 0's is an
9986 $_ = "$codepoints; $decimal";
9987 $rational = $decimal;
9992 if ($decimal =~ /\.50*$/) {
9996 # Here have the hardcoded repeating decimals in the fraction, and
9997 # the denominator they imply. There were only a few denominators
9998 # in the older Unicode versions of this file which this code
9999 # handles, so it is easy to convert them.
10001 # The 4 is because of a round-off error in the Unicode 3.2 files
10002 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10005 elsif ($decimal =~ /\.[27]50*$/) {
10008 elsif ($decimal =~ /\.[2468]0*$/) {
10011 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10014 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10017 if ($denominator) {
10018 my $sign = ($decimal < 0) ? "-" : "";
10019 my $numerator = int((abs($decimal) * $denominator) + .5);
10020 $rational = "$sign$numerator/$denominator";
10021 $_ = "$codepoints; $rational";
10024 $file->carp_bad_line("Can't cope with number '$decimal'.");
10031 register_fraction($rational) if $rational =~ qr{/};
10036 my %unihan_properties;
10041 # Do any special setup for Unihan properties.
10043 # This property gives the wrong computed type, so override.
10044 my $usource = property_ref('kIRG_USource');
10045 $usource->set_type($STRING) if defined $usource;
10047 # This property is to be considered binary, so change all the values
10049 $iicore = property_ref('kIICore');
10050 if (defined $iicore) {
10051 $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
10053 # We have to change the default map, because the @missing line is
10054 # misleading, given that we are treating it as binary.
10055 $iicore->set_default_map('N');
10056 $iicore->set_type($BINARY);
10062 sub filter_unihan_line {
10063 # Change unihan db lines to look like the others in the db. Here is
10065 # U+341C kCangjie IEKN
10067 # Tabs are used instead of semi-colons to separate fields; therefore
10068 # they may have semi-colons embedded in them. Change these to periods
10069 # so won't screw up the rest of the code.
10072 # Remove lines that don't look like ones we accept.
10073 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10078 # Extract the property, and save a reference to its object.
10080 if (! exists $unihan_properties{$property}) {
10081 $unihan_properties{$property} = property_ref($property);
10084 # Don't do anything unless the property is one we're handling, which
10085 # we determine by seeing if there is an object defined for it or not
10086 if (! defined $unihan_properties{$property}) {
10091 # The iicore property is supposed to be a boolean, so convert to our
10092 # standard boolean form.
10093 if (defined $iicore && $unihan_properties{$property} == $iicore) {
10094 $_ =~ s/$property.*/$property\tY/
10097 # Convert the tab separators to our standard semi-colons, and convert
10098 # the U+HHHH notation to the rest of the standard's HHHH
10100 s/\b U \+ (?= $code_point_re )//xg;
10102 #local $to_trace = 1 if main::DEBUG;
10103 trace $_ if main::DEBUG && $to_trace;
10109 sub filter_blocks_lines {
10110 # In the Blocks.txt file, the names of the blocks don't quite match the
10111 # names given in PropertyValueAliases.txt, so this changes them so they
10112 # do match: Blanks and hyphens are changed into underscores. Also makes
10113 # early release versions look like later ones
10115 # $_ is transformed to the correct value.
10118 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10120 if ($v_version lt v3.2.0) {
10121 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10126 # Old versions used a different syntax to mark the range.
10127 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10130 my @fields = split /\s*;\s*/, $_, -1;
10131 if (@fields != 2) {
10132 $file->carp_bad_line("Expecting exactly two fields");
10137 # Change hyphens and blanks in the block name field only
10138 $fields[1] =~ s/[ -]/_/g;
10139 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
10141 $_ = join("; ", @fields);
10146 my $current_property;
10148 sub filter_old_style_proplist {
10149 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
10150 # was in a completely different syntax. Ken Whistler of Unicode says
10151 # that it was something he used as an aid for his own purposes, but
10152 # was never an official part of the standard. However, comments in
10153 # DAge.txt indicate that non-character code points were available in
10154 # the UCD as of 3.1. It is unclear to me (khw) how they could be
10155 # there except through this file (but on the other hand, they first
10156 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10157 # not. But the claim is that it was published as an aid to others who
10158 # might want some more information than was given in the official UCD
10159 # of the time. Many of the properties in it were incorporated into
10160 # the later PropList.txt, but some were not. This program uses this
10161 # early file to generate property tables that are otherwise not
10162 # accessible in the early UCD's, and most were probably not really
10163 # official at that time, so one could argue that it should be ignored,
10164 # and you can easily modify things to skip this. And there are bugs
10165 # in this file in various versions. (For example, the 2.1.9 version
10166 # removes from Alphabetic the CJK range starting at 4E00, and they
10167 # weren't added back in until 3.1.0.) Many of this file's properties
10168 # were later sanctioned, so this code generates tables for those
10169 # properties that aren't otherwise in the UCD of the time but
10170 # eventually did become official, and throws away the rest. Here is a
10171 # list of all the ones that are thrown away:
10172 # Bidi=* duplicates UnicodeData.txt
10173 # Combining never made into official property;
10175 # Composite never made into official property.
10176 # Currency Symbol duplicates UnicodeData.txt: gc=sc
10177 # Decimal Digit duplicates UnicodeData.txt: gc=nd
10178 # Delimiter never made into official property;
10180 # Format Control never made into official property;
10182 # High Surrogate duplicates Blocks.txt
10183 # Ignorable Control never made into official property;
10185 # ISO Control duplicates UnicodeData.txt: gc=cc
10186 # Left of Pair never made into official property;
10187 # Line Separator duplicates UnicodeData.txt: gc=zl
10188 # Low Surrogate duplicates Blocks.txt
10189 # Non-break was actually listed as a property
10190 # in 3.2, but without any code
10191 # points. Unicode denies that this
10192 # was ever an official property
10193 # Non-spacing duplicate UnicodeData.txt: gc=mn
10194 # Numeric duplicates UnicodeData.txt: gc=cc
10195 # Paired Punctuation never made into official property;
10196 # appears to be gc=ps + gc=pe
10197 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
10198 # Private Use duplicates UnicodeData.txt: gc=co
10199 # Private Use High Surrogate duplicates Blocks.txt
10200 # Punctuation duplicates UnicodeData.txt: gc=p
10201 # Space different definition than eventual
10203 # Titlecase duplicates UnicodeData.txt: gc=lt
10204 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
10205 # Zero-width never made into offical property;
10207 # Most of the properties have the same names in this file as in later
10208 # versions, but a couple do not.
10210 # This subroutine filters $_, converting it from the old style into
10211 # the new style. Here's a sample of the old-style
10213 # *******************************************
10215 # Property dump for: 0x100000A0 (Join Control)
10217 # 200C..200D (2 chars)
10219 # In the example, the property is "Join Control". It is kept in this
10220 # closure between calls to the subroutine. The numbers beginning with
10221 # 0x were internal to Ken's program that generated this file.
10223 # If this line contains the property name, extract it.
10224 if (/^Property dump for: [^(]*\((.*)\)/) {
10227 # Convert white space to underscores.
10230 # Convert the few properties that don't have the same name as
10231 # their modern counterparts
10232 s/Identifier_Part/ID_Continue/
10233 or s/Not_a_Character/NChar/;
10235 # If the name matches an existing property, use it.
10236 if (defined property_ref($_)) {
10237 trace "new property=", $_ if main::DEBUG && $to_trace;
10238 $current_property = $_;
10240 else { # Otherwise discard it
10241 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10242 undef $current_property;
10244 $_ = ""; # The property is saved for the next lines of the
10245 # file, but this defining line is of no further use,
10246 # so clear it so that the caller won't process it
10249 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10251 # Here, the input line isn't a header defining a property for the
10252 # following section, and either we aren't in such a section, or
10253 # the line doesn't look like one that defines the code points in
10254 # such a section. Ignore this line.
10259 # Here, we have a line defining the code points for the current
10260 # stashed property. Anything starting with the first blank is
10261 # extraneous. Otherwise, it should look like a normal range to
10262 # the caller. Append the property name so that it looks just like
10263 # a modern PropList entry.
10266 $_ .= "; $current_property";
10268 trace $_ if main::DEBUG && $to_trace;
10271 } # End closure for old style proplist
10273 sub filter_old_style_normalization_lines {
10274 # For early releases of Unicode, the lines were like:
10275 # 74..2A76 ; NFKD_NO
10276 # For later releases this became:
10277 # 74..2A76 ; NFKD_QC; N
10278 # Filter $_ to look like those in later releases.
10279 # Similarly for MAYBEs
10281 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10283 # Also, the property FC_NFKC was abbreviated to FNC
10288 sub finish_Unicode() {
10289 # This routine should be called after all the Unicode files have been read
10291 # 1) Adds the mappings for code points missing from the files which have
10292 # defaults specified for them.
10293 # 2) At this this point all mappings are known, so it computes the type of
10294 # each property whose type hasn't been determined yet.
10295 # 3) Calculates all the regular expression match tables based on the
10297 # 3) Calculates and adds the tables which are defined by Unicode, but
10298 # which aren't derived by them
10300 # For each property, fill in any missing mappings, and calculate the re
10301 # match tables. If a property has more than one missing mapping, the
10302 # default is a reference to a data structure, and requires data from other
10303 # properties to resolve. The sort is used to cause these to be processed
10304 # last, after all the other properties have been calculated.
10305 # (Fortunately, the missing properties so far don't depend on each other.)
10306 foreach my $property
10307 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10310 # $perl has been defined, but isn't one of the Unicode properties that
10311 # need to be finished up.
10312 next if $property == $perl;
10314 # Handle the properties that have more than one possible default
10315 if (ref $property->default_map) {
10316 my $default_map = $property->default_map;
10318 # These properties have stored in the default_map:
10320 # 1) A default map which applies to all code points in a
10322 # 2) an expression which will evaluate to the list of code
10323 # points in that class
10325 # 3) the default map which applies to every other missing code
10328 # Go through each list.
10329 while (my ($default, $eval) = $default_map->get_next_defaults) {
10331 # Get the class list, and intersect it with all the so-far
10332 # unspecified code points yielding all the code points
10333 # in the class that haven't been specified.
10334 my $list = eval $eval;
10336 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10340 # Narrow down the list to just those code points we don't have
10342 $list = $list & $property->inverse_list;
10344 # Add mappings to the property for each code point in the list
10345 foreach my $range ($list->ranges) {
10346 $property->add_map($range->start, $range->end, $default);
10350 # All remaining code points have the other mapping. Set that up
10351 # so the normal single-default mapping code will work on them
10352 $property->set_default_map($default_map->other_default);
10354 # And fall through to do that
10357 # We should have enough data now to compute the type of the property.
10358 $property->compute_type;
10359 my $property_type = $property->type;
10361 next if ! $property->to_create_match_tables;
10363 # Here want to create match tables for this property
10365 # The Unicode db always (so far, and they claim into the future) have
10366 # the default for missing entries in binary properties be 'N' (unless
10367 # there is a '@missing' line that specifies otherwise)
10368 if ($property_type == $BINARY && ! defined $property->default_map) {
10369 $property->set_default_map('N');
10372 # Add any remaining code points to the mapping, using the default for
10373 # missing code points
10374 if (defined (my $default_map = $property->default_map)) {
10375 foreach my $range ($property->inverse_list->ranges) {
10376 $property->add_map($range->start, $range->end, $default_map);
10379 # Make sure there is a match table for the default
10380 if (! defined $property->table($default_map)) {
10381 $property->add_match_table($default_map);
10385 # Have all we need to populate the match tables.
10386 my $property_name = $property->name;
10387 foreach my $range ($property->ranges) {
10388 my $map = $range->value;
10389 my $table = property_ref($property_name)->table($map);
10390 if (! defined $table) {
10392 # Integral and rational property values are not necessarily
10393 # defined in PropValueAliases, but all other ones should be,
10395 if ($v_version ge v5.1.0
10396 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
10398 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
10400 $table = property_ref($property_name)->add_match_table($map);
10403 $table->add_range($range->start, $range->end);
10406 # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
10407 # all properties have this optional prefix. These do not get a
10408 # separate entry in the pod file, because are covered by a wild-card
10410 foreach my $alias ($property->aliases) {
10411 my $Is_name = 'Is_' . $alias->name;
10412 if (! defined (my $pre_existing = property_ref($Is_name))) {
10413 $property->add_alias($Is_name,
10415 Status => $alias->status,
10416 Externally_Ok => 0);
10420 # It seemed too much work to add in these warnings when it
10421 # appears that Unicode has made a decision never to begin a
10422 # property name with 'Is_', so this shouldn't happen, but just
10423 # in case, it is a warning.
10424 Carp::my_carp(<<END
10425 There is already an alias named $Is_name (from " . $pre_existing . "), so not
10426 creating this alias for $property. The generated table and pod files do not
10427 warn users of this conflict.
10430 $has_Is_conflicts++;
10432 } # End of loop through aliases for this property
10433 } # End of loop through all Unicode properties.
10435 # Fill in the mappings that Unicode doesn't completely furnish. First the
10436 # single letter major general categories. If Unicode were to start
10437 # delivering the values, this would be redundant, but better that than to
10438 # try to figure out if should skip and not get it right. Ths could happen
10439 # if a new major category were to be introduced, and the hard-coded test
10440 # wouldn't know about it.
10441 # This routine depends on the standard names for the general categories
10442 # being what it thinks they are, like 'Cn'. The major categories are the
10443 # union of all the general category tables which have the same first
10444 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
10445 foreach my $minor_table ($gc->tables) {
10446 my $minor_name = $minor_table->name;
10447 next if length $minor_name == 1;
10448 if (length $minor_name != 2) {
10449 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
10453 my $major_name = uc(substr($minor_name, 0, 1));
10454 my $major_table = $gc->table($major_name);
10455 $major_table += $minor_table;
10458 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
10459 # defines it as LC)
10460 my $LC = $gc->table('LC');
10461 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
10462 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
10465 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
10466 # deliver the correct values in it
10467 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
10469 # Lt not in release 1.
10470 $LC += $gc->table('Lt') if defined $gc->table('Lt');
10472 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
10474 my $Cs = $gc->table('Cs');
10476 $Cs->add_note('Mostly not usable in Perl.');
10477 $Cs->add_comment(join_lines(<<END
10478 Surrogates are used exclusively for I/O in UTF-16, and should not appear in
10479 Unicode text, and hence their use will generate (usually fatal) messages
10485 # Folding information was introduced later into Unicode data. To get
10486 # Perl's case ignore (/i) to work at all in releases that don't have
10487 # folding, use the best available alternative, which is lower casing.
10488 my $fold = property_ref('Simple_Case_Folding');
10489 if ($fold->is_empty) {
10490 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
10491 $fold->add_note(join_lines(<<END
10492 WARNING: This table uses lower case as a substitute for missing fold
10498 # Multiple-character mapping was introduced later into Unicode data. If
10499 # missing, use the single-characters maps as best available alternative
10500 foreach my $map (qw { Uppercase_Mapping
10505 my $full = property_ref($map);
10506 if ($full->is_empty) {
10507 my $simple = property_ref('Simple_' . $map);
10508 $full->initialize($simple);
10509 $full->add_comment($simple->comment) if ($simple->comment);
10510 $full->add_note(join_lines(<<END
10511 WARNING: This table uses simple mapping (single-character only) as a
10512 substitute for missing multiple-character information
10520 sub compile_perl() {
10521 # Create perl-defined tables. Almost all are part of the pseudo-property
10522 # named 'perl' internally to this program. Many of these are recommended
10523 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
10524 # on those found there.
10525 # Almost all of these are equivalent to some Unicode property.
10526 # A number of these properties have equivalents restricted to the ASCII
10527 # range, with their names prefaced by 'Posix', to signify that these match
10528 # what the Posix standard says they should match. A couple are
10529 # effectively this, but the name doesn't have 'Posix' in it because there
10530 # just isn't any Posix equivalent.
10532 # 'Any' is all code points. As an error check, instead of just setting it
10533 # to be that, construct it to be the union of all the major categories
10534 my $Any = $perl->add_match_table('Any',
10535 Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
10538 foreach my $major_table ($gc->tables) {
10540 # Major categories are the ones with single letter names.
10541 next if length($major_table->name) != 1;
10543 $Any += $major_table;
10546 if ($Any->max != $LAST_UNICODE_CODEPOINT) {
10547 Carp::my_carp_bug("Generated highest code point ("
10548 . sprintf("%X", $Any->max)
10549 . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
10551 if ($Any->range_count != 1 || $Any->min != 0) {
10552 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
10555 $Any->add_alias('All');
10557 # Assigned is the opposite of gc=unassigned
10558 my $Assigned = $perl->add_match_table('Assigned',
10559 Description => "All assigned code points",
10560 Initialize => ~ $gc->table('Unassigned'),
10563 # Our internal-only property should be treated as more than just a
10565 $perl->add_match_table('_CombAbove')
10566 ->set_equivalent_to(property_ref('ccc')->table('Above'),
10569 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
10570 if (defined $block) { # This is equivalent to the block if have it.
10571 my $Unicode_ASCII = $block->table('Basic_Latin');
10572 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
10573 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
10577 # Very early releases didn't have blocks, so initialize ASCII ourselves if
10579 if ($ASCII->is_empty) {
10580 $ASCII->initialize([ 0..127 ]);
10583 # A number of the Perl synonyms have a restricted-range synonym whose name
10584 # begins with Posix. This hash gets filled in with them, so that they can
10585 # be populated in a small loop.
10586 my %posix_equivalent;
10588 # Get the best available case definitions. Early Unicode versions didn't
10589 # have Uppercase and Lowercase defined, so use the general category
10590 # instead for them.
10591 my $Lower = $perl->add_match_table('Lower');
10592 my $Unicode_Lower = property_ref('Lowercase');
10593 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
10594 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
10597 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
10600 $posix_equivalent{'Lower'} = $Lower;
10602 my $Upper = $perl->add_match_table('Upper');
10603 my $Unicode_Upper = property_ref('Uppercase');
10604 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
10605 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
10608 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
10611 $posix_equivalent{'Upper'} = $Upper;
10613 # Earliest releases didn't have title case. Initialize it to empty if not
10614 # otherwise present
10615 my $Title = $perl->add_match_table('Title');
10616 my $lt = $gc->table('Lt');
10618 $Title->set_equivalent_to($lt, Related => 1);
10621 # If this Unicode version doesn't have Cased, set up our own. From
10622 # Unicode 5.1: Definition D120: A character C is defined to be cased if
10623 # and only if C has the Lowercase or Uppercase property or has a
10624 # General_Category value of Titlecase_Letter.
10625 unless (defined property_ref('Cased')) {
10626 my $cased = $perl->add_match_table('Cased',
10627 Initialize => $Lower + $Upper + $Title,
10628 Description => 'Uppercase or Lowercase or Titlecase',
10632 # Similarly, set up our own Case_Ignorable property if this Unicode
10633 # version doesn't have it. From Unicode 5.1: Definition D121: A character
10634 # C is defined to be case-ignorable if C has the value MidLetter or the
10635 # value MidNumLet for the Word_Break property or its General_Category is
10636 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
10637 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10639 # Perl has long had an internal-only alias for this property.
10640 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
10641 my $case_ignorable = property_ref('Case_Ignorable');
10642 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
10643 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
10648 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
10650 # The following three properties are not in early releases
10651 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
10652 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
10653 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
10655 # For versions 4.1 - 5.0, there is no MidNumLet property, and
10656 # correspondingly the case-ignorable definition lacks that one. For
10657 # 4.0, it appears that it was meant to be the same definition, but was
10658 # inadvertently omitted from the standard's text, so add it if the
10659 # property actually is there
10660 my $wb = property_ref('Word_Break');
10662 my $midlet = $wb->table('MidLetter');
10663 $perl_case_ignorable += $midlet if defined $midlet;
10664 my $midnumlet = $wb->table('MidNumLet');
10665 $perl_case_ignorable += $midnumlet if defined $midnumlet;
10669 # In earlier versions of the standard, instead of the above two
10670 # properties , just the following characters were used:
10671 $perl_case_ignorable += 0x0027 # APOSTROPHE
10672 + 0x00AD # SOFT HYPHEN (SHY)
10673 + 0x2019; # RIGHT SINGLE QUOTATION MARK
10677 # The remaining perl defined tables are mostly based on Unicode TR 18,
10678 # "Annex C: Compatibility Properties". All of these have two versions,
10679 # one whose name generally begins with Posix that is posix-compliant, and
10680 # one that matches Unicode characters beyond the Posix, ASCII range
10682 my $Alpha = $perl->add_match_table('Alpha',
10683 Description => '[[:Alpha:]] extended beyond ASCII');
10685 # Alphabetic was not present in early releases
10686 my $Alphabetic = property_ref('Alphabetic');
10687 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
10688 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
10692 # For early releases, we don't get it exactly right. The below
10693 # includes more than it should, which in 5.2 terms is: L + Nl +
10694 # Other_Alphabetic. Other_Alphabetic contains many characters from
10695 # Mn and Mc. It's better to match more than we should, than less than
10697 $Alpha->initialize($gc->table('Letter')
10699 + $gc->table('Mc'));
10700 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
10702 $posix_equivalent{'Alpha'} = $Alpha;
10704 my $Alnum = $perl->add_match_table('Alnum',
10705 Description => "[[:Alnum:]] extended beyond ASCII",
10706 Initialize => $Alpha + $gc->table('Decimal_Number'),
10708 $posix_equivalent{'Alnum'} = $Alnum;
10710 my $Word = $perl->add_match_table('Word',
10711 Description => '\w, including beyond ASCII',
10712 Initialize => $Alnum + $gc->table('Mark'),
10714 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
10715 $Word += $Pc if defined $Pc;
10717 # There is no [[:Word:]], so the name doesn't begin with Posix.
10718 $perl->add_match_table('PerlWord',
10719 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
10720 Initialize => $Word & $ASCII,
10723 my $Blank = $perl->add_match_table('Blank',
10724 Description => '\h, Horizontal white space',
10726 # 200B is Zero Width Space which is for line
10727 # break control, and was listed as
10728 # Space_Separator in early releases
10729 Initialize => $gc->table('Space_Separator')
10733 $Blank->add_alias('HorizSpace'); # Another name for it.
10734 $posix_equivalent{'Blank'} = $Blank;
10736 my $VertSpace = $perl->add_match_table('VertSpace',
10737 Description => '\v',
10738 Initialize => $gc->table('Line_Separator')
10739 + $gc->table('Paragraph_Separator')
10740 + 0x000A # LINE FEED
10741 + 0x000B # VERTICAL TAB
10742 + 0x000C # FORM FEED
10743 + 0x000D # CARRIAGE RETURN
10746 # No Posix equivalent for vertical space
10748 my $Space = $perl->add_match_table('Space',
10749 Description => '\s including beyond ASCII plus vertical tab = [[:Space:]]',
10750 Initialize => $Blank + $VertSpace,
10752 $posix_equivalent{'Space'} = $Space;
10754 # Perl's traditional space doesn't include Vertical Tab
10755 my $SpacePerl = $perl->add_match_table('SpacePerl',
10756 Description => '\s, including beyond ASCII',
10757 Initialize => $Space - 0x000B,
10759 $perl->add_match_table('PerlSpace',
10760 Description => '\s, restricted to ASCII',
10761 Initialize => $SpacePerl & $ASCII,
10764 my $Cntrl = $perl->add_match_table('Cntrl',
10765 Description => "[[:Cntrl:]] extended beyond ASCII");
10766 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
10767 $posix_equivalent{'Cntrl'} = $Cntrl;
10769 # $controls is a temporary used to construct Graph.
10770 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
10771 + $gc->table('Control'));
10772 # Cs not in release 1
10773 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
10775 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
10776 my $Graph = $perl->add_match_table('Graph',
10777 Description => "[[:Graph:]] extended beyond ASCII",
10778 Initialize => ~ ($Space + $controls),
10780 $posix_equivalent{'Graph'} = $Graph;
10782 my $Print = $perl->add_match_table('Print',
10783 Description => "[[:Print:]] extended beyond ASCII",
10784 Initialize => $Space + $Graph - $gc->table('Control'),
10786 $posix_equivalent{'Print'} = $Print;
10788 my $Punct = $perl->add_match_table('Punct');
10789 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
10791 # \p{punct} doesn't include the symbols, which posix does
10792 $perl->add_match_table('PosixPunct',
10793 Description => "[[:Punct:]]",
10794 Initialize => $ASCII & ($gc->table('Punctuation')
10795 + $gc->table('Symbol')),
10798 my $Digit = $perl->add_match_table('Digit',
10799 Description => '\d, extended beyond just [0-9]');
10800 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
10801 $posix_equivalent{'Digit'} = $Digit;
10803 # AHex was not present in early releases
10804 my $Xdigit = $perl->add_match_table('XDigit',
10805 Description => '[0-9A-Fa-f]');
10806 my $AHex = property_ref('ASCII_Hex_Digit');
10807 if (defined $AHex && ! $AHex->is_empty) {
10808 $Xdigit->set_equivalent_to($AHex->table('Y'), Related => 1);
10811 # (Have to use hex because could be running on an non-ASCII machine,
10812 # and we want the Unicode (ASCII) values)
10813 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66 ]);
10816 # Now, add the ASCII-restricted tables that get uniform treatment
10817 while (my ($name, $table) = each %posix_equivalent) {
10818 $perl->add_match_table("Posix$name",
10819 Description => "[[:$name:]]",
10820 Initialize => $table & $ASCII,
10823 $perl->table('PosixDigit')->add_description('\d, restricted to ASCII');
10824 $perl->table('PosixDigit')->add_description('[0-9]');
10827 my $dt = property_ref('Decomposition_Type');
10828 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
10829 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
10830 Perl_Extension => 1,
10831 Note => 'Perl extension consisting of the union of all non-canonical decompositions',
10834 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
10835 # than SD appeared, construct it ourselves, based on the first release SD
10837 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
10838 my $soft_dotted = property_ref('Soft_Dotted');
10839 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
10840 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
10844 # This list came from 3.2 Soft_Dotted.
10845 $CanonDCIJ->initialize([ 0x0069,
10854 $CanonDCIJ = $CanonDCIJ & $Assigned;
10857 # These are used in Unicode's definition of \X
10858 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
10859 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
10861 my $gcb = property_ref('Grapheme_Cluster_Break');
10863 # The 'extended' grapheme cluster came in 5.1. The non-extended
10864 # definition differs too much from the traditional Perl one to use.
10865 if (defined $gcb && defined $gcb->table('SpacingMark')) {
10867 # Note that assumes HST is defined; it came in an earlier release than
10868 # GCB. In the line below, two negatives means: yes hangul
10869 $begin += ~ property_ref('Hangul_Syllable_Type')
10870 ->table('Not_Applicable')
10871 + ~ ($gcb->table('Control')
10872 + $gcb->table('CR')
10873 + $gcb->table('LF'));
10874 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
10876 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
10877 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
10879 else { # Old definition, used on early releases.
10880 $extend += $gc->table('Mark')
10883 $begin += ~ $extend;
10885 # Here we may have a release that has the regular grapheme cluster
10886 # defined, or a release that doesn't have anything defined.
10887 # We set things up so the Perl core degrades gracefully, possibly with
10888 # placeholders that match nothing.
10890 if (! defined $gcb) {
10891 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
10893 my $hst = property_ref('HST');
10894 if (!defined $hst) {
10895 $hst = Property->new('HST', Status => $PLACEHOLDER);
10896 $hst->add_match_table('Not_Applicable',
10897 Initialize => $Any,
10901 # On some releases, here we may not have the needed tables for the
10902 # perl core, in some releases we may.
10903 foreach my $name (qw{ L LV LVT T V prepend }) {
10904 my $table = $gcb->table($name);
10905 if (! defined $table) {
10906 $table = $gcb->add_match_table($name);
10907 push @tables_that_may_be_empty, $table->complete_name;
10910 # The HST property predates the GCB one, and has identical tables
10911 # for some of them, so use it if we can.
10912 if ($table->is_empty
10914 && defined $hst->table($name))
10916 $table += $hst->table($name);
10921 # More GCB. If we found some hangul syllables, populate a combined
10923 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
10924 my $LV = $gcb->table('LV');
10925 if ($LV->is_empty) {
10926 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
10928 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
10929 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
10932 # Create a new property specially located that is a combination of the
10933 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10934 # Name_Alias properties. (The final duplicates elements of the first.) A
10935 # comment for it is constructed based on the actual properties present and
10937 my $perl_charname = Property->new('Perl_Charnames',
10938 Core_Access => '\N{...} and charnames.pm',
10940 Directory => File::Spec->curdir(),
10942 Internal_Only_Warning => 1,
10943 Perl_Extension => 1,
10946 Initialize => property_ref('Unicode_1_Name'),
10948 # Name overrides Unicode_1_Name
10949 $perl_charname->property_add_or_replace_non_nulls(property_ref('Name'));
10950 my @composition = ('Name', 'Unicode_1_Name');
10952 if (@named_sequences) {
10953 push @composition, 'Named_Sequence';
10954 foreach my $sequence (@named_sequences) {
10955 $perl_charname->add_anomalous_entry($sequence);
10959 my $alias_sentence = "";
10960 my $alias = property_ref('Name_Alias');
10961 if (defined $alias) {
10962 push @composition, 'Name_Alias';
10963 $alias->reset_each_range;
10964 while (my ($range) = $alias->each_range) {
10965 next if $range->value eq "";
10966 if ($range->start != $range->end) {
10967 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
10969 $perl_charname->add_duplicate($range->start, $range->value);
10971 $alias_sentence = <<END;
10972 The Name_Alias property adds duplicate code point entries with a corrected
10973 name. The original (less correct, but still valid) name will be physically
10978 if (@composition <= 2) { # Always at least 2
10979 $comment = join " and ", @composition;
10982 $comment = join ", ", @composition[0 .. scalar @composition - 2];
10983 $comment .= ", and $composition[-1]";
10986 # Wait for charnames to catch up
10987 # foreach my $entry (@more_Names,
10988 # split "\n", <<"END"
10996 #FEFF; BYTE ORDER MARK
10999 # #local $to_trace = 1 if main::DEBUG;
11000 # trace $entry if main::DEBUG && $to_trace;
11001 # my ($code_point, $name) = split /\s*;\s*/, $entry;
11002 # $code_point = hex $code_point;
11003 # trace $code_point, $name if main::DEBUG && $to_trace;
11004 # $perl_charname->add_duplicate($code_point, $name);
11006 # #$perl_charname->add_comment("This file is for charnames.pm. It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'. Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence");
11007 $perl_charname->add_comment(join_lines( <<END
11008 This file is for charnames.pm. It is the union of the $comment properties.
11009 Unicode_1_Name entries are used only for otherwise nameless code
11015 # The combining class property used by Perl's normalize.pm is not located
11016 # in the normal mapping directory; create a copy for it.
11017 my $ccc = property_ref('Canonical_Combining_Class');
11018 my $perl_ccc = Property->new('Perl_ccc',
11019 Default_Map => $ccc->default_map,
11020 Full_Name => 'Perl_Canonical_Combining_Class',
11021 Internal_Only_Warning => 1,
11022 Perl_Extension => 1,
11025 Initialize => $ccc,
11026 File => 'CombiningClass',
11027 Directory => File::Spec->curdir(),
11029 $perl_ccc->set_to_output_map(1);
11030 $perl_ccc->add_comment(join_lines(<<END
11031 This mapping is for normalize.pm. It is currently identical to the Unicode
11032 Canonical_Combining_Class property.
11036 # This one match table for it is needed for calculations on output
11037 my $default = $perl_ccc->add_match_table($ccc->default_map,
11038 Initialize => $ccc->table($ccc->default_map),
11039 Status => $SUPPRESSED);
11041 # Construct the Present_In property from the Age property.
11042 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
11043 my $default_map = $age->default_map;
11044 my $in = Property->new('In',
11045 Default_Map => $default_map,
11046 Full_Name => "Present_In",
11047 Internal_Only_Warning => 1,
11048 Perl_Extension => 1,
11050 Initialize => $age,
11052 $in->add_comment(join_lines(<<END
11053 This file should not be used for any purpose. The values in this file are the
11054 same as for $age, and not for what $in really means. This is because anything
11055 defined in a given release should have multiple values: that release and all
11056 higher ones. But only one value per code point can be represented in a table
11061 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
11062 # lowest numbered (earliest) come first, with the non-numeric one
11064 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
11066 : ($b->name !~ /^[\d.]*$/)
11068 : $a->name <=> $b->name
11071 # The Present_In property is the cumulative age properties. The first
11072 # one hence is identical to the first age one.
11073 my $previous_in = $in->add_match_table($first_age->name);
11074 $previous_in->set_equivalent_to($first_age, Related => 1);
11076 my $description_start = "Code point's usage introduced in version ";
11077 $first_age->add_description($description_start . $first_age->name);
11079 # To construct the accumlated values, for each of the age tables
11080 # starting with the 2nd earliest, merge the earliest with it, to get
11081 # all those code points existing in the 2nd earliest. Repeat merging
11082 # the new 2nd earliest with the 3rd earliest to get all those existing
11083 # in the 3rd earliest, and so on.
11084 foreach my $current_age (@rest_ages) {
11085 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
11087 my $current_in = $in->add_match_table(
11088 $current_age->name,
11089 Initialize => $current_age + $previous_in,
11090 Description => $description_start
11091 . $current_age->name
11094 $previous_in = $current_in;
11096 # Add clarifying material for the corresponding age file. This is
11097 # in part because of the confusing and contradictory information
11098 # given in the Standard's documentation itself, as of 5.2.
11099 $current_age->add_description(
11100 "Code point's usage was introduced in version "
11101 . $current_age->name);
11102 $current_age->add_note("See also $in");
11106 # And finally the code points whose usages have yet to be decided are
11107 # the same in both properties. Note that permanently unassigned code
11108 # points actually have their usage assigned (as being permanently
11109 # unassigned), so that these tables are not the same as gc=cn.
11110 my $unassigned = $in->add_match_table($default_map);
11111 my $age_default = $age->table($default_map);
11112 $age_default->add_description(<<END
11113 Code point's usage has not been assigned in any Unicode release thus far.
11116 $unassigned->set_equivalent_to($age_default, Related => 1);
11120 # Finished creating all the perl properties. All non-internal non-string
11121 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
11122 # an underscore.) These do not get a separate entry in the pod file
11123 foreach my $table ($perl->tables) {
11124 foreach my $alias ($table->aliases) {
11125 next if $alias->name =~ /^_/;
11126 $table->add_alias('Is_' . $alias->name,
11128 Status => $alias->status,
11129 Externally_Ok => 0);
11136 sub add_perl_synonyms() {
11137 # A number of Unicode tables have Perl synonyms that are expressed in
11138 # the single-form, \p{name}. These are:
11139 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11140 # \p{Is_Name} as synonyms
11141 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11142 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11143 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11144 # conflict, \p{Value} and \p{Is_Value} as well
11146 # This routine generates these synonyms, warning of any unexpected
11149 # Construct the list of tables to get synonyms for. Start with all the
11150 # binary and the General_Category ones.
11151 my @tables = grep { $_->type == $BINARY } property_ref('*');
11152 push @tables, $gc->tables;
11154 # If the version of Unicode includes the Script property, add its tables
11155 if (defined property_ref('Script')) {
11156 push @tables, property_ref('Script')->tables;
11159 # The Block tables are kept separate because they are treated differently.
11160 # And the earliest versions of Unicode didn't include them, so add only if
11163 push @blocks, $block->tables if defined $block;
11165 # Here, have the lists of tables constructed. Process blocks last so that
11166 # if there are name collisions with them, blocks have lowest priority.
11167 # Should there ever be other collisions, manual intervention would be
11168 # required. See the comments at the beginning of the program for a
11169 # possible way to handle those semi-automatically.
11170 foreach my $table (@tables, @blocks) {
11172 # For non-binary properties, the synonym is just the name of the
11173 # table, like Greek, but for binary properties the synonym is the name
11174 # of the property, and means the code points in its 'Y' table.
11175 my $nominal = $table;
11176 my $nominal_property = $nominal->property;
11178 if (! $nominal->isa('Property')) {
11183 # Here is a binary property. Use the 'Y' table. Verify that is
11185 my $yes = $nominal->table('Y');
11186 unless (defined $yes) { # Must be defined, but is permissible to
11188 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
11194 foreach my $alias ($nominal->aliases) {
11196 # Attempt to create a table in the perl directory for the
11197 # candidate table, using whatever aliases in it that don't
11198 # conflict. Also add non-conflicting aliases for all these
11199 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11201 foreach my $prefix ("", 'Is_', 'In_') {
11203 # Only Block properties can have added 'In_' aliases.
11204 next if $prefix eq 'In_' and $nominal_property != $block;
11206 my $proposed_name = $prefix . $alias->name;
11208 # No Is_Is, In_In, nor combinations thereof
11209 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11210 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11212 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11214 # Get a reference to any existing table in the perl
11215 # directory with the desired name.
11216 my $pre_existing = $perl->table($proposed_name);
11218 if (! defined $pre_existing) {
11220 # No name collision, so ok to add the perl synonym.
11222 my $make_pod_entry;
11224 my $status = $actual->status;
11225 if ($nominal_property == $block) {
11227 # For block properties, the 'In' form is preferred for
11228 # external use; the pod file contains wild cards for
11229 # this and the 'Is' form so no entries for those; and
11230 # we don't want people using the name without the
11231 # 'In', so discourage that.
11232 if ($prefix eq "") {
11233 $make_pod_entry = 1;
11234 $status = $status || $DISCOURAGED;
11235 $externally_ok = 0;
11237 elsif ($prefix eq 'In_') {
11238 $make_pod_entry = 0;
11239 $status = $status || $NORMAL;
11240 $externally_ok = 1;
11243 $make_pod_entry = 0;
11244 $status = $status || $DISCOURAGED;
11245 $externally_ok = 0;
11248 elsif ($prefix ne "") {
11250 # The 'Is' prefix is handled in the pod by a wild
11251 # card, and we won't use it for an external name
11252 $make_pod_entry = 0;
11253 $status = $status || $NORMAL;
11254 $externally_ok = 0;
11258 # Here, is an empty prefix, non block. This gets its
11259 # own pod entry and can be used for an external name.
11260 $make_pod_entry = 1;
11261 $status = $status || $NORMAL;
11262 $externally_ok = 1;
11265 # Here, there isn't a perl pre-existing table with the
11266 # name. Look through the list of equivalents of this
11267 # table to see if one is a perl table.
11268 foreach my $equivalent ($actual->leader->equivalents) {
11269 next if $equivalent->property != $perl;
11271 # Here, have found a table for $perl. Add this alias
11272 # to it, and are done with this prefix.
11273 $equivalent->add_alias($proposed_name,
11274 Pod_Entry => $make_pod_entry,
11276 Externally_Ok => $externally_ok);
11277 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11281 # Here, $perl doesn't already have a table that is a
11282 # synonym for this property, add one.
11283 my $added_table = $perl->add_match_table($proposed_name,
11284 Pod_Entry => $make_pod_entry,
11286 Externally_Ok => $externally_ok);
11287 # And it will be related to the actual table, since it is
11289 $added_table->set_equivalent_to($actual, Related => 1);
11290 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11292 } # End of no pre-existing.
11294 # Here, there is a pre-existing table that has the proposed
11295 # name. We could be in trouble, but not if this is just a
11296 # synonym for another table that we have already made a child
11297 # of the pre-existing one.
11298 if ($pre_existing->is_equivalent_to($actual)) {
11299 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11300 $pre_existing->add_alias($proposed_name);
11304 # Here, there is a name collision, but it still could be ok if
11305 # the tables match the identical set of code points, in which
11306 # case, we can combine the names. Compare each table's code
11307 # point list to see if they are identical.
11308 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11309 if ($pre_existing->matches_identically_to($actual)) {
11311 # Here, they do match identically. Not a real conflict.
11312 # Make the perl version a child of the Unicode one, except
11313 # in the non-obvious case of where the perl name is
11314 # already a synonym of another Unicode property. (This is
11315 # excluded by the test for it being its own parent.) The
11316 # reason for this exclusion is that then the two Unicode
11317 # properties become related; and we don't really know if
11318 # they are or not. We generate documentation based on
11319 # relatedness, and this would be misleading. Code
11320 # later executed in the process will cause the tables to
11321 # be represented by a single file anyway, without making
11322 # it look in the pod like they are necessarily related.
11323 if ($pre_existing->parent == $pre_existing
11324 && ($pre_existing->property == $perl
11325 || $actual->property == $perl))
11327 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
11328 $pre_existing->set_equivalent_to($actual, Related => 1);
11330 elsif (main::DEBUG && $to_trace) {
11331 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
11332 trace $pre_existing->parent;
11337 # Here they didn't match identically, there is a real conflict
11338 # between our new name and a pre-existing property.
11339 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
11340 $pre_existing->add_conflicting($nominal->full_name,
11344 # Don't output a warning for aliases for the block
11345 # properties (unless they start with 'In_') as it is
11346 # expected that there will be conflicts and the block
11348 if ($verbosity >= $NORMAL_VERBOSITY
11349 && ($actual->property != $block || $prefix eq 'In_'))
11351 print simple_fold(join_lines(<<END
11352 There is already an alias named $proposed_name (from " . $pre_existing . "),
11353 so not creating this alias for " . $actual
11358 # Keep track for documentation purposes.
11359 $has_In_conflicts++ if $prefix eq 'In_';
11360 $has_Is_conflicts++ if $prefix eq 'Is_';
11365 # There are some properties which have No and Yes (and N and Y) as
11366 # property values, but aren't binary, and could possibly be confused with
11367 # binary ones. So create caveats for them. There are tables that are
11368 # named 'No', and tables that are named 'N', but confusion is not likely
11369 # unless they are the same table. For example, N meaning Number or
11370 # Neutral is not likely to cause confusion, so don't add caveats to things
11372 foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
11373 my $yes = $property->table('Yes');
11374 if (defined $yes) {
11375 my $y = $property->table('Y');
11376 if (defined $y && $yes == $y) {
11377 foreach my $alias ($property->aliases) {
11378 $yes->add_conflicting($alias->name);
11382 my $no = $property->table('No');
11384 my $n = $property->table('N');
11385 if (defined $n && $no == $n) {
11386 foreach my $alias ($property->aliases) {
11387 $no->add_conflicting($alias->name, 'P');
11396 sub register_file_for_name($$$) {
11397 # Given info about a table and a datafile that it should be associated
11398 # with, register that assocation
11401 my $directory_ref = shift; # Array of the directory path for the file
11402 my $file = shift; # The file name in the final directory, [-1].
11403 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11405 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
11407 if ($table->isa('Property')) {
11408 $table->set_file_path(@$directory_ref, $file);
11409 push @map_properties, $table
11410 if $directory_ref->[0] eq $map_directory;
11414 # Do all of the work for all equivalent tables when called with the leader
11415 # table, so skip if isn't the leader.
11416 return if $table->leader != $table;
11418 # Join all the file path components together, using slashes.
11419 my $full_filename = join('/', @$directory_ref, $file);
11421 # All go in the same subdirectory of unicore
11422 if ($directory_ref->[0] ne $matches_directory) {
11423 Carp::my_carp("Unexpected directory in "
11424 . join('/', @{$directory_ref}, $file));
11427 # For this table and all its equivalents ...
11428 foreach my $table ($table, $table->equivalents) {
11430 # Associate it with its file internally. Don't include the
11431 # $matches_directory first component
11432 $table->set_file_path(@$directory_ref, $file);
11433 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
11435 my $property = $table->property;
11436 $property = ($property == $perl)
11437 ? "" # 'perl' is never explicitly stated
11438 : standardize($property->name) . '=';
11440 my $deprecated = ($table->status eq $DEPRECATED)
11441 ? $table->status_info
11444 # And for each of the table's aliases... This inner loop eventually
11445 # goes through all aliases in the UCD that we generate regex match
11447 foreach my $alias ($table->aliases) {
11448 my $name = $alias->name;
11450 # Generate an entry in either the loose or strict hashes, which
11451 # will translate the property and alias names combination into the
11452 # file where the table for them is stored.
11454 if ($alias->loose_match) {
11455 $standard = $property . standardize($alias->name);
11456 if (exists $loose_to_file_of{$standard}) {
11457 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
11460 $loose_to_file_of{$standard} = $sub_filename;
11464 $standard = lc ($property . $name);
11465 if (exists $stricter_to_file_of{$standard}) {
11466 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
11469 $stricter_to_file_of{$standard} = $sub_filename;
11471 # Tightly coupled with how utf8_heavy.pl works, for a
11472 # floating point number that is a whole number, get rid of
11473 # the trailing decimal point and 0's, so that utf8_heavy
11474 # will work. Also note that this assumes that such a
11475 # number is matched strictly; so if that were to change,
11476 # this would be wrong.
11477 if ((my $integer_name = $name)
11478 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
11480 $stricter_to_file_of{$property . $integer_name}
11486 # Keep a list of the deprecated properties and their filenames
11488 $utf8::why_deprecated{$sub_filename} = $deprecated;
11497 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
11499 my %full_dir_name_of; # Full length names of directories used.
11501 sub construct_filename($$$) {
11502 # Return a file name for a table, based on the table name, but perhaps
11503 # changed to get rid of non-portable characters in it, and to make
11504 # sure that it is unique on a file system that allows the names before
11505 # any period to be at most 8 characters (DOS). While we're at it
11506 # check and complain if there are any directory conflicts.
11508 my $name = shift; # The name to start with
11509 my $mutable = shift; # Boolean: can it be changed? If no, but
11510 # yet it must be to work properly, a warning
11512 my $directories_ref = shift; # A reference to an array containing the
11513 # path to the file, with each element one path
11514 # component. This is used because the same
11515 # name can be used in different directories.
11516 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11518 my $warn = ! defined wantarray; # If true, then if the name is
11519 # changed, a warning is issued as well.
11521 if (! defined $name) {
11522 Carp::my_carp("Undefined name in directory "
11523 . File::Spec->join(@$directories_ref)
11528 # Make sure that no directory names conflict with each other. Look at
11529 # each directory in the input file's path. If it is already in use,
11530 # assume it is correct, and is merely being re-used, but if we
11531 # truncate it to 8 characters, and find that there are two directories
11532 # that are the same for the first 8 characters, but differ after that,
11533 # then that is a problem.
11534 foreach my $directory (@$directories_ref) {
11535 my $short_dir = substr($directory, 0, 8);
11536 if (defined $full_dir_name_of{$short_dir}) {
11537 next if $full_dir_name_of{$short_dir} eq $directory;
11538 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
11541 $full_dir_name_of{$short_dir} = $directory;
11545 my $path = join '/', @$directories_ref;
11546 $path .= '/' if $path;
11548 # Remove interior underscores.
11549 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
11551 # Change any non-word character into an underscore, and truncate to 8.
11552 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
11553 substr($filename, 8) = "" if length($filename) > 8;
11555 # Make sure the basename doesn't conflict with something we
11556 # might have already written. If we have, say,
11563 while (my $num = $base_names{$path}{lc $filename}++) {
11564 $num++; # so basenames with numbers start with '2', which
11565 # just looks more natural.
11567 # Want to append $num, but if it'll make the basename longer
11568 # than 8 characters, pre-truncate $filename so that the result
11570 my $delta = length($filename) + length($num) - 8;
11572 substr($filename, -$delta) = $num;
11577 if ($warn && ! $warned) {
11579 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
11583 return $filename if $mutable;
11585 # If not changeable, must return the input name, but warn if needed to
11586 # change it beyond shortening it.
11587 if ($name ne $filename
11588 && substr($name, 0, length($filename)) ne $filename) {
11589 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
11595 # The pod file contains a very large table. Many of the lines in that table
11596 # would exceed a typical output window's size, and so need to be wrapped with
11597 # a hanging indent to make them look good. The pod language is really
11598 # insufficient here. There is no general construct to do that in pod, so it
11599 # is done here by beginning each such line with a space to cause the result to
11600 # be output without formatting, and doing all the formatting here. This leads
11601 # to the result that if the eventual display window is too narrow it won't
11602 # look good, and if the window is too wide, no advantage is taken of that
11603 # extra width. A further complication is that the output may be indented by
11604 # the formatter so that there is less space than expected. What I (khw) have
11605 # done is to assume that that indent is a particular number of spaces based on
11606 # what it is in my Linux system; people can always resize their windows if
11607 # necessary, but this is obviously less than desirable, but the best that can
11609 my $automatic_pod_indent = 8;
11611 # Try to format so that uses fewest lines, but few long left column entries
11612 # slide into the right column. An experiment on 5.1 data yielded the
11613 # following percentages that didn't cut into the other side along with the
11614 # associated first-column widths
11616 # 80% not too bad except for a few blocks
11617 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
11619 my $indent_info_column = 27; # 75% of lines didn't have overlap
11621 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
11622 # The 3 is because of:
11623 # 1 for the leading space to tell the pod formatter to
11626 # 1 for the space between the flag and the main data
11628 sub format_pod_line ($$$;$$) {
11629 # Take a pod line and return it, formatted properly
11631 my $first_column_width = shift;
11632 my $entry = shift; # Contents of left column
11633 my $info = shift; # Contents of right column
11635 my $status = shift || ""; # Any flag
11637 my $loose_match = shift; # Boolean.
11638 $loose_match = 1 unless defined $loose_match;
11640 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11643 $flags .= $STRICTER if ! $loose_match;
11645 $flags .= $status if $status;
11647 # There is a blank in the left column to cause the pod formatter to
11648 # output the line as-is.
11649 return sprintf " %-*s%-*s %s\n",
11650 # The first * in the format is replaced by this, the -1 is
11651 # to account for the leading blank. There isn't a
11652 # hard-coded blank after this to separate the flags from
11653 # the rest of the line, so that in the unlikely event that
11654 # multiple flags are shown on the same line, they both
11655 # will get displayed at the expense of that separation,
11656 # but since they are left justified, a blank will be
11657 # inserted in the normal case.
11661 # The other * in the format is replaced by this number to
11662 # cause the first main column to right fill with blanks.
11663 # The -1 is for the guaranteed blank following it.
11664 $first_column_width - $FILLER - 1,
11669 my @zero_match_tables; # List of tables that have no matches in this release
11671 sub make_table_pod_entries($) {
11672 # This generates the entries for the pod file for a given table.
11673 # Also done at this time are any children tables. The output looks like:
11674 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
11676 my $input_table = shift; # Table the entry is for
11677 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11679 # Generate parent and all its children at the same time.
11680 return if $input_table->parent != $input_table;
11682 my $property = $input_table->property;
11683 my $type = $property->type;
11684 my $full_name = $property->full_name;
11686 my $count = $input_table->count;
11687 my $string_count = clarify_number($count);
11688 my $status = $input_table->status;
11689 my $status_info = $input_table->status_info;
11691 my $entry_for_first_table; # The entry for the first table output.
11692 # Almost certainly, it is the parent.
11694 # For each related table (including itself), we will generate a pod entry
11695 # for each name each table goes by
11696 foreach my $table ($input_table, $input_table->children) {
11698 # utf8_heavy.pl cannot deal with null string property values, so don't
11700 next if $table->name eq "";
11702 # First, gather all the info that applies to this table as a whole.
11704 push @zero_match_tables, $table if $count == 0;
11706 my $table_property = $table->property;
11708 # The short name has all the underscores removed, while the full name
11709 # retains them. Later, we decide whether to output a short synonym
11710 # for the full one, we need to compare apples to apples, so we use the
11711 # short name's length including underscores.
11712 my $table_property_short_name_length;
11713 my $table_property_short_name
11714 = $table_property->short_name(\$table_property_short_name_length);
11715 my $table_property_full_name = $table_property->full_name;
11717 # Get how much savings there is in the short name over the full one
11718 # (delta will always be <= 0)
11719 my $table_property_short_delta = $table_property_short_name_length
11720 - length($table_property_full_name);
11721 my @table_description = $table->description;
11722 my @table_note = $table->note;
11724 # Generate an entry for each alias in this table.
11725 my $entry_for_first_alias; # saves the first one encountered.
11726 foreach my $alias ($table->aliases) {
11728 # Skip if not to go in pod.
11729 next unless $alias->make_pod_entry;
11731 # Start gathering all the components for the entry
11732 my $name = $alias->name;
11734 my $entry; # Holds the left column, may include extras
11735 my $entry_ref; # To refer to the left column's contents from
11736 # another entry; has no extras
11738 # First the left column of the pod entry. Tables for the $perl
11739 # property always use the single form.
11740 if ($table_property == $perl) {
11741 $entry = "\\p{$name}";
11742 $entry_ref = "\\p{$name}";
11744 else { # Compound form.
11746 # Only generate one entry for all the aliases that mean true
11747 # or false in binary properties. Append a '*' to indicate
11748 # some are missing. (The heading comment notes this.)
11749 my $wild_card_mark;
11750 if ($type == $BINARY) {
11751 next if $name ne 'N' && $name ne 'Y';
11752 $wild_card_mark = '*';
11755 $wild_card_mark = "";
11758 # Colon-space is used to give a little more space to be easier
11761 . $table_property_full_name
11762 . ": $name$wild_card_mark}";
11764 # But for the reference to this entry, which will go in the
11765 # right column, where space is at a premium, use equals
11767 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
11770 # Then the right (info) column. This is stored as components of
11771 # an array for the moment, then joined into a string later. For
11772 # non-internal only properties, begin the info with the entry for
11773 # the first table we encountered (if any), as things are ordered
11774 # so that that one is the most descriptive. This leads to the
11775 # info column of an entry being a more descriptive version of the
11778 if ($name =~ /^_/) {
11780 '(For internal use by Perl, not necessarily stable)';
11782 elsif ($entry_for_first_alias) {
11783 push @info, $entry_for_first_alias;
11786 # If this entry is equivalent to another, add that to the info,
11787 # using the first such table we encountered
11788 if ($entry_for_first_table) {
11790 push @info, "(= $entry_for_first_table)";
11793 push @info, $entry_for_first_table;
11797 # If the name is a large integer, add an equivalent with an
11798 # exponent for better readability
11799 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
11800 push @info, sprintf "(= %.1e)", $name
11803 my $parenthesized = "";
11804 if (! $entry_for_first_alias) {
11806 # This is the first alias for the current table. The alias
11807 # array is ordered so that this is the fullest, most
11808 # descriptive alias, so it gets the fullest info. The other
11809 # aliases are mostly merely pointers to this one, using the
11810 # information already added above.
11812 # Display any status message, but only on the parent table
11813 if ($status && ! $entry_for_first_table) {
11814 push @info, $status_info;
11817 # Put out any descriptive info
11818 if (@table_description || @table_note) {
11819 push @info, join "; ", @table_description, @table_note;
11822 # Look to see if there is a shorter name we can point people
11824 my $standard_name = standardize($name);
11826 my $proposed_short = $table->short_name;
11827 if (defined $proposed_short) {
11828 my $standard_short = standardize($proposed_short);
11830 # If the short name is shorter than the standard one, or
11831 # even it it's not, but the combination of it and its
11832 # short property name (as in \p{prop=short} ($perl doesn't
11833 # have this form)) saves at least two characters, then,
11834 # cause it to be listed as a shorter synonym.
11835 if (length $standard_short < length $standard_name
11836 || ($table_property != $perl
11837 && (length($standard_short)
11838 - length($standard_name)
11839 + $table_property_short_delta) # (<= 0)
11842 $short_name = $proposed_short;
11843 if ($table_property != $perl) {
11844 $short_name = $table_property_short_name
11847 $short_name = "\\p{$short_name}";
11851 # And if this is a compound form name, see if there is a
11852 # single form equivalent
11854 if ($table_property != $perl) {
11856 # Special case the binary N tables, so that will print
11857 # \P{single}, but use the Y table values to populate
11858 # 'single', as we haven't populated the N table.
11861 if ($type == $BINARY
11862 && $input_table == $property->table('No'))
11864 $test_table = $property->table('Yes');
11868 $test_table = $input_table;
11872 # Look for a single form amongst all the children.
11873 foreach my $table ($test_table->children) {
11874 next if $table->property != $perl;
11875 my $proposed_name = $table->short_name;
11876 next if ! defined $proposed_name;
11878 # Don't mention internal-only properties as a possible
11879 # single form synonym
11880 next if substr($proposed_name, 0, 1) eq '_';
11882 $proposed_name = "\\$p\{$proposed_name}";
11883 if (! defined $single_form
11884 || length($proposed_name) < length $single_form)
11886 $single_form = $proposed_name;
11888 # The goal here is to find a single form; not the
11889 # shortest possible one. We've already found a
11890 # short name. So, stop at the first single form
11891 # found, which is likely to be closer to the
11898 # Ouput both short and single in the same parenthesized
11899 # expression, but with only one of 'Single', 'Short' if there
11901 if ($short_name || $single_form || $table->conflicting) {
11902 $parenthesized .= '(';
11903 $parenthesized .= "Short: $short_name" if $short_name;
11904 if ($short_name && $single_form) {
11905 $parenthesized .= ', ';
11907 elsif ($single_form) {
11908 $parenthesized .= 'Single: ';
11910 $parenthesized .= $single_form if $single_form;
11915 # Warn if this property isn't the same as one that a
11916 # semi-casual user might expect. The other components of this
11917 # parenthesized structure are calculated only for the first entry
11918 # for this table, but the conflicting is deemed important enough
11919 # to go on every entry.
11920 my $conflicting = join " NOR ", $table->conflicting;
11921 if ($conflicting) {
11922 $parenthesized .= '(' if ! $parenthesized;
11923 $parenthesized .= '; ' if $parenthesized ne '(';
11924 $parenthesized .= "NOT $conflicting";
11926 $parenthesized .= ')' if $parenthesized;
11928 push @info, $parenthesized if $parenthesized;
11929 push @info, "($string_count)" if $output_range_counts;
11931 # Now, we have both the entry and info so add them to the
11932 # list of all the properties.
11933 push @match_properties,
11934 format_pod_line($indent_info_column,
11938 $alias->loose_match);
11940 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
11941 } # End of looping through the aliases for this table.
11943 if (! $entry_for_first_table) {
11944 $entry_for_first_table = $entry_for_first_alias;
11946 } # End of looping through all the related tables
11950 sub pod_alphanumeric_sort {
11951 # Sort pod entries alphanumerically.
11953 # The first few character columns are filler, plus the '\p{'; and get rid
11954 # of all the trailing stuff, starting with the trailing '}', so as to sort
11955 # on just 'Name=Value'
11956 (my $a = lc $a) =~ s/^ .*? { //x;
11958 (my $b = lc $b) =~ s/^ .*? { //x;
11961 # Determine if the two operands are both internal only or both not.
11962 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
11963 # should be the underscore that begins internal only
11964 my $a_is_internal = (substr($a, 0, 1) eq '_');
11965 my $b_is_internal = (substr($b, 0, 1) eq '_');
11967 # Sort so the internals come last in the table instead of first (which the
11968 # leading underscore would otherwise indicate).
11969 if ($a_is_internal != $b_is_internal) {
11970 return 1 if $a_is_internal;
11974 # Determine if the two operands are numeric property values or not.
11975 # A numeric property will look like xyz: 3. But the number
11976 # can begin with an optional minus sign, and may have a
11977 # fraction or rational component, like xyz: 3/2. If either
11978 # isn't numeric, use alphabetic sort.
11979 my ($a_initial, $a_number) =
11980 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
11981 return $a cmp $b unless defined $a_number;
11982 my ($b_initial, $b_number) =
11983 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
11984 return $a cmp $b unless defined $b_number;
11986 # Here they are both numeric, but use alphabetic sort if the
11987 # initial parts don't match
11988 return $a cmp $b if $a_initial ne $b_initial;
11990 # Convert rationals to floating for the comparison.
11991 $a_number = eval $a_number if $a_number =~ qr{/};
11992 $b_number = eval $b_number if $b_number =~ qr{/};
11994 return $a_number <=> $b_number;
11998 # Create the .pod file. This generates the various subsections and then
11999 # combines them in one big HERE document.
12001 return unless defined $pod_directory;
12002 print "Making pod file\n" if $verbosity >= $PROGRESS;
12004 my $exception_message =
12005 '(Any exceptions are individually noted beginning with the word NOT.)';
12007 if (-e 'Blocks.txt') {
12009 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
12010 # if the global $has_In_conflicts indicates we have them.
12011 push @match_properties, format_pod_line($indent_info_column,
12014 . (($has_In_conflicts)
12015 ? " $exception_message"
12017 @block_warning = << "END";
12019 Matches in the Block property have shortcuts that begin with 'In_'. For
12020 example, \\p{Block=Latin1} can be written as \\p{In_Latin1}. For backward
12021 compatibility, if there is no conflict with another shortcut, these may also
12022 be written as \\p{Latin1} or \\p{Is_Latin1}. But, N.B., there are numerous
12023 such conflicting shortcuts. Use of these forms for Block is discouraged, and
12024 are flagged as such, not only because of the potential confusion as to what is
12025 meant, but also because a later release of Unicode may preempt the shortcut,
12026 and your program would no longer be correct. Use the 'In_' form instead to
12027 avoid this, or even more clearly, use the compound form, e.g.,
12028 \\p{blk:latin1}. See L<perlunicode/"Blocks"> for more information about this.
12031 my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
12032 $text = "$exception_message $text" if $has_Is_conflicts;
12034 # And the 'Is_ line';
12035 push @match_properties, format_pod_line($indent_info_column,
12039 # Sort the properties array for output. It is sorted alphabetically
12040 # except numerically for numeric properties, and only output unique lines.
12041 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
12043 my $formatted_properties = simple_fold(\@match_properties,
12045 # indent succeeding lines by two extra
12046 # which looks better
12047 $indent_info_column + 2,
12049 # shorten the line length by how much
12050 # the formatter indents, so the folded
12051 # line will fit in the space
12052 # presumably available
12053 $automatic_pod_indent);
12054 # Add column headings, indented to be a little more centered, but not
12056 $formatted_properties = format_pod_line($indent_info_column,
12060 . $formatted_properties;
12062 # Generate pod documentation lines for the tables that match nothing
12064 if (@zero_match_tables) {
12065 @zero_match_tables = uniques(@zero_match_tables);
12066 $zero_matches = join "\n\n",
12067 map { $_ = '=item \p{' . $_->complete_name . "}" }
12068 sort { $a->complete_name cmp $b->complete_name }
12069 uniques(@zero_match_tables);
12071 $zero_matches = <<END;
12073 =head2 Legal \\p{} and \\P{} constructs that match no characters
12075 Unicode has some property-value pairs that currently don't match anything.
12076 This happens generally either because they are obsolete, or for symmetry with
12077 other forms, but no language has yet been encoded that uses them. In this
12078 version of Unicode, the following match zero code points:
12089 # Generate list of properties that we don't accept, grouped by the reasons
12090 # why. This is so only put out the 'why' once, and then list all the
12091 # properties that have that reason under it.
12093 my %why_list; # The keys are the reasons; the values are lists of
12094 # properties that have the key as their reason
12096 # For each property, add it to the list that are suppressed for its reason
12097 # The sort will cause the alphabetically first properties to be added to
12098 # each list first, so each list will be sorted.
12099 foreach my $property (sort keys %why_suppressed) {
12100 push @{$why_list{$why_suppressed{$property}}}, $property;
12103 # For each reason (sorted by the first property that has that reason)...
12104 my @bad_re_properties;
12105 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
12108 # Add to the output, all the properties that have that reason. Start
12109 # with an empty line.
12110 push @bad_re_properties, "\n\n";
12112 my $has_item = 0; # Flag if actually output anything.
12113 foreach my $name (@{$why_list{$why}}) {
12115 # Split compound names into $property and $table components
12116 my $property = $name;
12118 if ($property =~ / (.*) = (.*) /x) {
12123 # This release of Unicode may not have a property that is
12124 # suppressed, so don't reference a non-existent one.
12125 $property = property_ref($property);
12126 next if ! defined $property;
12128 # And since this list is only for match tables, don't list the
12129 # ones that don't have match tables.
12130 next if ! $property->to_create_match_tables;
12132 # Find any abbreviation, and turn it into a compound name if this
12133 # is a property=value pair.
12134 my $short_name = $property->name;
12135 $short_name .= '=' . $property->table($table)->name if $table;
12137 # And add the property as an item for the reason.
12138 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12142 # And add the reason under the list of properties, if such a list
12143 # actually got generated. Note that the header got added
12144 # unconditionally before. But pod ignores extra blank lines, so no
12146 push @bad_re_properties, "\n$why\n" if $has_item;
12148 } # End of looping through each reason.
12150 # Generate a list of the properties whose map table we output, from the
12151 # global @map_properties.
12152 my @map_tables_actually_output;
12153 my $info_indent = 20; # Left column is narrower than \p{} table.
12154 foreach my $property (@map_properties) {
12156 # Get the path to the file; don't output any not in the standard
12158 my @path = $property->file_path;
12159 next if $path[0] ne $map_directory;
12160 shift @path; # Remove the standard name
12162 my $file = join '/', @path; # In case is in sub directory
12163 my $info = $property->full_name;
12164 my $short_name = $property->name;
12165 if ($info ne $short_name) {
12166 $info .= " ($short_name)";
12168 foreach my $more_info ($property->description,
12170 $property->status_info)
12172 next unless $more_info;
12174 $info .= ". $more_info";
12176 push @map_tables_actually_output, format_pod_line($info_indent,
12179 $property->status);
12182 # Sort alphabetically, and fold for output
12183 @map_tables_actually_output = sort
12184 pod_alphanumeric_sort @map_tables_actually_output;
12185 @map_tables_actually_output
12186 = simple_fold(\@map_tables_actually_output,
12189 $automatic_pod_indent);
12191 # Generate a list of the formats that can appear in the map tables.
12192 my @map_table_formats;
12193 foreach my $format (sort keys %map_table_formats) {
12194 push @map_table_formats, " $format $map_table_formats{$format}\n";
12197 # Everything is ready to assemble.
12198 my @OUT = << "END";
12203 To change this file, edit $0 instead.
12209 $pod_file - Complete index of Unicode Version $string_version properties
12213 There are many properties in Unicode, and Perl provides access to almost all of
12214 them, as well as some additional extensions and short-cut synonyms.
12216 And just about all of the few that aren't accessible through the Perl
12217 core are accessible through the modules: Unicode::Normalize and
12218 Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12220 This document merely lists all available properties and does not attempt to
12221 explain what each property really means. There is a brief description of each
12222 Perl extension. There is some detail about Blocks, Scripts, General_Category,
12223 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12224 Unicode properties, refer to the Unicode standard. A good starting place is
12225 L<$unicode_reference_url>. More information on the Perl extensions is in
12226 L<perlrecharclass>.
12228 Note that you can define your own properties; see
12229 L<perlunicode/"User-Defined Character Properties">.
12231 =head1 Properties accessible through \\p{} and \\P{}
12233 The Perl regular expression \\p{} and \\P{} constructs give access to most of
12234 the Unicode character properties. The table below shows all these constructs,
12235 both single and compound forms.
12237 B<Compound forms> consist of two components, separated by an equals sign or a
12238 colon. The first component is the property name, and the second component is
12239 the particular value of the property to match against, for example,
12240 '\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
12241 whose Script property is Greek.
12243 B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12244 their equivalent compound forms. The table shows these equivalences. (In our
12245 example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.)
12246 There are also a few Perl-defined single forms that are not shortcuts for a
12247 compound form. One such is \\p{Word}. These are also listed in the table.
12249 In parsing these constructs, Perl always ignores Upper/lower case differences
12250 everywhere within the {braces}. Thus '\\p{Greek}' means the same thing as
12251 '\\p{greek}'. But note that changing the case of the 'p' or 'P' before the
12252 left brace completely changes the meaning of the construct, from "match" (for
12253 '\\p{}') to "doesn't match" (for '\\P{}'). Casing in this document is for
12254 improved legibility.
12256 Also, white space, hyphens, and underscores are also normally ignored
12257 everywhere between the {braces}, and hence can be freely added or removed
12258 even if the C</x> modifier hasn't been specified on the regular expression.
12259 But $a_bold_stricter at the beginning of an entry in the table below
12260 means that tighter (stricter) rules are used for that entry:
12264 =item Single form (\\p{name}) tighter rules:
12266 White space, hyphens, and underscores ARE significant
12271 =item * white space adjacent to a non-word character
12273 =item * underscores separating digits in numbers
12277 That means, for example, that you can freely add or remove white space
12278 adjacent to (but within) the braces without affecting the meaning.
12280 =item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12282 The tighter rules given above for the single form apply to everything to the
12283 right of the colon or equals; the looser rules still apply to everything to
12286 That means, for example, that you can freely add or remove white space
12287 adjacent to (but within) the braces and the colon or equal sign.
12291 Some properties are considered obsolete, but still available. There are
12292 several varieties of obsolesence:
12298 Properties marked with $a_bold_obsolete in the table are considered
12299 obsolete. At the time of this writing (Unicode version 5.2) there is no
12300 information in the Unicode standard about the implications of a property being
12305 Obsolete properties may be stabilized. This means that they are not actively
12306 maintained by Unicode, and will not be extended as new characters are added to
12307 the standard. Such properties are marked with $a_bold_stabilized in the
12308 table. At the time of this writing (Unicode version 5.2) there is no further
12309 information in the Unicode standard about the implications of a property being
12314 Obsolete properties may be deprecated. This means that their use is strongly
12315 discouraged, so much so that a warning will be issued if used, unless the
12316 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
12317 statement. $A_bold_deprecated flags each such entry in the table, and
12318 the entry there for the longest, most descriptive version of the property will
12319 give the reason it is deprecated, and perhaps advice. Perl may issue such a
12320 warning, even for properties that aren't officially deprecated by Unicode,
12321 when there used to be characters or code points that were matched by them, but
12322 no longer. This is to warn you that your program may not work like it did on
12323 earlier Unicode releases.
12325 A deprecated property may be made unavailable in a future Perl version, so it
12326 is best to move away from them.
12330 Some Perl extensions are present for backwards compatibility and are
12331 discouraged from being used, but not obsolete. $A_bold_discouraged
12332 flags each such entry in the table.
12336 The table below has two columns. The left column contains the \\p{}
12337 constructs to look up, possibly preceeded by the flags mentioned above; and
12338 the right column contains information about them, like a description, or
12339 synonyms. It shows both the single and compound forms for each property that
12340 has them. If the left column is a short name for a property, the right column
12341 will give its longer, more descriptive name; and if the left column is the
12342 longest name, the right column will show any equivalent shortest name, in both
12343 single and compound forms if applicable.
12345 The right column will also caution you if a property means something different
12346 than what might normally be expected.
12348 Numbers in (parentheses) indicate the total number of code points matched by
12349 the property. For emphasis, those properties that match no code points at all
12350 are listed as well in a separate section following the table.
12352 There is no description given for most non-Perl defined properties (See
12353 $unicode_reference_url for that).
12355 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
12356 combinations. For example, entries like:
12358 \\p{Gc: *} \\p{General_Category: *}
12360 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
12361 for the latter is also valid for the former. Similarly,
12365 means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
12366 \\p{IsFoo} are also valid and all mean the same thing. And similarly,
12367 \\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}. '*' here
12368 is restricted to something not beginning with an underscore.
12370 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
12371 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
12372 'N*' to indicate this, and doesn't have separate entries for the other
12373 possibilities. Note that not all properties which have values 'Yes' and 'No'
12374 are binary, and they have all their values spelled out without using this wild
12375 card, and a C<NOT> clause in their description that highlights their not being
12376 binary. These also require the compound form to match them, whereas true
12377 binary properties have both single and compound forms available.
12379 Note that all non-essential underscores are removed in the display of the
12386 =item B<*> is a wild-card
12388 =item B<(\\d+)> in the info column gives the number of code points matched by
12391 =item B<$DEPRECATED> means this is deprecated.
12393 =item B<$OBSOLETE> means this is obsolete.
12395 =item B<$STABILIZED> means this is stabilized.
12397 =item B<$STRICTER> means tighter (stricter) name matching applies.
12399 =item B<$DISCOURAGED> means use of this form is discouraged.
12403 $formatted_properties
12407 =head1 Properties not accessible through \\p{} and \\P{}
12409 A few properties are accessible in Perl via various function calls only.
12411 Lowercase_Mapping lc() and lcfirst()
12412 Titlecase_Mapping ucfirst()
12413 Uppercase_Mapping uc()
12415 Case_Folding is accessible through the /i modifier in regular expressions.
12417 The Name property is accessible through the \\N{} interpolation in
12418 double-quoted strings and regular expressions, but both usages require a C<use
12419 charnames;> to be specified, which also contains related functions viacode()
12422 =head1 Unicode regular expression properties that are NOT accepted by Perl
12424 Perl will generate an error for a few character properties in Unicode when
12425 used in a regular expression. The non-Unihan ones are listed below, with the
12426 reasons they are not accepted, perhaps with work-arounds. The short names for
12427 the properties are listed enclosed in (parentheses).
12435 An installation can choose to allow any of these to be matched by changing the
12436 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12437 and then re-running F<$0>. (C<\%Config> is available from the Config module).
12439 =head1 Files in the I<To> directory (for serious hackers only)
12441 All Unicode properties are really mappings (in the mathematical sense) from
12442 code points to their respective values. As part of its build process,
12443 Perl constructs tables containing these mappings for all properties that it
12444 deals with. But only a few of these are written out into files.
12445 Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
12446 (%Config is available from the Config module).
12448 Those ones written are ones needed by Perl internally during execution, or for
12449 which there is some demand, and those for which there is no access through the
12450 Perl core. Generally, properties that can be used in regular expression
12451 matching do not have their map tables written, like Script. Nor are the
12452 simplistic properties that have a better, more complete version, such as
12453 Simple_Uppercase_Mapping (Uppercase_Mapping is written instead).
12455 None of the properties in the I<To> directory are currently directly
12456 accessible through the Perl core, although some may be accessed indirectly.
12457 For example, the uc() function implements the Uppercase_Mapping property and
12458 uses the F<Upper.pl> file found in this directory.
12460 The available files with their properties (short names in parentheses),
12461 and any flags or comments about them, are:
12463 @map_tables_actually_output
12465 An installation can choose to change which files are generated by changing the
12466 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12467 and then re-running F<$0>.
12469 Each of these files defines two hash entries to help reading programs decipher
12470 it. One of them looks like this:
12472 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
12474 where 'NAME' is a name to indicate the property. For backwards compatibility,
12475 this is not necessarily the property's official Unicode name. (The 'To' is
12476 also for backwards compatibility.) The hash entry gives the format of the
12477 mapping fields of the table, currently one of the following:
12481 This format applies only to the entries in the main body of the table.
12482 Entries defined in hashes or ones that are missing from the list can have a
12485 The value that the missing entries have is given by the other SwashInfo hash
12486 entry line; it looks like this:
12488 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
12490 This example line says that any Unicode code points not explicitly listed in
12491 the file have the value 'NaN' under the property indicated by NAME. If the
12492 value is the special string C<< <code point> >>, it means that the value for
12493 any missing code point is the code point itself. This happens, for example,
12494 in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
12495 character 'A', are missing because the uppercase of 'A' is itself.
12499 L<$unicode_reference_url>
12508 main::write([ $pod_directory, "$pod_file.pod" ], @OUT);
12512 sub make_Heavy () {
12513 # Create and write Heavy.pl, which passes info about the tables to
12520 # This file is for the use of utf8_heavy.pl
12522 # Maps property names in loose standard form to its standard name
12523 \%utf8::loose_property_name_of = (
12526 push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
12527 push @heavy, <<END;
12530 # Maps property, table to file for those using stricter matching
12531 \%utf8::stricter_to_file_of = (
12533 push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
12534 push @heavy, <<END;
12537 # Maps property, table to file for those using loose matching
12538 \%utf8::loose_to_file_of = (
12540 push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
12541 push @heavy, <<END;
12544 # Maps floating point to fractional form
12545 \%utf8::nv_floating_to_rational = (
12547 push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
12548 push @heavy, <<END;
12551 # If a floating point number doesn't have enough digits in it to get this
12552 # close to a fraction, it isn't considered to be that fraction even if all the
12553 # digits it does have match.
12554 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12556 # Deprecated tables to generate a warning for. The key is the file containing
12557 # the table, so as to avoid duplication, as many property names can map to the
12558 # file, but we only need one entry for all of them.
12559 \%utf8::why_deprecated = (
12562 push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
12563 push @heavy, <<END;
12569 main::write("Heavy.pl", @heavy);
12573 sub write_all_tables() {
12574 # Write out all the tables generated by this program to files, as well as
12575 # the supporting data structures, pod file, and .t file.
12577 my @writables; # List of tables that actually get written
12578 my %match_tables_to_write; # Used to collapse identical match tables
12579 # into one file. Each key is a hash function
12580 # result to partition tables into buckets.
12581 # Each value is an array of the tables that
12582 # fit in the bucket.
12584 # For each property ...
12585 # (sort so that if there is an immutable file name, it has precedence, so
12586 # some other property can't come in and take over its file name. If b's
12587 # file name is defined, will return 1, meaning to take it first; don't
12588 # care if both defined, as they had better be different anyway)
12590 foreach my $property (sort { defined $b->file } property_ref('*')) {
12591 my $type = $property->type;
12593 # And for each table for that property, starting with the mapping
12596 foreach my $table($property,
12598 # and all the match tables for it (if any), sorted so
12599 # the ones with the shortest associated file name come
12600 # first. The length sorting prevents problems of a
12601 # longer file taking a name that might have to be used
12602 # by a shorter one. The alphabetic sorting prevents
12603 # differences between releases
12604 sort { my $ext_a = $a->external_name;
12605 return 1 if ! defined $ext_a;
12606 my $ext_b = $b->external_name;
12607 return -1 if ! defined $ext_b;
12608 my $cmp = length $ext_a <=> length $ext_b;
12610 # Return result if lengths not equal
12611 return $cmp if $cmp;
12613 # Alphabetic if lengths equal
12614 return $ext_a cmp $ext_b
12615 } $property->tables
12619 # Here we have a table associated with a property. It could be
12620 # the map table (done first for each property), or one of the
12621 # other tables. Determine which type.
12622 my $is_property = $table->isa('Property');
12624 my $name = $table->name;
12625 my $complete_name = $table->complete_name;
12627 # See if should suppress the table if is empty, but warn if it
12628 # contains something.
12629 my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
12630 keys %why_suppress_if_empty_warn_if_not;
12632 # Calculate if this table should have any code points associated
12634 my $expected_empty =
12636 # $perl should be empty, as well as properties that we just
12637 # don't do anything with
12639 && ($table == $perl
12640 || grep { $complete_name eq $_ }
12641 @unimplemented_properties
12645 # Match tables in properties we skipped populating should be
12647 || (! $is_property && ! $property->to_create_match_tables)
12649 # Tables and properties that are expected to have no code
12650 # points should be empty
12651 || $suppress_if_empty_warn_if_not
12654 # Set a boolean if this table is the complement of an empty binary
12656 my $is_complement_of_empty_binary =
12657 $type == $BINARY &&
12658 (($table == $property->table('Y')
12659 && $property->table('N')->is_empty)
12660 || ($table == $property->table('N')
12661 && $property->table('Y')->is_empty));
12664 # Some tables should match everything
12665 my $expected_full =
12667 ? # All these types of map tables will be full because
12668 # they will have been populated with defaults
12669 ($type == $ENUM || $type == $BINARY)
12671 : # A match table should match everything if its method
12673 ($table->matches_all
12675 # The complement of an empty binary table will match
12677 || $is_complement_of_empty_binary
12681 if ($table->is_empty) {
12684 if ($suppress_if_empty_warn_if_not) {
12685 $table->set_status($SUPPRESSED,
12686 $why_suppress_if_empty_warn_if_not{$complete_name});
12689 # Suppress expected empty tables.
12690 next TABLE if $expected_empty;
12692 # And setup to later output a warning for those that aren't
12693 # known to be allowed to be empty. Don't do the warning if
12694 # this table is a child of another one to avoid duplicating
12695 # the warning that should come from the parent one.
12696 if (($table == $property || $table->parent == $table)
12697 && $table->status ne $SUPPRESSED
12698 && ! grep { $complete_name =~ /^$_$/ }
12699 @tables_that_may_be_empty)
12701 push @unhandled_properties, "$table";
12704 elsif ($expected_empty) {
12706 if ($suppress_if_empty_warn_if_not) {
12707 $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
12710 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
12713 my $count = $table->count;
12714 if ($expected_full) {
12715 if ($count != $MAX_UNICODE_CODEPOINTS) {
12716 Carp::my_carp("$table matches only "
12717 . clarify_number($count)
12718 . " Unicode code points but should match "
12719 . clarify_number($MAX_UNICODE_CODEPOINTS)
12721 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
12722 . "). Proceeding anyway.");
12725 # Here is expected to be full. If it is because it is the
12726 # complement of an (empty) binary table that is to be
12727 # suppressed, then suppress this one as well.
12728 if ($is_complement_of_empty_binary) {
12729 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
12730 my $opposing = $property->table($opposing_name);
12731 my $opposing_status = $opposing->status;
12732 if ($opposing_status) {
12733 $table->set_status($opposing_status,
12734 $opposing->status_info);
12738 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
12739 if ($table == $property || $table->leader == $table) {
12740 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
12744 if ($table->status eq $SUPPRESSED) {
12745 if (! $is_property) {
12746 my @children = $table->children;
12747 foreach my $child (@children) {
12748 if ($child->status ne $SUPPRESSED) {
12749 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
12756 if (! $is_property) {
12758 # Several things need to be done just once for each related
12759 # group of match tables. Do them on the parent.
12760 if ($table->parent == $table) {
12762 # Add an entry in the pod file for the table; it also does
12764 make_table_pod_entries($table) if defined $pod_directory;
12766 # See if the the table matches identical code points with
12767 # something that has already been output. In that case,
12768 # no need to have two files with the same code points in
12769 # them. We use the table's hash() method to store these
12770 # in buckets, so that it is quite likely that if two
12771 # tables are in the same bucket they will be identical, so
12772 # don't have to compare tables frequently. The tables
12773 # have to have the same status to share a file, so add
12774 # this to the bucket hash. (The reason for this latter is
12775 # that Heavy.pl associates a status with a file.)
12776 my $hash = $table->hash . ';' . $table->status;
12778 # Look at each table that is in the same bucket as this
12780 foreach my $comparison (@{$match_tables_to_write{$hash}})
12782 if ($table->matches_identically_to($comparison)) {
12783 $table->set_equivalent_to($comparison,
12789 # Here, not equivalent, add this table to the bucket.
12790 push @{$match_tables_to_write{$hash}}, $table;
12795 # Here is the property itself.
12796 # Don't write out or make references to the $perl property
12797 next if $table == $perl;
12799 if ($type != $STRING) {
12801 # There is a mapping stored of the various synonyms to the
12802 # standardized name of the property for utf8_heavy.pl.
12803 # Also, the pod file contains entries of the form:
12804 # \p{alias: *} \p{full: *}
12805 # rather than show every possible combination of things.
12807 my @property_aliases = $property->aliases;
12809 # The full name of this property is stored by convention
12810 # first in the alias array
12811 my $full_property_name =
12812 '\p{' . $property_aliases[0]->name . ': *}';
12813 my $standard_property_name = standardize($table->name);
12815 # For each synonym ...
12816 for my $i (0 .. @property_aliases - 1) {
12817 my $alias = $property_aliases[$i];
12818 my $alias_name = $alias->name;
12819 my $alias_standard = standardize($alias_name);
12821 # Set the mapping for utf8_heavy of the alias to the
12823 if (exists ($loose_property_name_of{$alias_standard}))
12825 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
12828 $loose_property_name_of{$alias_standard}
12829 = $standard_property_name;
12832 # Now for the pod entry for this alias. Skip if not
12833 # outputting a pod; skip the first one, which is the
12834 # full name so won't have an entry like: '\p{full: *}
12835 # \p{full: *}', and skip if don't want an entry for
12838 || ! defined $pod_directory
12839 || ! $alias->make_pod_entry;
12841 push @match_properties,
12842 format_pod_line($indent_info_column,
12843 '\p{' . $alias->name . ': *}',
12844 $full_property_name,
12847 } # End of non-string-like property code
12850 # Don't output a mapping file if not desired.
12851 next if ! $property->to_output_map;
12854 # Here, we know we want to write out the table, but don't do it
12855 # yet because there may be other tables that come along and will
12856 # want to share the file, and the file's comments will change to
12857 # mention them. So save for later.
12858 push @writables, $table;
12860 } # End of looping through the property and all its tables.
12861 } # End of looping through all properties.
12863 # Now have all the tables that will have files written for them. Do it.
12864 foreach my $table (@writables) {
12867 my $property = $table->property;
12868 my $is_property = ($table == $property);
12869 if (! $is_property) {
12871 # Match tables for the property go in lib/$subdirectory, which is
12872 # the property's name. Don't use the standard file name for this,
12873 # as may get an unfamiliar alias
12874 @directory = ($matches_directory, $property->external_name);
12878 @directory = $table->directory;
12879 $filename = $table->file;
12882 # Use specified filename if avaliable, or default to property's
12883 # shortest name. We need an 8.3 safe filename (which means "an 8
12884 # safe" filename, since after the dot is only 'pl', which is < 3)
12885 # The 2nd parameter is if the filename shouldn't be changed, and
12886 # it shouldn't iff there is a hard-coded name for this table.
12887 $filename = construct_filename(
12888 $filename || $table->external_name,
12889 ! $filename, # mutable if no filename
12892 register_file_for_name($table, \@directory, $filename);
12894 # Only need to write one file when shared by more than one
12896 next if ! $is_property && $table->leader != $table;
12898 # Construct a nice comment to add to the file
12899 $table->set_final_comment;
12905 # Write out the pod file
12911 make_property_test_script() if $make_test_script;
12915 my @white_space_separators = ( # This used only for making the test script.
12922 sub generate_separator($) {
12923 # This used only for making the test script. It generates the colon or
12924 # equal separator between the property and property value, with random
12925 # white space surrounding the separator
12929 return "" if $lhs eq ""; # No separator if there's only one (the r) side
12931 # Choose space before and after randomly
12932 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
12933 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
12935 # And return the whole complex, half the time using a colon, half the
12937 return $spaces_before
12938 . (rand() < 0.5) ? '=' : ':'
12942 sub generate_tests($$$$$$) {
12943 # This used only for making the test script. It generates test cases that
12944 # are expected to compile successfully in perl. Note that the lhs and
12945 # rhs are assumed to already be as randomized as the caller wants.
12947 my $file_handle = shift; # Where to output the tests
12948 my $lhs = shift; # The property: what's to the left of the colon
12949 # or equals separator
12950 my $rhs = shift; # The property value; what's to the right
12951 my $valid_code = shift; # A code point that's known to be in the
12952 # table given by lhs=rhs; undef if table is
12954 my $invalid_code = shift; # A code point known to not be in the table;
12955 # undef if the table is all code points
12956 my $warning = shift;
12958 # Get the colon or equal
12959 my $separator = generate_separator($lhs);
12961 # The whole 'property=value'
12962 my $name = "$lhs$separator$rhs";
12964 # Create a complete set of tests, with complements.
12965 if (defined $valid_code) {
12966 printf $file_handle
12967 qq/Expect(1, $valid_code, '\\p{$name}', $warning);\n/;
12968 printf $file_handle
12969 qq/Expect(0, $valid_code, '\\p{^$name}', $warning);\n/;
12970 printf $file_handle
12971 qq/Expect(0, $valid_code, '\\P{$name}', $warning);\n/;
12972 printf $file_handle
12973 qq/Expect(1, $valid_code, '\\P{^$name}', $warning);\n/;
12975 if (defined $invalid_code) {
12976 printf $file_handle
12977 qq/Expect(0, $invalid_code, '\\p{$name}', $warning);\n/;
12978 printf $file_handle
12979 qq/Expect(1, $invalid_code, '\\p{^$name}', $warning);\n/;
12980 printf $file_handle
12981 qq/Expect(1, $invalid_code, '\\P{$name}', $warning);\n/;
12982 printf $file_handle
12983 qq/Expect(0, $invalid_code, '\\P{^$name}', $warning);\n/;
12988 sub generate_error($$$$) {
12989 # This used only for making the test script. It generates test cases that
12990 # are expected to not only not match, but to be syntax or similar errors
12992 my $file_handle = shift; # Where to output to.
12993 my $lhs = shift; # The property: what's to the left of the
12994 # colon or equals separator
12995 my $rhs = shift; # The property value; what's to the right
12996 my $already_in_error = shift; # Boolean; if true it's known that the
12997 # unmodified lhs and rhs will cause an error.
12998 # This routine should not force another one
12999 # Get the colon or equal
13000 my $separator = generate_separator($lhs);
13002 # Since this is an error only, don't bother to randomly decide whether to
13003 # put the error on the left or right side; and assume that the rhs is
13004 # loosely matched, again for convenience rather than rigor.
13005 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
13007 my $property = $lhs . $separator . $rhs;
13009 print $file_handle qq/Error('\\p{$property}');\n/;
13010 print $file_handle qq/Error('\\P{$property}');\n/;
13014 # These are used only for making the test script
13015 # XXX Maybe should also have a bad strict seps, which includes underscore.
13017 my @good_loose_seps = (
13024 my @bad_loose_seps = (
13029 sub randomize_stricter_name {
13030 # This used only for making the test script. Take the input name and
13031 # return a randomized, but valid version of it under the stricter matching
13035 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13037 # If the name looks like a number (integer, floating, or rational), do
13039 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
13042 my $separator = $3;
13044 # If there isn't a sign, part of the time add a plus
13045 # Note: Not testing having any denominator having a minus sign
13047 $sign = '+' if rand() <= .3;
13050 # And add 0 or more leading zeros.
13051 $name = $sign . ('0' x int rand(10)) . $number;
13053 if (defined $separator) {
13054 my $extra_zeros = '0' x int rand(10);
13056 if ($separator eq '.') {
13058 # Similarly, add 0 or more trailing zeros after a decimal
13060 $name .= $extra_zeros;
13064 # Or, leading zeros before the denominator
13065 $name =~ s,/,/$extra_zeros,;
13070 # For legibility of the test, only change the case of whole sections at a
13071 # time. To do this, first split into sections. The split returns the
13074 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
13075 trace $section if main::DEBUG && $to_trace;
13077 if (length $section > 1 && $section !~ /\D/) {
13079 # If the section is a sequence of digits, about half the time
13080 # randomly add underscores between some of them.
13083 # Figure out how many underscores to add. max is 1 less than
13084 # the number of digits. (But add 1 at the end to make sure
13085 # result isn't 0, and compensate earlier by subtracting 2
13087 my $num_underscores = int rand(length($section) - 2) + 1;
13089 # And add them evenly throughout, for convenience, not rigor
13091 my $spacing = (length($section) - 1)/ $num_underscores;
13092 my $temp = $section;
13094 for my $i (1 .. $num_underscores) {
13095 $section .= substr($temp, 0, $spacing, "") . '_';
13099 push @sections, $section;
13103 # Here not a sequence of digits. Change the case of the section
13105 my $switch = int rand(4);
13106 if ($switch == 0) {
13107 push @sections, uc $section;
13109 elsif ($switch == 1) {
13110 push @sections, lc $section;
13112 elsif ($switch == 2) {
13113 push @sections, ucfirst $section;
13116 push @sections, $section;
13120 trace "returning", join "", @sections if main::DEBUG && $to_trace;
13121 return join "", @sections;
13124 sub randomize_loose_name($;$) {
13125 # This used only for making the test script
13128 my $want_error = shift; # if true, make an error
13129 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13131 $name = randomize_stricter_name($name);
13134 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13135 for my $part (split /[-\s_]+/, $name) {
13137 if ($want_error and rand() < 0.3) {
13138 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
13142 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13145 push @parts, $part;
13147 my $new = join("", @parts);
13148 trace "$name => $new" if main::DEBUG && $to_trace;
13151 if (rand() >= 0.5) {
13152 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
13155 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
13161 # Used to make sure don't generate duplicate test cases.
13162 my %test_generated;
13164 sub make_property_test_script() {
13165 # This used only for making the test script
13166 # this written directly -- it's huge.
13168 print "Making test script\n" if $verbosity >= $PROGRESS;
13170 # This uses randomness to test different possibilities without testing all
13171 # possibilities. To ensure repeatability, set the seed to 0. But if
13172 # tests are added, it will perturb all later ones in the .t file
13175 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
13177 force_unlink ($t_path);
13178 push @files_actually_output, $t_path;
13180 if (not open $OUT, "> $t_path") {
13181 Carp::my_carp("Can't open $t_path. Skipping: $!");
13185 # Keep going down an order of magnitude
13186 # until find that adding this quantity to
13187 # 1 remains 1; but put an upper limit on
13188 # this so in case this algorithm doesn't
13189 # work properly on some platform, that we
13190 # won't loop forever.
13192 my $min_floating_slop = 1;
13193 while (1+ $min_floating_slop != 1
13196 my $next = $min_floating_slop / 10;
13197 last if $next == 0; # If underflows,
13199 $min_floating_slop = $next;
13201 print $OUT $HEADER, <DATA>;
13203 foreach my $property (property_ref('*')) {
13204 foreach my $table ($property->tables) {
13206 # Find code points that match, and don't match this table.
13207 my $valid = $table->get_valid_code_point;
13208 my $invalid = $table->get_invalid_code_point;
13209 my $warning = ($table->status eq $DEPRECATED)
13213 # Test each possible combination of the property's aliases with
13214 # the table's. If this gets to be too many, could do what is done
13215 # in the set_final_comment() for Tables
13216 my @table_aliases = $table->aliases;
13217 my @property_aliases = $table->property->aliases;
13218 my $max = max(scalar @table_aliases, scalar @property_aliases);
13219 for my $j (0 .. $max - 1) {
13221 # The current alias for property is the next one on the list,
13222 # or if beyond the end, start over. Similarly for table
13224 = $property_aliases[$j % @property_aliases]->name;
13226 $property_name = "" if $table->property == $perl;
13227 my $table_alias = $table_aliases[$j % @table_aliases];
13228 my $table_name = $table_alias->name;
13229 my $loose_match = $table_alias->loose_match;
13231 # If the table doesn't have a file, any test for it is
13232 # already guaranteed to be in error
13233 my $already_error = ! $table->file_path;
13235 # Generate error cases for this alias.
13236 generate_error($OUT,
13241 # If the table is guaranteed to always generate an error,
13242 # quit now without generating success cases.
13243 next if $already_error;
13245 # Now for the success cases.
13247 if ($loose_match) {
13249 # For loose matching, create an extra test case for the
13251 my $standard = standardize($table_name);
13253 # $test_name should be a unique combination for each test
13254 # case; used just to avoid duplicate tests
13255 my $test_name = "$property_name=$standard";
13257 # Don't output duplicate test cases.
13258 if (! exists $test_generated{$test_name}) {
13259 $test_generated{$test_name} = 1;
13260 generate_tests($OUT,
13268 $random = randomize_loose_name($table_name)
13270 else { # Stricter match
13271 $random = randomize_stricter_name($table_name);
13274 # Now for the main test case for this alias.
13275 my $test_name = "$property_name=$random";
13276 if (! exists $test_generated{$test_name}) {
13277 $test_generated{$test_name} = 1;
13278 generate_tests($OUT,
13286 # If the name is a rational number, add tests for the
13287 # floating point equivalent.
13288 if ($table_name =~ qr{/}) {
13290 # Calculate the float, and find just the fraction.
13291 my $float = eval $table_name;
13292 my ($whole, $fraction)
13293 = $float =~ / (.*) \. (.*) /x;
13295 # Starting with one digit after the decimal point,
13296 # create a test for each possible precision (number of
13297 # digits past the decimal point) until well beyond the
13298 # native number found on this machine. (If we started
13299 # with 0 digits, it would be an integer, which could
13300 # well match an unrelated table)
13302 for my $i (1 .. $min_floating_slop + 3) {
13303 my $table_name = sprintf("%.*f", $i, $float);
13304 if ($i < $MIN_FRACTION_LENGTH) {
13306 # If the test case has fewer digits than the
13307 # minimum acceptable precision, it shouldn't
13308 # succeed, so we expect an error for it.
13309 # E.g., 2/3 = .7 at one decimal point, and we
13310 # shouldn't say it matches .7. We should make
13311 # it be .667 at least before agreeing that the
13312 # intent was to match 2/3. But at the
13313 # less-than- acceptable level of precision, it
13314 # might actually match an unrelated number.
13315 # So don't generate a test case if this
13316 # conflating is possible. In our example, we
13317 # don't want 2/3 matching 7/10, if there is
13318 # a 7/10 code point.
13320 (keys %nv_floating_to_rational)
13323 if abs($table_name - $existing)
13324 < $MAX_FLOATING_SLOP;
13326 generate_error($OUT,
13329 1 # 1 => already an error
13334 # Here the number of digits exceeds the
13335 # minimum we think is needed. So generate a
13336 # success test case for it.
13337 generate_tests($OUT,
13352 foreach my $test (@backslash_X_tests) {
13353 print $OUT "Test_X('$test');\n";
13356 print $OUT "Finished();\n";
13361 # This is a list of the input files and how to handle them. The files are
13362 # processed in their order in this list. Some reordering is possible if
13363 # desired, but the v0 files should be first, and the extracted before the
13364 # others except DAge.txt (as data in an extracted file can be over-ridden by
13365 # the non-extracted. Some other files depend on data derived from an earlier
13366 # file, like UnicodeData requires data from Jamo, and the case changing and
13367 # folding requires data from Unicode. Mostly, it safest to order by first
13368 # version releases in (except the Jamo). DAge.txt is read before the
13369 # extracted ones because of the rarely used feature $compare_versions. In the
13370 # unlikely event that there were ever an extracted file that contained the Age
13371 # property information, it would have to go in front of DAge.
13373 # The version strings allow the program to know whether to expect a file or
13374 # not, but if a file exists in the directory, it will be processed, even if it
13375 # is in a version earlier than expected, so you can copy files from a later
13376 # release into an earlier release's directory.
13377 my @input_file_objects = (
13378 Input_file->new('PropertyAliases.txt', v0,
13379 Handler => \&process_PropertyAliases,
13381 Input_file->new(undef, v0, # No file associated with this
13382 Progress_Message => 'Finishing property setup',
13383 Handler => \&finish_property_setup,
13385 Input_file->new('PropValueAliases.txt', v0,
13386 Handler => \&process_PropValueAliases,
13387 Has_Missings_Defaults => $NOT_IGNORED,
13389 Input_file->new('DAge.txt', v3.2.0,
13390 Has_Missings_Defaults => $NOT_IGNORED,
13393 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
13394 Property => 'General_Category',
13396 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
13397 Property => 'Canonical_Combining_Class',
13398 Has_Missings_Defaults => $NOT_IGNORED,
13400 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
13401 Property => 'Numeric_Type',
13402 Has_Missings_Defaults => $NOT_IGNORED,
13404 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
13405 Property => 'East_Asian_Width',
13406 Has_Missings_Defaults => $NOT_IGNORED,
13408 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
13409 Property => 'Line_Break',
13410 Has_Missings_Defaults => $NOT_IGNORED,
13412 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
13413 Property => 'Bidi_Class',
13414 Has_Missings_Defaults => $NOT_IGNORED,
13416 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
13417 Property => 'Decomposition_Type',
13418 Has_Missings_Defaults => $NOT_IGNORED,
13420 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
13421 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
13422 Property => 'Numeric_Value',
13423 Each_Line_Handler => \&filter_numeric_value_line,
13424 Has_Missings_Defaults => $NOT_IGNORED,
13426 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
13427 Property => 'Joining_Group',
13428 Has_Missings_Defaults => $NOT_IGNORED,
13431 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
13432 Property => 'Joining_Type',
13433 Has_Missings_Defaults => $NOT_IGNORED,
13435 Input_file->new('Jamo.txt', v2.0.0,
13436 Property => 'Jamo_Short_Name',
13437 Each_Line_Handler => \&filter_jamo_line,
13439 Input_file->new('UnicodeData.txt', v1.1.5,
13440 Pre_Handler => \&setup_UnicodeData,
13442 # We clean up this file for some early versions.
13443 Each_Line_Handler => [ (($v_version lt v2.0.0 )
13445 : ($v_version eq v2.1.5)
13446 ? \&filter_v2_1_5_ucd
13449 # And the main filter
13450 \&filter_UnicodeData_line,
13452 EOF_Handler => \&EOF_UnicodeData,
13454 Input_file->new('ArabicShaping.txt', v2.0.0,
13455 Each_Line_Handler =>
13456 [ ($v_version lt 4.1.0)
13457 ? \&filter_old_style_arabic_shaping
13459 \&filter_arabic_shaping_line,
13461 Has_Missings_Defaults => $NOT_IGNORED,
13463 Input_file->new('Blocks.txt', v2.0.0,
13464 Property => 'Block',
13465 Has_Missings_Defaults => $NOT_IGNORED,
13466 Each_Line_Handler => \&filter_blocks_lines
13468 Input_file->new('PropList.txt', v2.0.0,
13469 Each_Line_Handler => (($v_version lt v3.1.0)
13470 ? \&filter_old_style_proplist
13473 Input_file->new('Unihan.txt', v2.0.0,
13474 Pre_Handler => \&setup_unihan,
13476 Each_Line_Handler => \&filter_unihan_line,
13478 Input_file->new('SpecialCasing.txt', v2.1.8,
13479 Each_Line_Handler => \&filter_special_casing_line,
13480 Pre_Handler => \&setup_special_casing,
13483 'LineBreak.txt', v3.0.0,
13484 Has_Missings_Defaults => $NOT_IGNORED,
13485 Property => 'Line_Break',
13486 # Early versions had problematic syntax
13487 Each_Line_Handler => (($v_version lt v3.1.0)
13488 ? \&filter_early_ea_lb
13491 Input_file->new('EastAsianWidth.txt', v3.0.0,
13492 Property => 'East_Asian_Width',
13493 Has_Missings_Defaults => $NOT_IGNORED,
13494 # Early versions had problematic syntax
13495 Each_Line_Handler => (($v_version lt v3.1.0)
13496 ? \&filter_early_ea_lb
13499 Input_file->new('CompositionExclusions.txt', v3.0.0,
13500 Property => 'Composition_Exclusion',
13502 Input_file->new('BidiMirroring.txt', v3.0.1,
13503 Property => 'Bidi_Mirroring_Glyph',
13505 Input_file->new("NormalizationTest.txt", v3.0.1,
13508 Input_file->new('CaseFolding.txt', v3.0.1,
13509 Pre_Handler => \&setup_case_folding,
13510 Each_Line_Handler =>
13511 [ ($v_version lt v3.1.0)
13512 ? \&filter_old_style_case_folding
13514 \&filter_case_folding_line
13516 Post_Handler => \&post_fold,
13518 Input_file->new('DCoreProperties.txt', v3.1.0,
13519 # 5.2 changed this file
13520 Has_Missings_Defaults => (($v_version ge v5.2.0)
13524 Input_file->new('Scripts.txt', v3.1.0,
13525 Property => 'Script',
13526 Has_Missings_Defaults => $NOT_IGNORED,
13528 Input_file->new('DNormalizationProps.txt', v3.1.0,
13529 Has_Missings_Defaults => $NOT_IGNORED,
13530 Each_Line_Handler => (($v_version lt v4.0.1)
13531 ? \&filter_old_style_normalization_lines
13534 Input_file->new('HangulSyllableType.txt', v4.0.0,
13535 Has_Missings_Defaults => $NOT_IGNORED,
13536 Property => 'Hangul_Syllable_Type'),
13537 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
13538 Property => 'Word_Break',
13539 Has_Missings_Defaults => $NOT_IGNORED,
13541 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
13542 Property => 'Grapheme_Cluster_Break',
13543 Has_Missings_Defaults => $NOT_IGNORED,
13545 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
13546 Handler => \&process_GCB_test,
13548 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
13551 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
13554 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
13557 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
13558 Property => 'Sentence_Break',
13559 Has_Missings_Defaults => $NOT_IGNORED,
13561 Input_file->new('NamedSequences.txt', v4.1.0,
13562 Handler => \&process_NamedSequences
13564 Input_file->new('NameAliases.txt', v5.0.0,
13565 Property => 'Name_Alias',
13567 Input_file->new("BidiTest.txt", v5.2.0,
13570 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
13572 Each_Line_Handler => \&filter_unihan_line,
13574 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
13576 Each_Line_Handler => \&filter_unihan_line,
13578 Input_file->new('UnihanIRGSources.txt', v5.2.0,
13580 Pre_Handler => \&setup_unihan,
13581 Each_Line_Handler => \&filter_unihan_line,
13583 Input_file->new('UnihanNumericValues.txt', v5.2.0,
13585 Each_Line_Handler => \&filter_unihan_line,
13587 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
13589 Each_Line_Handler => \&filter_unihan_line,
13591 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
13593 Each_Line_Handler => \&filter_unihan_line,
13595 Input_file->new('UnihanReadings.txt', v5.2.0,
13597 Each_Line_Handler => \&filter_unihan_line,
13599 Input_file->new('UnihanVariants.txt', v5.2.0,
13601 Each_Line_Handler => \&filter_unihan_line,
13605 # End of all the preliminaries.
13608 if ($compare_versions) {
13609 Carp::my_carp(<<END
13610 Warning. \$compare_versions is set. Output is not suitable for production
13615 # Put into %potential_files a list of all the files in the directory structure
13616 # that could be inputs to this program, excluding those that we should ignore.
13617 # Use absolute file names because it makes it easier across machine types.
13618 my @ignored_files_full_names = map { File::Spec->rel2abs(
13619 internal_file_to_platform($_))
13620 } keys %ignored_files;
13623 return unless /\.txt$/i; # Some platforms change the name's case
13624 my $full = lc(File::Spec->rel2abs($_));
13625 $potential_files{$full} = 1
13626 if ! grep { $full eq lc($_) } @ignored_files_full_names;
13629 }, File::Spec->curdir());
13631 my @mktables_list_output_files;
13633 if ($write_unchanged_files) {
13634 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
13637 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
13639 if (! open $file_handle, "<", $file_list) {
13640 Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!");
13646 # Read and parse mktables.lst, placing the results from the first part
13647 # into @input, and the second part into @mktables_list_output_files
13648 for my $list ( \@input, \@mktables_list_output_files ) {
13649 while (<$file_handle>) {
13650 s/^ \s+ | \s+ $//xg;
13651 next if /^ \s* (?: \# .* )? $/x;
13653 my ( $file ) = split /\t/;
13654 push @$list, $file;
13656 @$list = uniques(@$list);
13660 # Look through all the input files
13661 foreach my $input (@input) {
13662 next if $input eq 'version'; # Already have checked this.
13664 # Ignore if doesn't exist. The checking about whether we care or
13665 # not is done via the Input_file object.
13666 next if ! file_exists($input);
13668 # The paths are stored with relative names, and with '/' as the
13669 # delimiter; convert to absolute on this machine
13670 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
13671 $potential_files{$full} = 1
13672 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
13676 close $file_handle;
13681 # Here wants to process all .txt files in the directory structure.
13682 # Convert them to full path names. They are stored in the platform's
13685 foreach my $object (@input_file_objects) {
13686 my $file = $object->file;
13687 next unless defined $file;
13688 push @known_files, File::Spec->rel2abs($file);
13691 my @unknown_input_files;
13692 foreach my $file (keys %potential_files) {
13693 next if grep { lc($file) eq lc($_) } @known_files;
13695 # Here, the file is unknown to us. Get relative path name
13696 $file = File::Spec->abs2rel($file);
13697 push @unknown_input_files, $file;
13699 # What will happen is we create a data structure for it, and add it to
13700 # the list of input files to process. First get the subdirectories
13702 my (undef, $directories, undef) = File::Spec->splitpath($file);
13703 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13704 my @directories = File::Spec->splitdir($directories);
13706 # If the file isn't extracted (meaning none of the directories is the
13707 # extracted one), just add it to the end of the list of inputs.
13708 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
13709 push @input_file_objects, Input_file->new($file, v0);
13713 # Here, the file is extracted. It needs to go ahead of most other
13714 # processing. Search for the first input file that isn't a
13715 # special required property (that is, find one whose first_release
13716 # is non-0), and isn't extracted. Also, the Age property file is
13717 # processed before the extracted ones, just in case
13718 # $compare_versions is set.
13719 for (my $i = 0; $i < @input_file_objects; $i++) {
13720 if ($input_file_objects[$i]->first_released ne v0
13721 && lc($input_file_objects[$i]->file) ne 'dage.txt'
13722 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
13724 splice @input_file_objects, $i, 0,
13725 Input_file->new($file, v0);
13732 if (@unknown_input_files) {
13733 print STDERR simple_fold(join_lines(<<END
13735 The following files are unknown as to how to handle. Assuming they are
13736 typical property files. You'll know by later error messages if it worked or
13739 ) . " " . join(", ", @unknown_input_files) . "\n\n");
13741 } # End of looking through directory structure for more .txt files.
13743 # Create the list of input files from the objects we have defined, plus
13745 my @input_files = 'version';
13746 foreach my $object (@input_file_objects) {
13747 my $file = $object->file;
13748 next if ! defined $file; # Not all objects have files
13749 next if $object->optional && ! -e $file;
13750 push @input_files, $file;
13753 if ( $verbosity >= $VERBOSE ) {
13754 print "Expecting ".scalar( @input_files )." input files. ",
13755 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
13758 # We set $youngest to be the most recently changed input file, including this
13759 # program itself (done much earlier in this file)
13760 foreach my $in (@input_files) {
13762 next unless defined $age; # Keep going even if missing a file
13763 $youngest = $age if $age < $youngest;
13765 # See that the input files have distinct names, to warn someone if they
13766 # are adding a new one
13768 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
13769 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13770 my @directories = File::Spec->splitdir($directories);
13771 my $base = $file =~ s/\.txt$//;
13772 construct_filename($file, 'mutable', \@directories);
13776 my $ok = ! $write_unchanged_files
13777 && scalar @mktables_list_output_files; # If none known, rebuild
13779 # Now we check to see if any output files are older than youngest, if
13780 # they are, we need to continue on, otherwise we can presumably bail.
13782 foreach my $out (@mktables_list_output_files) {
13783 if ( ! file_exists($out)) {
13784 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
13788 #local $to_trace = 1 if main::DEBUG;
13789 trace $youngest, -M $out if main::DEBUG && $to_trace;
13790 if ( -M $out > $youngest ) {
13791 #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
13792 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
13799 print "Files seem to be ok, not bothering to rebuild.\n";
13802 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
13804 # Ready to do the major processing. First create the perl pseudo-property.
13805 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
13807 # Process each input file
13808 foreach my $file (@input_file_objects) {
13812 # Finish the table generation.
13814 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
13817 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
13820 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
13821 add_perl_synonyms();
13823 print "Writing tables\n" if $verbosity >= $PROGRESS;
13824 write_all_tables();
13826 # Write mktables.lst
13827 if ( $file_list and $make_list ) {
13829 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
13830 foreach my $file (@input_files, @files_actually_output) {
13831 my (undef, $directories, $file) = File::Spec->splitpath($file);
13832 my @directories = File::Spec->splitdir($directories);
13833 $file = join '/', @directories, $file;
13837 if (! open $ofh,">",$file_list) {
13838 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
13842 print $ofh <<"END";
13844 # $file_list -- File list for $0.
13846 # Autogenerated on @{[scalar localtime]}
13848 # - First section is input files
13849 # ($0 itself is not listed but is automatically considered an input)
13850 # - Section seperator is /^=+\$/
13851 # - Second section is a list of output files.
13852 # - Lines matching /^\\s*#/ are treated as comments
13853 # which along with blank lines are ignored.
13859 print $ofh "$_\n" for sort(@input_files);
13860 print $ofh "\n=================================\n# Output files:\n\n";
13861 print $ofh "$_\n" for sort @files_actually_output;
13862 print $ofh "\n# ",scalar(@input_files)," input files\n",
13863 "# ",scalar(@files_actually_output)+1," output files\n\n",
13866 or Carp::my_carp("Failed to close $ofh: $!");
13868 print "Filelist has ",scalar(@input_files)," input files and ",
13869 scalar(@files_actually_output)+1," output files\n"
13870 if $verbosity >= $VERBOSE;
13874 # Output these warnings unless -q explicitly specified.
13875 if ($verbosity >= $NORMAL_VERBOSITY) {
13876 if (@unhandled_properties) {
13877 print "\nProperties and tables that unexpectedly have no code points\n";
13878 foreach my $property (sort @unhandled_properties) {
13879 print $property, "\n";
13883 if (%potential_files) {
13884 print "\nInput files that are not considered:\n";
13885 foreach my $file (sort keys %potential_files) {
13886 print File::Spec->abs2rel($file), "\n";
13889 print "\nAll done\n" if $verbosity >= $VERBOSE;
13893 # TRAILING CODE IS USED BY make_property_test_script()
13899 # Test qr/\X/ and the \p{} regular expression constructs. This file is
13900 # constructed by mktables from the tables it generates, so if mktables is
13901 # buggy, this won't necessarily catch those bugs. Tests are generated for all
13902 # feasible properties; a few aren't currently feasible; see
13903 # is_code_point_usable() in mktables for details.
13905 # Standard test packages are not used because this manipulates SIG_WARN. It
13906 # exits 0 if every non-skipped test succeeded; -1 if any failed.
13911 my $non_ASCII = (ord('A') != 65);
13913 # The 256 8-bit characters in ASCII ordinal order, with the ones that don't
13914 # have Perl names replaced by -1
13915 my @ascii_ordered_chars = (
13918 "\a", "\b", "\t", "\n",
13922 " ", "!", "\"", "#", '$', "%", "&", "'",
13923 "(", ")", "*", "+", ",", "-", ".", "/",
13924 "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
13925 ":", ";", "<", "=", ">", "?", "@",
13926 "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
13927 "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
13928 "[", "\\", "]", "^", "_", "`",
13929 "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
13930 "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
13931 "{", "|", "}", "~",
13935 sub ASCII_ord_to_native ($) {
13936 # Converts input ordinal number to the native one, if can be done easily.
13937 # Returns -1 otherwise.
13941 return $ord if $ord > 255 || ! $non_ASCII;
13942 my $result = $ascii_ordered_chars[$ord];
13943 return $result if $result eq '-1';
13944 return ord($result);
13948 my $expected = shift;
13951 my $warning_type = shift; # Type of warning message, like 'deprecated'
13953 my $line = (caller)[2];
13955 # Convert the non-ASCII code points expressible as characters to their
13956 # ASCII equivalents, and skip the others.
13957 $ord = ASCII_ord_to_native($ord);
13960 print "ok $Tests - "
13961 . sprintf("\"\\x{%04X}\"", $ord)
13962 . " =~ $regex # Skipped: non-ASCII\n";
13966 # Convert the code point to hex form
13967 my $string = sprintf "\"\\x{%04X}\"", $ord;
13971 # The first time through, use all warnings. If the input should generate
13972 # a warning, add another time through with them turned off
13973 push @tests, "no warnings '$warning_type';" if $warning_type;
13975 foreach my $no_warnings (@tests) {
13977 # Store any warning messages instead of outputting them
13978 local $SIG{__WARN__} = $SIG{__WARN__};
13979 my $warning_message;
13980 $SIG{__WARN__} = sub { $warning_message = $_[0] };
13984 # A string eval is needed because of the 'no warnings'.
13985 # Assumes no parens in the regular expression
13986 my $result = eval "$no_warnings
13987 my \$RegObj = qr($regex);
13988 $string =~ \$RegObj ? 1 : 0";
13989 if (not defined $result) {
13990 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
13993 elsif ($result ^ $expected) {
13994 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
13997 elsif ($warning_message) {
13998 if (! $warning_type || ($warning_type && $no_warnings)) {
13999 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
14003 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
14006 elsif ($warning_type && ! $no_warnings) {
14007 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
14011 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
14020 if (eval { 'x' =~ qr/$regex/; 1 }) {
14022 my $line = (caller)[2];
14023 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
14026 my $line = (caller)[2];
14027 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
14032 # GCBTest.txt character that separates grapheme clusters
14033 my $breakable_utf8 = my $breakable = chr(0xF7);
14034 utf8::upgrade($breakable_utf8);
14036 # GCBTest.txt character that indicates that the adjoining code points are part
14037 # of the same grapheme cluster
14038 my $nobreak_utf8 = my $nobreak = chr(0xD7);
14039 utf8::upgrade($nobreak_utf8);
14042 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
14043 # Each such line is a sequence of code points given by their hex numbers,
14044 # separated by the two characters defined just before this subroutine that
14045 # indicate that either there can or cannot be a break between the adjacent
14046 # code points. If there isn't a break, that means the sequence forms an
14047 # extended grapheme cluster, which means that \X should match the whole
14048 # thing. If there is a break, \X should stop there. This is all
14049 # converted by this routine into a match:
14050 # $string =~ /(\X)/,
14051 # Each \X should match the next cluster; and that is what is checked.
14053 my $template = shift;
14055 my $line = (caller)[2];
14057 # The line contains characters above the ASCII range, but in Latin1. It
14058 # may or may not be in utf8, and if it is, it may or may not know it. So,
14059 # convert these characters to 8 bits. If knows is in utf8, simply
14061 if (utf8::is_utf8($template)) {
14062 utf8::downgrade($template);
14065 # Otherwise, if it is in utf8, but doesn't know it, the next lines
14066 # convert the two problematic characters to their 8-bit equivalents.
14067 # If it isn't in utf8, they don't harm anything.
14069 $template =~ s/$nobreak_utf8/$nobreak/g;
14070 $template =~ s/$breakable_utf8/$breakable/g;
14073 # Get rid of the leading and trailing breakables
14074 $template =~ s/^ \s* $breakable \s* //x;
14075 $template =~ s/ \s* $breakable \s* $ //x;
14077 # And no-breaks become just a space.
14078 $template =~ s/ \s* $nobreak \s* / /xg;
14080 # Split the input into segments that are breakable between them.
14081 my @segments = split /\s*$breakable\s*/, $template;
14084 my $display_string = "";
14086 my @should_display;
14088 # Convert the code point sequence in each segment into a Perl string of
14090 foreach my $segment (@segments) {
14091 my @code_points = split /\s+/, $segment;
14092 my $this_string = "";
14093 my $this_display = "";
14094 foreach my $code_point (@code_points) {
14095 my $ord = ASCII_ord_to_native(hex $code_point);
14098 print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n";
14101 $this_string .= chr $ord;
14102 $this_display .= "\\x{$code_point}";
14105 # The next cluster should match the string in this segment.
14106 push @should_match, $this_string;
14107 push @should_display, $this_display;
14108 $string .= $this_string;
14109 $display_string .= $this_display;
14112 # If a string can be represented in both non-ut8 and utf8, test both cases
14114 for my $to_upgrade (0 .. 1) {
14118 # If already in utf8, would just be a repeat
14119 next UPGRADE if utf8::is_utf8($string);
14121 utf8::upgrade($string);
14124 # Finally, do the \X match.
14125 my @matches = $string =~ /(\X)/g;
14127 # Look through each matched cluster to verify that it matches what we
14129 my $min = (@matches < @should_match) ? @matches : @should_match;
14130 for my $i (0 .. $min - 1) {
14132 if ($matches[$i] eq $should_match[$i]) {
14133 print "ok $Tests - ";
14135 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
14137 print "And \\X #", $i + 1,
14139 print " correctly matched $should_display[$i]; line $line\n";
14141 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
14142 unpack("U*", $matches[$i]));
14143 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
14145 " should have matched $should_display[$i]",
14146 " but instead matched $matches[$i]",
14147 ". Abandoning rest of line $line\n";
14152 # And the number of matches should equal the number of expected matches.
14154 if (@matches == @should_match) {
14155 print "ok $Tests - Nothing was left over; line $line\n";
14157 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
14165 print "1..$Tests\n";
14166 exit($Fails ? -1 : 0);
14169 Error('\p{Script=InGreek}'); # Bug #69018
14170 Test_X("1100 $nobreak 1161"); # Bug #70940