3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # Needs 'no overloading' to run faster on miniperl. Code commented out at the
8 # subroutine objaddr can be used instead to work as far back (untested) as
19 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
21 ##########################################################################
23 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
24 # from the Unicode database files (lib/unicore/.../*.txt). It also generates
25 # a pod file and a .t file.
27 # The structure of this file is:
28 # First these introductory comments; then
29 # code needed for everywhere, such as debugging stuff; then
30 # code to handle input parameters; then
31 # data structures likely to be of external interest (some of which depend on
32 # the input parameters, so follow them); then
33 # more data structures and subroutine and package (class) definitions; then
34 # the small actual loop to process the input files and finish up; then
35 # a __DATA__ section, for the .t tests
37 # This program works on all releases of Unicode through at least 5.2. The
38 # outputs have been scrutinized most intently for release 5.1. The others
39 # have been checked for somewhat more than just sanity. It can handle all
40 # existing Unicode character properties in those releases.
42 # This program needs to be able to run under miniperl. Therefore, it uses a
43 # minimum of other modules, and hence implements some things itself that could
46 # This program uses inputs published by the Unicode Consortium. These can
47 # change incompatibly between releases without the Perl maintainers realizing
48 # it. Therefore this program is now designed to try to flag these. It looks
49 # at the directories where the inputs are, and flags any unrecognized files.
50 # It keeps track of all the properties in the files it handles, and flags any
51 # that it doesn't know how to handle. It also flags any input lines that
52 # don't match the expected syntax, among other checks.
53 # It is also designed so if a new input file matches one of the known
54 # templates, one hopefully just needs to add it to a list to have it
57 # It tries to keep fatal errors to a minimum, to generate something usable for
58 # testing purposes. It always looks for files that could be inputs, and will
59 # warn about any that it doesn't know how to handle (the -q option suppresses
62 # This program is mostly about Unicode character (or code point) properties.
63 # A property describes some attribute or quality of a code point, like if it
64 # is lowercase or not, its name, what version of Unicode it was first defined
65 # in, or what its uppercase equivalent is. Unicode deals with these disparate
66 # possibilities by making all properties into mappings from each code point
67 # into some corresponding value. In the case of it being lowercase or not,
68 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
69 # property maps each Unicode code point to a single value, called a "property
70 # value". (Hence each Unicode property is a true mathematical function with
71 # exactly one value per code point.)
73 # When using a property in a regular expression, what is desired isn't the
74 # mapping of the code point to its property's value, but the reverse (or the
75 # mathematical "inverse relation"): starting with the property value, "Does a
76 # code point map to it?" These are written in a "compound" form:
77 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
78 # files containing the lists of code points that map to each such regular
79 # expression property value, one file per list.
81 # There is also a single form shortcut that Perl adds for many of the commonly
82 # used properties. This happens for all binary properties, plus script,
83 # general_category, and block properties.
85 # Thus the outputs of this program are files. There are map files, mostly in
86 # the 'To' directory; and there are list files for use in regular expression
87 # matching, all in subdirectories of the 'lib' directory, with each
88 # subdirectory being named for the property that the lists in it are for.
89 # Bookkeeping, test, and documentation files are also generated.
91 my $matches_directory = 'lib'; # Where match (\p{}) files go.
92 my $map_directory = 'To'; # Where map files go.
96 # The major data structures of this program are Property, of course, but also
97 # Table. There are two kinds of tables, very similar to each other.
98 # "Match_Table" is the data structure giving the list of code points that have
99 # a particular property value, mentioned above. There is also a "Map_Table"
100 # data structure which gives the property's mapping from code point to value.
101 # There are two structures because the match tables need to be combined in
102 # various ways, such as constructing unions, intersections, complements, etc.,
103 # and the map ones don't. And there would be problems, perhaps subtle, if
104 # a map table were inadvertently operated on in some of those ways.
105 # The use of separate classes with operations defined on one but not the other
106 # prevents accidentally confusing the two.
108 # At the heart of each table's data structure is a "Range_List", which is just
109 # an ordered list of "Ranges", plus ancillary information, and methods to
110 # operate on them. A Range is a compact way to store property information.
111 # Each range has a starting code point, an ending code point, and a value that
112 # is meant to apply to all the code points between the two end points,
113 # inclusive. For a map table, this value is the property value for those
114 # code points. Two such ranges could be written like this:
115 # 0x41 .. 0x5A, 'Upper',
116 # 0x61 .. 0x7A, 'Lower'
118 # Each range also has a type used as a convenience to classify the values.
119 # Most ranges in this program will be Type 0, or normal, but there are some
120 # ranges that have a non-zero type. These are used only in map tables, and
121 # are for mappings that don't fit into the normal scheme of things. Mappings
122 # that require a hash entry to communicate with utf8.c are one example;
123 # another example is mappings for charnames.pm to use which indicate a name
124 # that is algorithmically determinable from its code point (and vice-versa).
125 # These are used to significantly compact these tables, instead of listing
126 # each one of the tens of thousands individually.
128 # In a match table, the value of a range is irrelevant (and hence the type as
129 # well, which will always be 0), and arbitrarily set to the null string.
130 # Using the example above, there would be two match tables for those two
131 # entries, one named Upper would contain the 0x41..0x5A range, and the other
132 # named Lower would contain 0x61..0x7A.
134 # Actually, there are two types of range lists, "Range_Map" is the one
135 # associated with map tables, and "Range_List" with match tables.
136 # Again, this is so that methods can be defined on one and not the other so as
137 # to prevent operating on them in incorrect ways.
139 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
140 # in the perl core. All tables could in theory be written, but some are
141 # suppressed because there is no current practical use for them. It is easy
142 # to change which get written by changing various lists that are near the top
143 # of the actual code in this file. The table data structures contain enough
144 # ancillary information to allow them to be treated as separate entities for
145 # writing, such as the path to each one's file. There is a heading in each
146 # map table that gives the format of its entries, and what the map is for all
147 # the code points missing from it. (This allows tables to be more compact.)
149 # The Property data structure contains one or more tables. All properties
150 # contain a map table (except the $perl property which is a
151 # pseudo-property containing only match tables), and any properties that
152 # are usable in regular expression matches also contain various matching
153 # tables, one for each value the property can have. A binary property can
154 # have two values, True and False (or Y and N, which are preferred by Unicode
155 # terminology). Thus each of these properties will have a map table that
156 # takes every code point and maps it to Y or N (but having ranges cuts the
157 # number of entries in that table way down), and two match tables, one
158 # which has a list of all the code points that map to Y, and one for all the
159 # code points that map to N. (For each of these, a third table is also
160 # generated for the pseudo Perl property. It contains the identical code
161 # points as the Y table, but can be written, not in the compound form, but in
162 # a "single" form like \p{IsUppercase}.) Many properties are binary, but some
163 # properties have several possible values, some have many, and properties like
164 # Name have a different value for every named code point. Those will not,
165 # unless the controlling lists are changed, have their match tables written
166 # out. But all the ones which can be used in regular expression \p{} and \P{}
167 # constructs will. Generally a property will have either its map table or its
168 # match tables written but not both. Again, what gets written is controlled
169 # by lists which can easily be changed.
171 # For information about the Unicode properties, see Unicode's UAX44 document:
173 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
175 # As stated earlier, this program will work on any release of Unicode so far.
176 # Most obvious problems in earlier data have NOT been corrected except when
177 # necessary to make Perl or this program work reasonably. For example, no
178 # folding information was given in early releases, so this program uses the
179 # substitute of lower case, just so that a regular expression with the /i
180 # option will do something that actually gives the right results in many
181 # cases. There are also a couple other corrections for version 1.1.5,
182 # commented at the point they are made. As an example of corrections that
183 # weren't made (but could be) is this statement from DerivedAge.txt: "The
184 # supplementary private use code points and the non-character code points were
185 # assigned in version 2.0, but not specifically listed in the UCD until
186 # versions 3.0 and 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0)
187 # More information on Unicode version glitches is further down in these
188 # introductory comments.
190 # This program works on all properties as of 5.2, though the files for some
191 # are suppressed from apparent lack of demand for them. You can change which are
192 # output by changing lists in this program.
194 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
195 # loose matching rules (from Unicode TR18):
197 # The recommended names for UCD properties and property values are in
198 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
199 # [PropValue]. There are both abbreviated names and longer, more
200 # descriptive names. It is strongly recommended that both names be
201 # recognized, and that loose matching of property names be used,
202 # whereby the case distinctions, whitespace, hyphens, and underbar
204 # The program still allows Fuzzy to override its determination of if loose
205 # matching should be used, but it isn't currently used, as it is no longer
206 # needed; the calculations it makes are good enough.
208 # SUMMARY OF HOW IT WORKS:
212 # A list is constructed containing each input file that is to be processed
214 # Each file on the list is processed in a loop, using the associated handler
216 # The PropertyAliases.txt and PropValueAliases.txt files are processed
217 # first. These files name the properties and property values.
218 # Objects are created of all the property and property value names
219 # that the rest of the input should expect, including all synonyms.
220 # The other input files give mappings from properties to property
221 # values. That is, they list code points and say what the mapping
222 # is under the given property. Some files give the mappings for
223 # just one property; and some for many. This program goes through
224 # each file and populates the properties from them. Some properties
225 # are listed in more than one file, and Unicode has set up a
226 # precedence as to which has priority if there is a conflict. Thus
227 # the order of processing matters, and this program handles the
228 # conflict possibility by processing the overriding input files
229 # last, so that if necessary they replace earlier values.
230 # After this is all done, the program creates the property mappings not
231 # furnished by Unicode, but derivable from what it does give.
232 # The tables of code points that match each property value in each
233 # property that is accessible by regular expressions are created.
234 # The Perl-defined properties are created and populated. Many of these
235 # require data determined from the earlier steps
236 # Any Perl-defined synonyms are created, and name clashes between Perl
237 # and Unicode are reconciled.
238 # All the properties are written to files
239 # Any other files are written, and final warnings issued.
241 # As mentioned above, some properties are given in more than one file. In
242 # particular, the files in the extracted directory are supposedly just
243 # reformattings of the others. But they contain information not easily
244 # derivable from the other files, including results for Unihan, which this
245 # program doesn't ordinarily look at, and for unassigned code points. They
246 # also have historically had errors or been incomplete. In an attempt to
247 # create the best possible data, this program thus processes them first to
248 # glean information missing from the other files; then processes those other
249 # files to override any errors in the extracted ones.
251 # For clarity, a number of operators have been overloaded to work on tables:
252 # ~ means invert (take all characters not in the set). The more
253 # conventional '!' is not used because of the possibility of confusing
254 # it with the actual boolean operation.
256 # - means subtraction
257 # & means intersection
258 # The precedence of these is the order listed. Parentheses should be
259 # copiously used. These are not a general scheme. The operations aren't
260 # defined for a number of things, deliberately, to avoid getting into trouble.
261 # Operations are done on references and affect the underlying structures, so
262 # that the copy constructors for them have been overloaded to not return a new
263 # clone, but the input object itself.
265 # The bool operator is deliberately not overloaded to avoid confusion with
266 # "should it mean if the object merely exists, or also is non-empty?".
269 # WHY CERTAIN DESIGN DECISIONS WERE MADE
271 # XXX These comments need more work.
273 # Why have files written out for binary 'N' matches?
274 # For binary properties, if you know the mapping for either Y or N; the
275 # other is trivial to construct, so could be done at Perl run-time instead
276 # of having a file for it. That is, if someone types in \p{foo: N}, Perl
277 # could translate that to \P{foo: Y} and not need a file. The problem is
278 # communicating to Perl that a given property is binary. Perl can't figure
279 # it out from looking at the N (or No), as some non-binary properties have
280 # these as property values.
282 # There are several types of properties, based on what form their values can
283 # take on. These are described in more detail below in the DATA STRUCTURES
284 # section of these comments, but for now, you should know that there are
285 # string properties, whose values are strings of one or more code points (such
286 # as the Uppercase_mapping property); every other property maps to some other
287 # form, like true or false, or a number, or a name, etc. The reason there are
288 # two directories for map files is because of the way utf8.c works. It
289 # expects that any files there are string properties, that is that the
290 # mappings are each to one code point, with mappings in multiple code points
291 # handled specially in an extra hash data structure. Digit.pl is a table that
292 # is written there for historical reasons, even though it doesn't fit that
293 # mold. Thus it can't currently be looked at by the Perl core.
295 # There are no match tables generated for matches of the null string. These
296 # would look like \p{JSN=}. Perhaps something like them could be added if
297 # necessary. The JSN does have a real code point U+110B that maps to the null
298 # string, but it is a contributory property, and therefore not output by
303 # XXX Add more stuff here. use perl instead of miniperl to find problems with
308 # The program would break if Unicode were to change its names so that
309 # interior white space, underscores, or dashes differences were significant
310 # within property and property value names.
312 # It might be easier to use the xml versions of the UCD if this program ever
313 # would need heavy revision, and the ability to handle old versions was not
316 # There is the potential for name collisions, in that Perl has chosen names
317 # that Unicode could decide it also likes. There have been such collisions in
318 # the past, with mostly Perl deciding to adopt the Unicode definition of the
319 # name. However in the 5.2 Unicode beta testing, there were a number of such
320 # collisions, which were withdrawn before the final release, because of Perl's
321 # and others' protests. These all involved new properties which began with
322 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
323 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
324 # Unicode document, so they are unlikely to be used by Unicode for another
325 # purpose. However, they might try something beginning with 'In', or use any
326 # of the other Perl-defined properties. This program will warn you of name
327 # collisions, and refuse to generate tables with them, but manual intervention
328 # will be required in this event. One scheme that could be implemented, if
329 # necessary, would be to have this program generate another file, or add a
330 # field to mktables.lst that gives the date of first definition of a property.
331 # Each new release of Unicode would use that file as a basis for the next
332 # iteration. And the Perl synonym addition code could sort based on the age
333 # of the property, so older properties get priority, and newer ones that clash
334 # would be refused; hence existing code would not be impacted, and some other
335 # synonym would have to be used for the new property. This is ugly, and
336 # manual intervention would certainly be easier to do in the short run; let's
337 # hope it never comes to this.
341 # This program can generate tables from the Unihan database. But it doesn't
342 # by default, letting the CPAN module Unicode::Unihan handle them. Prior to
343 # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
344 # database was split into 8 different files, all beginning with the letters
345 # 'Unihan'. This program will read those file(s) if present, but it needs to
346 # know which of the many properties in the file(s) should have tables created
347 # for them. It will create tables for any properties listed in
348 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
349 # @cjk_properties array and the @cjk_property_values array. Thus, if a
350 # property you want is not in those files of the release you are building
351 # against, you must add it to those two arrays. Starting in 4.0, the
352 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
353 # is present in the directory, a table will be generated for that property.
354 # In 5.2, several more properties were added. For your convenience, the two
355 # arrays are initialized with all the 5.2 listed properties that are also in
356 # earlier releases. But these are commented out. You can just uncomment the
357 # ones you want, or use them as a template for adding entries for other
360 # You may need to adjust the entries to suit your purposes. setup_unihan(),
361 # and filter_unihan_line() are the functions where this is done. This program
362 # already does some adjusting to make the lines look more like the rest of the
363 # Unicode DB; you can see what that is in filter_unihan_line().
365 # There is a bug in the 3.2 data file in which some values for the
366 # kPrimaryNumeric property have commas and an unexpected comment. A filter
367 # could be added for these; or for a particular installation, the Unihan.txt
368 # file could be edited to fix them.
373 # Unicode Versions Notes
375 # alpha's numbers halve in 2.1.9, answer cjk block at 4E00 were removed from PropList; not changed, could add gc Letter, put back in in 3.1.0
376 # Some versions of 2.1.x Jamo.txt have the wrong value for 1105, which causes
377 # real problems for the algorithms for Jamo calculations, so it is changed
379 # White space vs Space. in 3.2 perl has +205F=medium math space, fixed in 4.0, and ok in 3.1.1 because not there in unicode. synonym introduced in 4.1
380 # ATBL = 202. 202 changed to ATB, and all code points stayed there. So if you were using ATBL you were out of luck.
381 # Hrkt Katakana_Or_Hiragana came in 4.01, before was Unknown.
383 # The default for missing code points for BidiClass is complicated. Starting
384 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
385 # tries to do the best it can for earlier releases. It is done in
386 # process_PropertyAliases()
388 ##############################################################################
390 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
392 my $MAX_LINE_WIDTH = 78;
394 # Debugging aid to skip most files so as to not be distracted by them when
395 # concentrating on the ones being debugged. Add
397 # to the constructor for those files you want processed when you set this.
398 # Files with a first version number of 0 are special: they are always
399 # processed regardless of the state of this flag.
402 # Set to 1 to enable tracing.
405 { # Closure for trace: debugging aid
406 my $print_caller = 1; # ? Include calling subroutine name
407 my $main_with_colon = 'main::';
408 my $main_colon_length = length($main_with_colon);
411 return unless $to_trace; # Do nothing if global flag not set
415 local $DB::trace = 0;
416 $DB::trace = 0; # Quiet 'used only once' message
420 # Loop looking up the stack to get the first non-trace caller
425 $line_number = $caller_line;
426 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
427 $caller = $main_with_colon unless defined $caller;
429 $caller_name = $caller;
432 $caller_name =~ s/.*:://;
433 if (substr($caller_name, 0, $main_colon_length)
436 $caller_name = substr($caller_name, $main_colon_length);
439 } until ($caller_name ne 'trace');
441 # If the stack was empty, we were called from the top level
442 $caller_name = 'main' if ($caller_name eq ""
443 || $caller_name eq 'trace');
446 foreach my $string (@input) {
447 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
448 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
449 $output .= simple_dumper($string);
452 $string = "$string" if ref $string;
453 $string = $UNDEF unless defined $string;
455 $string = '""' if $string eq "";
456 $output .= " " if $output ne ""
458 && substr($output, -1, 1) ne " "
459 && substr($string, 0, 1) ne " ";
464 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
465 print STDERR "$caller_name: " if $print_caller;
466 print STDERR $output, "\n";
471 # This is for a rarely used development feature that allows you to compare two
472 # versions of the Unicode standard without having to deal with changes caused
473 # by the code points introduced in the later version. Change the 0 to a SINGLE
474 # dotted Unicode release number (e.g. 2.1). Only code points introduced in
475 # that release and earlier will be used; later ones are thrown away. You use
476 # the version number of the earliest one you want to compare; then run this
477 # program on directory structures containing each release, and compare the
478 # outputs. These outputs will therefore include only the code points common
479 # to both releases, and you can see the changes caused just by the underlying
480 # release semantic changes. For versions earlier than 3.2, you must copy a
481 # version of DAge.txt into the directory.
482 my $string_compare_versions = DEBUG && 0; # e.g., v2.1;
483 my $compare_versions = DEBUG
484 && $string_compare_versions
485 && pack "C*", split /\./, $string_compare_versions;
488 # Returns non-duplicated input values. From "Perl Best Practices:
489 # Encapsulated Cleverness". p. 455 in first edition.
492 return grep { ! $seen{$_}++ } @_;
495 $0 = File::Spec->canonpath($0);
497 my $make_test_script = 0; # ? Should we output a test script
498 my $write_unchanged_files = 0; # ? Should we update the output files even if
499 # we don't think they have changed
500 my $use_directory = ""; # ? Should we chdir somewhere.
501 my $pod_directory; # input directory to store the pod file.
502 my $pod_file = 'perluniprops';
503 my $t_path; # Path to the .t test file
504 my $file_list = 'mktables.lst'; # File to store input and output file names.
505 # This is used to speed up the build, by not
506 # executing the main body of the program if
507 # nothing on the list has changed since the
509 my $make_list = 1; # ? Should we write $file_list. Set to always
510 # make a list so that when the pumpking is
511 # preparing a release, s/he won't have to do
513 my $glob_list = 0; # ? Should we try to include unknown .txt files
515 my $output_range_counts = 1; # ? Should we include the number of code points
516 # in ranges in the output
517 # Verbosity levels; 0 is quiet
518 my $NORMAL_VERBOSITY = 1;
522 my $verbosity = $NORMAL_VERBOSITY;
526 my $arg = shift @ARGV;
528 $verbosity = $VERBOSE;
530 elsif ($arg eq '-p') {
531 $verbosity = $PROGRESS;
532 $| = 1; # Flush buffers as we go.
534 elsif ($arg eq '-q') {
537 elsif ($arg eq '-w') {
538 $write_unchanged_files = 1; # update the files even if havent changed
540 elsif ($arg eq '-check') {
541 my $this = shift @ARGV;
542 my $ok = shift @ARGV;
544 print "Skipping as check params are not the same.\n";
548 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
549 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
551 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
553 $make_test_script = 1;
555 elsif ($arg eq '-makelist') {
558 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
559 -d $use_directory or croak "Unknown directory '$use_directory'";
561 elsif ($arg eq '-L') {
563 # Existence not tested until have chdir'd
566 elsif ($arg eq '-globlist') {
569 elsif ($arg eq '-c') {
570 $output_range_counts = ! $output_range_counts
574 $with_c .= 'out' if $output_range_counts; # Complements the state
576 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
577 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
579 -c : Output comments $with_c number of code points in ranges
580 -q : Quiet Mode: Only output serious warnings.
581 -p : Set verbosity level to normal plus show progress.
582 -v : Set Verbosity level high: Show progress and non-serious
584 -w : Write files regardless
585 -C dir : Change to this directory before proceeding. All relative paths
586 except those specified by the -P and -T options will be done
587 with respect to this directory.
588 -P dir : Output $pod_file file to directory 'dir'.
589 -T path : Create a test script as 'path'; overrides -maketest
590 -L filelist : Use alternate 'filelist' instead of standard one
591 -globlist : Take as input all non-Test *.txt files in current and sub
593 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
595 -makelist : Rewrite the file list $file_list based on current setup
596 -check A B : Executes $0 only if A and B are the same
601 # Stores the most-recently changed file. If none have changed, can skip the
603 my $youngest = -M $0; # Do this before the chdir!
605 # Change directories now, because need to read 'version' early.
606 if ($use_directory) {
607 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
608 $pod_directory = File::Spec->rel2abs($pod_directory);
610 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
611 $t_path = File::Spec->rel2abs($t_path);
613 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
614 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
615 $pod_directory = File::Spec->abs2rel($pod_directory);
617 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
618 $t_path = File::Spec->abs2rel($t_path);
622 # Get Unicode version into regular and v-string. This is done now because
623 # various tables below get populated based on it. These tables are populated
624 # here to be near the top of the file, and so easily seeable by those needing
626 open my $VERSION, "<", "version"
627 or croak "$0: can't open required file 'version': $!\n";
628 my $string_version = <$VERSION>;
630 chomp $string_version;
631 my $v_version = pack "C*", split /\./, $string_version; # v string
633 # The following are the complete names of properties with property values that
634 # are known to not match any code points in some versions of Unicode, but that
635 # may change in the future so they should be matchable, hence an empty file is
636 # generated for them.
637 my @tables_that_may_be_empty = (
638 'Joining_Type=Left_Joining',
640 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
641 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
642 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
643 if $v_version ge v4.1.0;
645 # The lists below are hashes, so the key is the item in the list, and the
646 # value is the reason why it is in the list. This makes generation of
647 # documentation easier.
649 my %why_suppressed; # No file generated for these.
651 # Files aren't generated for empty extraneous properties. This is arguable.
652 # Extraneous properties generally come about because a property is no longer
653 # used in a newer version of Unicode. If we generated a file without code
654 # points, programs that used to work on that property will still execute
655 # without errors. It just won't ever match (or will always match, with \P{}).
656 # This means that the logic is now likely wrong. I (khw) think it's better to
657 # find this out by getting an error message. Just move them to the table
658 # above to change this behavior
659 my %why_suppress_if_empty_warn_if_not = (
661 # It is the only property that has ever officially been removed from the
662 # Standard. The database never contained any code points for it.
663 'Special_Case_Condition' => 'Obsolete',
665 # Apparently never official, but there were code points in some versions of
666 # old-style PropList.txt
667 'Non_Break' => 'Obsolete',
670 # These would normally go in the warn table just above, but they were changed
671 # a long time before this program was written, so warnings about them are
673 if ($v_version gt v3.2.0) {
674 push @tables_that_may_be_empty,
675 'Canonical_Combining_Class=Attached_Below_Left'
678 # These are listed in the Property aliases file in 5.2, but Unihan is ignored
679 # unless explicitly added.
680 if ($v_version ge v5.2.0) {
681 my $unihan = 'Unihan; remove from list if using Unihan';
682 foreach my $table qw (
686 kCompatibilityVariant
700 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
704 # Properties that this program ignores.
705 my @unimplemented_properties = (
706 'Unicode_Radical_Stroke' # Remove if changing to handle this one.
709 # There are several types of obsolete properties defined by Unicode. These
710 # must be hand-edited for every new Unicode release.
711 my %why_deprecated; # Generates a deprecated warning message if used.
712 my %why_stabilized; # Documentation only
713 my %why_obsolete; # Documentation only
716 my $simple = 'Perl uses the more complete version of this property'; # Reason text reused for the Simple_* mappings and Unicode_1_Name
717 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
719 my $other_properties = 'other properties'; # Placeholder; substituted per property for the 'Other_' entries
720 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
721 my $why_no_expand = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)"; # Reason text for the Expands_On_* properties
724 'Grapheme_Link' => 'Deprecated by Unicode. Use ccc=vr (Canonical_Combining_Class=Virama) instead',
725 'Jamo_Short_Name' => $contributory,
726 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
727 'Other_Alphabetic' => $contributory,
728 'Other_Default_Ignorable_Code_Point' => $contributory,
729 'Other_Grapheme_Extend' => $contributory,
730 'Other_ID_Continue' => $contributory,
731 'Other_ID_Start' => $contributory,
732 'Other_Lowercase' => $contributory,
733 'Other_Math' => $contributory,
734 'Other_Uppercase' => $contributory,
738 # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
739 # contains the same information, but without the algorithmically
740 # determinable Hangul syllables. This file is not published, so its
741 # existence is not noted in the comment.
742 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
744 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
745 'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo. If there is no later name for a code point, then this one is used instead in charnames",
747 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
748 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
749 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
750 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
752 'Name' => "Accessible via 'use charnames;'",
753 'Name_Alias' => "Accessible via 'use charnames;'",
755 # These are sort of jumping the gun; deprecation is proposed for
756 # Unicode version 6.0, but they have never been exposed by Perl, and
757 # likely are soon to be deprecated, so best not to expose them.
758 FC_NFKC_Closure => 'Use NFKC_Casefold instead',
759 Expands_On_NFC => $why_no_expand,
760 Expands_On_NFD => $why_no_expand,
761 Expands_On_NFKC => $why_no_expand,
762 Expands_On_NFKD => $why_no_expand,
765 # The following are suppressed because they were made contributory or
766 # deprecated by Unicode before Perl ever thought about supporting them.
767 foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
768 $why_suppressed{$property} = $why_deprecated{$property};
771 # Customize the message for all the 'Other_' properties
772 foreach my $property (keys %why_deprecated) {
773 next if (my $main_property = $property) !~ s/^Other_//;
774 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
778 if ($v_version ge 4.0.0) {
779 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
781 if ($v_version ge 5.2.0) {
782 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
785 # Probably obsolete forever
786 if ($v_version ge v4.1.0) {
787 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common"';
790 # This program can create files for enumerated-like properties, such as
791 # 'Numeric_Type'. This file would be the same format as for a string
792 # property, with a mapping from code point to its value, so you could look up,
793 # for example, the script a code point is in. But no one so far wants this
794 # mapping, or they have found another way to get it since this is a new
795 # feature. So no file is generated except if it is in this list.
796 my @output_mapped_properties = split "\n", <<END;
799 # If you are using the Unihan database, you need to add the properties that
800 # you want to extract from it to this table. For your convenience, the
801 # properties in the 5.2 PropertyAliases.txt file are listed, commented out
802 my @cjk_properties = split "\n", <<'END';
803 #cjkAccountingNumeric; kAccountingNumeric
804 #cjkOtherNumeric; kOtherNumeric
805 #cjkPrimaryNumeric; kPrimaryNumeric
806 #cjkCompatibilityVariant; kCompatibilityVariant
808 #cjkIRG_GSource; kIRG_GSource
809 #cjkIRG_HSource; kIRG_HSource
810 #cjkIRG_JSource; kIRG_JSource
811 #cjkIRG_KPSource; kIRG_KPSource
812 #cjkIRG_KSource; kIRG_KSource
813 #cjkIRG_TSource; kIRG_TSource
814 #cjkIRG_USource; kIRG_USource
815 #cjkIRG_VSource; kIRG_VSource
816 #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
819 # Similarly for the property values. For your convenience, the lines in the
820 # 5.2 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
822 my @cjk_property_values = split "\n", <<'END';
823 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
824 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
825 ## @missing: 0000..10FFFF; cjkIICore; <none>
826 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
827 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
828 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
829 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
830 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
831 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
832 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
833 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
834 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
835 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
836 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
839 # The input files don't list every code point. Those not listed are to be
840 # defaulted to some value. Below are hard-coded what those values are for
841 # non-binary properties as of 5.1. Starting in 5.0, there are
842 # machine-parsable comment lines in the files that give the defaults; so this
843 # list shouldn't have to be extended. The claim is that all missing entries
844 # for binary properties will default to 'N'. Unicode tried to change that in
845 # 5.2, but the beta period produced enough protest that they backed off.
847 # The defaults for the fields that appear in UnicodeData.txt in this hash must
848 # be in the form that it expects. The others may be synonyms.
849 my $CODE_POINT = '<code point>';
850 my %default_mapping = (
852 # Bidi_Class => Complicated; set in code
853 Bidi_Mirroring_Glyph => "",
855 Canonical_Combining_Class => 0,
856 Case_Folding => $CODE_POINT,
857 Decomposition_Mapping => $CODE_POINT,
858 Decomposition_Type => 'None',
859 East_Asian_Width => "Neutral",
860 FC_NFKC_Closure => $CODE_POINT,
861 General_Category => 'Cn',
862 Grapheme_Cluster_Break => 'Other',
863 Hangul_Syllable_Type => 'NA',
865 Jamo_Short_Name => "",
866 Joining_Group => "No_Joining_Group",
867 # Joining_Type => Complicated; set in code
868 kIICore => 'N', # Is converted to binary
869 #Line_Break => Complicated; set in code
870 Lowercase_Mapping => $CODE_POINT,
877 Numeric_Type => 'None',
878 Numeric_Value => 'NaN',
879 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
880 Sentence_Break => 'Other',
881 Simple_Case_Folding => $CODE_POINT,
882 Simple_Lowercase_Mapping => $CODE_POINT,
883 Simple_Titlecase_Mapping => $CODE_POINT,
884 Simple_Uppercase_Mapping => $CODE_POINT,
885 Titlecase_Mapping => $CODE_POINT,
886 Unicode_1_Name => "",
887 Unicode_Radical_Stroke => "",
888 Uppercase_Mapping => $CODE_POINT,
889 Word_Break => 'Other',
892 # Below are files that Unicode furnishes, but this program ignores, and why
893 my %ignored_files = (
894 'CJKRadicals.txt' => 'Unihan data',
895 'Index.txt' => 'An index, not actual data',
896 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
897 'NamesList.txt' => 'Just adds commentary',
898 'NormalizationCorrections.txt' => 'Data is already in other files.',
899 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
900 'ReadMe.txt' => 'Just comments',
901 'README.TXT' => 'Just comments',
902 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
905 ################ End of externally interesting definitions ###############
908 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
909 # This file is machine-generated by $0 from the Unicode
910 # database, Version $string_version. Any changes made here will be lost!
913 my $INTERNAL_ONLY=<<"EOF";
915 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
916 # This file is for internal use by the Perl program only. The format and even
917 # the name or existence of this file are subject to change without notice.
918 # Don't use it directly.
921 my $DEVELOPMENT_ONLY=<<"EOF";
922 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
923 # This file contains information artificially constrained to code points
924 # present in Unicode release $string_compare_versions.
925 # IT CANNOT BE RELIED ON. It is for use during development only and should
926 # not be used for production.
930 my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF"; # Highest code point, as a hex string
931 my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING; # Same value, as a number
932 my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1; # Count of code points, 0 through the last one
934 # Matches legal code point. 4-6 hex digits. If there are 6, the first
935 # two must be 10; if there are 5, the first must not be a 0. Written this way
936 # to decrease backtracking
938 qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
940 # This matches the beginning of the line in the Unicode db files that give the
941 # defaults for code points not listed (i.e., missing) in the file. The code
942 # depends on this ending with a semi-colon, so it can assume it is a valid
943 # field when the line is split() by semi-colons
944 my $missing_defaults_prefix =
945 qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
947 # Property types. Unicode has more types, but these are sufficient for our
949 my $UNKNOWN = -1; # initialized to illegal value
950 my $NON_STRING = 1; # Either binary or enum
952 my $ENUM = 3; # Include catalog
953 my $STRING = 4; # Anything else: string or misc
955 # Some input files have lines that give default values for code points not
956 # contained in the file. Sometimes these should be ignored.
957 my $NO_DEFAULTS = 0; # Must evaluate to false
961 # Range types. Each range has a type. Most ranges are type 0, for normal,
962 # and will appear in the main body of the tables in the output files, but
963 # there are other types of ranges as well, listed below, that are specially
964 # handled. There are pseudo-types as well that will never be stored as a
965 # type, but will affect the calculation of the type.
967 # 0 is for normal, non-specials
968 my $MULTI_CP = 1; # Sequence of more than one code point
969 my $HANGUL_SYLLABLE = 2;
970 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
971 my $NULL = 4; # The map is to the null string; utf8.c can't
972 # handle these, nor is there an accepted syntax
973 # for them in \p{} constructs
974 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
975 # otherwise be $MULTI_CP type are instead type 0
977 # process_generic_property_file() can accept certain overrides in its input.
978 # Each of these must begin AND end with $CMD_DELIM.
979 my $CMD_DELIM = "\a";
980 my $REPLACE_CMD = 'replace'; # Override the Replace
981 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
986 # Values for the Replace argument to add_range.
987 # $NO # Don't replace; add only the code points not
989 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
990 # the comments at the subroutine definition.
991 my $UNCONDITIONALLY = 2; # Replace without conditions.
992 my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
995 # Flags to give property statuses. The phrases are to remind maintainers that
996 # if the flag is changed, the indefinite article referring to it in the
997 # documentation may need to be as well.
999 my $SUPPRESSED = 'z'; # The character should never actually be seen, since
1001 my $DEPRECATED = 'D';
1002 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1003 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1004 my $DISCOURAGED = 'X';
1005 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1006 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1008 my $a_bold_stricter = "a 'B<$STRICTER>'";
1009 my $A_bold_stricter = "A 'B<$STRICTER>'";
1010 my $STABILIZED = 'S';
1011 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1012 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1014 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1015 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1017 my %status_past_participles = (
1018 $DISCOURAGED => 'discouraged',
1019 $SUPPRESSED => 'should never be generated',
1020 $STABILIZED => 'stabilized',
1021 $OBSOLETE => 'obsolete',
1022 $DEPRECATED => 'deprecated'
1025 # The format of the values of the map tables:
1026 my $BINARY_FORMAT = 'b';
1027 my $DECIMAL_FORMAT = 'd';
1028 my $FLOAT_FORMAT = 'f';
1029 my $INTEGER_FORMAT = 'i';
1030 my $HEX_FORMAT = 'x';
1031 my $RATIONAL_FORMAT = 'r';
1032 my $STRING_FORMAT = 's';
1034 my %map_table_formats = (
1035 $BINARY_FORMAT => 'binary',
1036 $DECIMAL_FORMAT => 'single decimal digit',
1037 $FLOAT_FORMAT => 'floating point number',
1038 $INTEGER_FORMAT => 'integer',
1039 $HEX_FORMAT => 'positive hex whole number; a code point',
1040 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1041 $STRING_FORMAT => 'arbitrary string',
1044 # Unicode didn't put such derived files in a separate directory at first.
1045 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; # Empty string when there is no 'extracted' directory
1046 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; # Same, plus a trailing '/' when non-empty
1047 my $AUXILIARY = 'auxiliary';
1049 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1050 my %loose_to_file_of; # loosely maps table names to their respective
1052 my %stricter_to_file_of; # same; but for stricter mapping.
1053 my %nv_floating_to_rational; # maps floating point numeric values to
1054 # their rational equivalent
1055 my %loose_property_name_of; # Loosely maps property names to standard form
1057 # These constants names and values were taken from the Unicode standard,
1058 # version 5.1, section 3.12. They are used in conjunction with Hangul
1068 my $NCount = $VCount * $TCount;
1070 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1071 # with the above published constants.
1073 my %Jamo_L; # Leading consonants
1074 my %Jamo_V; # Vowels
1075 my %Jamo_T; # Trailing consonants
1077 my @unhandled_properties; # Will contain a list of properties found in
1078 # the input that we didn't process.
1079 my @match_properties; # Properties that have match tables, to be
1081 my @map_properties; # Properties that get map files written
1082 my @named_sequences; # NamedSequences.txt contents.
1083 my %potential_files; # Generated list of all .txt files in the directory
1084 # structure so we can warn if something is being
1086 my @files_actually_output; # List of files we generated.
1087 my @more_Names; # Some code point names are compound; this is used
1088 # to store the extra components of them.
1089 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1090 # the minimum before we consider it equivalent to a
1091 # candidate rational
1092 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1094 # These store references to certain commonly used property objects
1099 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1100 my $has_In_conflicts = 0;
1101 my $has_Is_conflicts = 0;
1103 sub internal_file_to_platform ($) {
1104 # Convert our file paths which have '/' separators to those of the
1108 return undef unless defined $file;
1110 return File::Spec->join(split '/', $file);
1113 sub file_exists ($) { # platform independent '-e'. This program internally
1114 # uses slash as a path separator.
1116 return 0 if ! defined $file;
1117 return -e internal_file_to_platform($file);
1121 # Returns the address of the blessed input object.
1122 # It doesn't check for blessedness because that would do a string eval
1123 # every call, and the program is structured so that this is never called
1124 # for a non-blessed object.
1126 no overloading; # If overloaded, numifying below won't work.
1128 # Numifying a ref gives its address.
1132 # Commented code below should work on Perl 5.8.
1133 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1134 ## the native perl version of it (which is what would operate under miniperl)
1135 ## is extremely slow, as it does a string eval every call.
1136 #my $has_fast_scalar_util = $
\18 !~ /miniperl/
1137 # && defined eval "require Scalar::Util";
1140 # # Returns the address of the blessed input object. Uses the XS version if
1141 # # available. It doesn't check for blessedness because that would do a
1142 # # string eval every call, and the program is structured so that this is
1143 # # never called for a non-blessed object.
1145 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1147 # # Check at least that is a ref.
1148 # my $pkg = ref($_[0]) or return undef;
1150 # # Change to a fake package to defeat any overloaded stringify
1151 # bless $_[0], 'main::Fake';
1153 # # Numifying a ref gives its address.
1154 # my $addr = 0 + $_[0];
1156 # # Return to original class
1157 # bless $_[0], $pkg;
1164 return $a if $a >= $b;
1171 return $a if $a <= $b;
1175 sub clarify_number ($) {
1176 # This returns the input number with underscores inserted every 3 digits
1177 # in large (5 digits or more) numbers. Input must be entirely digits, not
1181 my $pos = length($number) - 3;
1182 return $number if $pos <= 1;
1184 substr($number, $pos, 0) = '_';
1193 # These routines give a uniform treatment of messages in this program. They
1194 # are placed in the Carp package to cause the stack trace to not include them,
1195 # although an alternative would be to use another package and set @CARP_NOT
1198 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1200 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1201 # and overload trying to load Scalar::Util under miniperl. See
1202 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1203 undef $overload::VERSION;
1206 my $message = shift || "";
1207 my $nofold = shift || 0;
1210 $message = main::join_lines($message);
1211 $message =~ s/^$0: *//; # Remove initial program name
1212 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1213 $message = "\n$0: $message;";
1215 # Fold the message with program name, semi-colon end punctuation
1216 # (which looks good with the message that carp appends to it), and a
1217 # hanging indent for continuation lines.
1218 $message = main::simple_fold($message, "", 4) unless $nofold;
1219 $message =~ s/\n$//; # Remove the trailing nl so what carp
1220 # appends is to the same line
1223 return $message if defined wantarray; # If a caller just wants the msg
1230 # This is called when it is clear that the problem is caused by a bug in
1233 my $message = shift; # Text describing the suspected bug
1234 $message =~ s/^$0: *//; # Remove initial program name
1235 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perlbug\@perl.org:\n$message");
1240 sub carp_too_few_args { # Reports that a routine was called with fewer arguments than it needs
1242 my_carp_bug("Wrong number of arguments: to 'carp_too_few_args'. No action taken.");
1246 my $args_ref = shift; # Reference to the argument list being complained about
1249 my_carp_bug("Need at least $count arguments to "
1251 . ". Instead got: '"
1252 . join(', ', @$args_ref) # Parenthesized so the closing text is not swallowed into join's list
1253 . "'. No action taken.");
1257 sub carp_extra_args {
1258 my $args_ref = shift;
1259 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1261 unless (ref $args_ref) {
1262 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1265 my ($package, $file, $line) = caller;
1266 my $subroutine = (caller 1)[3];
1269 if (ref $args_ref eq 'HASH') {
1270 foreach my $key (keys %$args_ref) {
1271 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; # Make undefined values printable
1273 $list = join ', ', %{$args_ref}; # All leftover key/value pairs, not just the first one
1275 elsif (ref $args_ref eq 'ARRAY') {
1276 foreach my $arg (@$args_ref) {
1277 $arg = $UNDEF unless defined $arg;
1279 $list = join ', ', @$args_ref;
1282 my_carp_bug("Can't cope with ref "
1284 . " . argument to 'carp_extra_args'. Not checking arguments.");
1288 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1296 # This program uses the inside-out method for objects, as recommended in
1297 # "Perl Best Practices". This closure aids in generating those. There
1298 # are two routines. setup_package() is called once per package to set
1299 # things up, and then set_access() is called for each hash representing a
1300 # field in the object. These routines arrange for the object to be
1301 # properly destroyed when no longer used, and for standard accessor
1302 # functions to be generated. If you need more complex accessors, just
1303 # write your own and leave those accesses out of the call to set_access().
1304 # More details below.
1306 my %constructor_fields; # fields that are to be used in constructors; see
1309 # The values of this hash will be the package names as keys to other
1310 # hashes containing the name of each field in the package as keys, and
1311 # references to their respective hashes as values.
1315 # Sets up the package, creating standard DESTROY and dump methods
1316 # (unless already defined). The dump method is used in debugging by
1318 # The optional parameters are:
1319 # a) a reference to a hash, that gets populated by later
1320 # set_access() calls with one of the accesses being
1321 # 'constructor'. The caller can then refer to this, but it is
1322 # not otherwise used by these two routines.
1323 # b) a reference to a callback routine to call during destruction
1324 # of the object, before any fields are actually destroyed
1327 my $constructor_ref = delete $args{'Constructor_Fields'};
1328 my $destroy_callback = delete $args{'Destroy_Callback'};
1329 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1332 my $package = (caller)[0];
1334 $package_fields{$package} = \%fields;
1335 $constructor_fields{$package} = $constructor_ref;
1337 unless ($package->can('DESTROY')) {
1338 my $destroy_name = "${package}::DESTROY";
1341 # Use typeglob to give the anonymous subroutine the name we want
1342 *$destroy_name = sub {
1344 my $addr = main::objaddr($self);
1346 $self->$destroy_callback if $destroy_callback;
1347 foreach my $field (keys %{$package_fields{$package}}) {
1348 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1349 delete $package_fields{$package}{$field}{$addr};
1355 unless ($package->can('dump')) {
1356 my $dump_name = "${package}::dump";
1360 return dump_inside_out($self, $package_fields{$package}, @_);
1367 # Arrange for the input field to be garbage collected when no longer
1368 # needed. Also, creates standard accessor functions for the field
1369 # based on the optional parameters-- none if none of these parameters:
1370 # 'addable' creates an 'add_NAME()' accessor function.
1371 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1373 # 'settable' creates a 'set_NAME()' accessor function.
1374 # 'constructor' doesn't create an accessor function, but adds the
1375 # field to the hash that was previously passed to
1377 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1378 # 'add' etc. all mean 'addable'.
1379 # The read accessor function will work on both array and scalar
1380 # values. If another accessor in the parameter list is 'a', the read
1381 # access assumes an array. You can also force it to be array access
1382 # by specifying 'readable_array' instead of 'readable'
1384 # A sort-of 'protected' access can be set-up by preceding the addable,
1385 # readable or settable with some initial portion of 'protected_' (but,
1386 # the underscore is required), like 'p_a', 'pro_set', etc. The
1387 # "protection" is only by convention. All that happens is that the
1388 # accessor functions' names begin with an underscore. So instead of
1389 # calling set_foo, the call is _set_foo. (Real protection could be
1390 # accomplished by having a new subroutine, end_package called at the
1391 # end of each package, and then storing the __LINE__ ranges and
1392 # checking them on every accessor. But that is way overkill.)
1394 # We create anonymous subroutines as the accessors and then use
1395 # typeglobs to assign them to the proper package and name
1397 my $name = shift; # Name of the field
1398 my $field = shift; # Reference to the inside-out hash containing the
1401 my $package = (caller)[0];
1403 if (! exists $package_fields{$package}) {
1404 croak "$0: Must call 'setup_package' before 'set_access'";
1407 # Stash the field so DESTROY can get it.
1408 $package_fields{$package}{$name} = $field;
1410 # Remaining arguments are the accessors. For each...
1411 foreach my $access (@_) {
1412 my $access = lc $access;
1416 # Match the input as far as it goes.
1417 if ($access =~ /^(p[^_]*)_/) {
1419 if (substr('protected_', 0, length $protected)
1423 # Add 1 for the underscore not included in $protected
1424 $access = substr($access, length($protected) + 1);
1432 if (substr('addable', 0, length $access) eq $access) {
1433 my $subname = "${package}::${protected}add_$name";
1436 # add_ accessor. Don't add if already there, which we
1437 # determine using 'eq' for scalars and '==' otherwise.
1440 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1443 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1445 return if grep { $value == $_ }
1446 @{$field->{main::objaddr $self}};
1449 return if grep { $value eq $_ }
1450 @{$field->{main::objaddr $self}};
1452 push @{$field->{main::objaddr $self}}, $value;
1456 elsif (substr('constructor', 0, length $access) eq $access) {
1458 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1461 $constructor_fields{$package}{$name} = $field;
1464 elsif (substr('readable_array', 0, length $access) eq $access) {
1466 # Here has read access. If one of the other parameters for
1467 # access is array, or this one specifies array (by being more
1468 # than just 'readable_'), then create a subroutine that
1469 # assumes the data is an array. Otherwise just a scalar
1470 my $subname = "${package}::${protected}$name";
1471 if (grep { /^a/i } @_
1472 or length($access) > length('readable_'))
1477 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1478 my $addr = main::objaddr $_[0];
1479 if (ref $field->{$addr} ne 'ARRAY') {
1480 my $type = ref $field->{$addr};
1481 $type = 'scalar' unless $type;
1482 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1485 return scalar @{$field->{$addr}} unless wantarray;
1487 # Make a copy; had problems with caller modifying the
1488 # original otherwise
1489 my @return = @{$field->{$addr}};
1495 # Here not an array value, a simpler function.
1499 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1500 return $field->{main::objaddr $_[0]};
1504 elsif (substr('settable', 0, length $access) eq $access) {
1505 my $subname = "${package}::${protected}set_$name";
1510 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1511 Carp::carp_extra_args(\@_) if @_ > 2;
1513 # $self is $_[0]; $value is $_[1]
1514 $field->{main::objaddr $_[0]} = $_[1];
1519 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1528 # All input files use this object, which stores various attributes about them,
1529 # and provides for convenient, uniform handling. The run method wraps the
1530 # processing. It handles all the bookkeeping of opening, reading, and closing
1531 # the file, returning only significant input lines.
1533 # Each object gets a handler which processes the body of the file, and is
1534 # called by run(). Most should use the generic, default handler, which has
1535 # code scrubbed to handle things you might not expect. A handler should
1536 # basically be a while(next_line()) {...} loop.
1538 # You can also set up handlers to
1539 # 1) call before the first line is read for pre processing
1540 # 2) call to adjust each line of the input before the main handler gets them
1541 # 3) call upon EOF before the main handler exits its loop
1542 # 4) call at the end for post processing
1544 # $_ is used to store the input line, and is to be filtered by the
1545 # each_line_handler()s. So, if the format of the line is not in the desired
1546 # format for the main handler, these are used to do that adjusting. They can
1547 # be stacked (by enclosing them in an [ anonymous array ] in the constructor),
1548 # so the $_ output of one is used as the input to the next. None of the other
1549 # handlers are stackable, but could easily be changed to be so.
1551 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1552 # which insert the parameters as lines to be processed before the next input
1553 # file line is read. This allows the EOF handler to flush buffers, for
1554 # example. The difference between the two routines is that the lines inserted
1555 # by insert_lines() are subjected to the each_line_handler()s. (So if you
1556 # called it from such a handler, you would get infinite recursion.) Lines
1557 # inserted by insert_adjusted_lines() go directly to the main handler without
1558 # any adjustments. If the post-processing handler calls any of these, there
1559 # will be no effect. Some error checking for these conditions could be added,
1560 # but it hasn't been done.
1562 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1563 # to prevent further processing of the line. This routine will output the
1564 # message as a warning once, and then keep a count of the lines that have the
1565 # same message, and output that count at the end of the file's processing.
1566 # This keeps the number of messages down to a manageable amount.
1568 # get_missings() should be called to retrieve any @missing input lines.
1569 # Messages will be raised if this isn't done if the options aren't to ignore
1572 sub trace { return main::trace(@_); }
1575 # Keep track of fields that are to be put into the constructor.
1576 my %constructor_fields;
1578 main::setup_package(Constructor_Fields => \%constructor_fields);
1580 my %file; # Input file name, required
1581 main::set_access('file', \%file, qw{ c r });
1583 my %first_released; # Unicode version file was first released in, required
1584 main::set_access('first_released', \%first_released, qw{ c r });
1586 my %handler; # Subroutine to process the input file, defaults to
1587 # 'process_generic_property_file'
1588 main::set_access('handler', \%handler, qw{ c });
1591 # name of property this file is for. defaults to none, meaning not
1592 # applicable, or is otherwise determinable, for example, from each line.
1593 main::set_access('property', \%property, qw{ c });
1596 # If this is true, the file is optional. If not present, no warning is
1597 # output. If it is present, the string given by this parameter is
1598 # evaluated, and if false the file is not processed.
1599 main::set_access('optional', \%optional, 'c', 'r');
1602 # This is used for debugging, to skip processing of all but a few input
1603 # files. Add 'non_skip => 1' to the constructor for those files you want
1604 # processed when you set the $debug_skip global.
1605 main::set_access('non_skip', \%non_skip, 'c');
1607 my %each_line_handler;
1608 # list of subroutines to look at and filter each non-comment line in the
1609 # file. defaults to none. The subroutines are called in order, each is
1610 # to adjust $_ for the next one, and the final one adjusts it for
1612 main::set_access('each_line_handler', \%each_line_handler, 'c');
1614 my %has_missings_defaults;
1615 # ? Are there lines in the file giving default values for code points
1616 # missing from it? Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
1617 # the norm, but IGNORED means it has such lines, but the handler doesn't
1618 # use them. Having these three states allows us to catch changes to the
1619 # UCD that this program should track
1620 main::set_access('has_missings_defaults',
1621 \%has_missings_defaults, qw{ c r });
1624 # Subroutine to call before doing anything else in the file. If undef, no
1625 # such handler is called.
1626 main::set_access('pre_handler', \%pre_handler, qw{ c });
1629 # Subroutine to call upon getting an EOF on the input file, but before
1630 # that is returned to the main handler. This is to allow buffers to be
1631 # flushed. The handler is expected to call insert_lines() or
1632 # insert_adjusted_lines() with the buffered material
1633 main::set_access('eof_handler', \%eof_handler, qw{ c r });
1636 # Subroutine to call after all the lines of the file are read in and
1637 # processed. If undef, no such handler is called.
1638 main::set_access('post_handler', \%post_handler, qw{ c });
1640 my %progress_message;
1641 # Message to print to display progress in lieu of the standard one
1642 main::set_access('progress_message', \%progress_message, qw{ c });
1645 # cache open file handle, internal. Is undef if file hasn't been
1646 # processed at all, empty if has;
1647 main::set_access('handle', \%handle);
1650 # cache of lines added virtually to the file, internal
1651 main::set_access('added_lines', \%added_lines);
1654 # cache of errors found, internal
1655 main::set_access('errors', \%errors);
1658 # storage of '@missing' defaults lines
1659 main::set_access('missings', \%missings);
1664 my $self = bless \do{ my $anonymous_scalar }, $class;
1665 my $addr = main::objaddr($self);
1668 $handler{$addr} = \&main::process_generic_property_file;
1669 $non_skip{$addr} = 0;
1670 $has_missings_defaults{$addr} = $NO_DEFAULTS;
1671 $handle{$addr} = undef;
1672 $added_lines{$addr} = [ ];
1673 $each_line_handler{$addr} = [ ];
1674 $errors{$addr} = { };
1675 $missings{$addr} = [ ];
1677 # Two positional parameters.
1678 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1679 $file{$addr} = main::internal_file_to_platform(shift);
1680 $first_released{$addr} = shift;
1682 # The rest of the arguments are key => value pairs
1683 # %constructor_fields has been set up earlier to list all possible
1684 # ones. Either set or push, depending on how the default has been set
1687 foreach my $key (keys %args) {
1688 my $argument = $args{$key};
1690 # Note that the fields are the lower case of the constructor keys
1691 my $hash = $constructor_fields{lc $key};
1692 if (! defined $hash) {
1693 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
1696 if (ref $hash->{$addr} eq 'ARRAY') {
1697 if (ref $argument eq 'ARRAY') {
1698 foreach my $argument (@{$argument}) {
1699 next if ! defined $argument;
1700 push @{$hash->{$addr}}, $argument;
1704 push @{$hash->{$addr}}, $argument if defined $argument;
1708 $hash->{$addr} = $argument;
1713 # If the file has a property for it, it means that the property is not
1714 # listed in the file's entries. So add a handler to the list of line
1715 # handlers to insert the property name into the lines, to provide a
1716 # uniform interface to the final processing subroutine.
1717 # That way, the final code doesn't have to worry about the property being absent.
1718 if ($property{$addr}) {
1719 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
1722 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
1723 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
1732 qw("") => "_operator_stringify",
1733 "." => \&main::_operator_dot,
1736 sub _operator_stringify {
1739 return __PACKAGE__ . " object for " . $self->file;
1742 # flag to make sure extracted files are processed early
1743 my $seen_non_extracted_non_age = 0;
1746 # Process the input object $self. This opens and closes the file and
1747 # calls all the handlers for it. Currently, this can only be called
1748 # once per file, as it destroys the EOF handler
1751 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1753 my $addr = main::objaddr $self;
1755 my $file = $file{$addr};
1757 # Don't process if not expecting this file (because released later
1758 # than this Unicode version), and isn't there. This means if someone
1759 # copies it into an earlier version's directory, we will go ahead and
1761 return if $first_released{$addr} gt $v_version && ! -e $file;
1763 # If in debugging mode and this file doesn't have the non-skip
1764 # flag set, and isn't one of the critical files, skip it.
1766 && $first_released{$addr} ne v0
1767 && ! $non_skip{$addr})
1769 print "Skipping $file in debugging\n" if $verbosity;
1773 # File could be optional
1774 if ($optional{$addr}){
1775 return unless -e $file;
1776 my $result = eval $optional{$addr};
1777 if (! defined $result) {
1778 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
1783 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
1789 if (! defined $file || ! -e $file) {
1791 # If the file doesn't exist, see if have internal data for it
1792 # (based on first_released being 0).
1793 if ($first_released{$addr} eq v0) {
1794 $handle{$addr} = 'pretend_is_open';
1797 if (! $optional{$addr} # File could be optional
1798 && $v_version ge $first_released{$addr})
1800 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
1807 # Here, the file exists
1808 if ($seen_non_extracted_non_age) {
1809 if ($file =~ /$EXTRACTED/i) {
1810 Carp::my_carp_bug(join_lines(<<END
1811 $file should be processed just after the 'Prop...Alias' files, and before
1812 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
1813 have subtle problems
1818 elsif ($EXTRACTED_DIR
1819 && $first_released{$addr} ne v0
1820 && $file !~ /$EXTRACTED/i
1821 && lc($file) ne 'dage.txt')
1823 # We don't set this (by the 'if' above) if we have no
1824 # extracted directory, so if running on an early version,
1825 # this test won't work. Not worth worrying about.
1826 $seen_non_extracted_non_age = 1;
1829 # And mark the file as having been processed, and warn if it
1830 # isn't a file we are expecting. As we process the files,
1831 # they are deleted from the hash, so any that remain at the
1832 # end of the program are files that we didn't process.
1833 my $fkey = File::Spec->rel2abs($file);
1834 my $expecting = delete $potential_files{$fkey};
1835 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
1836 Carp::my_carp("Was not expecting '$file'.") if
1838 && ! defined $handle{$addr};
1840 # Open the file, converting the slashes used in this program
1841 # into the proper form for the OS
1843 if (not open $file_handle, "<", $file) {
1844 Carp::my_carp("Can't open $file. Skipping: $!");
1847 $handle{$addr} = $file_handle; # Cache the open file handle
1850 if ($verbosity >= $PROGRESS) {
1851 if ($progress_message{$addr}) {
1852 print "$progress_message{$addr}\n";
1855 # If using a virtual file, say so.
1856 print "Processing ", (-e $file)
1858 : "substitute $file",
1864 # Call any special handler for before the file.
1865 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
1867 # Then the main handler
1868 &{$handler{$addr}}($self);
1870 # Then any special post-file handler.
1871 &{$post_handler{$addr}}($self) if $post_handler{$addr};
1873 # If any errors have been accumulated, output the counts (as the first
1874 # error message in each class was output when it was encountered).
1875 if ($errors{$addr}) {
1878 foreach my $error (keys %{$errors{$addr}}) {
1879 $total += $errors{$addr}->{$error};
1880 delete $errors{$addr}->{$error};
1885 = "A total of $total lines had errors in $file. ";
1887 $message .= ($types == 1)
1888 ? '(Only the first one was displayed.)'
1889 : '(Only the first of each type was displayed.)';
1890 Carp::my_carp($message);
1894 if (@{$missings{$addr}}) {
1895 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
1898 # If a real file handle, close it.
1899 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
1901 $handle{$addr} = ""; # Uses empty to indicate that has already seen
1902 # the file, as opposed to undef
1907 # Sets $_ to be the next logical input line, if any. Returns non-zero
1908 # if such a line exists. 'logical' means that any lines that have
1909 # been added via insert_lines() will be returned in $_ before the file
1913 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1915 my $addr = main::objaddr $self;
1917 # Here the file is open (or if the handle is not a ref, is an open
1918 # 'virtual' file). Get the next line; any inserted lines get priority
1919 # over the file itself.
1923 while (1) { # Loop until find non-comment, non-empty line
1924 #local $to_trace = 1 if main::DEBUG;
1925 my $inserted_ref = shift @{$added_lines{$addr}};
1926 if (defined $inserted_ref) {
1927 ($adjusted, $_) = @{$inserted_ref};
1928 trace $adjusted, $_ if main::DEBUG && $to_trace;
1929 return 1 if $adjusted;
1932 last if ! ref $handle{$addr}; # Don't read unless is real file
1933 last if ! defined ($_ = readline $handle{$addr});
1936 trace $_ if main::DEBUG && $to_trace;
1938 # See if this line is the comment line that defines what property
1939 # value that code points that are not listed in the file should
1940 # have. The format or existence of these lines is not guaranteed
1941 # by Unicode since they are comments, but the documentation says
1942 # that this was added for machine-readability, so probably won't
1943 # change. This works starting in Unicode Version 5.0. They look
1946 # @missing: 0000..10FFFF; Not_Reordered
1947 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
1948 # @missing: 0000..10FFFF; ; NaN
1950 # Save the line for a later get_missings() call.
1951 if (/$missing_defaults_prefix/) {
1952 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
1953 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
1955 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
1956 my @defaults = split /\s* ; \s*/x, $_;
1958 # The first field is the @missing, which ends in a
1959 # semi-colon, so can safely shift.
1962 # Some of these lines may have empty field placeholders
1963 # which get in the way. An example is:
1964 # @missing: 0000..10FFFF; ; NaN
1965 # Remove them. Process starting from the top so the
1966 # splice doesn't affect things still to be looked at.
1967 for (my $i = @defaults - 1; $i >= 0; $i--) {
1968 next if $defaults[$i] ne "";
1969 splice @defaults, $i, 1;
1972 # What's left should be just the property (maybe) and the
1973 # default. Having only one element means it doesn't have
1977 if (@defaults >= 1) {
1978 if (@defaults == 1) {
1979 $default = $defaults[0];
1982 $property = $defaults[0];
1983 $default = $defaults[1];
1989 || ($default =~ /^</
1990 && $default !~ /^<code *point>$/i
1991 && $default !~ /^<none>$/i))
1993 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
1997 # If the property is missing from the line, it should
1998 # be the one for the whole file
1999 $property = $property{$addr} if ! defined $property;
2001 # Change <none> to the null string, which is what it
2002 # really means. If the default is the code point
2003 # itself, set it to <code point>, which is what
2004 # Unicode uses (but sometimes they've forgotten the
2006 if ($default =~ /^<none>$/i) {
2009 elsif ($default =~ /^<code *point>$/i) {
2010 $default = $CODE_POINT;
2013 # Store them as a sub-arrays with both components.
2014 push @{$missings{$addr}}, [ $default, $property ];
2018 # There is nothing for the caller to process on this comment
2023 # Remove comments and trailing space, and skip this line if the
2029 # Call any handlers for this line, and skip further processing of
2030 # the line if the handler sets the line to null.
2031 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2036 # Here the line is ok. return success.
2038 } # End of looping through lines.
2040 # If there is an EOF handler, call it (only once) and if it generates
2041 # more lines to process go back in the loop to handle them.
2042 if ($eof_handler{$addr}) {
2043 &{$eof_handler{$addr}}($self);
2044 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2045 goto LINE if $added_lines{$addr};
2048 # Return failure -- no more lines.
2053 # Not currently used, not fully tested.
2055 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2056 # # record. Not callable from an each_line_handler(), nor does it call
2057 # # an each_line_handler() on the line.
2060 # my $addr = main::objaddr $self;
2062 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
2063 # my ($adjusted, $line) = @{$inserted_ref};
2064 # next if $adjusted;
2066 # # Remove comments and trailing space, and return a non-empty
2069 # $line =~ s/\s+$//;
2070 # return $line if $line ne "";
2073 # return if ! ref $handle{$addr}; # Don't read unless is real file
2074 # while (1) { # Loop until find non-comment, non-empty line
2075 # local $to_trace = 1 if main::DEBUG;
2076 # trace $_ if main::DEBUG && $to_trace;
2077 # return if ! defined (my $line = readline $handle{$addr});
2079 # push @{$added_lines{$addr}}, [ 0, $line ];
2082 # $line =~ s/\s+$//;
2083 # return $line if $line ne "";
2091 # Lines can be inserted so that it looks like they were in the input
2092 # file at the place it was when this routine is called. See also
2093 # insert_adjusted_lines(). Lines inserted via this routine go through
2094 # any each_line_handler()
2098 # Each inserted line is an array, with the first element being 0 to
2099 # indicate that this line hasn't been adjusted, and needs to be
2101 push @{$added_lines{main::objaddr $self}}, map { [ 0, $_ ] } @_;
2105 sub insert_adjusted_lines {
2106 # Lines can be inserted so that it looks like they were in the input
2107 # file at the place it was when this routine is called. See also
2108 # insert_lines(). Lines inserted via this routine are already fully
2109 # adjusted, ready to be processed; the each_line_handler()s will
2110 # not be called. This means this is not a completely general
2111 # facility, as only the last each_line_handler on the stack should
2112 # call this. It could be made more general, by passing to each of the
2113 # line_handlers their position on the stack, which they would pass on
2114 # to this routine, and that would replace the boolean first element in
2115 # the anonymous array pushed here, so that the next_line routine could
2116 # use that to call only those handlers whose index is after it on the
2117 # stack. But this is overkill for what is needed now.
2120 trace $_[0] if main::DEBUG && $to_trace;
2122 # Each inserted line is an array, with the first element being 1 to
2123 # indicate that this line has been adjusted
2124 push @{$added_lines{main::objaddr $self}}, map { [ 1, $_ ] } @_;
2129 # Returns the stored up @missings lines' values, and clears the list.
2130 # The values are in an array, consisting of the default in the first
2131 # element, and the property in the 2nd. However, since these lines
2132 # can be stacked up, the return is an array of all these arrays.
2135 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2137 my $addr = main::objaddr $self;
2139 # If not accepting a list return, just return the first one.
2140 return shift @{$missings{$addr}} unless wantarray;
2142 my @return = @{$missings{$addr}};
2143 undef @{$missings{$addr}};
2147 sub _insert_property_into_line {
2148 # Add a property field to $_, if this file requires it.
2150 my $property = $property{main::objaddr shift};
2151 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2153 $_ =~ s/(;|$)/; $property$1/;
2158 # Output consistent error messages, using either a generic one, or the
2159 # one given by the optional parameter. To avoid gazillions of the
2160 # same message in case the syntax of a file is way off, this routine
2161 # only outputs the first instance of each message, incrementing a
2162 # count so the totals can be output at the end of the file.
2165 my $message = shift;
2166 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2168 my $addr = main::objaddr $self;
2170 $message = 'Unexpected line' unless $message;
2172 # No trailing punctuation so as to fit with our addenda.
2173 $message =~ s/[.:;,]$//;
2175 # If haven't seen this exact message before, output it now. Otherwise
2176 # increment the count of how many times it has occurred
2177 unless ($errors{$addr}->{$message}) {
2178 Carp::my_carp("$message in '$_' in "
2179 . $file{main::objaddr $self}
2180 . " at line $.. Skipping this line;");
2181 $errors{$addr}->{$message} = 1;
2184 $errors{$addr}->{$message}++;
2187 # Clear the line to prevent any further (meaningful) processing of it.
2194 package Multi_Default;
2196 # Certain properties in early versions of Unicode had more than one possible
2197 # default for code points missing from the files. In these cases, one
2198 # default applies to everything left over after all the others are applied,
2199 # and for each of the others, there is a description of which class of code
2200 # points applies to it. This object helps implement this by storing the
2201 # defaults, and for all but that final default, an eval string that generates
2202 # the class that it applies to.
2207 main::setup_package();
2210 # The defaults structure for the classes
2211 main::set_access('class_defaults', \%class_defaults);
2214 # The default that applies to everything left over.
2215 main::set_access('other_default', \%other_default, 'r');
2219 # The constructor is called with default => eval pairs, terminated by
2220 # the left-over default. e.g.
2221 # Multi_Default->new(
2222 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2224 # 'R' => 'some other expression that evaluates to code points',
2232 my $self = bless \do{my $anonymous_scalar}, $class;
2233 my $addr = main::objaddr($self);
2236 my $default = shift;
2238 $class_defaults{$addr}->{$default} = $eval;
2241 $other_default{$addr} = shift;
2246 sub get_next_defaults {
2247 # Iterates and returns the next class of defaults.
2249 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2251 my $addr = main::objaddr $self;
2253 return each %{$class_defaults{$addr}};
2259 # An alias is one of the names that a table goes by. This class defines them
2260 # including some attributes. Everything is currently setup in the
2266 main::setup_package();
2269 main::set_access('name', \%name, 'r');
2272 # Determined by the constructor code if this name should match loosely or
2273 # not. The constructor parameters can override this, but it isn't fully
2274 # implemented, as it should have the ability to override the Unicode ones via
2275 # something like a set_loose_match()
2276 main::set_access('loose_match', \%loose_match, 'r');
2279 # Some aliases should not get their own entries because they are covered
2280 # by a wild-card, and some we want to discourage use of. Binary
2281 main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2284 # Aliases have a status, like deprecated, or even suppressed (which means
2285 # they don't appear in documentation). Enum
2286 main::set_access('status', \%status, 'r');
2289 # Similarly, some aliases should not be considered as usable ones for
2290 # external use, such as file names, or we don't want documentation to
2291 # recommend them. Boolean
2292 main::set_access('externally_ok', \%externally_ok, 'r');
2297 my $self = bless \do { my $anonymous_scalar }, $class;
2298 my $addr = main::objaddr($self);
2300 $name{$addr} = shift;
2301 $loose_match{$addr} = shift;
2302 $make_pod_entry{$addr} = shift;
2303 $externally_ok{$addr} = shift;
2304 $status{$addr} = shift;
2306 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2308 # Null names are never ok externally
2309 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2317 # A range is the basic unit for storing code points, and is described in the
2318 # comments at the beginning of the program. Each range has a starting code
2319 # point; an ending code point (not less than the starting one); a value
2320 # that applies to every code point in between the two end-points, inclusive;
2321 # and an enum type that applies to the value. The type is for the user's
2322 # convenience, and has no meaning here, except that a non-zero type is
2323 # considered to not obey the normal Unicode rules for having standard forms.
2325 # The same structure is used for both map and match tables, even though in the
2326 # latter, the value (and hence type) is irrelevant and could be used as a
2327 # comment. In map tables, the value is what all the code points in the range
2328 # map to. Type 0 values have the standardized version of the value stored as
2329 # well, so as to not have to recalculate it a lot.
2331 sub trace { return main::trace(@_); } # Package-local shorthand: passes all its arguments to main::trace() and returns its result
2335 main::setup_package();
2338 main::set_access('start', \%start, 'r', 's');
2341 main::set_access('end', \%end, 'r', 's');
2344 main::set_access('value', \%value, 'r');
2347 main::set_access('type', \%type, 'r');
2350 # The value in internal standard form. Defined only if the type is 0.
2351 main::set_access('standard_form', \%standard_form);
2353 # Note that if these fields change, the dump() method should as well
2356 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2359 my $self = bless \do { my $anonymous_scalar }, $class;
2360 my $addr = main::objaddr($self);
2362 $start{$addr} = shift;
2363 $end{$addr} = shift;
2367 my $value = delete $args{'Value'}; # Can be 0
2368 $value = "" unless defined $value;
2369 $value{$addr} = $value;
2371 $type{$addr} = delete $args{'Type'} || 0;
2373 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2375 if (! $type{$addr}) {
2376 $standard_form{$addr} = main::standardize($value);
2384 qw("") => "_operator_stringify",
2385 "." => \&main::_operator_dot,
2388 sub _operator_stringify {
2390 my $addr = main::objaddr $self;
2392 # Output it like '0041..0065 (value)'
2393 my $return = sprintf("%04X", $start{$addr})
2395 . sprintf("%04X", $end{$addr});
2396 my $value = $value{$addr};
2397 my $type = $type{$addr};
2399 $return .= "$value";
2400 $return .= ", Type=$type" if $type != 0;
2407 # The standard form is the value itself if the standard form is
2408 # undefined (that is if the value is special)
2411 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2413 my $addr = main::objaddr $self;
2415 return $standard_form{$addr} if defined $standard_form{$addr};
2416 return $value{$addr};
2420 # Human, not machine readable. For machine readable, comment out this
2421 # entire routine and let the standard one take effect.
2424 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2426 my $addr = main::objaddr $self;
2428 my $return = $indent
2429 . sprintf("%04X", $start{$addr})
2431 . sprintf("%04X", $end{$addr})
2432 . " '$value{$addr}';";
2433 if (! defined $standard_form{$addr}) {
2434 $return .= "(type=$type{$addr})";
2436 elsif ($standard_form{$addr} ne $value{$addr}) {
2437 $return .= "(standard '$standard_form{$addr}')";
2443 package _Range_List_Base;
2445 # Base class for range lists. A range list is simply an ordered list of
2446 # ranges, so that the ranges with the lowest starting numbers are first in it.
2448 # When a new range is added that is adjacent to an existing range that has the
2449 # same value and type, it merges with it to form a larger range.
2451 # Ranges generally do not overlap, except that there can be multiple entries
2452 # of single code point ranges. This is because of NameAliases.txt.
2454 # In this program, there is a standard value such that if two different
2455 # values have the same standard value, they are considered equivalent. This
2456 # value was chosen so that it gives correct results on Unicode data
2458 # There are a number of methods to manipulate range lists, and some operators
2459 # are overloaded to handle them.
2461 # Because of the slowness of pure Perl objaddr() on miniperl, and measurements
2462 # showing this package was using a lot of real time calculating that, the code
2463 # was changed to only calculate it once per call stack. This is done by
2464 # consistently using the package variable $addr in routines, and only calling
2465 # objaddr() if it isn't defined, and setting that to be local, so that callees
2466 # will have it already. It would be a good thing to change this. XXX
2468 sub trace { return main::trace(@_); } # Package-local shorthand: passes all its arguments to main::trace() and returns its result
2474 main::setup_package();
2477 # The list of ranges
2478 main::set_access('ranges', \%ranges, 'readable_array');
2481 # The highest code point in the list. This was originally a method, but
2482 # actual measurements said it was used a lot.
2483 main::set_access('max', \%max, 'r');
2485 my %each_range_iterator;
2486 # Iterator position for each_range()
2487 main::set_access('each_range_iterator', \%each_range_iterator);
2490 # Name of parent this is attached to, if any. Solely for better error
2492 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2494 my %_search_ranges_cache;
2495 # A cache of the previous result from _search_ranges(), for better
2497 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2503 # Optional initialization data for the range list.
2504 my $initialize = delete $args{'Initialize'};
2508 # Use _union() to initialize. _union() returns an object of this
2509 # class, which means that it will call this constructor recursively.
2510 # But it won't have this $initialize parameter so that it won't
2511 # infinitely loop on this.
2512 return _union($class, $initialize, %args) if defined $initialize;
2514 $self = bless \do { my $anonymous_scalar }, $class;
2515 local $addr = main::objaddr($self);
2517 # Optional parent object, only for debug info.
2518 $owner_name_of{$addr} = delete $args{'Owner'};
2519 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2521 # Stringify, in case it is an object.
2522 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2524 # This is used only for error messages, and so a colon is added
2525 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2527 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2529 # Max is initialized to a negative value that isn't adjacent to 0,
2533 $_search_ranges_cache{$addr} = 0;
2534 $ranges{$addr} = [];
2541 qw("") => "_operator_stringify",
2542 "." => \&main::_operator_dot,
2545 sub _operator_stringify {
2547 local $addr = main::objaddr($self) if !defined $addr;
2549 return "Range_List attached to '$owner_name_of{$addr}'"
2550 if $owner_name_of{$addr};
2551 return "anonymous Range_List " . \$self;
2555 # Returns the union of the input code points. It can be called as
2556 # either a constructor or a method. If called as a method, the result
2557 # will be a new() instance of the calling object, containing the union
2558 # of that object with the other parameter's code points; if called as
2559 # a constructor, the first parameter gives the class the new object
2560 # should be, and the second parameter gives the code points to go into
2562 # In either case, there are two parameters looked at by this routine;
2563 # any additional parameters are passed to the new() constructor.
2565 # The code points can come in the form of some object that contains
2566 # ranges, and has a conventionally named method to access them; or
2567 # they can be an array of individual code points (as integers); or
2568 # just a single code point.
2570 # If they are ranges, this routine doesn't make any effort to preserve
2571 # the range values of one input over the other. Therefore this base
2572 # class should not allow _union to be called from other than
2573 # initialization code, so as to prevent two tables from being added
2574 # together where the range values matter. The general form of this
2575 # routine therefore belongs in a derived class, but it was moved here
2576 # to avoid duplication of code. The failure to overload this in this
2577 # class keeps it safe.
2581 my @args; # Arguments to pass to the constructor
2585 # If a method call, will start the union with the object itself, and
2586 # the class of the new object will be the same as self.
2593 # Add the other required parameter.
2595 # Rest of parameters are passed on to the constructor
2597 # Accumulate all records from both lists.
2599 for my $arg (@args) {
2600 #local $to_trace = 0 if main::DEBUG;
2601 trace "argument = $arg" if main::DEBUG && $to_trace;
2602 if (! defined $arg) {
2604 if (defined $self) {
2605 $message .= $owner_name_of{main::objaddr $self};
2607 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
2610 $arg = [ $arg ] if ! ref $arg;
2611 my $type = ref $arg;
2612 if ($type eq 'ARRAY') {
2613 foreach my $element (@$arg) {
2614 push @records, Range->new($element, $element);
2617 elsif ($arg->isa('Range')) {
2618 push @records, $arg;
2620 elsif ($arg->can('ranges')) {
2621 push @records, $arg->ranges;
2625 if (defined $self) {
2626 $message .= $owner_name_of{main::objaddr $self};
2628 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
2633 # Sort with the range containing the lowest ordinal first, but if
2634 # two ranges start at the same code point, sort with the bigger range
2635 # of the two first, because it takes fewer cycles.
2636 @records = sort { ($a->start <=> $b->start)
2638 # if b is shorter than a, b->end will be
2639 # less than a->end, and we want to select
2640 # a, so want to return -1
2641 ($b->end <=> $a->end)
2644 my $new = $class->new(@_);
2646 # Fold in records so long as they add new information.
2647 for my $set (@records) {
2648 my $start = $set->start;
2649 my $end = $set->end;
2650 my $value = $set->value;
2651 if ($start > $new->max) {
2652 $new->_add_delete('+', $start, $end, $value);
2654 elsif ($end > $new->max) {
2655 $new->_add_delete('+', $new->max +1, $end, $value);
2662 sub range_count { # Return the number of ranges in the range list
2664 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2666 local $addr = main::objaddr($self) if ! defined $addr;
2668 return scalar @{$ranges{$addr}};
2672 # Returns the minimum code point currently in the range list, or if
2673 # the range list is empty, 2 beyond the max possible. This is a
2674 # method because used so rarely, that not worth saving between calls,
2675 # and having to worry about changing it as ranges are added and
2679 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2681 local $addr = main::objaddr($self) if ! defined $addr;
2683 # If the range list is empty, return a large value that isn't adjacent
2684 # to any that could be in the range list, for simpler tests
2685 return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2686 return $ranges{$addr}->[0]->start;
2690 # Boolean: Is argument in the range list? If so returns $i such that:
2691 # range[$i]->end < $codepoint <= range[$i+1]->end
2692 # which is one beyond what you want; this is so that the 0th range
2693 # doesn't return false
2695 my $codepoint = shift;
2696 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2698 local $addr = main::objaddr $self if ! defined $addr;
2700 my $i = $self->_search_ranges($codepoint);
2701 return 0 unless defined $i;
2703 # The search returns $i, such that
2704 # range[$i-1]->end < $codepoint <= range[$i]->end
2705 # So is in the table if and only if it is at least the start position
2707 return 0 if $ranges{$addr}->[$i]->start > $codepoint;
2712 # Returns the value associated with the code point, undef if none
2715 my $codepoint = shift;
2716 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2718 local $addr = main::objaddr $self if ! defined $addr;
2720 my $i = $self->contains($codepoint);
2723 # contains() returns 1 beyond where we should look
2724 return $ranges{$addr}->[$i-1]->value;
2727 sub _search_ranges {
2728 # Find the range in the list which contains a code point, or where it
2729 # should go if were to add it. That is, it returns $i, such that:
2730 # range[$i-1]->end < $codepoint <= range[$i]->end
2731 # Returns undef if no such $i is possible (e.g. at end of table), or
2732 # if there is an error.
2735 my $code_point = shift;
2736 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2738 local $addr = main::objaddr $self if ! defined $addr;
2740 return if $code_point > $max{$addr};
2741 my $r = $ranges{$addr}; # The current list of ranges
2742 my $range_list_size = scalar @$r;
2745 use integer; # want integer division
2747 # Use the cached result as the starting guess for this one, because,
2748 # an experiment on 5.1 showed that 90% of the time the cache was the
2749 # same as the result on the next call (and 7% it was one less).
2750 $i = $_search_ranges_cache{$addr};
2751 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
2752 # from an intervening deletion
2753 #local $to_trace = 1 if main::DEBUG;
2754 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
2755 return $i if $code_point <= $r->[$i]->end
2756 && ($i == 0 || $r->[$i-1]->end < $code_point);
2758 # Here the cache doesn't yield the correct $i. Try adding 1.
2759 if ($i < $range_list_size - 1
2760 && $r->[$i]->end < $code_point &&
2761 $code_point <= $r->[$i+1]->end)
2764 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
2765 $_search_ranges_cache{$addr} = $i;
2769 # Here, adding 1 also didn't work. We do a binary search to
2770 # find the correct position, starting with current $i
2772 my $upper = $range_list_size - 1;
2774 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
2776 if ($code_point <= $r->[$i]->end) {
2778 # Here we have met the upper constraint. We can quit if we
2779 # also meet the lower one.
2780 last if $i == 0 || $r->[$i-1]->end < $code_point;
2782 $upper = $i; # Still too high.
2787 # Here, $r[$i]->end < $code_point, so look higher up.
2791 # Split search domain in half to try again.
2792 my $temp = ($upper + $lower) / 2;
2794 # No point in continuing unless $i changes for next time
2798 # We can't reach the highest element because of the averaging.
2799 # So if one below the upper edge, force it there and try one
2801 if ($i == $range_list_size - 2) {
2803 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
2804 $i = $range_list_size - 1;
2806 # Change $lower as well so if fails next time through,
2807 # taking the average will yield the same $i, and we will
2808 # quit with the error message just below.
2812 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
2816 } # End of while loop
2818 if (main::DEBUG && $to_trace) {
2819 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
2820 trace "i= [ $i ]", $r->[$i];
2821 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
2824 # Here we have found the offset. Cache it as a starting point for the
2826 $_search_ranges_cache{$addr} = $i;
2831 # Add, replace or delete ranges to or from a list. The $type
2832 # parameter gives which:
2833 # '+' => insert or replace a range, returning a list of any changed
2835 # '-' => delete a range, returning a list of any deleted ranges.
2837 # The next three parameters give respectively the start, end, and
2838 # value associated with the range. 'value' should be null unless the
2841 # The range list is kept sorted so that the range with the lowest
2842 # starting position is first in the list, and generally, adjacent
2843 # ranges with the same values are merged into single larger one (see
2844 # exceptions below).
2846 # There are more parameters, all are key => value pairs:
2847 # Type gives the type of the value. It is only valid for '+'.
2848 # All ranges have types; if this parameter is omitted, 0 is
2849 # assumed. Ranges with type 0 are assumed to obey the
2850 # Unicode rules for casing, etc; ranges with other types are
2851 # not. Otherwise, the type is arbitrary, for the caller's
2852 # convenience, and looked at only by this routine to keep
2853 # adjacent ranges of different types from being merged into
2854 # a single larger range, and when Replace =>
2855 # $IF_NOT_EQUIVALENT is specified (see just below).
2856 # Replace determines what to do if the range list already contains
2857 # ranges which coincide with all or portions of the input
2858 # range. It is only valid for '+':
2859 # => $NO means that the new value is not to replace
2860 # any existing ones, but any empty gaps of the
2861 # range list coinciding with the input range
2862 # will be filled in with the new value.
2863 # => $UNCONDITIONALLY means to replace the existing values with
2864 # this one unconditionally. However, if the
2865 # new and old values are identical, the
2866 # replacement is skipped to save cycles
2867 # => $IF_NOT_EQUIVALENT means to replace the existing values
2868 # with this one if they are not equivalent.
2869 # Ranges are equivalent if their types are the
2870 # same, and they are the same string, or if
2871 # both are type 0 ranges, if their Unicode
2872 # standard forms are identical. In this last
2873 # case, the routine chooses the more "modern"
2874 # one to use. This is because some of the
2875 # older files are formatted with values that
2876 # are, for example, ALL CAPs, whereas the
2877 # derived files have a more modern style,
2878 # which looks better. By looking for this
2879 # style when the pre-existing and replacement
2880 # standard forms are the same, we can move to
2882 # => $MULTIPLE means that if this range duplicates an
2883 # existing one, but has a different value,
2884 # don't replace the existing one, but insert
2885 # this one, so that the same range can occur
2887 # => anything else is the same as => $IF_NOT_EQUIVALENT
2889 # "same value" means identical for non-type-0 ranges, and it means having
2890 # the same standard forms for type-0 ranges.
2892 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
2895 my $operation = shift; # '+' for add/replace; '-' for delete;
2902 $value = "" if not defined $value; # warning: $value can be "0"
2904 my $replace = delete $args{'Replace'};
2905 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
2907 my $type = delete $args{'Type'};
2908 $type = 0 unless defined $type;
2910 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2912 local $addr = main::objaddr($self) if ! defined $addr;
2914 if ($operation ne '+' && $operation ne '-') {
2915 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
2918 unless (defined $start && defined $end) {
2919 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
2922 unless ($end >= $start) {
2923 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
2926 #local $to_trace = 1 if main::DEBUG;
2928 if ($operation eq '-') {
2929 if ($replace != $IF_NOT_EQUIVALENT) {
2930 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
2931 $replace = $IF_NOT_EQUIVALENT;
2934 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
2938 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
2943 my $r = $ranges{$addr}; # The current list of ranges
2944 my $range_list_size = scalar @$r; # And its size
2945 my $max = $max{$addr}; # The current high code point in
2946 # the list of ranges
2948 # Do a special case requiring fewer machine cycles when the new range
2949 # starts after the current highest point. The Unicode input data is
2950 # structured so this is common.
2951 if ($start > $max) {
2953 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
2954 return if $operation eq '-'; # Deleting a non-existing range is a
2957 # If the new range doesn't logically extend the current final one
2958 # in the range list, create a new range at the end of the range
2959 # list. (max cleverly is initialized to a negative number not
2960 # adjacent to 0 if the range list is empty, so even adding a range
2961 # to an empty range list starting at 0 will have this 'if'
2963 if ($start > $max + 1 # non-adjacent means can't extend.
2964 || @{$r}[-1]->value ne $value # values differ, can't extend.
2965 || @{$r}[-1]->type != $type # types differ, can't extend.
2967 push @$r, Range->new($start, $end,
2973 # Here, the new range starts just after the current highest in
2974 # the range list, and they have the same type and value.
2975 # Extend the current range to incorporate the new one.
2976 @{$r}[-1]->set_end($end);
2979 # This becomes the new maximum.
2984 #local $to_trace = 0 if main::DEBUG;
2986 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
2988 # Here, the input range isn't after the whole rest of the range list.
2989 # Most likely 'splice' will be needed. The rest of the routine finds
2990 # the needed splice parameters, and if necessary, does the splice.
2991 # First, find the offset parameter needed by the splice function for
2992 # the input range. Note that the input range may span multiple
2993 # existing ones, but we'll worry about that later. For now, just find
2994 # the beginning. If the input range is to be inserted starting in a
2995 # position not currently in the range list, it must (obviously) come
2996 # just after the range below it, and just before the range above it.
2997 # Slightly less obviously, it will occupy the position currently
2998 # occupied by the range that is to come after it. More formally, we
2999 # are looking for the position, $i, in the array of ranges, such that:
3001 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3003 # (The ordered relationships within existing ranges are also shown in
3004 # the equation above). However, if the start of the input range is
3005 # within an existing range, the splice offset should point to that
3006 # existing range's position in the list; that is $i satisfies a
3007 # somewhat different equation, namely:
3009 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3011 # More briefly, $start can come before or after r[$i]->start, and at
3012 # this point, we don't know which it will be. However, these
3013 # two equations share these constraints:
3015 # r[$i-1]->end < $start <= r[$i]->end
3017 # And that is good enough to find $i.
3019 my $i = $self->_search_ranges($start);
3021 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3025 # The search function returns $i such that:
3027 # r[$i-1]->end < $start <= r[$i]->end
3029 # That means that $i points to the first range in the range list
3030 # that could possibly be affected by this operation. We still don't
3031 # know if the start of the input range is within r[$i], or if it
3032 # points to empty space between r[$i-1] and r[$i].
3033 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3035 # Special case the insertion of data that is not to replace any
3037 if ($replace == $NO) { # If $NO, has to be operation '+'
3038 #local $to_trace = 1 if main::DEBUG;
3039 trace "Doesn't replace" if main::DEBUG && $to_trace;
3041 # Here, the new range is to take effect only on those code points
3042 # that aren't already in an existing range. This can be done by
3043 # looking through the existing range list and finding the gaps in
3044 # the ranges that this new range affects, and then calling this
3045 # function recursively on each of those gaps, leaving untouched
3046 # anything already in the list. Gather up a list of the changed
3047 # gaps first so that changes to the internal state as new ranges
3048 # are added won't be a problem.
3051 # First, if the starting point of the input range is outside an
3052 # existing one, there is a gap from there to the beginning of the
3053 # existing range -- add a span to fill the part that this new
3055 if ($start < $r->[$i]->start) {
3056 push @gap_list, Range->new($start,
3058 $r->[$i]->start - 1),
3060 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3063 # Then look through the range list for other gaps until we reach
3064 # the highest range affected by the input one.
3066 for ($j = $i+1; $j < $range_list_size; $j++) {
3067 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3068 last if $end < $r->[$j]->start;
3070 # If there is a gap between when this range starts and the
3071 # previous one ends, add a span to fill it. Note that just
3072 # because there are two ranges doesn't mean there is a
3073 # non-zero gap between them. It could be that they have
3074 # different values or types
3075 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3077 Range->new($r->[$j-1]->end + 1,
3078 $r->[$j]->start - 1,
3080 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3084 # Here, we have either found an existing range in the range list,
3085 # beyond the area affected by the input one, or we fell off the
3086 # end of the loop because the input range affects the whole rest
3087 # of the range list. In either case, $j is 1 higher than the
3088 # highest affected range. If $j == $i, it means that there are no
3089 # affected ranges, that the entire insertion is in the gap between
3090 # r[$i-1], and r[$i], which we already have taken care of before
3092 # On the other hand, if there are affected ranges, it might be
3093 # that there is a gap that needs filling after the final such
3094 # range to the end of the input range
3095 if ($r->[$j-1]->end < $end) {
3096 push @gap_list, Range->new(main::max($start,
3097 $r->[$j-1]->end + 1),
3100 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3103 # Call recursively to fill in all the gaps.
3104 foreach my $gap (@gap_list) {
3105 $self->_add_delete($operation,
3115 # Here, we have taken care of the case where $replace is $NO, which
3116 # means that whatever action we now take is done unconditionally. It
3117 # still could be that this call will result in a no-op, if duplicates
3118 # aren't allowed, and we are inserting a range that merely duplicates
3119 # data already in the range list; or also if deleting a non-existent
3121 # $i still points to the first potential affected range. Now find the
3122 # highest range affected, which will determine the length parameter to
3123 # splice. (The input range can span multiple existing ones.) While
3124 # we are looking through the range list, see also if this is an
3125 # insertion that will change the values of at least one of the
3126 # affected ranges. We don't need to do this check unless this is an
3127 # insertion of non-multiples, and also since this is a boolean, we
3128 # don't need to do it if have already determined that it will make a
3129 # change; just unconditionally change them. $cdm is created to be 1
3130 # if either of these is true. (The 'c' in the name comes from below)
3131 my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
3132 my $j; # This will point to the highest affected range
3134 # For non-zero types, the standard form is the value itself;
3135 my $standard_form = ($type) ? $value : main::standardize($value);
3137 for ($j = $i; $j < $range_list_size; $j++) {
3138 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3140 # If find a range that it doesn't overlap into, we can stop
3142 last if $end < $r->[$j]->start;
3144 # Here, overlaps the range at $j. If the values don't match,
3145 # and this is supposedly an insertion, it becomes a change
3146 # instead. This is what the 'c' stands for in $cdm.
3148 if ($r->[$j]->standard_form ne $standard_form) {
3153 # Here, the two values are essentially the same. If the
3154 # two are actually identical, replacing wouldn't change
3155 # anything so skip it.
3156 my $pre_existing = $r->[$j]->value;
3157 if ($pre_existing ne $value) {
3159 # Here the new and old standardized values are the
3160 # same, but the non-standardized values aren't. If
3161 # replacing unconditionally, then replace
3162 if( $replace == $UNCONDITIONALLY) {
3167 # Here, are replacing conditionally. Decide to
3168 # replace or not based on which appears to look
3169 # the "nicest". If one is mixed case and the
3170 # other isn't, choose the mixed case one.
3171 my $new_mixed = $value =~ /[A-Z]/
3172 && $value =~ /[a-z]/;
3173 my $old_mixed = $pre_existing =~ /[A-Z]/
3174 && $pre_existing =~ /[a-z]/;
3176 if ($old_mixed != $new_mixed) {
3177 $cdm = 1 if $new_mixed;
3178 if (main::DEBUG && $to_trace) {
3180 trace "Replacing $pre_existing with $value";
3183 trace "Retaining $pre_existing over $value";
3189 # Here casing wasn't different between the two.
3190 # If one has hyphens or underscores and the
3191 # other doesn't, choose the one with the
3193 my $new_punct = $value =~ /[-_]/;
3194 my $old_punct = $pre_existing =~ /[-_]/;
3196 if ($old_punct != $new_punct) {
3197 $cdm = 1 if $new_punct;
3198 if (main::DEBUG && $to_trace) {
3200 trace "Replacing $pre_existing with $value";
3203 trace "Retaining $pre_existing over $value";
3206 } # else existing one is just as "good";
3207 # retain it to save cycles.
3213 } # End of loop looking for highest affected range.
3215 # Here, $j points to one beyond the highest range that this insertion
3216 # affects (hence to beyond the range list if that range is the final
3217 # one in the range list).
3219 # The splice length is all the affected ranges. Get it before
3220 # subtracting, for efficiency, so we don't have to later add 1.
3221 my $length = $j - $i;
3223 $j--; # $j now points to the highest affected range.
3224 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3226 # If inserting a multiple record, this is where it goes, after all the
3227 # existing ones for this range. This implies an insertion, and no
3228 # change to any existing ranges. Note that $j can be -1 if this new
3229 # range doesn't actually duplicate any existing, and comes at the
3230 # beginning of the list, in which case we can handle it like any other
3231 # insertion, and is easier to do so.
3232 if ($replace == $MULTIPLE && $j >= 0) {
3234 # This restriction could be remedied with a little extra work, but
3235 # it won't hopefully ever be necessary
3236 if ($r->[$j]->start != $r->[$j]->end) {
3237 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point. No action taken.");
3241 # Don't add an exact duplicate, as it isn't really a multiple
3242 return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
3244 trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
3245 my @return = splice @$r,
3252 if (main::DEBUG && $to_trace) {
3253 trace "After splice:";
3254 trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
3255 trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
3256 trace "j =[", $j, "]", $r->[$j] if $j >= 0;
3257 trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
3258 trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
3259 trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
3264 # Here, have taken care of $NO and $MULTIPLE replaces.
3265 # $j points to the highest affected range. But it can be < $i or even
3266 # -1. These happen only if the insertion is entirely in the gap
3267 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3268 # above exited first time through with $end < $r->[$i]->start. (And
3269 # then we subtracted one from j) This implies also that $start <
3270 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3271 # $start, so the entire input range is in the gap.
3274 # Here the entire input range is in the gap before $i.
3276 if (main::DEBUG && $to_trace) {
3278 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3281 trace "Entire range is before $r->[$i]";
3284 return if $operation ne '+'; # Deletion of a non-existent range is
3289 # Here the entire input range is not in the gap before $i. There
3290 # is an affected one, and $j points to the highest such one.
3292 # At this point, here is the situation:
3293 # This is not an insertion of a multiple, nor of tentative ($NO)
3295 # $i points to the first element in the current range list that
3296 # may be affected by this operation. In fact, we know
3297 # that the range at $i is affected because we are in
3298 # the else branch of this 'if'
3299 # $j points to the highest affected range.
3301 # r[$i-1]->end < $start <= r[$i]->end
3303 # r[$i-1]->end < $start <= $end <= r[$j]->end
3306 # $cdm is a boolean which is set true if and only if this is a
3307 # change or deletion (multiple was handled above). In
3308 # other words, it could be renamed to be just $cd.
3310 # We now have enough information to decide if this call is a no-op
3311 # or not. It is a no-op if it is a deletion of a non-existent
3312 # range, or an insertion of already existing data.
3314 if (main::DEBUG && $to_trace && ! $cdm
3316 && $start >= $r->[$i]->start)
3320 return if ! $cdm # change or delete => not no-op
3321 && $i == $j # more than one affected range => not no-op
3323 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3324 # Further, $start and/or $end is >= r[$i]->start
3325 # The test below hence guarantees that
3326 # r[$i]->start <= $start <= $end <= r[$i]->end
3327 # This means the input range is contained entirely in
3328 # the one at $i, so is a no-op
3329 && $start >= $r->[$i]->start;
3332 # Here, we know that some action will have to be taken. We have
3333 # calculated the offset and length (though adjustments may be needed)
3334 # for the splice. Now start constructing the replacement list.
3336 my $splice_start = $i;
3341 # See if should extend any adjacent ranges.
3342 if ($operation eq '-') { # Don't extend deletions
3343 $extends_below = $extends_above = 0;
3345 else { # Here, should extend any adjacent ranges. See if there are
3347 $extends_below = ($i > 0
3348 # can't extend unless adjacent
3349 && $r->[$i-1]->end == $start -1
3350 # can't extend unless are same standard value
3351 && $r->[$i-1]->standard_form eq $standard_form
3352 # can't extend unless share type
3353 && $r->[$i-1]->type == $type);
3354 $extends_above = ($j+1 < $range_list_size  # must be a range above $j
3355 && $r->[$j+1]->start == $end +1  # can't extend unless adjacent
3356 && $r->[$j+1]->standard_form eq $standard_form  # can't extend unless are same standard value
3357 && $r->[$j+1]->type == $type);  # can't extend unless share type; test the range above, r[$j+1], as the other conditions do
3359 if ($extends_below && $extends_above) { # Adds to both
3360 $splice_start--; # start replace at element below
3361 $length += 2; # will replace on both sides
3362 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3364 # The result will fill in any gap, replacing both sides, and
3365 # create one large range.
3366 @replacement = Range->new($r->[$i-1]->start,
3373 # Here we know that the result won't just be the conglomeration of
3374 # a new range with both its adjacent neighbors. But it could
3375 # extend one of them.
3377 if ($extends_below) {
3379 # Here the new element adds to the one below, but not to the
3380 # one above. If inserting, and only to that one range, can
3381 # just change its ending to include the new one.
3382 if ($length == 0 && ! $cdm) {
3383 $r->[$i-1]->set_end($end);
3384 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3388 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3389 $splice_start--; # start replace at element below
3390 $length++; # will replace the element below
3391 $start = $r->[$i-1]->start;
3394 elsif ($extends_above) {
3396 # Here the new element adds to the one above, but not below.
3397 # Mirror the code above
3398 if ($length == 0 && ! $cdm) {
3399 $r->[$j+1]->set_start($start);
3400 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3404 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3405 $length++; # will replace the element above
3406 $end = $r->[$j+1]->end;
3410 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3412 # Finally, here we know there will have to be a splice.
3413 # If the change or delete affects only the highest portion of the
3414 # first affected range, the range will have to be split. The
3415 # splice will remove the whole range, but will replace it by a new
3416 # range containing just the unaffected part. So, in this case,
3417 # add to the replacement list just this unaffected portion.
3418 if (! $extends_below
3419 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3422 Range->new($r->[$i]->start,
3424 Value => $r->[$i]->value,
3425 Type => $r->[$i]->type);
3428 # In the case of an insert or change, but not a delete, we have to
3429 # put in the new stuff; this comes next.
3430 if ($operation eq '+') {
3431 push @replacement, Range->new($start,
3437 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3438 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3440 # And finally, if we're changing or deleting only a portion of the
3441 # highest affected range, it must be split, as the lowest one was.
3442 if (! $extends_above
3443 && $j >= 0 # Remember that j can be -1 if before first
3445 && $end >= $r->[$j]->start
3446 && $end < $r->[$j]->end)
3449 Range->new($end + 1,
3451 Value => $r->[$j]->value,
3452 Type => $r->[$j]->type);
3456 # And do the splice, as calculated above
3457 if (main::DEBUG && $to_trace) {
3458 trace "replacing $length element(s) at $i with ";
3459 foreach my $replacement (@replacement) {
3460 trace " $replacement";
3462 trace "Before splice:";
3463 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3464 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3465 trace "i =[", $i, "]", $r->[$i];
3466 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3467 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3470 my @return = splice @$r, $splice_start, $length, @replacement;
3472 if (main::DEBUG && $to_trace) {
3473 trace "After splice:";
3474 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3475 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3476 trace "i =[", $i, "]", $r->[$i];
3477 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3478 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3479 trace "removed @return";
3482 # An actual deletion could have changed the maximum in the list.
3483 # There was no deletion if the splice didn't return something, but
3484 # otherwise recalculate it. This is done too rarely to worry about
3486 if ($operation eq '-' && @return) {
3487 $max{$addr} = $r->[-1]->end;
3492 sub reset_each_range { # reset the iterator for each_range();
3494 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3496 local $addr = main::objaddr $self if ! defined $addr;
3498 undef $each_range_iterator{$addr};
3503 # Iterate over each range in a range list. Results are undefined if
3504 # the range list is changed during the iteration.
3507 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3509 local $addr = main::objaddr($self) if ! defined $addr;
3511 return if $self->is_empty;
3513 $each_range_iterator{$addr} = -1
3514 if ! defined $each_range_iterator{$addr};
3515 $each_range_iterator{$addr}++;
3516 return $ranges{$addr}->[$each_range_iterator{$addr}]
3517 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3518 undef $each_range_iterator{$addr};
3522 sub count { # Returns count of code points in range list
3524 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3526 local $addr = main::objaddr($self) if ! defined $addr;
3529 foreach my $range (@{$ranges{$addr}}) {
3530 $count += $range->end - $range->start + 1;
3535 sub delete_range { # Delete a range
3540 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3542 return $self->_add_delete('-', $start, $end, "");
3545 sub is_empty { # Returns boolean as to if a range list is empty
3547 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3549 local $addr = main::objaddr($self) if ! defined $addr;
3550 return scalar @{$ranges{$addr}} == 0;
3554 # Quickly returns a scalar suitable for separating tables into
3555 # buckets, i.e. it is a hash function of the contents of a table, so
3556 # there are relatively few conflicts.
3559 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3561 local $addr = main::objaddr($self) if ! defined $addr;
3563 # These are quickly computable. Return looks like 'min..max;count'
3564 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3566 } # End closure for _Range_List_Base
3569 use base '_Range_List_Base';
3571 # A Range_List is a range list for match tables; i.e. the range values are
3572 # not significant. Thus a number of operations can be safely added to it,
3573 # such as inversion, intersection. Note that union is also an unsafe
3574 # operation when range values are cared about, and that method is in the base
3575 # class, not here. But things are set up so that that method is callable only
3576 # during initialization. Only in this derived class, is there an operation
3577 # that combines two tables. A Range_Map can thus be used to initialize a
3578 # Range_List, and its mappings will be in the list, but are not significant to
3581 sub trace { return main::trace(@_); }
3587 '+' => sub { my $self = shift;
3590 return $self->_union($other)
3592 '&' => sub { my $self = shift;
3595 return $self->_intersect($other, 0);
3602 # Returns a new Range_List that gives all code points not in $self.
3606 my $new = Range_List->new;
3608 # Go through each range in the table, finding the gaps between them
3609 my $max = -1; # Set so no gap before range beginning at 0
3610 for my $range ($self->ranges) {
3611 my $start = $range->start;
3612 my $end = $range->end;
3614 # If there is a gap before this range, the inverse will contain
3616 if ($start > $max + 1) {
3617 $new->add_range($max + 1, $start - 1);
3622 # And finally, add the gap from the end of the table to the max
3623 # possible code point
3624 if ($max < $LAST_UNICODE_CODEPOINT) {
3625 $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3631 # Returns a new Range_List with the argument deleted from it. The
3632 # argument can be a single code point, a range, or something that has
3633 # a range, with the _range_list() method on it returning them
3637 my $reversed = shift;
3638 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3641 Carp::my_carp_bug("Can't cope with a "
3643 . " being the second parameter in a '-'. Subtraction ignored.");
3647 my $new = Range_List->new(Initialize => $self);
3649 if (! ref $other) { # Single code point
3650 $new->delete_range($other, $other);
3652 elsif ($other->isa('Range')) {
3653 $new->delete_range($other->start, $other->end);
3655 elsif ($other->can('_range_list')) {
3656 foreach my $range ($other->_range_list->ranges) {
3657 $new->delete_range($range->start, $range->end);
3661 Carp::my_carp_bug("Can't cope with a "
3663 . " argument to '-'. Subtraction ignored."
3672 # Returns either a boolean giving whether the two inputs' range lists
3673 # intersect (overlap), or a new Range_List containing the intersection
3674 # of the two lists. The optional final parameter being true indicates
3675 # to do the check instead of the intersection.
3677 my $a_object = shift;
3678 my $b_object = shift;
3679 my $check_if_overlapping = shift;
3680 $check_if_overlapping = 0 unless defined $check_if_overlapping;
3681 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3683 if (! defined $b_object) {
3685 $message .= $a_object->_owner_name_of if defined $a_object;
3686 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
3690 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + ~b )
3691 # Thus the intersection could much more simply be written:
3692 # return ~(~$a_object + ~$b_object);
3693 # But, this is slower, and when taking the inverse of a large
3694 # range_size_1 table, back when such tables were always stored that
3695 # way, it became prohibitively slow, hence the code was changed to the
3698 if ($b_object->isa('Range')) {
3699 $b_object = Range_List->new(Initialize => $b_object,
3700 Owner => $a_object->_owner_name_of);
3702 $b_object = $b_object->_range_list if $b_object->can('_range_list');
3704 my @a_ranges = $a_object->ranges;
3705 my @b_ranges = $b_object->ranges;
3707 #local $to_trace = 1 if main::DEBUG;
3708 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
3710 # Start with the first range in each list
3712 my $range_a = $a_ranges[$a_i];
3714 my $range_b = $b_ranges[$b_i];
3716 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
3717 if ! $check_if_overlapping;
3719 # If either list is empty, there is no intersection and no overlap
3720 if (! defined $range_a || ! defined $range_b) {
3721 return $check_if_overlapping ? 0 : $new;
3723 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3725 # Otherwise, must calculate the intersection/overlap. Start with the
3726 # very first code point in each list
3727 my $a = $range_a->start;
3728 my $b = $range_b->start;
3730 # Loop through all the ranges of each list; in each iteration, $a and
3731 # $b are the current code points in their respective lists
3734 # If $a and $b are the same code point, ...
3737 # it means the lists overlap. If just checking for overlap, we
3738 # know the answer now.
3739 return 1 if $check_if_overlapping;
3741 # The intersection includes this code point plus anything else
3742 # common to both current ranges.
3744 my $end = main::min($range_a->end, $range_b->end);
3745 if (! $check_if_overlapping) {
3746 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
3747 $new->add_range($start, $end);
3750 # Skip ahead to the end of the current intersect
3753 # If the current intersect ends at the end of either range (as
3754 # it must for at least one of them), the next possible one
3755 # will be the beginning code point in its list's next range.
3756 if ($a == $range_a->end) {
3757 $range_a = $a_ranges[++$a_i];
3758 last unless defined $range_a;
3759 $a = $range_a->start;
3761 if ($b == $range_b->end) {
3762 $range_b = $b_ranges[++$b_i];
3763 last unless defined $range_b;
3764 $b = $range_b->start;
3767 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3771 # Not equal, but if the range containing $a encompasses $b,
3772 # change $a to be the middle of the range where it does equal
3773 # $b, so the next iteration will get the intersection
3774 if ($range_a->end >= $b) {
3779 # Here, the current range containing $a is entirely below
3780 # $b. Go try to find a range that could contain $b.
3781 $a_i = $a_object->_search_ranges($b);
3783 # If no range found, quit.
3784 last unless defined $a_i;
3786 # The search returns $a_i, such that
3787 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
3788 # Set $a to the beginning of this new range, and repeat.
3789 $range_a = $a_ranges[$a_i];
3790 $a = $range_a->start;
3793 else { # Here, $b < $a.
3795 # Mirror image code to the leg just above
3796 if ($range_b->end >= $a) {
3800 $b_i = $b_object->_search_ranges($a);
3801 last unless defined $b_i;
3802 $range_b = $b_ranges[$b_i];
3803 $b = $range_b->start;
3806 } # End of looping through ranges.
3808 # Intersection fully computed, or now know that there is no overlap
3809 return $check_if_overlapping ? 0 : $new;
3813 # Returns boolean giving whether the two arguments overlap somewhere
3817 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3819 return $self->_intersect($other, 1);
3823 # Add a range to the list.
3828 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3830 return $self->_add_delete('+', $start, $end, "");
3833 my $non_ASCII = (ord('A') != 65); # Assumes test on same platform
3835 sub is_code_point_usable {
3836 # This is used only for making the test script. See if the input
3837 # proposed trial code point is one that Perl will handle. If second
3838 # parameter is 0, it won't select some code points for various
3839 # reasons, noted below.
3842 my $try_hard = shift;
3843 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3845 return 0 if $code < 0; # Never use a negative
3847 # For non-ASCII, we shun the characters that don't have Perl encoding-
3848 # independent symbols for them. 'A' is such a symbol, so is "\n".
3849 # Note, this program hopefully will work on 5.8 Perls, and \v is not
3850 # such a symbol in them.
3851 return $try_hard if $non_ASCII
3854 || ($code >= 0x0E && $code <= 0x1F)
3855 || ($code >= 0x01 && $code <= 0x06)
3856 || $code == 0x0B); # \v introduced after 5.8
3858 # shun null. I'm (khw) not sure why this was done, but NULL would be
3859 # the character very frequently used.
3860 return $try_hard if $code == 0x0000;
3862 return 0 if $try_hard; # XXX Temporary until fix utf8.c
3864 # shun non-character code points.
3865 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
3866 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
3868 return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range
3869 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
3874 sub get_valid_code_point {
3875 # Return a code point that's part of the range list. Returns nothing
3876 # if the table is empty or we can't find a suitable code point. This
3877 # is used only for making the test script.
3880 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3882 my $addr = main::objaddr($self);
3884 # On first pass, don't choose less desirable code points; if no good
3885 # one is found, repeat, allowing a less desirable one to be selected.
3886 for my $try_hard (0, 1) {
3888 # Look through all the ranges for a usable code point.
3889 for my $set ($self->ranges) {
3891 # Try the edge cases first, starting with the end point of the
3893 my $end = $set->end;
3894 return $end if is_code_point_usable($end, $try_hard);
3896 # End point didn't work. Start at the beginning and try
3897 # every one until find one that does work.
3898 for my $trial ($set->start .. $end - 1) {
3899 return $trial if is_code_point_usable($trial, $try_hard);
3903 return (); # If none found, give up.
3906 sub get_invalid_code_point {
3907 # Return a code point that's not part of the table. Returns nothing
3908 # if the table covers all code points or a suitable code point can't
3909 # be found. This is used only for making the test script.
3912 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3914 # Just find a valid code point of the inverse, if any.
3915 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
3917 } # end closure for Range_List
3920 use base '_Range_List_Base';
3922 # A Range_Map is a range list in which the range values (called maps) are
3923 # significant, and hence shouldn't be manipulated by our other code, which
3924 # could be ambiguous or lose things. For example, in taking the union of two
3925 # lists, which share code points, but which have differing values, which one
3926 # has precedence in the union?
3927 # It turns out that these operations aren't really necessary for map tables,
3928 # and so this class was created to make sure they aren't accidentally
3934 # Add a range containing a mapping value to the list
3937 # Rest of parameters passed on
3939 return $self->_add_delete('+', @_);
3943 # Adds entry to a range list which can duplicate an existing entry
3946 my $code_point = shift;
3948 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3950 return $self->add_map($code_point, $code_point,
3951 $value, Replace => $MULTIPLE);
3953 } # End of closure for package Range_Map
3955 package _Base_Table;
3957 # A table is the basic data structure that gets written out into a file for
3958 # use by the Perl core. This is the abstract base class implementing the
3959 # common elements from the derived ones. A list of the methods to be
3960 # furnished by an implementing class is just after the constructor.
3962 sub standardize { return main::standardize($_[0]); } # Package-local shorthand: passes only its first argument to main::standardize()
3963 sub trace { return main::trace(@_); } # Package-local shorthand: passes all arguments to main::trace() and returns its result
3967 main::setup_package();
3970 # Object containing the ranges of the table.
3971 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
3974 # The full table name.
3975 main::set_access('full_name', \%full_name, 'r');
3978 # The table name, almost always shorter
3979 main::set_access('name', \%name, 'r');
3982 # The shortest of all the aliases for this table, with underscores removed
3983 main::set_access('short_name', \%short_name);
3985 my %nominal_short_name_length;
3986 # The length of short_name before removing underscores
3987 main::set_access('nominal_short_name_length',
3988 \%nominal_short_name_length);
3991 # The complete name, including property.
3992 main::set_access('complete_name', \%complete_name, 'r');
3995 # Parent property this table is attached to.
3996 main::set_access('property', \%property, 'r');
3999 # Ordered list of aliases of the table's name. The first ones in the list
4000 # are output first in comments
4001 main::set_access('aliases', \%aliases, 'readable_array');
4004 # A comment associated with the table for human readers of the files
4005 main::set_access('comment', \%comment, 's');
4008 # A comment giving a short description of the table's meaning for human
4009 # readers of the files.
4010 main::set_access('description', \%description, 'readable_array');
4013 # A comment giving a short note about the table for human readers of the
4015 main::set_access('note', \%note, 'readable_array');
4018 # Boolean; if set means any file that contains this table is marked as for
4019 # internal-only use.
4020 main::set_access('internal_only', \%internal_only);
4022 my %find_table_from_alias;
4023 # The parent property passes this pointer to a hash which this class adds
4024 # all its aliases to, so that the parent can quickly take an alias and
4026 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4029 # After this table is made equivalent to another one, we shouldn't go
4030 # changing the contents because that could mean it's no longer equivalent
4031 main::set_access('locked', \%locked, 'r');
4034 # This gives the final path to the file containing the table. Each
4035 # directory in the path is an element in the array
4036 main::set_access('file_path', \%file_path, 'readable_array');
4039 # What is the table's status, normal, $OBSOLETE, etc. Enum
4040 main::set_access('status', \%status, 'r');
4043 # A comment about its being obsolete, or whatever non normal status it has
4044 main::set_access('status_info', \%status_info, 'r');
4047 # Is the table to be output with each range only a single code point?
4048 # This is done to avoid breaking existing code that may have come to rely
4049 # on this behavior in previous versions of this program.
4050 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4053 # A boolean set iff this table is a Perl extension to the Unicode
4055 main::set_access('perl_extension', \%perl_extension, 'r');
4058 # All arguments are key => value pairs, which you can see below, most
4059 # of which match fields documented above. Otherwise: Pod_Entry,
4060 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4061 # documented in the Alias package
4063 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4067 my $self = bless \do { my $anonymous_scalar }, $class;
4068 my $addr = main::objaddr($self);
4072 $name{$addr} = delete $args{'Name'};
4073 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4074 $full_name{$addr} = delete $args{'Full_Name'};
4075 my $complete_name = $complete_name{$addr}
4076 = delete $args{'Complete_Name'};
4077 $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
4078 $perl_extension{$addr} = delete $args{'Perl_Extension'} || 0;
4079 $property{$addr} = delete $args{'_Property'};
4080 $range_list{$addr} = delete $args{'_Range_List'};
4081 $status{$addr} = delete $args{'Status'} || $NORMAL;
4082 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4083 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4085 my $description = delete $args{'Description'};
4086 my $externally_ok = delete $args{'Externally_Ok'};
4087 my $loose_match = delete $args{'Fuzzy'};
4088 my $note = delete $args{'Note'};
4089 my $make_pod_entry = delete $args{'Pod_Entry'};
4091 # Shouldn't have any left over
4092 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4094 # Can't use || above because conceivably the name could be 0, and
4095 # can't use // operator in case this program gets used in Perl 5.8
4096 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4098 $aliases{$addr} = [ ];
4099 $comment{$addr} = [ ];
4100 $description{$addr} = [ ];
4102 $file_path{$addr} = [ ];
4103 $locked{$addr} = "";
4105 push @{$description{$addr}}, $description if $description;
4106 push @{$note{$addr}}, $note if $note;
4108 # If hasn't set its status already, see if it is on one of the lists
4109 # of properties or tables that have particular statuses; if not, is
4110 # normal. The lists are prioritized so the most serious ones are
4112 if (! $status{$addr}) {
4113 if (exists $why_suppressed{$complete_name}) {
4114 $status{$addr} = $SUPPRESSED;
4116 elsif (exists $why_deprecated{$complete_name}) {
4117 $status{$addr} = $DEPRECATED;
4119 elsif (exists $why_stabilized{$complete_name}) {
4120 $status{$addr} = $STABILIZED;
4122 elsif (exists $why_obsolete{$complete_name}) {
4123 $status{$addr} = $OBSOLETE;
4126 # Existence above doesn't necessarily mean there is a message
4127 # associated with it. Use the most serious message.
4128 if ($status{$addr}) {
4129 if ($why_suppressed{$complete_name}) {
4131 = $why_suppressed{$complete_name};
4133 elsif ($why_deprecated{$complete_name}) {
4135 = $why_deprecated{$complete_name};
4137 elsif ($why_stabilized{$complete_name}) {
4139 = $why_stabilized{$complete_name};
4141 elsif ($why_obsolete{$complete_name}) {
4143 = $why_obsolete{$complete_name};
4148 # By convention what typically gets printed only or first is what's
4149 # first in the list, so put the full name there for good output
4150 # clarity. Other routines rely on the full name being first on the
4152 $self->add_alias($full_name{$addr},
4153 Externally_Ok => $externally_ok,
4154 Fuzzy => $loose_match,
4155 Pod_Entry => $make_pod_entry,
4156 Status => $status{$addr},
4159 # Then comes the other name, if meaningfully different.
4160 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4161 $self->add_alias($name{$addr},
4162 Externally_Ok => $externally_ok,
4163 Fuzzy => $loose_match,
4164 Pod_Entry => $make_pod_entry,
4165 Status => $status{$addr},
4172 # Here are the methods that are required to be defined by any derived
4178 # append_to_body and pre_body are called in the write() method
4179 # to add stuff after the main body of the table, but before
4180 # its close; and to prepend stuff before the beginning of the
4185 Carp::my_carp_bug( __LINE__
4186 . ": Must create method '$sub()' for "
4194 "." => \&main::_operator_dot,
4195 '!=' => \&main::_operator_not_equal,
4196 '==' => \&main::_operator_equal,
4200 # Returns the array of ranges associated with this table.
4202 return $range_list{main::objaddr shift}->ranges;
4206 # Add a synonym for this table.
4208 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4211 my $name = shift; # The name to add.
4212 my $pointer = shift; # What the alias hash should point to. For
4213 # map tables, this is the parent property;
4214 # for match tables, it is the table itself.
4217 my $loose_match = delete $args{'Fuzzy'};
4219 my $make_pod_entry = delete $args{'Pod_Entry'};
4220 $make_pod_entry = $YES unless defined $make_pod_entry;
4222 my $externally_ok = delete $args{'Externally_Ok'};
4223 $externally_ok = 1 unless defined $externally_ok;
4225 my $status = delete $args{'Status'};
4226 $status = $NORMAL unless defined $status;
4228 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4230 # Capitalize the first letter of the alias unless it is one of the CJK
4231 # ones which specifically begins with a lower 'k'. Do this because
4232 # Unicode has varied whether they capitalize first letters or not, and
4233 # have later changed their minds and capitalized them, but not the
4234 # other way around. So do it always and avoid changes from release to
4236 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4238 my $addr = main::objaddr $self;
4240 # Figure out if should be loosely matched if not already specified.
4241 if (! defined $loose_match) {
4243 # Is a loose_match if isn't null, and doesn't begin with an
4244 # underscore and isn't just a number
4246 && substr($name, 0, 1) ne '_'
4247 && $name !~ qr{^[0-9_.+-/]+$})
4256 # If this alias has already been defined, do nothing.
4257 return if defined $find_table_from_alias{$addr}->{$name};
4259 # That includes if it is standardly equivalent to an existing alias,
4260 # in which case, add this name to the list, so won't have to search
4262 my $standard_name = main::standardize($name);
4263 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4264 $find_table_from_alias{$addr}->{$name}
4265 = $find_table_from_alias{$addr}->{$standard_name};
4269 # Set the index hash for this alias for future quick reference.
4270 $find_table_from_alias{$addr}->{$name} = $pointer;
4271 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4272 local $to_trace = 0 if main::DEBUG;
4273 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4274 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4277 # Put the new alias at the end of the list of aliases unless the final
4278 # element begins with an underscore (meaning it is for internal perl
4279 # use) or is all numeric, in which case, put the new one before that
4280 # one. This floats any all-numeric or underscore-beginning aliases to
4281 # the end. This is done so that they are listed last in output lists,
4282 # to encourage the user to use a better name (either more descriptive
4283 # or not an internal-only one) instead. This ordering is relied on
4284 # implicitly elsewhere in this program, like in short_name()
4285 my $list = $aliases{$addr};
4286 my $insert_position = (@$list == 0
4287 || (substr($list->[-1]->name, 0, 1) ne '_'
4288 && $list->[-1]->name =~ /\D/))
4294 Alias->new($name, $loose_match, $make_pod_entry,
4295 $externally_ok, $status);
4297 # This name may be shorter than any existing ones, so clear the cache
4298 # of the shortest, so will have to be recalculated.
4299 undef $short_name{main::objaddr $self};
4304 # Returns a name suitable for use as the base part of a file name.
4305 # That is, shorter wins. It can return undef if there is no suitable
4306 # name. The name has all non-essential underscores removed.
4308 # The optional second parameter is a reference to a scalar in which
4309 # this routine will store the length the returned name had before the
4310 # underscores were removed, or undef if the return is undef.
4312 # The shortest name can change if new aliases are added. So using
4313 # this should be deferred until after all these are added. The code
4314 # that does that should clear this one's cache.
4315 # Any name with alphabetics is preferred over an all numeric one, even
4319 my $nominal_length_ptr = shift;
4320 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4322 my $addr = main::objaddr $self;
4324 # For efficiency, don't recalculate, but this means that adding new
4325 # aliases could change what the shortest is, so the code that does
4326 # that needs to undef this.
4327 if (defined $short_name{$addr}) {
4328 if ($nominal_length_ptr) {
4329 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4331 return $short_name{$addr};
4334 # Look at each alias
4335 foreach my $alias ($self->aliases()) {
4337 # Don't use an alias that isn't ok to use for an external name.
4338 next if ! $alias->externally_ok;
4340 my $name = main::Standardize($alias->name);
4341 trace $self, $name if main::DEBUG && $to_trace;
4343 # Take the first one, or a shorter one that isn't numeric. This
4344 # relies on numeric aliases always being last in the array
4345 # returned by aliases(). Any alpha one will have precedence.
4346 if (! defined $short_name{$addr}
4348 && length($name) < length($short_name{$addr})))
4350 # Remove interior underscores.
4351 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4353 $nominal_short_name_length{$addr} = length $name;
4357 # If no suitable external name return undef
4358 if (! defined $short_name{$addr}) {
4359 $$nominal_length_ptr = undef if $nominal_length_ptr;
4363 # Don't allow a null external name.
4364 if ($short_name{$addr} eq "") {
4365 $short_name{$addr} = '_';
4366 $nominal_short_name_length{$addr} = 1;
4369 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4371 if ($nominal_length_ptr) {
4372 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4374 return $short_name{$addr};
4378 # Returns the external name that this table should be known by. This
4379 # is usually the short_name, but not if the short_name is undefined.
4382 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4384 my $short = $self->short_name;
4385 return $short if defined $short;
4390 sub add_description { # Adds the parameter as a short description.
4393 my $description = shift;
4395 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4397 push @{$description{main::objaddr $self}}, $description;
4402 sub add_note { # Adds the parameter as a short note.
4407 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4409 push @{$note{main::objaddr $self}}, $note;
4414 sub add_comment { # Adds the parameter as a comment.
4417 my $comment = shift;
4418 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4421 push @{$comment{main::objaddr $self}}, $comment;
4427 # Return the current comment for this table. If called in list
4428 # context, returns the array of comments. In scalar, returns a string
4429 # of each element joined together with a period ending each.
4432 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4434 my @list = @{$comment{main::objaddr $self}};
4435 return @list if wantarray;
4437 foreach my $sentence (@list) {
4438 $return .= '. ' if $return;
4439 $return .= $sentence;
4442 $return .= '.' if $return;
4447 # Initialize the table with the argument which is any valid
4448 # initialization for range lists.
4451 my $initialization = shift;
4452 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4454 # Replace the current range list with a new one of the same exact
4456 my $class = ref $range_list{main::objaddr $self};
4457 $range_list{main::objaddr $self} = $class->new(Owner => $self,
4458 Initialize => $initialization);
4464 # The header that is output for the table in the file it is written
4468 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4471 $return .= $DEVELOPMENT_ONLY if $compare_versions;
4473 $return .= $INTERNAL_ONLY if $internal_only{main::objaddr $self};
4478 # Write a representation of the table to its file.
4481 my $tab_stops = shift; # The number of tab stops over to put any
4483 my $suppress_value = shift; # Optional, if the value associated with
4484 # a range equals this one, don't write
4486 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4488 my $addr = main::objaddr($self);
4490 # Start with the header
4491 my @OUT = $self->header;
4494 push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4497 # Then any pre-body stuff.
4498 my $pre_body = $self->pre_body;
4499 push @OUT, $pre_body, "\n" if $pre_body;
4501 # The main body looks like a 'here' document
4502 push @OUT, "return <<'END';\n";
4504 if ($range_list{$addr}->is_empty) {
4506 # This is a kludge for empty tables to silence a warning in
4507 # utf8.c, which can't really deal with empty tables, but it can
4508 # deal with a table that matches nothing, as the inverse of 'Any'
4510 push @OUT, "!utf8::IsAny\n";
4513 my $range_size_1 = $range_size_1{$addr};
4515 # Output each range as part of the here document.
4516 for my $set ($range_list{$addr}->ranges) {
4517 my $start = $set->start;
4518 my $end = $set->end;
4519 my $value = $set->value;
4521 # Don't output ranges whose value is the one to suppress
4522 next if defined $suppress_value && $value eq $suppress_value;
4524 # If has or wants a single point range output
4525 if ($start == $end || $range_size_1) {
4526 for my $i ($start .. $end) {
4527 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
4531 push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
4533 # Add a comment with the size of the range, if requested.
4534 # Expand Tabs to make sure they all start in the same
4535 # column, and then unexpand to use mostly tabs.
4536 if (! $output_range_counts) {
4540 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4541 my $count = main::clarify_number($end - $start + 1);
4544 my $width = $tab_stops * 8 - 1;
4545 $OUT[-1] = sprintf("%-*s # [%s]\n",
4549 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4552 } # End of loop through all the table's ranges
4555 # Add anything that goes after the main body, but within the here
4557 my $append_to_body = $self->append_to_body;
4558 push @OUT, $append_to_body if $append_to_body;
4560 # And finish the here document.
4563 # All these files have a .pl suffix
4564 $file_path{$addr}->[-1] .= '.pl';
4566 main::write($file_path{$addr}, \@OUT);
4570 sub set_status { # Set the table's status
4572 my $status = shift; # The status enum value
4573 my $info = shift; # Any message associated with it.
4574 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4576 my $addr = main::objaddr($self);
4578 $status{$addr} = $status;
4579 $status_info{$addr} = $info;
4584 # Don't allow changes to the table from now on. This stores a stack
4585 # trace of where it was called, so that later attempts to modify it
4586 # can immediately show where it got locked.
4589 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4591 my $addr = main::objaddr $self;
4593 $locked{$addr} = "";
4595 my $line = (caller(0))[2];
4598 # Accumulate the stack trace
4600 my ($pkg, $file, $caller_line, $caller) = caller $i++;
4602 last unless defined $caller;
4604 $locked{$addr} .= " called from $caller() at line $line\n";
4605 $line = $caller_line;
4607 $locked{$addr} .= " called from main at line $line\n";
4612 sub carp_if_locked {
4613 # Return whether a table is locked or not, and, by the way, complain
4617 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4619 my $addr = main::objaddr $self;
4621 return 0 if ! $locked{$addr};
4622 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
4626 sub set_file_path { # Set the final directory path for this table
4628 # Rest of parameters passed on
4630 @{$file_path{main::objaddr $self}} = @_;
4634 # Accessors for the range list stored in this table. First for
4653 return $range_list{main::objaddr $self}->$sub(@_);
4657 # Then for ones that should fail if locked
4667 return if $self->carp_if_locked;
4668 return $range_list{main::objaddr $self}->$sub(@_);
4675 use base '_Base_Table';
4677 # A Map Table is a table that contains the mappings from code points to
4678 # values. There are two weird cases:
4679 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
4680 # are written in the table's file at the end of the table nonetheless. It
4681 # requires specially constructed code to handle these; utf8.c can not read
4682 # these in, so they should not go in $map_directory. As of this writing,
4683 # the only case that these happen is for named sequences used in
4684 # charnames.pm. But this code doesn't enforce any syntax on these, so
4685 # something else could come along that uses it.
4686 # 2) Specials are anything that doesn't fit syntactically into the body of the
4687 # table. The ranges for these have a map type of non-zero. The code below
4688 # knows about and handles each possible type. In most cases, these are
4689 # written as part of the header.
4691 # A map table deliberately can't be manipulated at will unlike match tables.
4692 # This is because of the ambiguities having to do with what to do with
4693 # overlapping code points. And there just isn't a need for those things;
4694 # what one wants to do is just query, add, replace, or delete mappings, plus
4695 # write the final result.
4696 # However, there is a method to get the list of possible ranges that aren't in
4697 # this table to use for defaulting missing code point mappings. And,
4698 # map_add_or_replace_non_nulls() does allow one to add another table to this
4699 # one, but it is clearly very specialized, and defined that the other's
4700 # non-null values replace this one's if there is any overlap.
4702 sub trace { return main::trace(@_); } # Package-local shorthand: passes all arguments to main::trace() and returns its result
4706 main::setup_package();
4709 # Many input files omit some entries; this gives what the mapping for the
4710 # missing entries should be
4711 main::set_access('default_map', \%default_map, 'r');
4713 my %anomalous_entries;
4714 # Things that go in the body of the table which don't fit the normal
4715 # scheme of things, like having a range. Not much can be done with these
4716 # once there except to output them. This was created to handle named
4718 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
4719 main::set_access('anomalous_entries', # Append singular, read plural
4720 \%anomalous_entries,
4724 # The format of the entries of the table. This is calculated from the
4725 # data in the table (or passed in the constructor). This is an enum e.g.,
4727 main::set_access('format', \%format);
4730 # This is a string, solely for documentation, indicating how one can get
4731 # access to this property via the Perl core.
4732 main::set_access('core_access', \%core_access, 'r', 's');
4735 # Boolean set when non-zero map-type ranges are added to this table,
4736 # which happens in only a few tables. This is purely for performance, to
4737 # avoid having to search through every table upon output, so if all the
4738 # non-zero maps got deleted before output, this would remain set, and the
4739 # only penalty would be performance. Currently, most map tables that get
4740 # output have specials in them, so this doesn't help that much anyway.
4741 main::set_access('has_specials', \%has_specials);
4744 # Boolean as to whether or not to write out this map table
4745 main::set_access('to_output_map', \%to_output_map, 's');
4754 # Optional initialization data for the table.
4755 my $initialize = delete $args{'Initialize'};
4757 my $core_access = delete $args{'Core_Access'};
4758 my $default_map = delete $args{'Default_Map'};
4759 my $format = delete $args{'Format'};
4760 my $property = delete $args{'_Property'};
4761 my $full_name = delete $args{'Full_Name'};
4762 # Rest of parameters passed on
4764 my $range_list = Range_Map->new(Owner => $property);
4766 my $self = $class->SUPER::new(
4768 Complete_Name => $full_name,
4769 Full_Name => $full_name,
4770 _Property => $property,
4771 _Range_List => $range_list,
4774 my $addr = main::objaddr $self;
4776 $anomalous_entries{$addr} = [];
4777 $core_access{$addr} = $core_access;
4778 $default_map{$addr} = $default_map;
4779 $format{$addr} = $format;
4781 $self->initialize($initialize) if defined $initialize;
4788 qw("") => "_operator_stringify",
4791 sub _operator_stringify {
4794 my $name = $self->property->full_name;
4795 $name = '""' if $name eq "";
4796 return "Map table for Property '$name'";
4800 # Add a synonym for this table (which means the property itself)
4803 # Rest of parameters passed on.
4805 $self->SUPER::add_alias($name, $self->property, @_);
4810 # Add a range of code points to the list of specially-handled code
4811 # points. $MULTI_CP is assumed if the type of special is not passed
4820 my $type = delete $args{'Type'} || 0;
4821 # Rest of parameters passed on
4823 # Can't change the table if locked.
4824 return if $self->carp_if_locked;
4826 my $addr = main::objaddr $self;
4828 $has_specials{$addr} = 1 if $type;
4830 $self->_range_list->add_map($lower, $upper,
4837 sub append_to_body {
4838 # Adds to the written HERE document of the table's body any anomalous
4839 # entries in the table.
4842 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4844 my $addr = main::objaddr $self;
4846 return "" unless @{$anomalous_entries{$addr}};
4847 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
4850 sub map_add_or_replace_non_nulls {
4851 # This adds the mappings in the table $other to $self. Non-null
4852 # mappings from $other override those in $self. It essentially merges
4853 # the two tables, with the second having priority except for null
4858 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4860 return if $self->carp_if_locked;
4862 if (! $other->isa(__PACKAGE__)) {
4863 Carp::my_carp_bug("$other should be a "
4871 my $addr = main::objaddr $self;
4872 my $other_addr = main::objaddr $other;
4874 local $to_trace = 0 if main::DEBUG;
4876 my $self_range_list = $self->_range_list;
4877 my $other_range_list = $other->_range_list;
4878 foreach my $range ($other_range_list->ranges) {
4879 my $value = $range->value;
4880 next if $value eq "";
4881 $self_range_list->_add_delete('+',
4885 Type => $range->type,
4886 Replace => $UNCONDITIONALLY);
4889 # Copy the specials information from the other table to $self
4890 if ($has_specials{$other_addr}) {
4891 $has_specials{$addr} = 1;
4897 sub set_default_map {
4898 # Define what code points that are missing from the input files should
4903 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4905 my $addr = main::objaddr $self;
4907 # Convert the input to the standard equivalent, if any (won't have any
4908 # for $STRING properties)
4909 my $standard = $self->_find_table_from_alias->{$map};
4910 $map = $standard->name if defined $standard;
4912 # Warn if there already is a non-equivalent default map for this
4913 # property. Note that a default map can be a ref, which means that
4914 # what it actually means is delayed until later in the program, and it
4915 # IS permissible to override it here without a message.
4916 my $default_map = $default_map{$addr};
4917 if (defined $default_map
4918 && ! ref($default_map)
4919 && $default_map ne $map
4920 && main::Standardize($map) ne $default_map)
4922 my $property = $self->property;
4923 my $map_table = $property->table($map);
4924 my $default_table = $property->table($default_map);
4925 if (defined $map_table
4926 && defined $default_table
4927 && $map_table != $default_table)
4929 Carp::my_carp("Changing the default mapping for "
4931 . " from $default_map to $map'");
4935 $default_map{$addr} = $map;
4937 # Don't also create any missing table for this map at this point,
4938 # because if we did, it could get done before the main table add is
4939 # done for PropValueAliases.txt; instead the caller will have to make
4940 # sure it exists, if desired.
4945 # Returns boolean: should we write this map table?
4948 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4950 my $addr = main::objaddr $self;
4952 # If overridden, use that
4953 return $to_output_map{$addr} if defined $to_output_map{$addr};
4955 my $full_name = $self->full_name;
4957 # If table says to output, do so; if says to suppress it, do so.
4958 return 1 if grep { $_ eq $full_name } @output_mapped_properties;
4959 return 0 if $self->status eq $SUPPRESSED;
4961 my $type = $self->property->type;
4963 # Don't want to output binary map tables even for debugging.
4964 return 0 if $type == $BINARY;
4966 # But do want to output string ones.
4967 return 1 if $type == $STRING;
4969 # Otherwise is an $ENUM, don't output it
4974 # Returns a Range_List that is gaps of the current table. That is,
4978 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4980 my $current = Range_List->new(Initialize => $self->_range_list,
4981 Owner => $self->property);
4985 sub set_final_comment {
4986 # Just before output, create the comment that heads the file
4987 # containing this table.
4990 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4992 # No sense generating a comment if aren't going to write it out.
4993 return if ! $self->to_output_map;
4995 my $addr = main::objaddr $self;
4997 my $property = $self->property;
4999 # Get all the possible names for this property. Don't use any that
5000 # aren't ok for use in a file name, etc. This is perhaps causing that
5001 # flag to do double duty, and may have to be changed in the future to
5002 # have our own flag for just this purpose; but it works now to exclude
5003 # Perl generated synonyms from the lists for properties, where the
5004 # name is always the proper Unicode one.
5005 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5007 my $count = $self->count;
5008 my $default_map = $default_map{$addr};
5010 # The ranges that map to the default aren't output, so subtract that
5011 # to get those actually output. A property with matching tables
5012 # already has the information calculated.
5013 if ($property->type != $STRING) {
5014 $count -= $property->table($default_map)->count;
5016 elsif (defined $default_map) {
5018 # But for $STRING properties, must calculate now. Subtract the
5019 # count from each range that maps to the default.
5020 foreach my $range ($self->_range_list->ranges) {
5021 if ($range->value eq $default_map) {
5022 $count -= $range->end +1 - $range->start;
5028 # Get a string version of $count with underscores in large numbers,
5030 my $string_count = main::clarify_number($count);
5032 my $code_points = ($count == 1)
5033 ? 'single code point'
5034 : "$string_count code points";
5039 if (@property_aliases <= 1) {
5040 $mapping = 'mapping';
5041 $these_mappings = 'this mapping';
5045 $mapping = 'synonymous mappings';
5046 $these_mappings = 'these mappings';
5050 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5051 $cp = "any code point in Unicode Version $string_version";
5055 if ($default_map eq "") {
5056 $map_to = 'the null string';
5058 elsif ($default_map eq $CODE_POINT) {
5062 $map_to = "'$default_map'";
5065 $cp = "the single code point";
5068 $cp = "one of the $code_points";
5070 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5075 my $status = $self->status;
5077 my $warn = uc $status_past_participles{$status};
5080 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5081 All property or property=value combinations contained in this file are $warn.
5082 See $unicode_reference_url for what this means.
5086 $comment .= "This file returns the $mapping:\n";
5088 for my $i (0 .. @property_aliases - 1) {
5089 $comment .= sprintf("%-8s%s\n",
5091 $property_aliases[$i]->name . '(cp)'
5095 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5097 my $access = $core_access{$addr};
5099 $comment .= "accessible through the Perl core via $access.";
5102 $comment .= "not accessible through the Perl core directly.";
5105 # And append any commentary already set from the actual property.
5106 $comment .= "\n\n" . $self->comment if $self->comment;
5107 if ($self->description) {
5108 $comment .= "\n\n" . join " ", $self->description;
5111 $comment .= "\n\n" . join " ", $self->note;
5115 if (! $self->perl_extension) {
5118 For information about what this property really means, see:
5119 $unicode_reference_url
5123 if ($count) { # Format differs for empty table
5124 $comment.= "\nThe format of the ";
5125 if ($self->range_size_1) {
5127 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5128 is in hex; MAPPING is what CODE_POINT maps to.
5133 # There are tables which end up only having one element per
5134 # range, but it is not worth keeping track of for making just
5135 # this comment a little better.
5137 non-comment portions of the main body of lines of this file is:
5138 START\\tSTOP\\tMAPPING where START is the starting code point of the
5139 range, in hex; STOP is the ending point, or if omitted, the range has just one
5140 code point; MAPPING is what each code point between START and STOP maps to.
5142 if ($output_range_counts) {
5144 Numbers in comments in [brackets] indicate how many code points are in the
5145 range (omitted when the range is a single code point or if the mapping is to
5151 $self->set_comment(main::join_lines($comment));
5155 my %swash_keys; # Makes sure don't duplicate swash names.
5158 # Returns the string that should be output in the file before the main
5159 # body of this table. This includes some hash entries identifying the
5160 # format of the body, and what the single value should be for all
5161 # ranges missing from it. It also includes any code points which have
5162 # map_types that don't go in the main table.
5165 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5167 my $addr = main::objaddr $self;
5169 my $name = $self->property->swash_name;
5171 if (defined $swash_keys{$name}) {
5172 Carp::my_carp(join_lines(<<END
5173 Already created a swash name '$name' for $swash_keys{$name}. This means that
5174 the same name desired for $self shouldn't be used. Bad News. This must be
5175 fixed before production use, but proceeding anyway
5179 $swash_keys{$name} = "$self";
5181 my $default_map = $default_map{$addr};
5184 if ($has_specials{$addr}) {
5186 # Here, some maps with non-zero type have been added to the table.
5187 # Go through the table and handle each of them. None will appear
5188 # in the body of the table, so delete each one as we go. The
5189 # code point count has already been calculated, so ok to delete
5192 my @multi_code_point_maps;
5193 my $has_hangul_syllables = 0;
5195 # The key is the base name of the code point, and the value is a
5196 # hash whose 'low' and 'high' elements are parallel arrays giving the
5197 # bounds of every range that uses this base name.
5198 my %names_ending_in_code_point;
5200 # Inverse mapping. The list of ranges that have these kinds of
5201 # names. Each element contains the low, high, and base names in a
5203 my @code_points_ending_in_code_point;
5205 my $range_map = $self->_range_list;
5206 foreach my $range ($range_map->ranges) {
5207 next unless $range->type != 0;
5208 my $low = $range->start;
5209 my $high = $range->end;
5210 my $map = $range->value;
5211 my $type = $range->type;
5213 # No need to output the range if it maps to the default. And
5214 # the write method won't output it either, so no need to
5215 # delete it to keep it from being output, and is faster to
5216 # skip than to delete anyway.
5217 next if $map eq $default_map;
5219 # Delete the range to keep write() from trying to output it
5220 $range_map->delete_range($low, $high);
5222 # Switch based on the map type...
5223 if ($type == $HANGUL_SYLLABLE) {
5225 # These are entirely algorithmically determinable based on
5226 # some constants furnished by Unicode; for now, just set a
5227 # flag to indicate that have them. Below we will output
5228 # the code that does the algorithm.
5229 $has_hangul_syllables = 1;
5231 elsif ($type == $CP_IN_NAME) {
5233 # If the name ends in the code point it represents, are
5234 # also algorithmically determinable, but need information
5235 # about the map to do so. Both the map and its inverse
5236 # are stored in data structures output in the file.
5237 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5238 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5240 push @code_points_ending_in_code_point, { low => $low,
5245 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5247 # Multi-code point maps and null string maps have an entry
5248 # for each code point in the range. They use the same
5250 for my $code_point ($low .. $high) {
5252 # The pack() below can't cope with surrogates.
5253 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5254 Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created");
5258 # Generate the hash entries for these in the form that
5259 # utf8.c understands.
5261 foreach my $to (split " ", $map) {
5262 if ($to !~ /^$code_point_re$/) {
5263 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
5266 $tostr .= sprintf "\\x{%s}", $to;
5269 # I (khw) have never waded through this line to
5270 # understand it well enough to comment it.
5271 my $utf8 = sprintf(qq["%s" => "$tostr",],
5272 join("", map { sprintf "\\x%02X", $_ }
5273 unpack("U0C*", pack("U", $code_point))));
5275 # Add a comment so that a human reader can more easily
5276 # see what's going on.
5277 push @multi_code_point_maps,
5278 sprintf("%-45s # U+%04X => %s", $utf8,
5284 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Using type 0 instead");
5285 $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0);
5287 } # End of loop through all ranges
5289 # Here have gone through the whole file. If actually generated
5290 # anything for each map type, add its respective header and
5292 if (@multi_code_point_maps) {
5295 # Some code points require special handling because their mappings are each to
5296 # multiple code points. These do not appear in the main body, but are defined
5297 # in the hash below.
5299 # The key: UTF-8 _bytes_, the value: UTF-8 (speed hack)
5300 %utf8::ToSpec$name = (
5302 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5305 if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5307 # Convert these structures to output format.
5308 my $code_points_ending_in_code_point =
5309 main::simple_dumper(\@code_points_ending_in_code_point,
5311 my $names = main::simple_dumper(\%names_ending_in_code_point,
5314 # Do the same with the Hangul names,
5320 if ($has_hangul_syllables) {
5322 # Construct a regular expression of all the possible
5323 # combinations of the Hangul syllables.
5324 my @L_re; # Leading consonants
5325 for my $i ($LBase .. $LBase + $LCount - 1) {
5326 push @L_re, $Jamo{$i}
5328 my @V_re; # Middle vowels
5329 for my $i ($VBase .. $VBase + $VCount - 1) {
5330 push @V_re, $Jamo{$i}
5332 my @T_re; # Trailing consonants
5333 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5334 push @T_re, $Jamo{$i}
5337 # The whole re is made up of the L V T combination.
5339 . join ('|', sort @L_re)
5341 . join ('|', sort @V_re)
5343 . join ('|', sort @T_re)
5346 # These hashes needed by the algorithm were generated
5347 # during reading of the Jamo.txt file
5348 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5349 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5350 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5351 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5356 # To achieve significant memory savings when this file is read in,
5357 # algorithmically derivable code points are omitted from the main body below.
5358 # Instead, the following routines can be used to translate between name and
5359 # code point and vice versa
5363 # Matches legal code point. 4-6 hex numbers, If there are 6, the
5364 # first two must be '10'; if there are 5, the first must not be a '0'.
5365 my \$code_point_re = qr/$code_point_re/;
5367 # In the following hash, the keys are the bases of names which includes
5368 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
5369 # of each key is another hash which is used to get the low and high ends
5370 # for each range of code points that apply to the name
5371 my %names_ending_in_code_point = (
5375 # And the following array gives the inverse mapping from code points to
5376 # names. Lowest code points are first
5377 my \@code_points_ending_in_code_point = (
5378 $code_points_ending_in_code_point
5381 # Earlier releases didn't have Jamos. No sense outputting
5382 # them unless will be used.
5383 if ($has_hangul_syllables) {
5386 # Convert from code point to Jamo short name for use in composing Hangul
5392 # Leading consonant (can be null)
5402 # Optional trailing consonant
5407 # Computed re that splits up a Hangul name into LVT or LV syllables
5408 my \$syllable_re = qr/$jamo_re/;
5410 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
5411 my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
5413 # These constants names and values were taken from the Unicode standard,
5414 # version 5.1, section 3.12. They are used in conjunction with Hangul
5416 my \$SBase = 0xAC00;
5417 my \$LBase = 0x1100;
5418 my \$VBase = 0x1161;
5419 my \$TBase = 0x11A7;
5420 my \$SCount = 11172;
5424 my \$NCount = \$VCount * \$TCount;
5426 } # End of has Jamos
5428 $pre_body .= << 'END';
5430 sub name_to_code_point_special {
5433 # Returns undef if not one of the specially handled names; otherwise
5434 # returns the code point equivalent to the input name
5436 if ($has_hangul_syllables) {
5437 $pre_body .= << 'END';
5439 if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
5440 $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
5441 return if $name !~ qr/^$syllable_re$/;
5442 my $L = $Jamo_L{$1};
5443 my $V = $Jamo_V{$2};
5444 my $T = (defined $3) ? $Jamo_T{$3} : 0;
5445 return ($L * $VCount + $V) * $TCount + $T + $SBase;
5449 $pre_body .= << 'END';
5451 # Name must end in '-code_point' for this to handle.
5452 if ($name !~ /^ (.*) - ($code_point_re) $/x) {
5457 my $code_point = CORE::hex $2;
5459 # Name must be one of the ones which has the code point in it.
5460 return if ! $names_ending_in_code_point{$base};
5462 # Look through the list of ranges that apply to this name to see if
5463 # the code point is in one of them.
5464 for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
5465 return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
5466 next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
5468 # Here, the code point is in the range.
5472 # Here, looked like the name had a code point number in it, but
5473 # did not match one of the valid ones.
5477 sub code_point_to_name_special {
5478 my $code_point = shift;
5480 # Returns the name of a code point if algorithmically determinable;
5483 if ($has_hangul_syllables) {
5484 $pre_body .= << 'END';
5486 # If in the Hangul range, calculate the name based on Unicode's
5488 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
5490 my $SIndex = $code_point - $SBase;
5491 my $L = $LBase + $SIndex / $NCount;
5492 my $V = $VBase + ($SIndex % $NCount) / $TCount;
5493 my $T = $TBase + $SIndex % $TCount;
5494 $name = "$HANGUL_SYLLABLE $Jamo{$L}$Jamo{$V}";
5495 $name .= $Jamo{$T} if $T != $TBase;
5500 $pre_body .= << 'END';
5502 # Look through list of these code points for one in range.
5503 foreach my $hash (@code_points_ending_in_code_point) {
5504 return if $code_point < $hash->{'low'};
5505 if ($code_point <= $hash->{'high'}) {
5506 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
5509 return; # None found
5514 } # End of has hangul or code point in name maps.
5515 } # End of has specials
5517 # Calculate the format of the table if not already done.
5518 my $format = $format{$addr};
5519 my $property = $self->property;
5520 my $type = $property->type;
5521 if (! defined $format) {
5522 if ($type == $BINARY) {
5524 # Don't bother checking the values, because we elsewhere
5525 # verify that a binary table has only 2 values.
5526 $format = $BINARY_FORMAT;
5529 my @ranges = $self->_range_list->ranges;
5531 # default an empty table based on its type and default map
5534 # But it turns out that the only one we can say is a
5535 # non-string (besides binary, handled above) is when the
5536 # table is a string and the default map is to a code point
5537 if ($type == $STRING && $default_map eq $CODE_POINT) {
5538 $format = $HEX_FORMAT;
5541 $format = $STRING_FORMAT;
5546 # Start with the most restrictive format, and as we find
5547 # something that doesn't fit with that, change to the next
5548 # most restrictive, and so on.
5549 $format = $DECIMAL_FORMAT;
5550 foreach my $range (@ranges) {
5551 my $map = $range->value;
5552 if ($map ne $default_map) {
5553 last if $format eq $STRING_FORMAT; # already at
5556 $format = $INTEGER_FORMAT
5557 if $format eq $DECIMAL_FORMAT
5558 && $map !~ / ^ [0-9] $ /x;
5559 $format = $FLOAT_FORMAT
5560 if $format eq $INTEGER_FORMAT
5561 && $map !~ / ^ -? [0-9]+ $ /x;
5562 $format = $RATIONAL_FORMAT
5563 if $format eq $FLOAT_FORMAT
5564 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
5565 $format = $HEX_FORMAT
5566 if $format eq $RATIONAL_FORMAT
5567 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
5568 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
5569 && $map =~ /[^0-9A-F]/;
5574 } # end of calculating format
5577 # The name this swash is to be known by, with the format of the mappings in
5578 # the main body of the table, and what all code points missing from this file
5580 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
5582 my $missing = $default_map;
5583 if ($missing eq $CODE_POINT
5584 && $format ne $HEX_FORMAT
5585 && ! defined $format{$addr}) # Is expected if was manually set
5587 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
5589 $format{$addr} = $format;
5590 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$missing';";
5591 if ($missing eq $CODE_POINT) {
5592 $return .= ' # code point maps to itself';
5594 elsif ($missing eq "") {
5595 $return .= ' # code point maps to the null string';
5599 $return .= $pre_body;
5605 # Write the table to the file.
5608 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5610 my $addr = main::objaddr $self;
5612 return $self->SUPER::write(
5613 ($self->property == $block)
5614 ? 7 # block file needs more tab stops
5616 $default_map{$addr}); # don't write defaulteds
5619 # Accessors for the underlying list that should fail if locked.
5629 return if $self->carp_if_locked;
5630 return $self->_range_list->$sub(@_);
5633 } # End closure for Map_Table
5635 package Match_Table;
5636 use base '_Base_Table';
5638 # A Match table is one which is a list of all the code points that have
5639 # the same property and property value, for use in \p{property=value}
5640 # constructs in regular expressions. It adds very little data to the base
5641 # structure, but many methods, as these lists can be combined in many ways to
5643 # There are only a few concepts added:
5644 # 1) Equivalents and Relatedness.
5645 # Two tables can match the identical code points, but have different names.
5646 # This always happens when there is a perl single form extension
5647 # \p{IsProperty} for the Unicode compound form \p{Property=True}. The two
5648 # tables are set to be related, with the Perl extension being a child, and
5649 # the Unicode property being the parent.
5651 # It may be that two tables match the identical code points and we don't
5652 # know if they are related or not. This happens most frequently when the
5653 # Block and Script properties have the exact range. But note that a
5654 # revision to Unicode could add new code points to the script, which would
5655 # now have to be in a different block (as the block was filled, or there
5656 # would have been 'Unknown' script code points in it and they wouldn't have
5657 # been identical). So we can't rely on any two properties from Unicode
5658 # always matching the same code points from release to release, and thus
5659 # these tables are considered coincidentally equivalent--not related. When
5660 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
5661 # 'leader', and the others are 'equivalents'. This concept is useful
5662 # to minimize the number of tables written out. Only one file is used for
5663 # any identical set of code points, with entries in Heavy.pl mapping all
5664 # the involved tables to it.
5666 # Related tables will always be identical; we set them up to be so. Thus
5667 # if the Unicode one is deprecated, the Perl one will be too. Not so for
5668 # unrelated tables. Relatedness makes generating the documentation easier.
5670 # 2) Conflicting. It may be that there will eventually be name clashes, with
5671 # the same name meaning different things. For a while, there actually were
5672 # conflicts, but they have so far been resolved by changing Perl's or
5673 # Unicode's definitions to match the other, but when this code was written,
5674 # it wasn't clear that that was what was going to happen. (Unicode changed
5675 # because of protests during their beta period.) Name clashes are warned
5676 # about during compilation, and the documentation. The generated tables
5677 # are sane, free of name clashes, because the code suppresses the Perl
5678 # version. But manual intervention to decide what the actual behavior
5679 # should be may be required should this happen. The introductory comments
5680 # have more to say about this.
5682 sub standardize { return main::standardize($_[0]); } # Package-local shorthand: passes only its first argument to main::standardize()
5683 sub trace { return main::trace(@_); } # Package-local shorthand: passes every argument on to main::trace() and returns its result
5688 main::setup_package();
5691 # The leader table of this one; initially $self.
5692 main::set_access('leader', \%leader, 'r');
5695 # An array of any tables that have this one as their leader
5696 main::set_access('equivalents', \%equivalents, 'readable_array');
5699 # The parent table to this one, initially $self. This allows us to
5700 # distinguish between equivalent tables that are related, and those which
5701 # may not be, but share the same output file because they match the exact
5702 # same set of code points in the current Unicode release.
5703 main::set_access('parent', \%parent, 'r');
5706 # An array of any tables that have this one as their parent
5707 main::set_access('children', \%children, 'readable_array');
5710 # Array of any tables that would have the same name as this one with
5711 # a different meaning. This is used for the generated documentation.
5712 main::set_access('conflicting', \%conflicting, 'readable_array');
5715 # Set in the constructor for tables that are expected to match all code
5717 main::set_access('matches_all', \%matches_all, 'r');
5724 # The property for which this table is a listing of property values.
5725 my $property = delete $args{'_Property'};
5727 my $name = delete $args{'Name'};
5728 my $full_name = delete $args{'Full_Name'};
5729 $full_name = $name if ! defined $full_name;
5732 my $initialize = delete $args{'Initialize'};
5733 my $matches_all = delete $args{'Matches_All'} || 0;
5734 # Rest of parameters passed on.
5736 my $range_list = Range_List->new(Initialize => $initialize,
5737 Owner => $property);
5739 my $complete = $full_name;
5740 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
5741 # but this helps debug if it
5743 # The complete name for a match table includes its property in a
5744 # compound form 'property=table', except if the property is the
5745 # pseudo-property, perl, in which case it is just the single form,
5746 # 'table' (If you change the '=' must also change the ':' in lots of
5747 # places in this program that assume an equal sign)
5748 $complete = $property->full_name . "=$complete" if $property != $perl;
5751 my $self = $class->SUPER::new(%args,
5753 Complete_Name => $complete,
5754 Full_Name => $full_name,
5755 _Property => $property,
5756 _Range_List => $range_list,
5758 my $addr = main::objaddr $self;
5760 $conflicting{$addr} = [ ];
5761 $equivalents{$addr} = [ ];
5762 $children{$addr} = [ ];
5763 $matches_all{$addr} = $matches_all;
5764 $leader{$addr} = $self;
5765 $parent{$addr} = $self;
5770 # See this program's beginning comment block about overloading these.
5773 qw("") => "_operator_stringify",
5777 return if $self->carp_if_locked;
5785 return $self->_range_list + $other;
5791 return $self->_range_list & $other;
5797 return if $self->carp_if_locked;
5799 my $addr = main::objaddr $self;
5803 # Change the range list of this table to be the
5805 $self->_set_range_list($self->_range_list
5808 else { # $other is just a simple value
5809 $self->add_range($other, $other);
5813 '-' => sub { my $self = shift;
5815 my $reversed = shift;
5818 Carp::my_carp_bug("Can't cope with a "
5820 . " being the first parameter in a '-'. Subtraction ignored.");
5824 return $self->_range_list - $other;
5826 '~' => sub { my $self = shift;
5827 return ~ $self->_range_list;
5831 sub _operator_stringify {
5834 my $name = $self->complete_name;
5835 return "Table '$name'";
5839 # Add a synonym for this table. See the comments in the base class
5843 # Rest of parameters passed on.
5845 $self->SUPER::add_alias($name, $self, @_);
5849 sub add_conflicting {
5850 # Add the name of some other object to the list of ones that name
5851 # clash with this match table.
5854 my $conflicting_name = shift; # The name of the conflicting object
5855 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
5856 my $conflicting_object = shift; # Optional, the conflicting object
5857 # itself. This is used to
5858 # disambiguate the text if the input
5859 # name is identical to any of the
5860 # aliases $self is known by.
5861 # Sometimes the conflicting object is
5862 # merely hypothetical, so this has to
5863 # be an optional parameter.
5864 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5866 my $addr = main::objaddr $self;
5868 # Check if the conflicting name is exactly the same as any existing
5869 # alias in this table (as long as there is a real object there to
5870 # disambiguate with).
5871 if (defined $conflicting_object) {
5872 foreach my $alias ($self->aliases) {
5873 if ($alias->name eq $conflicting_name) {
5875 # Here, there is an exact match. This results in
5876 # ambiguous comments, so disambiguate by changing the
5877 # conflicting name to its object's complete equivalent.
5878 $conflicting_name = $conflicting_object->complete_name;
5884 # Convert to the \p{...} final name
5885 $conflicting_name = "\\$p" . "{$conflicting_name}";
5888 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
5890 push @{$conflicting{$addr}}, $conflicting_name;
5895 sub is_equivalent_to {
5896 # Return boolean of whether or not the other object is a table of this
5897 # type and has been marked equivalent to this one.
5901 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5903 return 0 if ! defined $other; # Can happen for incomplete early
5905 unless ($other->isa(__PACKAGE__)) {
5906 my $ref_other = ref $other;
5907 my $ref_self = ref $self;
5908 Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
5912 # Two tables are equivalent if they have the same leader.
5913 return $leader{main::objaddr $self}
5914 == $leader{main::objaddr $other};
5918 sub matches_identically_to {
5919 # Return a boolean as to whether or not two tables match identical
5920 # sets of code points.
5924 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5926 unless ($other->isa(__PACKAGE__)) {
5927 my $ref_other = ref $other;
5928 my $ref_self = ref $self;
5929 Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
5933 # These are ordered in increasing real time to figure out (at least
5934 # until a patch changes that and doesn't change this)
5935 return 0 if $self->max != $other->max;
5936 return 0 if $self->min != $other->min;
5937 return 0 if $self->range_count != $other->range_count;
5938 return 0 if $self->count != $other->count;
5940 # Here they could be identical because all the tests above passed.
5941 # The loop below is somewhat simpler since we know they have the same
5942 # number of elements. Compare range by range, until reach the end or
5943 # find something that differs.
5944 my @a_ranges = $self->_range_list->ranges;
5945 my @b_ranges = $other->_range_list->ranges;
5946 for my $i (0 .. @a_ranges - 1) {
5947 my $a = $a_ranges[$i];
5948 my $b = $b_ranges[$i];
5949 trace "self $a; other $b" if main::DEBUG && $to_trace;
5950 return 0 if $a->start != $b->start || $a->end != $b->end;
5955 sub set_equivalent_to {
5956 # Set $self equivalent to the parameter table.
5957 # The required Related => 'x' parameter is a boolean indicating
5958 # whether these tables are related or not. If related, $other becomes
5959 # the 'parent' of $self; if unrelated it becomes the 'leader'
5961 # Related tables share all characteristics except names; equivalents
5962 # not quite so many.
5963 # If they are related, one must be a perl extension. This is because
5964 # we can't guarantee that Unicode won't change one or the other in a
5965 # later release even if they are identical now.
5971 my $related = delete $args{'Related'};
5973 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5975 return if ! defined $other; # Keep on going; happens in some early
5978 if (! defined $related) {
5979 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
5983 # If already are equivalent, no need to re-do it; if subroutine
5984 # returns null, it found an error, also do nothing
5985 my $are_equivalent = $self->is_equivalent_to($other);
5986 return if ! defined $are_equivalent || $are_equivalent;
5988 my $current_leader = ($related)
5989 ? $parent{main::objaddr $self}
5990 : $leader{main::objaddr $self};
5993 ! $other->perl_extension
5994 && ! $current_leader->perl_extension)
5996 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6000 my $leader = main::objaddr $current_leader;
6001 my $other_addr = main::objaddr $other;
6003 # Any tables that are equivalent to or children of this table must now
6004 # instead be equivalent to (or children of) the new leader (parent),
6005 # still equivalent. The equivalency includes their matches_all info,
6006 # and for related tables, their status
6007 # All related tables are of necessity equivalent, but the converse
6008 # isn't necessarily true
6009 my $status = $other->status;
6010 my $status_info = $other->status_info;
6011 my $matches_all = $matches_all{$other_addr}; # index by $other's objaddr, not the literal string 'other_addr'
6012 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6013 next if $table == $other;
6014 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6016 my $table_addr = main::objaddr $table;
6017 $leader{$table_addr} = $other;
6018 $matches_all{$table_addr} = $matches_all;
6019 $self->_set_range_list($other->_range_list);
6020 push @{$equivalents{$other_addr}}, $table;
6022 $parent{$table_addr} = $other;
6023 push @{$children{$other_addr}}, $table;
6024 $table->set_status($status, $status_info);
6028 # Now that we've declared these to be equivalent, any changes to one
6029 # of the tables would invalidate that equivalency.
6035 sub add_range { # Add a range to the list for this table.
6037 # Rest of parameters passed on
6039 return if $self->carp_if_locked;
6040 return $self->_range_list->add_range(@_);
6043 sub pre_body { # Does nothing for match tables.
6047 sub append_to_body { # Does nothing for match tables.
6053 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6055 return $self->SUPER::write(2); # 2 tab stops
6058 sub set_final_comment {
6059 # This creates a comment for the file that is to hold the match table
6060 # $self. It is somewhat convoluted to make the English read nicely,
6061 # but, heh, it's just a comment.
6062 # This should be called only with the leader match table of all the
6063 # ones that share the same file. It lists all such tables, ordered so
6064 # that related ones are together.
6066 my $leader = shift; # Should only be called on the leader table of
6067 # an equivalent group
6068 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6070 my $addr = main::objaddr $leader;
6072 if ($leader{$addr} != $leader) {
6073 Carp::my_carp_bug(<<END
6074 set_final_comment() must be called on a leader table, which $leader is not.
6075 It is equivalent to $leader{$addr}. No comment created
6081 # Get the number of code points matched by each of the tables in this
6082 # file, and add underscores for clarity.
6083 my $count = $leader->count;
6084 my $string_count = main::clarify_number($count);
6086 my $loose_count = 0; # how many aliases loosely matched
6087 my $compound_name = ""; # ? Are any names compound?, and if so, an
6089 my $properties_with_compound_names = 0; # count of these
6092 my %flags; # The status flags used in the file
6093 my $total_entries = 0; # number of entries written in the comment
6094 my $matches_comment = ""; # The portion of the comment about the
6096 my @global_comments; # List of all the tables' comments that are
6097 # there before this routine was called.
6099 # Get list of all the parent tables that are equivalent to this one
6100 # (including itself).
6101 my @parents = grep { $parent{main::objaddr $_} == $_ }
6102 main::uniques($leader, @{$equivalents{$addr}});
6103 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6106 for my $parent (@parents) {
6108 my $property = $parent->property;
6110 # Special case 'N' tables in properties with two match tables when
6111 # the other is a 'Y' one. These are likely to be binary tables,
6112 # but not necessarily. In either case, \P{} will match the
6113 # complement of \p{}, and so if something is a synonym of \p, the
6114 # complement of that something will be the synonym of \P. This
6115 # would be true of any property with just two match tables, not
6116 # just those whose values are Y and N; but that would require a
6117 # little extra work, and there are none such so far in Unicode.
6118 my $perl_p = 'p'; # which is it? \p{} or \P{}
6119 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6121 if (scalar $property->tables == 2
6122 && $parent == $property->table('N')
6123 && defined (my $yes = $property->table('Y')))
6125 my $yes_addr = main::objaddr $yes;
6127 = grep { $_->property == $perl }
6130 $parent{$yes_addr}->children);
6132 # But these synonyms are \P{}, not \p{}
6136 my @description; # Will hold the table description
6137 my @note; # Will hold the table notes.
6138 my @conflicting; # Will hold the table conflicts.
6140 # Look at the parent, any yes synonyms, and all the children
6141 for my $table ($parent,
6143 @{$children{main::objaddr $parent}})
6145 my $table_addr = main::objaddr $table;
6146 my $table_property = $table->property;
6148 # Tables are separated by a blank line to create a grouping.
6149 $matches_comment .= "\n" if $matches_comment;
6151 # The table is named based on the property and value
6152 # combination it is for, like script=greek. But there may be
6153 # a number of synonyms for each side, like 'sc' for 'script',
6154 # and 'grek' for 'greek'. Any combination of these is a valid
6155 # name for this table. In this case, there are three more,
6156 # 'sc=grek', 'sc=greek', and 'script=grek'. Rather than
6157 # listing all possible combinations in the comment, we make
6158 # sure that each synonym occurs at least once, and add
6159 # commentary that the other combinations are possible.
6160 my @property_aliases = $table_property->aliases;
6161 my @table_aliases = $table->aliases;
6163 Carp::my_carp_bug("$table doesn't have any names. Proceeding anyway.") unless @table_aliases;
6165 # The alias lists above are already ordered in the order we
6166 # want to output them. To ensure that each synonym is listed,
6167 # we must use the max of the two numbers.
6168 my $listed_combos = main::max(scalar @table_aliases,
6169 scalar @property_aliases);
6170 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6172 my $property_had_compound_name = 0;
6174 for my $i (0 .. $listed_combos - 1) {
6177 # The current alias for the property is the next one on
6178 # the list, or if beyond the end, start over. Similarly
6179 # for the table (\p{prop=table})
6180 my $property_alias = $property_aliases
6181 [$i % @property_aliases]->name;
6182 my $table_alias_object = $table_aliases
6183 [$i % @table_aliases];
6184 my $table_alias = $table_alias_object->name;
6185 my $loose_match = $table_alias_object->loose_match;
6187 if ($table_alias !~ /\D/) { # Clarify large numbers.
6188 $table_alias = main::clarify_number($table_alias)
6191 # Add a comment for this alias combination
6192 my $current_match_comment;
6193 if ($table_property == $perl) {
6194 $current_match_comment = "\\$perl_p"
6198 $current_match_comment
6199 = "\\p{$property_alias=$table_alias}";
6200 $property_had_compound_name = 1;
6203 # Flag any abnormal status for this table.
6204 my $flag = $property->status
6206 || $table_alias_object->status;
6207 $flags{$flag} = $status_past_participles{$flag} if $flag;
6211 # Pretty up the comment. Note the \b; it says don't make
6212 # this line a continuation.
6213 $matches_comment .= sprintf("\b%-1s%-s%s\n",
6216 $current_match_comment);
6217 } # End of generating the entries for this table.
6219 # Save these for output after this group of related tables.
6220 push @description, $table->description;
6221 push @note, $table->note;
6222 push @conflicting, $table->conflicting;
6224 # Compute an alternate compound name using the final property
6225 # synonym and the first table synonym with a colon instead of
6226 # the equal sign used elsewhere.
6227 if ($property_had_compound_name) {
6228 $properties_with_compound_names ++;
6229 if (! $compound_name || @property_aliases > 1) {
6230 $compound_name = $property_aliases[-1]->name
6232 . $table_aliases[0]->name;
6235 } # End of looping through all children of this table
6237 # Here have assembled in $matches_comment all the related tables
6238 # to the current parent (preceded by the same info for all the
6239 # previous parents). Put out information that applies to all of
6240 # the current family.
6243 # But output the conflicting information now, as it applies to
6245 my $conflicting = join ", ", @conflicting;
6247 $matches_comment .= <<END;
6249 Note that contrary to what you might expect, the above is NOT the same as
6251 $matches_comment .= "any of: " if @conflicting > 1;
6252 $matches_comment .= "$conflicting\n";
6256 $matches_comment .= "\n Meaning: "
6257 . join('; ', @description)
6261 $matches_comment .= "\n Note: "
6262 . join("\n ", @note)
6265 } # End of looping through all tables
6273 $code_points = 'single code point';
6277 $code_points = "$string_count code points";
6282 if ($total_entries <= 1) {
6285 $any_of_these = 'this'
6288 $synonyms = " any of the following regular expression constructs";
6289 $entries = 'entries';
6290 $any_of_these = 'any of these'
6294 if ($has_unrelated) {
6296 This file is for tables that are not necessarily related: To conserve
6297 resources, every table that matches the identical set of code points in this
6298 version of Unicode uses this file. Each one is listed in a separate group
6299 below. It could be that the tables will match the same set of code points in
6300 other Unicode releases, or it could be purely coincidence that they happen to
6301 be the same in Unicode $string_version, and hence may not in other versions.
6307 foreach my $flag (sort keys %flags) {
6309 '$flag' below means that this form is $flags{$flag}. Consult $pod_file.pod
6316 This file returns the $code_points in Unicode Version $string_version that
6320 $pod_file.pod should be consulted for the rules on using $any_of_these,
6321 including if adding or subtracting white space, underscore, and hyphen
6322 characters matters or doesn't matter, and other permissible syntactic
6323 variants. Upper/lower case distinctions never matter.
6326 if ($compound_name) {
6329 A colon can be substituted for the equals sign, and
6331 if ($properties_with_compound_names > 1) {
6333 within each group above,
6336 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6338 # Note the \b below, it says don't make that line a continuation.
6340 anything to the left of the equals (or colon) can be combined with anything to
6341 the right. Thus, for example,
6347 # And append any comment(s) from the actual tables. They are all
6348 # gathered here, so may not read all that well.
6349 $comment .= "\n" . join "\n\n", @global_comments if @global_comments;
6351 if ($count) { # The format differs if no code points, and needs no
6352 # explanation in that case
6355 The format of the lines of this file is:
6358 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6359 STOP is the ending point, or if omitted, the range has just one code point.
6361 if ($output_range_counts) {
6363 Numbers in comments in [brackets] indicate how many code points are in the
6369 $leader->set_comment(main::join_lines($comment));
6373 # Accessors for the underlying list
6375 get_valid_code_point
6376 get_invalid_code_point
6384 return $self->_range_list->$sub(@_);
6387 } # End closure for Match_Table
6391 # The Property class represents a Unicode property, or the $perl
6392 # pseudo-property. It contains a map table initialized empty at construction
6393 # time, and for properties accessible through regular expressions, various
6394 # match tables, created through the add_match_table() method, and referenced
6395 # by the table('NAME') or tables() methods, the latter returning a list of all
6396 # of the match tables. Otherwise table operations implicitly are for the map
6399 # Most of the data in the property is actually about its map table, so it
6400 # mostly just uses that table's accessors for most methods. The two could
6401 # have been combined into one object, but for clarity because of their
6402 # differing semantics, they have been kept separate. It could be argued that
6403 # the 'file' and 'directory' fields should be kept with the map table.
6405 # Each property has a type. This can be set in the constructor, or in the
6406 # set_type accessor, but mostly it is figured out by the data. Every property
6407 # starts with unknown type, overridden by a parameter to the constructor, or
6408 # as match tables are added, or ranges added to the map table, the data is
6409 # inspected, and the type changed. After the table is mostly or entirely
6410 # filled, compute_type() should be called to finalize the analysis.
6412 # There are very few operations defined. One can safely remove a range from
6413 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
6414 # table to this one, replacing any in the intersection of the two.
6416 sub standardize { return main::standardize($_[0]); }
6417 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
6421 # This hash will contain as keys, all the aliases of all properties, and
6422 # as values, pointers to their respective property objects. This allows
6423 # quick look-up of a property from any of its names.
6424 my %alias_to_property_of;
6426 sub dump_alias_to_property_of {
6429 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
6434 # This is a package subroutine, not called as a method.
6435 # If the single parameter is a literal '*' it returns a list of all
6436 # defined properties.
6437 # Otherwise, the single parameter is a name, and it returns a pointer
6438 # to the corresponding property object, or undef if none.
6440 # Properties can have several different names. The 'standard' form of
6441 # each of them is stored in %alias_to_property_of as they are defined.
6442 # But it's possible that this subroutine will be called with some
6443 # variant, so if the initial lookup fails, it is repeated with the
6444 # standardized form of the input name. If found, besides returning the
6445 # result, the input name is added to the list so future calls won't
6446 # have to do the conversion again.
6450 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6452 if (! defined $name) {
6453 Carp::my_carp_bug("Undefined input property. No action taken.");
6457 return main::uniques(values %alias_to_property_of) if $name eq '*';
6459 # Return cached result if have it.
6460 my $result = $alias_to_property_of{$name};
6461 return $result if defined $result;
6463 # Convert the input to standard form.
6464 my $standard_name = standardize($name);
6466 $result = $alias_to_property_of{$standard_name};
6467 return unless defined $result; # Don't cache undefs
6469 # Cache the result before returning it.
6470 $alias_to_property_of{$name} = $result;
6475 main::setup_package();
6478 # A pointer to the map table object for this property
6479 main::set_access('map', \%map);
6482 # The property's full name. This is a duplicate of the copy kept in the
6483 # map table, but is needed because stringify needs it during
6484 # construction of the map table, and then would have a chicken before egg
6486 main::set_access('full_name', \%full_name, 'r');
6489 # This hash will contain as keys, all the aliases of any match tables
6490 # attached to this property, and as values, the pointers to their
6491 # respective tables. This allows quick look-up of a table from any of its
6493 main::set_access('table_ref', \%table_ref);
6496 # The type of the property, $ENUM, $BINARY, etc
6497 main::set_access('type', \%type, 'r');
6500 # The filename where the map table will go (if actually written).
6501 # Normally defaulted, but can be overridden.
6502 main::set_access('file', \%file, 'r', 's');
6505 # The directory where the map table will go (if actually written).
6506 # Normally defaulted, but can be overridden.
6507 main::set_access('directory', \%directory, 's');
6509 my %pseudo_map_type;
6510 # This is used to affect the calculation of the map types for all the
6511 # ranges in the table. It should be set to one of the values that signify
6512 # to alter the calculation.
6513 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
6515 my %has_only_code_point_maps;
6516 # A boolean used to help in computing the type of data in the map table.
6517 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
6520 # A list of the first few distinct mappings this property has. This is
6521 # used to disambiguate between binary and enum property types, so don't
6522 # have to keep more than three.
6523 main::set_access('unique_maps', \%unique_maps);
6526 # The only required parameter is the positionally first, name. All
6527 # other parameters are key => value pairs. See the documentation just
6528 # above for the meanings of the ones not passed directly on to the map
6529 # table constructor.
6532 my $name = shift || "";
6534 my $self = property_ref($name);
6535 if (defined $self) {
6536 my $options_string = join ", ", @_;
6537 $options_string = ". Ignoring options $options_string" if $options_string;
6538 Carp::my_carp("$self is already in use. Using existing one$options_string;");
6544 $self = bless \do { my $anonymous_scalar }, $class;
6545 my $addr = main::objaddr $self;
6547 $directory{$addr} = delete $args{'Directory'};
6548 $file{$addr} = delete $args{'File'};
6549 $full_name{$addr} = delete $args{'Full_Name'} || $name;
6550 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
6551 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
6552 # Rest of parameters passed on.
6554 $has_only_code_point_maps{$addr} = 1;
6555 $table_ref{$addr} = { };
6556 $unique_maps{$addr} = { };
6558 $map{$addr} = Map_Table->new($name,
6559 Full_Name => $full_name{$addr},
6560 _Alias_Hash => \%alias_to_property_of,
6566 # See this program's beginning comment block about overloading the copy
6567 # constructor. Few operations are defined on properties, but a couple are
6568 # useful. It is safe to take the inverse of a property, and to remove a
6569 # single code point from it.
6572 qw("") => "_operator_stringify",
6573 "." => \&main::_operator_dot,
6574 '==' => \&main::_operator_equal,
6575 '!=' => \&main::_operator_not_equal,
6576 '=' => sub { return shift },
6577 '-=' => "_minus_and_equal",
6580 sub _operator_stringify {
6581 return "Property '" . shift->full_name . "'";
6584 sub _minus_and_equal {
6585 # Remove a single code point from the map table of a property.
6589 my $reversed = shift;
6590 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6593 Carp::my_carp_bug("Can't cope with a "
6595 . " argument to '-='. Subtraction ignored.");
6598 elsif ($reversed) { # Shouldnt happen in a -=, but just in case
6599 Carp::my_carp_bug("Can't cope with a "
6601 . " being the first parameter in a '-='. Subtraction ignored.");
6605 $map{main::objaddr $self}->delete_range($other, $other);
6610 sub add_match_table {
6611 # Add a new match table for this property, with name given by the
6612 # parameter. It returns a pointer to the table.
6618 my $addr = main::objaddr $self;
6620 my $table = $table_ref{$addr}{$name};
6621 my $standard_name = main::standardize($name);
6623 || (defined ($table = $table_ref{$addr}{$standard_name})))
6625 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
6626 $table_ref{$addr}{$name} = $table;
6631 # See if this is a perl extension, if not passed in.
6632 my $perl_extension = delete $args{'Perl_Extension'};
6634 = $self->perl_extension if ! defined $perl_extension;
6636 $table = Match_Table->new(
6638 Perl_Extension => $perl_extension,
6639 _Alias_Hash => $table_ref{$addr},
6642 # gets property's status by default
6643 Status => $self->status,
6644 _Status_Info => $self->status_info,
6646 Internal_Only_Warning => 1); # Override any
6648 return unless defined $table;
6651 # Save the names for quick look up
6652 $table_ref{$addr}{$standard_name} = $table;
6653 $table_ref{$addr}{$name} = $table;
6655 # Perhaps we can figure out the type of this property based on the
6656 # fact of adding this match table. First, string properties don't
6657 # have match tables; second, a binary property can't have 3 match
6659 if ($type{$addr} == $UNKNOWN) {
6660 $type{$addr} = $NON_STRING;
6662 elsif ($type{$addr} == $STRING) {
6663 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
6664 $type{$addr} = $NON_STRING;
6666 elsif ($type{$addr} != $ENUM) {
6667 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
6668 && $type{$addr} == $BINARY)
6670 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
6671 $type{$addr} = $ENUM;
6679 # Return a pointer to the match table (with name given by the
6680 # parameter) associated with this property; undef if none.
6684 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6686 my $addr = main::objaddr $self;
6688 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
6690 # If quick look-up failed, try again using the standard form of the
6691 # input name. If that succeeds, cache the result before returning so
6692 # won't have to standardize this input name again.
6693 my $standard_name = main::standardize($name);
6694 return unless defined $table_ref{$addr}{$standard_name};
6696 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
6697 return $table_ref{$addr}{$name};
6701 # Return a list of pointers to all the match tables attached to this
6704 return main::uniques(values %{$table_ref{main::objaddr shift}});
6708 # Returns the directory the map table for this property should be
6709 # output in. If a specific directory has been specified, that has
6710 # priority; 'undef' is returned if the type isn't defined;
6711 # or $map_directory for everything else.
6713 my $addr = main::objaddr shift;
6715 return $directory{$addr} if defined $directory{$addr};
6716 return undef if $type{$addr} == $UNKNOWN;
6717 return $map_directory;
6721 # Return the name that is used to both:
6722 # 1) Name the file that the map table is written to.
6723 # 2) The name of swash related stuff inside that file.
6724 # The reason for this is that the Perl core historically has used
6725 # certain names that aren't the same as the Unicode property names.
6726 # To continue using these, $file is hard-coded in this file for those,
6727 # but otherwise the standard name is used. This is different from the
6728 # external_name, so that the rest of the files, like in lib can use
6729 # the standard name always, without regard to historical precedent.
6732 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6734 my $addr = main::objaddr $self;
6736 return $file{$addr} if defined $file{$addr};
6737 return $map{$addr}->external_name;
6740 sub to_create_match_tables {
6741 # Returns a boolean as to whether or not match tables should be
6742 # created for this property.
6745 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6747 # The whole point of this pseudo property is match tables.
6748 return 1 if $self == $perl;
6750 my $addr = main::objaddr $self;
6752 # Don't generate tables of code points that match the property values
6753 # of a string property. Such a list would most likely have many
6754 # property values, each with just one or very few code points mapping
6756 return 0 if $type{$addr} == $STRING;
6758 # Don't generate anything for unimplemented properties.
6759 return 0 if grep { $self->complete_name eq $_ }
6760 @unimplemented_properties;
6765 sub property_add_or_replace_non_nulls {
6766 # This adds the mappings in the property $other to $self. Non-null
6767 # mappings from $other override those in $self. It essentially merges
6768 # the two properties, with the second having priority except for null
6773 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6775 if (! $other->isa(__PACKAGE__)) {
6776 Carp::my_carp_bug("$other should be a "
6784 return $map{main::objaddr $self}->
6785 map_add_or_replace_non_nulls($map{main::objaddr $other});
6789 # Set the type of the property. Mostly this is figured out by the
6790 # data in the table. But this is used to set it explicitly. The
6791 # reason it is not a standard accessor is that when setting a binary
6792 # property, we need to make sure that all the true/false aliases are
6793 # present, as they were omitted in early Unicode releases.
6797 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6799 if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
6800 Carp::my_carp("Unrecognized type '$type'. Type not set");
6804 $type{main::objaddr $self} = $type;
6805 return if $type != $BINARY;
6807 my $yes = $self->table('Y');
6808 $yes = $self->table('Yes') if ! defined $yes;
6809 $yes = $self->add_match_table('Y') if ! defined $yes;
6810 $yes->add_alias('Yes');
6811 $yes->add_alias('T');
6812 $yes->add_alias('True');
6814 my $no = $self->table('N');
6815 $no = $self->table('No') if ! defined $no;
6816 $no = $self->add_match_table('N') if ! defined $no;
6817 $no->add_alias('No');
6818 $no->add_alias('F');
6819 $no->add_alias('False');
6824 # Add a map to the property's map table. This also keeps
6825 # track of the maps so that the property type can be determined from
6829 my $start = shift; # First code point in range
6830 my $end = shift; # Final code point in range
6831 my $map = shift; # What the range maps to.
6832 # Rest of parameters passed on.
6834 my $addr = main::objaddr $self;
6836 # If haven't the type of the property, gather information to figure it
6838 if ($type{$addr} == $UNKNOWN) {
6840 # If the map contains an interior blank or dash, or most other
6841 # nonword characters, it will be a string property. This
6842 # heuristic may actually miss some string properties. If so, they
6843 # may need to have explicit set_types called for them. This
6844 # happens in the Unihan properties.
6845 if ($map =~ / (?<= . ) [ -] (?= . ) /x
6846 || $map =~ / [^\w.\/\ -] /x)
6848 $self->set_type($STRING);
6850 # $unique_maps is used for disambiguating between ENUM and
6851 # BINARY later; since we know the property is not going to be
6852 # one of those, no point in keeping the data around
6853 undef $unique_maps{$addr};
6857 # Not necessarily a string. The final decision has to be
6858 # deferred until all the data are in. We keep track of if all
6859 # the values are code points for that eventual decision.
6860 $has_only_code_point_maps{$addr} &=
6861 $map =~ / ^ $code_point_re $/x;
6863 # For the purposes of disambiguating between binary and other
6864 # enumerations at the end, we keep track of the first three
6865 # distinct property values. Once we get to three, we know
6866 # it's not going to be binary, so no need to track more.
6867 if (scalar keys %{$unique_maps{$addr}} < 3) {
6868 $unique_maps{$addr}{main::standardize($map)} = 1;
6873 # Add the mapping by calling our map table's method
6874 return $map{$addr}->add_map($start, $end, $map, @_);
6878 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
6879 # should be called after the property is mostly filled with its maps.
6880 # We have been keeping track of what the property values have been,
6881 # and now have the necessary information to figure out the type.
6884 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6886 my $addr = main::objaddr($self);
6888 my $type = $type{$addr};
6890 # If already have figured these out, no need to do so again, but we do
6891 # a double check on ENUMS to make sure that a string property hasn't
6892 # improperly been classified as an ENUM, so continue on with those.
6893 return if $type == $STRING || $type == $BINARY;
6895 # If every map is to a code point, is a string property.
6896 if ($type == $UNKNOWN
6897 && ($has_only_code_point_maps{$addr}
6898 || (defined $map{$addr}->default_map
6899 && $map{$addr}->default_map eq "")))
6901 $self->set_type($STRING);
6905 # Otherwise, it is to some sort of enumeration. (The case where
6906 # it is a Unicode miscellaneous property, and treated like a
6907 # string in this program is handled in add_map()). Distinguish
6908 # between binary and some other enumeration type. Of course, if
6909 # there are more than two values, it's not binary. But more
6910 # subtle is the test that the default mapping is defined means it
6911 # isn't binary. This in fact may change in the future if Unicode
6912 # changes the way its data is structured. But so far, no binary
6913 # properties ever have @missing lines for them, so the default map
6914 # isn't defined for them. The few properties that are two-valued
6915 # and aren't considered binary have the default map defined
6916 # starting in Unicode 5.0, when the @missing lines appeared; and
6917 # this program has special code to put in a default map for them
6918 # for earlier than 5.0 releases.
6920 || scalar keys %{$unique_maps{$addr}} > 2
6921 || defined $self->default_map)
6923 my $tables = $self->tables;
6924 my $count = $self->count;
6925 if ($verbosity && $count > 500 && $tables/$count > .1) {
6926 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
6928 $self->set_type($ENUM);
6931 $self->set_type($BINARY);
6934 undef $unique_maps{$addr}; # Garbage collect
6938 # Most of the accessors for a property actually apply to its map table.
6939 # Set up accessor functions for those, referring to %map
6986 # 'property' above is for symmetry, so that one can take
6987 # the property of a property and get itself, and so don't
6988 # have to distinguish between properties and tables in
6995 return $map{main::objaddr $self}->$sub(@_);
7005 # Returns lines of the input joined together, so that they can be folded
7007 # This causes continuation lines to be joined together into one long line
7008 # for folding. A continuation line is any line that doesn't begin with a
7009 # space or "\b" (the latter is stripped from the output). This is so
7010 # lines can be in a HERE document so as to fit nicely in the terminal
7011 # width, but be joined together in one long line, and then folded with
7012 # indents, '#' prefixes, etc, properly handled.
7013 # A blank separates the joined lines except if there is a break; an extra
7014 # blank is inserted after a period ending a line.
7016 # Initialize the return with the first line.
7017 my ($return, @lines) = split "\n", shift;
7019 # If the first line is null, it was an empty line, add the \n back in
7020 $return = "\n" if $return eq "";
7022 # Now join the remainder of the physical lines.
7023 for my $line (@lines) {
7025 # An empty line means wanted a blank line, so add two \n's to get that
7026 # effect, and go to the next line.
7027 if (length $line == 0) {
7032 # Look at the last character of what we have so far.
7033 my $previous_char = substr($return, -1, 1);
7035 # And at the next char to be output.
7036 my $next_char = substr($line, 0, 1);
7038 if ($previous_char ne "\n") {
7040 # Here didn't end with a nl. If the next char is a blank or \b, it
7041 # means that here there is a break anyway. So add a nl to the
7043 if ($next_char eq " " || $next_char eq "\b") {
7044 $previous_char = "\n";
7045 $return .= $previous_char;
7048 # Add an extra space after periods.
7049 $return .= " " if $previous_char eq '.';
7052 # Here $previous_char is still the latest character to be output. If
7053 # it isn't a nl, it means that the next line is to be a continuation
7054 # line, with a blank inserted between them.
7055 $return .= " " if $previous_char ne "\n";
7058 substr($line, 0, 1) = "" if $next_char eq "\b";
7060 # And append this next line.
7067 sub simple_fold($;$$$) {
7068 # Returns a string of the input (string or an array of strings) folded
7069 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7071 # This is tailored for the kind of text written by this program,
7072 # especially the pod file, which can have very long names with
7073 # underscores in the middle, or words like AbcDefgHij.... We allow
7074 # breaking in the middle of such constructs if the line won't fit
7075 # otherwise. The break in such cases will come either just after an
7076 # underscore, or just before one of the Capital letters.
7078 local $to_trace = 0 if main::DEBUG;
7081 my $prefix = shift; # Optional string to prepend to each output
7083 $prefix = "" unless defined $prefix;
7085 my $hanging_indent = shift; # Optional number of spaces to indent
7086 # continuation lines
7087 $hanging_indent = 0 unless $hanging_indent;
7089 my $right_margin = shift; # Optional number of spaces to narrow the
7091 $right_margin = 0 unless defined $right_margin;
7093 # Call carp with the 'nofold' option to prevent it from trying to call us
7095 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7097 # The space available doesn't include what's automatically prepended
7098 # to each line, or what's reserved on the right.
7099 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7100 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7102 if (DEBUG && $hanging_indent >= $max) {
7103 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7104 $hanging_indent = 0;
7107 # First, split into the current physical lines.
7109 if (ref $line) { # Better be an array, because not bothering to
7111 foreach my $line (@{$line}) {
7112 push @line, split /\n/, $line;
7116 @line = split /\n/, $line;
7119 #local $to_trace = 1 if main::DEBUG;
7120 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7122 # Look at each current physical line.
7123 for (my $i = 0; $i < @line; $i++) {
7124 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7125 #local $to_trace = 1 if main::DEBUG;
7126 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7128 # Remove prefix, because will be added back anyway, don't want
7130 $line[$i] =~ s/^$prefix//;
7132 # Remove trailing space
7133 $line[$i] =~ s/\s+\Z//;
7135 # If the line is too long, fold it.
7136 if (length $line[$i] > $max) {
7139 # Here needs to fold. Save the leading space in the line for
7141 $line[$i] =~ /^ ( \s* )/x;
7142 my $leading_space = $1;
7143 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7145 # If character at final permissible position is white space,
7146 # fold there, which will delete that white space
7147 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7148 $remainder = substr($line[$i], $max);
7149 $line[$i] = substr($line[$i], 0, $max - 1);
7153 # Otherwise fold at an acceptable break char closest to
7154 # the max length. Look at just the maximal initial
7155 # segment of the line
7156 my $segment = substr($line[$i], 0, $max - 1);
7158 /^ ( .{$hanging_indent} # Don't look before the
7160 \ * # Don't look in leading
7161 # blanks past the indent
7162 [^ ] .* # Find the right-most
7163 (?: # acceptable break:
7164 [ \s = ] # space or equal
7165 | - (?! [.0-9] ) # or non-unary minus.
7166 ) # $1 includes the character
7169 # Split into the initial part that fits, and remaining
7171 $remainder = substr($line[$i], length $1);
7173 trace $line[$i] if DEBUG && $to_trace;
7174 trace $remainder if DEBUG && $to_trace;
7177 # If didn't find a good breaking spot, see if there is a
7178 # not-so-good breaking spot. These are just after
7179 # underscores or where the case changes from lower to
7180 # upper. Use \a as a soft hyphen, but give up
7181 # and don't break the line if there is actually a \a
7182 # already in the input. We use an ascii character for the
7183 # soft-hyphen to avoid any attempt by miniperl to try to
7184 # access the files that this program is creating.
7185 elsif ($segment !~ /\a/
7186 && ($segment =~ s/_/_\a/g
7187 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7189 # Here were able to find at least one place to insert
7190 # our substitute soft hyphen. Find the right-most one
7191 # and replace it by a real hyphen.
7192 trace $segment if DEBUG && $to_trace;
7194 rindex($segment, "\a"),
7197 # Then remove the soft hyphen substitutes.
7198 $segment =~ s/\a//g;
7199 trace $segment if DEBUG && $to_trace;
7201 # And split into the initial part that fits, and
7202 # remainder of the line
7203 my $pos = rindex($segment, '-');
7204 $remainder = substr($line[$i], $pos);
7205 trace $remainder if DEBUG && $to_trace;
7206 $line[$i] = substr($segment, 0, $pos + 1);
7210 # Here we know if we can fold or not. If we can, $remainder
7211 # is what remains to be processed in the next iteration.
7212 if (defined $remainder) {
7213 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7215 # Insert the folded remainder of the line as a new element
7216 # of the array. (It may still be too long, but we will
7217 # deal with that next time through the loop.) Omit any
7218 # leading space in the remainder.
7219 $remainder =~ s/^\s+//;
7220 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7222 # But then indent by whichever is larger of:
7223 # 1) the leading space on the input line;
7224 # 2) the hanging indent.
7225 # This preserves indentation in the original line.
7226 my $lead = ($leading_space)
7227 ? length $leading_space
7229 $lead = max($lead, $hanging_indent);
7230 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7234 # Ready to output the line. Get rid of any trailing space
7235 # And prefix by the required $prefix passed in.
7236 $line[$i] =~ s/\s+$//;
7237 $line[$i] = "$prefix$line[$i]\n";
7238 } # End of looping through all the lines.
7240 return join "", @line;
7243 sub property_ref { # Returns a reference to a property object.
7244 return Property::property_ref(@_);
7247 sub force_unlink ($) {
7248 my $filename = shift;
7249 return unless file_exists($filename);
7250 return if CORE::unlink($filename);
7252 # We might need write permission
7253 chmod 0777, $filename;
7254 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
7259 # Given a filename and a reference to an array of lines, write the lines
7261 # Filename can be given as an arrayref of directory names
7264 my $lines_ref = shift;
7265 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7267 if (! defined $lines_ref) {
7268 Carp::my_carp("Missing lines to write parameter for $file. Writing skipped;");
7272 # Get into a single string if an array, and get rid of, in Unix terms, any
7274 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7275 $file = File::Spec->canonpath($file);
7277 # If has directories, make sure that they all exist
7278 (undef, my $directories, undef) = File::Spec->splitpath($file);
7279 File::Path::mkpath($directories) if $directories && ! -d $directories;
7281 push @files_actually_output, $file;
7285 $text = join "", @$lines_ref;
7289 Carp::my_carp("Output file '$file' is empty; writing it anyway;");
7292 force_unlink ($file);
7295 if (not open $OUT, ">", $file) {
7296 Carp::my_carp("can't open $file for output. Skipping this file: $!");
7299 print "$file written.\n" if $verbosity >= $VERBOSE;
7307 sub Standardize($) {
7308 # This converts the input name string into a standardized equivalent to
7312 unless (defined $name) {
7313 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
7317 # Remove any leading or trailing white space
7321 # Convert interior white space and hyphens into underscores.
7322 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7324 # Capitalize the letter following an underscore, and convert a sequence of
7325 # multiple underscores to a single one
7326 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7328 # And capitalize the first letter, but not for the special cjk ones.
7329 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7333 sub standardize ($) {
7334 # Returns a lower-cased standardized name, without underscores. This form
7335 # is chosen so that it can distinguish between any real versus superficial
7336 # Unicode name differences. It relies on the fact that Unicode doesn't
7337 # have interior underscores, white space, nor dashes in any
7338 # stricter-matched name. It should not be used on Unicode code point
7339 # names (the Name property), as they mostly, but not always follow these
7342 my $name = Standardize(shift);
7343 return if !defined $name;
7345 $name =~ s/ (?<= .) _ (?= . ) //xg;
7351 my $indent_increment = " " x 2;
7354 $main::simple_dumper_nesting = 0;
7357 # Like Simple Data::Dumper. Good enough for our needs. We can't use
7358 # the real thing as we have to run under miniperl.
7360 # It is designed so that on input it is at the beginning of a line,
7361 # and the final thing output in any call is a trailing ",\n".
7365 $indent = "" if ! defined $indent;
7367 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7369 # nesting level is localized, so that as the call stack pops, it goes
7370 # back to the prior value.
7371 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
7372 undef %already_output if $main::simple_dumper_nesting == 0;
7373 $main::simple_dumper_nesting++;
7374 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
7376 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7378 # Determine the indent for recursive calls.
7379 my $next_indent = $indent . $indent_increment;
7384 # Dump of scalar: just output it in quotes if not a number. To do
7385 # so we must escape certain characters, and therefore need to
7386 # operate on a copy to avoid changing the original
7388 $copy = $UNDEF unless defined $copy;
7390 # Quote non-numbers (numbers also have optional leading '-' and
7392 if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
7394 # Escape apostrophe and backslash
7395 $copy =~ s/ ( ['\\] ) /\\$1/xg;
7398 $output = "$indent$copy,\n";
7402 # Keep track of cycles in the input, and refuse to infinitely loop
7403 if (defined $already_output{main::objaddr $item}) {
7404 return "${indent}ALREADY OUTPUT: $item\n";
7406 $already_output{main::objaddr $item} = $item;
7408 if (ref $item eq 'ARRAY') {
7411 if ($main::simple_dumper_nesting > 1) {
7413 $using_brackets = 1;
7416 $using_brackets = 0;
7419 # If the array is empty, put the closing bracket on the same
7420 # line. Otherwise, recursively add each array element
7426 for (my $i = 0; $i < @$item; $i++) {
7428 # Indent array elements one level
7429 $output .= &simple_dumper($item->[$i], $next_indent);
7430 $output =~ s/\n$//; # Remove trailing nl so as to
7431 $output .= " # [$i]\n"; # add a comment giving the
7434 $output .= $indent; # Indent closing ']' to orig level
7436 $output .= ']' if $using_brackets;
7439 elsif (ref $item eq 'HASH') {
7444 # No surrounding braces at top level
7446 if ($main::simple_dumper_nesting > 1) {
7449 $body_indent = $next_indent;
7450 $next_indent .= $indent_increment;
7455 $body_indent = $indent;
7459 # Output hashes sorted alphabetically instead of apparently
7460 # random. Use caseless alphabetic sort
7461 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
7463 if ($is_first_line) {
7467 $output .= "$body_indent";
7470 # The key must be a scalar, but this recursive call quotes
7472 $output .= &simple_dumper($key);
7474 # And change the trailing comma and nl to the hash fat
7475 # comma for clarity, and so the value can be on the same
7477 $output =~ s/,\n$/ => /;
7479 # Recursively call to get the value's dump.
7480 my $next = &simple_dumper($item->{$key}, $next_indent);
7482 # If the value is all on one line, remove its indent, so
7483 # will follow the => immediately. If it takes more than
7484 # one line, start it on a new line.
7485 if ($next !~ /\n.*\n/) {
7494 $output .= "$indent},\n" if $using_braces;
7496 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
7497 $output = $indent . ref($item) . "\n";
7498 # XXX see if blessed
7500 elsif ($item->can('dump')) {
7502 # By convention in this program, objects furnish a 'dump'
7503 # method. Since not doing any output at this level, just pass
7504 # on the input indent
7505 $output = $item->dump($indent);
7508 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
7515 sub dump_inside_out {
7516 # Dump inside-out hashes in an object's state by converting them to a
7517 # regular hash and then calling simple_dumper on that.
7520 my $fields_ref = shift;
7521 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7523 my $addr = main::objaddr $object;
7526 foreach my $key (keys %$fields_ref) {
7527 $hash{$key} = $fields_ref->{$key}{$addr};
7530 return simple_dumper(\%hash, @_);
7534 # Overloaded '.' method that is common to all packages. It uses the
7535 # package's stringify method.
7539 my $reversed = shift;
7540 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7542 $other = "" unless defined $other;
7544 foreach my $which (\$self, \$other) {
7545 next unless ref $$which;
7546 if ($$which->can('_operator_stringify')) {
7547 $$which = $$which->_operator_stringify;
7550 my $ref = ref $$which;
7551 my $addr = main::objaddr $$which;
7552 $$which = "$ref ($addr)";
7560 sub _operator_equal {
7561 # Generic overloaded '==' routine. To be equal, they must be the exact
7567 return 0 unless defined $other;
7568 return 0 unless ref $other;
7569 return main::objaddr $self == main::objaddr $other;
7572 sub _operator_not_equal {
7576 return ! _operator_equal($self, $other);
7579 sub process_PropertyAliases($) {
7580 # This reads in the PropertyAliases.txt file, which contains almost all
7581 # the character properties in Unicode and their equivalent aliases:
7582 # scf ; Simple_Case_Folding ; sfc
7584 # Field 0 is the preferred short name for the property.
7585 # Field 1 is the full name.
7586 # Any succeeding ones are other accepted names.
7589 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7591 # This whole file was non-existent in early releases, so use our own
7593 $file->insert_lines(get_old_property_aliases())
7594 if ! -e 'PropertyAliases.txt';
7596 # Add any cjk properties that may have been defined.
7597 $file->insert_lines(@cjk_properties);
7599 while ($file->next_line) {
7601 my @data = split /\s*;\s*/;
7603 my $full = $data[1];
7605 my $this = Property->new($data[0], Full_Name => $full);
7607 # Start looking for more aliases after these two.
7608 for my $i (2 .. @data - 1) {
7609 $this->add_alias($data[$i]);
7616 sub finish_property_setup {
7617 # Finishes setting up after PropertyAliases.
7620 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7622 # This entry was missing from this file in earlier Unicode versions
7623 if (-e 'Jamo.txt') {
7624 my $jsn = property_ref('JSN');
7625 if (! defined $jsn) {
7626 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
7630 # This entry is still missing as of 5.2, perhaps because no short name for
7632 if (-e 'NameAliases.txt') {
7633 my $aliases = property_ref('Name_Alias');
7634 if (! defined $aliases) {
7635 $aliases = Property->new('Name_Alias');
7639 # These are used so much, that we set globals for them.
7640 $gc = property_ref('General_Category');
7641 $block = property_ref('Block');
7643 # Perl adds this alias.
7644 $gc->add_alias('Category');
7646 # For backwards compatibility, these property files have particular names.
7647 my $upper = property_ref('Uppercase_Mapping');
7648 $upper->set_core_access('uc()');
7649 $upper->set_file('Upper'); # This is what utf8.c calls it
7651 my $lower = property_ref('Lowercase_Mapping');
7652 $lower->set_core_access('lc()');
7653 $lower->set_file('Lower');
7655 my $title = property_ref('Titlecase_Mapping');
7656 $title->set_core_access('ucfirst()');
7657 $title->set_file('Title');
7659 my $fold = property_ref('Case_Folding');
7660 $fold->set_file('Fold') if defined $fold;
7662 # utf8.c can't currently cope with non range-size-1 for these, and even if
7663 # it were changed to do so, someone else may be using them, expecting the
7665 foreach my $property (qw {
7672 property_ref($property)->set_range_size_1(1);
7675 # These two properties aren't actually used in the core, but unfortunately
7676 # the names just above that are in the core interfere with these, so
7677 # choose different names. These aren't a problem unless the map tables
7678 # for these files get written out.
7679 my $lowercase = property_ref('Lowercase');
7680 $lowercase->set_file('IsLower') if defined $lowercase;
7681 my $uppercase = property_ref('Uppercase');
7682 $uppercase->set_file('IsUpper') if defined $uppercase;
7684 # Set up the hard-coded default mappings, but only on properties defined
7686 foreach my $property (keys %default_mapping) {
7687 my $property_object = property_ref($property);
7688 next if ! defined $property_object;
7689 my $default_map = $default_mapping{$property};
7690 $property_object->set_default_map($default_map);
7692 # A map of <code point> implies the property is string.
7693 if ($property_object->type == $UNKNOWN
7694 && $default_map eq $CODE_POINT)
7696 $property_object->set_type($STRING);
7700 # The following use the Multi_Default class to create objects for
7703 # Bidi class has a complicated default, but the derived file takes care of
7704 # the complications, leaving just 'L'.
7705 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
7706 property_ref('Bidi_Class')->set_default_map('L');
7711 # The derived file was introduced in 3.1.1. The values below are
7712 # taken from table 3-8, TUS 3.0
7714 'my $default = Range_List->new;
7715 $default->add_range(0x0590, 0x05FF);
7716 $default->add_range(0xFB1D, 0xFB4F);'
7719 # The defaults apply only to unassigned characters
7720 $default_R .= '$gc->table("Cn") & $default;';
7722 if ($v_version lt v3.0.0) {
7723 $default = Multi_Default->new(R => $default_R, 'L');
7727 # AL apparently not introduced until 3.0: TUS 2.x references are
7728 # not on-line to check it out
7730 'my $default = Range_List->new;
7731 $default->add_range(0x0600, 0x07BF);
7732 $default->add_range(0xFB50, 0xFDFF);
7733 $default->add_range(0xFE70, 0xFEFF);'
7736 # Non-character code points introduced in this release; aren't AL
7737 if ($v_version ge 3.1.0) {
7738 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
7740 $default_AL .= '$gc->table("Cn") & $default';
7741 $default = Multi_Default->new(AL => $default_AL,
7745 property_ref('Bidi_Class')->set_default_map($default);
7748 # Joining type has a complicated default, but the derived file takes care
7749 # of the complications, leaving just 'U' (or Non_Joining), except the file
7751 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
7752 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
7753 property_ref('Joining_Type')->set_default_map('Non_Joining');
7757 # Otherwise, there are not one, but two possibilities for the
7758 # missing defaults: T and U.
7759 # The missing defaults that evaluate to T are given by:
7760 # T = Mn + Cf - ZWNJ - ZWJ
7761 # where Mn and Cf are the general category values. In other words,
7762 # any non-spacing mark or any format control character, except
7763 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
7764 # WIDTH JOINER (joining type C).
7765 my $default = Multi_Default->new(
7766 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
7768 property_ref('Joining_Type')->set_default_map($default);
7772 # Line break has a complicated default in early releases. It is 'Unknown'
7773 # for non-assigned code points; 'AL' for assigned.
7774 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
7775 my $lb = property_ref('Line_Break');
7776 if ($v_version gt 3.2.0) {
7777 $lb->set_default_map('Unknown');
7780 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
7782 $lb->set_default_map($default);
7785 # If has the URS property, make sure that the standard aliases are in
7786 # it, since not in the input tables in some versions.
7787 my $urs = property_ref('Unicode_Radical_Stroke');
7789 $urs->add_alias('cjkRSUnicode');
7790 $urs->add_alias('kRSUnicode');
7796 sub get_old_property_aliases() {
7797 # Returns what would be in PropertyAliases.txt if it existed in very old
7798 # versions of Unicode. It was derived from the one in 3.2, and pared
7799 # down based on the data that was actually in the older releases.
7800 # An attempt was made to use the existence of files to mean inclusion or
7801 # not of various aliases, but if this was not sufficient, using version
7802 # numbers was resorted to.
7806 # These are to be used in all versions (though some are constructed by
7807 # this program if missing)
7808 push @return, split /\n/, <<'END';
7810 Bidi_M ; Bidi_Mirrored
7812 ccc ; Canonical_Combining_Class
7813 dm ; Decomposition_Mapping
7814 dt ; Decomposition_Type
7815 gc ; General_Category
7817 lc ; Lowercase_Mapping
7819 na1 ; Unicode_1_Name
7822 sfc ; Simple_Case_Folding
7823 slc ; Simple_Lowercase_Mapping
7824 stc ; Simple_Titlecase_Mapping
7825 suc ; Simple_Uppercase_Mapping
7826 tc ; Titlecase_Mapping
7827 uc ; Uppercase_Mapping
7830 if (-e 'Blocks.txt') {
7831 push @return, "blk ; Block\n";
7833 if (-e 'ArabicShaping.txt') {
7834 push @return, split /\n/, <<'END';
7839 if (-e 'PropList.txt') {
7841 # This first set is in the original old-style proplist.
7842 push @return, split /\n/, <<'END';
7844 Bidi_C ; Bidi_Control
7852 Join_C ; Join_Control
7854 QMark ; Quotation_Mark
7855 Term ; Terminal_Punctuation
7856 WSpace ; White_Space
7858 # The next sets were added later
7859 if ($v_version ge v3.0.0) {
7860 push @return, split /\n/, <<'END';
7865 if ($v_version ge v3.0.1) {
7866 push @return, split /\n/, <<'END';
7867 NChar ; Noncharacter_Code_Point
7870 # The next sets were added in the new-style
7871 if ($v_version ge v3.1.0) {
7872 push @return, split /\n/, <<'END';
7873 OAlpha ; Other_Alphabetic
7874 OLower ; Other_Lowercase
7876 OUpper ; Other_Uppercase
7879 if ($v_version ge v3.1.1) {
7880 push @return, "AHex ; ASCII_Hex_Digit\n";
7883 if (-e 'EastAsianWidth.txt') {
7884 push @return, "ea ; East_Asian_Width\n";
7886 if (-e 'CompositionExclusions.txt') {
7887 push @return, "CE ; Composition_Exclusion\n";
7889 if (-e 'LineBreak.txt') {
7890 push @return, "lb ; Line_Break\n";
7892 if (-e 'BidiMirroring.txt') {
7893 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
7895 if (-e 'Scripts.txt') {
7896 push @return, "sc ; Script\n";
7898 if (-e 'DNormalizationProps.txt') {
7899 push @return, split /\n/, <<'END';
7900 Comp_Ex ; Full_Composition_Exclusion
7901 FC_NFKC ; FC_NFKC_Closure
7902 NFC_QC ; NFC_Quick_Check
7903 NFD_QC ; NFD_Quick_Check
7904 NFKC_QC ; NFKC_Quick_Check
7905 NFKD_QC ; NFKD_Quick_Check
7906 XO_NFC ; Expands_On_NFC
7907 XO_NFD ; Expands_On_NFD
7908 XO_NFKC ; Expands_On_NFKC
7909 XO_NFKD ; Expands_On_NFKD
7912 if (-e 'DCoreProperties.txt') {
7913 push @return, split /\n/, <<'END';
7918 # These can also appear in some versions of PropList.txt
7919 push @return, "Lower ; Lowercase\n"
7920 unless grep { $_ =~ /^Lower\b/} @return;
7921 push @return, "Upper ; Uppercase\n"
7922 unless grep { $_ =~ /^Upper\b/} @return;
7925 # This flag requires the DAge.txt file to be copied into the directory.
7926 if (DEBUG && $compare_versions) {
7927 push @return, 'age ; Age';
7933 sub process_PropValueAliases {
7934 # This file contains the values that properties can have; its lines look like:
7935 # bc ; AL ; Arabic_Letter
7936 # blk; n/a ; Greek_And_Coptic ; Greek
7938 # Field 0 is the property.
7939 # Field 1 is the short name of a property value or 'n/a' if no
7940 # short name exists;
7941 # Field 2 is the full property value name;
7942 # Any other fields are more synonyms for the property value.
7943 # Purely numeric property values are omitted from the file; as are some
7944 # others, fewer and fewer in later releases
7946 # Entries for the ccc property have an extra field before the
7948 # ccc; 0; NR ; Not_Reordered
7949 # It is the numeric value that the names are synonyms for.
7951 # There are comment entries for values missing from this file:
7952 # # @missing: 0000..10FFFF; ISO_Comment; <none>
7953 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
7956 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7958 # This whole file was non-existent in early releases, so use our own
7959 # internal one if necessary.
7960 if (! -e 'PropValueAliases.txt') {
7961 $file->insert_lines(get_old_property_value_aliases());
7964 # Add any explicit cjk values
7965 $file->insert_lines(@cjk_property_values);
7967 # This line is used only for testing the code that checks for name
7968 # conflicts. There is a script Inherited, and when this line is executed
7969 # it causes there to be a name conflict with the 'Inherited' that this
7970 # program generates for this block property value
7971 #$file->insert_lines('blk; n/a; Herited');
7974 # Process each line of the file ...
7975 while ($file->next_line) {
7977 my ($property, @data) = split /\s*;\s*/;
7979 # The full name for the ccc property value is in field 2 of the
7980 # remaining ones; field 1 for all other properties. Swap ccc fields 1
7981 # and 2. (Rightmost splice removes field 2, returning it; left splice
7982 # inserts that into field 1, thus shifting former field 1 to field 2.)
7983 splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
7985 # If there is no short name, use the full one in element 1
7986 $data[0] = $data[1] if $data[0] eq "n/a";
7988 # Earlier releases had the pseudo property 'qc' that should expand to
7989 # the ones that replace it below.
7990 if ($property eq 'qc') {
7991 if (lc $data[0] eq 'y') {
7992 $file->insert_lines('NFC_QC; Y ; Yes',
7998 elsif (lc $data[0] eq 'n') {
7999 $file->insert_lines('NFC_QC; N ; No',
8005 elsif (lc $data[0] eq 'm') {
8006 $file->insert_lines('NFC_QC; M ; Maybe',
8007 'NFKC_QC; M ; Maybe',
8011 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8016 # The first field is the short name, 2nd is the full one.
8017 my $property_object = property_ref($property);
8018 my $table = $property_object->add_match_table($data[0],
8019 Full_Name => $data[1]);
8021 # Start looking for more aliases after these two.
8022 for my $i (2 .. @data - 1) {
8023 $table->add_alias($data[$i]);
8025 } # End of looping through the file
8027 # As noted in the comments early in the program, it generates tables for
8028 # the default values for all releases, even those for which the concept
8029 # didn't exist at the time. Here we add those if missing.
8030 my $age = property_ref('age');
8031 if (defined $age && ! defined $age->table('Unassigned')) {
8032 $age->add_match_table('Unassigned');
8034 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8035 && ! defined $block->table('No_Block');
8038 # Now set the default mappings of the properties from the file. This is
8039 # done after the loop because a number of properties have only @missings
8040 # entries in the file, and may not show up until the end.
8041 my @defaults = $file->get_missings;
8042 foreach my $default_ref (@defaults) {
8043 my $default = $default_ref->[0];
8044 my $property = property_ref($default_ref->[1]);
8045 $property->set_default_map($default);
8050 sub get_old_property_value_aliases () {
8051 # Returns what would be in PropValueAliases.txt if it existed in very old
8052 # versions of Unicode. It was derived from the one in 3.2, and pared
8053 # down. An attempt was made to use the existence of files to mean
8054 # inclusion or not of various aliases, but if this was not sufficient,
8055 # using version numbers was resorted to.
#
# The lines are accumulated in @return. The first here-doc below supplies
# the entries that are always included; each later block appends more
# entries only if the named input file exists (the -e tests) or if
# $v_version is at least the release named in the comparison.
#
# NOTE(review): the here-doc text is split on newlines, so those elements
# carry no trailing "\n", whereas the single-string pushes (the 'bc ; S',
# 'ccc; 240' and 'ccc; 226' entries) keep one. Presumably the consumer
# tolerates both forms; verify against the caller.
8057 my @return = split /\n/, <<'END';
8058 bc ; AN ; Arabic_Number
8059 bc ; B ; Paragraph_Separator
8060 bc ; CS ; Common_Separator
8061 bc ; EN ; European_Number
8062 bc ; ES ; European_Separator
8063 bc ; ET ; European_Terminator
8064 bc ; L ; Left_To_Right
8065 bc ; ON ; Other_Neutral
8066 bc ; R ; Right_To_Left
8067 bc ; WS ; White_Space
8069 # The standard combining classes are very much different in v1, so only use
8070 # ones that look right (not checked thoroughly)
8071 ccc; 0; NR ; Not_Reordered
8072 ccc; 1; OV ; Overlay
8074 ccc; 8; KV ; Kana_Voicing
8076 ccc; 202; ATBL ; Attached_Below_Left
8077 ccc; 216; ATAR ; Attached_Above_Right
8078 ccc; 218; BL ; Below_Left
8080 ccc; 222; BR ; Below_Right
8082 ccc; 228; AL ; Above_Left
8084 ccc; 232; AR ; Above_Right
8085 ccc; 234; DA ; Double_Above
8087 dt ; can ; canonical
8101 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
8103 gc ; Cn ; Unassigned
8104 gc ; Co ; Private_Use
8105 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
8106 gc ; LC ; Cased_Letter # Ll | Lt | Lu
8107 gc ; Ll ; Lowercase_Letter
8108 gc ; Lm ; Modifier_Letter
8109 gc ; Lo ; Other_Letter
8110 gc ; Lu ; Uppercase_Letter
8111 gc ; M ; Mark # Mc | Me | Mn
8112 gc ; Mc ; Spacing_Mark
8113 gc ; Mn ; Nonspacing_Mark
8114 gc ; N ; Number # Nd | Nl | No
8115 gc ; Nd ; Decimal_Number
8116 gc ; No ; Other_Number
8117 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
8118 gc ; Pd ; Dash_Punctuation
8119 gc ; Pe ; Close_Punctuation
8120 gc ; Po ; Other_Punctuation
8121 gc ; Ps ; Open_Punctuation
8122 gc ; S ; Symbol # Sc | Sk | Sm | So
8123 gc ; Sc ; Currency_Symbol
8124 gc ; Sm ; Math_Symbol
8125 gc ; So ; Other_Symbol
8126 gc ; Z ; Separator # Zl | Zp | Zs
8127 gc ; Zl ; Line_Separator
8128 gc ; Zp ; Paragraph_Separator
8129 gc ; Zs ; Space_Separator
8137 if (-e 'ArabicShaping.txt') {
8138 push @return, split /\n/, <<'END';
8145 jg ; n/a ; NO_JOINING_GROUP
8153 jt ; C ; Join_Causing
8154 jt ; D ; Dual_Joining
8155 jt ; L ; Left_Joining
8156 jt ; R ; Right_Joining
8157 jt ; U ; Non_Joining
8158 jt ; T ; Transparent
8160 if ($v_version ge v3.0.0) {
8161 push @return, split /\n/, <<'END';
8165 jg ; n/a ; DALATH_RISH
8168 jg ; n/a ; FINAL_SEMKATH
8171 jg ; n/a ; HAMZA_ON_HEH_GOAL
8178 jg ; n/a ; KNOTTED_HEH
8185 jg ; n/a ; REVERSED_PE
8189 jg ; n/a ; SWASH_KAF
8191 jg ; n/a ; TEH_MARBUTA
8194 jg ; n/a ; YEH_BARREE
8195 jg ; n/a ; YEH_WITH_TAIL
8204 if (-e 'EastAsianWidth.txt') {
8205 push @return, split /\n/, <<'END';
8215 if (-e 'LineBreak.txt') {
8216 push @return, split /\n/, <<'END';
8218 lb ; AL ; Alphabetic
8219 lb ; B2 ; Break_Both
8220 lb ; BA ; Break_After
8221 lb ; BB ; Break_Before
8222 lb ; BK ; Mandatory_Break
8223 lb ; CB ; Contingent_Break
8224 lb ; CL ; Close_Punctuation
8225 lb ; CM ; Combining_Mark
8226 lb ; CR ; Carriage_Return
8227 lb ; EX ; Exclamation
8230 lb ; ID ; Ideographic
8231 lb ; IN ; Inseperable
8232 lb ; IS ; Infix_Numeric
8234 lb ; NS ; Nonstarter
8236 lb ; OP ; Open_Punctuation
8237 lb ; PO ; Postfix_Numeric
8238 lb ; PR ; Prefix_Numeric
8240 lb ; SA ; Complex_Context
8243 lb ; SY ; Break_Symbols
8249 if (-e 'DNormalizationProps.txt') {
8250 push @return, split /\n/, <<'END';
8257 if (-e 'Scripts.txt') {
8258 push @return, split /\n/, <<'END';
8260 sc ; Armn ; Armenian
8262 sc ; Bopo ; Bopomofo
8263 sc ; Cans ; Canadian_Aboriginal
8264 sc ; Cher ; Cherokee
8265 sc ; Cyrl ; Cyrillic
8266 sc ; Deva ; Devanagari
8268 sc ; Ethi ; Ethiopic
8269 sc ; Geor ; Georgian
8272 sc ; Gujr ; Gujarati
8273 sc ; Guru ; Gurmukhi
8277 sc ; Hira ; Hiragana
8278 sc ; Ital ; Old_Italic
8279 sc ; Kana ; Katakana
8284 sc ; Mlym ; Malayalam
8285 sc ; Mong ; Mongolian
8289 sc ; Qaai ; Inherited
8303 if ($v_version ge v2.0.0) {
8304 push @return, split /\n/, <<'END';
8308 dt ; vert ; vertical
8313 gc ; Lt ; Titlecase_Letter
8314 gc ; Me ; Enclosing_Mark
8315 gc ; Nl ; Letter_Number
8316 gc ; Pc ; Connector_Punctuation
8317 gc ; Sk ; Modifier_Symbol
8320 if ($v_version ge v2.1.2) {
8321 push @return, "bc ; S ; Segment_Separator\n";
8323 if ($v_version ge v2.1.5) {
8324 push @return, split /\n/, <<'END';
8325 gc ; Pf ; Final_Punctuation
8326 gc ; Pi ; Initial_Punctuation
8329 if ($v_version ge v2.1.8) {
8330 push @return, "ccc; 240; IS ; Iota_Subscript\n";
8333 if ($v_version ge v3.0.0) {
8334 push @return, split /\n/, <<'END';
8335 bc ; AL ; Arabic_Letter
8336 bc ; BN ; Boundary_Neutral
8337 bc ; LRE ; Left_To_Right_Embedding
8338 bc ; LRO ; Left_To_Right_Override
8339 bc ; NSM ; Nonspacing_Mark
8340 bc ; PDF ; Pop_Directional_Format
8341 bc ; RLE ; Right_To_Left_Embedding
8342 bc ; RLO ; Right_To_Left_Override
8344 ccc; 233; DB ; Double_Below
8348 if ($v_version ge v3.1.0) {
8349 push @return, "ccc; 226; R ; Right\n";
8356 # This is used to store the range list of all the code points usable when
8357 # the little-used $compare_versions feature is enabled.
# It stays undefined until process_generic_property_file() first needs it,
# at which point it is built as a Range_List initialized from the 'Age'
# property.
8358 my $compare_versions_range_list;
8360 sub process_generic_property_file {
8361 # This processes a file containing property mappings and puts them
8362 # into internal map tables. It should be used to handle any property
8363 # files that have mappings from a code point or range thereof to
8364 # something else. This means almost all the UCD .txt files.
8365 # each_line_handlers() should be set to adjust the lines of these
8366 # files, if necessary, to what this routine understands:
8371 # the fields are: "codepoint range ; property; map"
8373 # meaning the codepoints in the range all have the value 'map' under
8375 # Beginning and trailing white space in each field are not significant.
8376 # Note there is not a trailing semi-colon in the above. A trailing
8377 # semi-colon means the map is a null-string. An omitted map, as
8378 # opposed to a null-string, is assumed to be 'Y', based on Unicode
8379 # table syntax. (This could have been hidden from this routine by
8380 # doing it in the $file object, but that would require parsing of the
8381 # line there, so would have to parse it twice, or change the interface
8382 # to pass this an array. So not done.)
8384 # The map field may begin with a sequence of commands that apply to
8385 # this range. Each such command begins and ends with $CMD_DELIM.
8386 # These are used to indicate, for example, that the mapping for a
8387 # range has a non-default type.
8389 # This loops through the file, calling its next_line() method, and
8390 # then taking the map and adding it to the property's table.
8391 # Complications arise because any number of properties can be in the
8392 # file, in any order, interspersed in any way. The first time a
8393 # property is seen, it gets information about that property and
8394 # caches it for quick retrieval later. It also normalizes the maps
8395 # so that only one of many synonyms is stored. The Unicode input files
8396 # do use some multiple synonyms.
8399 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8401 my %property_info; # To keep track of what properties
8402 # have already had entries in the
8403 # current file, and info about each,
8404 # so don't have to recompute.
# %property_info is keyed by the property object's address; each
# per-property sub-hash uses the keys 'type', 'default',
# 'pseudo_map_type', 'default_table' and 'missings'.
8405 my $property_name; # property currently being worked on
8406 my $property_type; # and its type
8407 my $previous_property_name = ""; # name from last time through loop
8408 my $property_object; # pointer to the current property's
8410 my $property_addr; # the address of that object
8411 my $default_map; # the string that code points missing
8412 # from the file map to
8413 my $default_table; # For non-string properties, a
8414 # reference to the match table that
8415 # will contain the list of code
8416 # points that map to $default_map.
8418 # Get the next real non-comment line
8420 while ($file->next_line) {
8422 # Default replacement type; means that if parts of the range have
8423 # already been stored in our tables, the new map overrides them if
8424 # they differ more than cosmetically
8425 my $replace = $IF_NOT_EQUIVALENT;
8426 my $map_type; # Default type for the map of this range
8428 #local $to_trace = 1 if main::DEBUG;
8429 trace $_ if main::DEBUG && $to_trace;
8431 # Split the line into components
# NOTE(review): the 'my' below declares a fresh $property_name that
# shadows the one declared before the loop.
8432 my ($range, $property_name, $map, @remainder)
8433 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
8435 # If more or less on the line than we are expecting, warn and skip
8438 $file->carp_bad_line('Extra fields');
8441 elsif ( ! defined $property_name) {
8442 $file->carp_bad_line('Missing property');
8446 # Examine the range.
8447 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
8449 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
8453 my $high = (defined $2) ? hex $2 : $low;
8455 # For the very specialized case of comparing two Unicode
8457 if (DEBUG && $compare_versions) {
8458 if ($property_name eq 'Age') {
8460 # Only allow code points at least as old as the version
8462 my $age = pack "C*", split(/\./, $map); # v string
8463 next LINE if $age gt $compare_versions;
8467 # Again, we throw out code points younger than those of
8468 # the specified version. By now, the Age property is
8469 # populated. We use the intersection of each input range
8470 # with this property to find what code points in it are
8471 # valid. To do the intersection, we have to convert the
8472 # Age property map to a Range_list. We only have to do
8474 if (! defined $compare_versions_range_list) {
8475 my $age = property_ref('Age');
8476 if (! -e 'DAge.txt') {
8477 croak "Need to have 'DAge.txt' file to do version comparison";
8479 elsif ($age->count == 0) {
8480 croak "The 'Age' table is empty, but its file exists";
8482 $compare_versions_range_list
8483 = Range_List->new(Initialize => $age);
8486 # An undefined map is always 'Y'
8487 $map = 'Y' if ! defined $map;
8489 # Calculate the intersection of the input range with the
8490 # code points that are known in the specified version
8491 my @ranges = ($compare_versions_range_list
8492 & Range->new($low, $high))->ranges;
8494 # If the intersection is empty, throw away this range
8495 next LINE unless @ranges;
8497 # Only examine the first range this time through the loop.
8498 my $this_range = shift @ranges;
8500 # Put any remaining ranges in the queue to be processed
8501 # later. Note that there is unnecessary work here, as we
8502 # will do the intersection again for each of these ranges
8503 # during some future iteration of the LINE loop, but this
8504 # code is not used in production. The later intersections
8505 # are guaranteed to not splinter, so this will not become
8507 my $line = join ';', $property_name, $map;
8508 foreach my $range (@ranges) {
8509 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
8515 # And process the first range, like any other.
8516 $low = $this_range->start;
8517 $high = $this_range->end;
8519 } # End of $compare_versions
8521 # If changing to a new property, get the things constant per
8523 if ($previous_property_name ne $property_name) {
8525 $property_object = property_ref($property_name);
8526 if (! defined $property_object) {
8527 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
8530 $property_addr = main::objaddr($property_object);
8532 # Defer changing names until have a line that is acceptable
8533 # (the 'next' statement above means it is unacceptable)
8534 $previous_property_name = $property_name;
8536 # If not the first time for this property, retrieve info about
8538 if (defined ($property_info{$property_addr}{'type'})) {
8539 $property_type = $property_info{$property_addr}{'type'};
8540 $default_map = $property_info{$property_addr}{'default'};
8542 = $property_info{$property_addr}{'pseudo_map_type'};
8544 = $property_info{$property_addr}{'default_table'};
8548 # Here, is the first time for this property. Set up the
8550 $property_type = $property_info{$property_addr}{'type'}
8551 = $property_object->type;
8553 = $property_info{$property_addr}{'pseudo_map_type'}
8554 = $property_object->pseudo_map_type;
8556 # The Unicode files are set up so that if the map is not
8557 # defined, it is a binary property
8558 if (! defined $map && $property_type != $BINARY) {
8559 if ($property_type != $UNKNOWN
8560 && $property_type != $NON_STRING)
8562 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
8565 $property_object->set_type($BINARY);
8567 = $property_info{$property_addr}{'type'}
8572 # Get any @missings default for this property. This
8573 # should precede the first entry for the property in the
8574 # input file, and is located in a comment that has been
8575 # stored by the Input_file class until we access it here.
8576 # It's possible that there is more than one such line
8577 # waiting for us; collect them all, and parse
# NOTE(review): 'my ... if ...' (a declaration with a statement
# modifier) is documented in perlsyn as having undefined behaviour;
# consider declaring @missings_list first and assigning to it
# conditionally.
8578 my @missings_list = $file->get_missings
8579 if $file->has_missings_defaults;
8580 foreach my $default_ref (@missings_list) {
8581 my $default = $default_ref->[0];
8582 my $addr = objaddr property_ref($default_ref->[1]);
8584 # For string properties, the default is just what the
8585 # file says, but non-string properties should already
8586 # have set up a table for the default property value;
8587 # use the table for these, so can resolve synonyms
8588 # later to a single standard one.
8589 if ($property_type == $STRING
8590 || $property_type == $UNKNOWN)
8592 $property_info{$addr}{'missings'} = $default;
8595 $property_info{$addr}{'missings'}
8596 = $property_object->table($default);
8600 # Finished storing all the @missings defaults in the input
8601 # file so far. Get the one for the current property.
8602 my $missings = $property_info{$property_addr}{'missings'};
8604 # But we likely have separately stored what the default
8605 # should be. (This is to accommodate versions of the
8606 # standard where the @missings lines are absent or
8607 # incomplete.) Hopefully the two will match. But check
8609 $default_map = $property_object->default_map;
8611 # If the map is a ref, it means that the default won't be
8612 # processed until later, so undef it, so next few lines
8613 # will redefine it to something that nothing will match
8614 undef $default_map if ref $default_map;
8616 # Create a $default_map if don't have one; maybe a dummy
8617 # that won't match anything.
8618 if (! defined $default_map) {
8620 # Use any @missings line in the file.
8621 if (defined $missings) {
8622 if (ref $missings) {
8623 $default_map = $missings->full_name;
8624 $default_table = $missings;
8627 $default_map = $missings;
8630 # And store it with the property for outside use.
8631 $property_object->set_default_map($default_map);
8635 # Neither an @missings nor a default map. Create
8636 # a dummy one, so won't have to test definedness
8638 $default_map = '_Perl This will never be in a file
8643 # Here, we have $default_map defined, possibly in terms of
8644 # $missings, but maybe not, and possibly is a dummy one.
8645 if (defined $missings) {
8647 # Make sure there is no conflict between the two.
8648 # $missings has priority.
8649 if (ref $missings) {
8651 = $property_object->table($default_map);
8652 if (! defined $default_table
8653 || $default_table != $missings)
8655 if (! defined $default_table) {
8656 $default_table = $UNDEF;
8658 $file->carp_bad_line(<<END
8659 The \@missings line for $property_name in $file says that missings default to
8660 $missings, but we expect it to be $default_table. $missings used.
8663 $default_table = $missings;
8664 $default_map = $missings->full_name;
8666 $property_info{$property_addr}{'default_table'}
8669 elsif ($default_map ne $missings) {
8670 $file->carp_bad_line(<<END
8671 The \@missings line for $property_name in $file says that missings default to
8672 $missings, but we expect it to be $default_map. $missings used.
8675 $default_map = $missings;
8679 $property_info{$property_addr}{'default'}
8682 # If haven't done so already, find the table corresponding
8683 # to this map for non-string properties.
8684 if (! defined $default_table
8685 && $property_type != $STRING
8686 && $property_type != $UNKNOWN)
8688 $default_table = $property_info{$property_addr}
8690 = $property_object->table($default_map);
8692 } # End of is first time for this property
8693 } # End of switching properties.
8695 # Ready to process the line.
8696 # The Unicode files are set up so that if the map is not defined,
8697 # it is a binary property with value 'Y'
8698 if (! defined $map) {
8703 # If the map begins with a special command to us (enclosed in
8704 # delimiters), extract the command(s).
8705 if (substr($map, 0, 1) eq $CMD_DELIM) {
8706 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
8708 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
8711 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
8715 $file->carp_bad_line("Unknown command line: '$1'");
8722 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
8725 # Here, we have a map to a particular code point, and the
8726 # default map is to a code point itself. If the range
8727 # includes the particular code point, change that portion of
8728 # the range to the default. This makes sure that in the final
8729 # table only the non-defaults are listed.
8730 my $decimal_map = hex $map;
8731 if ($low <= $decimal_map && $decimal_map <= $high) {
8733 # If the range includes stuff before or after the map
8734 # we're changing, split it and process the split-off parts
8736 if ($low < $decimal_map) {
8737 $file->insert_adjusted_lines(
8738 sprintf("%04X..%04X; %s; %s",
8744 if ($high > $decimal_map) {
8745 $file->insert_adjusted_lines(
8746 sprintf("%04X..%04X; %s; %s",
8752 $low = $high = $decimal_map;
8757 # If we can tell that this is a synonym for the default map, use
8758 # the default one instead.
8759 if ($property_type != $STRING
8760 && $property_type != $UNKNOWN)
8762 my $table = $property_object->table($map);
8763 if (defined $table && $table == $default_table) {
8764 $map = $default_map;
8768 # And figure out the map type if not known.
8769 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
8770 if ($map eq "") { # Nulls are always $NULL map type
8772 } # Otherwise, non-strings, and those that don't allow
8773 # $MULTI_CP, and those that aren't multiple code points are
8776 (($property_type != $STRING && $property_type != $UNKNOWN)
8777 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
8778 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
8783 $map_type = $MULTI_CP;
8787 $property_object->add_map($low, $high,
8790 Replace => $replace);
8791 } # End of loop through file's lines
8797 # XXX Unused until revise charnames;
8798 #sub check_and_handle_compound_name {
8799 # This looks at Name properties for parenthesized components and splits
8800 # them off. Thus it finds FF as an equivalent to Form Feed.
8801 # my $code_point = shift;
8803 # if ($name =~ /^ ( .*? ) ( \s* ) \( ( [^)]* ) \) (.*) $/x) {
8804 # #local $to_trace = 1 if main::DEBUG;
8805 # trace $1, $2, $3, $4 if main::DEBUG && $to_trace;
8806 # push @more_Names, "$code_point; $1";
8807 # push @more_Names, "$code_point; $3";
8808 # Carp::my_carp_bug("Expecting blank space before left parenthesis in '$_'. Proceeding and assuming it was there;") if $2 ne " ";
8809 # Carp::my_carp_bug("Not expecting anything after the right parenthesis in '$_'. Proceeding and ignoring that;") if $4 ne "";
8814 { # Closure for UnicodeData.txt handling
8816 # This file was the first one in the UCD; its design leads to some
8817 # awkwardness in processing. Here is a sample line:
8818 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
8819 # The fields in order are:
# ($i is a running counter, so each constant below is the zero-based index
# of its field once the leading code point has been removed.)
8820 my $i = 0; # The code point is in field 0, and is shifted off.
8821 my $NAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
8822 my $CATEGORY = $i++; # category (e.g. "Lu")
8823 my $CCC = $i++; # Canonical combining class (e.g. "230")
8824 my $BIDI = $i++; # directional class (e.g. "L")
8825 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
8826 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
8827 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
8828 # Dual-use in this program; see below
8829 my $NUMERIC = $i++; # numeric value
8830 my $MIRRORED = $i++; # ? mirrored ("N" in the sample line above)
8831 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
8832 my $COMMENT = $i++; # iso comment
8833 my $UPPER = $i++; # simple uppercase mapping
8834 my $LOWER = $i++; # simple lowercase mapping
8835 my $TITLE = $i++; # simple titlecase mapping
# At this point $i is 14: the number of fields that follow the code point
# on an input line.
8836 my $input_field_count = $i;
8838 # This routine in addition outputs these extra fields:
8839 my $DECOMP_TYPE = $i++; # Decomposition type
8840 my $DECOMP_MAP = $i++; # Must be last; another decomposition mapping
# $last_field is the highest field index, i.e. that of $DECOMP_MAP.
8841 my $last_field = $i - 1;
8843 # All these are read into an array for each line, with the indices defined
8844 # above. The empty fields in the example line above indicate that the
8845 # value is defaulted. The handler called for each line of the input
8846 # changes these to their defaults.
8848 # Here are the official names of the properties, in a parallel array:
8850 $field_names[$BIDI] = 'Bidi_Class';
8851 $field_names[$CATEGORY] = 'General_Category';
8852 $field_names[$CCC] = 'Canonical_Combining_Class';
8853 $field_names[$COMMENT] = 'ISO_Comment';
8854 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
8855 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
8856 $field_names[$LOWER] = 'Simple_Lowercase_Mapping';
8857 $field_names[$MIRRORED] = 'Bidi_Mirrored';
8858 $field_names[$NAME] = 'Name';
8859 $field_names[$NUMERIC] = 'Numeric_Value';
8860 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
8861 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
8862 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
8863 $field_names[$TITLE] = 'Simple_Titlecase_Mapping';
8864 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
8865 $field_names[$UPPER] = 'Simple_Uppercase_Mapping';
8867 # Some of these need a little more explanation. The $PERL_DECIMAL_DIGIT
8868 # field does not lead to an official Unicode property, but is used in
8869 # calculating the Numeric_Type. Perl however, creates a file from this
8870 # field, so a Perl property is created from it. Similarly, the Other
8871 # Digit field is used only for calculating the Numeric_Type, and so it can
8872 # be safely re-used as the place to store the value for Numeric_Type;
8873 # hence it is referred to as $NUMERIC_TYPE_OTHER_DIGIT. The input field
8874 # named $PERL_DECOMPOSITION is a combination of both the decomposition
8875 # mapping and its type. Perl creates a file containing exactly this
8876 # field, so it is used for that. The two properties are separated into
8877 # two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
8879 # This file is processed like most in this program. Control is passed to
8880 # process_generic_property_file() which calls filter_UnicodeData_line()
8881 # for each input line. This filter converts the input into line(s) that
8882 # process_generic_property_file() understands. There is also a setup
8883 # routine called before any of the file is processed, and a handler for
8884 # EOF processing, all in this closure.
8886 # A huge speed-up occurred at the cost of some added complexity when these
8887 # routines were altered to buffer the outputs into ranges. Almost all the
8888 # lines of the input file apply to just one code point, and for most
8889 # properties, the map for the next code point up is the same as the
8890 # current one. So instead of creating a line for each property for each
8891 # input line, filter_UnicodeData_line() remembers what the previous map
8892 # of a property was, and doesn't generate a line to pass on until it has
8893 # to, as when the map changes; and that passed-on line encompasses the
8894 # whole contiguous range of code points that have the same map for that
8895 # property. This means a slight amount of extra setup, and having to
8896 # flush these buffers on EOF, testing if the maps have changed, plus
8897 # remembering state information in the closure. But it means a lot less
8898 # real time in not having to change the data base for each property on
8901 # Another complication is that there are already a few ranges designated
8902 # in the input. There are two lines for each, with the same maps except
8903 # the code point and name on each line. This was actually the hardest
8904 # thing to design around. The code points in those ranges may actually
8905 # have real maps not given by these two lines. These maps will either
8906 # be algorithmically determinable, or in the extracted files furnished
8907 # with the UCD. In the event of conflicts between these extracted files,
8908 # and this one, Unicode says that this one prevails. But it shouldn't
8909 # prevail for conflicts that occur in these ranges. The data from the
8910 # extracted files prevails in those cases. So, this program is structured
8911 # so that those files are processed first, storing maps. Then the other
8912 # files are processed, generally overwriting what the extracted files
8913 # stored. But just the range lines in this input file are processed
8914 # without overwriting. This is accomplished by adding a special string to
8915 # the lines output to tell process_generic_property_file() to turn off the
8916 # overwriting for just this one line.
8917 # A similar mechanism is used to tell it that the map is of a non-default
8920 sub setup_UnicodeData { # Called before any lines of the input are read
# Creates the two Perl-extension properties that are filled from
# UnicodeData.txt fields, 'Perl_Decomposition_Mapping' and
# 'Perl_Decimal_Digit', and attaches an explanatory comment to each.
8922 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8924 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
8925 Directory => File::Spec->curdir(),
8926 File => 'Decomposition',
8927 Format => $STRING_FORMAT,
8928 Internal_Only_Warning => 1,
8929 Perl_Extension => 1,
8930 Default_Map => $CODE_POINT,
8932 # This is a specially formatted table
8933 # explicitly for normalize.pm, which
8934 # is expecting a particular format,
8935 # which means that mappings containing
8936 # multiple code points are in the main
8938 Map_Type => $COMPUTE_NO_MULTI_CP,
8941 $Perl_decomp->add_comment(join_lines(<<END
8942 This mapping is a combination of the Unicode 'Decomposition_Type' and
8943 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
8944 identical to the official Unicode 'Decomposition_Mapping' property except for
8946 1) It omits the algorithmically determinable Hangul syllable decompositions,
8947 which normalize.pm handles algorithmically.
8948 2) It contains the decomposition type as well. Non-canonical decompositions
8949 begin with a word in angle brackets, like <super>, which denotes the
8950 compatible decomposition type. If the map does not begin with the <angle
8951 brackets>, the decomposition is canonical.
8955 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
8957 Perl_Extension => 1,
8958 File => 'Digit', # Trad. location
8959 Directory => $map_directory,
8963 $Decimal_Digit->add_comment(join_lines(<<END
8964 This file gives the mapping of all code points which represent a single
8965 decimal digit [0-9] to their respective digits. For example, the code point
8966 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
8967 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
8972 # This property is not used for generating anything else, and is
8973 # usually not output. By making it last in the list, we can just
8974 # change the high end of the loop downwards to avoid the work of
8975 # generating a table that is just going to get thrown away.
8976 if (! property_ref('Decomposition_Mapping')->to_output_map) {
# State kept in the closure between calls. filter_UnicodeData_line()
# refills @fields from each input line, and compares $decimal_previous_cp
# with the new code point to detect a gap in the code points.
8982 my $first_time = 1; # ? Is this the first line of the file
8983 my $in_range = 0; # ? Are we in one of the file's ranges
8984 my $previous_cp; # hex code point of previous line
8985 my $decimal_previous_cp = -1; # And its decimal equivalent
8986 my @start; # For each field, the current starting
8987 # code point in hex for the range
8988 # being accumulated.
8989 my @fields; # The input fields;
8990 my @previous_fields; # And those from the previous call
8992 sub filter_UnicodeData_line {
8993 # Handle a single input line from UnicodeData.txt; see comments above
8994 # Conceptually this takes a single line from the file containing N
8995 # properties, and converts it into N lines with one property per line,
8996 # which is what the final handler expects. But there are
8997 # complications due to the quirkiness of the input file, and to save
8998 # time, it accumulates ranges where the property values don't change
8999 # and only emits lines when necessary. This is about an order of
9000 # magnitude fewer lines emitted.
9003 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9005 # $_ contains the input line.
9006 # -1 in split means retain trailing null fields
9007 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9009 #local $to_trace = 1 if main::DEBUG;
9010 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9011 if (@fields > $input_field_count) {
9012 $file->carp_bad_line('Extra fields');
9017 my $decimal_cp = hex $cp;
9019 # We have to output all the buffered ranges when the next code point
9020 # is not exactly one after the previous one, which means there is a
9021 # gap in the ranges.
9022 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9024 # The decomposition mapping field requires special handling. It looks
9027 # <compat> 0032 0020
9030 # The decomposition type is enclosed in <brackets>; if missing, it
9031 # means the type is canonical. There are two decomposition mapping
9032 # tables: the one for use by Perl's normalize.pm has a special format
9033 # which is this field intact; the other, for general use is of
9034 # standard format. In either case we have to find the decomposition
9035 # type. Empty fields have None as their type, and map to the code
9037 if ($fields[$PERL_DECOMPOSITION] eq "") {
9038 $fields[$DECOMP_TYPE] = 'None';
9039 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9042 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9043 =~ / < ( .+? ) > \s* ( .+ ) /x;
9044 if (! defined $fields[$DECOMP_TYPE]) {
9045 $fields[$DECOMP_TYPE] = 'Canonical';
9046 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9049 $fields[$DECOMP_MAP] = $map;
9053 # The 3 numeric fields also require special handling. The 2 digit
9054 # fields must be either empty or match the number field. This means
9055 # that if it is empty, they must be as well, and the numeric type is
9056 # None, and the numeric value is 'NaN'.
9057 # The decimal digit field must be empty or match the other digit
9058 # field. If the decimal digit field is non-empty, the code point is
9059 # a decimal digit, and the other two fields will have the same value.
9060 # If it is empty, but the other digit field is non-empty, the code
9061 # point is an 'other digit', and the number field will have the same
9062 # value as the other digit field. If the other digit field is empty,
9063 # but the number field is non-empty, the code point is a generic
9065 if ($fields[$NUMERIC] eq "") {
9066 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9067 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9069 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
9071 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9072 $fields[$NUMERIC] = 'NaN';
9075 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9076 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9077 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9078 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9080 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9081 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9082 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9085 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9087 # Rationals require extra effort.
9088 register_fraction($fields[$NUMERIC])
9089 if $fields[$NUMERIC] =~ qr{/};
9093 # For the properties that have empty fields in the file, and which
9094 # mean something different from empty, change them to that default.
9095 # Certain fields just haven't been empty so far in any Unicode
9096 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9097 # $CATEGORY. This leaves just the two fields, and so we hard-code in
9098 # the defaults; which are very unlikely to ever change.
9099 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9100 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9102 # UAX44 says that if title is empty, it is the same as whatever upper
9104 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9106 # There are a few pairs of lines like:
9107 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9108 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9109 # that define ranges. These should be processed after the fields are
9110 # adjusted above, as they may override some of them; but mostly what
9111 # is left is to possibly adjust the $NAME field. The names of all the
9112 # paired lines start with a '<', but this is also true of '<control>',
9113 # which isn't one of these special ones.
9114 if ($fields[$NAME] eq '<control>') {
9116 # Some code points in this file have the pseudo-name
9117 # '<control>', but the official name for such ones is the null
9119 $fields[$NAME] = "";
9121 # We had better not be in between range lines.
9123 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
9127 elsif (substr($fields[$NAME], 0, 1) ne '<') {
9129 # Here is a non-range line. We had better not be in between range
9132 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
9135 # XXX until charnames catches up.
9136 # if ($fields[$NAME] =~ s/- $cp $//x) {
9138 # # These are code points whose names end in their code points,
9139 # # which means the names are algorithmically derivable from the
9140 # # code points. To shorten the output Name file, the algorithm
9141 # # for deriving these is placed in the file instead of each
9142 # # code point, so they have map type $CP_IN_NAME
9143 # $fields[$NAME] = $CMD_DELIM
9151 # Some official names are really two alternate names with one in
9152 # parentheses. What we do here is use the full official one for
9153 # the standard property (stored just above), but for the charnames
9154 # table, we add two more entries, one for each of the alternate
9157 #check_and_handle_compound_name($cp, $fields[$NAME]);
9158 #check_and_handle_compound_name($cp, $unicode_1_name);
9159 # XXX until charnames catches up.
9161 elsif ($fields[$NAME] =~ /^<(.+), First>$/) {
9162 $fields[$NAME] = $1;
9164 # Here we are at the beginning of a range pair.
9166 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$NAME]'. Trying anyway");
9170 # Because the properties in the range do not overwrite any already
9171 # in the db, we must flush the buffers of what's already there, so
9172 # they get handled in the normal scheme.
9176 elsif ($fields[$NAME] !~ s/^<(.+), Last>$/$1/) {
9177 $file->carp_bad_line("Unexpected name starting with '<' $fields[$NAME]. Ignoring this line.");
9181 else { # Here, we are at the last line of a range pair.
9184 $file->carp_bad_line("Unexpected end of range $fields[$NAME] when not in one. Ignoring this line.");
9190 # Check that the input is valid: that the closing of the range is
9191 # the same as the beginning.
9192 foreach my $i (0 .. $last_field) {
9193 next if $fields[$i] eq $previous_fields[$i];
9194 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
9197 # The processing differs depending on the type of range,
9198 # determined by its $NAME
9199 if ($fields[$NAME] =~ /^Hangul Syllable/) {
9201 # Check that the data looks right.
9202 if ($decimal_previous_cp != $SBase) {
9203 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
9205 if ($decimal_cp != $SBase + $SCount - 1) {
9206 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
9209 # The Hangul syllable range has a somewhat complicated name
9210 # generation algorithm. Each code point in it has a canonical
9211 # decomposition also computable by an algorithm. The
9212 # perl decomposition map table built from these is used only
9213 # by normalize.pm, which has the algorithm built in it, so the
9214 # decomposition maps are not needed, and are large, so are
9215 # omitted from it. If the full decomposition map table is to
9216 # be output, the decompositions are generated for it, in the
9217 # EOF handling code for this input file.
9219 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9221 # This range is stored in our internal structure with its
9222 # own map type, different from all others.
9223 $previous_fields[$NAME] = $CMD_DELIM
9230 elsif ($fields[$NAME] =~ /^CJK/) {
9232 # The name for these contains the code point itself, and all
9233 # are defined to have the same base name, regardless of what
9234 # is in the file. They are stored in our internal structure
9235 # with a map type of $CP_IN_NAME
9236 $previous_fields[$NAME] = $CMD_DELIM
9241 . 'CJK UNIFIED IDEOGRAPH';
9244 elsif ($fields[$CATEGORY] eq 'Co'
9245 || $fields[$CATEGORY] eq 'Cs')
9247 # The names of all the code points in these ranges are set to
9248 # null, as there are no names for the private use and
9249 # surrogate code points.
9251 $previous_fields[$NAME] = "";
9254 $file->carp_bad_line("Unexpected code point range $fields[$NAME] because category is $fields[$CATEGORY]. Attempting to process it.");
9257 # The first line of the range caused everything else to be output,
9258 # and then its values were stored as the beginning values for the
9259 # next set of ranges, which this one ends. Now, for each value,
9260 # add a command to tell the handler that these values should not
9261 # replace any existing ones in our database.
9262 foreach my $i (0 .. $last_field) {
9263 $previous_fields[$i] = $CMD_DELIM
9268 . $previous_fields[$i];
9271 # And change things so it looks like the entire range has been
9272 # gone through with this being the final part of it. Adding the
9273 # command above to each field will cause this range to be flushed
9274 # during the next iteration, as it is guaranteed that the stored
9275 # field won't match whatever value the next one has.
9277 $decimal_previous_cp = $decimal_cp;
9279 # We are now set up for the next iteration; so skip the remaining
9280 # code in this subroutine that does the same thing, but doesn't
9281 # know about these ranges.
9286 # On the very first line, we fake it so the code below thinks there is
9287 # nothing to output, and initialize so that when it does get output it
9288 # uses the first line's values for the lowest part of the range.
9289 # (One could avoid this by using peek(), but then one would need to
9290 # know the adjustments done above and do the same ones in the setup
9291 # routine; not worth it)
9294 @previous_fields = @fields;
9295 @start = ($cp) x scalar @fields;
9296 $decimal_previous_cp = $decimal_cp - 1;
9299 # For each field, output the stored up ranges that this code point
9300 # doesn't fit in. Earlier we figured out if all ranges should be
9301 # terminated because of changing the replace or map type styles, or if
9302 # there is a gap between this new code point and the previous one, and
9303 # that is stored in $force_output. But even if those aren't true, we
9304 # need to output the range if this new code point's value for the
9305 # given property doesn't match the stored range's.
9306 #local $to_trace = 1 if main::DEBUG;
9307 foreach my $i (0 .. $last_field) {
9308 my $field = $fields[$i];
9309 if ($force_output || $field ne $previous_fields[$i]) {
9311 # Flush the buffer of stored values.
9312 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9314 # Start a new range with this code point and its value
9316 $previous_fields[$i] = $field;
9320 # Set the values for the next time.
9322 $decimal_previous_cp = $decimal_cp;
9324 # The input line has generated whatever adjusted lines are needed, and
9325 # should not be looked at further.
9330 sub EOF_UnicodeData {
9331 # Called upon EOF to flush the buffers, and create the Hangul
9332 # decomposition mappings if needed.
9335 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9337 # Flush the buffers.
9338 foreach my $i (1 .. $last_field) {
9339 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9342 if (-e 'Jamo.txt') {
9344 # The algorithm is published by Unicode, based on values in
9345 # Jamo.txt, (which should have been processed before this
9346 # subroutine), and the results left in %Jamo
9348 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
9352 # If the full decomposition map table is being output, insert
9353 # into it the Hangul syllable mappings. This is to avoid having
9354 # to publish a subroutine in it to compute them. (which would
9355 # essentially be this code.) This uses the algorithm published by
9357 if (property_ref('Decomposition_Mapping')->to_output_map) {
9358 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
9360 my $SIndex = $S - $SBase;
9361 my $L = $LBase + $SIndex / $NCount;
9362 my $V = $VBase + ($SIndex % $NCount) / $TCount;
9363 my $T = $TBase + $SIndex % $TCount;
9365 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
9366 my $decomposition = sprintf("%04X %04X", $L, $V);
9367 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
9368 $file->insert_adjusted_lines(
9369 sprintf("%04X; Decomposition_Mapping; %s",
9380 # Fix UCD lines in version 1. This is probably overkill, but this
9381 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
9382 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
9383 # removed. This program retains them
9384 # 2) didn't include ranges, which it should have, and which are now
9385 # added in @corrected_lines below. It was hand populated by
9386 # taking the data from Version 2, verified by analyzing
9388 # 3) There is a syntax error in the entry for U+09F8 which could
9389 # cause problems for utf8_heavy, and so is changed. Its
9390 # numeric value was simply a minus sign, without any number.
9391 # (Eventually Unicode changed the code point to non-numeric.)
9392 # 4) The decomposition types often don't match later versions
9393 # exactly, and the whole syntax of that field is different; so
9394 # the syntax is changed as well as the types to their later
9395 # terminology. Otherwise normalize.pm would be very unhappy
9396 # 5) Many ccc classes are different. These are left intact.
9397 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
9398 # fields. These are unchanged because it doesn't really cause
9399 # problems for Perl.
9400 # 7) A number of code points, such as controls, don't have their
9401 # Unicode Version 1 Names in this file. These are unchanged.
9403 my @corrected_lines = split /\n/, <<'END';
9404 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9405 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9406 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
9407 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
9408 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
9409 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9413 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9415 #local $to_trace = 1 if main::DEBUG;
9416 trace $_ if main::DEBUG && $to_trace;
9418 # -1 => retain trailing null fields
9419 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9421 # At the first place that is wrong in the input, insert all the
9422 # corrections, replacing the wrong line.
9423 if ($code_point eq '4E00') {
9424 my @copy = @corrected_lines;
9426 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9428 $file->insert_lines(@copy);
9432 if ($fields[$NUMERIC] eq '-') {
9433 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
9436 if ($fields[$PERL_DECOMPOSITION] ne "") {
9438 # Several entries have this change to superscript 2 or 3 in the
9439 # middle. Convert these to the modern version, which is to use
9440 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
9441 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
9442 # 'HHHH HHHH 00B3 HHHH'.
9443 # It turns out that all of these that don't have another
9444 # decomposition defined at the beginning of the line have the
9445 # <square> decomposition in later releases.
9446 if ($code_point ne '00B2' && $code_point ne '00B3') {
9447 if ($fields[$PERL_DECOMPOSITION]
9448 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
9450 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
9451 $fields[$PERL_DECOMPOSITION] = '<square> '
9452 . $fields[$PERL_DECOMPOSITION];
9457 # If it is like '<+circled> 0052 <-circled>', convert to
9459 $fields[$PERL_DECOMPOSITION] =~
9460 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
9462 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
9463 $fields[$PERL_DECOMPOSITION] =~
9464 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
9465 or $fields[$PERL_DECOMPOSITION] =~
9466 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
9467 or $fields[$PERL_DECOMPOSITION] =~
9468 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
9469 or $fields[$PERL_DECOMPOSITION] =~
9470 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
9472 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH HHHH', etc.
9473 $fields[$PERL_DECOMPOSITION] =~
9474 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
9476 # Change names to modern form.
9477 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
9478 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
9479 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
9480 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
9482 # One entry has weird braces
9483 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
9486 $_ = join ';', $code_point, @fields;
9487 trace $_ if main::DEBUG && $to_trace;
9491 sub filter_v2_1_5_ucd {
9492 # A dozen entries in this 2.1.5 file had the mirrored and numeric
9493 # columns swapped; These all had mirrored be 'N'. So if the numeric
9494 # column appears to be N, swap it back.
9496 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9497 if ($fields[$NUMERIC] eq 'N') {
9498 $fields[$NUMERIC] = $fields[$MIRRORED];
9499 $fields[$MIRRORED] = 'N';
9500 $_ = join ';', $code_point, @fields;
9504 } # End closure for UnicodeData
9506 sub process_NamedSequences {
9507 # NamedSequences.txt entries are just added to an array. Because these
9508 # don't look like the other tables, they have their own handler.
9510 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
9512 # This just adds the sequence to an array for later handling
9514 return; # XXX Until charnames catches up
9516 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9518 while ($file->next_line) {
9519 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
9521 $file->carp_bad_line(
9522 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
9525 push @named_sequences, "$sequence\t\t$name";
9534 sub filter_early_ea_lb {
9535 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
9536 # third field be the name of the code point, which can be ignored in
9537 # most cases. But it can be meaningful if it marks a range:
9538 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
9539 # 3400;W;<CJK Ideograph Extension A, First>
9541 # We need to see the First in the example above to know it's a range.
9542 # They did not use the later range syntaxes. This routine changes it
9543 # to use the modern syntax.
9544 # $1 is the Input_file object.
9546 my @fields = split /\s*;\s*/;
9547 if ($fields[2] =~ /^<.*, First>/) {
9548 $first_range = $fields[0];
9551 elsif ($fields[2] =~ /^<.*, Last>/) {
9552 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
9556 $_ = "$fields[0]; $fields[1]";
9563 sub filter_old_style_arabic_shaping {
9564 # Early versions used a different term for the later one.
9566 my @fields = split /\s*;\s*/;
9567 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
9568 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
9569 $_ = join ';', @fields;
9573 sub filter_arabic_shaping_line {
9574 # ArabicShaping.txt has entries that look like:
9576 # The field containing 'TEH' is not used. The next field is Joining_Type
9577 # and the last is Joining_Group
9578 # This generates two lines to pass on, one for each property on the input
9582 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9584 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9587 $file->carp_bad_line('Extra fields');
9592 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
9593 $_ = "$fields[0]; Joining_Type; $fields[2]";
9598 sub setup_special_casing {
9599 # SpecialCasing.txt contains the non-simple case change mappings. The
9600 # simple ones are in UnicodeData.txt, and should already have been read
9602 # This routine initializes the full mappings to the simple, then as each
9603 # line is processed, it overrides the simple ones.
9606 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9608 # For each of the case change mappings...
9609 foreach my $case ('lc', 'tc', 'uc') {
9611 # The simple version's name in each mapping merely has an 's' in front
9613 my $simple = property_ref('s' . $case);
9614 unless (defined $simple && ! $simple->is_empty) {
9615 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
9618 # Initialize the full case mappings with the simple ones.
9619 property_ref($case)->initialize($simple);
9625 sub filter_special_casing_line {
9626 # Change the format of $_ from SpecialCasing.txt into something that the
9627 # generic handler understands. Each input line contains three case
9628 # mappings. This will generate three lines to pass to the generic handler
9629 # for each of those.
9631 # The input syntax (after stripping comments and trailing white space) is
9632 # like one of the following (with the final two being entries that we
9634 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
9635 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
9636 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
9637 # Note the trailing semi-colon, unlike many of the input files. That
9638 # means that there will be an extra null field generated by the split
9641 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9643 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9645 # field #4 is when this mapping is conditional. If any of these get
9646 # implemented, it would be by hard-coding in the casing functions in the
9647 # Perl core, not through tables. But if there is a new condition we don't
9648 # know about, output a warning. We know about all the conditions through
9650 if ($fields[4] ne "") {
9651 my @conditions = split ' ', $fields[4];
9652 if ($conditions[0] ne 'tr' # We know that these languages have
9653 # conditions, and some are multiple
9654 && $conditions[0] ne 'az'
9655 && $conditions[0] ne 'lt'
9657 # And, we know about a single condition Final_Sigma, but
9659 && ($v_version gt v5.2.0
9660 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
9662 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
9664 elsif ($conditions[0] ne 'Final_Sigma') {
9666 # Don't print out a message for Final_Sigma, because we have
9667 # hard-coded handling for it. (But the standard could change
9668 # what the rule should be, but it wouldn't show up here
9671 print "# SKIPPING Special Casing: $_\n"
9672 if $verbosity >= $VERBOSE;
9677 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
9678 $file->carp_bad_line('Extra fields');
9683 $_ = "$fields[0]; lc; $fields[1]";
9684 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
9685 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
9690 sub filter_old_style_case_folding {
9691 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
9692 # and later style. Different letters were used in the earlier.
9695 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9697 my @fields = split /\s*;\s*/;
9698 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
9701 elsif ($fields[1] eq 'L') {
9702 $fields[1] = 'C'; # L => C always
9704 elsif ($fields[1] eq 'E') {
9705 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
9713 $file->carp_bad_line("Expecting L or E in second field");
9717 $_ = join("; ", @fields) . ';';
9721 { # Closure for case folding
9723 # Create the map for simple only if we are going to output it, for otherwise
9724 # it takes no part in anything we do.
9725 my $to_output_simple;
9727 # These are experimental, perhaps will need these to pass to regcomp.c to
9728 # handle the cases where for example the Kelvin sign character folds to k,
9729 # and in regcomp, we need to know which of the characters can have a
9730 # non-latin1 char fold to it, so it doesn't do the optimizations it might
9732 my @latin1_singly_folded;
9735 sub setup_case_folding($) {
9736 # Read in the case foldings in CaseFolding.txt. This handles both
9737 # simple and full case folding.
9740 = property_ref('Simple_Case_Folding')->to_output_map;
9745 sub filter_case_folding_line {
9746 # Called for each line in CaseFolding.txt
9747 # Input lines look like:
9748 # 0041; C; 0061; # LATIN CAPITAL LETTER A
9749 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
9750 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
9752 # 'C' means that folding is the same for both simple and full
9753 # 'F' that it is only for full folding
9754 # 'S' that it is only for simple folding
9755 # 'T' is locale-dependent, and ignored
9756 # 'I' is a type of 'F' used in some early releases.
9757 # Note the trailing semi-colon, unlike many of the input files. That
9758 # means that there will be an extra null field generated by the split
9759 # below, which we ignore and hence is not an error.
9762 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9764 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
9765 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
9766 $file->carp_bad_line('Extra fields');
9771 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
9776 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
9777 # I are all full foldings
9778 if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
9779 $_ = "$range; Case_Folding; $map";
9784 $file->carp_bad_line('Expecting C F I S or T in second field');
9789 # C and S are simple foldings, but simple case folding is not needed
9790 # unless we explicitly want its map table output.
9791 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
9792 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
9795 # Experimental, see comment above
9796 if ($type ne 'S' && hex($range) >= 256) { # assumes range is 1 point
9797 my @folded = split ' ', $map;
9798 if (hex $folded[0] < 256 && @folded == 1) {
9799 push @latin1_singly_folded, hex $folded[0];
9801 foreach my $folded (@folded) {
9802 push @latin1_folded, hex $folded if hex $folded < 256;
9810 # Experimental, see comment above
9813 #local $to_trace = 1 if main::DEBUG;
9814 @latin1_singly_folded = uniques(@latin1_singly_folded);
9815 @latin1_folded = uniques(@latin1_folded);
9816 trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
9817 trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
9820 } # End case fold closure
9822 sub filter_jamo_line {
9823 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
9824 # from this file that is used in generating the Name property for Jamo
9825 # code points. But, it also is used to convert early versions' syntax
9826 # into the modern form. Here are two examples:
9827 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
9828 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
9830 # The input is $_, the output is $_ filtered.
9832 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9834 # Let the caller handle unexpected input. In earlier versions, there was
9835 # a third field which is supposed to be a comment, but did not have a '#'
9837 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
9839 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
9842 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
9843 $fields[1] = 'R' if $fields[0] eq '1105';
9845 # Add to structure so can generate Names from it.
9846 my $cp = hex $fields[0];
9847 my $short_name = $fields[1];
9848 $Jamo{$cp} = $short_name;
9849 if ($cp <= $LBase + $LCount) {
9850 $Jamo_L{$short_name} = $cp - $LBase;
9852 elsif ($cp <= $VBase + $VCount) {
9853 $Jamo_V{$short_name} = $cp - $VBase;
9855 elsif ($cp <= $TBase + $TCount) {
9856 $Jamo_T{$short_name} = $cp - $TBase;
9859 Carp::my_carp_bug("Unexpected Jamo code point in $_");
9863 # Reassemble using just the first two fields to look like a typical
9864 # property file line
9865 $_ = "$fields[0]; $fields[1]";
9870 sub register_fraction($) {
9871 # This registers the input rational number so that it can be passed on to
9872 # utf8_heavy.pl, both in rational and floating forms.
9874 my $rational = shift;
9876 my $float = eval $rational;
9877 $nv_floating_to_rational{$float} = $rational;
9881 sub filter_numeric_value_line {
9882 # DNumValues contains lines of a different syntax than the typical
9884 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
9886 # This routine transforms $_ containing the anomalous syntax to the
9887 # typical, by filtering out the extra columns, and convert early version
9888 # decimal numbers to strings that look like rational numbers.
9891 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9893 # Starting in 5.1, there is a rational field. Just use that, omitting the
9894 # extra columns. Otherwise convert the decimal number in the second field
9895 # to a rational, and omit extraneous columns.
9896 my @fields = split /\s*;\s*/, $_, -1;
9899 if ($v_version ge v5.1.0) {
9901 $file->carp_bad_line('Not 4 semi-colon separated fields');
9905 $rational = $fields[3];
9906 $_ = join '; ', @fields[ 0, 3 ];
9910 # Here is an older Unicode file, which has decimal numbers instead of
9911 # rationals in it. Use the fraction to calculate the denominator and
9912 # convert to rational.
9914 if (@fields != 2 && @fields != 3) {
9915 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
9920 my $codepoints = $fields[0];
9921 my $decimal = $fields[1];
9922 if ($decimal =~ s/\.0+$//) {
9924 # Anything ending with a decimal followed by nothing but 0's is an
9926 $_ = "$codepoints; $decimal";
9927 $rational = $decimal;
9932 if ($decimal =~ /\.50*$/) {
9936 # Here have the hardcoded repeating decimals in the fraction, and
9937 # the denominator they imply. There were only a few denominators
9938 # in the older Unicode versions of this file which this code
9939 # handles, so it is easy to convert them.
9941 # The 4 is because of a round-off error in the Unicode 3.2 files
9942 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
9945 elsif ($decimal =~ /\.[27]50*$/) {
9948 elsif ($decimal =~ /\.[2468]0*$/) {
9951 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
9954 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
9958 my $sign = ($decimal < 0) ? "-" : "";
9959 my $numerator = int((abs($decimal) * $denominator) + .5);
9960 $rational = "$sign$numerator/$denominator";
9961 $_ = "$codepoints; $rational";
9964 $file->carp_bad_line("Can't cope with number '$decimal'.");
9971 register_fraction($rational) if $rational =~ qr{/};
9976 my %unihan_properties;
9981 # Do any special setup for Unihan properties.
9983 # This property gives the wrong computed type, so override.
9984 my $usource = property_ref('kIRG_USource');
9985 $usource->set_type($STRING) if defined $usource;
9987 # This property is to be considered binary, so change all the values
9989 $iicore = property_ref('kIICore');
9990 if (defined $iicore) {
9991 $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
9993 # We have to change the default map, because the @missing line is
9994 # misleading, given that we are treating it as binary.
9995 $iicore->set_default_map('N');
9996 $iicore->set_type($BINARY);
10002 sub filter_unihan_line {
10003 # Change unihan db lines to look like the others in the db. Here is
10005 # U+341C kCangjie IEKN
10007 # Tabs are used instead of semi-colons to separate fields; therefore
10008 # they may have semi-colons embedded in them. Change these to periods
10009 # so they won't screw up the rest of the code.
10012 # Remove lines that don't look like ones we accept.
10013 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10018 # Extract the property, and save a reference to its object.
10020 if (! exists $unihan_properties{$property}) {
10021 $unihan_properties{$property} = property_ref($property);
10024 # Don't do anything unless the property is one we're handling, which
10025 # we determine by seeing if there is an object defined for it or not
10026 if (! defined $unihan_properties{$property}) {
10031 # The iicore property is supposed to be a boolean, so convert to our
10032 # standard boolean form.
10033 if (defined $iicore && $unihan_properties{$property} == $iicore) {
10034 $_ =~ s/$property.*/$property\tY/
10037 # Convert the tab separators to our standard semi-colons, and convert
10038 # the U+HHHH notation to the rest of the standard's HHHH
10040 s/\b U \+ (?= $code_point_re )//xg;
10042 #local $to_trace = 1 if main::DEBUG;
10043 trace $_ if main::DEBUG && $to_trace;
10049 sub filter_blocks_lines {
10050 # In the Blocks.txt file, the names of the blocks don't quite match the
10051 # names given in PropertyValueAliases.txt, so this changes them so they
10052 # do match: Blanks and hyphens are changed into underscores. Also makes
10053 # early release versions look like later ones
10055 # $_ is transformed to the correct value.
10058 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10060 if ($v_version lt v3.2.0) {
10061 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10066 # Old versions used a different syntax to mark the range.
10067 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10070 my @fields = split /\s*;\s*/, $_, -1;
10071 if (@fields != 2) {
10072 $file->carp_bad_line("Expecting exactly two fields");
10077 # Change hyphens and blanks in the block name field only
10078 $fields[1] =~ s/[ -]/_/g;
10079 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
10081 $_ = join("; ", @fields);
10086 my $current_property;
10088 sub filter_old_style_proplist {
10089 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
10090 # was in a completely different syntax. Ken Whistler of Unicode says
10091 # that it was something he used as an aid for his own purposes, but
10092 # was never an official part of the standard. However, comments in
10093 # DAge.txt indicate that non-character code points were available in
10094 # the UCD as of 3.1. It is unclear to me (khw) how they could be
10095 # there except through this file (but on the other hand, they first
10096 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10097 # not. But the claim is that it was published as an aid to others who
10098 # might want some more information than was given in the official UCD
10099 # of the time. Many of the properties in it were incorporated into
10100 # the later PropList.txt, but some were not. This program uses this
10101 # early file to generate property tables that are otherwise not
10102 # accessible in the early UCD's, and most were probably not really
10103 # official at that time, so one could argue that it should be ignored,
10104 # and you can easily modify things to skip this. And there are bugs
10105 # in this file in various versions. (For example, the 2.1.9 version
10106 # removes from Alphabetic the CJK range starting at 4E00, and they
10107 # weren't added back in until 3.1.0.) Many of this file's properties
10108 # were later sanctioned, so this code generates tables for those
10109 # properties that aren't otherwise in the UCD of the time but
10110 # eventually did become official, and throws away the rest. Here is a
10111 # list of all the ones that are thrown away:
10112 # Bidi=* duplicates UnicodeData.txt
10113 # Combining never made into official property;
10115 # Composite never made into official property.
10116 # Currency Symbol duplicates UnicodeData.txt: gc=sc
10117 # Decimal Digit duplicates UnicodeData.txt: gc=nd
10118 # Delimiter never made into official property;
10120 # Format Control never made into official property;
10122 # High Surrogate duplicates Blocks.txt
10123 # Ignorable Control never made into official property;
10125 # ISO Control duplicates UnicodeData.txt: gc=cc
10126 # Left of Pair never made into official property;
10127 # Line Separator duplicates UnicodeData.txt: gc=zl
10128 # Low Surrogate duplicates Blocks.txt
10129 # Non-break was actually listed as a property
10130 # in 3.2, but without any code
10131 # points. Unicode denies that this
10132 # was ever an official property
10133 # Non-spacing duplicates UnicodeData.txt: gc=mn
10134 # Numeric duplicates UnicodeData.txt: gc=cc (sic; same value as ISO Control above -- TODO confirm intended category)
10135 # Paired Punctuation never made into official property;
10136 # appears to be gc=ps + gc=pe
10137 # Paragraph Separator duplicates UnicodeData.txt: gc=cc (sic; presumably gc=zp -- TODO confirm)
10138 # Private Use duplicates UnicodeData.txt: gc=co
10139 # Private Use High Surrogate duplicates Blocks.txt
10140 # Punctuation duplicates UnicodeData.txt: gc=p
10141 # Space different definition than eventual
10143 # Titlecase duplicates UnicodeData.txt: gc=lt
10144 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc (sic; presumably gc=cn -- TODO confirm)
10145 # Zero-width never made into official property;
10147 # Most of the properties have the same names in this file as in later
10148 # versions, but a couple do not.
10150 # This subroutine filters $_, converting it from the old style into
10151 # the new style. Here's a sample of the old-style:
10153 # *******************************************
10155 # Property dump for: 0x100000A0 (Join Control)
10157 # 200C..200D (2 chars)
10159 # In the example, the property is "Join Control". It is kept in this
10160 # closure between calls to the subroutine. The numbers beginning with
10161 # 0x were internal to Ken's program that generated this file.
10163 # If this line contains the property name, extract it.
10164 if (/^Property dump for: [^(]*\((.*)\)/) {
10167 # Convert white space to underscores.
10170 # Convert the few properties that don't have the same name as
10171 # their modern counterparts
10172 s/Identifier_Part/ID_Continue/
10173 or s/Not_a_Character/NChar/;
10175 # If the name matches an existing property, use it.
10176 if (defined property_ref($_)) {
10177 trace "new property=", $_ if main::DEBUG && $to_trace;
10178 $current_property = $_;
10180 else { # Otherwise discard it
10181 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10182 undef $current_property;
10184 $_ = ""; # The property is saved for the next lines of the
10185 # file, but this defining line is of no further use,
10186 # so clear it so that the caller won't process it
10189 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10191 # Here, the input line isn't a header defining a property for the
10192 # following section, and either we aren't in such a section, or
10193 # the line doesn't look like one that defines the code points in
10194 # such a section. Ignore this line.
10199 # Here, we have a line defining the code points for the current
10200 # stashed property. Anything starting with the first blank is
10201 # extraneous. Otherwise, it should look like a normal range to
10202 # the caller. Append the property name so that it looks just like
10203 # a modern PropList entry.
10206 $_ .= "; $current_property";
10208 trace $_ if main::DEBUG && $to_trace;
10211 } # End closure for old style proplist
10213 sub filter_old_style_normalization_lines {
10214 # For early releases of Unicode, the lines were like:
10215 # 74..2A76 ; NFKD_NO
10216 # For later releases this became:
10217 # 74..2A76 ; NFKD_QC; N
10218 # Filter $_ to look like those in later releases.
10219 # Similarly, _MAYBE is rewritten to _QC; M (the || means at most one of the two substitutions is applied per line)
10221 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10223 # Also, the property FC_NFKC was abbreviated to FNC
10228 sub finish_Unicode() {
10229 # This routine should be called after all the Unicode files have been read
10231 # 1) Adds the mappings for code points missing from the files which have
10232 # defaults specified for them.
10233 # 2) At this point all mappings are known, so it computes the type of
10234 # each property whose type hasn't been determined yet.
10235 # 3) Calculates all the regular expression match tables based on the
10237 # 4) Calculates and adds the tables which are defined by Unicode, but
10238 # which aren't derived by them
10240 # For each property, fill in any missing mappings, and calculate the re
10241 # match tables. If a property has more than one missing mapping, the
10242 # default is a reference to a data structure, and requires data from other
10243 # properties to resolve. The sort is used to cause these to be processed
10244 # last, after all the other properties have been calculated.
10245 # (Fortunately, the missing properties so far don't depend on each other.) NOTE(review): the comparator below looks only at $a and never returns 0; confirm the resulting order is as intended.
10246 foreach my $property
10247 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10250 # $perl has been defined, but isn't one of the Unicode properties that
10251 # need to be finished up.
10252 next if $property == $perl;
10254 # Handle the properties that have more than one possible default
10255 if (ref $property->default_map) {
10256 my $default_map = $property->default_map;
10258 # These properties have stored in the default_map:
10260 # 1) A default map which applies to all code points in a
10262 # 2) an expression which will evaluate to the list of code
10263 # points in that class
10265 # 3) the default map which applies to every other missing code
10268 # Go through each list.
10269 while (my ($default, $eval) = $default_map->get_next_defaults) {
10271 # Get the class list, and intersect it with all the so-far
10272 # unspecified code points yielding all the code points
10273 # in the class that haven't been specified.
10274 my $list = eval $eval;
10276 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10280 # Narrow down the list to just those code points we don't have
10282 $list = $list & $property->inverse_list;
10284 # Add mappings to the property for each code point in the list
10285 foreach my $range ($list->ranges) {
10286 $property->add_map($range->start, $range->end, $default);
10290 # All remaining code points have the other mapping. Set that up
10291 # so the normal single-default mapping code will work on them
10292 $property->set_default_map($default_map->other_default);
10294 # And fall through to do that
10297 # We should have enough data now to compute the type of the property.
10298 $property->compute_type;
10299 my $property_type = $property->type;
10301 next if ! $property->to_create_match_tables;
10303 # Here want to create match tables for this property
10305 # The Unicode db always (so far, and they claim into the future) have
10306 # the default for missing entries in binary properties be 'N' (unless
10307 # there is a '@missing' line that specifies otherwise)
10308 if ($property_type == $BINARY && ! defined $property->default_map) {
10309 $property->set_default_map('N');
10312 # Add any remaining code points to the mapping, using the default for
10313 # missing code points
10314 if (defined (my $default_map = $property->default_map)) {
10315 foreach my $range ($property->inverse_list->ranges) {
10316 $property->add_map($range->start, $range->end, $default_map);
10319 # Make sure there is a match table for the default
10320 if (! defined $property->table($default_map)) {
10321 $property->add_match_table($default_map);
10325 # Have all we need to populate the match tables.
10326 my $property_name = $property->name;
10327 foreach my $range ($property->ranges) {
10328 my $map = $range->value;
10329 my $table = property_ref($property_name)->table($map);
10330 if (! defined $table) {
10332 # Integral and rational property values are not necessarily
10333 # defined in PropValueAliases, but all other ones should be,
10335 if ($v_version ge v5.1.0
10336 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
10338 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
10340 $table = property_ref($property_name)->add_match_table($map);
10343 $table->add_range($range->start, $range->end);
10346 # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
10347 # all properties have this optional prefix. These do not get a
10348 # separate entry in the pod file, because they are covered by a wild-card
10350 foreach my $alias ($property->aliases) {
10351 my $Is_name = 'Is_' . $alias->name;
10352 if (! defined (my $pre_existing = property_ref($Is_name))) {
10353 $property->add_alias($Is_name,
10355 Status => $alias->status,
10356 Externally_Ok => 0);
10360 # It seemed too much work to add in these warnings when it
10361 # appears that Unicode has made a decision never to begin a
10362 # property name with 'Is_', so this shouldn't happen, but just
10363 # in case, it is a warning.
10364 Carp::my_carp(<<END
10365 There is already an alias named $Is_name (from " . $pre_existing . "), so not
10366 creating this alias for $property. The generated table and pod files do not
10367 warn users of this conflict.
10370 $has_Is_conflicts++;
10372 } # End of loop through aliases for this property
10373 } # End of loop through all Unicode properties.
10375 # Fill in the mappings that Unicode doesn't completely furnish. First the
10376 # single letter major general categories. If Unicode were to start
10377 # delivering the values, this would be redundant, but better that than to
10378 # try to figure out if should skip and not get it right. This could happen
10379 # if a new major category were to be introduced, and the hard-coded test
10380 # wouldn't know about it.
10381 # This routine depends on the standard names for the general categories
10382 # being what it thinks they are, like 'Cn'. The major categories are the
10383 # union of all the general category tables which have the same first
10384 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
10385 foreach my $minor_table ($gc->tables) {
10386 my $minor_name = $minor_table->name;
10387 next if length $minor_name == 1;
10388 if (length $minor_name != 2) {
10389 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
10393 my $major_name = uc(substr($minor_name, 0, 1));
10394 my $major_table = $gc->table($major_name);
10395 $major_table += $minor_table;
10398 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
10399 # defines it as LC)
10400 my $LC = $gc->table('LC');
10401 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
10402 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
10405 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
10406 # deliver the correct values in it
10407 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
10409 # Lt not in release 1.
10410 $LC += $gc->table('Lt') if defined $gc->table('Lt');
10412 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
10414 my $Cs = $gc->table('Cs');
10416 $Cs->add_note('Mostly not usable in Perl.');
10417 $Cs->add_comment(join_lines(<<END
10418 Surrogates are used exclusively for I/O in UTF-16, and should not appear in
10419 Unicode text, and hence their use will generate (usually fatal) messages
10425 # Folding information was introduced later into Unicode data. To get
10426 # Perl's case ignore (/i) to work at all in releases that don't have
10427 # folding, use the best available alternative, which is lower casing.
10428 my $fold = property_ref('Simple_Case_Folding');
10429 if ($fold->is_empty) {
10430 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
10431 $fold->add_note(join_lines(<<END
10432 WARNING: This table uses lower case as a substitute for missing fold
10438 # Multiple-character mapping was introduced later into Unicode data. If
10439 # missing, use the single-characters maps as best available alternative
10440 foreach my $map (qw { Uppercase_Mapping
10445 my $full = property_ref($map);
10446 if ($full->is_empty) {
10447 my $simple = property_ref('Simple_' . $map);
10448 $full->initialize($simple);
10449 $full->add_comment($simple->comment) if ($simple->comment);
10450 $full->add_note(join_lines(<<END
10451 WARNING: This table uses simple mapping (single-character only) as a
10452 substitute for missing multiple-character information
10460 sub compile_perl() {
10461 # Create perl-defined tables. Almost all are part of the pseudo-property
10462 # named 'perl' internally to this program. Many of these are recommended
10463 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
10464 # on those found there.
10465 # Almost all of these are equivalent to some Unicode property.
10466 # A number of these properties have equivalents restricted to the ASCII
10467 # range, with their names prefaced by 'Posix', to signify that these match
10468 # what the Posix standard says they should match. A couple are
10469 # effectively this, but the name doesn't have 'Posix' in it because there
10470 # just isn't any Posix equivalent.
10472 # 'Any' is all code points. As an error check, instead of just setting it
10473 # to be that, construct it to be the union of all the major categories
10474 my $Any = $perl->add_match_table('Any',
10475 Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
10478 foreach my $major_table ($gc->tables) {
10480 # Major categories are the ones with single letter names.
10481 next if length($major_table->name) != 1;
10483 $Any += $major_table;
10486 if ($Any->max != $LAST_UNICODE_CODEPOINT) {
10487 Carp::my_carp_bug("Generated highest code point ("
10488 . sprintf("%X", $Any->max)
10489 . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
10491 if ($Any->range_count != 1 || $Any->min != 0) {
10492 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
10495 $Any->add_alias('All');
10497 # Assigned is the opposite of gc=unassigned
10498 my $Assigned = $perl->add_match_table('Assigned',
10499 Description => "All assigned code points",
10500 Initialize => ~ $gc->table('Unassigned'),
10503 # Our internal-only property should be treated as more than just a
10505 $perl->add_match_table('_CombAbove')
10506 ->set_equivalent_to(property_ref('ccc')->table('Above'),
10509 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
10510 if (defined $block) { # This is equivalent to the block if have it.
10511 my $Unicode_ASCII = $block->table('Basic_Latin');
10512 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
10513 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
10517 # Very early releases didn't have blocks, so initialize ASCII ourselves if
10519 if ($ASCII->is_empty) {
10520 $ASCII->initialize([ 0..127 ]);
10523 # A number of the Perl synonyms have a restricted-range synonym whose name
10524 # begins with Posix. This hash gets filled in with them, so that they can
10525 # be populated in a small loop.
10526 my %posix_equivalent;
10528 # Get the best available case definitions. Early Unicode versions didn't
10529 # have Uppercase and Lowercase defined, so use the general category
10530 # instead for them.
10531 my $Lower = $perl->add_match_table('Lower');
10532 my $Unicode_Lower = property_ref('Lowercase');
10533 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
10534 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
10537 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
10540 $posix_equivalent{'Lower'} = $Lower;
10542 my $Upper = $perl->add_match_table('Upper');
10543 my $Unicode_Upper = property_ref('Uppercase');
10544 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
10545 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
10548 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
10551 $posix_equivalent{'Upper'} = $Upper;
10553 # Earliest releases didn't have title case. Initialize it to empty if not
10554 # otherwise present
10555 my $Title = $perl->add_match_table('Title');
10556 my $lt = $gc->table('Lt');
10558 $Title->set_equivalent_to($lt, Related => 1);
10561 # If this Unicode version doesn't have Cased, set up our own. From
10562 # Unicode 5.1: Definition D120: A character C is defined to be cased if
10563 # and only if C has the Lowercase or Uppercase property or has a
10564 # General_Category value of Titlecase_Letter.
10565 unless (defined property_ref('Cased')) {
10566 my $cased = $perl->add_match_table('Cased',
10567 Initialize => $Lower + $Upper + $Title,
10568 Description => 'Uppercase or Lowercase or Titlecase',
10572 # Similarly, set up our own Case_Ignorable property if this Unicode
10573 # version doesn't have it. From Unicode 5.1: Definition D121: A character
10574 # C is defined to be case-ignorable if C has the value MidLetter or the
10575 # value MidNumLet for the Word_Break property or its General_Category is
10576 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
10577 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10579 # Perl has long had an internal-only alias for this property.
10580 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
10581 my $case_ignorable = property_ref('Case_Ignorable');
10582 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
10583 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
10588 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
10590 # The following three properties are not in early releases
10591 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
10592 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
10593 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
10595 # For versions 4.1 - 5.0, there is no MidNumLet property, and
10596 # correspondingly the case-ignorable definition lacks that one. For
10597 # 4.0, it appears that it was meant to be the same definition, but was
10598 # inadvertently omitted from the standard's text, so add it if the
10599 # property actually is there
10600 my $wb = property_ref('Word_Break');
10602 my $midlet = $wb->table('MidLetter');
10603 $perl_case_ignorable += $midlet if defined $midlet;
10604 my $midnumlet = $wb->table('MidNumLet');
10605 $perl_case_ignorable += $midnumlet if defined $midnumlet;
10609 # In earlier versions of the standard, instead of the above two
10610 # properties, just the following characters were used:
10611 $perl_case_ignorable += 0x0027 # APOSTROPHE
10612 + 0x00AD # SOFT HYPHEN (SHY)
10613 + 0x2019; # RIGHT SINGLE QUOTATION MARK
10617 # The remaining perl defined tables are mostly based on Unicode TR 18,
10618 # "Annex C: Compatibility Properties". All of these have two versions,
10619 # one whose name generally begins with Posix that is posix-compliant, and
10620 # one that matches Unicode characters beyond the Posix, ASCII range
10622 my $Alpha = $perl->add_match_table('Alpha',
10623 Description => '[[:Alpha:]] extended beyond ASCII');
10625 # Alphabetic was not present in early releases
10626 my $Alphabetic = property_ref('Alphabetic');
10627 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
10628 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
10632 # For early releases, we don't get it exactly right. The below
10633 # includes more than it should, which in 5.2 terms is: L + Nl +
10634 # Other_Alphabetic. Other_Alphabetic contains many characters from
10635 # Mn and Mc. It's better to match more than we should, than less than
10637 $Alpha->initialize($gc->table('Letter')
10639 + $gc->table('Mc'));
10640 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
10642 $posix_equivalent{'Alpha'} = $Alpha;
10644 my $Alnum = $perl->add_match_table('Alnum',
10645 Description => "[[:Alnum:]] extended beyond ASCII",
10646 Initialize => $Alpha + $gc->table('Decimal_Number'),
10648 $posix_equivalent{'Alnum'} = $Alnum;
10650 my $Word = $perl->add_match_table('Word',
10651 Description => '\w, including beyond ASCII',
10652 Initialize => $Alnum + $gc->table('Mark'),
10654 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
10655 $Word += $Pc if defined $Pc;
10657 # There is no [[:Word:]], so the name doesn't begin with Posix.
10658 $perl->add_match_table('PerlWord',
10659 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
10660 Initialize => $Word & $ASCII,
10663 my $Blank = $perl->add_match_table('Blank',
10664 Description => '\h, Horizontal white space',
10666 # 200B is Zero Width Space which is for line
10667 # break control, and was listed as
10668 # Space_Separator in early releases
10669 Initialize => $gc->table('Space_Separator')
10673 $Blank->add_alias('HorizSpace'); # Another name for it.
10674 $posix_equivalent{'Blank'} = $Blank;
10676 my $VertSpace = $perl->add_match_table('VertSpace',
10677 Description => '\v',
10678 Initialize => $gc->table('Line_Separator')
10679 + $gc->table('Paragraph_Separator')
10680 + 0x000A # LINE FEED
10681 + 0x000B # VERTICAL TAB
10682 + 0x000C # FORM FEED
10683 + 0x000D # CARRIAGE RETURN
10686 # No Posix equivalent for vertical space
10688 my $Space = $perl->add_match_table('Space',
10689 Description => '\s including beyond ASCII plus vertical tab = [[:Space:]]',
10690 Initialize => $Blank + $VertSpace,
10692 $posix_equivalent{'Space'} = $Space;
10694 # Perl's traditional space doesn't include Vertical Tab
10695 my $SpacePerl = $perl->add_match_table('SpacePerl',
10696 Description => '\s, including beyond ASCII',
10697 Initialize => $Space - 0x000B,
10699 $perl->add_match_table('PerlSpace',
10700 Description => '\s, restricted to ASCII',
10701 Initialize => $SpacePerl & $ASCII,
10704 my $Cntrl = $perl->add_match_table('Cntrl',
10705 Description => "[[:Cntrl:]] extended beyond ASCII");
10706 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
10707 $posix_equivalent{'Cntrl'} = $Cntrl;
10709 # $controls is a temporary used to construct Graph.
10710 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
10711 + $gc->table('Control'));
10712 # Cs not in release 1
10713 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
10715 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
10716 my $Graph = $perl->add_match_table('Graph',
10717 Description => "[[:Graph:]] extended beyond ASCII",
10718 Initialize => ~ ($Space + $controls),
10720 $posix_equivalent{'Graph'} = $Graph;
10722 my $Print = $perl->add_match_table('Print',
10723 Description => "[[:Print:]] extended beyond ASCII",
10724 Initialize => $Space + $Graph - $gc->table('Control'),
10726 $posix_equivalent{'Print'} = $Print;
10728 my $Punct = $perl->add_match_table('Punct');
10729 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
10731 # \p{punct} doesn't include the symbols, which posix does
10732 $perl->add_match_table('PosixPunct',
10733 Description => "[[:Punct:]]",
10734 Initialize => $ASCII & ($gc->table('Punctuation')
10735 + $gc->table('Symbol')),
10738 my $Digit = $perl->add_match_table('Digit',
10739 Description => '\d, extended beyond just [0-9]');
10740 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
10741 $posix_equivalent{'Digit'} = $Digit;
10743 # AHex was not present in early releases
10744 my $Xdigit = $perl->add_match_table('XDigit',
10745 Description => '[0-9A-Fa-f]');
10746 my $AHex = property_ref('ASCII_Hex_Digit');
10747 if (defined $AHex && ! $AHex->is_empty) {
10748 $Xdigit->set_equivalent_to($AHex->table('Y'), Related => 1);
10751 # (Have to use hex because could be running on a non-ASCII machine,
10752 # and we want the Unicode (ASCII) values)
10753 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66 ]);
10756 # Now, add the ASCII-restricted tables that get uniform treatment
10757 while (my ($name, $table) = each %posix_equivalent) {
10758 $perl->add_match_table("Posix$name",
10759 Description => "[[:$name:]]",
10760 Initialize => $table & $ASCII,
10763 $perl->table('PosixDigit')->add_description('\d, restricted to ASCII');
10764 $perl->table('PosixDigit')->add_description('[0-9]');
10767 my $dt = property_ref('Decomposition_Type');
10768 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
10769 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
10770 Perl_Extension => 1,
10771 Note => 'Perl extension consisting of the union of all non-canonical decompositions',
10774 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
10775 # than SD appeared, construct it ourselves, based on the first release SD
10777 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
10778 my $soft_dotted = property_ref('Soft_Dotted');
10779 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
10780 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
10784 # This list came from 3.2 Soft_Dotted.
10785 $CanonDCIJ->initialize([ 0x0069,
10794 $CanonDCIJ = $CanonDCIJ & $Assigned;
10797 # These are used in Unicode's definition of \X
10798 my $gcb = property_ref('Grapheme_Cluster_Break');
10799 #my $extend = $perl->add_match_table('_X_Extend');
10800 my $extend = $perl->add_match_table('_GCB_Extend');
10801 # XXX until decide what todo my $begin = $perl->add_match_table('_X_Begin');
10802 if (defined $gcb) {
10803 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark')
10804 #$begin += ~ ($gcb->table('Control')
10805 # + $gcb->table('CR')
10806 # + $gcb->table('LF'));
10808 else { # Old definition, used on early releases.
10809 $extend += $gc->table('Mark')
10812 #$begin += ~ $extend;
10815 # Create a new property specially located that is a combination of the
10816 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10817 # Name_Alias properties. (The final duplicates elements of the first.) A
10818 # comment for it is constructed based on the actual properties present and
10820 my $perl_charname = Property->new('Perl_Charnames',
10821 Core_Access => '\N{...} and charnames.pm',
10823 Directory => File::Spec->curdir(),
10825 Internal_Only_Warning => 1,
10826 Perl_Extension => 1,
10829 Initialize => property_ref('Unicode_1_Name'),
10831 # Name overrides Unicode_1_Name
10832 $perl_charname->property_add_or_replace_non_nulls(property_ref('Name'));
10833 my @composition = ('Name', 'Unicode_1_Name');
10835 if (@named_sequences) {
10836 push @composition, 'Named_Sequence';
10837 foreach my $sequence (@named_sequences) {
10838 $perl_charname->add_anomalous_entry($sequence);
10842 my $alias_sentence = "";
10843 my $alias = property_ref('Name_Alias');
10844 if (defined $alias) {
10845 push @composition, 'Name_Alias';
10846 $alias->reset_each_range;
10847 while (my ($range) = $alias->each_range) {
10848 next if $range->value eq "";
10849 if ($range->start != $range->end) {
10850 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
10852 $perl_charname->add_duplicate($range->start, $range->value);
10854 $alias_sentence = <<END;
10855 The Name_Alias property adds duplicate code point entries with a corrected
10856 name. The original (less correct, but still valid) name will be physically
10861 if (@composition <= 2) { # Always at least 2
10862 $comment = join " and ", @composition;
10865 $comment = join ", ", @composition[0 .. scalar @composition - 2];
10866 $comment .= ", and $composition[-1]";
10869 # Wait for charnames to catch up
10870 # foreach my $entry (@more_Names,
10871 # split "\n", <<"END"
10879 #FEFF; BYTE ORDER MARK
10882 # #local $to_trace = 1 if main::DEBUG;
10883 # trace $entry if main::DEBUG && $to_trace;
10884 # my ($code_point, $name) = split /\s*;\s*/, $entry;
10885 # $code_point = hex $code_point;
10886 # trace $code_point, $name if main::DEBUG && $to_trace;
10887 # $perl_charname->add_duplicate($code_point, $name);
10889 # #$perl_charname->add_comment("This file is for charnames.pm. It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'. Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence");
10890 $perl_charname->add_comment(join_lines( <<END
10891 This file is for charnames.pm. It is the union of the $comment properties.
10892 Unicode_1_Name entries are used only for otherwise nameless code
10898 # The combining class property used by Perl's normalize.pm is not located
10899 # in the normal mapping directory; create a copy for it.
10900 my $ccc = property_ref('Canonical_Combining_Class');
10901 my $perl_ccc = Property->new('Perl_ccc',
10902 Default_Map => $ccc->default_map,
10903 Full_Name => 'Perl_Canonical_Combining_Class',
10904 Internal_Only_Warning => 1,
10905 Perl_Extension => 1,
10908 Initialize => $ccc,
10909 File => 'CombiningClass',
10910 Directory => File::Spec->curdir(),
10912 $perl_ccc->set_to_output_map(1);
10913 $perl_ccc->add_comment(join_lines(<<END
10914 This mapping is for normalize.pm. It is currently identical to the Unicode
10915 Canonical_Combining_Class property.
10919 # This one match table for it is needed for calculations on output
10920 my $default = $perl_ccc->add_match_table($ccc->default_map,
10921 Initialize => $ccc->table($ccc->default_map),
10922 Status => $SUPPRESSED);
10924 # Construct the Present_In property from the Age property.
10925 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
10926 my $default_map = $age->default_map;
10927 my $in = Property->new('In',
10928 Default_Map => $default_map,
10929 Full_Name => "Present_In",
10930 Internal_Only_Warning => 1,
10931 Perl_Extension => 1,
10933 Initialize => $age,
10935 $in->add_comment(join_lines(<<END
10936 This file should not be used for any purpose. The values in this file are the
10937 same as for $age, and not for what $in really means. This is because anything
10938 defined in a given release should have multiple values: that release and all
10939 higher ones. But only one value per code point can be represented in a table
10944 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
10945 # lowest numbered (earliest) come first, with the non-numeric one
10947 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
10949 : ($b->name !~ /^[\d.]*$/)
10951 : $a->name <=> $b->name
10954 # The Present_In property is the cumulative age properties. The first
10955 # one hence is identical to the first age one.
10956 my $previous_in = $in->add_match_table($first_age->name);
10957 $previous_in->set_equivalent_to($first_age, Related => 1);
10959 my $description_start = "Code point's usage introduced in version ";
10960 $first_age->add_description($description_start . $first_age->name);
10962 # To construct the accumulated values, for each of the age tables
10963 # starting with the 2nd earliest, merge the earliest with it, to get
10964 # all those code points existing in the 2nd earliest. Repeat merging
10965 # the new 2nd earliest with the 3rd earliest to get all those existing
10966 # in the 3rd earliest, and so on.
10967 foreach my $current_age (@rest_ages) {
10968 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
10970 my $current_in = $in->add_match_table(
10971 $current_age->name,
10972 Initialize => $current_age + $previous_in,
10973 Description => $description_start
10974 . $current_age->name
10977 $previous_in = $current_in;
10979 # Add clarifying material for the corresponding age file. This is
10980 # in part because of the confusing and contradictory information
10981 # given in the Standard's documentation itself, as of 5.2.
10982 $current_age->add_description(
10983 "Code point's usage was introduced in version "
10984 . $current_age->name);
10985 $current_age->add_note("See also $in");
10989 # And finally the code points whose usages have yet to be decided are
10990 # the same in both properties. Note that permanently unassigned code
10991 # points actually have their usage assigned (as being permanently
10992 # unassigned), so that these tables are not the same as gc=cn.
10993 my $unassigned = $in->add_match_table($default_map);
10994 my $age_default = $age->table($default_map);
10995 $age_default->add_description(<<END
10996 Code point's usage has not been assigned in any Unicode release thus far.
10999 $unassigned->set_equivalent_to($age_default, Related => 1);
11003 # Finished creating all the perl properties. All non-internal non-string
11004 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
11005 # an underscore.) These do not get a separate entry in the pod file
11006 foreach my $table ($perl->tables) {
11007 foreach my $alias ($table->aliases) {
11008 next if $alias->name =~ /^_/;
11009 $table->add_alias('Is_' . $alias->name,
11011 Status => $alias->status,
11012 Externally_Ok => 0);
11019 sub add_perl_synonyms() {
11020 # A number of Unicode tables have Perl synonyms that are expressed in
11021 # the single-form, \p{name}. These are:
11022 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11023 # \p{Is_Name} as synonyms
11024 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11025 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11026 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11027 # conflict, \p{Value} and \p{Is_Value} as well
11029 # This routine generates these synonyms, warning of any unexpected
11032 # Construct the list of tables to get synonyms for. Start with all the
11033 # binary and the General_Category ones.
11034 my @tables = grep { $_->type == $BINARY } property_ref('*');
11035 push @tables, $gc->tables;
11037 # If the version of Unicode includes the Script property, add its tables
11038 if (defined property_ref('Script')) {
11039 push @tables, property_ref('Script')->tables;
11042 # The Block tables are kept separate because they are treated differently.
11043 # And the earliest versions of Unicode didn't include them, so add only if
11046 push @blocks, $block->tables if defined $block;
11048 # Here, have the lists of tables constructed. Process blocks last so that
11049 # if there are name collisions with them, blocks have lowest priority.
11050 # Should there ever be other collisions, manual intervention would be
11051 # required. See the comments at the beginning of the program for a
11052 # possible way to handle those semi-automatically.
11053 foreach my $table (@tables, @blocks) {
11055 # For non-binary properties, the synonym is just the name of the
11056 # table, like Greek, but for binary properties the synonym is the name
11057 # of the property, and means the code points in its 'Y' table.
11058 my $nominal = $table;
11059 my $nominal_property = $nominal->property;
11061 if (! $nominal->isa('Property')) {
11066 # Here is a binary property. Use the 'Y' table. Verify that is
11068 my $yes = $nominal->table('Y');
11069 unless (defined $yes) { # Must be defined, but is permissible to
11071 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
11077 foreach my $alias ($nominal->aliases) {
11079 # Attempt to create a table in the perl directory for the
11080 # candidate table, using whatever aliases in it that don't
11081 # conflict. Also add non-conflicting aliases for all these
11082 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11084 foreach my $prefix ("", 'Is_', 'In_') {
11086 # Only Block properties can have added 'In_' aliases.
11087 next if $prefix eq 'In_' and $nominal_property != $block;
11089 my $proposed_name = $prefix . $alias->name;
11091 # No Is_Is, In_In, nor combinations thereof
11092 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11093 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11095 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11097 # Get a reference to any existing table in the perl
11098 # directory with the desired name.
11099 my $pre_existing = $perl->table($proposed_name);
11101 if (! defined $pre_existing) {
11103 # No name collision, so ok to add the perl synonym.
11105 my $make_pod_entry;
11107 my $status = $actual->status;
11108 if ($nominal_property == $block) {
11110 # For block properties, the 'In' form is preferred for
11111 # external use; the pod file contains wild cards for
11112 # this and the 'Is' form so no entries for those; and
11113 # we don't want people using the name without the
11114 # 'In', so discourage that.
11115 if ($prefix eq "") {
11116 $make_pod_entry = 1;
11117 $status = $status || $DISCOURAGED;
11118 $externally_ok = 0;
11120 elsif ($prefix eq 'In_') {
11121 $make_pod_entry = 0;
11122 $status = $status || $NORMAL;
11123 $externally_ok = 1;
11126 $make_pod_entry = 0;
11127 $status = $status || $DISCOURAGED;
11128 $externally_ok = 0;
11131 elsif ($prefix ne "") {
11133 # The 'Is' prefix is handled in the pod by a wild
11134 # card, and we won't use it for an external name
11135 $make_pod_entry = 0;
11136 $status = $status || $NORMAL;
11137 $externally_ok = 0;
11141 # Here, is an empty prefix, non block. This gets its
11142 # own pod entry and can be used for an external name.
11143 $make_pod_entry = 1;
11144 $status = $status || $NORMAL;
11145 $externally_ok = 1;
11148 # Here, there isn't a perl pre-existing table with the
11149 # name. Look through the list of equivalents of this
11150 # table to see if one is a perl table.
11151 foreach my $equivalent ($actual->leader->equivalents) {
11152 next if $equivalent->property != $perl;
11154 # Here, have found a table for $perl. Add this alias
11155 # to it, and are done with this prefix.
11156 $equivalent->add_alias($proposed_name,
11157 Pod_Entry => $make_pod_entry,
11159 Externally_Ok => $externally_ok);
11160 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11164 # Here, $perl doesn't already have a table that is a
11165 # synonym for this property, add one.
11166 my $added_table = $perl->add_match_table($proposed_name,
11167 Pod_Entry => $make_pod_entry,
11169 Externally_Ok => $externally_ok);
11170 # And it will be related to the actual table, since it is
11172 $added_table->set_equivalent_to($actual, Related => 1);
11173 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11175 } # End of no pre-existing.
11177 # Here, there is a pre-existing table that has the proposed
11178 # name. We could be in trouble, but not if this is just a
11179 # synonym for another table that we have already made a child
11180 # of the pre-existing one.
11181 if ($pre_existing->is_equivalent_to($actual)) {
11182 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11183 $pre_existing->add_alias($proposed_name);
11187 # Here, there is a name collision, but it still could be ok if
11188 # the tables match the identical set of code points, in which
11189 # case, we can combine the names. Compare each table's code
11190 # point list to see if they are identical.
11191 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11192 if ($pre_existing->matches_identically_to($actual)) {
11194 # Here, they do match identically. Not a real conflict.
11195 # Make the perl version a child of the Unicode one, except
11196 # in the non-obvious case of where the perl name is
11197 # already a synonym of another Unicode property. (This is
11198 # excluded by the test for it being its own parent.) The
11199 # reason for this exclusion is that then the two Unicode
11200 # properties become related; and we don't really know if
11201 # they are or not. We generate documentation based on
11202 # relatedness, and this would be misleading. Code
11203 # later executed in the process will cause the tables to
11204 # be represented by a single file anyway, without making
11205 # it look in the pod like they are necessarily related.
11206 if ($pre_existing->parent == $pre_existing
11207 && ($pre_existing->property == $perl
11208 || $actual->property == $perl))
11210 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
11211 $pre_existing->set_equivalent_to($actual, Related => 1);
11213 elsif (main::DEBUG && $to_trace) {
11214 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
11215 trace $pre_existing->parent;
11220 # Here they didn't match identically, there is a real conflict
11221 # between our new name and a pre-existing property.
11222 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
11223 $pre_existing->add_conflicting($nominal->full_name,
11227 # Don't output a warning for aliases for the block
11228 # properties (unless they start with 'In_') as it is
11229 # expected that there will be conflicts and the block
11231 if ($verbosity >= $NORMAL_VERBOSITY
11232 && ($actual->property != $block || $prefix eq 'In_'))
11234 print simple_fold(join_lines(<<END # interpolating heredoc: objects stringify in place, no '" . ... . "' needed
11235 There is already an alias named $proposed_name (from $pre_existing),
11236 so not creating this alias for $actual
11241 # Keep track for documentation purposes.
11242 $has_In_conflicts++ if $prefix eq 'In_';
11243 $has_Is_conflicts++ if $prefix eq 'Is_';
11248 # There are some properties which have No and Yes (and N and Y) as
11249 # property values, but aren't binary, and could possibly be confused with
11250 # binary ones. So create caveats for them. There are tables that are
11251 # named 'No', and tables that are named 'N', but confusion is not likely
11252 # unless they are the same table. For example, N meaning Number or
11253 # Neutral is not likely to cause confusion, so don't add caveats to things
11255 foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
11256 my $yes = $property->table('Yes');
11257 if (defined $yes) {
11258 my $y = $property->table('Y');
11259 if (defined $y && $yes == $y) {
11260 foreach my $alias ($property->aliases) {
11261 $yes->add_conflicting($alias->name);
11265 my $no = $property->table('No');
11267 my $n = $property->table('N');
11268 if (defined $n && $no == $n) {
11269 foreach my $alias ($property->aliases) {
11270 $no->add_conflicting($alias->name, 'P');
11279 sub register_file_for_name($$$) {
11280 # Given info about a table and a datafile that it should be associated
11281 # with, register that association
11284 my $directory_ref = shift; # Array of the directory path for the file
11285 my $file = shift; # The file name in the final directory, [-1].
11286 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11288 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
11290 if ($table->isa('Property')) {
11291 $table->set_file_path(@$directory_ref, $file);
11292 push @map_properties, $table
11293 if $directory_ref->[0] eq $map_directory;
11297 # Do all of the work for all equivalent tables when called with the leader
11298 # table, so skip if isn't the leader.
11299 return if $table->leader != $table;
11301 # Join all the file path components together, using slashes.
11302 my $full_filename = join('/', @$directory_ref, $file);
11304 # All go in the same subdirectory of unicore
11305 if ($directory_ref->[0] ne $matches_directory) {
11306 Carp::my_carp("Unexpected directory in "
11307 . join('/', @{$directory_ref}, $file));
11310 # For this table and all its equivalents ...
11311 foreach my $table ($table, $table->equivalents) {
11313 # Associate it with its file internally. The sub-filename registered
11314 # below omits the $matches_directory first component but keeps all others
11315 $table->set_file_path(@$directory_ref, $file);
11316 my $sub_filename = join('/', @{$directory_ref}[1 .. $#{$directory_ref}], $file); # array slice; '->[1, -1]' is a comma expression yielding only element -1
11318 my $property = $table->property;
11319 $property = ($property == $perl)
11320 ? "" # 'perl' is never explicitly stated
11321 : standardize($property->name) . '=';
11323 my $deprecated = ($table->status eq $DEPRECATED)
11324 ? $table->status_info
11327 # And for each of the table's aliases... This inner loop eventually
11328 # goes through all aliases in the UCD that we generate regex match
11330 foreach my $alias ($table->aliases) {
11331 my $name = $alias->name;
11333 # Generate an entry in either the loose or strict hashes, which
11334 # will translate the property and alias names combination into the
11335 # file where the table for them is stored.
11337 if ($alias->loose_match) {
11338 $standard = $property . standardize($alias->name);
11339 if (exists $loose_to_file_of{$standard}) {
11340 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
11343 $loose_to_file_of{$standard} = $sub_filename;
11347 $standard = lc ($property . $name);
11348 if (exists $stricter_to_file_of{$standard}) {
11349 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
11352 $stricter_to_file_of{$standard} = $sub_filename;
11354 # Tightly coupled with how utf8_heavy.pl works, for a
11355 # floating point number that is a whole number, get rid of
11356 # the trailing decimal point and 0's, so that utf8_heavy
11357 # will work. Also note that this assumes that such a
11358 # number is matched strictly; so if that were to change,
11359 # this would be wrong.
11360 if ((my $integer_name = $name)
11361 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
11363 $stricter_to_file_of{$property . $integer_name}
11369 # Keep a list of the deprecated properties and their filenames
11371 $utf8::why_deprecated{$sub_filename} = $deprecated;
11380 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
11382 my %full_dir_name_of; # Full length names of directories used.
11384 sub construct_filename($$$) {
11385 # Return a file name for a table, based on the table name, but perhaps
11386 # changed to get rid of non-portable characters in it, and to make
11387 # sure that it is unique on a file system that allows the names before
11388 # any period to be at most 8 characters (DOS). While we're at it
11389 # check and complain if there are any directory conflicts.
11391 my $name = shift; # The name to start with
11392 my $mutable = shift; # Boolean: can it be changed? If no, but
11393 # yet it must be to work properly, a warning
11395 my $directories_ref = shift; # A reference to an array containing the
11396 # path to the file, with each element one path
11397 # component. This is used because the same
11398 # name can be used in different directories.
11399 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11401 my $warn = ! defined wantarray; # If true, then if the name is
11402 # changed, a warning is issued as well.
11404 if (! defined $name) {
11405 Carp::my_carp("Undefined name in directory "
11406 . File::Spec->join(@$directories_ref)
11411 # Make sure that no directory names conflict with each other. Look at
11412 # each directory in the input file's path. If it is already in use,
11413 # assume it is correct, and is merely being re-used, but if we
11414 # truncate it to 8 characters, and find that there are two directories
11415 # that are the same for the first 8 characters, but differ after that,
11416 # then that is a problem.
11417 foreach my $directory (@$directories_ref) {
11418 my $short_dir = substr($directory, 0, 8);
11419 if (defined $full_dir_name_of{$short_dir}) {
11420 next if $full_dir_name_of{$short_dir} eq $directory;
11421 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
11424 $full_dir_name_of{$short_dir} = $directory;
11428 my $path = join '/', @$directories_ref;
11429 $path .= '/' if $path;
11431 # Remove interior underscores.
11432 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
11434 # Change any non-word character into an underscore, and truncate to 8.
11435 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
11436 substr($filename, 8) = "" if length($filename) > 8;
11438 # Make sure the basename doesn't conflict with something we
11439 # might have already written. If we have, say,
11446 while (my $num = $base_names{$path}{lc $filename}++) {
11447 $num++; # so basenames with numbers start with '2', which
11448 # just looks more natural.
11450 # Want to append $num, but if it'll make the basename longer
11451 # than 8 characters, pre-truncate $filename so that the result
11453 my $delta = length($filename) + length($num) - 8;
11455 substr($filename, -$delta) = $num;
11460 if ($warn && ! $warned) {
11462 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
11466 return $filename if $mutable;
11468 # If not changeable, must return the input name, but warn if needed to
11469 # change it beyond shortening it.
11470 if ($name ne $filename
11471 && substr($name, 0, length($filename)) ne $filename) {
11472 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
11478 # The pod file contains a very large table. Many of the lines in that table
11479 # would exceed a typical output window's size, and so need to be wrapped with
11480 # a hanging indent to make them look good. The pod language is really
11481 # insufficient here. There is no general construct to do that in pod, so it
11482 # is done here by beginning each such line with a space to cause the result to
11483 # be output without formatting, and doing all the formatting here. This leads
11484 # to the result that if the eventual display window is too narrow it won't
11485 # look good, and if the window is too wide, no advantage is taken of that
11486 # extra width. A further complication is that the output may be indented by
11487 # the formatter so that there is less space than expected. What I (khw) have
11488 # done is to assume that that indent is a particular number of spaces based on
11489 # what it is in my Linux system; people can always resize their windows if
11490 # necessary, but this is obviously less than desirable, but the best that can
11492 my $automatic_pod_indent = 8;
11494 # Try to format so that uses fewest lines, but few long left column entries
11495 # slide into the right column. An experiment on 5.1 data yielded the
11496 # following percentages that didn't cut into the other side along with the
11497 # associated first-column widths
11499 # 80% not too bad except for a few blocks
11500 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
11502 my $indent_info_column = 27; # 75% of lines didn't have overlap
11504 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
11505 # The 3 is because of:
11506 # 1 for the leading space to tell the pod formatter to
11509 # 1 for the space between the flag and the main data
11511 sub format_pod_line ($$$;$$) {
11512 # Take a pod line and return it, formatted properly
11514 my $first_column_width = shift;
11515 my $entry = shift; # Contents of left column
11516 my $info = shift; # Contents of right column
11518 my $status = shift || ""; # Any flag
11520 my $loose_match = shift; # Boolean.
11521 $loose_match = 1 unless defined $loose_match;
11523 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11526 $flags .= $STRICTER if ! $loose_match;
11528 $flags .= $status if $status;
11530 # There is a blank in the left column to cause the pod formatter to
11531 # output the line as-is.
11532 return sprintf " %-*s%-*s %s\n",
11533 # The first * in the format is replaced by this, the -1 is
11534 # to account for the leading blank. There isn't a
11535 # hard-coded blank after this to separate the flags from
11536 # the rest of the line, so that in the unlikely event that
11537 # multiple flags are shown on the same line, they both
11538 # will get displayed at the expense of that separation,
11539 # but since they are left justified, a blank will be
11540 # inserted in the normal case.
11544 # The other * in the format is replaced by this number to
11545 # cause the first main column to right fill with blanks.
11546 # The -1 is for the guaranteed blank following it.
11547 $first_column_width - $FILLER - 1,
11552 my @zero_match_tables; # List of tables that have no matches in this release
11554 sub make_table_pod_entries($) {
11555 # This generates the entries for the pod file for a given table.
11556 # Also done at this time are any children tables. The output looks like:
11557 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
11559 my $input_table = shift; # Table the entry is for
11560 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11562 # Generate parent and all its children at the same time.
11563 return if $input_table->parent != $input_table;
11565 my $property = $input_table->property;
11566 my $type = $property->type;
11567 my $full_name = $property->full_name;
11569 my $count = $input_table->count;
11570 my $string_count = clarify_number($count);
11571 my $status = $input_table->status;
11572 my $status_info = $input_table->status_info;
11574 my $entry_for_first_table; # The entry for the first table output.
11575 # Almost certainly, it is the parent.
11577 # For each related table (including itself), we will generate a pod entry
11578 # for each name each table goes by
11579 foreach my $table ($input_table, $input_table->children) {
11581 # utf8_heavy.pl cannot deal with null string property values, so don't
11583 next if $table->name eq "";
11585 # First, gather all the info that applies to this table as a whole.
11587 push @zero_match_tables, $table if $count == 0;
11589 my $table_property = $table->property;
11591 # The short name has all the underscores removed, while the full name
11592 # retains them. Later, when we decide whether to output a short synonym
11593 # for the full one, we need to compare apples to apples, so we use the
11594 # short name's length including underscores.
11595 my $table_property_short_name_length;
11596 my $table_property_short_name
11597 = $table_property->short_name(\$table_property_short_name_length);
11598 my $table_property_full_name = $table_property->full_name;
11600 # Get how much savings there is in the short name over the full one
11601 # (delta will always be <= 0)
11602 my $table_property_short_delta = $table_property_short_name_length
11603 - length($table_property_full_name);
11604 my @table_description = $table->description;
11605 my @table_note = $table->note;
11607 # Generate an entry for each alias in this table.
11608 my $entry_for_first_alias; # saves the first one encountered.
11609 foreach my $alias ($table->aliases) {
11611 # Skip if not to go in pod.
11612 next unless $alias->make_pod_entry;
11614 # Start gathering all the components for the entry
11615 my $name = $alias->name;
11617 my $entry; # Holds the left column, may include extras
11618 my $entry_ref; # To refer to the left column's contents from
11619 # another entry; has no extras
11621 # First the left column of the pod entry. Tables for the $perl
11622 # property always use the single form.
11623 if ($table_property == $perl) {
11624 $entry = "\\p{$name}";
11625 $entry_ref = "\\p{$name}";
11627 else { # Compound form.
11629 # Only generate one entry for all the aliases that mean true
11630 # or false in binary properties. Append a '*' to indicate
11631 # some are missing. (The heading comment notes this.)
11632 my $wild_card_mark;
11633 if ($type == $BINARY) {
11634 next if $name ne 'N' && $name ne 'Y';
11635 $wild_card_mark = '*';
11638 $wild_card_mark = "";
11641 # Colon-space is used to give a little more space to be easier
11644 . $table_property_full_name
11645 . ": $name$wild_card_mark}";
11647 # But for the reference to this entry, which will go in the
11648 # right column, where space is at a premium, use equals
11650 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
11653 # Then the right (info) column. This is stored as components of
11654 # an array for the moment, then joined into a string later. For
11655 # non-internal only properties, begin the info with the entry for
11656 # the first table we encountered (if any), as things are ordered
11657 # so that that one is the most descriptive. This leads to the
11658 # info column of an entry being a more descriptive version of the
11661 if ($name =~ /^_/) {
11663 '(For internal use by Perl, not necessarily stable)';
11665 elsif ($entry_for_first_alias) {
11666 push @info, $entry_for_first_alias;
11669 # If this entry is equivalent to another, add that to the info,
11670 # using the first such table we encountered
11671 if ($entry_for_first_table) {
11673 push @info, "(= $entry_for_first_table)";
11676 push @info, $entry_for_first_table;
11680 # If the name is a large integer, add an equivalent with an
11681 # exponent for better readability
11682 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
11683 push @info, sprintf "(= %.1e)", $name
11686 my $parenthesized = "";
11687 if (! $entry_for_first_alias) {
11689 # This is the first alias for the current table. The alias
11690 # array is ordered so that this is the fullest, most
11691 # descriptive alias, so it gets the fullest info. The other
11692 # aliases are mostly merely pointers to this one, using the
11693 # information already added above.
11695 # Display any status message, but only on the parent table
11696 if ($status && ! $entry_for_first_table) {
11697 push @info, $status_info;
11700 # Put out any descriptive info
11701 if (@table_description || @table_note) {
11702 push @info, join "; ", @table_description, @table_note;
11705 # Look to see if there is a shorter name we can point people
11707 my $standard_name = standardize($name);
11709 my $proposed_short = $table->short_name;
11710 if (defined $proposed_short) {
11711 my $standard_short = standardize($proposed_short);
11713 # If the short name is shorter than the standard one, or
11714 # even if it's not, but the combination of it and its
11715 # short property name (as in \p{prop=short} ($perl doesn't
11716 # have this form)) saves at least two characters, then,
11717 # cause it to be listed as a shorter synonym.
11718 if (length $standard_short < length $standard_name
11719 || ($table_property != $perl
11720 && (length($standard_short)
11721 - length($standard_name)
11722 + $table_property_short_delta) # (<= 0)
11725 $short_name = $proposed_short;
11726 if ($table_property != $perl) {
11727 $short_name = $table_property_short_name
11730 $short_name = "\\p{$short_name}";
11734 # And if this is a compound form name, see if there is a
11735 # single form equivalent
11737 if ($table_property != $perl) {
11739 # Special case the binary N tables, so that will print
11740 # \P{single}, but use the Y table values to populate
11741 # 'single', as we haven't populated the N table.
11744 if ($type == $BINARY
11745 && $input_table == $property->table('No'))
11747 $test_table = $property->table('Yes');
11751 $test_table = $input_table;
11755 # Look for a single form amongst all the children.
11756 foreach my $table ($test_table->children) {
11757 next if $table->property != $perl;
11758 my $proposed_name = $table->short_name;
11759 next if ! defined $proposed_name;
11761 # Don't mention internal-only properties as a possible
11762 # single form synonym
11763 next if substr($proposed_name, 0, 1) eq '_';
11765 $proposed_name = "\\$p\{$proposed_name}";
11766 if (! defined $single_form
11767 || length($proposed_name) < length $single_form)
11769 $single_form = $proposed_name;
11771 # The goal here is to find a single form; not the
11772 # shortest possible one. We've already found a
11773 # short name. So, stop at the first single form
11774 # found, which is likely to be closer to the
11781 # Output both short and single in the same parenthesized
11782 # expression, but with only one of 'Single', 'Short' if there
11784 if ($short_name || $single_form || $table->conflicting) {
11785 $parenthesized .= '(';
11786 $parenthesized .= "Short: $short_name" if $short_name;
11787 if ($short_name && $single_form) {
11788 $parenthesized .= ', ';
11790 elsif ($single_form) {
11791 $parenthesized .= 'Single: ';
11793 $parenthesized .= $single_form if $single_form;
11798 # Warn if this property isn't the same as one that a
11799 # semi-casual user might expect. The other components of this
11800 # parenthesized structure are calculated only for the first entry
11801 # for this table, but the conflicting is deemed important enough
11802 # to go on every entry.
11803 my $conflicting = join " NOR ", $table->conflicting;
11804 if ($conflicting) {
11805 $parenthesized .= '(' if ! $parenthesized;
11806 $parenthesized .= '; ' if $parenthesized ne '(';
11807 $parenthesized .= "NOT $conflicting";
11809 $parenthesized .= ')' if $parenthesized;
11811 push @info, $parenthesized if $parenthesized;
11812 push @info, "($string_count)" if $output_range_counts;
11814 # Now, we have both the entry and info so add them to the
11815 # list of all the properties.
11816 push @match_properties,
11817 format_pod_line($indent_info_column,
11821 $alias->loose_match);
11823 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
11824 } # End of looping through the aliases for this table.
11826 if (! $entry_for_first_table) {
11827 $entry_for_first_table = $entry_for_first_alias;
11829 } # End of looping through all the related tables
11833 sub pod_alphanumeric_sort {
11834 # Sort pod entries alphanumerically.
11836 # The first few character columns are filler, plus the '\p{'; and get rid
11837 # of all the trailing stuff, starting with the trailing '}', so as to sort
11838 # on just 'Name=Value'
11839 (my $a = lc $a) =~ s/^ .*? { //x;
11841 (my $b = lc $b) =~ s/^ .*? { //x;
11844 # Determine if the two operands are both internal only or both not.
11845 # Everything through the '{' has already been stripped from both, so
11846 # character 0 is the underscore that begins an internal-only name
11847 my $a_is_internal = (substr($a, 0, 1) eq '_');
11848 my $b_is_internal = (substr($b, 0, 1) eq '_');
11850 # Sort so the internals come last in the table instead of first (which the
11851 # leading underscore would otherwise indicate).
11852 if ($a_is_internal != $b_is_internal) {
11853 return 1 if $a_is_internal;
11857 # Determine if the two operands are numeric property values or not.
11858 # A numeric property will look like xyz: 3. But the number
11859 # can begin with an optional minus sign, and may have a
11860 # fraction or rational component, like xyz: 3/2. If either
11861 # isn't numeric, use alphabetic sort.
11862 my ($a_initial, $a_number) =
11863 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
11864 return $a cmp $b unless defined $a_number;
11865 my ($b_initial, $b_number) =
11866 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
11867 return $a cmp $b unless defined $b_number;
11869 # Here they are both numeric, but use alphabetic sort if the
11870 # initial parts don't match
11871 return $a cmp $b if $a_initial ne $b_initial;
11873 # Convert rationals to floating for the comparison.
11874 $a_number = eval $a_number if $a_number =~ qr{/};
11875 $b_number = eval $b_number if $b_number =~ qr{/};
11877 return $a_number <=> $b_number;
11881 # Create the .pod file. This generates the various subsections and then
11882 # combines them in one big HERE document.
11884 return unless defined $pod_directory;
11885 print "Making pod file\n" if $verbosity >= $PROGRESS;
11887 my $exception_message =
11888 '(Any exceptions are individually noted beginning with the word NOT.)';
11890 if (-e 'Blocks.txt') {
11892 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
11893 # if the global $has_In_conflicts indicates we have them.
11894 push @match_properties, format_pod_line($indent_info_column,
11897 . (($has_In_conflicts)
11898 ? " $exception_message"
11900 @block_warning = << "END";
11902 Matches in the Block property have shortcuts that begin with 'In_'. For
11903 example, \\p{Block=Latin1} can be written as \\p{In_Latin1}. For backward
11904 compatibility, if there is no conflict with another shortcut, these may also
11905 be written as \\p{Latin1} or \\p{Is_Latin1}. But, N.B., there are numerous
11906 such conflicting shortcuts. Use of these forms for Block is discouraged, and
11907 are flagged as such, not only because of the potential confusion as to what is
11908 meant, but also because a later release of Unicode may preempt the shortcut,
11909 and your program would no longer be correct. Use the 'In_' form instead to
11910 avoid this, or even more clearly, use the compound form, e.g.,
11911 \\p{blk:latin1}. See L<perlunicode/"Blocks"> for more information about this.
11914 my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
11915 $text = "$exception_message $text" if $has_Is_conflicts;
11917 # And the 'Is_' line
11918 push @match_properties, format_pod_line($indent_info_column,
11922 # Sort the properties array for output. It is sorted alphabetically
11923 # except numerically for numeric properties, and only output unique lines.
11924 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
11926 my $formatted_properties = simple_fold(\@match_properties,
11928 # indent succeeding lines by two extra
11929 # which looks better
11930 $indent_info_column + 2,
11932 # shorten the line length by how much
11933 # the formatter indents, so the folded
11934 # line will fit in the space
11935 # presumably available
11936 $automatic_pod_indent);
11937 # Add column headings, indented to be a little more centered, but not
11939 $formatted_properties = format_pod_line($indent_info_column,
11943 . $formatted_properties;
11945 # Generate pod documentation lines for the tables that match nothing
11947 if (@zero_match_tables) {
11948 @zero_match_tables = uniques(@zero_match_tables);
11949 $zero_matches = join "\n\n",
11950 map { $_ = '=item \p{' . $_->complete_name . "}" }
11951 sort { $a->complete_name cmp $b->complete_name }
11952 uniques(@zero_match_tables);
11954 $zero_matches = <<END;
11956 =head2 Legal \\p{} and \\P{} constructs that match no characters
11958 Unicode has some property-value pairs that currently don't match anything.
11959 This happens generally either because they are obsolete, or for symmetry with
11960 other forms, but no language has yet been encoded that uses them. In this
11961 version of Unicode, the following match zero code points:
11972 # Generate list of properties that we don't accept, grouped by the reasons
11973 # why. This is so we only put out the 'why' once, and then list all the
11974 # properties that have that reason under it.
11976 my %why_list; # The keys are the reasons; the values are lists of
11977 # properties that have the key as their reason
11979 # For each property, add it to the list that are suppressed for its reason
11980 # The sort will cause the alphabetically first properties to be added to
11981 # each list first, so each list will be sorted.
11982 foreach my $property (sort keys %why_suppressed) {
11983 push @{$why_list{$why_suppressed{$property}}}, $property;
11986 # For each reason (sorted by the first property that has that reason)...
11987 my @bad_re_properties;
11988 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
11991 # Add to the output, all the properties that have that reason. Start
11992 # with an empty line.
11993 push @bad_re_properties, "\n\n";
11995 my $has_item = 0; # Flag if actually output anything.
11996 foreach my $name (@{$why_list{$why}}) {
11998 # Split compound names into $property and $table components
11999 my $property = $name;
12001 if ($property =~ / (.*) = (.*) /x) {
12006 # This release of Unicode may not have a property that is
12007 # suppressed, so don't reference a non-existent one.
12008 $property = property_ref($property);
12009 next if ! defined $property;
12011 # And since this list is only for match tables, don't list the
12012 # ones that don't have match tables.
12013 next if ! $property->to_create_match_tables;
12015 # Find any abbreviation, and turn it into a compound name if this
12016 # is a property=value pair.
12017 my $short_name = $property->name;
12018 $short_name .= '=' . $property->table($table)->name if $table;
12020 # And add the property as an item for the reason.
12021 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12025 # And add the reason under the list of properties, if such a list
12026 # actually got generated. Note that the header got added
12027 # unconditionally before. But pod ignores extra blank lines, so no
12029 push @bad_re_properties, "\n$why\n" if $has_item;
12031 } # End of looping through each reason.
12033 # Generate a list of the properties whose map table we output, from the
12034 # global @map_properties.
12035 my @map_tables_actually_output;
12036 my $info_indent = 20; # Left column is narrower than \p{} table.
12037 foreach my $property (@map_properties) {
12039 # Get the path to the file; don't output any not in the standard
12041 my @path = $property->file_path;
12042 next if $path[0] ne $map_directory;
12043 shift @path; # Remove the standard name
12045 my $file = join '/', @path; # In case is in sub directory
12046 my $info = $property->full_name;
12047 my $short_name = $property->name;
12048 if ($info ne $short_name) {
12049 $info .= " ($short_name)";
12051 foreach my $more_info ($property->description,
12053 $property->status_info)
12055 next unless $more_info;
12057 $info .= ". $more_info";
12059 push @map_tables_actually_output, format_pod_line($info_indent,
12062 $property->status);
12065 # Sort alphabetically, and fold for output
12066 @map_tables_actually_output = sort
12067 pod_alphanumeric_sort @map_tables_actually_output;
12068 @map_tables_actually_output
12069 = simple_fold(\@map_tables_actually_output,
12072 $automatic_pod_indent);
12074 # Generate a list of the formats that can appear in the map tables.
12075 my @map_table_formats;
12076 foreach my $format (sort keys %map_table_formats) {
12077 push @map_table_formats, " $format $map_table_formats{$format}\n";
12080 # Everything is ready to assemble.
12081 my @OUT = << "END";
12086 To change this file, edit $0 instead.
12092 $pod_file - Complete index of Unicode Version $string_version properties
12096 There are many properties in Unicode, and Perl provides access to almost all of
12097 them, as well as some additional extensions and short-cut synonyms.
12099 And just about all of the few that aren't accessible through the Perl
12100 core are accessible through the modules: Unicode::Normalize and
12101 Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12103 This document merely lists all available properties and does not attempt to
12104 explain what each property really means. There is a brief description of each
12105 Perl extension. There is some detail about Blocks, Scripts, General_Category,
12106 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12107 Unicode properties, refer to the Unicode standard. A good starting place is
12108 L<$unicode_reference_url>. More information on the Perl extensions is in
12109 L<perlrecharclass>.
12111 Note that you can define your own properties; see
12112 L<perlunicode/"User-Defined Character Properties">.
12114 =head1 Properties accessible through \\p{} and \\P{}
12116 The Perl regular expression \\p{} and \\P{} constructs give access to most of
12117 the Unicode character properties. The table below shows all these constructs,
12118 both single and compound forms.
12120 B<Compound forms> consist of two components, separated by an equals sign or a
12121 colon. The first component is the property name, and the second component is
12122 the particular value of the property to match against, for example,
12123 '\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
12124 whose Script property is Greek.
12126 B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12127 their equivalent compound forms. The table shows these equivalences. (In our
12128 example, '\\p{Greek}' is just a shortcut for '\\p{Script=Greek}'.)
12129 There are also a few Perl-defined single forms that are not shortcuts for a
12130 compound form. One such is \\p{Word}. These are also listed in the table.
12132 In parsing these constructs, Perl always ignores Upper/lower case differences
12133 everywhere within the {braces}. Thus '\\p{Greek}' means the same thing as
12134 '\\p{greek}'. But note that changing the case of the 'p' or 'P' before the
12135 left brace completely changes the meaning of the construct, from "match" (for
12136 '\\p{}') to "doesn't match" (for '\\P{}'). Casing in this document is for
12137 improved legibility.
12139 Also, white space, hyphens, and underscores are normally ignored
12140 everywhere between the {braces}, and hence can be freely added or removed
12141 even if the C</x> modifier hasn't been specified on the regular expression.
12142 But $a_bold_stricter at the beginning of an entry in the table below
12143 means that tighter (stricter) rules are used for that entry:
12147 =item Single form (\\p{name}) tighter rules:
12149 White space, hyphens, and underscores ARE significant
12154 =item * white space adjacent to a non-word character
12156 =item * underscores separating digits in numbers
12160 That means, for example, that you can freely add or remove white space
12161 adjacent to (but within) the braces without affecting the meaning.
12163 =item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12165 The tighter rules given above for the single form apply to everything to the
12166 right of the colon or equals; the looser rules still apply to everything to
12169 That means, for example, that you can freely add or remove white space
12170 adjacent to (but within) the braces and the colon or equal sign.
12174 Some properties are considered obsolete, but still available. There are
12175 several varieties of obsolescence:
12181 Properties marked with $a_bold_obsolete in the table are considered
12182 obsolete. At the time of this writing (Unicode version 5.2) there is no
12183 information in the Unicode standard about the implications of a property being
12188 Obsolete properties may be stabilized. This means that they are not actively
12189 maintained by Unicode, and will not be extended as new characters are added to
12190 the standard. Such properties are marked with $a_bold_stabilized in the
12191 table. At the time of this writing (Unicode version 5.2) there is no further
12192 information in the Unicode standard about the implications of a property being
12197 Obsolete properties may be deprecated. This means that their use is strongly
12198 discouraged, so much so that a warning will be issued if used, unless the
12199 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
12200 statement. $A_bold_deprecated flags each such entry in the table, and
12201 the entry there for the longest, most descriptive version of the property will
12202 give the reason it is deprecated, and perhaps advice. Perl may issue such a
12203 warning, even for properties that aren't officially deprecated by Unicode,
12204 when there used to be characters or code points that were matched by them, but
12205 no longer. This is to warn you that your program may not work like it did on
12206 earlier Unicode releases.
12208 Deprecated properties may be made unavailable in a future Perl version, so
12209 it is best to move away from them.
12213 Some Perl extensions are present for backwards compatibility and are
12214 discouraged from being used, but not obsolete. $A_bold_discouraged
12215 flags each such entry in the table.
12219 The table below has two columns. The left column contains the \\p{}
12220 constructs to look up, possibly preceded by the flags mentioned above; and
12221 the right column contains information about them, like a description, or
12222 synonyms. It shows both the single and compound forms for each property that
12223 has them. If the left column is a short name for a property, the right column
12224 will give its longer, more descriptive name; and if the left column is the
12225 longest name, the right column will show any equivalent shortest name, in both
12226 single and compound forms if applicable.
12228 The right column will also caution you if a property means something different
12229 than what might normally be expected.
12231 Numbers in (parentheses) indicate the total number of code points matched by
12232 the property. For emphasis, those properties that match no code points at all
12233 are listed as well in a separate section following the table.
12235 There is no description given for most non-Perl defined properties (See
12236 $unicode_reference_url for that).
12238 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
12239 combinations. For example, entries like:
12241 \\p{Gc: *} \\p{General_Category: *}
12243 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
12244 for the latter is also valid for the former. Similarly,
12248 means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
12249 \\p{IsFoo} are also valid and all mean the same thing. And similarly,
12250 \\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}. '*' here
12251 is restricted to something not beginning with an underscore.
12253 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
12254 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
12255 'N*' to indicate this, and doesn't have separate entries for the other
12256 possibilities. Note that not all properties which have values 'Yes' and 'No'
12257 are binary, and they have all their values spelled out without using this wild
12258 card, and a C<NOT> clause in their description that highlights their not being
12259 binary. These also require the compound form to match them, whereas true
12260 binary properties have both single and compound forms available.
12262 Note that all non-essential underscores are removed in the display of the
12269 =item B<*> is a wild-card
12271 =item B<(\\d+)> in the info column gives the number of code points matched by
12274 =item B<$DEPRECATED> means this is deprecated.
12276 =item B<$OBSOLETE> means this is obsolete.
12278 =item B<$STABILIZED> means this is stabilized.
12280 =item B<$STRICTER> means tighter (stricter) name matching applies.
12282 =item B<$DISCOURAGED> means use of this form is discouraged.
12286 $formatted_properties
12290 =head1 Properties not accessible through \\p{} and \\P{}
12292 A few properties are accessible in Perl via various function calls only.
12294 Lowercase_Mapping lc() and lcfirst()
12295 Titlecase_Mapping ucfirst()
12296 Uppercase_Mapping uc()
12298 Case_Folding is accessible through the /i modifier in regular expressions.
12300 The Name property is accessible through the \\N{} interpolation in
12301 double-quoted strings and regular expressions, but both usages require a C<use
12302 charnames;> to be specified, which also contains related functions viacode()
12305 =head1 Unicode regular expression properties that are NOT accepted by Perl
12307 Perl will generate an error for a few character properties in Unicode when
12308 used in a regular expression. The non-Unihan ones are listed below, with the
12309 reasons they are not accepted, perhaps with work-arounds. The short names for
12310 the properties are listed enclosed in (parentheses).
12318 An installation can choose to allow any of these to be matched by changing the
12319 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12320 and then re-running F<$0>. (C<\%Config> is available from the Config module).
12322 =head1 Files in the I<To> directory (for serious hackers only)
12324 All Unicode properties are really mappings (in the mathematical sense) from
12325 code points to their respective values. As part of its build process,
12326 Perl constructs tables containing these mappings for all properties that it
12327 deals with. But only a few of these are written out into files.
12328 Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
12329 (%Config is available from the Config module).
12331 Those ones written are ones needed by Perl internally during execution, or for
12332 which there is some demand, and those for which there is no access through the
12333 Perl core. Generally, properties that can be used in regular expression
12334 matching do not have their map tables written, like Script. Nor are the
12335 simplistic properties that have a better, more complete version, such as
12336 Simple_Uppercase_Mapping (Uppercase_Mapping is written instead).
12338 None of the properties in the I<To> directory are currently directly
12339 accessible through the Perl core, although some may be accessed indirectly.
12340 For example, the uc() function implements the Uppercase_Mapping property and
12341 uses the F<Upper.pl> file found in this directory.
12343 The available files with their properties (short names in parentheses),
12344 and any flags or comments about them, are:
12346 @map_tables_actually_output
12348 An installation can choose to change which files are generated by changing the
12349 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12350 and then re-running F<$0>.
12352 Each of these files defines two hash entries to help reading programs decipher
12353 it. One of them looks like this:
12355 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
12357 where 'NAME' is a name to indicate the property. For backwards compatibility,
12358 this is not necessarily the property's official Unicode name. (The 'To' is
12359 also for backwards compatibility.) The hash entry gives the format of the
12360 mapping fields of the table, currently one of the following:
12364 This format applies only to the entries in the main body of the table.
12365 Entries defined in hashes or ones that are missing from the list can have a
12368 The value that the missing entries have is given by the other SwashInfo hash
12369 entry line; it looks like this:
12371 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
12373 This example line says that any Unicode code points not explicitly listed in
12374 the file have the value 'NaN' under the property indicated by NAME. If the
12375 value is the special string C<< <code point> >>, it means that the value for
12376 any missing code point is the code point itself. This happens, for example,
12377 in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
12378 character 'A', are missing because the uppercase of 'A' is itself.
12382 L<$unicode_reference_url>
12391 main::write([ $pod_directory, "$pod_file.pod" ], @OUT);
sub make_Heavy () {
    # Create and write Heavy.pl, which passes info about the tables to
    # utf8_heavy.pl.
    #
    # The file is built as a list of pieces: fixed chunks of Perl source text
    # alternating with the dumped contents of the hashes being handed over.
    # The fixed chunks are named first; the list is assembled at the end.

    # Start of the file, up through the opening of the first hash.
    my $loose_names_opening = <<END;
$HEADER
$INTERNAL_ONLY

# This file is for the use of utf8_heavy.pl

# Maps property names in loose standard form to its standard name
\%utf8::loose_property_name_of = (
END

    # Each later chunk first closes the hash that was dumped just before it.
    my $stricter_files_opening = <<END;
);

# Maps property, table to file for those using stricter matching
\%utf8::stricter_to_file_of = (
END

    my $loose_files_opening = <<END;
);

# Maps property, table to file for those using loose matching
\%utf8::loose_to_file_of = (
END

    my $rationals_opening = <<END;
);

# Maps floating point to fractional form
\%utf8::nv_floating_to_rational = (
END

    my $slop_and_deprecated_opening = <<END;
);

# If a floating point number doesn't have enough digits in it to get this
# close to a fraction, it isn't considered to be that fraction even if all the
# digits it does have match.
\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;

# Deprecated tables to generate a warning for. The key is the file containing
# the table, so as to avoid duplication, as many property names can map to the
# file, but we only need one entry for all of them.
\%utf8::why_deprecated = (
END

    # Closes the last hash, and makes the file return a true value.
    my $closing = <<END;
);

1;
END

    # Interleave the fixed text with the dumps, in file order.
    my @heavy = (
        $loose_names_opening,
        simple_dumper (\%loose_property_name_of, ' ' x 4),
        $stricter_files_opening,
        simple_dumper (\%stricter_to_file_of, ' ' x 4),
        $loose_files_opening,
        simple_dumper (\%loose_to_file_of, ' ' x 4),
        $rationals_opening,
        simple_dumper (\%nv_floating_to_rational, ' ' x 4),
        $slop_and_deprecated_opening,
        simple_dumper (\%utf8::why_deprecated, ' ' x 4),
        $closing,
    );

    main::write("Heavy.pl", @heavy);
    return;
}
sub write_all_tables() {
    # Write out all the tables generated by this program to files, as well as
    # the supporting data structures, pod file, and .t file.
    #
    # A first pass looks at every property and each of its tables.  It warns
    # about tables that are unexpectedly empty or full, skips the ones that
    # are suppressed, marks a match table that matches identically to an
    # earlier one as equivalent to it (so they share a file), records the
    # property aliases for utf8_heavy.pl and the pod, and collects the tables
    # that are to be written.  A second pass assigns the file names and writes
    # the files.  Finally the pod, Heavy.pl, and (if requested) the test
    # script are generated.

    my @writables;              # List of tables that actually get written
    my %match_tables_to_write;  # Used to collapse identical match tables
                                # into one file. Each key is a hash function
                                # result to partition tables into buckets.
                                # Each value is an array of the tables that
                                # fit in the bucket.

    # For each property ...
    # (sort so that if there is an immutable file name, it has precedence, so
    # some other property can't come in and take over its file name. The
    # comparison is symmetric so that sort is handed a consistent ordering:
    # a property whose file name is defined sorts ahead of one whose isn't;
    # don't care if both defined, as they had better be different anyway)
    foreach my $property (sort { defined($b->file) <=> defined($a->file) }
                                                            property_ref('*'))
    {
        my $type = $property->type;

        # And for each table for that property, starting with the mapping
        # table for it ...
        TABLE:
        foreach my $table($property,

                        # and all the match tables for it (if any), sorted so
                        # the ones with the shortest associated file name come
                        # first. The length sorting prevents problems of a
                        # longer file taking a name that might have to be used
                        # by a shorter one. The alphabetic sorting prevents
                        # differences between releases
                        sort {  my $ext_a = $a->external_name;
                                return 1 if ! defined $ext_a;
                                my $ext_b = $b->external_name;
                                return -1 if ! defined $ext_b;
                                my $cmp = length $ext_a <=> length $ext_b;

                                # Return result if lengths not equal
                                return $cmp if $cmp;

                                # Alphabetic if lengths equal
                                return $ext_a cmp $ext_b
                        } $property->tables
                    )
        {

            # Here we have a table associated with a property. It could be
            # the map table (done first for each property), or one of the
            # other tables. Determine which type.
            my $is_property = $table->isa('Property');

            my $name = $table->name;
            my $complete_name = $table->complete_name;

            # See if should suppress the table if is empty, but warn if it
            # contains something.
            my $suppress_if_empty_warn_if_not
                    = exists $why_suppress_if_empty_warn_if_not{$complete_name};

            # Calculate if this table should have any code points associated
            # with it or not
            my $expected_empty =

                # $perl should be empty, as well as properties that we just
                # don't do anything with
                ($is_property
                    && ($table == $perl
                        || grep { $complete_name eq $_ }
                                                    @unimplemented_properties
                    )
                )

                # Match tables in properties we skipped populating should be
                # empty
                || (! $is_property && ! $property->to_create_match_tables)

                # Tables and properties that are expected to have no code
                # points should be empty
                || $suppress_if_empty_warn_if_not
            ;

            # Set a boolean if this table is the complement of an empty binary
            # table
            my $is_complement_of_empty_binary =
                $type == $BINARY &&
                (($table == $property->table('Y')
                    && $property->table('N')->is_empty)
                || ($table == $property->table('N')
                    && $property->table('Y')->is_empty));


            # Some tables should match everything
            my $expected_full =
                ($is_property)
                ? # All these types of map tables will be full because
                  # they will have been populated with defaults
                  ($type == $ENUM || $type == $BINARY)

                : # A match table should match everything if its method
                  # claims it should
                  ($table->matches_all

                      # The complement of an empty binary table will match
                      # everything
                      || $is_complement_of_empty_binary
                  )
            ;

            if ($table->is_empty) {

                if ($suppress_if_empty_warn_if_not) {
                    $table->set_status($SUPPRESSED,
                        $why_suppress_if_empty_warn_if_not{$complete_name});
                }

                # Suppress expected empty tables.
                next TABLE if $expected_empty;

                # And setup to later output a warning for those that aren't
                # known to be allowed to be empty. Don't do the warning if
                # this table is a child of another one to avoid duplicating
                # the warning that should come from the parent one.
                if (($table == $property || $table->parent == $table)
                    && $table->status ne $SUPPRESSED
                    && ! grep { $complete_name =~ /^$_$/ }
                                                    @tables_that_may_be_empty)
                {
                    push @unhandled_properties, "$table";
                }
            }
            elsif ($expected_empty) {
                my $because = "";
                if ($suppress_if_empty_warn_if_not) {
                    $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
                }

                Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
            }

            my $count = $table->count;
            if ($expected_full) {
                if ($count != $MAX_UNICODE_CODEPOINTS) {
                    Carp::my_carp("$table matches only "
                    . clarify_number($count)
                    . " Unicode code points but should match "
                    . clarify_number($MAX_UNICODE_CODEPOINTS)
                    . " (off by "
                    .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
                    . "). Proceeding anyway.");
                }

                # Here is expected to be full. If it is because it is the
                # complement of an (empty) binary table that is to be
                # suppressed, then suppress this one as well.
                if ($is_complement_of_empty_binary) {
                    my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
                    my $opposing = $property->table($opposing_name);
                    my $opposing_status = $opposing->status;
                    if ($opposing_status) {
                        $table->set_status($opposing_status,
                                           $opposing->status_info);
                    }
                }
            }
            elsif ($count == $MAX_UNICODE_CODEPOINTS) {
                if ($table == $property || $table->leader == $table) {
                    Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
                }
            }

            # Nothing gets written for a suppressed table, but first check
            # that none of its children escaped being suppressed.
            if ($table->status eq $SUPPRESSED) {
                if (! $is_property) {
                    my @children = $table->children;
                    foreach my $child (@children) {
                        if ($child->status ne $SUPPRESSED) {
                            Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
                        }
                    }
                }
                next TABLE;

            }
            if (! $is_property) {

                # Several things need to be done just once for each related
                # group of match tables. Do them on the parent.
                if ($table->parent == $table) {

                    # Add an entry in the pod file for the table; it also does
                    # the children.
                    make_table_pod_entries($table) if defined $pod_directory;

                    # See if the table matches identical code points with
                    # something that has already been output. In that case,
                    # no need to have two files with the same code points in
                    # them. We use the table's hash() method to store these
                    # in buckets, so that it is quite likely that if two
                    # tables are in the same bucket they will be identical, so
                    # don't have to compare tables frequently. The tables
                    # have to have the same status to share a file, so add
                    # this to the bucket hash. (The reason for this latter is
                    # that Heavy.pl associates a status with a file.)
                    my $hash = $table->hash . ';' . $table->status;

                    # Look at each table that is in the same bucket as this
                    # one would be.
                    foreach my $comparison (@{$match_tables_to_write{$hash}})
                    {
                        if ($table->matches_identically_to($comparison)) {
                            $table->set_equivalent_to($comparison,
                                                                Related => 0);
                            next TABLE;
                        }
                    }

                    # Here, not equivalent, add this table to the bucket.
                    push @{$match_tables_to_write{$hash}}, $table;
                }
            }
            else {

                # Here is the property itself.
                # Don't write out or make references to the $perl property
                next if $table == $perl;

                if ($type != $STRING) {

                    # There is a mapping stored of the various synonyms to the
                    # standardized name of the property for utf8_heavy.pl.
                    # Also, the pod file contains entries of the form:
                    # \p{alias: *}         \p{full: *}
                    # rather than show every possible combination of things.

                    my @property_aliases = $property->aliases;

                    # The full name of this property is stored by convention
                    # first in the alias array
                    my $full_property_name =
                                '\p{' . $property_aliases[0]->name . ': *}';
                    my $standard_property_name = standardize($table->name);

                    # For each synonym ...
                    for my $i (0 .. @property_aliases - 1)  {
                        my $alias = $property_aliases[$i];
                        my $alias_name = $alias->name;
                        my $alias_standard = standardize($alias_name);

                        # Set the mapping for utf8_heavy of the alias to the
                        # property
                        if (exists ($loose_property_name_of{$alias_standard}))
                        {
                            Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
                        }
                        else {
                            $loose_property_name_of{$alias_standard}
                                                = $standard_property_name;
                        }

                        # Now for the pod entry for this alias. Skip if not
                        # outputting a pod; skip the first one, which is the
                        # full name so won't have an entry like: '\p{full: *}
                        # \p{full: *}', and skip if don't want an entry for
                        # this one.
                        next if $i == 0
                                || ! defined $pod_directory
                                || ! $alias->make_pod_entry;

                        push @match_properties,
                            format_pod_line($indent_info_column,
                                            '\p{' . $alias->name . ': *}',
                                            $full_property_name,
                                            $alias->status);
                    }
                } # End of non-string-like property code


                # Don't output a mapping file if not desired.
                next if ! $property->to_output_map;
            }

            # Here, we know we want to write out the table, but don't do it
            # yet because there may be other tables that come along and will
            # want to share the file, and the file's comments will change to
            # mention them. So save for later.
            push @writables, $table;

        } # End of looping through the property and all its tables.
    } # End of looping through all properties.

    # Now have all the tables that will have files written for them. Do it.
    foreach my $table (@writables) {
        my @directory;
        my $filename;
        my $property = $table->property;
        my $is_property = ($table == $property);
        if (! $is_property) {

            # Match tables for the property go in lib/$subdirectory, which is
            # the property's name. Don't use the standard file name for this,
            # as may get an unfamiliar alias
            @directory = ($matches_directory, $property->external_name);
        }
        else {

            @directory = $table->directory;
            $filename = $table->file;
        }

        # Use specified filename if available, or default to property's
        # shortest name. We need an 8.3 safe filename (which means "an 8
        # safe" filename, since after the dot is only 'pl', which is < 3)
        # The 2nd parameter says whether the filename may be changed; it may
        # not iff there is a hard-coded name for this table.
        $filename = construct_filename(
                                $filename || $table->external_name,
                                ! $filename,    # mutable if no filename
                                \@directory);

        register_file_for_name($table, \@directory, $filename);

        # Only need to write one file when shared by more than one
        # property
        next if ! $is_property && $table->leader != $table;

        # Construct a nice comment to add to the file
        $table->set_final_comment;

        $table->write;
    }


    # Write out the pod file
    make_pod();

    # And Heavy.pl
    make_Heavy();

    make_property_test_script() if $make_test_script;
    return;
}
12798 my @white_space_separators = ( # This used only for making the test script.
sub generate_separator($) {
    # This used only for making the test script. It generates the colon or
    # equal separator between the property and property value, with random
    # white space surrounding the separator.
    #
    # The parameter is the property name (the left hand side). An empty
    # string is returned if it is empty, as there is then nothing to
    # separate; otherwise the return is the concatenation of randomly chosen
    # white space, '=' or ':', and more randomly chosen white space.

    my $lhs = shift;

    return "" if $lhs eq "";  # No separator if there's only one (the r) side

    # Choose space before and after randomly
    my $spaces_before = $white_space_separators[rand(@white_space_separators)];
    my $spaces_after = $white_space_separators[rand(@white_space_separators)];

    # Half the time use an equals sign, half the time a colon. The choice is
    # made in a statement of its own because '.' binds more tightly than
    # '?:'. Written inline in the concatenation, the condition becomes
    # "$spaces_before . (rand() < 0.5)", so the leading white space is never
    # output and the trailing white space only with the colon.
    my $separator = (rand() < 0.5) ? '=' : ':';

    # And return the whole complex
    return $spaces_before . $separator . $spaces_after;
}
sub generate_tests($$$$$$) {
    # This used only for making the test script.  It generates test cases that
    # are expected to compile successfully in perl.  Note that the lhs and
    # rhs are assumed to already be as randomized as the caller wants.
    #
    # For each of the two code points that is defined, four Expect() lines
    # are written: \p{}, \p{^}, \P{} and \P{^}, each with the result it
    # should give.

    my $file_handle = shift;   # Where to output the tests
    my $lhs = shift;           # The property: what's to the left of the colon
                               #  or equals separator
    my $rhs = shift;           # The property value; what's to the right
    my $valid_code = shift;    # A code point that's known to be in the
                               # table given by lhs=rhs; undef if there is
                               # none, in which case no tests are written
                               # for it
    my $invalid_code = shift;  # A code point known to not be in the table;
                               # undef if the table is all code points
    my $warning = shift;       # Inserted verbatim as the final argument of
                               # every Expect() call written

    # Get the colon or equal
    my $separator = generate_separator($lhs);

    # The whole 'property=value'
    my $name = "$lhs$separator$rhs";

    # Create a complete set of tests, with complements.  The lines are
    # written with print rather than printf: each string is already complete,
    # and printf would interpret any '%' that found its way into $name or
    # $warning as the start of a format directive.
    if (defined $valid_code) {
        print $file_handle
            qq/Expect(1, $valid_code, '\\p{$name}', $warning);\n/;
        print $file_handle
            qq/Expect(0, $valid_code, '\\p{^$name}', $warning);\n/;
        print $file_handle
            qq/Expect(0, $valid_code, '\\P{$name}', $warning);\n/;
        print $file_handle
            qq/Expect(1, $valid_code, '\\P{^$name}', $warning);\n/;
    }
    if (defined $invalid_code) {
        print $file_handle
            qq/Expect(0, $invalid_code, '\\p{$name}', $warning);\n/;
        print $file_handle
            qq/Expect(1, $invalid_code, '\\p{^$name}', $warning);\n/;
        print $file_handle
            qq/Expect(1, $invalid_code, '\\P{$name}', $warning);\n/;
        print $file_handle
            qq/Expect(0, $invalid_code, '\\P{^$name}', $warning);\n/;
    }
    return;
}
sub generate_error($$$$) {
    # Test-script helper: writes a pair of Error() lines, one for \p{} and
    # one for \P{}, for a property construct that is expected to fail to
    # compile rather than merely fail to match.
    #
    # Arguments, in order:
    #   the handle to write to;
    #   the property, i.e., what goes left of the colon or equals;
    #   the property value, i.e., what goes right of it;
    #   a boolean which, when true, says that the unmodified lhs and rhs are
    #   already known to be in error, so no further error is forced here.

    my ($out, $lhs, $rhs, $already_in_error) = @_;

    # The separator is generated before anything else so that the random
    # number sequence is consumed in a fixed order.
    my $separator = generate_separator($lhs);

    # No attempt is made to choose randomly which side gets the error; it
    # always goes on the right, and the right side is assumed to be loosely
    # matched.  This is for convenience, not rigor.
    if (! $already_in_error) {
        $rhs = randomize_loose_name($rhs, 'ERROR');
    }

    my $property = join "", $lhs, $separator, $rhs;

    print $out qq/Error('\\p{$property}');\n/,
               qq/Error('\\P{$property}');\n/;
    return;
}
12897 # These are used only for making the test script
12898 # XXX Maybe should also have a bad strict seps, which includes underscore.
# randomize_loose_name() picks elements of the next list at random to put in
# front of and between the parts of a name.
12900 my @good_loose_seps = (
# randomize_loose_name() picks elements of the next list at random when asked
# to turn a name into one that is in error.
12907 my @bad_loose_seps = (
sub randomize_stricter_name {
    # This used only for making the test script.  Take the input name and
    # return a randomized, but valid version of it under the stricter matching
    # rules.
    #
    # Numbers may gain a '+' sign, leading zeros, trailing zeros after a
    # decimal point, or leading zeros in a denominator.  Runs of two or more
    # digits may gain underscores, and every other section has its case
    # changed as a unit.

    my $name = shift;
    Carp::carp_extra_args(\@_) if main::DEBUG && @_;

    # If the name looks like a number (integer, floating, or rational), do
    # some extra work.  The group around the separator and the digits that
    # follow it is non-capturing so that $3 is the separator character alone.
    # When that group captured, $3 held the separator plus the digits (e.g.
    # '.5'), so the test against '.' below could never succeed.
    if ($name =~ qr{ ^ ( -? ) ( \d+ (?: ( [./] ) \d+ )? ) $ }x) {
        my $sign = $1;
        my $number = $2;
        my $separator = $3;     # '.', '/', or undef for an integer

        # If there isn't a sign, part of the time add a plus
        # Note: Not testing having any denominator having a minus sign
        if (! $sign) {
            $sign = '+' if rand() <= .3;
        }

        # And add 0 or more leading zeros.
        $name = $sign . ('0' x int rand(10)) . $number;

        if (defined $separator) {
            my $extra_zeros = '0' x int rand(10);

            if ($separator eq '.') {

                # Similarly, add 0 or more trailing zeros after a decimal
                # point
                $name .= $extra_zeros;
            }
            else {

                # Or, leading zeros before the denominator
                $name =~ s,/,/$extra_zeros,;
            }
        }
    }

    # For legibility of the test, only change the case of whole sections at a
    # time.  To do this, first split into sections.  The split returns the
    # delimiters as well, because its pattern captures them.
    my @sections;
    for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
        trace $section if main::DEBUG && $to_trace;

        if (length $section > 1 && $section !~ /\D/) {

            # If the section is a sequence of digits, about half the time
            # randomly add underscores between some of them.
            if (rand() > .5) {

                # Figure out how many underscores to add.  max is 1 less than
                # the number of digits.  (But add 1 at the end to make sure
                # result isn't 0, and compensate earlier by subtracting 2
                # instead of 1)
                my $num_underscores = int rand(length($section) - 2) + 1;

                # And add them evenly throughout, for convenience, not rigor
                use integer;
                my $spacing = (length($section) - 1)/ $num_underscores;
                my $temp = $section;
                $section = "";
                for my $i (1 .. $num_underscores) {

                    # The 4-argument substr removes the leading $spacing
                    # digits from $temp as it returns them.
                    $section .= substr($temp, 0, $spacing, "") . '_';
                }

                # Whatever digits remain go after the final underscore.
                $section .= $temp;
            }
            push @sections, $section;
        }
        else {

            # Here not a sequence of digits.  Change the case of the section
            # randomly
            my $switch = int rand(4);
            if ($switch == 0) {
                push @sections, uc $section;
            }
            elsif ($switch == 1) {
                push @sections, lc $section;
            }
            elsif ($switch == 2) {
                push @sections, ucfirst $section;
            }
            else {
                push @sections, $section;
            }
        }
    }
    trace "returning", join "", @sections if main::DEBUG && $to_trace;
    return join "", @sections;
}
sub randomize_loose_name($;$) {
    # Test-script helper.  Returns a randomized version of the input name:
    # it is first run through randomize_stricter_name(), and then the parts
    # of the result that were delimited by hyphens, white space or
    # underscores are reassembled with separators chosen at random.
    #
    # If the optional second argument is true, exactly one separator from
    # @bad_loose_seps is used somewhere, so that the result is in error.

    my $name = shift;
    my $want_error = shift;  # if true, make an error
    Carp::carp_extra_args(\@_) if main::DEBUG && @_;

    $name = randomize_stricter_name($name);

    # The result always begins with a good separator, so every part below
    # has something in front of it to be separated from.
    my $new = $good_loose_seps[rand(@good_loose_seps)];

    foreach my $part (split /[-\s_]+/, $name) {

        # While an error is still owed, about 30% of the time pay it here
        # with a bad separator; otherwise use a good one.
        if ($want_error && rand() < 0.3) {
            $new .= $bad_loose_seps[rand(@bad_loose_seps)];
            $want_error = 0;
        }
        else {
            $new .= $good_loose_seps[rand(@good_loose_seps)];
        }
        $new .= $part;
    }
    trace "$name => $new" if main::DEBUG && $to_trace;

    # If the error hasn't been introduced yet, attach a bad separator to one
    # end of the result, choosing the end at random.
    if ($want_error) {
        my $at_end = rand() >= 0.5;
        my $bad = $bad_loose_seps[rand(@bad_loose_seps)];
        $new = ($at_end) ? $new . $bad : $bad . $new;
    }
    return $new;
}
13044 # Used to make sure don't generate duplicate test cases.
# The keys are strings of the form "property_name=table_name", set by
# make_property_test_script() just before it generates the tests for that
# combination.
13045 my %test_generated;
sub make_property_test_script() {
    # This used only for making the test script
    # this written directly -- it's huge.
    #
    # The output goes to $t_path.  It starts with $HEADER and everything
    # readable from the DATA handle, continues with the generated Expect()
    # and Error() calls for each table of each property, and ends with a call
    # to Finished().  If the file can't be opened, a warning is raised and
    # nothing is written.

    print "Making test script\n" if $verbosity >= $PROGRESS;

    # This uses randomness to test different possibilities without testing all
    # possibilities.  To ensure repeatability, set the seed to 0.  But if
    # tests are added, it will perturb all later ones in the .t file
    srand 0;

    $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name

    force_unlink ($t_path);
    push @files_actually_output, $t_path;

    # Use the 3-argument open so that nothing in the path can be taken as
    # part of the open mode.
    my $OUT;
    if (not open $OUT, '>', $t_path) {
        Carp::my_carp("Can't open $t_path.  Skipping: $!");
        return;
    }

    # Keep going down an order of magnitude
    # until find that adding this quantity to
    # 1 remains 1; but put an upper limit on
    # this so in case this algorithm doesn't
    # work properly on some platform, that we
    # won't loop forever.
    my $digits = 0;
    my $min_floating_slop = 1;
    while (1+ $min_floating_slop != 1
            && $digits++ < 50)
    {
        my $next = $min_floating_slop / 10;
        last if $next == 0; # If underflows,
                            # use previous one
        $min_floating_slop = $next;
    }
    print $OUT $HEADER, <DATA>;

    foreach my $property (property_ref('*')) {
        foreach my $table ($property->tables) {

            # Find code points that match, and don't match this table.
            my $valid = $table->get_valid_code_point;
            my $invalid = $table->get_invalid_code_point;
            my $warning = ($table->status eq $DEPRECATED)
                            ? "'deprecated'"
                            : '""';

            # Test each possible combination of the property's aliases with
            # the table's.  If this gets to be too many, could do what is done
            # in the set_final_comment() for Tables
            my @table_aliases = $table->aliases;
            my @property_aliases = $table->property->aliases;
            my $max = max(scalar @table_aliases, scalar @property_aliases);
            for my $j (0 .. $max - 1) {

                # The current alias for property is the next one on the list,
                # or if beyond the end, start over.  Similarly for table
                my $property_name
                            = $property_aliases[$j % @property_aliases]->name;

                $property_name = "" if $table->property == $perl;
                my $table_alias = $table_aliases[$j % @table_aliases];
                my $table_name = $table_alias->name;
                my $loose_match = $table_alias->loose_match;

                # If the table doesn't have a file, any test for it is
                # already guaranteed to be in error
                my $already_error = ! $table->file_path;

                # Generate error cases for this alias.
                generate_error($OUT,
                                $property_name,
                                $table_name,
                                $already_error);

                # If the table is guaranteed to always generate an error,
                # quit now without generating success cases.
                next if $already_error;

                # Now for the success cases.
                my $random;
                if ($loose_match) {

                    # For loose matching, create an extra test case for the
                    # standard name.
                    my $standard = standardize($table_name);

                    # $test_name should be a unique combination for each test
                    # case; used just to avoid duplicate tests
                    my $test_name = "$property_name=$standard";

                    # Don't output duplicate test cases.
                    if (! exists $test_generated{$test_name}) {
                        $test_generated{$test_name} = 1;
                        generate_tests($OUT,
                                        $property_name,
                                        $standard,
                                        $valid,
                                        $invalid,
                                        $warning,
                                    );
                    }
                    $random = randomize_loose_name($table_name)
                }
                else { # Stricter match
                    $random = randomize_stricter_name($table_name);
                }

                # Now for the main test case for this alias.
                my $test_name = "$property_name=$random";
                if (! exists $test_generated{$test_name}) {
                    $test_generated{$test_name} = 1;
                    generate_tests($OUT,
                                    $property_name,
                                    $random,
                                    $valid,
                                    $invalid,
                                    $warning,
                                );

                    # If the name is a rational number, add tests for the
                    # floating point equivalent.
                    if ($table_name =~ qr{/}) {

                        # Calculate the float.
                        my $float = eval $table_name;

                        # Starting with one digit after the decimal point,
                        # create a test for each possible precision (number of
                        # digits past the decimal point) until well beyond the
                        # native number found on this machine.  (If we started
                        # with 0 digits, it would be an integer, which could
                        # well match an unrelated table)
                        # NOTE(review): the upper bound below adds 3 to
                        # $min_floating_slop, which the loop above only ever
                        # makes smaller than its initial 1; $digits looks
                        # like what the comment above describes.  Left as
                        # is, because changing it changes which tests get
                        # generated -- confirm the intent.
                        PLACE:
                        for my $i (1 .. $min_floating_slop + 3) {
                            my $table_name = sprintf("%.*f", $i, $float);
                            if ($i < $MIN_FRACTION_LENGTH) {

                                # If the test case has fewer digits than the
                                # minimum acceptable precision, it shouldn't
                                # succeed, so we expect an error for it.
                                # E.g., 2/3 = .7 at one decimal point, and we
                                # shouldn't say it matches .7.  We should make
                                # it be .667 at least before agreeing that the
                                # intent was to match 2/3.  But at the
                                # less-than- acceptable level of precision, it
                                # might actually match an unrelated number.
                                # So don't generate a test case if this
                                # conflating is possible.  In our example, we
                                # don't want 2/3 matching 7/10, if there is
                                # a 7/10 code point.
                                foreach my $existing
                                        (keys %nv_floating_to_rational)
                                {
                                    next PLACE
                                        if abs($table_name - $existing)
                                                < $MAX_FLOATING_SLOP;
                                }
                                generate_error($OUT,
                                            $property_name,
                                            $table_name,
                                            1   # 1 => already an error
                                );
                            }
                            else {

                                # Here the number of digits exceeds the
                                # minimum we think is needed.  So generate a
                                # success test case for it.
                                generate_tests($OUT,
                                                $property_name,
                                                $table_name,
                                                $valid,
                                                $invalid,
                                                $warning,
                                );
                            }
                        }
                    }
                }
            }
        }
    }

    print $OUT "Finished();\n";

    # Errors in buffered writes may not show up until the close, so check it.
    close $OUT
        or Carp::my_carp("Failed to close $t_path: $!");
    return;
}
13239 # This is a list of the input files and how to handle them. The files are
13240 # processed in their order in this list. Some reordering is possible if
13241 # desired, but the v0 files should be first, and the extracted before the
13242 # others except DAge.txt (as data in an extracted file can be over-ridden by
13243 # the non-extracted). Some other files depend on data derived from an earlier
13244 # file, like UnicodeData requires data from Jamo, and the case changing and
13245 # folding requires data from Unicode. Mostly, it is safest to order by first
13246 # version released in (except the Jamo). DAge.txt is read before the
13247 # extracted ones because of the rarely used feature $compare_versions. In the
13248 # unlikely event that there were ever an extracted file that contained the Age
13249 # property information, it would have to go in front of DAge.
13251 # The version strings allow the program to know whether to expect a file or
13252 # not, but if a file exists in the directory, it will be processed, even if it
13253 # is in a version earlier than expected, so you can copy files from a later
13254 # release into an earlier release's directory.
# Each element is made by Input_file->new().  Its first argument is the file
# name (undef for the entry that has no file), its second is a version
# string, and the rest are key => value options such as Property, Handler,
# Pre_Handler, Each_Line_Handler, Post_Handler, EOF_Handler,
# Progress_Message and Has_Missings_Defaults.  Several options are chosen by
# comparing $v_version against a version string.
13255 my @input_file_objects = (
13256 Input_file->new('PropertyAliases.txt', v0,
13257 Handler => \&process_PropertyAliases,
13259 Input_file->new(undef, v0, # No file associated with this
13260 Progress_Message => 'Finishing property setup',
13261 Handler => \&finish_property_setup,
13263 Input_file->new('PropValueAliases.txt', v0,
13264 Handler => \&process_PropValueAliases,
13265 Has_Missings_Defaults => $NOT_IGNORED,
13267 Input_file->new('DAge.txt', v3.2.0,
13268 Has_Missings_Defaults => $NOT_IGNORED,
13271 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
13272 Property => 'General_Category',
13274 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
13275 Property => 'Canonical_Combining_Class',
13276 Has_Missings_Defaults => $NOT_IGNORED,
13278 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
13279 Property => 'Numeric_Type',
13280 Has_Missings_Defaults => $NOT_IGNORED,
13282 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
13283 Property => 'East_Asian_Width',
13284 Has_Missings_Defaults => $NOT_IGNORED,
13286 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
13287 Property => 'Line_Break',
13288 Has_Missings_Defaults => $NOT_IGNORED,
13290 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
13291 Property => 'Bidi_Class',
13292 Has_Missings_Defaults => $NOT_IGNORED,
13294 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
13295 Property => 'Decomposition_Type',
13296 Has_Missings_Defaults => $NOT_IGNORED,
13298 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
13299 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
13300 Property => 'Numeric_Value',
13301 Each_Line_Handler => \&filter_numeric_value_line,
13302 Has_Missings_Defaults => $NOT_IGNORED,
13304 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
13305 Property => 'Joining_Group',
13306 Has_Missings_Defaults => $NOT_IGNORED,
13309 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
13310 Property => 'Joining_Type',
13311 Has_Missings_Defaults => $NOT_IGNORED,
13313 Input_file->new('Jamo.txt', v2.0.0,
13314 Property => 'Jamo_Short_Name',
13315 Each_Line_Handler => \&filter_jamo_line,
13317 Input_file->new('UnicodeData.txt', v1.1.5,
13318 Pre_Handler => \&setup_UnicodeData,
13320 # We clean up this file for some early versions.
13321 Each_Line_Handler => [ (($v_version lt v2.0.0 )
13323 : ($v_version eq v2.1.5)
13324 ? \&filter_v2_1_5_ucd
13327 # And the main filter
13328 \&filter_UnicodeData_line,
13330 EOF_Handler => \&EOF_UnicodeData,
13332 Input_file->new('ArabicShaping.txt', v2.0.0,
13333 Each_Line_Handler =>
# NOTE(review): the version on the next line is written without the leading
# 'v' that every other comparison in this list uses.  A bare literal with two
# or more dots is documented as still being a version string, so this
# presumably compares the same way -- confirm and consider adding the 'v'.
13334 [ ($v_version lt 4.1.0)
13335 ? \&filter_old_style_arabic_shaping
13337 \&filter_arabic_shaping_line,
13339 Has_Missings_Defaults => $NOT_IGNORED,
13341 Input_file->new('Blocks.txt', v2.0.0,
13342 Property => 'Block',
13343 Has_Missings_Defaults => $NOT_IGNORED,
13344 Each_Line_Handler => \&filter_blocks_lines
13346 Input_file->new('PropList.txt', v2.0.0,
13347 Each_Line_Handler => (($v_version lt v3.1.0)
13348 ? \&filter_old_style_proplist
13351 Input_file->new('Unihan.txt', v2.0.0,
13352 Pre_Handler => \&setup_unihan,
13354 Each_Line_Handler => \&filter_unihan_line,
13356 Input_file->new('SpecialCasing.txt', v2.1.8,
13357 Each_Line_Handler => \&filter_special_casing_line,
13358 Pre_Handler => \&setup_special_casing,
13361 'LineBreak.txt', v3.0.0,
13362 Has_Missings_Defaults => $NOT_IGNORED,
13363 Property => 'Line_Break',
13364 # Early versions had problematic syntax
13365 Each_Line_Handler => (($v_version lt v3.1.0)
13366 ? \&filter_early_ea_lb
13369 Input_file->new('EastAsianWidth.txt', v3.0.0,
13370 Property => 'East_Asian_Width',
13371 Has_Missings_Defaults => $NOT_IGNORED,
13372 # Early versions had problematic syntax
13373 Each_Line_Handler => (($v_version lt v3.1.0)
13374 ? \&filter_early_ea_lb
13377 Input_file->new('CompositionExclusions.txt', v3.0.0,
13378 Property => 'Composition_Exclusion',
13380 Input_file->new('BidiMirroring.txt', v3.0.1,
13381 Property => 'Bidi_Mirroring_Glyph',
13383 Input_file->new('CaseFolding.txt', v3.0.1,
13384 Pre_Handler => \&setup_case_folding,
13385 Each_Line_Handler =>
13386 [ ($v_version lt v3.1.0)
13387 ? \&filter_old_style_case_folding
13389 \&filter_case_folding_line
13391 Post_Handler => \&post_fold,
13393 Input_file->new('DCoreProperties.txt', v3.1.0,
13394 # 5.2 changed this file
13395 Has_Missings_Defaults => (($v_version ge v5.2.0)
13399 Input_file->new('Scripts.txt', v3.1.0,
13400 Property => 'Script',
13401 Has_Missings_Defaults => $NOT_IGNORED,
13403 Input_file->new('DNormalizationProps.txt', v3.1.0,
13404 Has_Missings_Defaults => $NOT_IGNORED,
13405 Each_Line_Handler => (($v_version lt v4.0.1)
13406 ? \&filter_old_style_normalization_lines
13409 Input_file->new('HangulSyllableType.txt', v4.0.0,
13410 Has_Missings_Defaults => $NOT_IGNORED,
13411 Property => 'Hangul_Syllable_Type'),
13412 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
13413 Property => 'Word_Break',
13414 Has_Missings_Defaults => $NOT_IGNORED,
13416 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
13417 Property => 'Grapheme_Cluster_Break',
13418 Has_Missings_Defaults => $NOT_IGNORED,
13420 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
13421 Property => 'Sentence_Break',
13422 Has_Missings_Defaults => $NOT_IGNORED,
13424 Input_file->new('NamedSequences.txt', v4.1.0,
13425 Handler => \&process_NamedSequences
13427 Input_file->new('NameAliases.txt', v5.0.0,
13428 Property => 'Name_Alias',
13430 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
13432 Each_Line_Handler => \&filter_unihan_line,
13434 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
13436 Each_Line_Handler => \&filter_unihan_line,
13438 Input_file->new('UnihanIRGSources.txt', v5.2.0,
13440 Pre_Handler => \&setup_unihan,
13441 Each_Line_Handler => \&filter_unihan_line,
13443 Input_file->new('UnihanNumericValues.txt', v5.2.0,
13445 Each_Line_Handler => \&filter_unihan_line,
13447 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
13449 Each_Line_Handler => \&filter_unihan_line,
13451 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
13453 Each_Line_Handler => \&filter_unihan_line,
13455 Input_file->new('UnihanReadings.txt', v5.2.0,
13457 Each_Line_Handler => \&filter_unihan_line,
13459 Input_file->new('UnihanVariants.txt', v5.2.0,
13461 Each_Line_Handler => \&filter_unihan_line,
# Start of the main program: warn if $compare_versions is set, then collect
# the candidate input files into %potential_files.
13465 # End of all the preliminaries.
13468 if ($compare_versions) {
13469 Carp::my_carp(<<END
13470 Warning. \$compare_versions is set. Output is not suitable for production
13475 # Put into %potential_files a list of all the files in the directory structure
13476 # that could be inputs to this program, excluding those that we should ignore.
13477 # Also don't consider test files. Use absolute file names because it makes it
13478 # easier across machine types.
13479 my @ignored_files_full_names = map { File::Spec->rel2abs(
13480 internal_file_to_platform($_))
13481 } keys %ignored_files;
13484 return unless /\.txt$/i;
13485 return if /Test\.txt$/i;
# The keys of %potential_files are lower-cased absolute names, and the
# comparison with the ignored names is done in lower case as well.
13486 my $full = lc(File::Spec->rel2abs($_));
13487 $potential_files{$full} = 1
13488 if ! grep { $full eq lc($_) } @ignored_files_full_names;
13491 }, File::Spec->curdir());
# Read the file list named by $file_list (skipped when
# $write_unchanged_files is set), then create Input_file objects for any
# potential input files that aren't already in @input_file_objects.
13493 my @mktables_list_output_files;
13495 if ($write_unchanged_files) {
13496 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
13499 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
13501 if (! open $file_handle, "<", $file_list) {
13502 Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!");
13508 # Read and parse mktables.lst, placing the results from the first part
13509 # into @input, and the second part into @mktables_list_output_files
13510 for my $list ( \@input, \@mktables_list_output_files ) {
13511 while (<$file_handle>) {
# Trim leading and trailing white space, then skip lines that are empty or
# contain only a comment.
13512 s/^ \s+ | \s+ $//xg;
13513 next if /^ \s* (?: \# .* )? $/x;
# Only the text before the first tab on the line is kept.
13515 my ( $file ) = split /\t/;
13516 push @$list, $file;
13518 @$list = uniques(@$list);
13522 # Look through all the input files
13523 foreach my $input (@input) {
13524 next if $input eq 'version'; # Already have checked this.
13526 # Ignore if doesn't exist. The checking about whether we care or
13527 # not is done via the Input_file object.
13528 next if ! file_exists($input);
13530 # The paths are stored with relative names, and with '/' as the
13531 # delimiter; convert to absolute on this machine
13532 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
13533 $potential_files{$full} = 1
13534 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
13538 close $file_handle;
13543 # Here wants to process all .txt files in the directory structure.
13544 # Convert them to full path names. They are stored in the platform's
13547 foreach my $object (@input_file_objects) {
13548 my $file = $object->file;
13549 next unless defined $file;
13550 push @known_files, File::Spec->rel2abs($file);
13553 my @unknown_input_files;
13554 foreach my $file (keys %potential_files) {
13555 next if grep { lc($file) eq lc($_) } @known_files;
13557 # Here, the file is unknown to us. Get relative path name
13558 $file = File::Spec->abs2rel($file);
13559 push @unknown_input_files, $file;
13561 # What will happen is we create a data structure for it, and add it to
13562 # the list of input files to process. First get the subdirectories
13564 my (undef, $directories, undef) = File::Spec->splitpath($file);
13565 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13566 my @directories = File::Spec->splitdir($directories);
13568 # If the file isn't extracted (meaning none of the directories is the
13569 # extracted one), just add it to the end of the list of inputs.
13570 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
13571 push @input_file_objects, Input_file->new($file, v0);
13575 # Here, the file is extracted. It needs to go ahead of most other
13576 # processing. Search for the first input file that isn't a
13577 # special required property (that is, find one whose first_release
13578 # is non-0), and isn't extracted. Also, the Age property file is
13579 # processed before the extracted ones, just in case
13580 # $compare_versions is set.
13581 for (my $i = 0; $i < @input_file_objects; $i++) {
13582 if ($input_file_objects[$i]->first_released ne v0
13583 && lc($input_file_objects[$i]->file) ne 'dage.txt'
13584 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
# The new object is inserted in front of the element found.
13586 splice @input_file_objects, $i, 0,
13587 Input_file->new($file, v0);
13594 if (@unknown_input_files) {
13595 print STDERR simple_fold(join_lines(<<END
13597 The following files are unknown as to how to handle. Assuming they are
13598 typical property files. You'll know by later error messages if it worked or
13601 ) . " " . join(", ", @unknown_input_files) . "\n\n");
13603 } # End of looking through directory structure for more .txt files.
# Build @input_files from the Input_file objects, then compare the -M ages
# of the inputs and of the listed output files to decide whether the tables
# must be rebuilt.
13605 # Create the list of input files from the objects we have defined, plus
13607 my @input_files = 'version';
13608 foreach my $object (@input_file_objects) {
13609 my $file = $object->file;
13610 next if ! defined $file; # Not all objects have files
13611 next if $object->optional && ! -e $file;
13612 push @input_files, $file;
13615 if ( $verbosity >= $VERBOSE ) {
13616 print "Expecting ".scalar( @input_files )." input files. ",
13617 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
13620 # We set $youngest to be the most recently changed input file, including this
13621 # program itself (done much earlier in this file)
13622 foreach my $in (@input_files) {
13624 next unless defined $age; # Keep going even if missing a file
# $youngest ends up as the smallest age seen.
13625 $youngest = $age if $age < $youngest;
13627 # See that the input files have distinct names, to warn someone if they
13628 # are adding a new one
13630 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
13631 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13632 my @directories = File::Spec->splitdir($directories);
# NOTE(review): the next line assigns the result of the substitution, not
# the shortened name, to $base; the substitution itself changes $file.
13633 my $base = $file =~ s/\.txt$//;
13634 construct_filename($file, 'mutable', \@directories);
13638 my $ok = ! $write_unchanged_files
13639 && scalar @mktables_list_output_files; # If none known, rebuild
13641 # Now we check to see if any output files are older than youngest, if
13642 # they are, we need to continue on, otherwise we can presumably bail.
13644 foreach my $out (@mktables_list_output_files) {
13645 if ( ! file_exists($out)) {
13646 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
13650 #local $to_trace = 1 if main::DEBUG;
13651 trace $youngest, -M $out if main::DEBUG && $to_trace;
# An output file whose age is greater than $youngest is reported as too old.
13652 if ( -M $out > $youngest ) {
13653 #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
13654 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
13661 print "Files seem to be ok, not bothering to rebuild.\n";
13664 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
# The rebuild itself: create the perl pseudo-property, process every input
# file object, add the Perl synonyms, write all the tables, optionally
# rewrite the file list, and finally print any warnings.
13666 # Ready to do the major processing. First create the perl pseudo-property.
13667 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
13669 # Process each input file
13670 foreach my $file (@input_file_objects) {
13674 # Finish the table generation.
13676 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
13679 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
13682 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
13683 add_perl_synonyms();
13685 print "Writing tables\n" if $verbosity >= $PROGRESS;
13686 write_all_tables();
13688 # Write mktables.lst
13689 if ( $file_list and $make_list ) {
13691 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
# Each name is split into its directories and file, which are then rejoined
# with '/' as the delimiter.
13692 foreach my $file (@input_files, @files_actually_output) {
13693 my (undef, $directories, $file) = File::Spec->splitpath($file);
13694 my @directories = File::Spec->splitdir($directories);
13695 $file = join '/', @directories, $file;
13699 if (! open $ofh,">",$file_list) {
13700 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
13704 print $ofh <<"END";
13706 # $file_list -- File list for $0.
13708 # Autogenerated on @{[scalar localtime]}
13710 # - First section is input files
13711 # ($0 itself is not listed but is automatically considered an input)
13712 # - Section seperator is /^=+\$/
13713 # - Second section is a list of output files.
13714 # - Lines matching /^\\s*#/ are treated as comments
13715 # which along with blank lines are ignored.
13721 print $ofh "$_\n" for sort(@input_files);
13722 print $ofh "\n=================================\n# Output files:\n\n";
13723 print $ofh "$_\n" for sort @files_actually_output;
13724 print $ofh "\n# ",scalar(@input_files)," input files\n",
13725 "# ",scalar(@files_actually_output)+1," output files\n\n",
13728 or Carp::my_carp("Failed to close $ofh: $!");
13730 print "Filelist has ",scalar(@input_files)," input files and ",
13731 scalar(@files_actually_output)+1," output files\n"
13732 if $verbosity >= $VERBOSE;
13736 # Output these warnings unless -q explicitly specified.
13737 if ($verbosity >= $NORMAL_VERBOSITY) {
13738 if (@unhandled_properties) {
13739 print "\nProperties and tables that unexpectedly have no code points\n";
13740 foreach my $property (sort @unhandled_properties) {
13741 print $property, "\n";
13745 if (%potential_files) {
13746 print "\nInput files that are not considered:\n";
13747 foreach my $file (sort keys %potential_files) {
13748 print File::Spec->abs2rel($file), "\n";
13751 print "\nAll done\n" if $verbosity >= $VERBOSE;
13755 # TRAILING CODE IS USED BY make_property_test_script()
13761 # Test the \p{} regular expression constructs. This file is constructed by
13762 # mktables from the tables it generates, so if mktables is buggy, this won't
13763 # necessarily catch those bugs. Tests are generated for all feasible
13764 # properties; a few aren't currently feasible; see is_code_point_usable()
13765 # in mktables for details.
13767 # Standard test packages are not used because this manipulates SIG_WARN. It
13768 # exits 0 if every non-skipped test succeeded; -1 if any failed.
13774 my $non_ASCII = (ord('A') != 65);
13776 # The first 127 ASCII characters in ordinal order, with the ones that don't
13777 # have Perl names (as of 5.8) replaced by dots. The 127th is used as the
13779 my $ascii_to_ebcdic = "\0......\a\b\t\n.\f\r.................. !\"#\$\%&'()*+,-./0123456789:;<=>?\@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
13780 #for my $i (0..126) {
13781 # print $i, ": ", substr($ascii_to_ebcdic, $i, 1), "\n";
13785 my $expected = shift;
13788 my $warning_type = shift; # Type of warning message, like 'deprecated'
13790 my $line = (caller)[2];
13792 # Convert the code point to hex form
13793 my $string = sprintf "\"\\x{%04X}\"", $ord;
13795 # Convert the non-ASCII code points expressible as characters in Perl 5.8
13796 # to their ASCII equivalents, and skip the others.
13797 if ($non_ASCII && $ord < 255) {
13799 # Dots are used as place holders in the conversion string for the
13800 # non-convertible ones, so check for it first.
13801 if ($ord == 0x2E) {
13805 # Any dots returned are non-convertible.
13806 && ((my $char = substr($ascii_to_ebcdic, $ord, 1)) ne '.'))
13808 #print STDERR "$ord, $char, \n";
13814 print "ok $Tests - $string =~ $regex # Skipped: non-ASCII\n";
13819 # The first time through, use all warnings.
13822 # If the input should generate a warning, add another time through with
13824 push @tests, "no warnings '$warning_type';" if $warning_type;
13826 foreach my $no_warnings (@tests) {
13828 # Store any warning messages instead of outputting them
13829 local $SIG{__WARN__} = $SIG{__WARN__};
13830 my $warning_message;
13831 $SIG{__WARN__} = sub { $warning_message = $_[0] };
13835 # A string eval is needed because of the 'no warnings'.
13836 # Assumes no parens in the regular expression
13837 my $result = eval "$no_warnings
13838 my \$RegObj = qr($regex);
13839 $string =~ \$RegObj ? 1 : 0";
13840 if (not defined $result) {
13841 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
13844 elsif ($result ^ $expected) {
13845 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
13848 elsif ($warning_message) {
13849 if (! $warning_type || ($warning_type && $no_warnings)) {
13850 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
13854 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
13857 elsif ($warning_type && ! $no_warnings) {
13858 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
13862 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
13871 if (eval { 'x' =~ qr/$regex/; 1 }) {
13873 my $line = (caller)[2];
13874 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
13877 my $line = (caller)[2];
13878 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
13884 print "1..$Tests\n";
13885 exit($Fails ? -1 : 0);
13888 Error('\p{Script=InGreek}'); # Bug #69018