lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".
  10 require 5.010_001;
  11 use strict;
  12 use warnings;
  13 use Carp;
  14 use File::Find;
  15 use File::Path;
  16 use File::Spec;
  17 use Text::Tabs;
  18
  19 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  20
  21 ##########################################################################
  22 #
  23 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  24 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  25 # a pod file and a .t file
  26 #
  27 # The structure of this file is:
  28 #   First these introductory comments; then
  29 #   code needed for everywhere, such as debugging stuff; then
  30 #   code to handle input parameters; then
  31 #   data structures likely to be of external interest (some of which depend on
  32 #       the input parameters, so follows them; then
  33 #   more data structures and subroutine and package (class) definitions; then
  34 #   the small actual loop to process the input files and finish up; then
  35 #   a __DATA__ section, for the .t tests
  36 #
  37 # This program works on all releases of Unicode through at least 5.2.  The
  38 # outputs have been scrutinized most intently for release 5.1.  The others
  39 # have been checked for somewhat more than just sanity.  It can handle all
  40 # existing Unicode character properties in those releases.
  41 #
  42 # This program needs to be able to run under miniperl.  Therefore, it uses a
  43 # minimum of other modules, and hence implements some things itself that could
  44 # be gotten from CPAN
  45 #
  46 # This program uses inputs published by the Unicode Consortium.  These can
  47 # change incompatibly between releases without the Perl maintainers realizing
  48 # it.  Therefore this program is now designed to try to flag these.  It looks
  49 # at the directories where the inputs are, and flags any unrecognized files.
  50 # It keeps track of all the properties in the files it handles, and flags any
  51 # that it doesn't know how to handle.  It also flags any input lines that
  52 # don't match the expected syntax, among other checks.
  53 # It is also designed so if a new input file matches one of the known
  54 # templates, one hopefully just needs to add it to a list to have it
  55 # processed.
  56 #
  57 # It tries to keep fatal errors to a minimum, to generate something usable for
  58 # testing purposes.  It always looks for files that could be inputs, and will
  59 # warn about any that it doesn't know how to handle (the -q option suppresses
  60 # the warning).
  61 #
  62 # This program is mostly about Unicode character (or code point) properties.
  63 # A property describes some attribute or quality of a code point, like if it
  64 # is lowercase or not, its name, what version of Unicode it was first defined
  65 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  66 # possibilities by making all properties into mappings from each code point
  67 # into some corresponding value.  In the case of it being lowercase or not,
  68 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  69 # property maps each Unicode code point to a single value, called a "property
  70 # value".  (Hence each Unicode property is a true mathematical function with
  71 # exactly one value per code point.)
  72 #
  73 # When using a property in a regular expression, what is desired isn't the
  74 # mapping of the code point to its property's value, but the reverse (or the
  75 # mathematical "inverse relation"): starting with the property value, "Does a
  76 # code point map to it?"  These are written in a "compound" form:
  77 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  78 # files containing the lists of code points that map to each such regular
  79 # expression property value, one file per list
  80 #
  81 # There is also a single form shortcut that Perl adds for many of the commonly
  82 # used properties.  This happens for all binary properties, plus script,
  83 # general_category, and block properties.
  84 #
  85 # Thus the outputs of this program are files.  There are map files, mostly in
  86 # the 'To' directory; and there are list files for use in regular expression
  87 # matching, all in subdirectories of the 'lib' directory, with each
  88 # subdirectory being named for the property that the lists in it are for.
  89 # Bookkeeping, test, and documentation files are also generated.
  90
  91 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  92 my $map_directory = 'To';        # Where map files go.
  93
  94 # DATA STRUCTURES
  95 #
  96 # The major data structures of this program are Property, of course, but also
  97 # Table.  There are two kinds of tables, very similar to each other.
  98 # "Match_Table" is the data structure giving the list of code points that have
  99 # a particular property value, mentioned above.  There is also a "Map_Table"
 100 # data structure which gives the property's mapping from code point to value.
 101 # There are two structures because the match tables need to be combined in
 102 # various ways, such as constructing unions, intersections, complements, etc.,
 103 # and the map ones don't.  And there would be problems, perhaps subtle, if
 104 # a map table were inadvertently operated on in some of those ways.
 105 # The use of separate classes with operations defined on one but not the other
 106 # prevents accidentally confusing the two.
 107 #
 108 # At the heart of each table's data structure is a "Range_List", which is just
 109 # an ordered list of "Ranges", plus ancillary information, and methods to
 110 # operate on them.  A Range is a compact way to store property information.
 111 # Each range has a starting code point, an ending code point, and a value that
 112 # is meant to apply to all the code points between the two end points,
 113 # inclusive.  For a map table, this value is the property value for those
 114 # code points.  Two such ranges could be written like this:
 115 #   0x41 .. 0x5A, 'Upper',
 116 #   0x61 .. 0x7A, 'Lower'
 117 #
 118 # Each range also has a type used as a convenience to classify the values.
 119 # Most ranges in this program will be Type 0, or normal, but there are some
 120 # ranges that have a non-zero type.  These are used only in map tables, and
 121 # are for mappings that don't fit into the normal scheme of things.  Mappings
 122 # that require a hash entry to communicate with utf8.c are one example;
 123 # another example is mappings for charnames.pm to use which indicate a name
 124 # that is algorithmically determinable from its code point (and vice-versa).
 125 # These are used to significantly compact these tables, instead of listing
 126 # each one of the tens of thousands individually.
 127 #
 128 # In a match table, the value of a range is irrelevant (and hence the type as
 129 # well, which will always be 0), and arbitrarily set to the null string.
 130 # Using the example above, there would be two match tables for those two
 131 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 132 # named Lower would contain 0x61..0x7A.
 133 #
 134 # Actually, there are two types of range lists, "Range_Map" is the one
 135 # associated with map tables, and "Range_List" with match tables.
 136 # Again, this is so that methods can be defined on one and not the other so as
 137 # to prevent operating on them in incorrect ways.
 138 #
 139 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 140 # in the perl core.  All tables could in theory be written, but some are
 141 # suppressed because there is no current practical use for them.  It is easy
 142 # to change which get written by changing various lists that are near the top
 143 # of the actual code in this file.  The table data structures contain enough
 144 # ancillary information to allow them to be treated as separate entities for
 145 # writing, such as the path to each one's file.  There is a heading in each
 146 # map table that gives the format of its entries, and what the map is for all
 147 # the code points missing from it.  (This allows tables to be more compact.)
 148
 149 # The Property data structure contains one or more tables.  All properties
 150 # contain a map table (except the $perl property which is a
 151 # pseudo-property containing only match tables), and any properties that
 152 # are usable in regular expression matches also contain various matching
 153 # tables, one for each value the property can have.  A binary property can
 154 # have two values, True and False (or Y and N, which are preferred by Unicode
 155 # terminology).  Thus each of these properties will have a map table that
 156 # takes every code point and maps it to Y or N (but having ranges cuts the
 157 # number of entries in that table way down), and two match tables, one
 158 # which has a list of all the code points that map to Y, and one for all the
 159 # code points that map to N.  (For each of these, a third table is also
 160 # generated for the pseudo Perl property.  It contains the identical code
 161 # points as the Y table, but can be written, not in the compound form, but in
 162 # a "single" form like \p{IsUppercase}.)  Many properties are binary, but some
 163 # properties have several possible values, some have many, and properties like
 164 # Name have a different value for every named code point.  Those will not,
 165 # unless the controlling lists are changed, have their match tables written
 166 # out.  But all the ones which can be used in regular expression \p{} and \P{}
 167 # constructs will.  Generally a property will have either its map table or its
 168 # match tables written but not both.  Again, what gets written is controlled
 169 # by lists which can easily be changed.
 170
 171 # For information about the Unicode properties, see Unicode's UAX44 document:
 172
 173 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 174
 175 # As stated earlier, this program will work on any release of Unicode so far.
 176 # Most obvious problems in earlier data have NOT been corrected except when
 177 # necessary to make Perl or this program work reasonably.  For example, no
 178 # folding information was given in early releases, so this program uses the
 179 # substitute of lower case, just so that a regular expression with the /i
 180 # option will do something that actually gives the right results in many
 181 # cases.  There are also a couple other corrections for version 1.1.5,
 182 # commented at the point they are made.  As an example of corrections that
 183 # weren't made (but could be) is this statement from DerivedAge.txt: "The
 184 # supplementary private use code points and the non-character code points were
 185 # assigned in version 2.0, but not specifically listed in the UCD until
 186 # versions 3.0 and 3.1 respectively."  (To be precise it was 3.0.1 not 3.0.0)
 187 # More information on Unicode version glitches is further down in these
 188 # introductory comments.
 189 #
 190 # This program works on all properties as of 5.2, though the files for some
 191 # are suppressed from apparent lack of demand for.  You can change which are
 192 # output by changing lists in this program.
 193
 194 # The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
 195 # loose matchings rules (from Unicode TR18):
 196 #
 197 #    The recommended names for UCD properties and property values are in
 198 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 199 #    [PropValue]. There are both abbreviated names and longer, more
 200 #    descriptive names. It is strongly recommended that both names be
 201 #    recognized, and that loose matching of property names be used,
 202 #    whereby the case distinctions, whitespace, hyphens, and underbar
 203 #    are ignored.
 204 # The program still allows Fuzzy to override its determination of if loose
 205 # matching should be used, but it isn't currently used, as it is no longer
 206 # needed; the calculations it makes are good enough.
 207
 208 # SUMMARY OF HOW IT WORKS:
 209 #
 210 #   Process arguments
 211 #
 212 #   A list is constructed containing each input file that is to be processed
 213 #
 214 #   Each file on the list is processed in a loop, using the associated handler
 215 #   code for each:
 216 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 217 #            first.  These files name the properties and property values.
 218 #            Objects are created of all the property and property value names
 219 #            that the rest of the input should expect, including all synonyms.
 220 #        The other input files give mappings from properties to property
 221 #           values.  That is, they list code points and say what the mapping
 222 #           is under the given property.  Some files give the mappings for
 223 #           just one property; and some for many.  This program goes through
 224 #           each file and populates the properties from them.  Some properties
 225 #           are listed in more than one file, and Unicode has set up a
 226 #           precedence as to which has priority if there is a conflict.  Thus
 227 #           the order of processing matters, and this program handles the
 228 #           conflict possibility by processing the overriding input files
 229 #           last, so that if necessary they replace earlier values.
 230 #        After this is all done, the program creates the property mappings not
 231 #            furnished by Unicode, but derivable from what it does give.
 232 #        The tables of code points that match each property value in each
 233 #            property that is accessible by regular expressions are created.
 234 #        The Perl-defined properties are created and populated.  Many of these
 235 #            require data determined from the earlier steps
 236 #        Any Perl-defined synonyms are created, and name clashes between Perl
 237 #            and Unicode are reconciled.
 238 #        All the properties are written to files
 239 #        Any other files are written, and final warnings issued.
 240
 241 # As mentioned above, some properties are given in more than one file.  In
 242 # particular, the files in the extracted directory are supposedly just
 243 # reformattings of the others.  But they contain information not easily
 244 # derivable from the other files, including results for Unihan, which this
 245 # program doesn't ordinarily look at, and for unassigned code points.  They
 246 # also have historically had errors or been incomplete.  In an attempt to
 247 # create the best possible data, this program thus processes them first to
 248 # glean information missing from the other files; then processes those other
 249 # files to override any errors in the extracted ones.
 250
 251 # For clarity, a number of operators have been overloaded to work on tables:
 252 #   ~ means invert (take all characters not in the set).  The more
 253 #       conventional '!' is not used because of the possibility of confusing
 254 #       it with the actual boolean operation.
 255 #   + means union
 256 #   - means subtraction
 257 #   & means intersection
 258 # The precedence of these is the order listed.  Parentheses should be
 259 # copiously used.  These are not a general scheme.  The operations aren't
 260 # defined for a number of things, deliberately, to avoid getting into trouble.
 261 # Operations are done on references and affect the underlying structures, so
 262 # that the copy constructors for them have been overloaded to not return a new
 263 # clone, but the input object itself.
 264
 265 # The bool operator is deliberately not overloaded to avoid confusion with
 266 # "should it mean if the object merely exists, or also is non-empty?".
 267
 268 #
 269 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 270
 271 # XXX These comments need more work.
 272 #
 273 # Why have files written out for binary 'N' matches?
 274 #   For binary properties, if you know the mapping for either Y or N; the
 275 #   other is trivial to construct, so could be done at Perl run-time instead
 276 #   of having a file for it.  That is, if someone types in \p{foo: N}, Perl
 277 #   could translate that to \P{foo: Y} and not need a file.   The problem is
 278 #   communicating to Perl that a given property is binary.  Perl can't figure
 279 #   it out from looking at the N (or No), as some non-binary properties have
 280 #   these as property values.
 281 # Why
 282 # There are several types of properties, based on what form their values can
 283 # take on.  These are described in more detail below in the DATA STRUCTURES
 284 # section of these comments, but for now, you should know that there are
 285 # string properties, whose values are strings of one or more code points (such
 286 # as the Uppercase_mapping property); every other property maps to some other
 287 # form, like true or false, or a number, or a name, etc.  The reason there are
 288 # two directories for map files is because of the way utf8.c works.  It
 289 # expects that any files there are string properties, that is that the
 290 # mappings are each to one code point, with mappings in multiple code points
 291 # handled specially in an extra hash data structure.  Digit.pl is a table that
 292 # is written there for historical reasons, even though it doesn't fit that
 293 # mold.  Thus it can't currently be looked at by the Perl core.
 294 #
 295 # There are no match tables generated for matches of the null string.  These
 296 # would like like \p{JSN=}.  Perhaps something like them could be added if
 297 # necessary.  The JSN does have a real code point U+110B that maps to the null
 298 # string, but it is a contributory property, and therefore not output by
 299 # default.
 300 #
 301 # DEBUGGING
 302 #
 303 # XXX Add more stuff here.   use perl instead of miniperl to find problems with
 304 # Scalar::Util
 305
 306 # FUTURE ISSUES
 307 #
 308 # The program would break if Unicode were to change its names so that
 309 # interior white space, underscores, or dashes differences were significant
 310 # within property and property value names.
 311 #
 312 # It might be easier to use the xml versions of the UCD if this program ever
 313 # would need heavy revision, and the ability to handle old versions was not
 314 # required.
 315 #
 316 # There is the potential for name collisions, in that Perl has chosen names
 317 # that Unicode could decide it also likes.  There have been such collisions in
 318 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 319 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 320 # collisions, which were withdrawn before the final release, because of Perl's
 321 # and other's protests.  These all involved new properties which began with
 322 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 323 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 324 # Unicode document, so they are unlikely to be used by Unicode for another
 325 # purpose.  However, they might try something beginning with 'In', or use any
 326 # of the other Perl-defined properties.  This program will warn you of name
 327 # collisions, and refuse to generate tables with them, but manual intervention
 328 # will be required in this event.  One scheme that could be implemented, if
 329 # necessary, would be to have this program generate another file, or add a
 330 # field to mktables.lst that gives the date of first definition of a property.
 331 # Each new release of Unicode would use that file as a basis for the next
 332 # iteration.  And the Perl synonym addition code could sort based on the age
 333 # of the property, so older properties get priority, and newer ones that clash
 334 # would be refused; hence existing code would not be impacted, and some other
 335 # synonym would have to be used for the new property.  This is ugly, and
 336 # manual intervention would certainly be easier to do in the short run; lets
 337 # hope it never comes to this.
 338
 339 # A NOTE ON UNIHAN
 340 #
 341 # This program can generate tables from the Unihan database.  But it doesn't
 342 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 343 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 344 # database was split into 8 different files, all beginning with the letters
 345 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 346 # know which of the many properties in the file(s) should have tables created
 347 # for them.  It will create tables for any properties listed in
 348 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 349 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 350 # property you want is not in those files of the release you are building
 351 # against, you must add it to those two arrays.  Starting in 4.0, the
 352 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 353 # is present in the directory, a table will be generated for that property.
 354 # In 5.2, several more properties were added.  For your convenience, the two
 355 # arrays are initialized with all the 5.2 listed properties that are also in
 356 # earlier releases.  But these are commented out.  You can just uncomment the
 357 # ones you want, or use them as a template for adding entries for other
 358 # properties.
 359 #
 360 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 361 # and filter_unihan_line() are the functions where this is done.  This program
 362 # already does some adjusting to make the lines look more like the rest of the
 363 # Unicode DB;  You can see what that is in filter_unihan_line()
 364 #
 365 # There is a bug in the 3.2 data file in which some values for the
 366 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 367 # could be added for these; or for a particular installation, the Unihan.txt
 368 # file could be edited to fix them.
 369 # have to be
 370 #
 371 # HOW TO ADD A FILE
 372
 373 # Unicode Versions Notes
 374
 375 # alpha's numbers halve in 2.1.9, answer cjk block at 4E00 were removed from PropList; not changed, could add gc Letter, put back in in 3.1.0
 376 # Some versions of 2.1.x Jamo.txt have the wrong value for 1105, which causes
 377 # real problems for the algorithms for Jamo calculations, so it is changed
 378 # here.
 379 #   White space vs Space.  in 3.2 perl has +205F=medium math space, fixed in 4.0, and ok in 3.1.1 because not there in unicode. synonym introduced in 4.1
 380 # ATBL = 202.  202 changed to ATB, and all code points stayed there.  So if you were useing ATBL you were out of luck.
 381 # Hrkt Katakana_Or_Hiragana came in 4.01, before was Unknown.
 382 #
 383 # The default for missing code points for BidiClass is complicated.  Starting
 384 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 385 # tries to do the best it can for earlier releases.  It is done in
 386 # process_PropertyAliases()
 387 #
 388 ##############################################################################
 389
 390 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 391                         # and errors
 392 my $MAX_LINE_WIDTH = 78;
 393
 394 # Debugging aid to skip most files so as to not be distracted by them when
 395 # concentrating on the ones being debugged.  Add
 396 # non_skip => 1,
 397 # to the constructor for those files you want processed when you set this.
 398 # Files with a first version number of 0 are special: they are always
 399 # processed regardless of the state of this flag.
 400 my $debug_skip = 0;
 401
 402 # Set to 1 to enable tracing.
 403 our $to_trace = 0;
 404
 405 { # Closure for trace: debugging aid
 406     my $print_caller = 1;        # ? Include calling subroutine name
 407     my $main_with_colon = 'main::';
 408     my $main_colon_length = length($main_with_colon);
 409
 410     sub trace {
 411         return unless $to_trace;        # Do nothing if global flag not set
 412
 413         my @input = @_;
 414
 415         local $DB::trace = 0;
 416         $DB::trace = 0;          # Quiet 'used only once' message
 417
 418         my $line_number;
 419
 420         # Loop looking up the stack to get the first non-trace caller
 421         my $caller_line;
 422         my $caller_name;
 423         my $i = 0;
 424         do {
 425             $line_number = $caller_line;
 426             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 427             $caller = $main_with_colon unless defined $caller;
 428
 429             $caller_name = $caller;
 430
 431             # get rid of pkg
 432             $caller_name =~ s/.*:://;
 433             if (substr($caller_name, 0, $main_colon_length)
 434                 eq $main_with_colon)
 435             {
 436                 $caller_name = substr($caller_name, $main_colon_length);
 437             }
 438
 439         } until ($caller_name ne 'trace');
 440
 441         # If the stack was empty, we were called from the top level
 442         $caller_name = 'main' if ($caller_name eq ""
 443                                     || $caller_name eq 'trace');
 444
 445         my $output = "";
 446         foreach my $string (@input) {
 447             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 448             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 449                 $output .= simple_dumper($string);
 450             }
 451             else {
 452                 $string = "$string" if ref $string;
 453                 $string = $UNDEF unless defined $string;
 454                 chomp $string;
 455                 $string = '""' if $string eq "";
 456                 $output .= " " if $output ne ""
 457                                 && $string ne ""
 458                                 && substr($output, -1, 1) ne " "
 459                                 && substr($string, 0, 1) ne " ";
 460                 $output .= $string;
 461             }
 462         }
 463
 464         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 465         print STDERR "$caller_name: " if $print_caller;
 466         print STDERR $output, "\n";
 467         return;
 468     }
 469 }
 470
 471 # This is for a rarely used development feature that allows you to compare two
 472 # versions of the Unicode standard without having to deal with changes caused
 473 # by the code points introduced in the later verson.  Change the 0 to a SINGLE
 474 # dotted Unicode release number (e.g. 2.1).  Only code points introduced in
 475 # that release and earlier will be used; later ones are thrown away.  You use
 476 # the version number of the earliest one you want to compare; then run this
 477 # program on directory structures containing each release, and compare the
 478 # outputs.  These outputs will therefore include only the code points common
 479 # to both releases, and you can see the changes caused just by the underlying
 480 # release semantic changes.  For versions earlier than 3.2, you must copy a
 481 # version of DAge.txt into the directory.
 482 my $string_compare_versions = DEBUG && 0; #  e.g., v2.1;
 483 my $compare_versions = DEBUG
 484                        && $string_compare_versions
 485                        && pack "C*", split /\./, $string_compare_versions;
 486
 487 sub uniques {
 488     # Returns non-duplicated input values.  From "Perl Best Practices:
 489     # Encapsulated Cleverness".  p. 455 in first edition.
 490
 491     my %seen;
 492     return grep { ! $seen{$_}++ } @_;
 493 }
 494
 495 $0 = File::Spec->canonpath($0);
 496
 497 my $make_test_script = 0;      # ? Should we output a test script
 498 my $write_unchanged_files = 0; # ? Should we update the output files even if
 499                                #    we don't think they have changed
 500 my $use_directory = "";        # ? Should we chdir somewhere.
 501 my $pod_directory;             # input directory to store the pod file.
 502 my $pod_file = 'perluniprops';
 503 my $t_path;                     # Path to the .t test file
 504 my $file_list = 'mktables.lst'; # File to store input and output file names.
 505                                # This is used to speed up the build, by not
 506                                # executing the main body of the program if
 507                                # nothing on the list has changed since the
 508                                # previous build
 509 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 510                                # make a list so that when the pumpking is
 511                                # preparing a release, s/he won't have to do
 512                                # special things
 513 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 514                                # in the input.
 515 my $output_range_counts = 1;   # ? Should we include the number of code points
 516                                # in ranges in the output
 517 # Verbosity levels; 0 is quiet
 518 my $NORMAL_VERBOSITY = 1;
 519 my $PROGRESS = 2;
 520 my $VERBOSE = 3;
 521
 522 my $verbosity = $NORMAL_VERBOSITY;
 523
 524 # Process arguments
 525 while (@ARGV) {
 526     my $arg = shift @ARGV;
 527     if ($arg eq '-v') {
 528         $verbosity = $VERBOSE;
 529     }
 530     elsif ($arg eq '-p') {
 531         $verbosity = $PROGRESS;
 532         $| = 1;     # Flush buffers as we go.
 533     }
 534     elsif ($arg eq '-q') {
 535         $verbosity = 0;
 536     }
 537     elsif ($arg eq '-w') {
 538         $write_unchanged_files = 1; # update the files even if havent changed
 539     }
 540     elsif ($arg eq '-check') {
 541         my $this = shift @ARGV;
 542         my $ok = shift @ARGV;
 543         if ($this ne $ok) {
 544             print "Skipping as check params are not the same.\n";
 545             exit(0);
 546         }
 547     }
 548     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 549         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 550     }
 551     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 552     {
 553         $make_test_script = 1;
 554     }
 555     elsif ($arg eq '-makelist') {
 556         $make_list = 1;
 557     }
 558     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 559         -d $use_directory or croak "Unknown directory '$use_directory'";
 560     }
 561     elsif ($arg eq '-L') {
 562
 563         # Existence not tested until have chdir'd
 564         $file_list = shift;
 565     }
 566     elsif ($arg eq '-globlist') {
 567         $glob_list = 1;
 568     }
 569     elsif ($arg eq '-c') {
 570         $output_range_counts = ! $output_range_counts
 571     }
 572     else {
 573         my $with_c = 'with';
 574         $with_c .= 'out' if $output_range_counts;   # Complements the state
 575         croak <<END;
 576 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 577           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 578           [-check A B ]
 579   -c          : Output comments $with_c number of code points in ranges
 580   -q          : Quiet Mode: Only output serious warnings.
 581   -p          : Set verbosity level to normal plus show progress.
 582   -v          : Set Verbosity level high:  Show progress and non-serious
 583                 warnings
 584   -w          : Write files regardless
 585   -C dir      : Change to this directory before proceeding. All relative paths
 586                 except those specified by the -P and -T options will be done
 587                 with respect to this directory.
 588   -P dir      : Output $pod_file file to directory 'dir'.
 589   -T path     : Create a test script as 'path'; overrides -maketest
 590   -L filelist : Use alternate 'filelist' instead of standard one
 591   -globlist   : Take as input all non-Test *.txt files in current and sub
 592                 directories
 593   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 594                 overrides -T
 595   -makelist   : Rewrite the file list $file_list based on current setup
 596   -check A B  : Executes $0 only if A and B are the same
 597 END
 598     }
 599 }
 600
 601 # Stores the most-recently changed file.  If none have changed, can skip the
 602 # build
 603 my $youngest = -M $0;   # Do this before the chdir!
 604
 605 # Change directories now, because need to read 'version' early.
 606 if ($use_directory) {
 607     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 608         $pod_directory = File::Spec->rel2abs($pod_directory);
 609     }
 610     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 611         $t_path = File::Spec->rel2abs($t_path);
 612     }
 613     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 614     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 615         $pod_directory = File::Spec->abs2rel($pod_directory);
 616     }
 617     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 618         $t_path = File::Spec->abs2rel($t_path);
 619     }
 620 }
 621
 622 # Get Unicode version into regular and v-string.  This is done now because
 623 # various tables below get populated based on it.  These tables are populated
 624 # here to be near the top of the file, and so easily seeable by those needing
 625 # to modify things.
 626 open my $VERSION, "<", "version"
 627                     or croak "$0: can't open required file 'version': $!\n";
 628 my $string_version = <$VERSION>;
 629 close $VERSION;
 630 chomp $string_version;
 631 my $v_version = pack "C*", split /\./, $string_version;        # v string
 632
 633 # The following are the complete names of properties with property values that
 634 # are known to not match any code points in some versions of Unicode, but that
 635 # may change in the future so they should be matchable, hence an empty file is
 636 # generated for them.
 637 my @tables_that_may_be_empty = (
 638                                 'Joining_Type=Left_Joining',
 639                                 );
 640 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 641 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 642 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 643                                                     if $v_version ge v4.1.0;
 644
 645 # The lists below are hashes, so the key is the item in the list, and the
 646 # value is the reason why it is in the list.  This makes generation of
 647 # documentation easier.
 648
 649 my %why_suppressed;  # No file generated for these.
 650
 651 # Files aren't generated for empty extraneous properties.  This is arguable.
 652 # Extraneous properties generally come about because a property is no longer
 653 # used in a newer version of Unicode.  If we generated a file without code
 654 # points, programs that used to work on that property will still execute
 655 # without errors.  It just won't ever match (or will always match, with \P{}).
 656 # This means that the logic is now likely wrong.  I (khw) think its better to
 657 # find this out by getting an error message.  Just move them to the table
 658 # above to change this behavior
 659 my %why_suppress_if_empty_warn_if_not = (
 660
 661    # It is the only property that has ever officially been removed from the
 662    # Standard.  The database never contained any code points for it.
 663    'Special_Case_Condition' => 'Obsolete',
 664
 665    # Apparently never official, but there were code points in some versions of
 666    # old-style PropList.txt
 667    'Non_Break' => 'Obsolete',
 668 );
 669
 670 # These would normally go in the warn table just above, but they were changed
 671 # a long time before this program was written, so warnings about them are
 672 # moot.
 673 if ($v_version gt v3.2.0) {
 674     push @tables_that_may_be_empty,
 675                                 'Canonical_Combining_Class=Attached_Below_Left'
 676 }
 677
 678 # These are listed in the Property aliases file in 5.2, but Unihan is ignored
 679 # unless explicitly added.
 680 if ($v_version ge v5.2.0) {
 681     my $unihan = 'Unihan; remove from list if using Unihan';
 682     foreach my $table qw (
 683                            kAccountingNumeric
 684                            kOtherNumeric
 685                            kPrimaryNumeric
 686                            kCompatibilityVariant
 687                            kIICore
 688                            kIRG_GSource
 689                            kIRG_HSource
 690                            kIRG_JSource
 691                            kIRG_KPSource
 692                            kIRG_MSource
 693                            kIRG_KSource
 694                            kIRG_TSource
 695                            kIRG_USource
 696                            kIRG_VSource
 697                            kRSUnicode
 698                         )
 699     {
 700         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 701     }
 702 }
 703
 704 # Properties that this program ignores.
 705 my @unimplemented_properties = (
 706 'Unicode_Radical_Stroke'    # Remove if changing to handle this one.
 707 );
 708
 709 # There are several types of obsolete properties defined by Unicode.  These
 710 # must be hand-edited for every new Unicode release.
 711 my %why_deprecated;  # Generates a deprecated warning message if used.
 712 my %why_stabilized;  # Documentation only
 713 my %why_obsolete;    # Documentation only
 714
 715 {   # Closure
 716     my $simple = 'Perl uses the more complete version of this property';
 717     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 718
 719     my $other_properties = 'other properties';
 720     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 721     my $why_no_expand  = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
 722
 723     %why_deprecated = (
 724         'Grapheme_Link' => 'Deprecated by Unicode.  Use ccc=vr (Canonical_Combining_Class=Virama) instead',
 725         'Jamo_Short_Name' => $contributory,
 726         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 727         'Other_Alphabetic' => $contributory,
 728         'Other_Default_Ignorable_Code_Point' => $contributory,
 729         'Other_Grapheme_Extend' => $contributory,
 730         'Other_ID_Continue' => $contributory,
 731         'Other_ID_Start' => $contributory,
 732         'Other_Lowercase' => $contributory,
 733         'Other_Math' => $contributory,
 734         'Other_Uppercase' => $contributory,
 735     );
 736
 737     %why_suppressed = (
 738         # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
 739         # contains the same information, but without the algorithmically
 740         # determinable Hangul syllables'.  This file is not published, so it's
 741         # existence is not noted in the comment.
 742         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
 743
 744         'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo.  Obsoleted, and code points for it removed in Unicode 5.2',
 745         'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo.  If there is no later name for a code point, then this one is used instead in charnames",
 746
 747         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold",
 748         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 749         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 750         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 751
 752         'Name' => "Accessible via 'use charnames;'",
 753         'Name_Alias' => "Accessible via 'use charnames;'",
 754
 755         # These are sort of jumping the gun; deprecation is proposed for
 756         # Unicode version 6.0, but they have never been exposed by Perl, and
 757         # likely are soon to be deprecated, so best not to expose them.
 758         FC_NFKC_Closure => 'Use NFKC_Casefold instead',
 759         Expands_On_NFC => $why_no_expand,
 760         Expands_On_NFD => $why_no_expand,
 761         Expands_On_NFKC => $why_no_expand,
 762         Expands_On_NFKD => $why_no_expand,
 763     );
 764
 765     # The following are suppressed because they were made contributory or
 766     # deprecated by Unicode before Perl ever thought about supporting them.
 767     foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
 768         $why_suppressed{$property} = $why_deprecated{$property};
 769     }
 770
 771     # Customize the message for all the 'Other_' properties
 772     foreach my $property (keys %why_deprecated) {
 773         next if (my $main_property = $property) !~ s/^Other_//;
 774         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
 775     }
 776 }
 777
 778 if ($v_version ge 4.0.0) {
 779     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
 780 }
 781 if ($v_version ge 5.2.0) {
 782     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
 783 }
 784
 785 # Probably obsolete forever
 786 if ($v_version ge v4.1.0) {
 787     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common"';
 788 }
 789
 790 # This program can create files for enumerated-like properties, such as
 791 # 'Numeric_Type'.  This file would be the same format as for a string
 792 # property, with a mapping from code point to its value, so you could look up,
 793 # for example, the script a code point is in.  But no one so far wants this
 794 # mapping, or they have found another way to get it since this is a new
 795 # feature.  So no file is generated except if it is in this list.
 796 my @output_mapped_properties = split "\n", <<END;
 797 END
 798
 799 # If you are using the Unihan database, you need to add the properties that
 800 # you want to extract from it to this table.  For your convenience, the
 801 # properties in the 5.2 PropertyAliases.txt file are listed, commented out
 802 my @cjk_properties = split "\n", <<'END';
 803 #cjkAccountingNumeric; kAccountingNumeric
 804 #cjkOtherNumeric; kOtherNumeric
 805 #cjkPrimaryNumeric; kPrimaryNumeric
 806 #cjkCompatibilityVariant; kCompatibilityVariant
 807 #cjkIICore ; kIICore
 808 #cjkIRG_GSource; kIRG_GSource
 809 #cjkIRG_HSource; kIRG_HSource
 810 #cjkIRG_JSource; kIRG_JSource
 811 #cjkIRG_KPSource; kIRG_KPSource
 812 #cjkIRG_KSource; kIRG_KSource
 813 #cjkIRG_TSource; kIRG_TSource
 814 #cjkIRG_USource; kIRG_USource
 815 #cjkIRG_VSource; kIRG_VSource
 816 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
 817 END
 818
 819 # Similarly for the property values.  For your convenience, the lines in the
 820 # 5.2 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
 821 # '#' marks
 822 my @cjk_property_values = split "\n", <<'END';
 823 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
 824 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
 825 ## @missing: 0000..10FFFF; cjkIICore; <none>
 826 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
 827 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
 828 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
 829 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
 830 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
 831 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
 832 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
 833 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
 834 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
 835 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
 836 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
 837 END
 838
 839 # The input files don't list every code point.  Those not listed are to be
 840 # defaulted to some value.  Below are hard-coded what those values are for
 841 # non-binary properties as of 5.1.  Starting in 5.0, there are
 842 # machine-parsable comment lines in the files the give the defaults; so this
 843 # list shouldn't have to be extended.  The claim is that all missing entries
 844 # for binary properties will default to 'N'.  Unicode tried to change that in
 845 # 5.2, but the beta period produced enough protest that they backed off.
 846 #
 847 # The defaults for the fields that appear in UnicodeData.txt in this hash must
 848 # be in the form that it expects.  The others may be synonyms.
 849 my $CODE_POINT = '<code point>';
 850 my %default_mapping = (
 851     Age => "Unassigned",
 852     # Bidi_Class => Complicated; set in code
 853     Bidi_Mirroring_Glyph => "",
 854     Block => 'No_Block',
 855     Canonical_Combining_Class => 0,
 856     Case_Folding => $CODE_POINT,
 857     Decomposition_Mapping => $CODE_POINT,
 858     Decomposition_Type => 'None',
 859     East_Asian_Width => "Neutral",
 860     FC_NFKC_Closure => $CODE_POINT,
 861     General_Category => 'Cn',
 862     Grapheme_Cluster_Break => 'Other',
 863     Hangul_Syllable_Type => 'NA',
 864     ISO_Comment => "",
 865     Jamo_Short_Name => "",
 866     Joining_Group => "No_Joining_Group",
 867     # Joining_Type => Complicated; set in code
 868     kIICore => 'N',   #                       Is converted to binary
 869     #Line_Break => Complicated; set in code
 870     Lowercase_Mapping => $CODE_POINT,
 871     Name => "",
 872     Name_Alias => "",
 873     NFC_QC => 'Yes',
 874     NFD_QC => 'Yes',
 875     NFKC_QC => 'Yes',
 876     NFKD_QC => 'Yes',
 877     Numeric_Type => 'None',
 878     Numeric_Value => 'NaN',
 879     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
 880     Sentence_Break => 'Other',
 881     Simple_Case_Folding => $CODE_POINT,
 882     Simple_Lowercase_Mapping => $CODE_POINT,
 883     Simple_Titlecase_Mapping => $CODE_POINT,
 884     Simple_Uppercase_Mapping => $CODE_POINT,
 885     Titlecase_Mapping => $CODE_POINT,
 886     Unicode_1_Name => "",
 887     Unicode_Radical_Stroke => "",
 888     Uppercase_Mapping => $CODE_POINT,
 889     Word_Break => 'Other',
 890 );
 891
 892 # Below are files that Unicode furnishes, but this program ignores, and why
 893 my %ignored_files = (
 894     'CJKRadicals.txt' => 'Unihan data',
 895     'Index.txt' => 'An index, not actual data',
 896     'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
 897     'NamesList.txt' => 'Just adds commentary',
 898     'NormalizationCorrections.txt' => 'Data is already in other files.',
 899     'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
 900     'ReadMe.txt' => 'Just comments',
 901     'README.TXT' => 'Just comments',
 902     'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property.  Does not fit into current scheme where one code point is mapped',
 903 );
 904
 905 ################ End of externally interesting definitions ###############
 906
 907 my $HEADER=<<"EOF";
 908 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 909 # This file is machine-generated by $0 from the Unicode
 910 # database, Version $string_version.  Any changes made here will be lost!
 911 EOF
 912
 913 my $INTERNAL_ONLY=<<"EOF";
 914
 915 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
 916 # This file is for internal use by the Perl program only.  The format and even
 917 # the name or existence of this file are subject to change without notice.
 918 # Don't use it directly.
 919 EOF
 920
 921 my $DEVELOPMENT_ONLY=<<"EOF";
 922 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
 923 # This file contains information artificially constrained to code points
 924 # present in Unicode release $string_compare_versions.
 925 # IT CANNOT BE RELIED ON.  It is for use during development only and should
 926 # not be used for production.
 927
 928 EOF
 929
 930 my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
 931 my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
 932 my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
 933
 934 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
 935 # two must be 10; if there are 5, the first must not be a 0.  Written this way
 936 # to decrease backtracking
 937 my $code_point_re =
 938         qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
 939
 940 # This matches the beginning of the line in the Unicode db files that give the
 941 # defaults for code points not listed (i.e., missing) in the file.  The code
 942 # depends on this ending with a semi-colon, so it can assume it is a valid
 943 # field when the line is split() by semi-colons
 944 my $missing_defaults_prefix =
 945             qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
 946
 947 # Property types.  Unicode has more types, but these are sufficient for our
 948 # purposes.
 949 my $UNKNOWN = -1;   # initialized to illegal value
 950 my $NON_STRING = 1; # Either binary or enum
 951 my $BINARY = 2;
 952 my $ENUM = 3;       # Include catalog
 953 my $STRING = 4;     # Anything else: string or misc
 954
 955 # Some input files have lines that give default values for code points not
 956 # contained in the file.  Sometimes these should be ignored.
 957 my $NO_DEFAULTS = 0;        # Must evaluate to false
 958 my $NOT_IGNORED = 1;
 959 my $IGNORED = 2;
 960
 961 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
 962 # and will appear in the main body of the tables in the output files, but
 963 # there are other types of ranges as well, listed below, that are specially
 964 # handled.   There are pseudo-types as well that will never be stored as a
 965 # type, but will affect the calculation of the type.
 966
 967 # 0 is for normal, non-specials
 968 my $MULTI_CP = 1;           # Sequence of more than code point
 969 my $HANGUL_SYLLABLE = 2;
 970 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
 971 my $NULL = 4;               # The map is to the null string; utf8.c can't
 972                             # handle these, nor is there an accepted syntax
 973                             # for them in \p{} constructs
 974 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
 975                              # otherwise be $MULTI_CP type are instead type 0
 976
 977 # process_generic_property_file() can accept certain overrides in its input.
 978 # Each of these must begin AND end with $CMD_DELIM.
 979 my $CMD_DELIM = "\a";
 980 my $REPLACE_CMD = 'replace';    # Override the Replace
 981 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
 982
 983 my $NO = 0;
 984 my $YES = 1;
 985
 986 # Values for the Replace argument to add_range.
 987 # $NO                      # Don't replace; add only the code points not
 988                            # already present.
 989 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
 990                            # the comments at the subroutine definition.
 991 my $UNCONDITIONALLY = 2;   # Replace without conditions.
 992 my $MULTIPLE = 4;          # Don't replace, but add a duplicate record if
 993                            # already there
 994
 995 # Flags to give property statuses.  The phrases are to remind maintainers that
 996 # if the flag is changed, the indefinite article referring to it in the
 997 # documentation may need to be as well.
 998 my $NORMAL = "";
 999 my $SUPPRESSED = 'z';   # The character should never actually be seen, since
1000                         # it is suppressed
1001 my $DEPRECATED = 'D';
1002 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1003 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1004 my $DISCOURAGED = 'X';
1005 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1006 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1007 my $STRICTER = 'T';
1008 my $a_bold_stricter = "a 'B<$STRICTER>'";
1009 my $A_bold_stricter = "A 'B<$STRICTER>'";
1010 my $STABILIZED = 'S';
1011 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1012 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1013 my $OBSOLETE = 'O';
1014 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1015 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1016
1017 my %status_past_participles = (
1018     $DISCOURAGED => 'discouraged',
1019     $SUPPRESSED => 'should never be generated',
1020     $STABILIZED => 'stabilized',
1021     $OBSOLETE => 'obsolete',
1022     $DEPRECATED => 'deprecated'
1023 );
1024
1025 # The format of the values of the map tables:
1026 my $BINARY_FORMAT = 'b';
1027 my $DECIMAL_FORMAT = 'd';
1028 my $FLOAT_FORMAT = 'f';
1029 my $INTEGER_FORMAT = 'i';
1030 my $HEX_FORMAT = 'x';
1031 my $RATIONAL_FORMAT = 'r';
1032 my $STRING_FORMAT = 's';
1033
1034 my %map_table_formats = (
1035     $BINARY_FORMAT => 'binary',
1036     $DECIMAL_FORMAT => 'single decimal digit',
1037     $FLOAT_FORMAT => 'floating point number',
1038     $INTEGER_FORMAT => 'integer',
1039     $HEX_FORMAT => 'positive hex whole number; a code point',
1040     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1041     $STRING_FORMAT => 'arbitrary string',
1042 );
1043
1044 # Unicode didn't put such derived files in a separate directory at first.
1045 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1046 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1047 my $AUXILIARY = 'auxiliary';
1048
1049 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1050 my %loose_to_file_of;       # loosely maps table names to their respective
1051                             # files
1052 my %stricter_to_file_of;    # same; but for stricter mapping.
1053 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1054                              # their rational equivalent
1055 my %loose_property_name_of; # Loosely maps property names to standard form
1056
1057 # These constants names and values were taken from the Unicode standard,
1058 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1059 # syllables
1060 my $SBase = 0xAC00;
1061 my $LBase = 0x1100;
1062 my $VBase = 0x1161;
1063 my $TBase = 0x11A7;
1064 my $SCount = 11172;
1065 my $LCount = 19;
1066 my $VCount = 21;
1067 my $TCount = 28;
1068 my $NCount = $VCount * $TCount;
1069
1070 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1071 # with the above published constants.
1072 my %Jamo;
1073 my %Jamo_L;     # Leading consonants
1074 my %Jamo_V;     # Vowels
1075 my %Jamo_T;     # Trailing consonants
1076
1077 my @unhandled_properties;  # Will contain a list of properties found in
1078                            # the input that we didn't process.
1079 my @match_properties;      # Properties that have match tables, to be
1080                            # listed in the pod
1081 my @map_properties;        # Properties that get map files written
1082 my @named_sequences;       # NamedSequences.txt contents.
1083 my %potential_files;       # Generated list of all .txt files in the directory
1084                            # structure so we can warn if something is being
1085                            # ignored.
1086 my @files_actually_output; # List of files we generated.
1087 my @more_Names;            # Some code point names are compound; this is used
1088                            # to store the extra components of them.
1089 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1090                            # the minimum before we consider it equivalent to a
1091                            # candidate rational
1092 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1093
1094 # These store references to certain commonly used property objects
1095 my $gc;
1096 my $perl;
1097 my $block;
1098
1099 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1100 my $has_In_conflicts = 0;
1101 my $has_Is_conflicts = 0;
1102
1103 sub internal_file_to_platform ($) {
1104     # Convert our file paths which have '/' separators to those of the
1105     # platform.
1106
1107     my $file = shift;
1108     return undef unless defined $file;
1109
1110     return File::Spec->join(split '/', $file);
1111 }
1112
1113 sub file_exists ($) {   # platform independent '-e'.  This program internally
1114                         # uses slash as a path separator.
1115     my $file = shift;
1116     return 0 if ! defined $file;
1117     return -e internal_file_to_platform($file);
1118 }
1119
1120 sub objaddr($) {
1121     # Returns the address of the blessed input object.
1122     # It doesn't check for blessedness because that would do a string eval
1123     # every call, and the program is structured so that this is never called
1124     # for a non-blessed object.
1125
1126     no overloading; # If overloaded, numifying below won't work.
1127
1128     # Numifying a ref gives its address.
1129     return 0 + $_[0];
1130 }
1131
1132 # Commented code below should work on Perl 5.8.
1133 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1134 ## the native perl version of it (which is what would operate under miniperl)
1135 ## is extremely slow, as it does a string eval every call.
1136 #my $has_fast_scalar_util = $\18 !~ /miniperl/
1137 #                            && defined eval "require Scalar::Util";
1138 #
1139 #sub objaddr($) {
1140 #    # Returns the address of the blessed input object.  Uses the XS version if
1141 #    # available.  It doesn't check for blessedness because that would do a
1142 #    # string eval every call, and the program is structured so that this is
1143 #    # never called for a non-blessed object.
1144 #
1145 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1146 #
1147 #    # Check at least that is a ref.
1148 #    my $pkg = ref($_[0]) or return undef;
1149 #
1150 #    # Change to a fake package to defeat any overloaded stringify
1151 #    bless $_[0], 'main::Fake';
1152 #
1153 #    # Numifying a ref gives its address.
1154 #    my $addr = 0 + $_[0];
1155 #
1156 #    # Return to original class
1157 #    bless $_[0], $pkg;
1158 #    return $addr;
1159 #}
1160
1161 sub max ($$) {
1162     my $a = shift;
1163     my $b = shift;
1164     return $a if $a >= $b;
1165     return $b;
1166 }
1167
1168 sub min ($$) {
1169     my $a = shift;
1170     my $b = shift;
1171     return $a if $a <= $b;
1172     return $b;
1173 }
1174
1175 sub clarify_number ($) {
1176     # This returns the input number with underscores inserted every 3 digits
1177     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1178     # checked.
1179
1180     my $number = shift;
1181     my $pos = length($number) - 3;
1182     return $number if $pos <= 1;
1183     while ($pos > 0) {
1184         substr($number, $pos, 0) = '_';
1185         $pos -= 3;
1186     }
1187     return $number;
1188 }
1189
1190
1191 package Carp;
1192
1193 # These routines give a uniform treatment of messages in this program.  They
1194 # are placed in the Carp package to cause the stack trace to not include them,
1195 # although an alternative would be to use another package and set @CARP_NOT
1196 # for it.
1197
1198 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1199
1200 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1201 # and overload trying to load Scalar:Util under miniperl.  See
1202 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1203 undef $overload::VERSION;
1204
1205 sub my_carp {
1206     my $message = shift || "";
1207     my $nofold = shift || 0;
1208
1209     if ($message) {
1210         $message = main::join_lines($message);
1211         $message =~ s/^$0: *//;     # Remove initial program name
1212         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1213         $message = "\n$0: $message;";
1214
1215         # Fold the message with program name, semi-colon end punctuation
1216         # (which looks good with the message that carp appends to it), and a
1217         # hanging indent for continuation lines.
1218         $message = main::simple_fold($message, "", 4) unless $nofold;
1219         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1220                                     # appends is to the same line
1221     }
1222
1223     return $message if defined wantarray;   # If a caller just wants the msg
1224
1225     carp $message;
1226     return;
1227 }
1228
1229 sub my_carp_bug {
1230     # This is called when it is clear that the problem is caused by a bug in
1231     # this program.
1232
1233     my $message = shift;
1234     $message =~ s/^$0: *//;
1235     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1236     carp $message;
1237     return;
1238 }
1239
1240 sub carp_too_few_args {
1241     if (@_ != 2) {
1242         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1243         return;
1244     }
1245
1246     my $args_ref = shift;
1247     my $count = shift;
1248
1249     my_carp_bug("Need at least $count arguments to "
1250         . (caller 1)[3]
1251         . ".  Instead got: '"
1252         . join ', ', @$args_ref
1253         . "'.  No action taken.");
1254     return;
1255 }
1256
1257 sub carp_extra_args {
1258     my $args_ref = shift;
1259     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1260
1261     unless (ref $args_ref) {
1262         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1263         return;
1264     }
1265     my ($package, $file, $line) = caller;
1266     my $subroutine = (caller 1)[3];
1267
1268     my $list;
1269     if (ref $args_ref eq 'HASH') {
1270         foreach my $key (keys %$args_ref) {
1271             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1272         }
1273         $list = join ', ', each %{$args_ref};
1274     }
1275     elsif (ref $args_ref eq 'ARRAY') {
1276         foreach my $arg (@$args_ref) {
1277             $arg = $UNDEF unless defined $arg;
1278         }
1279         $list = join ', ', @$args_ref;
1280     }
1281     else {
1282         my_carp_bug("Can't cope with ref "
1283                 . ref($args_ref)
1284                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1285         return;
1286     }
1287
1288     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1289     return;
1290 }
1291
1292 package main;
1293
1294 { # Closure
1295
1296     # This program uses the inside-out method for objects, as recommended in
1297     # "Perl Best Practices".  This closure aids in generating those.  There
1298     # are two routines.  setup_package() is called once per package to set
1299     # things up, and then set_access() is called for each hash representing a
1300     # field in the object.  These routines arrange for the object to be
1301     # properly destroyed when no longer used, and for standard accessor
1302     # functions to be generated.  If you need more complex accessors, just
1303     # write your own and leave those accesses out of the call to set_access().
1304     # More details below.
1305
1306     my %constructor_fields; # fields that are to be used in constructors; see
1307                             # below
1308
1309     # The values of this hash will be the package names as keys to other
1310     # hashes containing the name of each field in the package as keys, and
1311     # references to their respective hashes as values.
1312     my %package_fields;
1313
1314     sub setup_package {
1315         # Sets up the package, creating standard DESTROY and dump methods
1316         # (unless already defined).  The dump method is used in debugging by
1317         # simple_dumper().
1318         # The optional parameters are:
1319         #   a)  a reference to a hash, that gets populated by later
1320         #       set_access() calls with one of the accesses being
1321         #       'constructor'.  The caller can then refer to this, but it is
1322         #       not otherwise used by these two routines.
1323         #   b)  a reference to a callback routine to call during destruction
1324         #       of the object, before any fields are actually destroyed
1325
1326         my %args = @_;
1327         my $constructor_ref = delete $args{'Constructor_Fields'};
1328         my $destroy_callback = delete $args{'Destroy_Callback'};
1329         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1330
1331         my %fields;
1332         my $package = (caller)[0];
1333
1334         $package_fields{$package} = \%fields;
1335         $constructor_fields{$package} = $constructor_ref;
1336
1337         unless ($package->can('DESTROY')) {
1338             my $destroy_name = "${package}::DESTROY";
1339             no strict "refs";
1340
1341             # Use typeglob to give the anonymous subroutine the name we want
1342             *$destroy_name = sub {
1343                 my $self = shift;
1344                 my $addr = main::objaddr($self);
1345
1346                 $self->$destroy_callback if $destroy_callback;
1347                 foreach my $field (keys %{$package_fields{$package}}) {
1348                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1349                     delete $package_fields{$package}{$field}{$addr};
1350                 }
1351                 return;
1352             }
1353         }
1354
1355         unless ($package->can('dump')) {
1356             my $dump_name = "${package}::dump";
1357             no strict "refs";
1358             *$dump_name = sub {
1359                 my $self = shift;
1360                 return dump_inside_out($self, $package_fields{$package}, @_);
1361             }
1362         }
1363         return;
1364     }
1365
1366     sub set_access {
1367         # Arrange for the input field to be garbage collected when no longer
1368         # needed.  Also, creates standard accessor functions for the field
1369         # based on the optional parameters-- none if none of these parameters:
1370         #   'addable'    creates an 'add_NAME()' accessor function.
1371         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1372         #                function.
1373         #   'settable'   creates a 'set_NAME()' accessor function.
1374         #   'constructor' doesn't create an accessor function, but adds the
1375         #                field to the hash that was previously passed to
1376         #                setup_package();
1377         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1378         # 'add' etc. all mean 'addable'.
1379         # The read accessor function will work on both array and scalar
1380         # values.  If another accessor in the parameter list is 'a', the read
1381         # access assumes an array.  You can also force it to be array access
1382         # by specifying 'readable_array' instead of 'readable'
1383         #
1384         # A sort-of 'protected' access can be set-up by preceding the addable,
1385         # readable or settable with some initial portion of 'protected_' (but,
1386         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1387         # "protection" is only by convention.  All that happens is that the
1388         # accessor functions' names begin with an underscore.  So instead of
1389         # calling set_foo, the call is _set_foo.  (Real protection could be
1390         # accomplished by having a new subroutine, end_package called at the
1391         # end of each package, and then storing the __LINE__ ranges and
1392         # checking them on every accessor.  But that is way overkill.)
1393
1394         # We create anonymous subroutines as the accessors and then use
1395         # typeglobs to assign them to the proper package and name
1396
1397         my $name = shift;   # Name of the field
1398         my $field = shift;  # Reference to the inside-out hash containing the
1399                             # field
1400
1401         my $package = (caller)[0];
1402
1403         if (! exists $package_fields{$package}) {
1404             croak "$0: Must call 'setup_package' before 'set_access'";
1405         }
1406
1407         # Stash the field so DESTROY can get it.
1408         $package_fields{$package}{$name} = $field;
1409
1410         # Remaining arguments are the accessors.  For each...
1411         foreach my $access (@_) {
1412             my $access = lc $access;
1413
1414             my $protected = "";
1415
1416             # Match the input as far as it goes.
1417             if ($access =~ /^(p[^_]*)_/) {
1418                 $protected = $1;
1419                 if (substr('protected_', 0, length $protected)
1420                     eq $protected)
1421                 {
1422
1423                     # Add 1 for the underscore not included in $protected
1424                     $access = substr($access, length($protected) + 1);
1425                     $protected = '_';
1426                 }
1427                 else {
1428                     $protected = "";
1429                 }
1430             }
1431
1432             if (substr('addable', 0, length $access) eq $access) {
1433                 my $subname = "${package}::${protected}add_$name";
1434                 no strict "refs";
1435
1436                 # add_ accessor.  Don't add if already there, which we
1437                 # determine using 'eq' for scalars and '==' otherwise.
1438                 *$subname = sub {
1439                     use strict "refs";
1440                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1441                     my $self = shift;
1442                     my $value = shift;
1443                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1444                     if (ref $value) {
1445                         return if grep { $value == $_ }
1446                                             @{$field->{main::objaddr $self}};
1447                     }
1448                     else {
1449                         return if grep { $value eq $_ }
1450                                             @{$field->{main::objaddr $self}};
1451                     }
1452                     push @{$field->{main::objaddr $self}}, $value;
1453                     return;
1454                 }
1455             }
1456             elsif (substr('constructor', 0, length $access) eq $access) {
1457                 if ($protected) {
1458                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1459                 }
1460                 else {
1461                     $constructor_fields{$package}{$name} = $field;
1462                 }
1463             }
1464             elsif (substr('readable_array', 0, length $access) eq $access) {
1465
1466                 # Here has read access.  If one of the other parameters for
1467                 # access is array, or this one specifies array (by being more
1468                 # than just 'readable_'), then create a subroutine that
1469                 # assumes the data is an array.  Otherwise just a scalar
1470                 my $subname = "${package}::${protected}$name";
1471                 if (grep { /^a/i } @_
1472                     or length($access) > length('readable_'))
1473                 {
1474                     no strict "refs";
1475                     *$subname = sub {
1476                         use strict "refs";
1477                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1478                         my $addr = main::objaddr $_[0];
1479                         if (ref $field->{$addr} ne 'ARRAY') {
1480                             my $type = ref $field->{$addr};
1481                             $type = 'scalar' unless $type;
1482                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1483                             return;
1484                         }
1485                         return scalar @{$field->{$addr}} unless wantarray;
1486
1487                         # Make a copy; had problems with caller modifying the
1488                         # original otherwise
1489                         my @return = @{$field->{$addr}};
1490                         return @return;
1491                     }
1492                 }
1493                 else {
1494
1495                     # Here not an array value, a simpler function.
1496                     no strict "refs";
1497                     *$subname = sub {
1498                         use strict "refs";
1499                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1500                         return $field->{main::objaddr $_[0]};
1501                     }
1502                 }
1503             }
1504             elsif (substr('settable', 0, length $access) eq $access) {
1505                 my $subname = "${package}::${protected}set_$name";
1506                 no strict "refs";
1507                 *$subname = sub {
1508                     use strict "refs";
1509                     if (main::DEBUG) {
1510                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1511                         Carp::carp_extra_args(\@_) if @_ > 2;
1512                     }
1513                     # $self is $_[0]; $value is $_[1]
1514                     $field->{main::objaddr $_[0]} = $_[1];
1515                     return;
1516                 }
1517             }
1518             else {
1519                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
1520             }
1521         }
1522         return;
1523     }
1524 }
1525
1526 package Input_file;
1527
1528 # All input files use this object, which stores various attributes about them,
1529 # and provides for convenient, uniform handling.  The run method wraps the
1530 # processing.  It handles all the bookkeeping of opening, reading, and closing
1531 # the file, returning only significant input lines.
1532 #
1533 # Each object gets a handler which processes the body of the file, and is
1534 # called by run().  Most should use the generic, default handler, which has
1535 # code scrubbed to handle things you might not expect.  A handler should
1536 # basically be a while(next_line()) {...} loop.
1537 #
1538 # You can also set up handlers to
1539 #   1) call before the first line is read for pre processing
1540 #   2) call to adjust each line of the input before the main handler gets them
1541 #   3) call upon EOF before the main handler exits its loop
1542 #   4) call at the end for post processing
1543 #
1544 # $_ is used to store the input line, and is to be filtered by the
1545 # each_line_handler()s.  So, if the format of the line is not in the desired
1546 # format for the main handler, these are used to do that adjusting.  They can
1547 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1548 # so the $_ output of one is used as the input to the next.  None of the other
1549 # handlers are stackable, but could easily be changed to be so.
1550 #
1551 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1552 # which insert the parameters as lines to be processed before the next input
1553 # file line is read.  This allows the EOF handler to flush buffers, for
1554 # example.  The difference between the two routines is that the lines inserted
1555 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
1556 # called it from such a handler, you would get infinite recursion.)  Lines
1557 # inserted by insert_adjusted_lines() go directly to the main handler without
1558 # any adjustments.  If the  post-processing handler calls any of these, there
1559 # will be no effect.  Some error checking for these conditions could be added,
1560 # but it hasn't been done.
1561 #
1562 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1563 # to prevent further processing of the line.  This routine will output the
1564 # message as a warning once, and then keep a count of the lines that have the
1565 # same message, and output that count at the end of the file's processing.
1566 # This keeps the number of messages down to a manageable amount.
1567 #
1568 # get_missings() should be called to retrieve any @missing input lines.
1569 # Messages will be raised if this isn't done if the options aren't to ignore
1570 # missings.
1571
1572 sub trace { return main::trace(@_); }
1573
1574 { # Closure
1575     # Keep track of fields that are to be put into the constructor.
1576     my %constructor_fields;
1577
1578     main::setup_package(Constructor_Fields => \%constructor_fields);
1579
1580     my %file; # Input file name, required
1581     main::set_access('file', \%file, qw{ c r });
1582
1583     my %first_released; # Unicode version file was first released in, required
1584     main::set_access('first_released', \%first_released, qw{ c r });
1585
1586     my %handler;    # Subroutine to process the input file, defaults to
1587                     # 'process_generic_property_file'
1588     main::set_access('handler', \%handler, qw{ c });
1589
1590     my %property;
1591     # name of property this file is for.  defaults to none, meaning not
1592     # applicable, or is otherwise determinable, for example, from each line.
1593     main::set_access('property', \%property, qw{ c });
1594
1595     my %optional;
1596     # If this is true, the file is optional.  If not present, no warning is
1597     # output.  If it is present, the string given by this parameter is
1598     # evaluated, and if false the file is not processed.
1599     main::set_access('optional', \%optional, 'c', 'r');
1600
1601     my %non_skip;
1602     # This is used for debugging, to skip processing of all but a few input
1603     # files.  Add 'non_skip => 1' to the constructor for those files you want
1604     # processed when you set the $debug_skip global.
1605     main::set_access('non_skip', \%non_skip, 'c');
1606
1607     my %each_line_handler;
1608     # list of subroutines to look at and filter each non-comment line in the
1609     # file.  defaults to none.  The subroutines are called in order, each is
1610     # to adjust $_ for the next one, and the final one adjusts it for
1611     # 'handler'
1612     main::set_access('each_line_handler', \%each_line_handler, 'c');
1613
1614     my %has_missings_defaults;
1615     # ? Are there lines in the file giving default values for code points
1616     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
1617     # the norm, but IGNORED means it has such lines, but the handler doesn't
1618     # use them.  Having these three states allows us to catch changes to the
1619     # UCD that this program should track
1620     main::set_access('has_missings_defaults',
1621                                         \%has_missings_defaults, qw{ c r });
1622
1623     my %pre_handler;
1624     # Subroutine to call before doing anything else in the file.  If undef, no
1625     # such handler is called.
1626     main::set_access('pre_handler', \%pre_handler, qw{ c });
1627
1628     my %eof_handler;
1629     # Subroutine to call upon getting an EOF on the input file, but before
1630     # that is returned to the main handler.  This is to allow buffers to be
1631     # flushed.  The handler is expected to call insert_lines() or
1632     # insert_adjusted() with the buffered material
1633     main::set_access('eof_handler', \%eof_handler, qw{ c r });
1634
1635     my %post_handler;
1636     # Subroutine to call after all the lines of the file are read in and
1637     # processed.  If undef, no such handler is called.
1638     main::set_access('post_handler', \%post_handler, qw{ c });
1639
1640     my %progress_message;
1641     # Message to print to display progress in lieu of the standard one
1642     main::set_access('progress_message', \%progress_message, qw{ c });
1643
1644     my %handle;
1645     # cache open file handle, internal.  Is undef if file hasn't been
1646     # processed at all, empty if has;
1647     main::set_access('handle', \%handle);
1648
1649     my %added_lines;
1650     # cache of lines added virtually to the file, internal
1651     main::set_access('added_lines', \%added_lines);
1652
1653     my %errors;
1654     # cache of errors found, internal
1655     main::set_access('errors', \%errors);
1656
1657     my %missings;
1658     # storage of '@missing' defaults lines
1659     main::set_access('missings', \%missings);
1660
1661     sub new {
1662         my $class = shift;
1663
1664         my $self = bless \do{ my $anonymous_scalar }, $class;
1665         my $addr = main::objaddr($self);
1666
1667         # Set defaults
1668         $handler{$addr} = \&main::process_generic_property_file;
1669         $non_skip{$addr} = 0;
1670         $has_missings_defaults{$addr} = $NO_DEFAULTS;
1671         $handle{$addr} = undef;
1672         $added_lines{$addr} = [ ];
1673         $each_line_handler{$addr} = [ ];
1674         $errors{$addr} = { };
1675         $missings{$addr} = [ ];
1676
1677         # Two positional parameters.
1678         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1679         $file{$addr} = main::internal_file_to_platform(shift);
1680         $first_released{$addr} = shift;
1681
1682         # The rest of the arguments are key => value pairs
1683         # %constructor_fields has been set up earlier to list all possible
1684         # ones.  Either set or push, depending on how the default has been set
1685         # up just above.
1686         my %args = @_;
1687         foreach my $key (keys %args) {
1688             my $argument = $args{$key};
1689
1690             # Note that the fields are the lower case of the constructor keys
1691             my $hash = $constructor_fields{lc $key};
1692             if (! defined $hash) {
1693                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
1694                 next;
1695             }
1696             if (ref $hash->{$addr} eq 'ARRAY') {
1697                 if (ref $argument eq 'ARRAY') {
1698                     foreach my $argument (@{$argument}) {
1699                         next if ! defined $argument;
1700                         push @{$hash->{$addr}}, $argument;
1701                     }
1702                 }
1703                 else {
1704                     push @{$hash->{$addr}}, $argument if defined $argument;
1705                 }
1706             }
1707             else {
1708                 $hash->{$addr} = $argument;
1709             }
1710             delete $args{$key};
1711         };
1712
1713         # If the file has a property for it, it means that the property is not
1714         # listed in the file's entries.  So add a handler to the list of line
1715         # handlers to insert the property name into the lines, to provide a
1716         # uniform interface to the final processing subroutine.
1717         # the final code doesn't have to worry about that.
1718         if ($property{$addr}) {
1719             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
1720         }
1721
1722         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
1723             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
1724         }
1725
1726         return $self;
1727     }
1728
1729
1730     use overload
1731         fallback => 0,
1732         qw("") => "_operator_stringify",
1733         "." => \&main::_operator_dot,
1734     ;
1735
1736     sub _operator_stringify {
1737         my $self = shift;
1738
1739         return __PACKAGE__ . " object for " . $self->file;
1740     }
1741
1742     # flag to make sure extracted files are processed early
1743     my $seen_non_extracted_non_age = 0;
1744
1745     sub run {
1746         # Process the input object $self.  This opens and closes the file and
1747         # calls all the handlers for it.  Currently,  this can only be called
1748         # once per file, as it destroy's the EOF handler
1749
1750         my $self = shift;
1751         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1752
1753         my $addr = main::objaddr $self;
1754
1755         my $file = $file{$addr};
1756
1757         # Don't process if not expecting this file (because released later
1758         # than this Unicode version), and isn't there.  This means if someone
1759         # copies it into an earlier version's directory, we will go ahead and
1760         # process it.
1761         return if $first_released{$addr} gt $v_version && ! -e $file;
1762
1763         # If in debugging mode and this file doesn't have the non-skip
1764         # flag set, and isn't one of the critical files, skip it.
1765         if ($debug_skip
1766             && $first_released{$addr} ne v0
1767             && ! $non_skip{$addr})
1768         {
1769             print "Skipping $file in debugging\n" if $verbosity;
1770             return;
1771         }
1772
1773         # File could be optional
1774         if ($optional{$addr}){
1775             return unless -e $file;
1776             my $result = eval $optional{$addr};
1777             if (! defined $result) {
1778                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
1779                 return;
1780             }
1781             if (! $result) {
1782                 if ($verbosity) {
1783                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
1784                 }
1785                 return;
1786             }
1787         }
1788
1789         if (! defined $file || ! -e $file) {
1790
1791             # If the file doesn't exist, see if have internal data for it
1792             # (based on first_released being 0).
1793             if ($first_released{$addr} eq v0) {
1794                 $handle{$addr} = 'pretend_is_open';
1795             }
1796             else {
1797                 if (! $optional{$addr}  # File could be optional
1798                     && $v_version ge $first_released{$addr})
1799                 {
1800                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
1801                 }
1802                 return;
1803             }
1804         }
1805         else {
1806
1807             # Here, the file exists
1808             if ($seen_non_extracted_non_age) {
1809                 if ($file =~ /$EXTRACTED/i) {
1810                     Carp::my_carp_bug(join_lines(<<END
1811 $file should be processed just after the 'Prop...Alias' files, and before
1812 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
1813 have subtle problems
1814 END
1815                     ));
1816                 }
1817             }
1818             elsif ($EXTRACTED_DIR
1819                     && $first_released{$addr} ne v0
1820                     && $file !~ /$EXTRACTED/i
1821                     && lc($file) ne 'dage.txt')
1822             {
1823                 # We don't set this (by the 'if' above) if we have no
1824                 # extracted directory, so if running on an early version,
1825                 # this test won't work.  Not worth worrying about.
1826                 $seen_non_extracted_non_age = 1;
1827             }
1828
1829             # And mark the file as having being processed, and warn if it
1830             # isn't a file we are expecting.  As we process the files,
1831             # they are deleted from the hash, so any that remain at the
1832             # end of the program are files that we didn't process.
1833             my $fkey = File::Spec->rel2abs($file);
1834             my $expecting = delete $potential_files{$fkey};
1835             $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
1836             Carp::my_carp("Was not expecting '$file'.") if
1837                     ! $expecting
1838                     && ! defined $handle{$addr};
1839
1840             # Open the file, converting the slashes used in this program
1841             # into the proper form for the OS
1842             my $file_handle;
1843             if (not open $file_handle, "<", $file) {
1844                 Carp::my_carp("Can't open $file.  Skipping: $!");
1845                 return 0;
1846             }
1847             $handle{$addr} = $file_handle; # Cache the open file handle
1848         }
1849
1850         if ($verbosity >= $PROGRESS) {
1851             if ($progress_message{$addr}) {
1852                 print "$progress_message{$addr}\n";
1853             }
1854             else {
1855                 # If using a virtual file, say so.
1856                 print "Processing ", (-e $file)
1857                                        ? $file
1858                                        : "substitute $file",
1859                                      "\n";
1860             }
1861         }
1862
1863
1864         # Call any special handler for before the file.
1865         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
1866
1867         # Then the main handler
1868         &{$handler{$addr}}($self);
1869
1870         # Then any special post-file handler.
1871         &{$post_handler{$addr}}($self) if $post_handler{$addr};
1872
1873         # If any errors have been accumulated, output the counts (as the first
1874         # error message in each class was output when it was encountered).
1875         if ($errors{$addr}) {
1876             my $total = 0;
1877             my $types = 0;
1878             foreach my $error (keys %{$errors{$addr}}) {
1879                 $total += $errors{$addr}->{$error};
1880                 delete $errors{$addr}->{$error};
1881                 $types++;
1882             }
1883             if ($total > 1) {
1884                 my $message
1885                         = "A total of $total lines had errors in $file.  ";
1886
1887                 $message .= ($types == 1)
1888                             ? '(Only the first one was displayed.)'
1889                             : '(Only the first of each type was displayed.)';
1890                 Carp::my_carp($message);
1891             }
1892         }
1893
1894         if (@{$missings{$addr}}) {
1895             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
1896         }
1897
1898         # If a real file handle, close it.
1899         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
1900                                                         ref $handle{$addr};
1901         $handle{$addr} = "";   # Uses empty to indicate that has already seen
1902                                # the file, as opposed to undef
1903         return;
1904     }
1905
1906     sub next_line {
1907         # Sets $_ to be the next logical input line, if any.  Returns non-zero
1908         # if such a line exists.  'logical' means that any lines that have
1909         # been added via insert_lines() will be returned in $_ before the file
1910         # is read again.
1911
1912         my $self = shift;
1913         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1914
1915         my $addr = main::objaddr $self;
1916
1917         # Here the file is open (or if the handle is not a ref, is an open
1918         # 'virtual' file).  Get the next line; any inserted lines get priority
1919         # over the file itself.
1920         my $adjusted;
1921
1922         LINE:
1923         while (1) { # Loop until find non-comment, non-empty line
1924             #local $to_trace = 1 if main::DEBUG;
1925             my $inserted_ref = shift @{$added_lines{$addr}};
1926             if (defined $inserted_ref) {
1927                 ($adjusted, $_) = @{$inserted_ref};
1928                 trace $adjusted, $_ if main::DEBUG && $to_trace;
1929                 return 1 if $adjusted;
1930             }
1931             else {
1932                 last if ! ref $handle{$addr}; # Don't read unless is real file
1933                 last if ! defined ($_ = readline $handle{$addr});
1934             }
1935             chomp;
1936             trace $_ if main::DEBUG && $to_trace;
1937
1938             # See if this line is the comment line that defines what property
1939             # value that code points that are not listed in the file should
1940             # have.  The format or existence of these lines is not guaranteed
1941             # by Unicode since they are comments, but the documentation says
1942             # that this was added for machine-readability, so probably won't
1943             # change.  This works starting in Unicode Version 5.0.  They look
1944             # like:
1945             #
1946             # @missing: 0000..10FFFF; Not_Reordered
1947             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
1948             # @missing: 0000..10FFFF; ; NaN
1949             #
1950             # Save the line for a later get_missings() call.
1951             if (/$missing_defaults_prefix/) {
1952                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
1953                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
1954                 }
1955                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
1956                     my @defaults = split /\s* ; \s*/x, $_;
1957
1958                     # The first field is the @missing, which ends in a
1959                     # semi-colon, so can safely shift.
1960                     shift @defaults;
1961
1962                     # Some of these lines may have empty field placeholders
1963                     # which get in the way.  An example is:
1964                     # @missing: 0000..10FFFF; ; NaN
1965                     # Remove them.  Process starting from the top so the
1966                     # splice doesn't affect things still to be looked at.
1967                     for (my $i = @defaults - 1; $i >= 0; $i--) {
1968                         next if $defaults[$i] ne "";
1969                         splice @defaults, $i, 1;
1970                     }
1971
1972                     # What's left should be just the property (maybe) and the
1973                     # default.  Having only one element means it doesn't have
1974                     # the property.
1975                     my $default;
1976                     my $property;
1977                     if (@defaults >= 1) {
1978                         if (@defaults == 1) {
1979                             $default = $defaults[0];
1980                         }
1981                         else {
1982                             $property = $defaults[0];
1983                             $default = $defaults[1];
1984                         }
1985                     }
1986
1987                     if (@defaults < 1
1988                         || @defaults > 2
1989                         || ($default =~ /^</
1990                             && $default !~ /^<code *point>$/i
1991                             && $default !~ /^<none>$/i))
1992                     {
1993                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
1994                     }
1995                     else {
1996
1997                         # If the property is missing from the line, it should
1998                         # be the one for the whole file
1999                         $property = $property{$addr} if ! defined $property;
2000
2001                         # Change <none> to the null string, which is what it
2002                         # really means.  If the default is the code point
2003                         # itself, set it to <code point>, which is what
2004                         # Unicode uses (but sometimes they've forgotten the
2005                         # space)
2006                         if ($default =~ /^<none>$/i) {
2007                             $default = "";
2008                         }
2009                         elsif ($default =~ /^<code *point>$/i) {
2010                             $default = $CODE_POINT;
2011                         }
2012
2013                         # Store them as a sub-arrays with both components.
2014                         push @{$missings{$addr}}, [ $default, $property ];
2015                     }
2016                 }
2017
2018                 # There is nothing for the caller to process on this comment
2019                 # line.
2020                 next;
2021             }
2022
2023             # Remove comments and trailing space, and skip this line if the
2024             # result is empty
2025             s/#.*//;
2026             s/\s+$//;
2027             next if /^$/;
2028
2029             # Call any handlers for this line, and skip further processing of
2030             # the line if the handler sets the line to null.
2031             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2032                 &{$sub_ref}($self);
2033                 next LINE if /^$/;
2034             }
2035
2036             # Here the line is ok.  return success.
2037             return 1;
2038         } # End of looping through lines.
2039
2040         # If there is an EOF handler, call it (only once) and if it generates
2041         # more lines to process go back in the loop to handle them.
2042         if ($eof_handler{$addr}) {
2043             &{$eof_handler{$addr}}($self);
2044             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2045             goto LINE if $added_lines{$addr};
2046         }
2047
2048         # Return failure -- no more lines.
2049         return 0;
2050
2051     }
2052
2053 #   Not currently used, not fully tested.
2054 #    sub peek {
2055 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2056 #        # record.  Not callable from an each_line_handler(), nor does it call
2057 #        # an each_line_handler() on the line.
2058 #
2059 #        my $self = shift;
2060 #        my $addr = main::objaddr $self;
2061 #
2062 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2063 #            my ($adjusted, $line) = @{$inserted_ref};
2064 #            next if $adjusted;
2065 #
2066 #            # Remove comments and trailing space, and return a non-empty
2067 #            # resulting line
2068 #            $line =~ s/#.*//;
2069 #            $line =~ s/\s+$//;
2070 #            return $line if $line ne "";
2071 #        }
2072 #
2073 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2074 #        while (1) { # Loop until find non-comment, non-empty line
2075 #            local $to_trace = 1 if main::DEBUG;
2076 #            trace $_ if main::DEBUG && $to_trace;
2077 #            return if ! defined (my $line = readline $handle{$addr});
2078 #            chomp $line;
2079 #            push @{$added_lines{$addr}}, [ 0, $line ];
2080 #
2081 #            $line =~ s/#.*//;
2082 #            $line =~ s/\s+$//;
2083 #            return $line if $line ne "";
2084 #        }
2085 #
2086 #        return;
2087 #    }
2088
2089
2090     sub insert_lines {
2091         # Lines can be inserted so that it looks like they were in the input
2092         # file at the place it was when this routine is called.  See also
2093         # insert_adjusted_lines().  Lines inserted via this routine go through
2094         # any each_line_handler()
2095
2096         my $self = shift;
2097
2098         # Each inserted line is an array, with the first element being 0 to
2099         # indicate that this line hasn't been adjusted, and needs to be
2100         # processed.
2101         push @{$added_lines{main::objaddr $self}}, map { [ 0, $_ ] } @_;
2102         return;
2103     }
2104
2105     sub insert_adjusted_lines {
2106         # Lines can be inserted so that it looks like they were in the input
2107         # file at the place it was when this routine is called.  See also
2108         # insert_lines().  Lines inserted via this routine are already fully
2109         # adjusted, ready to be processed; each_line_handler()s handlers will
2110         # not be called.  This means this is not a completely general
2111         # facility, as only the last each_line_handler on the stack should
2112         # call this.  It could be made more general, by passing to each of the
2113         # line_handlers their position on the stack, which they would pass on
2114         # to this routine, and that would replace the boolean first element in
2115         # the anonymous array pushed here, so that the next_line routine could
2116         # use that to call only those handlers whose index is after it on the
2117         # stack.  But this is overkill for what is needed now.
2118
2119         my $self = shift;
2120         trace $_[0] if main::DEBUG && $to_trace;
2121
2122         # Each inserted line is an array, with the first element being 1 to
2123         # indicate that this line has been adjusted
2124         push @{$added_lines{main::objaddr $self}}, map { [ 1, $_ ] } @_;
2125         return;
2126     }
2127
2128     sub get_missings {
2129         # Returns the stored up @missings lines' values, and clears the list.
2130         # The values are in an array, consisting of the default in the first
2131         # element, and the property in the 2nd.  However, since these lines
2132         # can be stacked up, the return is an array of all these arrays.
2133
2134         my $self = shift;
2135         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2136
2137         my $addr = main::objaddr $self;
2138
2139         # If not accepting a list return, just return the first one.
2140         return shift @{$missings{$addr}} unless wantarray;
2141
2142         my @return = @{$missings{$addr}};
2143         undef @{$missings{$addr}};
2144         return @return;
2145     }
2146
2147     sub _insert_property_into_line {
2148         # Add a property field to $_, if this file requires it.
2149
2150         my $property = $property{main::objaddr shift};
2151         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2152
2153         $_ =~ s/(;|$)/; $property$1/;
2154         return;
2155     }
2156
2157     sub carp_bad_line {
2158         # Output consistent error messages, using either a generic one, or the
2159         # one given by the optional parameter.  To avoid gazillions of the
2160         # same message in case the syntax of a  file is way off, this routine
2161         # only outputs the first instance of each message, incrementing a
2162         # count so the totals can be output at the end of the file.
2163
2164         my $self = shift;
2165         my $message = shift;
2166         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2167
2168         my $addr = main::objaddr $self;
2169
2170         $message = 'Unexpected line' unless $message;
2171
2172         # No trailing punctuation so as to fit with our addenda.
2173         $message =~ s/[.:;,]$//;
2174
2175         # If haven't seen this exact message before, output it now.  Otherwise
2176         # increment the count of how many times it has occurred
2177         unless ($errors{$addr}->{$message}) {
2178             Carp::my_carp("$message in '$_' in "
2179                             . $file{main::objaddr $self}
2180                             . " at line $..  Skipping this line;");
2181             $errors{$addr}->{$message} = 1;
2182         }
2183         else {
2184             $errors{$addr}->{$message}++;
2185         }
2186
2187         # Clear the line to prevent any further (meaningful) processing of it.
2188         $_ = "";
2189
2190         return;
2191     }
2192 } # End closure
2193
2194 package Multi_Default;
2195
2196 # Certain properties in early versions of Unicode had more than one possible
2197 # default for code points missing from the files.  In these cases, one
2198 # default applies to everything left over after all the others are applied,
2199 # and for each of the others, there is a description of which class of code
2200 # points applies to it.  This object helps implement this by storing the
2201 # defaults, and for all but that final default, an eval string that generates
2202 # the class that it applies to.
2203
2204
2205 {   # Closure
2206
2207     main::setup_package();
2208
2209     my %class_defaults;
2210     # The defaults structure for the classes
2211     main::set_access('class_defaults', \%class_defaults);
2212
2213     my %other_default;
2214     # The default that applies to everything left over.
2215     main::set_access('other_default', \%other_default, 'r');
2216
2217
2218     sub new {
2219         # The constructor is called with default => eval pairs, terminated by
2220         # the left-over default. e.g.
2221         # Multi_Default->new(
2222         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2223         #               -  0x200D',
2224         #        'R' => 'some other expression that evaluates to code points',
2225         #        .
2226         #        .
2227         #        .
2228         #        'U'));
2229
2230         my $class = shift;
2231
2232         my $self = bless \do{my $anonymous_scalar}, $class;
2233         my $addr = main::objaddr($self);
2234
2235         while (@_ > 1) {
2236             my $default = shift;
2237             my $eval = shift;
2238             $class_defaults{$addr}->{$default} = $eval;
2239         }
2240
2241         $other_default{$addr} = shift;
2242
2243         return $self;
2244     }
2245
2246     sub get_next_defaults {
2247         # Iterates and returns the next class of defaults.
2248         my $self = shift;
2249         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2250
2251         my $addr = main::objaddr $self;
2252
2253         return each %{$class_defaults{$addr}};
2254     }
2255 }
2256
2257 package Alias;
2258
2259 # An alias is one of the names that a table goes by.  This class defines them
2260 # including some attributes.  Everything is currently setup in the
2261 # constructor.
2262
2263
2264 {   # Closure
2265
2266     main::setup_package();
2267
2268     my %name;
2269     main::set_access('name', \%name, 'r');
2270
2271     my %loose_match;
2272     # Determined by the constructor code if this name should match loosely or
2273     # not.  The constructor parameters can override this, but it isn't fully
2274     # implemented, as should have ability to override Unicode one's via
2275     # something like a set_loose_match()
2276     main::set_access('loose_match', \%loose_match, 'r');
2277
2278     my %make_pod_entry;
2279     # Some aliases should not get their own entries because they are covered
2280     # by a wild-card, and some we want to discourage use of.  Binary
2281     main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2282
2283     my %status;
2284     # Aliases have a status, like deprecated, or even suppressed (which means
2285     # they don't appear in documentation).  Enum
2286     main::set_access('status', \%status, 'r');
2287
2288     my %externally_ok;
2289     # Similarly, some aliases should not be considered as usable ones for
2290     # external use, such as file names, or we don't want documentation to
2291     # recommend them.  Boolean
2292     main::set_access('externally_ok', \%externally_ok, 'r');
2293
2294     sub new {
2295         my $class = shift;
2296
2297         my $self = bless \do { my $anonymous_scalar }, $class;
2298         my $addr = main::objaddr($self);
2299
2300         $name{$addr} = shift;
2301         $loose_match{$addr} = shift;
2302         $make_pod_entry{$addr} = shift;
2303         $externally_ok{$addr} = shift;
2304         $status{$addr} = shift;
2305
2306         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2307
2308         # Null names are never ok externally
2309         $externally_ok{$addr} = 0 if $name{$addr} eq "";
2310
2311         return $self;
2312     }
2313 }
2314
2315 package Range;
2316
2317 # A range is the basic unit for storing code points, and is described in the
2318 # comments at the beginning of the program.  Each range has a starting code
2319 # point; an ending code point (not less than the starting one); a value
2320 # that applies to every code point in between the two end-points, inclusive;
2321 # and an enum type that applies to the value.  The type is for the user's
2322 # convenience, and has no meaning here, except that a non-zero type is
2323 # considered to not obey the normal Unicode rules for having standard forms.
2324 #
2325 # The same structure is used for both map and match tables, even though in the
2326 # latter, the value (and hence type) is irrelevant and could be used as a
2327 # comment.  In map tables, the value is what all the code points in the range
2328 # map to.  Type 0 values have the standardized version of the value stored as
2329 # well, so as to not have to recalculate it a lot.
2330
2331 sub trace { return main::trace(@_); }
2332
2333 {   # Closure
2334
2335     main::setup_package();
2336
2337     my %start;
2338     main::set_access('start', \%start, 'r', 's');
2339
2340     my %end;
2341     main::set_access('end', \%end, 'r', 's');
2342
2343     my %value;
2344     main::set_access('value', \%value, 'r');
2345
2346     my %type;
2347     main::set_access('type', \%type, 'r');
2348
2349     my %standard_form;
2350     # The value in internal standard form.  Defined only if the type is 0.
2351     main::set_access('standard_form', \%standard_form);
2352
2353     # Note that if these fields change, the dump() method should as well
2354
2355     sub new {
2356         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2357         my $class = shift;
2358
2359         my $self = bless \do { my $anonymous_scalar }, $class;
2360         my $addr = main::objaddr($self);
2361
2362         $start{$addr} = shift;
2363         $end{$addr} = shift;
2364
2365         my %args = @_;
2366
2367         my $value = delete $args{'Value'};  # Can be 0
2368         $value = "" unless defined $value;
2369         $value{$addr} = $value;
2370
2371         $type{$addr} = delete $args{'Type'} || 0;
2372
2373         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2374
2375         if (! $type{$addr}) {
2376             $standard_form{$addr} = main::standardize($value);
2377         }
2378
2379         return $self;
2380     }
2381
2382     use overload
2383         fallback => 0,
2384         qw("") => "_operator_stringify",
2385         "." => \&main::_operator_dot,
2386     ;
2387
2388     sub _operator_stringify {
2389         my $self = shift;
2390         my $addr = main::objaddr $self;
2391
2392         # Output it like '0041..0065 (value)'
2393         my $return = sprintf("%04X", $start{$addr})
2394                         .  '..'
2395                         . sprintf("%04X", $end{$addr});
2396         my $value = $value{$addr};
2397         my $type = $type{$addr};
2398         $return .= ' (';
2399         $return .= "$value";
2400         $return .= ", Type=$type" if $type != 0;
2401         $return .= ')';
2402
2403         return $return;
2404     }
2405
2406     sub standard_form {
2407         # The standard form is the value itself if the standard form is
2408         # undefined (that is if the value is special)
2409
2410         my $self = shift;
2411         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2412
2413         my $addr = main::objaddr $self;
2414
2415         return $standard_form{$addr} if defined $standard_form{$addr};
2416         return $value{$addr};
2417     }
2418
2419     sub dump {
2420         # Human, not machine readable.  For machine readable, comment out this
2421         # entire routine and let the standard one take effect.
2422         my $self = shift;
2423         my $indent = shift;
2424         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2425
2426         my $addr = main::objaddr $self;
2427
2428         my $return = $indent
2429                     . sprintf("%04X", $start{$addr})
2430                     . '..'
2431                     . sprintf("%04X", $end{$addr})
2432                     . " '$value{$addr}';";
2433         if (! defined $standard_form{$addr}) {
2434             $return .= "(type=$type{$addr})";
2435         }
2436         elsif ($standard_form{$addr} ne $value{$addr}) {
2437             $return .= "(standard '$standard_form{$addr}')";
2438         }
2439         return $return;
2440     }
2441 } # End closure
2442
2443 package _Range_List_Base;
2444
2445 # Base class for range lists.  A range list is simply an ordered list of
2446 # ranges, so that the ranges with the lowest starting numbers are first in it.
2447 #
2448 # When a new range is added that is adjacent to an existing range that has the
2449 # same value and type, it merges with it to form a larger range.
2450 #
2451 # Ranges generally do not overlap, except that there can be multiple entries
2452 # of single code point ranges.  This is because of NameAliases.txt.
2453 #
2454 # In this program, there is a standard value such that if two different
2455 # values, have the same standard value, they are considered equivalent.  This
2456 # value was chosen so that it gives correct results on Unicode data
2457
2458 # There are a number of methods to manipulate range lists, and some operators
2459 # are overloaded to handle them.
2460
2461 # Because of the slowness of pure Perl objaddr() on miniperl, and measurements
2462 # showing this package was using a lot of real time calculating that, the code
2463 # was changed to only calculate it once per call stack.  This is done by
2464 # consistently using the package variable $addr in routines, and only calling
2465 # objaddr() if it isn't defined, and setting that to be local, so that callees
2466 # will have it already.  It would be a good thing to change this. XXX
2467
2468 sub trace { return main::trace(@_); }
2469
2470 { # Closure
2471
2472     our $addr;
2473
2474     main::setup_package();
2475
2476     my %ranges;
2477     # The list of ranges
2478     main::set_access('ranges', \%ranges, 'readable_array');
2479
2480     my %max;
2481     # The highest code point in the list.  This was originally a method, but
2482     # actual measurements said it was used a lot.
2483     main::set_access('max', \%max, 'r');
2484
2485     my %each_range_iterator;
2486     # Iterator position for each_range()
2487     main::set_access('each_range_iterator', \%each_range_iterator);
2488
2489     my %owner_name_of;
2490     # Name of parent this is attached to, if any.  Solely for better error
2491     # messages.
2492     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2493
2494     my %_search_ranges_cache;
2495     # A cache of the previous result from _search_ranges(), for better
2496     # performance
2497     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2498
2499     sub new {
2500         my $class = shift;
2501         my %args = @_;
2502
2503         # Optional initialization data for the range list.
2504         my $initialize = delete $args{'Initialize'};
2505
2506         my $self;
2507
2508         # Use _union() to initialize.  _union() returns an object of this
2509         # class, which means that it will call this constructor recursively.
2510         # But it won't have this $initialize parameter so that it won't
2511         # infinitely loop on this.
2512         return _union($class, $initialize, %args) if defined $initialize;
2513
2514         $self = bless \do { my $anonymous_scalar }, $class;
2515         local $addr = main::objaddr($self);
2516
2517         # Optional parent object, only for debug info.
2518         $owner_name_of{$addr} = delete $args{'Owner'};
2519         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2520
2521         # Stringify, in case it is an object.
2522         $owner_name_of{$addr} = "$owner_name_of{$addr}";
2523
2524         # This is used only for error messages, and so a colon is added
2525         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2526
2527         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2528
2529         # Max is initialized to a negative value that isn't adjacent to 0,
2530         # for simpler tests
2531         $max{$addr} = -2;
2532
2533         $_search_ranges_cache{$addr} = 0;
2534         $ranges{$addr} = [];
2535
2536         return $self;
2537     }
2538
2539     use overload
2540         fallback => 0,
2541         qw("") => "_operator_stringify",
2542         "." => \&main::_operator_dot,
2543     ;
2544
2545     sub _operator_stringify {
2546         my $self = shift;
2547         local $addr = main::objaddr($self) if !defined $addr;
2548
2549         return "Range_List attached to '$owner_name_of{$addr}'"
2550                                                 if $owner_name_of{$addr};
2551         return "anonymous Range_List " . \$self;
2552     }
2553
2554     sub _union {
2555         # Returns the union of the input code points.  It can be called as
2556         # either a constructor or a method.  If called as a method, the result
2557         # will be a new() instance of the calling object, containing the union
2558         # of that object with the other parameter's code points;  if called as
2559         # a constructor, the first parameter gives the class the new object
2560         # should be, and the second parameter gives the code points to go into
2561         # it.
2562         # In either case, there are two parameters looked at by this routine;
2563         # any additional parameters are passed to the new() constructor.
2564         #
2565         # The code points can come in the form of some object that contains
2566         # ranges, and has a conventionally named method to access them; or
2567         # they can be an array of individual code points (as integers); or
2568         # just a single code point.
2569         #
2570         # If they are ranges, this routine doesn't make any effort to preserve
2571         # the range values of one input over the other.  Therefore this base
2572         # class should not allow _union to be called from other than
2573         # initialization code, so as to prevent two tables from being added
2574         # together where the range values matter.  The general form of this
2575         # routine therefore belongs in a derived class, but it was moved here
2576         # to avoid duplication of code.  The failure to overload this in this
2577         # class keeps it safe.
2578         #
2579
2580         my $self;
2581         my @args;   # Arguments to pass to the constructor
2582
2583         my $class = shift;
2584
2585         # If a method call, will start the union with the object itself, and
2586         # the class of the new object will be the same as self.
2587         if (ref $class) {
2588             $self = $class;
2589             $class = ref $self;
2590             push @args, $self;
2591         }
2592
2593         # Add the other required parameter.
2594         push @args, shift;
2595         # Rest of parameters are passed on to the constructor
2596
2597         # Accumulate all records from both lists.
2598         my @records;
2599         for my $arg (@args) {
2600             #local $to_trace = 0 if main::DEBUG;
2601             trace "argument = $arg" if main::DEBUG && $to_trace;
2602             if (! defined $arg) {
2603                 my $message = "";
2604                 if (defined $self) {
2605                     $message .= $owner_name_of{main::objaddr $self};
2606                 }
2607                 Carp::my_carp_bug($message .= "Undefined argument to _union.  No union done.");
2608                 return;
2609             }
2610             $arg = [ $arg ] if ! ref $arg;
2611             my $type = ref $arg;
2612             if ($type eq 'ARRAY') {
2613                 foreach my $element (@$arg) {
2614                     push @records, Range->new($element, $element);
2615                 }
2616             }
2617             elsif ($arg->isa('Range')) {
2618                 push @records, $arg;
2619             }
2620             elsif ($arg->can('ranges')) {
2621                 push @records, $arg->ranges;
2622             }
2623             else {
2624                 my $message = "";
2625                 if (defined $self) {
2626                     $message .= $owner_name_of{main::objaddr $self};
2627                 }
2628                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
2629                 return;
2630             }
2631         }
2632
2633         # Sort with the range containing the lowest ordinal first, but if
2634         # two ranges start at the same code point, sort with the bigger range
2635         # of the two first, because it takes fewer cycles.
2636         @records = sort { ($a->start <=> $b->start)
2637                                       or
2638                                     # if b is shorter than a, b->end will be
2639                                     # less than a->end, and we want to select
2640                                     # a, so want to return -1
2641                                     ($b->end <=> $a->end)
2642                                    } @records;
2643
2644         my $new = $class->new(@_);
2645
2646         # Fold in records so long as they add new information.
2647         for my $set (@records) {
2648             my $start = $set->start;
2649             my $end   = $set->end;
2650             my $value   = $set->value;
2651             if ($start > $new->max) {
2652                 $new->_add_delete('+', $start, $end, $value);
2653             }
2654             elsif ($end > $new->max) {
2655                 $new->_add_delete('+', $new->max +1, $end, $value);
2656             }
2657         }
2658
2659         return $new;
2660     }
2661
2662     sub range_count {        # Return the number of ranges in the range list
2663         my $self = shift;
2664         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2665
2666         local $addr = main::objaddr($self) if ! defined $addr;
2667
2668         return scalar @{$ranges{$addr}};
2669     }
2670
2671     sub min {
2672         # Returns the minimum code point currently in the range list, or if
2673         # the range list is empty, 2 beyond the max possible.  This is a
2674         # method because used so rarely, that not worth saving between calls,
2675         # and having to worry about changing it as ranges are added and
2676         # deleted.
2677
2678         my $self = shift;
2679         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2680
2681         local $addr = main::objaddr($self) if ! defined $addr;
2682
2683         # If the range list is empty, return a large value that isn't adjacent
2684         # to any that could be in the range list, for simpler tests
2685         return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2686         return $ranges{$addr}->[0]->start;
2687     }
2688
2689     sub contains {
2690         # Boolean: Is argument in the range list?  If so returns $i such that:
2691         #   range[$i]->end < $codepoint <= range[$i+1]->end
2692         # which is one beyond what you want; this is so that the 0th range
2693         # doesn't return false
2694         my $self = shift;
2695         my $codepoint = shift;
2696         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2697
2698         local $addr = main::objaddr $self if ! defined $addr;
2699
2700         my $i = $self->_search_ranges($codepoint);
2701         return 0 unless defined $i;
2702
2703         # The search returns $i, such that
2704         #   range[$i-1]->end < $codepoint <= range[$i]->end
2705         # So is in the table if and only iff it is at least the start position
2706         # of range $i.
2707         return 0 if $ranges{$addr}->[$i]->start > $codepoint;
2708         return $i + 1;
2709     }
2710
2711     sub value_of {
2712         # Returns the value associated with the code point, undef if none
2713
2714         my $self = shift;
2715         my $codepoint = shift;
2716         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2717
2718         local $addr = main::objaddr $self if ! defined $addr;
2719
2720         my $i = $self->contains($codepoint);
2721         return unless $i;
2722
2723         # contains() returns 1 beyond where we should look
2724         return $ranges{$addr}->[$i-1]->value;
2725     }
2726
2727     sub _search_ranges {
2728         # Find the range in the list which contains a code point, or where it
2729         # should go if were to add it.  That is, it returns $i, such that:
2730         #   range[$i-1]->end < $codepoint <= range[$i]->end
2731         # Returns undef if no such $i is possible (e.g. at end of table), or
2732         # if there is an error.
2733
2734         my $self = shift;
2735         my $code_point = shift;
2736         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2737
2738         local $addr = main::objaddr $self if ! defined $addr;
2739
2740         return if $code_point > $max{$addr};
2741         my $r = $ranges{$addr};                # The current list of ranges
2742         my $range_list_size = scalar @$r;
2743         my $i;
2744
2745         use integer;        # want integer division
2746
2747         # Use the cached result as the starting guess for this one, because,
2748         # an experiment on 5.1 showed that 90% of the time the cache was the
2749         # same as the result on the next call (and 7% it was one less).
2750         $i = $_search_ranges_cache{$addr};
2751         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
2752                                             # from an intervening deletion
2753         #local $to_trace = 1 if main::DEBUG;
2754         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
2755         return $i if $code_point <= $r->[$i]->end
2756                      && ($i == 0 || $r->[$i-1]->end < $code_point);
2757
2758         # Here the cache doesn't yield the correct $i.  Try adding 1.
2759         if ($i < $range_list_size - 1
2760             && $r->[$i]->end < $code_point &&
2761             $code_point <= $r->[$i+1]->end)
2762         {
2763             $i++;
2764             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
2765             $_search_ranges_cache{$addr} = $i;
2766             return $i;
2767         }
2768
2769         # Here, adding 1 also didn't work.  We do a binary search to
2770         # find the correct position, starting with current $i
2771         my $lower = 0;
2772         my $upper = $range_list_size - 1;
2773         while (1) {
2774             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
2775
2776             if ($code_point <= $r->[$i]->end) {
2777
2778                 # Here we have met the upper constraint.  We can quit if we
2779                 # also meet the lower one.
2780                 last if $i == 0 || $r->[$i-1]->end < $code_point;
2781
2782                 $upper = $i;        # Still too high.
2783
2784             }
2785             else {
2786
2787                 # Here, $r[$i]->end < $code_point, so look higher up.
2788                 $lower = $i;
2789             }
2790
2791             # Split search domain in half to try again.
2792             my $temp = ($upper + $lower) / 2;
2793
2794             # No point in continuing unless $i changes for next time
2795             # in the loop.
2796             if ($temp == $i) {
2797
2798                 # We can't reach the highest element because of the averaging.
2799                 # So if one below the upper edge, force it there and try one
2800                 # more time.
2801                 if ($i == $range_list_size - 2) {
2802
2803                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
2804                     $i = $range_list_size - 1;
2805
2806                     # Change $lower as well so if fails next time through,
2807                     # taking the average will yield the same $i, and we will
2808                     # quit with the error message just below.
2809                     $lower = $i;
2810                     next;
2811                 }
2812                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
2813                 return;
2814             }
2815             $i = $temp;
2816         } # End of while loop
2817
2818         if (main::DEBUG && $to_trace) {
2819             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
2820             trace "i=  [ $i ]", $r->[$i];
2821             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
2822         }
2823
2824         # Here we have found the offset.  Cache it as a starting point for the
2825         # next call.
2826         $_search_ranges_cache{$addr} = $i;
2827         return $i;
2828     }
2829
2830     sub _add_delete {
2831         # Add, replace or delete ranges to or from a list.  The $type
2832         # parameter gives which:
2833         #   '+' => insert or replace a range, returning a list of any changed
2834         #          ranges.
2835         #   '-' => delete a range, returning a list of any deleted ranges.
2836         #
2837         # The next three parameters give respectively the start, end, and
2838         # value associated with the range.  'value' should be null unless the
2839         # operation is '+';
2840         #
2841         # The range list is kept sorted so that the range with the lowest
2842         # starting position is first in the list, and generally, adjacent
2843         # ranges with the same values are merged into single larger one (see
2844         # exceptions below).
2845         #
2846         # There are more parameters, all are key => value pairs:
2847         #   Type    gives the type of the value.  It is only valid for '+'.
2848         #           All ranges have types; if this parameter is omitted, 0 is
2849         #           assumed.  Ranges with type 0 are assumed to obey the
2850         #           Unicode rules for casing, etc; ranges with other types are
2851         #           not.  Otherwise, the type is arbitrary, for the caller's
2852         #           convenience, and looked at only by this routine to keep
2853         #           adjacent ranges of different types from being merged into
2854         #           a single larger range, and when Replace =>
2855         #           $IF_NOT_EQUIVALENT is specified (see just below).
2856         #   Replace  determines what to do if the range list already contains
2857         #            ranges which coincide with all or portions of the input
2858         #            range.  It is only valid for '+':
2859         #       => $NO            means that the new value is not to replace
2860         #                         any existing ones, but any empty gaps of the
2861         #                         range list coinciding with the input range
2862         #                         will be filled in with the new value.
2863         #       => $UNCONDITIONALLY  means to replace the existing values with
2864         #                         this one unconditionally.  However, if the
2865         #                         new and old values are identical, the
2866         #                         replacement is skipped to save cycles
2867         #       => $IF_NOT_EQUIVALENT means to replace the existing values
2868         #                         with this one if they are not equivalent.
2869         #                         Ranges are equivalent if their types are the
2870         #                         same, and they are the same string, or if
2871         #                         both are type 0 ranges, if their Unicode
2872         #                         standard forms are identical.  In this last
2873         #                         case, the routine chooses the more "modern"
2874         #                         one to use.  This is because some of the
2875         #                         older files are formatted with values that
2876         #                         are, for example, ALL CAPs, whereas the
2877         #                         derived files have a more modern style,
2878         #                         which looks better.  By looking for this
2879         #                         style when the pre-existing and replacement
2880         #                         standard forms are the same, we can move to
2881         #                         the modern style
2882         #       => $MULTIPLE      means that if this range duplicates an
2883         #                         existing one, but has a different value,
2884         #                         don't replace the existing one, but insert
2885         #                         this, one so that the same range can occur
2886         #                         multiple times.
2887         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
2888         #
2889         # "same value" means identical for type-0 ranges, and it means having
2890         # the same standard forms for non-type-0 ranges.
2891
2892         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
2893
2894         my $self = shift;
2895         my $operation = shift;   # '+' for add/replace; '-' for delete;
2896         my $start = shift;
2897         my $end   = shift;
2898         my $value = shift;
2899
2900         my %args = @_;
2901
2902         $value = "" if not defined $value;        # warning: $value can be "0"
2903
2904         my $replace = delete $args{'Replace'};
2905         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
2906
2907         my $type = delete $args{'Type'};
2908         $type = 0 unless defined $type;
2909
2910         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2911
2912         local $addr = main::objaddr($self) if ! defined $addr;
2913
2914         if ($operation ne '+' && $operation ne '-') {
2915             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
2916             return;
2917         }
2918         unless (defined $start && defined $end) {
2919             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
2920             return;
2921         }
2922         unless ($end >= $start) {
2923             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
2924             return;
2925         }
2926         #local $to_trace = 1 if main::DEBUG;
2927
2928         if ($operation eq '-') {
2929             if ($replace != $IF_NOT_EQUIVALENT) {
2930                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
2931                 $replace = $IF_NOT_EQUIVALENT;
2932             }
2933             if ($type) {
2934                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
2935                 $type = 0;
2936             }
2937             if ($value ne "") {
2938                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
2939                 $value = "";
2940             }
2941         }
2942
2943         my $r = $ranges{$addr};               # The current list of ranges
2944         my $range_list_size = scalar @$r;     # And its size
2945         my $max = $max{$addr};                # The current high code point in
2946                                               # the list of ranges
2947
2948         # Do a special case requiring fewer machine cycles when the new range
2949         # starts after the current highest point.  The Unicode input data is
2950         # structured so this is common.
2951         if ($start > $max) {
2952
2953             trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
2954             return if $operation eq '-'; # Deleting a non-existing range is a
2955                                          # no-op
2956
2957             # If the new range doesn't logically extend the current final one
2958             # in the range list, create a new range at the end of the range
2959             # list.  (max cleverly is initialized to a negative number not
2960             # adjacent to 0 if the range list is empty, so even adding a range
2961             # to an empty range list starting at 0 will have this 'if'
2962             # succeed.)
2963             if ($start > $max + 1        # non-adjacent means can't extend.
2964                 || @{$r}[-1]->value ne $value # values differ, can't extend.
2965                 || @{$r}[-1]->type != $type # types differ, can't extend.
2966             ) {
2967                 push @$r, Range->new($start, $end,
2968                                      Value => $value,
2969                                      Type => $type);
2970             }
2971             else {
2972
2973                 # Here, the new range starts just after the current highest in
2974                 # the range list, and they have the same type and value.
2975                 # Extend the current range to incorporate the new one.
2976                 @{$r}[-1]->set_end($end);
2977             }
2978
2979             # This becomes the new maximum.
2980             $max{$addr} = $end;
2981
2982             return;
2983         }
2984         #local $to_trace = 0 if main::DEBUG;
2985
2986         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
2987
2988         # Here, the input range isn't after the whole rest of the range list.
2989         # Most likely 'splice' will be needed.  The rest of the routine finds
2990         # the needed splice parameters, and if necessary, does the splice.
2991         # First, find the offset parameter needed by the splice function for
2992         # the input range.  Note that the input range may span multiple
2993         # existing ones, but we'll worry about that later.  For now, just find
2994         # the beginning.  If the input range is to be inserted starting in a
2995         # position not currently in the range list, it must (obviously) come
2996         # just after the range below it, and just before the range above it.
2997         # Slightly less obviously, it will occupy the position currently
2998         # occupied by the range that is to come after it.  More formally, we
2999         # are looking for the position, $i, in the array of ranges, such that:
3000         #
3001         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3002         #
3003         # (The ordered relationships within existing ranges are also shown in
3004         # the equation above).  However, if the start of the input range is
3005         # within an existing range, the splice offset should point to that
3006         # existing range's position in the list; that is $i satisfies a
3007         # somewhat different equation, namely:
3008         #
3009         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3010         #
3011         # More briefly, $start can come before or after r[$i]->start, and at
3012         # this point, we don't know which it will be.  However, these
3013         # two equations share these constraints:
3014         #
3015         #   r[$i-1]->end < $start <= r[$i]->end
3016         #
3017         # And that is good enough to find $i.
3018
3019         my $i = $self->_search_ranges($start);
3020         if (! defined $i) {
3021             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3022             return;
3023         }
3024
3025         # The search function returns $i such that:
3026         #
3027         # r[$i-1]->end < $start <= r[$i]->end
3028         #
3029         # That means that $i points to the first range in the range list
3030         # that could possibly be affected by this operation.  We still don't
3031         # know if the start of the input range is within r[$i], or if it
3032         # points to empty space between r[$i-1] and r[$i].
3033         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3034
3035         # Special case the insertion of data that is not to replace any
3036         # existing data.
3037         if ($replace == $NO) {  # If $NO, has to be operation '+'
3038             #local $to_trace = 1 if main::DEBUG;
3039             trace "Doesn't replace" if main::DEBUG && $to_trace;
3040
3041             # Here, the new range is to take effect only on those code points
3042             # that aren't already in an existing range.  This can be done by
3043             # looking through the existing range list and finding the gaps in
3044             # the ranges that this new range affects, and then calling this
3045             # function recursively on each of those gaps, leaving untouched
3046             # anything already in the list.  Gather up a list of the changed
3047             # gaps first so that changes to the internal state as new ranges
3048             # are added won't be a problem.
3049             my @gap_list;
3050
3051             # First, if the starting point of the input range is outside an
3052             # existing one, there is a gap from there to the beginning of the
3053             # existing range -- add a span to fill the part that this new
3054             # range occupies
3055             if ($start < $r->[$i]->start) {
3056                 push @gap_list, Range->new($start,
3057                                            main::min($end,
3058                                                      $r->[$i]->start - 1),
3059                                            Type => $type);
3060                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3061             }
3062
3063             # Then look through the range list for other gaps until we reach
3064             # the highest range affected by the input one.
3065             my $j;
3066             for ($j = $i+1; $j < $range_list_size; $j++) {
3067                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3068                 last if $end < $r->[$j]->start;
3069
3070                 # If there is a gap between when this range starts and the
3071                 # previous one ends, add a span to fill it.  Note that just
3072                 # because there are two ranges doesn't mean there is a
3073                 # non-zero gap between them.  It could be that they have
3074                 # different values or types
3075                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3076                     push @gap_list,
3077                         Range->new($r->[$j-1]->end + 1,
3078                                    $r->[$j]->start - 1,
3079                                    Type => $type);
3080                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3081                 }
3082             }
3083
3084             # Here, we have either found an existing range in the range list,
3085             # beyond the area affected by the input one, or we fell off the
3086             # end of the loop because the input range affects the whole rest
3087             # of the range list.  In either case, $j is 1 higher than the
3088             # highest affected range.  If $j == $i, it means that there are no
3089             # affected ranges, that the entire insertion is in the gap between
3090             # r[$i-1], and r[$i], which we already have taken care of before
3091             # the loop.
3092             # On the other hand, if there are affected ranges, it might be
3093             # that there is a gap that needs filling after the final such
3094             # range to the end of the input range
3095             if ($r->[$j-1]->end < $end) {
3096                     push @gap_list, Range->new(main::max($start,
3097                                                          $r->[$j-1]->end + 1),
3098                                                $end,
3099                                                Type => $type);
3100                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3101             }
3102
3103             # Call recursively to fill in all the gaps.
3104             foreach my $gap (@gap_list) {
3105                 $self->_add_delete($operation,
3106                                    $gap->start,
3107                                    $gap->end,
3108                                    $value,
3109                                    Type => $type);
3110             }
3111
3112             return;
3113         }
3114
3115         # Here, we have taken care of the case where $replace is $NO, which
3116         # means that whatever action we now take is done unconditionally.  It
3117         # still could be that this call will result in a no-op, if duplicates
3118         # aren't allowed, and we are inserting a range that merely duplicates
3119         # data already in the range list; or also if deleting a non-existent
3120         # range.
3121         # $i still points to the first potential affected range.  Now find the
3122         # highest range affected, which will determine the length parameter to
3123         # splice.  (The input range can span multiple existing ones.)  While
3124         # we are looking through the range list, see also if this is an
3125         # insertion that will change the values of at least one of the
3126         # affected ranges.  We don't need to do this check unless this is an
3127         # insertion of non-multiples, and also since this is a boolean, we
3128         # don't need to do it if have already determined that it will make a
3129         # change; just unconditionally change them.  $cdm is created to be 1
3130         # if either of these is true. (The 'c' in the name comes from below)
3131         my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
3132         my $j;        # This will point to the highest affected range
3133
3134         # For non-zero types, the standard form is the value itself;
3135         my $standard_form = ($type) ? $value : main::standardize($value);
3136
3137         for ($j = $i; $j < $range_list_size; $j++) {
3138             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3139
3140             # If find a range that it doesn't overlap into, we can stop
3141             # searching
3142             last if $end < $r->[$j]->start;
3143
3144             # Here, overlaps the range at $j.  If the value's don't match,
3145             # and this is supposedly an insertion, it becomes a change
3146             # instead.  This is what the 'c' stands for in $cdm.
3147             if (! $cdm) {
3148                 if ($r->[$j]->standard_form ne $standard_form) {
3149                     $cdm = 1;
3150                 }
3151                 else {
3152
3153                     # Here, the two values are essentially the same.  If the
3154                     # two are actually identical, replacing wouldn't change
3155                     # anything so skip it.
3156                     my $pre_existing = $r->[$j]->value;
3157                     if ($pre_existing ne $value) {
3158
3159                         # Here the new and old standardized values are the
3160                         # same, but the non-standardized values aren't.  If
3161                         # replacing unconditionally, then replace
3162                         if( $replace == $UNCONDITIONALLY) {
3163                             $cdm = 1;
3164                         }
3165                         else {
3166
3167                             # Here, are replacing conditionally.  Decide to
3168                             # replace or not based on which appears to look
3169                             # the "nicest".  If one is mixed case and the
3170                             # other isn't, choose the mixed case one.
3171                             my $new_mixed = $value =~ /[A-Z]/
3172                                             && $value =~ /[a-z]/;
3173                             my $old_mixed = $pre_existing =~ /[A-Z]/
3174                                             && $pre_existing =~ /[a-z]/;
3175
3176                             if ($old_mixed != $new_mixed) {
3177                                 $cdm = 1 if $new_mixed;
3178                                 if (main::DEBUG && $to_trace) {
3179                                     if ($cdm) {
3180                                         trace "Replacing $pre_existing with $value";
3181                                     }
3182                                     else {
3183                                         trace "Retaining $pre_existing over $value";
3184                                     }
3185                                 }
3186                             }
3187                             else {
3188
3189                                 # Here casing wasn't different between the two.
3190                                 # If one has hyphens or underscores and the
3191                                 # other doesn't, choose the one with the
3192                                 # punctuation.
3193                                 my $new_punct = $value =~ /[-_]/;
3194                                 my $old_punct = $pre_existing =~ /[-_]/;
3195
3196                                 if ($old_punct != $new_punct) {
3197                                     $cdm = 1 if $new_punct;
3198                                     if (main::DEBUG && $to_trace) {
3199                                         if ($cdm) {
3200                                             trace "Replacing $pre_existing with $value";
3201                                         }
3202                                         else {
3203                                             trace "Retaining $pre_existing over $value";
3204                                         }
3205                                     }
3206                                 }   # else existing one is just as "good";
3207                                     # retain it to save cycles.
3208                             }
3209                         }
3210                     }
3211                 }
3212             }
3213         } # End of loop looking for highest affected range.
3214
3215         # Here, $j points to one beyond the highest range that this insertion
3216         # affects (hence to beyond the range list if that range is the final
3217         # one in the range list).
3218
3219         # The splice length is all the affected ranges.  Get it before
3220         # subtracting, for efficiency, so we don't have to later add 1.
3221         my $length = $j - $i;
3222
3223         $j--;        # $j now points to the highest affected range.
3224         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3225
3226         # If inserting a multiple record, this is where it goes, after all the
3227         # existing ones for this range.  This implies an insertion, and no
3228         # change to any existing ranges.  Note that $j can be -1 if this new
3229         # range doesn't actually duplicate any existing, and comes at the
3230         # beginning of the list, in which case we can handle it like any other
3231         # insertion, and is easier to do so.
3232         if ($replace == $MULTIPLE && $j >= 0) {
3233
3234             # This restriction could be remedied with a little extra work, but
3235             # it won't hopefully ever be necessary
3236             if ($r->[$j]->start != $r->[$j]->end) {
3237                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point.  No action taken.");
3238                 return;
3239             }
3240
3241             # Don't add an exact duplicate, as it isn't really a multiple
3242             return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
3243
3244             trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
3245             my @return = splice @$r,
3246                                 $j+1,
3247                                 0,
3248                                 Range->new($start,
3249                                            $end,
3250                                            Value => $value,
3251                                            Type => $type);
3252             if (main::DEBUG && $to_trace) {
3253                 trace "After splice:";
3254                 trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
3255                 trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
3256                 trace "j  =[", $j, "]", $r->[$j] if $j >= 0;
3257                 trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
3258                 trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
3259                 trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
3260             }
3261             return @return;
3262         }
3263
3264         # Here, have taken care of $NO and $MULTIPLE replaces.
3265         # $j points to the highest affected range.  But it can be < $i or even
3266         # -1.  These happen only if the insertion is entirely in the gap
3267         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
3268         # above exited first time through with $end < $r->[$i]->start.  (And
3269         # then we subtracted one from j)  This implies also that $start <
3270         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3271         # $start, so the entire input range is in the gap.
3272         if ($j < $i) {
3273
3274             # Here the entire input range is in the gap before $i.
3275
3276             if (main::DEBUG && $to_trace) {
3277                 if ($i) {
3278                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
3279                 }
3280                 else {
3281                     trace "Entire range is before $r->[$i]";
3282                 }
3283             }
3284             return if $operation ne '+'; # Deletion of a non-existent range is
3285                                          # a no-op
3286         }
3287         else {
3288
3289             # Here the entire input range is not in the gap before $i.  There
3290             # is an affected one, and $j points to the highest such one.
3291
3292             # At this point, here is the situation:
3293             # This is not an insertion of a multiple, nor of tentative ($NO)
3294             # data.
3295             #   $i  points to the first element in the current range list that
3296             #            may be affected by this operation.  In fact, we know
3297             #            that the range at $i is affected because we are in
3298             #            the else branch of this 'if'
3299             #   $j  points to the highest affected range.
3300             # In other words,
3301             #   r[$i-1]->end < $start <= r[$i]->end
3302             # And:
3303             #   r[$i-1]->end < $start <= $end <= r[$j]->end
3304             #
3305             # Also:
3306             #   $cdm is a boolean which is set true if and only if this is a
3307             #        change or deletion (multiple was handled above).  In
3308             #        other words, it could be renamed to be just $cd.
3309
3310             # We now have enough information to decide if this call is a no-op
3311             # or not.  It is a no-op if it is a deletion of a non-existent
3312             # range, or an insertion of already existing data.
3313
3314             if (main::DEBUG && $to_trace && ! $cdm
3315                                          && $i == $j
3316                                          && $start >= $r->[$i]->start)
3317             {
3318                     trace "no-op";
3319             }
3320             return if ! $cdm      # change or delete => not no-op
3321                       && $i == $j # more than one affected range => not no-op
3322
3323                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3324                       # Further, $start and/or $end is >= r[$i]->start
3325                       # The test below hence guarantees that
3326                       #     r[$i]->start < $start <= $end <= r[$i]->end
3327                       # This means the input range is contained entirely in
3328                       # the one at $i, so is a no-op
3329                       && $start >= $r->[$i]->start;
3330         }
3331
3332         # Here, we know that some action will have to be taken.  We have
3333         # calculated the offset and length (though adjustments may be needed)
3334         # for the splice.  Now start constructing the replacement list.
3335         my @replacement;
3336         my $splice_start = $i;
3337
3338         my $extends_below;
3339         my $extends_above;
3340
3341         # See if should extend any adjacent ranges.
3342         if ($operation eq '-') { # Don't extend deletions
3343             $extends_below = $extends_above = 0;
3344         }
3345         else {  # Here, should extend any adjacent ranges.  See if there are
3346                 # any.
3347             $extends_below = ($i > 0
3348                             # can't extend unless adjacent
3349                             && $r->[$i-1]->end == $start -1
3350                             # can't extend unless are same standard value
3351                             && $r->[$i-1]->standard_form eq $standard_form
3352                             # can't extend unless share type
3353                             && $r->[$i-1]->type == $type);
3354             $extends_above = ($j+1 < $range_list_size
3355                             && $r->[$j+1]->start == $end +1
3356                             && $r->[$j+1]->standard_form eq $standard_form
3357                             && $r->[$j-1]->type == $type);
3358         }
3359         if ($extends_below && $extends_above) { # Adds to both
3360             $splice_start--;     # start replace at element below
3361             $length += 2;        # will replace on both sides
3362             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3363
3364             # The result will fill in any gap, replacing both sides, and
3365             # create one large range.
3366             @replacement = Range->new($r->[$i-1]->start,
3367                                       $r->[$j+1]->end,
3368                                       Value => $value,
3369                                       Type => $type);
3370         }
3371         else {
3372
3373             # Here we know that the result won't just be the conglomeration of
3374             # a new range with both its adjacent neighbors.  But it could
3375             # extend one of them.
3376
3377             if ($extends_below) {
3378
3379                 # Here the new element adds to the one below, but not to the
3380                 # one above.  If inserting, and only to that one range,  can
3381                 # just change its ending to include the new one.
3382                 if ($length == 0 && ! $cdm) {
3383                     $r->[$i-1]->set_end($end);
3384                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3385                     return;
3386                 }
3387                 else {
3388                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3389                     $splice_start--;        # start replace at element below
3390                     $length++;              # will replace the element below
3391                     $start = $r->[$i-1]->start;
3392                 }
3393             }
3394             elsif ($extends_above) {
3395
3396                 # Here the new element adds to the one above, but not below.
3397                 # Mirror the code above
3398                 if ($length == 0 && ! $cdm) {
3399                     $r->[$j+1]->set_start($start);
3400                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3401                     return;
3402                 }
3403                 else {
3404                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3405                     $length++;        # will replace the element above
3406                     $end = $r->[$j+1]->end;
3407                 }
3408             }
3409
3410             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3411
3412             # Finally, here we know there will have to be a splice.
3413             # If the change or delete affects only the highest portion of the
3414             # first affected range, the range will have to be split.  The
3415             # splice will remove the whole range, but will replace it by a new
3416             # range containing just the unaffected part.  So, in this case,
3417             # add to the replacement list just this unaffected portion.
3418             if (! $extends_below
3419                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3420             {
3421                 push @replacement,
3422                     Range->new($r->[$i]->start,
3423                                $start - 1,
3424                                Value => $r->[$i]->value,
3425                                Type => $r->[$i]->type);
3426             }
3427
3428             # In the case of an insert or change, but not a delete, we have to
3429             # put in the new stuff;  this comes next.
3430             if ($operation eq '+') {
3431                 push @replacement, Range->new($start,
3432                                               $end,
3433                                               Value => $value,
3434                                               Type => $type);
3435             }
3436
3437             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3438             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3439
3440             # And finally, if we're changing or deleting only a portion of the
3441             # highest affected range, it must be split, as the lowest one was.
3442             if (! $extends_above
3443                 && $j >= 0  # Remember that j can be -1 if before first
3444                             # current element
3445                 && $end >= $r->[$j]->start
3446                 && $end < $r->[$j]->end)
3447             {
3448                 push @replacement,
3449                     Range->new($end + 1,
3450                                $r->[$j]->end,
3451                                Value => $r->[$j]->value,
3452                                Type => $r->[$j]->type);
3453             }
3454         }
3455
3456         # And do the splice, as calculated above
3457         if (main::DEBUG && $to_trace) {
3458             trace "replacing $length element(s) at $i with ";
3459             foreach my $replacement (@replacement) {
3460                 trace "    $replacement";
3461             }
3462             trace "Before splice:";
3463             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3464             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3465             trace "i  =[", $i, "]", $r->[$i];
3466             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3467             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3468         }
3469
3470         my @return = splice @$r, $splice_start, $length, @replacement;
3471
3472         if (main::DEBUG && $to_trace) {
3473             trace "After splice:";
3474             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3475             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3476             trace "i  =[", $i, "]", $r->[$i];
3477             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3478             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3479             trace "removed @return";
3480         }
3481
3482         # An actual deletion could have changed the maximum in the list.
3483         # There was no deletion if the splice didn't return something, but
3484         # otherwise recalculate it.  This is done too rarely to worry about
3485         # performance.
3486         if ($operation eq '-' && @return) {
3487             $max{$addr} = $r->[-1]->end;
3488         }
3489         return @return;
3490     }
3491
3492     sub reset_each_range {  # reset the iterator for each_range();
3493         my $self = shift;
3494         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3495
3496         local $addr = main::objaddr $self if ! defined $addr;
3497
3498         undef $each_range_iterator{$addr};
3499         return;
3500     }
3501
3502     sub each_range {
3503         # Iterate over each range in a range list.  Results are undefined if
3504         # the range list is changed during the iteration.
3505
3506         my $self = shift;
3507         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3508
3509         local $addr = main::objaddr($self) if ! defined $addr;
3510
3511         return if $self->is_empty;
3512
3513         $each_range_iterator{$addr} = -1
3514                                 if ! defined $each_range_iterator{$addr};
3515         $each_range_iterator{$addr}++;
3516         return $ranges{$addr}->[$each_range_iterator{$addr}]
3517                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
3518         undef $each_range_iterator{$addr};
3519         return;
3520     }
3521
3522     sub count {        # Returns count of code points in range list
3523         my $self = shift;
3524         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3525
3526         local $addr = main::objaddr($self) if ! defined $addr;
3527
3528         my $count = 0;
3529         foreach my $range (@{$ranges{$addr}}) {
3530             $count += $range->end - $range->start + 1;
3531         }
3532         return $count;
3533     }
3534
3535     sub delete_range {    # Delete a range
3536         my $self = shift;
3537         my $start = shift;
3538         my $end = shift;
3539
3540         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3541
3542         return $self->_add_delete('-', $start, $end, "");
3543     }
3544
3545     sub is_empty { # Returns boolean as to if a range list is empty
3546         my $self = shift;
3547         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3548
3549         local $addr = main::objaddr($self) if ! defined $addr;
3550         return scalar @{$ranges{$addr}} == 0;
3551     }
3552
3553     sub hash {
3554         # Quickly returns a scalar suitable for separating tables into
3555         # buckets, i.e. it is a hash function of the contents of a table, so
3556         # there are relatively few conflicts.
3557
3558         my $self = shift;
3559         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3560
3561         local $addr = main::objaddr($self) if ! defined $addr;
3562
3563         # These are quickly computable.  Return looks like 'min..max;count'
3564         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3565     }
3566 } # End closure for _Range_List_Base
3567
3568 package Range_List;
3569 use base '_Range_List_Base';
3570
3571 # A Range_List is a range list for match tables; i.e. the range values are
3572 # not significant.  Thus a number of operations can be safely added to it,
3573 # such as inversion, intersection.  Note that union is also an unsafe
3574 # operation when range values are cared about, and that method is in the base
3575 # class, not here.  But things are set up so that that method is callable only
3576 # during initialization.  Only in this derived class, is there an operation
3577 # that combines two tables.  A Range_Map can thus be used to initialize a
3578 # Range_List, and its mappings will be in the list, but are not significant to
3579 # this class.
3580
3581 sub trace { return main::trace(@_); }
3582
3583 { # Closure
3584
3585     use overload
3586         fallback => 0,
3587         '+' => sub { my $self = shift;
3588                     my $other = shift;
3589
3590                     return $self->_union($other)
3591                 },
3592         '&' => sub { my $self = shift;
3593                     my $other = shift;
3594
3595                     return $self->_intersect($other, 0);
3596                 },
3597         '~' => "_invert",
3598         '-' => "_subtract",
3599     ;
3600
3601     sub _invert {
3602         # Returns a new Range_List that gives all code points not in $self.
3603
3604         my $self = shift;
3605
3606         my $new = Range_List->new;
3607
3608         # Go through each range in the table, finding the gaps between them
3609         my $max = -1;   # Set so no gap before range beginning at 0
3610         for my $range ($self->ranges) {
3611             my $start = $range->start;
3612             my $end   = $range->end;
3613
3614             # If there is a gap before this range, the inverse will contain
3615             # that gap.
3616             if ($start > $max + 1) {
3617                 $new->add_range($max + 1, $start - 1);
3618             }
3619             $max = $end;
3620         }
3621
3622         # And finally, add the gap from the end of the table to the max
3623         # possible code point
3624         if ($max < $LAST_UNICODE_CODEPOINT) {
3625             $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3626         }
3627         return $new;
3628     }
3629
3630     sub _subtract {
3631         # Returns a new Range_List with the argument deleted from it.  The
3632         # argument can be a single code point, a range, or something that has
3633         # a range, with the _range_list() method on it returning them
3634
3635         my $self = shift;
3636         my $other = shift;
3637         my $reversed = shift;
3638         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3639
3640         if ($reversed) {
3641             Carp::my_carp_bug("Can't cope with a "
3642              .  __PACKAGE__
3643              . " being the second parameter in a '-'.  Subtraction ignored.");
3644             return $self;
3645         }
3646
3647         my $new = Range_List->new(Initialize => $self);
3648
3649         if (! ref $other) { # Single code point
3650             $new->delete_range($other, $other);
3651         }
3652         elsif ($other->isa('Range')) {
3653             $new->delete_range($other->start, $other->end);
3654         }
3655         elsif ($other->can('_range_list')) {
3656             foreach my $range ($other->_range_list->ranges) {
3657                 $new->delete_range($range->start, $range->end);
3658             }
3659         }
3660         else {
3661             Carp::my_carp_bug("Can't cope with a "
3662                         . ref($other)
3663                         . " argument to '-'.  Subtraction ignored."
3664                         );
3665             return $self;
3666         }
3667
3668         return $new;
3669     }
3670
3671     sub _intersect {
3672         # Returns either a boolean giving whether the two inputs' range lists
3673         # intersect (overlap), or a new Range_List containing the intersection
3674         # of the two lists.  The optional final parameter being true indicates
3675         # to do the check instead of the intersection.
3676
3677         my $a_object = shift;
3678         my $b_object = shift;
3679         my $check_if_overlapping = shift;
3680         $check_if_overlapping = 0 unless defined $check_if_overlapping;
3681         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3682
3683         if (! defined $b_object) {
3684             my $message = "";
3685             $message .= $a_object->_owner_name_of if defined $a_object;
3686             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
3687             return;
3688         }
3689
3690         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
3691         # Thus the intersection could be much more simply be written:
3692         #   return ~(~$a_object + ~$b_object);
3693         # But, this is slower, and when taking the inverse of a large
3694         # range_size_1 table, back when such tables were always stored that
3695         # way, it became prohibitively slow, hence the code was changed to the
3696         # below
3697
3698         if ($b_object->isa('Range')) {
3699             $b_object = Range_List->new(Initialize => $b_object,
3700                                         Owner => $a_object->_owner_name_of);
3701         }
3702         $b_object = $b_object->_range_list if $b_object->can('_range_list');
3703
3704         my @a_ranges = $a_object->ranges;
3705         my @b_ranges = $b_object->ranges;
3706
3707         #local $to_trace = 1 if main::DEBUG;
3708         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
3709
3710         # Start with the first range in each list
3711         my $a_i = 0;
3712         my $range_a = $a_ranges[$a_i];
3713         my $b_i = 0;
3714         my $range_b = $b_ranges[$b_i];
3715
3716         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
3717                                                 if ! $check_if_overlapping;
3718
3719         # If either list is empty, there is no intersection and no overlap
3720         if (! defined $range_a || ! defined $range_b) {
3721             return $check_if_overlapping ? 0 : $new;
3722         }
3723         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3724
3725         # Otherwise, must calculate the intersection/overlap.  Start with the
3726         # very first code point in each list
3727         my $a = $range_a->start;
3728         my $b = $range_b->start;
3729
3730         # Loop through all the ranges of each list; in each iteration, $a and
3731         # $b are the current code points in their respective lists
3732         while (1) {
3733
3734             # If $a and $b are the same code point, ...
3735             if ($a == $b) {
3736
3737                 # it means the lists overlap.  If just checking for overlap
3738                 # know the answer now,
3739                 return 1 if $check_if_overlapping;
3740
3741                 # The intersection includes this code point plus anything else
3742                 # common to both current ranges.
3743                 my $start = $a;
3744                 my $end = main::min($range_a->end, $range_b->end);
3745                 if (! $check_if_overlapping) {
3746                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
3747                     $new->add_range($start, $end);
3748                 }
3749
3750                 # Skip ahead to the end of the current intersect
3751                 $a = $b = $end;
3752
3753                 # If the current intersect ends at the end of either range (as
3754                 # it must for at least one of them), the next possible one
3755                 # will be the beginning code point in it's list's next range.
3756                 if ($a == $range_a->end) {
3757                     $range_a = $a_ranges[++$a_i];
3758                     last unless defined $range_a;
3759                     $a = $range_a->start;
3760                 }
3761                 if ($b == $range_b->end) {
3762                     $range_b = $b_ranges[++$b_i];
3763                     last unless defined $range_b;
3764                     $b = $range_b->start;
3765                 }
3766
3767                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3768             }
3769             elsif ($a < $b) {
3770
3771                 # Not equal, but if the range containing $a encompasses $b,
3772                 # change $a to be the middle of the range where it does equal
3773                 # $b, so the next iteration will get the intersection
3774                 if ($range_a->end >= $b) {
3775                     $a = $b;
3776                 }
3777                 else {
3778
3779                     # Here, the current range containing $a is entirely below
3780                     # $b.  Go try to find a range that could contain $b.
3781                     $a_i = $a_object->_search_ranges($b);
3782
3783                     # If no range found, quit.
3784                     last unless defined $a_i;
3785
3786                     # The search returns $a_i, such that
3787                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
3788                     # Set $a to the beginning of this new range, and repeat.
3789                     $range_a = $a_ranges[$a_i];
3790                     $a = $range_a->start;
3791                 }
3792             }
3793             else { # Here, $b < $a.
3794
3795                 # Mirror image code to the leg just above
3796                 if ($range_b->end >= $a) {
3797                     $b = $a;
3798                 }
3799                 else {
3800                     $b_i = $b_object->_search_ranges($a);
3801                     last unless defined $b_i;
3802                     $range_b = $b_ranges[$b_i];
3803                     $b = $range_b->start;
3804                 }
3805             }
3806         } # End of looping through ranges.
3807
3808         # Intersection fully computed, or now know that there is no overlap
3809         return $check_if_overlapping ? 0 : $new;
3810     }
3811
3812     sub overlaps {
3813         # Returns boolean giving whether the two arguments overlap somewhere
3814
3815         my $self = shift;
3816         my $other = shift;
3817         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3818
3819         return $self->_intersect($other, 1);
3820     }
3821
3822     sub add_range {
3823         # Add a range to the list.
3824
3825         my $self = shift;
3826         my $start = shift;
3827         my $end = shift;
3828         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3829
3830         return $self->_add_delete('+', $start, $end, "");
3831     }
3832
3833     my $non_ASCII = (ord('A') != 65);   # Assumes test on same platform
3834
3835     sub is_code_point_usable {
3836         # This used only for making the test script.  See if the input
3837         # proposed trial code point is one that Perl will handle.  If second
3838         # parameter is 0, it won't select some code points for various
3839         # reasons, noted below.
3840
3841         my $code = shift;
3842         my $try_hard = shift;
3843         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3844
3845         return 0 if $code < 0;                # Never use a negative
3846
3847         # For non-ASCII, we shun the characters that don't have Perl encoding-
3848         # independent symbols for them.  'A' is such a symbol, so is "\n".
3849         # Note, this program hopefully will work on 5.8 Perls, and \v is not
3850         # such a symbol in them.
3851         return $try_hard if $non_ASCII
3852                             && $code <= 0xFF
3853                             && ($code >= 0x7F
3854                                 || ($code >= 0x0E && $code <= 0x1F)
3855                                 || ($code >= 0x01 && $code <= 0x06)
3856                                 || $code == 0x0B);  # \v introduced after 5.8
3857
3858         # shun null.  I'm (khw) not sure why this was done, but NULL would be
3859         # the character very frequently used.
3860         return $try_hard if $code == 0x0000;
3861
3862         return 0 if $try_hard;  # XXX Temporary until fix utf8.c
3863
3864         # shun non-character code points.
3865         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
3866         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
3867
3868         return $try_hard if $code > $LAST_UNICODE_CODEPOINT;   # keep in range
3869         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
3870
3871         return 1;
3872     }
3873
3874     sub get_valid_code_point {
3875         # Return a code point that's part of the range list.  Returns nothing
3876         # if the table is empty or we can't find a suitable code point.  This
3877         # used only for making the test script.
3878
3879         my $self = shift;
3880         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3881
3882         my $addr = main::objaddr($self);
3883
3884         # On first pass, don't choose less desirable code points; if no good
3885         # one is found, repeat, allowing a less desirable one to be selected.
3886         for my $try_hard (0, 1) {
3887
3888             # Look through all the ranges for a usable code point.
3889             for my $set ($self->ranges) {
3890
3891                 # Try the edge cases first, starting with the end point of the
3892                 # range.
3893                 my $end = $set->end;
3894                 return $end if is_code_point_usable($end, $try_hard);
3895
3896                 # End point didn't, work.  Start at the beginning and try
3897                 # every one until find one that does work.
3898                 for my $trial ($set->start .. $end - 1) {
3899                     return $trial if is_code_point_usable($trial, $try_hard);
3900                 }
3901             }
3902         }
3903         return ();  # If none found, give up.
3904     }
3905
3906     sub get_invalid_code_point {
3907         # Return a code point that's not part of the table.  Returns nothing
3908         # if the table covers all code points or a suitable code point can't
3909         # be found.  This used only for making the test script.
3910
3911         my $self = shift;
3912         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3913
3914         # Just find a valid code point of the inverse, if any.
3915         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
3916     }
3917 } # end closure for Range_List
3918
3919 package Range_Map;
3920 use base '_Range_List_Base';
3921
3922 # A Range_Map is a range list in which the range values (called maps) are
3923 # significant, and hence shouldn't be manipulated by our other code, which
3924 # could be ambiguous or lose things.  For example, in taking the union of two
3925 # lists, which share code points, but which have differing values, which one
3926 # has precedence in the union?
3927 # It turns out that these operations aren't really necessary for map tables,
3928 # and so this class was created to make sure they aren't accidentally
3929 # applied to them.
3930
3931 { # Closure
3932
3933     sub add_map {
3934         # Add a range containing a mapping value to the list
3935
3936         my $self = shift;
3937         # Rest of parameters passed on
3938
3939         return $self->_add_delete('+', @_);
3940     }
3941
3942     sub add_duplicate {
3943         # Adds entry to a range list which can duplicate an existing entry
3944
3945         my $self = shift;
3946         my $code_point = shift;
3947         my $value = shift;
3948         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3949
3950         return $self->add_map($code_point, $code_point,
3951                                 $value, Replace => $MULTIPLE);
3952     }
3953 } # End of closure for package Range_Map
3954
3955 package _Base_Table;
3956
3957 # A table is the basic data structure that gets written out into a file for
3958 # use by the Perl core.  This is the abstract base class implementing the
3959 # common elements from the derived ones.  A list of the methods to be
3960 # furnished by an implementing class is just after the constructor.
3961
3962 sub standardize { return main::standardize($_[0]); }
3963 sub trace { return main::trace(@_); }
3964
3965 { # Closure
3966
3967     main::setup_package();
3968
3969     my %range_list;
3970     # Object containing the ranges of the table.
3971     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
3972
3973     my %full_name;
3974     # The full table name.
3975     main::set_access('full_name', \%full_name, 'r');
3976
3977     my %name;
3978     # The table name, almost always shorter
3979     main::set_access('name', \%name, 'r');
3980
3981     my %short_name;
3982     # The shortest of all the aliases for this table, with underscores removed
3983     main::set_access('short_name', \%short_name);
3984
3985     my %nominal_short_name_length;
3986     # The length of short_name before removing underscores
3987     main::set_access('nominal_short_name_length',
3988                     \%nominal_short_name_length);
3989
3990     my %complete_name;
3991     # The complete name, including property.
3992     main::set_access('complete_name', \%complete_name, 'r');
3993
3994     my %property;
3995     # Parent property this table is attached to.
3996     main::set_access('property', \%property, 'r');
3997
3998     my %aliases;
3999     # Ordered list of aliases of the table's name.  The first ones in the list
4000     # are output first in comments
4001     main::set_access('aliases', \%aliases, 'readable_array');
4002
4003     my %comment;
4004     # A comment associated with the table for human readers of the files
4005     main::set_access('comment', \%comment, 's');
4006
4007     my %description;
4008     # A comment giving a short description of the table's meaning for human
4009     # readers of the files.
4010     main::set_access('description', \%description, 'readable_array');
4011
4012     my %note;
4013     # A comment giving a short note about the table for human readers of the
4014     # files.
4015     main::set_access('note', \%note, 'readable_array');
4016
4017     my %internal_only;
4018     # Boolean; if set means any file that contains this table is marked as for
4019     # internal-only use.
4020     main::set_access('internal_only', \%internal_only);
4021
4022     my %find_table_from_alias;
4023     # The parent property passes this pointer to a hash which this class adds
4024     # all its aliases to, so that the parent can quickly take an alias and
4025     # find this table.
4026     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4027
4028     my %locked;
4029     # After this table is made equivalent to another one; we shouldn't go
4030     # changing the contents because that could mean it's no longer equivalent
4031     main::set_access('locked', \%locked, 'r');
4032
4033     my %file_path;
4034     # This gives the final path to the file containing the table.  Each
4035     # directory in the path is an element in the array
4036     main::set_access('file_path', \%file_path, 'readable_array');
4037
4038     my %status;
4039     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4040     main::set_access('status', \%status, 'r');
4041
4042     my %status_info;
4043     # A comment about its being obsolete, or whatever non normal status it has
4044     main::set_access('status_info', \%status_info, 'r');
4045
4046     my %range_size_1;
4047     # Is the table to be output with each range only a single code point?
4048     # This is done to avoid breaking existing code that may have come to rely
4049     # on this behavior in previous versions of this program.)
4050     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4051
4052     my %perl_extension;
4053     # A boolean set iff this table is a Perl extension to the Unicode
4054     # standard.
4055     main::set_access('perl_extension', \%perl_extension, 'r');
4056
4057     sub new {
4058         # All arguments are key => value pairs, which you can see below, most
4059         # of which match fields documented above.  Otherwise: Pod_Entry,
4060         # Externally_Ok, and Fuzzy apply to the names of the table, and are
4061         # documented in the Alias package
4062
4063         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4064
4065         my $class = shift;
4066
4067         my $self = bless \do { my $anonymous_scalar }, $class;
4068         my $addr = main::objaddr($self);
4069
4070         my %args = @_;
4071
4072         $name{$addr} = delete $args{'Name'};
4073         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4074         $full_name{$addr} = delete $args{'Full_Name'};
4075         my $complete_name = $complete_name{$addr}
4076                           = delete $args{'Complete_Name'};
4077         $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
4078         $perl_extension{$addr} = delete $args{'Perl_Extension'} || 0;
4079         $property{$addr} = delete $args{'_Property'};
4080         $range_list{$addr} = delete $args{'_Range_List'};
4081         $status{$addr} = delete $args{'Status'} || $NORMAL;
4082         $status_info{$addr} = delete $args{'_Status_Info'} || "";
4083         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4084
4085         my $description = delete $args{'Description'};
4086         my $externally_ok = delete $args{'Externally_Ok'};
4087         my $loose_match = delete $args{'Fuzzy'};
4088         my $note = delete $args{'Note'};
4089         my $make_pod_entry = delete $args{'Pod_Entry'};
4090
4091         # Shouldn't have any left over
4092         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4093
4094         # Can't use || above because conceivably the name could be 0, and
4095         # can't use // operator in case this program gets used in Perl 5.8
4096         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4097
4098         $aliases{$addr} = [ ];
4099         $comment{$addr} = [ ];
4100         $description{$addr} = [ ];
4101         $note{$addr} = [ ];
4102         $file_path{$addr} = [ ];
4103         $locked{$addr} = "";
4104
4105         push @{$description{$addr}}, $description if $description;
4106         push @{$note{$addr}}, $note if $note;
4107
4108         # If hasn't set its status already, see if it is on one of the lists
4109         # of properties or tables that have particular statuses; if not, is
4110         # normal.  The lists are prioritized so the most serious ones are
4111         # checked first
4112         if (! $status{$addr}) {
4113             if (exists $why_suppressed{$complete_name}) {
4114                 $status{$addr} = $SUPPRESSED;
4115             }
4116             elsif (exists $why_deprecated{$complete_name}) {
4117                 $status{$addr} = $DEPRECATED;
4118             }
4119             elsif (exists $why_stabilized{$complete_name}) {
4120                 $status{$addr} = $STABILIZED;
4121             }
4122             elsif (exists $why_obsolete{$complete_name}) {
4123                 $status{$addr} = $OBSOLETE;
4124             }
4125
4126             # Existence above doesn't necessarily mean there is a message
4127             # associated with it.  Use the most serious message.
4128             if ($status{$addr}) {
4129                 if ($why_suppressed{$complete_name}) {
4130                     $status_info{$addr}
4131                                 = $why_suppressed{$complete_name};
4132                 }
4133                 elsif ($why_deprecated{$complete_name}) {
4134                     $status_info{$addr}
4135                                 = $why_deprecated{$complete_name};
4136                 }
4137                 elsif ($why_stabilized{$complete_name}) {
4138                     $status_info{$addr}
4139                                 = $why_stabilized{$complete_name};
4140                 }
4141                 elsif ($why_obsolete{$complete_name}) {
4142                     $status_info{$addr}
4143                                 = $why_obsolete{$complete_name};
4144                 }
4145             }
4146         }
4147
4148         # By convention what typically gets printed only or first is what's
4149         # first in the list, so put the full name there for good output
4150         # clarity.  Other routines rely on the full name being first on the
4151         # list
4152         $self->add_alias($full_name{$addr},
4153                             Externally_Ok => $externally_ok,
4154                             Fuzzy => $loose_match,
4155                             Pod_Entry => $make_pod_entry,
4156                             Status => $status{$addr},
4157                             );
4158
4159         # Then comes the other name, if meaningfully different.
4160         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4161             $self->add_alias($name{$addr},
4162                             Externally_Ok => $externally_ok,
4163                             Fuzzy => $loose_match,
4164                             Pod_Entry => $make_pod_entry,
4165                             Status => $status{$addr},
4166                             );
4167         }
4168
4169         return $self;
4170     }
4171
4172     # Here are the methods that are required to be defined by any derived
4173     # class
4174     for my $sub qw(
4175                     append_to_body
4176                     pre_body
4177                 )
4178                 # append_to_body and pre_body are called in the write() method
4179                 # to add stuff after the main body of the table, but before
4180                 # its close; and to prepend stuff before the beginning of the
4181                 # table.
4182     {
4183         no strict "refs";
4184         *$sub = sub {
4185             Carp::my_carp_bug( __LINE__
4186                               . ": Must create method '$sub()' for "
4187                               . ref shift);
4188             return;
4189         }
4190     }
4191
4192     use overload
4193         fallback => 0,
4194         "." => \&main::_operator_dot,
4195         '!=' => \&main::_operator_not_equal,
4196         '==' => \&main::_operator_equal,
4197     ;
4198
4199     sub ranges {
4200         # Returns the array of ranges associated with this table.
4201
4202         return $range_list{main::objaddr shift}->ranges;
4203     }
4204
4205     sub add_alias {
4206         # Add a synonym for this table.
4207
4208         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4209
4210         my $self = shift;
4211         my $name = shift;       # The name to add.
4212         my $pointer = shift;    # What the alias hash should point to.  For
4213                                 # map tables, this is the parent property;
4214                                 # for match tables, it is the table itself.
4215
4216         my %args = @_;
4217         my $loose_match = delete $args{'Fuzzy'};
4218
4219         my $make_pod_entry = delete $args{'Pod_Entry'};
4220         $make_pod_entry = $YES unless defined $make_pod_entry;
4221
4222         my $externally_ok = delete $args{'Externally_Ok'};
4223         $externally_ok = 1 unless defined $externally_ok;
4224
4225         my $status = delete $args{'Status'};
4226         $status = $NORMAL unless defined $status;
4227
4228         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4229
4230         # Capitalize the first letter of the alias unless it is one of the CJK
4231         # ones which specifically begins with a lower 'k'.  Do this because
4232         # Unicode has varied whether they capitalize first letters or not, and
4233         # have later changed their minds and capitalized them, but not the
4234         # other way around.  So do it always and avoid changes from release to
4235         # release
4236         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4237
4238         my $addr = main::objaddr $self;
4239
4240         # Figure out if should be loosely matched if not already specified.
4241         if (! defined $loose_match) {
4242
4243             # Is a loose_match if isn't null, and doesn't begin with an
4244             # underscore and isn't just a number
4245             if ($name ne ""
4246                 && substr($name, 0, 1) ne '_'
4247                 && $name !~ qr{^[0-9_.+-/]+$})
4248             {
4249                 $loose_match = 1;
4250             }
4251             else {
4252                 $loose_match = 0;
4253             }
4254         }
4255
4256         # If this alias has already been defined, do nothing.
4257         return if defined $find_table_from_alias{$addr}->{$name};
4258
4259         # That includes if it is standardly equivalent to an existing alias,
4260         # in which case, add this name to the list, so won't have to search
4261         # for it again.
4262         my $standard_name = main::standardize($name);
4263         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4264             $find_table_from_alias{$addr}->{$name}
4265                         = $find_table_from_alias{$addr}->{$standard_name};
4266             return;
4267         }
4268
4269         # Set the index hash for this alias for future quick reference.
4270         $find_table_from_alias{$addr}->{$name} = $pointer;
4271         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4272         local $to_trace = 0 if main::DEBUG;
4273         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4274         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4275
4276
4277         # Put the new alias at the end of the list of aliases unless the final
4278         # element begins with an underscore (meaning it is for internal perl
4279         # use) or is all numeric, in which case, put the new one before that
4280         # one.  This floats any all-numeric or underscore-beginning aliases to
4281         # the end.  This is done so that they are listed last in output lists,
4282         # to encourage the user to use a better name (either more descriptive
4283         # or not an internal-only one) instead.  This ordering is relied on
4284         # implicitly elsewhere in this program, like in short_name()
4285         my $list = $aliases{$addr};
4286         my $insert_position = (@$list == 0
4287                                 || (substr($list->[-1]->name, 0, 1) ne '_'
4288                                     && $list->[-1]->name =~ /\D/))
4289                             ? @$list
4290                             : @$list - 1;
4291         splice @$list,
4292                 $insert_position,
4293                 0,
4294                 Alias->new($name, $loose_match, $make_pod_entry,
4295                                                     $externally_ok, $status);
4296
4297         # This name may be shorter than any existing ones, so clear the cache
4298         # of the shortest, so will have to be recalculated.
4299         undef $short_name{main::objaddr $self};
4300         return;
4301     }
4302
4303     sub short_name {
4304         # Returns a name suitable for use as the base part of a file name.
4305         # That is, shorter wins.  It can return undef if there is no suitable
4306         # name.  The name has all non-essential underscores removed.
4307
4308         # The optional second parameter is a reference to a scalar in which
4309         # this routine will store the length the returned name had before the
4310         # underscores were removed, or undef if the return is undef.
4311
4312         # The shortest name can change if new aliases are added.  So using
4313         # this should be deferred until after all these are added.  The code
4314         # that does that should clear this one's cache.
4315         # Any name with alphabetics is preferred over an all numeric one, even
4316         # if longer.
4317
4318         my $self = shift;
4319         my $nominal_length_ptr = shift;
4320         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4321
4322         my $addr = main::objaddr $self;
4323
4324         # For efficiency, don't recalculate, but this means that adding new
4325         # aliases could change what the shortest is, so the code that does
4326         # that needs to undef this.
4327         if (defined $short_name{$addr}) {
4328             if ($nominal_length_ptr) {
4329                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4330             }
4331             return $short_name{$addr};
4332         }
4333
4334         # Look at each alias
4335         foreach my $alias ($self->aliases()) {
4336
4337             # Don't use an alias that isn't ok to use for an external name.
4338             next if ! $alias->externally_ok;
4339
4340             my $name = main::Standardize($alias->name);
4341             trace $self, $name if main::DEBUG && $to_trace;
4342
4343             # Take the first one, or a shorter one that isn't numeric.  This
4344             # relies on numeric aliases always being last in the array
4345             # returned by aliases().  Any alpha one will have precedence.
4346             if (! defined $short_name{$addr}
4347                 || ($name =~ /\D/
4348                     && length($name) < length($short_name{$addr})))
4349             {
4350                 # Remove interior underscores.
4351                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4352
4353                 $nominal_short_name_length{$addr} = length $name;
4354             }
4355         }
4356
4357         # If no suitable external name return undef
4358         if (! defined $short_name{$addr}) {
4359             $$nominal_length_ptr = undef if $nominal_length_ptr;
4360             return;
4361         }
4362
4363         # Don't allow a null external name.
4364         if ($short_name{$addr} eq "") {
4365             $short_name{$addr} = '_';
4366             $nominal_short_name_length{$addr} = 1;
4367         }
4368
4369         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4370
4371         if ($nominal_length_ptr) {
4372             $$nominal_length_ptr = $nominal_short_name_length{$addr};
4373         }
4374         return $short_name{$addr};
4375     }
4376
4377     sub external_name {
4378         # Returns the external name that this table should be known by.  This
4379         # is usually the short_name, but not if the short_name is undefined.
4380
4381         my $self = shift;
4382         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4383
4384         my $short = $self->short_name;
4385         return $short if defined $short;
4386
4387         return '_';
4388     }
4389
4390     sub add_description { # Adds the parameter as a short description.
4391
4392         my $self = shift;
4393         my $description = shift;
4394         chomp $description;
4395         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4396
4397         push @{$description{main::objaddr $self}}, $description;
4398
4399         return;
4400     }
4401
4402     sub add_note { # Adds the parameter as a short note.
4403
4404         my $self = shift;
4405         my $note = shift;
4406         chomp $note;
4407         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4408
4409         push @{$note{main::objaddr $self}}, $note;
4410
4411         return;
4412     }
4413
4414     sub add_comment { # Adds the parameter as a comment.
4415
4416         my $self = shift;
4417         my $comment = shift;
4418         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4419
4420         chomp $comment;
4421         push @{$comment{main::objaddr $self}}, $comment;
4422
4423         return;
4424     }
4425
4426     sub comment {
4427         # Return the current comment for this table.  If called in list
4428         # context, returns the array of comments.  In scalar, returns a string
4429         # of each element joined together with a period ending each.
4430
4431         my $self = shift;
4432         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4433
4434         my @list = @{$comment{main::objaddr $self}};
4435         return @list if wantarray;
4436         my $return = "";
4437         foreach my $sentence (@list) {
4438             $return .= '.  ' if $return;
4439             $return .= $sentence;
4440             $return =~ s/\.$//;
4441         }
4442         $return .= '.' if $return;
4443         return $return;
4444     }
4445
4446     sub initialize {
4447         # Initialize the table with the argument which is any valid
4448         # initialization for range lists.
4449
4450         my $self = shift;
4451         my $initialization = shift;
4452         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4453
4454         # Replace the current range list with a new one of the same exact
4455         # type.
4456         my $class = ref $range_list{main::objaddr $self};
4457         $range_list{main::objaddr $self} = $class->new(Owner => $self,
4458                                         Initialize => $initialization);
4459         return;
4460
4461     }
4462
4463     sub header {
4464         # The header that is output for the table in the file it is written
4465         # in.
4466
4467         my $self = shift;
4468         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4469
4470         my $return = "";
4471         $return .= $DEVELOPMENT_ONLY if $compare_versions;
4472         $return .= $HEADER;
4473         $return .= $INTERNAL_ONLY if $internal_only{main::objaddr $self};
4474         return $return;
4475     }
4476
4477     sub write {
4478         # Write a representation of the table to its file.
4479
4480         my $self = shift;
4481         my $tab_stops = shift;       # The number of tab stops over to put any
4482                                      # comment.
4483         my $suppress_value = shift;  # Optional, if the value associated with
4484                                      # a range equals this one, don't write
4485                                      # the range
4486         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4487
4488         my $addr = main::objaddr($self);
4489
4490         # Start with the header
4491         my @OUT = $self->header;
4492
4493         # Then the comments
4494         push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4495                                                         if $comment{$addr};
4496
4497         # Then any pre-body stuff.
4498         my $pre_body = $self->pre_body;
4499         push @OUT, $pre_body, "\n" if $pre_body;
4500
4501         # The main body looks like a 'here' document
4502         push @OUT, "return <<'END';\n";
4503
4504         if ($range_list{$addr}->is_empty) {
4505
4506             # This is a kludge for empty tables to silence a warning in
4507             # utf8.c, which can't really deal with empty tables, but it can
4508             # deal with a table that matches nothing, as the inverse of 'Any'
4509             # does.
4510             push @OUT, "!utf8::IsAny\n";
4511         }
4512         else {
4513             my $range_size_1 = $range_size_1{$addr};
4514
4515             # Output each range as part of the here document.
4516             for my $set ($range_list{$addr}->ranges) {
4517                 my $start = $set->start;
4518                 my $end   = $set->end;
4519                 my $value  = $set->value;
4520
4521                 # Don't output ranges whose value is the one to suppress
4522                 next if defined $suppress_value && $value eq $suppress_value;
4523
4524                 # If has or wants a single point range output
4525                 if ($start == $end || $range_size_1) {
4526                     for my $i ($start .. $end) {
4527                         push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
4528                     }
4529                 }
4530                 else  {
4531                     push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
4532
4533                     # Add a comment with the size of the range, if requested.
4534                     # Expand Tabs to make sure they all start in the same
4535                     # column, and then unexpand to use mostly tabs.
4536                     if (! $output_range_counts) {
4537                         $OUT[-1] .= "\n";
4538                     }
4539                     else {
4540                         $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4541                         my $count = main::clarify_number($end - $start + 1);
4542                         use integer;
4543
4544                         my $width = $tab_stops * 8 - 1;
4545                         $OUT[-1] = sprintf("%-*s # [%s]\n",
4546                                             $width,
4547                                             $OUT[-1],
4548                                             $count);
4549                         $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4550                     }
4551                 }
4552             } # End of loop through all the table's ranges
4553         }
4554
4555         # Add anything that goes after the main body, but within the here
4556         # document,
4557         my $append_to_body = $self->append_to_body;
4558         push @OUT, $append_to_body if $append_to_body;
4559
4560         # And finish the here document.
4561         push @OUT, "END\n";
4562
4563         # All these files have a .pl suffix
4564         $file_path{$addr}->[-1] .= '.pl';
4565
4566         main::write($file_path{$addr}, \@OUT);
4567         return;
4568     }
4569
4570     sub set_status {    # Set the table's status
4571         my $self = shift;
4572         my $status = shift; # The status enum value
4573         my $info = shift;   # Any message associated with it.
4574         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4575
4576         my $addr = main::objaddr($self);
4577
4578         $status{$addr} = $status;
4579         $status_info{$addr} = $info;
4580         return;
4581     }
4582
4583     sub lock {
4584         # Don't allow changes to the table from now on.  This stores a stack
4585         # trace of where it was called, so that later attempts to modify it
4586         # can immediately show where it got locked.
4587
4588         my $self = shift;
4589         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4590
4591         my $addr = main::objaddr $self;
4592
4593         $locked{$addr} = "";
4594
4595         my $line = (caller(0))[2];
4596         my $i = 1;
4597
4598         # Accumulate the stack trace
4599         while (1) {
4600             my ($pkg, $file, $caller_line, $caller) = caller $i++;
4601
4602             last unless defined $caller;
4603
4604             $locked{$addr} .= "    called from $caller() at line $line\n";
4605             $line = $caller_line;
4606         }
4607         $locked{$addr} .= "    called from main at line $line\n";
4608
4609         return;
4610     }
4611
4612     sub carp_if_locked {
4613         # Return whether a table is locked or not, and, by the way, complain
4614         # if is locked
4615
4616         my $self = shift;
4617         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4618
4619         my $addr = main::objaddr $self;
4620
4621         return 0 if ! $locked{$addr};
4622         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
4623         return 1;
4624     }
4625
4626     sub set_file_path { # Set the final directory path for this table
4627         my $self = shift;
4628         # Rest of parameters passed on
4629
4630         @{$file_path{main::objaddr $self}} = @_;
4631         return
4632     }
4633
4634     # Accessors for the range list stored in this table.  First for
4635     # unconditional
4636     for my $sub qw(
4637                     contains
4638                     count
4639                     each_range
4640                     hash
4641                     is_empty
4642                     max
4643                     min
4644                     range_count
4645                     reset_each_range
4646                     value_of
4647                 )
4648     {
4649         no strict "refs";
4650         *$sub = sub {
4651             use strict "refs";
4652             my $self = shift;
4653             return $range_list{main::objaddr $self}->$sub(@_);
4654         }
4655     }
4656
4657     # Then for ones that should fail if locked
4658     for my $sub qw(
4659                     delete_range
4660                 )
4661     {
4662         no strict "refs";
4663         *$sub = sub {
4664             use strict "refs";
4665             my $self = shift;
4666
4667             return if $self->carp_if_locked;
4668             return $range_list{main::objaddr $self}->$sub(@_);
4669         }
4670     }
4671
4672 } # End closure
4673
4674 package Map_Table;
4675 use base '_Base_Table';
4676
4677 # A Map Table is a table that contains the mappings from code points to
4678 # values.  There are two weird cases:
4679 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
4680 #    are written in the table's file at the end of the table nonetheless.  It
4681 #    requires specially constructed code to handle these; utf8.c can not read
4682 #    these in, so they should not go in $map_directory.  As of this writing,
4683 #    the only case that these happen is for named sequences used in
4684 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
4685 #    something else could come along that uses it.
4686 # 2) Specials are anything that doesn't fit syntactically into the body of the
4687 #    table.  The ranges for these have a map type of non-zero.  The code below
4688 #    knows about and handles each possible type.   In most cases, these are
4689 #    written as part of the header.
4690 #
4691 # A map table deliberately can't be manipulated at will unlike match tables.
4692 # This is because of the ambiguities having to do with what to do with
4693 # overlapping code points.  And there just isn't a need for those things;
4694 # what one wants to do is just query, add, replace, or delete mappings, plus
4695 # write the final result.
4696 # However, there is a method to get the list of possible ranges that aren't in
4697 # this table to use for defaulting missing code point mappings.  And,
4698 # map_add_or_replace_non_nulls() does allow one to add another table to this
4699 # one, but it is clearly very specialized, and defined that the other's
4700 # non-null values replace this one's if there is any overlap.
4701
4702 sub trace { return main::trace(@_); }
4703
4704 { # Closure
4705
4706     main::setup_package();
4707
4708     my %default_map;
4709     # Many input files omit some entries; this gives what the mapping for the
4710     # missing entries should be
4711     main::set_access('default_map', \%default_map, 'r');
4712
4713     my %anomalous_entries;
4714     # Things that go in the body of the table which don't fit the normal
4715     # scheme of things, like having a range.  Not much can be done with these
4716     # once there except to output them.  This was created to handle named
4717     # sequences.
4718     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
4719     main::set_access('anomalous_entries',       # Append singular, read plural
4720                     \%anomalous_entries,
4721                     'readable_array');
4722
4723     my %format;
4724     # The format of the entries of the table.  This is calculated from the
4725     # data in the table (or passed in the constructor).  This is an enum e.g.,
4726     # $STRING_FORMAT
4727     main::set_access('format', \%format);
4728
4729     my %core_access;
4730     # This is a string, solely for documentation, indicating how one can get
4731     # access to this property via the Perl core.
4732     main::set_access('core_access', \%core_access, 'r', 's');
4733
4734     my %has_specials;
4735     # Boolean set when non-zero map-type ranges are added to this table,
4736     # which happens in only a few tables.  This is purely for performance, to
4737     # avoid having to search through every table upon output, so if all the
4738     # non-zero maps got deleted before output, this would remain set, and the
4739     # only penalty would be performance.  Currently, most map tables that get
4740     # output have specials in them, so this doesn't help that much anyway.
4741     main::set_access('has_specials', \%has_specials);
4742
4743     my %to_output_map;
4744     # Boolean as to whether or not to write out this map table
4745     main::set_access('to_output_map', \%to_output_map, 's');
4746
4747
4748     sub new {
4749         my $class = shift;
4750         my $name = shift;
4751
4752         my %args = @_;
4753
4754         # Optional initialization data for the table.
4755         my $initialize = delete $args{'Initialize'};
4756
4757         my $core_access = delete $args{'Core_Access'};
4758         my $default_map = delete $args{'Default_Map'};
4759         my $format = delete $args{'Format'};
4760         my $property = delete $args{'_Property'};
4761         my $full_name = delete $args{'Full_Name'};
4762         # Rest of parameters passed on
4763
4764         my $range_list = Range_Map->new(Owner => $property);
4765
4766         my $self = $class->SUPER::new(
4767                                     Name => $name,
4768                                     Complete_Name =>  $full_name,
4769                                     Full_Name => $full_name,
4770                                     _Property => $property,
4771                                     _Range_List => $range_list,
4772                                     %args);
4773
4774         my $addr = main::objaddr $self;
4775
4776         $anomalous_entries{$addr} = [];
4777         $core_access{$addr} = $core_access;
4778         $default_map{$addr} = $default_map;
4779         $format{$addr} = $format;
4780
4781         $self->initialize($initialize) if defined $initialize;
4782
4783         return $self;
4784     }
4785
4786     use overload
4787         fallback => 0,
4788         qw("") => "_operator_stringify",
4789     ;
4790
4791     sub _operator_stringify {
4792         my $self = shift;
4793
4794         my $name = $self->property->full_name;
4795         $name = '""' if $name eq "";
4796         return "Map table for Property '$name'";
4797     }
4798
4799     sub add_alias {
4800         # Add a synonym for this table (which means the property itself)
4801         my $self = shift;
4802         my $name = shift;
4803         # Rest of parameters passed on.
4804
4805         $self->SUPER::add_alias($name, $self->property, @_);
4806         return;
4807     }
4808
4809     sub add_map {
4810         # Add a range of code points to the list of specially-handled code
4811         # points.  $MULTI_CP is assumed if the type of special is not passed
4812         # in.
4813
4814         my $self = shift;
4815         my $lower = shift;
4816         my $upper = shift;
4817         my $string = shift;
4818         my %args = @_;
4819
4820         my $type = delete $args{'Type'} || 0;
4821         # Rest of parameters passed on
4822
4823         # Can't change the table if locked.
4824         return if $self->carp_if_locked;
4825
4826         my $addr = main::objaddr $self;
4827
4828         $has_specials{$addr} = 1 if $type;
4829
4830         $self->_range_list->add_map($lower, $upper,
4831                                     $string,
4832                                     @_,
4833                                     Type => $type);
4834         return;
4835     }
4836
4837     sub append_to_body {
4838         # Adds to the written HERE document of the table's body any anomalous
4839         # entries in the table..
4840
4841         my $self = shift;
4842         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4843
4844         my $addr = main::objaddr $self;
4845
4846         return "" unless @{$anomalous_entries{$addr}};
4847         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
4848     }
4849
4850     sub map_add_or_replace_non_nulls {
4851         # This adds the mappings in the table $other to $self.  Non-null
4852         # mappings from $other override those in $self.  It essentially merges
4853         # the two tables, with the second having priority except for null
4854         # mappings.
4855
4856         my $self = shift;
4857         my $other = shift;
4858         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4859
4860         return if $self->carp_if_locked;
4861
4862         if (! $other->isa(__PACKAGE__)) {
4863             Carp::my_carp_bug("$other should be a "
4864                         . __PACKAGE__
4865                         . ".  Not a '"
4866                         . ref($other)
4867                         . "'.  Not added;");
4868             return;
4869         }
4870
4871         my $addr = main::objaddr $self;
4872         my $other_addr = main::objaddr $other;
4873
4874         local $to_trace = 0 if main::DEBUG;
4875
4876         my $self_range_list = $self->_range_list;
4877         my $other_range_list = $other->_range_list;
4878         foreach my $range ($other_range_list->ranges) {
4879             my $value = $range->value;
4880             next if $value eq "";
4881             $self_range_list->_add_delete('+',
4882                                           $range->start,
4883                                           $range->end,
4884                                           $value,
4885                                           Type => $range->type,
4886                                           Replace => $UNCONDITIONALLY);
4887         }
4888
4889         # Copy the specials information from the other table to $self
4890         if ($has_specials{$other_addr}) {
4891             $has_specials{$addr} = 1;
4892         }
4893
4894         return;
4895     }
4896
4897     sub set_default_map {
4898         # Define what code points that are missing from the input files should
4899         # map to
4900
4901         my $self = shift;
4902         my $map = shift;
4903         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4904
4905         my $addr = main::objaddr $self;
4906
4907         # Convert the input to the standard equivalent, if any (won't have any
4908         # for $STRING properties)
4909         my $standard = $self->_find_table_from_alias->{$map};
4910         $map = $standard->name if defined $standard;
4911
4912         # Warn if there already is a non-equivalent default map for this
4913         # property.  Note that a default map can be a ref, which means that
4914         # what it actually means is delayed until later in the program, and it
4915         # IS permissible to override it here without a message.
4916         my $default_map = $default_map{$addr};
4917         if (defined $default_map
4918             && ! ref($default_map)
4919             && $default_map ne $map
4920             && main::Standardize($map) ne $default_map)
4921         {
4922             my $property = $self->property;
4923             my $map_table = $property->table($map);
4924             my $default_table = $property->table($default_map);
4925             if (defined $map_table
4926                 && defined $default_table
4927                 && $map_table != $default_table)
4928             {
4929                 Carp::my_carp("Changing the default mapping for "
4930                             . $property
4931                             . " from $default_map to $map'");
4932             }
4933         }
4934
4935         $default_map{$addr} = $map;
4936
4937         # Don't also create any missing table for this map at this point,
4938         # because if we did, it could get done before the main table add is
4939         # done for PropValueAliases.txt; instead the caller will have to make
4940         # sure it exists, if desired.
4941         return;
4942     }
4943
4944     sub to_output_map {
4945         # Returns boolean: should we write this map table?
4946
4947         my $self = shift;
4948         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4949
4950         my $addr = main::objaddr $self;
4951
4952         # If overridden, use that
4953         return $to_output_map{$addr} if defined $to_output_map{$addr};
4954
4955         my $full_name = $self->full_name;
4956
4957         # If table says to output, do so; if says to suppress it, do do.
4958         return 1 if grep { $_ eq $full_name } @output_mapped_properties;
4959         return 0 if $self->status eq $SUPPRESSED;
4960
4961         my $type = $self->property->type;
4962
4963         # Don't want to output binary map tables even for debugging.
4964         return 0 if $type == $BINARY;
4965
4966         # But do want to output string ones.
4967         return 1 if $type == $STRING;
4968
4969         # Otherwise is an $ENUM, don't output it
4970         return 0;
4971     }
4972
4973     sub inverse_list {
4974         # Returns a Range_List that is gaps of the current table.  That is,
4975         # the inversion
4976
4977         my $self = shift;
4978         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4979
4980         my $current = Range_List->new(Initialize => $self->_range_list,
4981                                 Owner => $self->property);
4982         return ~ $current;
4983     }
4984
4985     sub set_final_comment {
4986         # Just before output, create the comment that heads the file
4987         # containing this table.
4988
4989         my $self = shift;
4990         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4991
4992         # No sense generating a comment if aren't going to write it out.
4993         return if ! $self->to_output_map;
4994
4995         my $addr = main::objaddr $self;
4996
4997         my $property = $self->property;
4998
4999         # Get all the possible names for this property.  Don't use any that
5000         # aren't ok for use in a file name, etc.  This is perhaps causing that
5001         # flag to do double duty, and may have to be changed in the future to
5002         # have our own flag for just this purpose; but it works now to exclude
5003         # Perl generated synonyms from the lists for properties, where the
5004         # name is always the proper Unicode one.
5005         my @property_aliases = grep { $_->externally_ok } $self->aliases;
5006
5007         my $count = $self->count;
5008         my $default_map = $default_map{$addr};
5009
5010         # The ranges that map to the default aren't output, so subtract that
5011         # to get those actually output.  A property with matching tables
5012         # already has the information calculated.
5013         if ($property->type != $STRING) {
5014             $count -= $property->table($default_map)->count;
5015         }
5016         elsif (defined $default_map) {
5017
5018             # But for $STRING properties, must calculate now.  Subtract the
5019             # count from each range that maps to the default.
5020             foreach my $range ($self->_range_list->ranges) {
5021                 if ($range->value eq $default_map) {
5022                     $count -= $range->end +1 - $range->start;
5023                 }
5024             }
5025
5026         }
5027
5028         # Get a  string version of $count with underscores in large numbers,
5029         # for clarity.
5030         my $string_count = main::clarify_number($count);
5031
5032         my $code_points = ($count == 1)
5033                         ? 'single code point'
5034                         : "$string_count code points";
5035
5036         my $mapping;
5037         my $these_mappings;
5038         my $are;
5039         if (@property_aliases <= 1) {
5040             $mapping = 'mapping';
5041             $these_mappings = 'this mapping';
5042             $are = 'is'
5043         }
5044         else {
5045             $mapping = 'synonymous mappings';
5046             $these_mappings = 'these mappings';
5047             $are = 'are'
5048         }
5049         my $cp;
5050         if ($count >= $MAX_UNICODE_CODEPOINTS) {
5051             $cp = "any code point in Unicode Version $string_version";
5052         }
5053         else {
5054             my $map_to;
5055             if ($default_map eq "") {
5056                 $map_to = 'the null string';
5057             }
5058             elsif ($default_map eq $CODE_POINT) {
5059                 $map_to = "itself";
5060             }
5061             else {
5062                 $map_to = "'$default_map'";
5063             }
5064             if ($count == 1) {
5065                 $cp = "the single code point";
5066             }
5067             else {
5068                 $cp = "one of the $code_points";
5069             }
5070             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5071         }
5072
5073         my $comment = "";
5074
5075         my $status = $self->status;
5076         if ($status) {
5077             my $warn = uc $status_past_participles{$status};
5078             $comment .= <<END;
5079
5080 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
5081  All property or property=value combinations contained in this file are $warn.
5082  See $unicode_reference_url for what this means.
5083
5084 END
5085         }
5086         $comment .= "This file returns the $mapping:\n";
5087
5088         for my $i (0 .. @property_aliases - 1) {
5089             $comment .= sprintf("%-8s%s\n",
5090                                 " ",
5091                                 $property_aliases[$i]->name . '(cp)'
5092                                 );
5093         }
5094         $comment .=
5095                 "\nwhere 'cp' is $cp.  Note that $these_mappings $are ";
5096
5097         my $access = $core_access{$addr};
5098         if ($access) {
5099             $comment .= "accessible through the Perl core via $access.";
5100         }
5101         else {
5102             $comment .= "not accessible through the Perl core directly.";
5103         }
5104
5105         # And append any commentary already set from the actual property.
5106         $comment .= "\n\n" . $self->comment if $self->comment;
5107         if ($self->description) {
5108             $comment .= "\n\n" . join " ", $self->description;
5109         }
5110         if ($self->note) {
5111             $comment .= "\n\n" . join " ", $self->note;
5112         }
5113         $comment .= "\n";
5114
5115         if (! $self->perl_extension) {
5116             $comment .= <<END;
5117
5118 For information about what this property really means, see:
5119 $unicode_reference_url
5120 END
5121         }
5122
5123         if ($count) {        # Format differs for empty table
5124                 $comment.= "\nThe format of the ";
5125             if ($self->range_size_1) {
5126                 $comment.= <<END;
5127 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5128 is in hex; MAPPING is what CODE_POINT maps to.
5129 END
5130             }
5131             else {
5132
5133                 # There are tables which end up only having one element per
5134                 # range, but it is not worth keeping track of for making just
5135                 # this comment a little better.
5136                 $comment.= <<END;
5137 non-comment portions of the main body of lines of this file is:
5138 START\\tSTOP\\tMAPPING where START is the starting code point of the
5139 range, in hex; STOP is the ending point, or if omitted, the range has just one
5140 code point; MAPPING is what each code point between START and STOP maps to.
5141 END
5142                 if ($output_range_counts) {
5143                     $comment .= <<END;
5144 Numbers in comments in [brackets] indicate how many code points are in the
5145 range (omitted when the range is a single code point or if the mapping is to
5146 the null string).
5147 END
5148                 }
5149             }
5150         }
5151         $self->set_comment(main::join_lines($comment));
5152         return;
5153     }
5154
5155     my %swash_keys; # Makes sure don't duplicate swash names.
5156
5157     sub pre_body {
5158         # Returns the string that should be output in the file before the main
5159         # body of this table.  This includes some hash entries identifying the
5160         # format of the body, and what the single value should be for all
5161         # ranges missing from it.  It also includes any code points which have
5162         # map_types that don't go in the main table.
5163
5164         my $self = shift;
5165         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5166
5167         my $addr = main::objaddr $self;
5168
5169         my $name = $self->property->swash_name;
5170
5171         if (defined $swash_keys{$name}) {
5172             Carp::my_carp(join_lines(<<END
5173 Already created a swash name '$name' for $swash_keys{$name}.  This means that
5174 the same name desired for $self shouldn't be used.  Bad News.  This must be
5175 fixed before production use, but proceeding anyway
5176 END
5177             ));
5178         }
5179         $swash_keys{$name} = "$self";
5180
5181         my $default_map = $default_map{$addr};
5182
5183         my $pre_body = "";
5184         if ($has_specials{$addr}) {
5185
5186             # Here, some maps with non-zero type have been added to the table.
5187             # Go through the table and handle each of them.  None will appear
5188             # in the body of the table, so delete each one as we go.  The
5189             # code point count has already been calculated, so ok to delete
5190             # now.
5191
5192             my @multi_code_point_maps;
5193             my $has_hangul_syllables = 0;
5194
5195             # The key is the base name of the code point, and the value is an
5196             # array giving all the ranges that use this base name.  Each range
5197             # is actually a hash giving the 'low' and 'high' values of it.
5198             my %names_ending_in_code_point;
5199
5200             # Inverse mapping.  The list of ranges that have these kinds of
5201             # names.  Each element contains the low, high, and base names in a
5202             # hash.
5203             my @code_points_ending_in_code_point;
5204
5205             my $range_map = $self->_range_list;
5206             foreach my $range ($range_map->ranges) {
5207                 next unless $range->type != 0;
5208                 my $low = $range->start;
5209                 my $high = $range->end;
5210                 my $map = $range->value;
5211                 my $type = $range->type;
5212
5213                 # No need to output the range if it maps to the default.  And
5214                 # the write method won't output it either, so no need to
5215                 # delete it to keep it from being output, and is faster to
5216                 # skip than to delete anyway.
5217                 next if $map eq $default_map;
5218
5219                 # Delete the range to keep write() from trying to output it
5220                 $range_map->delete_range($low, $high);
5221
5222                 # Switch based on the map type...
5223                 if ($type == $HANGUL_SYLLABLE) {
5224
5225                     # These are entirely algorithmically determinable based on
5226                     # some constants furnished by Unicode; for now, just set a
5227                     # flag to indicate that have them.  Below we will output
5228                     # the code that does the algorithm.
5229                     $has_hangul_syllables = 1;
5230                 }
5231                 elsif ($type == $CP_IN_NAME) {
5232
5233                     # If the name ends in the code point it represents, are
5234                     # also algorithmically determinable, but need information
5235                     # about the map to do so.  Both the map and its inverse
5236                     # are stored in data structures output in the file.
5237                     push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5238                     push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5239
5240                     push @code_points_ending_in_code_point, { low => $low,
5241                                                               high => $high,
5242                                                               name => $map
5243                                                             };
5244                 }
5245                 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5246
5247                     # Multi-code point maps and null string maps have an entry
5248                     # for each code point in the range.  They use the same
5249                     # output format.
5250                     for my $code_point ($low .. $high) {
5251
5252                         # The pack() below can't cope with surrogates.
5253                         if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5254                             Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self.  No map created");
5255                             next;
5256                         }
5257
5258                         # Generate the hash entries for these in the form that
5259                         # utf8.c understands.
5260                         my $tostr = "";
5261                         foreach my $to (split " ", $map) {
5262                             if ($to !~ /^$code_point_re$/) {
5263                                 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
5264                                 next;
5265                             }
5266                             $tostr .= sprintf "\\x{%s}", $to;
5267                         }
5268
5269                         # I (khw) have never waded through this line to
5270                         # understand it well enough to comment it.
5271                         my $utf8 = sprintf(qq["%s" => "$tostr",],
5272                                 join("", map { sprintf "\\x%02X", $_ }
5273                                     unpack("U0C*", pack("U", $code_point))));
5274
5275                         # Add a comment so that a human reader can more easily
5276                         # see what's going on.
5277                         push @multi_code_point_maps,
5278                                 sprintf("%-45s # U+%04X => %s", $utf8,
5279                                                                 $code_point,
5280                                                                 $map);
5281                     }
5282                 }
5283                 else {
5284                     Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Using type 0 instead");
5285                     $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0);
5286                 }
5287             } # End of loop through all ranges
5288
5289             # Here have gone through the whole file.  If actually generated
5290             # anything for each map type, add its respective header and
5291             # trailer
5292             if (@multi_code_point_maps) {
5293                 $pre_body .= <<END;
5294
5295 # Some code points require special handling because their mappings are each to
5296 # multiple code points.  These do not appear in the main body, but are defined
5297 # in the hash below.
5298
5299 # The key: UTF-8 _bytes_, the value: UTF-8 (speed hack)
5300 %utf8::ToSpec$name = (
5301 END
5302                 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5303             }
5304
5305             if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5306
5307                 # Convert these structures to output format.
5308                 my $code_points_ending_in_code_point =
5309                     main::simple_dumper(\@code_points_ending_in_code_point,
5310                                         ' ' x 8);
5311                 my $names = main::simple_dumper(\%names_ending_in_code_point,
5312                                                 ' ' x 8);
5313
5314                 # Do the same with the Hangul names,
5315                 my $jamo;
5316                 my $jamo_l;
5317                 my $jamo_v;
5318                 my $jamo_t;
5319                 my $jamo_re;
5320                 if ($has_hangul_syllables) {
5321
5322                     # Construct a regular expression of all the possible
5323                     # combinations of the Hangul syllables.
5324                     my @L_re;   # Leading consonants
5325                     for my $i ($LBase .. $LBase + $LCount - 1) {
5326                         push @L_re, $Jamo{$i}
5327                     }
5328                     my @V_re;   # Middle vowels
5329                     for my $i ($VBase .. $VBase + $VCount - 1) {
5330                         push @V_re, $Jamo{$i}
5331                     }
5332                     my @T_re;   # Trailing consonants
5333                     for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5334                         push @T_re, $Jamo{$i}
5335                     }
5336
5337                     # The whole re is made up of the L V T combination.
5338                     $jamo_re = '('
5339                                . join ('|', sort @L_re)
5340                                . ')('
5341                                . join ('|', sort @V_re)
5342                                . ')('
5343                                . join ('|', sort @T_re)
5344                                . ')?';
5345
5346                     # These hashes needed by the algorithm were generated
5347                     # during reading of the Jamo.txt file
5348                     $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5349                     $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5350                     $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5351                     $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5352                 }
5353
5354                 $pre_body .= <<END;
5355
5356 # To achieve significant memory savings when this file is read in,
5357 # algorithmically derivable code points are omitted from the main body below.
5358 # Instead, the following routines can be used to translate between name and
5359 # code point and vice versa
5360
5361 { # Closure
5362
5363     # Matches legal code point.  4-6 hex numbers, If there are 6, the
5364     # first two must be '10'; if there are 5, the first must not be a '0'.
5365     my \$code_point_re = qr/$code_point_re/;
5366
5367     # In the following hash, the keys are the bases of names which includes
5368     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The values
5369     # of each key is another hash which is used to get the low and high ends
5370     # for each range of code points that apply to the name
5371     my %names_ending_in_code_point = (
5372 $names
5373     );
5374
5375     # And the following array gives the inverse mapping from code points to
5376     # names.  Lowest code points are first
5377     my \@code_points_ending_in_code_point = (
5378 $code_points_ending_in_code_point
5379     );
5380 END
5381                 # Earlier releases didn't have Jamos.  No sense outputting
5382                 # them unless will be used.
5383                 if ($has_hangul_syllables) {
5384                     $pre_body .= <<END;
5385
5386     # Convert from code point to Jamo short name for use in composing Hangul
5387     # syllable names
5388     my %Jamo = (
5389 $jamo
5390     );
5391
5392     # Leading consonant (can be null)
5393     my %Jamo_L = (
5394 $jamo_l
5395     );
5396
5397     # Vowel
5398     my %Jamo_V = (
5399 $jamo_v
5400     );
5401
5402     # Optional trailing consonant
5403     my %Jamo_T = (
5404 $jamo_t
5405     );
5406
5407     # Computed re that splits up a Hangul name into LVT or LV syllables
5408     my \$syllable_re = qr/$jamo_re/;
5409
5410     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
5411     my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
5412
5413     # These constants names and values were taken from the Unicode standard,
5414     # version 5.1, section 3.12.  They are used in conjunction with Hangul
5415     # syllables
5416     my \$SBase = 0xAC00;
5417     my \$LBase = 0x1100;
5418     my \$VBase = 0x1161;
5419     my \$TBase = 0x11A7;
5420     my \$SCount = 11172;
5421     my \$LCount = 19;
5422     my \$VCount = 21;
5423     my \$TCount = 28;
5424     my \$NCount = \$VCount * \$TCount;
5425 END
5426                 } # End of has Jamos
5427
5428                 $pre_body .= << 'END';
5429
5430     sub name_to_code_point_special {
5431         my $name = shift;
5432
5433         # Returns undef if not one of the specially handled names; otherwise
5434         # returns the code point equivalent to the input name
5435 END
5436                 if ($has_hangul_syllables) {
5437                     $pre_body .= << 'END';
5438
5439         if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
5440             $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
5441             return if $name !~ qr/^$syllable_re$/;
5442             my $L = $Jamo_L{$1};
5443             my $V = $Jamo_V{$2};
5444             my $T = (defined $3) ? $Jamo_T{$3} : 0;
5445             return ($L * $VCount + $V) * $TCount + $T + $SBase;
5446         }
5447 END
5448                 }
5449                 $pre_body .= << 'END';
5450
5451         # Name must end in '-code_point' for this to handle.
5452         if ($name !~ /^ (.*) - ($code_point_re) $/x) {
5453             return;
5454         }
5455
5456         my $base = $1;
5457         my $code_point = CORE::hex $2;
5458
5459         # Name must be one of the ones which has the code point in it.
5460         return if ! $names_ending_in_code_point{$base};
5461
5462         # Look through the list of ranges that apply to this name to see if
5463         # the code point is in one of them.
5464         for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
5465             return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
5466             next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
5467
5468             # Here, the code point is in the range.
5469             return $code_point;
5470         }
5471
5472         # Here, looked like the name had a code point number in it, but
5473         # did not match one of the valid ones.
5474         return;
5475     }
5476
5477     sub code_point_to_name_special {
5478         my $code_point = shift;
5479
5480         # Returns the name of a code point if algorithmically determinable;
5481         # undef if not
5482 END
5483                 if ($has_hangul_syllables) {
5484                     $pre_body .= << 'END';
5485
5486         # If in the Hangul range, calculate the name based on Unicode's
5487         # algorithm
5488         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
5489             use integer;
5490             my $SIndex = $code_point - $SBase;
5491             my $L = $LBase + $SIndex / $NCount;
5492             my $V = $VBase + ($SIndex % $NCount) / $TCount;
5493             my $T = $TBase + $SIndex % $TCount;
5494             $name = "$HANGUL_SYLLABLE $Jamo{$L}$Jamo{$V}";
5495             $name .= $Jamo{$T} if $T != $TBase;
5496             return $name;
5497         }
5498 END
5499                 }
5500                 $pre_body .= << 'END';
5501
5502         # Look through list of these code points for one in range.
5503         foreach my $hash (@code_points_ending_in_code_point) {
5504             return if $code_point < $hash->{'low'};
5505             if ($code_point <= $hash->{'high'}) {
5506                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
5507             }
5508         }
5509         return;            # None found
5510     }
5511 } # End closure
5512
5513 END
5514             } # End of has hangul or code point in name maps.
5515         } # End of has specials
5516
5517         # Calculate the format of the table if not already done.
5518         my $format = $format{$addr};
5519         my $property = $self->property;
5520         my $type = $property->type;
5521         if (! defined $format) {
5522             if ($type == $BINARY) {
5523
5524                 # Don't bother checking the values, because we elsewhere
5525                 # verify that a binary table has only 2 values.
5526                 $format = $BINARY_FORMAT;
5527             }
5528             else {
5529                 my @ranges = $self->_range_list->ranges;
5530
5531                 # default an empty table based on its type and default map
5532                 if (! @ranges) {
5533
5534                     # But it turns out that the only one we can say is a
5535                     # non-string (besides binary, handled above) is when the
5536                     # table is a string and the default map is to a code point
5537                     if ($type == $STRING && $default_map eq $CODE_POINT) {
5538                         $format = $HEX_FORMAT;
5539                     }
5540                     else {
5541                         $format = $STRING_FORMAT;
5542                     }
5543                 }
5544                 else {
5545
5546                     # Start with the most restrictive format, and as we find
5547                     # something that doesn't fit with that, change to the next
5548                     # most restrictive, and so on.
5549                     $format = $DECIMAL_FORMAT;
5550                     foreach my $range (@ranges) {
5551                         my $map = $range->value;
5552                         if ($map ne $default_map) {
5553                             last if $format eq $STRING_FORMAT;  # already at
5554                                                                 # least
5555                                                                 # restrictive
5556                             $format = $INTEGER_FORMAT
5557                                                 if $format eq $DECIMAL_FORMAT
5558                                                     && $map !~ / ^ [0-9] $ /x;
5559                             $format = $FLOAT_FORMAT
5560                                             if $format eq $INTEGER_FORMAT
5561                                                 && $map !~ / ^ -? [0-9]+ $ /x;
5562                             $format = $RATIONAL_FORMAT
5563                                 if $format eq $FLOAT_FORMAT
5564                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
5565                             $format = $HEX_FORMAT
5566                             if $format eq $RATIONAL_FORMAT
5567                                 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
5568                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
5569                                                        && $map =~ /[^0-9A-F]/;
5570                         }
5571                     }
5572                 }
5573             }
5574         } # end of calculating format
5575
5576         my $return = <<END;
5577 # The name this swash is to be known by, with the format of the mappings in
5578 # the main body of the table, and what all code points missing from this file
5579 # map to.
5580 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
5581 END
5582         my $missing = $default_map;
5583         if ($missing eq $CODE_POINT
5584             && $format ne $HEX_FORMAT
5585             && ! defined $format{$addr})    # Is expected if was manually set
5586         {
5587             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
5588         }
5589         $format{$addr} = $format;
5590         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$missing';";
5591         if ($missing eq $CODE_POINT) {
5592             $return .= ' # code point maps to itself';
5593         }
5594         elsif ($missing eq "") {
5595             $return .= ' # code point maps to the null string';
5596         }
5597         $return .= "\n";
5598
5599         $return .= $pre_body;
5600
5601         return $return;
5602     }
5603
5604     sub write {
5605         # Write the table to the file.
5606
5607         my $self = shift;
5608         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5609
5610         my $addr = main::objaddr $self;
5611
5612         return $self->SUPER::write(
5613             ($self->property == $block)
5614                 ? 7     # block file needs more tab stops
5615                 : 3,
5616             $default_map{$addr});   # don't write defaulteds
5617     }
5618
5619     # Accessors for the underlying list that should fail if locked.
5620     for my $sub qw(
5621                     add_duplicate
5622                 )
5623     {
5624         no strict "refs";
5625         *$sub = sub {
5626             use strict "refs";
5627             my $self = shift;
5628
5629             return if $self->carp_if_locked;
5630             return $self->_range_list->$sub(@_);
5631         }
5632     }
5633 } # End closure for Map_Table
5634
5635 package Match_Table;
5636 use base '_Base_Table';
5637
5638 # A Match table is one which is a list of all the code points that have
5639 # the same property and property value, for use in \p{property=value}
5640 # constructs in regular expressions.  It adds very little data to the base
5641 # structure, but many methods, as these lists can be combined in many ways to
5642 # form new ones.
5643 # There are only a few concepts added:
5644 # 1) Equivalents and Relatedness.
5645 #    Two tables can match the identical code points, but have different names.
5646 #    This always happens when there is a perl single form extension
5647 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
5648 #    tables are set to be related, with the Perl extension being a child, and
5649 #    the Unicode property being the parent.
5650 #
5651 #    It may be that two tables match the identical code points and we don't
5652 #    know if they are related or not.  This happens most frequently when the
5653 #    Block and Script properties have the exact range.  But note that a
5654 #    revision to Unicode could add new code points to the script, which would
5655 #    now have to be in a different block (as the block was filled, or there
5656 #    would have been 'Unknown' script code points in it and they wouldn't have
5657 #    been identical).  So we can't rely on any two properties from Unicode
5658 #    always matching the same code points from release to release, and thus
5659 #    these tables are considered coincidentally equivalent--not related.  When
5660 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
5661 #    'leader', and the others are 'equivalents'.  This concept is useful
5662 #    to minimize the number of tables written out.  Only one file is used for
5663 #    any identical set of code points, with entries in Heavy.pl mapping all
5664 #    the involved tables to it.
5665 #
5666 #    Related tables will always be identical; we set them up to be so.  Thus
5667 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
5668 #    unrelated tables.  Relatedness makes generating the documentation easier.
5669 #
5670 # 2) Conflicting.  It may be that there will eventually be name clashes, with
5671 #    the same name meaning different things.  For a while, there actually were
5672 #    conflicts, but they have so far been resolved by changing Perl's or
5673 #    Unicode's definitions to match the other, but when this code was written,
5674 #    it wasn't clear that that was what was going to happen.  (Unicode changed
5675 #    because of protests during their beta period.)  Name clashes are warned
5676 #    about during compilation, and the documentation.  The generated tables
5677 #    are sane, free of name clashes, because the code suppresses the Perl
5678 #    version.  But manual intervention to decide what the actual behavior
5679 #    should be may be required should this happen.  The introductory comments
5680 #    have more to say about this.
5681
5682 sub standardize { return main::standardize($_[0]); }
5683 sub trace { return main::trace(@_); }
5684
5685
5686 { # Closure
5687
5688     main::setup_package();
5689
5690     my %leader;
5691     # The leader table of this one; initially $self.
5692     main::set_access('leader', \%leader, 'r');
5693
5694     my %equivalents;
5695     # An array of any tables that have this one as their leader
5696     main::set_access('equivalents', \%equivalents, 'readable_array');
5697
5698     my %parent;
5699     # The parent table to this one, initially $self.  This allows us to
5700     # distinguish between equivalent tables that are related, and those which
5701     # may not be, but share the same output file because they match the exact
5702     # same set of code points in the current Unicode release.
5703     main::set_access('parent', \%parent, 'r');
5704
5705     my %children;
5706     # An array of any tables that have this one as their parent
5707     main::set_access('children', \%children, 'readable_array');
5708
5709     my %conflicting;
5710     # Array of any tables that would have the same name as this one with
5711     # a different meaning.  This is used for the generated documentation.
5712     main::set_access('conflicting', \%conflicting, 'readable_array');
5713
5714     my %matches_all;
5715     # Set in the constructor for tables that are expected to match all code
5716     # points.
5717     main::set_access('matches_all', \%matches_all, 'r');
5718
5719     sub new {
5720         my $class = shift;
5721
5722         my %args = @_;
5723
5724         # The property for which this table is a listing of property values.
5725         my $property = delete $args{'_Property'};
5726
5727         my $name = delete $args{'Name'};
5728         my $full_name = delete $args{'Full_Name'};
5729         $full_name = $name if ! defined $full_name;
5730
5731         # Optional
5732         my $initialize = delete $args{'Initialize'};
5733         my $matches_all = delete $args{'Matches_All'} || 0;
5734         # Rest of parameters passed on.
5735
5736         my $range_list = Range_List->new(Initialize => $initialize,
5737                                          Owner => $property);
5738
5739         my $complete = $full_name;
5740         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
5741                                               # but this helps debug if it
5742                                               # does
5743         # The complete name for a match table includes it's property in a
5744         # compound form 'property=table', except if the property is the
5745         # pseudo-property, perl, in which case it is just the single form,
5746         # 'table' (If you change the '=' must also change the ':' in lots of
5747         # places in this program that assume an equal sign)
5748         $complete = $property->full_name . "=$complete" if $property != $perl;
5749
5750
5751         my $self = $class->SUPER::new(%args,
5752                                       Name => $name,
5753                                       Complete_Name => $complete,
5754                                       Full_Name => $full_name,
5755                                       _Property => $property,
5756                                       _Range_List => $range_list,
5757                                       );
5758         my $addr = main::objaddr $self;
5759
5760         $conflicting{$addr} = [ ];
5761         $equivalents{$addr} = [ ];
5762         $children{$addr} = [ ];
5763         $matches_all{$addr} = $matches_all;
5764         $leader{$addr} = $self;
5765         $parent{$addr} = $self;
5766
5767         return $self;
5768     }
5769
5770     # See this program's beginning comment block about overloading these.
5771     use overload
5772         fallback => 0,
5773         qw("") => "_operator_stringify",
5774         '=' => sub {
5775                     my $self = shift;
5776
5777                     return if $self->carp_if_locked;
5778                     return $self;
5779                 },
5780
5781         '+' => sub {
5782                         my $self = shift;
5783                         my $other = shift;
5784
5785                         return $self->_range_list + $other;
5786                     },
5787         '&' => sub {
5788                         my $self = shift;
5789                         my $other = shift;
5790
5791                         return $self->_range_list & $other;
5792                     },
5793         '+=' => sub {
5794                         my $self = shift;
5795                         my $other = shift;
5796
5797                         return if $self->carp_if_locked;
5798
5799                         my $addr = main::objaddr $self;
5800
5801                         if (ref $other) {
5802
5803                             # Change the range list of this table to be the
5804                             # union of the two.
5805                             $self->_set_range_list($self->_range_list
5806                                                     + $other);
5807                         }
5808                         else {    # $other is just a simple value
5809                             $self->add_range($other, $other);
5810                         }
5811                         return $self;
5812                     },
5813         '-' => sub { my $self = shift;
5814                     my $other = shift;
5815                     my $reversed = shift;
5816
5817                     if ($reversed) {
5818                         Carp::my_carp_bug("Can't cope with a "
5819                             .  __PACKAGE__
5820                             . " being the first parameter in a '-'.  Subtraction ignored.");
5821                         return;
5822                     }
5823
5824                     return $self->_range_list - $other;
5825                 },
5826         '~' => sub { my $self = shift;
5827                     return ~ $self->_range_list;
5828                 },
5829     ;
5830
5831     sub _operator_stringify {
5832         my $self = shift;
5833
5834         my $name = $self->complete_name;
5835         return "Table '$name'";
5836     }
5837
5838     sub add_alias {
5839         # Add a synonym for this table.  See the comments in the base class
5840
5841         my $self = shift;
5842         my $name = shift;
5843         # Rest of parameters passed on.
5844
5845         $self->SUPER::add_alias($name, $self, @_);
5846         return;
5847     }
5848
5849     sub add_conflicting {
5850         # Add the name of some other object to the list of ones that name
5851         # clash with this match table.
5852
5853         my $self = shift;
5854         my $conflicting_name = shift;   # The name of the conflicting object
5855         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
5856         my $conflicting_object = shift; # Optional, the conflicting object
5857                                         # itself.  This is used to
5858                                         # disambiguate the text if the input
5859                                         # name is identical to any of the
5860                                         # aliases $self is known by.
5861                                         # Sometimes the conflicting object is
5862                                         # merely hypothetical, so this has to
5863                                         # be an optional parameter.
5864         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5865
5866         my $addr = main::objaddr $self;
5867
5868         # Check if the conflicting name is exactly the same as any existing
5869         # alias in this table (as long as there is a real object there to
5870         # disambiguate with).
5871         if (defined $conflicting_object) {
5872             foreach my $alias ($self->aliases) {
5873                 if ($alias->name eq $conflicting_name) {
5874
5875                     # Here, there is an exact match.  This results in
5876                     # ambiguous comments, so disambiguate by changing the
5877                     # conflicting name to its object's complete equivalent.
5878                     $conflicting_name = $conflicting_object->complete_name;
5879                     last;
5880                 }
5881             }
5882         }
5883
5884         # Convert to the \p{...} final name
5885         $conflicting_name = "\\$p" . "{$conflicting_name}";
5886
5887         # Only add once
5888         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
5889
5890         push @{$conflicting{$addr}}, $conflicting_name;
5891
5892         return;
5893     }
5894
5895     sub is_equivalent_to {
5896         # Return boolean of whether or not the other object is a table of this
5897         # type and has been marked equivalent to this one.
5898
5899         my $self = shift;
5900         my $other = shift;
5901         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5902
5903         return 0 if ! defined $other; # Can happen for incomplete early
5904                                       # releases
5905         unless ($other->isa(__PACKAGE__)) {
5906             my $ref_other = ref $other;
5907             my $ref_self = ref $self;
5908             Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
5909             return 0;
5910         }
5911
5912         # Two tables are equivalent if they have the same leader.
5913         return $leader{main::objaddr $self}
5914                 == $leader{main::objaddr $other};
5915         return;
5916     }
5917
5918     sub matches_identically_to {
5919         # Return a boolean as to whether or not two tables match identical
5920         # sets of code points.
5921
5922         my $self = shift;
5923         my $other = shift;
5924         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5925
5926         unless ($other->isa(__PACKAGE__)) {
5927             my $ref_other = ref $other;
5928             my $ref_self = ref $self;
5929             Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
5930             return 0;
5931         }
5932
5933         # These are ordered in increasing real time to figure out (at least
5934         # until a patch changes that and doesn't change this)
5935         return 0 if $self->max != $other->max;
5936         return 0 if $self->min != $other->min;
5937         return 0 if $self->range_count != $other->range_count;
5938         return 0 if $self->count != $other->count;
5939
5940         # Here they could be identical because all the tests above passed.
5941         # The loop below is somewhat simpler since we know they have the same
5942         # number of elements.  Compare range by range, until reach the end or
5943         # find something that differs.
5944         my @a_ranges = $self->_range_list->ranges;
5945         my @b_ranges = $other->_range_list->ranges;
5946         for my $i (0 .. @a_ranges - 1) {
5947             my $a = $a_ranges[$i];
5948             my $b = $b_ranges[$i];
5949             trace "self $a; other $b" if main::DEBUG && $to_trace;
5950             return 0 if $a->start != $b->start || $a->end != $b->end;
5951         }
5952         return 1;
5953     }
5954
5955     sub set_equivalent_to {
5956         # Set $self equivalent to the parameter table.
5957         # The required Related => 'x' parameter is a boolean indicating
5958         # whether these tables are related or not.  If related, $other becomes
5959         # the 'parent' of $self; if unrelated it becomes the 'leader'
5960         #
5961         # Related tables share all characteristics except names; equivalents
5962         # not quite so many.
5963         # If they are related, one must be a perl extension.  This is because
5964         # we can't guarantee that Unicode won't change one or the other in a
5965         # later release even if they are idential now.
5966
5967         my $self = shift;
5968         my $other = shift;
5969
5970         my %args = @_;
5971         my $related = delete $args{'Related'};
5972
5973         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5974
5975         return if ! defined $other;     # Keep on going; happens in some early
5976                                         # Unicode releases.
5977
5978         if (! defined $related) {
5979             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
5980             $related = 0;
5981         }
5982
5983         # If already are equivalent, no need to re-do it;  if subroutine
5984         # returns null, it found an error, also do nothing
5985         my $are_equivalent = $self->is_equivalent_to($other);
5986         return if ! defined $are_equivalent || $are_equivalent;
5987
5988         my $current_leader = ($related)
5989                              ? $parent{main::objaddr $self}
5990                              : $leader{main::objaddr $self};
5991
5992         if ($related &&
5993             ! $other->perl_extension
5994             && ! $current_leader->perl_extension)
5995         {
5996             Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
5997             $related = 0;
5998         }
5999
6000         my $leader = main::objaddr $current_leader;
6001         my $other_addr = main::objaddr $other;
6002
6003         # Any tables that are equivalent to or children of this table must now
6004         # instead be equivalent to or (children) to the new leader (parent),
6005         # still equivalent.  The equivalency includes their matches_all info,
6006         # and for related tables, their status
6007         # All related tables are of necessity equivalent, but the converse
6008         # isn't necessarily true
6009         my $status = $other->status;
6010         my $status_info = $other->status_info;
6011         my $matches_all = $matches_all{other_addr};
6012         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6013             next if $table == $other;
6014             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6015
6016             my $table_addr = main::objaddr $table;
6017             $leader{$table_addr} = $other;
6018             $matches_all{$table_addr} = $matches_all;
6019             $self->_set_range_list($other->_range_list);
6020             push @{$equivalents{$other_addr}}, $table;
6021             if ($related) {
6022                 $parent{$table_addr} = $other;
6023                 push @{$children{$other_addr}}, $table;
6024                 $table->set_status($status, $status_info);
6025             }
6026         }
6027
6028         # Now that we've declared these to be equivalent, any changes to one
6029         # of the tables would invalidate that equivalency.
6030         $self->lock;
6031         $other->lock;
6032         return;
6033     }
6034
6035     sub add_range { # Add a range to the list for this table.
6036         my $self = shift;
6037         # Rest of parameters passed on
6038
6039         return if $self->carp_if_locked;
6040         return $self->_range_list->add_range(@_);
6041     }
6042
6043     sub pre_body {  # Does nothing for match tables.
6044         return
6045     }
6046
6047     sub append_to_body {  # Does nothing for match tables.
6048         return
6049     }
6050
6051     sub write {
6052         my $self = shift;
6053         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6054
6055         return $self->SUPER::write(2); # 2 tab stops
6056     }
6057
6058     sub set_final_comment {
6059         # This creates a comment for the file that is to hold the match table
6060         # $self.  It is somewhat convoluted to make the English read nicely,
6061         # but, heh, it's just a comment.
6062         # This should be called only with the leader match table of all the
6063         # ones that share the same file.  It lists all such tables, ordered so
6064         # that related ones are together.
6065
6066         my $leader = shift;   # Should only be called on the leader table of
6067                               # an equivalent group
6068         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6069
6070         my $addr = main::objaddr $leader;
6071
6072         if ($leader{$addr} != $leader) {
6073             Carp::my_carp_bug(<<END
6074 set_final_comment() must be called on a leader table, which $leader is not.
6075 It is equivalent to $leader{$addr}.  No comment created
6076 END
6077             );
6078             return;
6079         }
6080
6081         # Get the number of code points matched by each of the tables in this
6082         # file, and add underscores for clarity.
6083         my $count = $leader->count;
6084         my $string_count = main::clarify_number($count);
6085
6086         my $loose_count = 0;        # how many aliases loosely matched
6087         my $compound_name = "";     # ? Are any names compound?, and if so, an
6088                                     # example
6089         my $properties_with_compound_names = 0;    # count of these
6090
6091
6092         my %flags;              # The status flags used in the file
6093         my $total_entries = 0;  # number of entries written in the comment
6094         my $matches_comment = ""; # The portion of the comment about the
6095                                   # \p{}'s
6096         my @global_comments;    # List of all the tables' comments that are
6097                                 # there before this routine was called.
6098
6099         # Get list of all the parent tables that are equivalent to this one
6100         # (including itself).
6101         my @parents = grep { $parent{main::objaddr $_} == $_ }
6102                             main::uniques($leader, @{$equivalents{$addr}});
6103         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
6104                                               # tables
6105
6106         for my $parent (@parents) {
6107
6108             my $property = $parent->property;
6109
6110             # Special case 'N' tables in properties with two match tables when
6111             # the other is a 'Y' one.  These are likely to be binary tables,
6112             # but not necessarily.  In either case, \P{} will match the
6113             # complement of \p{}, and so if something is a synonym of \p, the
6114             # complement of that something will be the synonym of \P.  This
6115             # would be true of any property with just two match tables, not
6116             # just those whose values are Y and N; but that would require a
6117             # little extra work, and there are none such so far in Unicode.
6118             my $perl_p = 'p';        # which is it?  \p{} or \P{}
6119             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
6120
6121             if (scalar $property->tables == 2
6122                 && $parent == $property->table('N')
6123                 && defined (my $yes = $property->table('Y')))
6124             {
6125                 my $yes_addr = main::objaddr $yes;
6126                 @yes_perl_synonyms
6127                     = grep { $_->property == $perl }
6128                                     main::uniques($yes,
6129                                                 $parent{$yes_addr},
6130                                                 $parent{$yes_addr}->children);
6131
6132                 # But these synonyms are \P{} ,not \p{}
6133                 $perl_p = 'P';
6134             }
6135
6136             my @description;        # Will hold the table description
6137             my @note;               # Will hold the table notes.
6138             my @conflicting;        # Will hold the table conflicts.
6139
6140             # Look at the parent, any yes synonyms, and all the children
6141             for my $table ($parent,
6142                            @yes_perl_synonyms,
6143                            @{$children{main::objaddr $parent}})
6144             {
6145                 my $table_addr = main::objaddr $table;
6146                 my $table_property = $table->property;
6147
6148                 # Tables are separated by a blank line to create a grouping.
6149                 $matches_comment .= "\n" if $matches_comment;
6150
6151                 # The table is named based on the property and value
6152                 # combination it is for, like script=greek.  But there may be
6153                 # a number of synonyms for each side, like 'sc' for 'script',
6154                 # and 'grek' for 'greek'.  Any combination of these is a valid
6155                 # name for this table.  In this case, there are three more,
6156                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
6157                 # listing all possible combinations in the comment, we make
6158                 # sure that each synonym occurs at least once, and add
6159                 # commentary that the other combinations are possible.
6160                 my @property_aliases = $table_property->aliases;
6161                 my @table_aliases = $table->aliases;
6162
6163                 Carp::my_carp_bug("$table doesn't have any names.  Proceeding anyway.") unless @table_aliases;
6164
6165                 # The alias lists above are already ordered in the order we
6166                 # want to output them.  To ensure that each synonym is listed,
6167                 # we must use the max of the two numbers.
6168                 my $listed_combos = main::max(scalar @table_aliases,
6169                                                 scalar @property_aliases);
6170                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6171
6172                 my $property_had_compound_name = 0;
6173
6174                 for my $i (0 .. $listed_combos - 1) {
6175                     $total_entries++;
6176
6177                     # The current alias for the property is the next one on
6178                     # the list, or if beyond the end, start over.  Similarly
6179                     # for the table (\p{prop=table})
6180                     my $property_alias = $property_aliases
6181                                             [$i % @property_aliases]->name;
6182                     my $table_alias_object = $table_aliases
6183                                                         [$i % @table_aliases];
6184                     my $table_alias = $table_alias_object->name;
6185                     my $loose_match = $table_alias_object->loose_match;
6186
6187                     if ($table_alias !~ /\D/) { # Clarify large numbers.
6188                         $table_alias = main::clarify_number($table_alias)
6189                     }
6190
6191                     # Add a comment for this alias combination
6192                     my $current_match_comment;
6193                     if ($table_property == $perl) {
6194                         $current_match_comment = "\\$perl_p"
6195                                                     . "{$table_alias}";
6196                     }
6197                     else {
6198                         $current_match_comment
6199                                         = "\\p{$property_alias=$table_alias}";
6200                         $property_had_compound_name = 1;
6201                     }
6202
6203                     # Flag any abnormal status for this table.
6204                     my $flag = $property->status
6205                                 || $table->status
6206                                 || $table_alias_object->status;
6207                     $flags{$flag} = $status_past_participles{$flag} if $flag;
6208
6209                     $loose_count++;
6210
6211                     # Pretty up the comment.  Note the \b; it says don't make
6212                     # this line a continuation.
6213                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
6214                                         $flag,
6215                                         " " x 7,
6216                                         $current_match_comment);
6217                 } # End of generating the entries for this table.
6218
6219                 # Save these for output after this group of related tables.
6220                 push @description, $table->description;
6221                 push @note, $table->note;
6222                 push @conflicting, $table->conflicting;
6223
6224                 # Compute an alternate compound name using the final property
6225                 # synonym and the first table synonym with a colon instead of
6226                 # the equal sign used elsewhere.
6227                 if ($property_had_compound_name) {
6228                     $properties_with_compound_names ++;
6229                     if (! $compound_name || @property_aliases > 1) {
6230                         $compound_name = $property_aliases[-1]->name
6231                                         . ': '
6232                                         . $table_aliases[0]->name;
6233                     }
6234                 }
6235             } # End of looping through all children of this table
6236
6237             # Here have assembled in $matches_comment all the related tables
6238             # to the current parent (preceded by the same info for all the
6239             # previous parents).  Put out information that applies to all of
6240             # the current family.
6241             if (@conflicting) {
6242
6243                 # But output the conflicting information now, as it applies to
6244                 # just this table.
6245                 my $conflicting = join ", ", @conflicting;
6246                 if ($conflicting) {
6247                     $matches_comment .= <<END;
6248
6249     Note that contrary to what you might expect, the above is NOT the same as
6250 END
6251                     $matches_comment .= "any of: " if @conflicting > 1;
6252                     $matches_comment .= "$conflicting\n";
6253                 }
6254             }
6255             if (@description) {
6256                 $matches_comment .= "\n    Meaning: "
6257                                     . join('; ', @description)
6258                                     . "\n";
6259             }
6260             if (@note) {
6261                 $matches_comment .= "\n    Note: "
6262                                     . join("\n    ", @note)
6263                                     . "\n";
6264             }
6265         } # End of looping through all tables
6266
6267
6268         my $code_points;
6269         my $match;
6270         my $any_of_these;
6271         if ($count == 1) {
6272             $match = 'matches';
6273             $code_points = 'single code point';
6274         }
6275         else {
6276             $match = 'match';
6277             $code_points = "$string_count code points";
6278         }
6279
6280         my $synonyms;
6281         my $entries;
6282         if ($total_entries <= 1) {
6283             $synonyms = "";
6284             $entries = 'entry';
6285             $any_of_these = 'this'
6286         }
6287         else {
6288             $synonyms = " any of the following regular expression constructs";
6289             $entries = 'entries';
6290             $any_of_these = 'any of these'
6291         }
6292
6293         my $comment = "";
6294         if ($has_unrelated) {
6295             $comment .= <<END;
6296 This file is for tables that are not necessarily related:  To conserve
6297 resources, every table that matches the identical set of code points in this
6298 version of Unicode uses this file.  Each one is listed in a separate group
6299 below.  It could be that the tables will match the same set of code points in
6300 other Unicode releases, or it could be purely coincidence that they happen to
6301 be the same in Unicode $string_version, and hence may not in other versions.
6302
6303 END
6304         }
6305
6306         if (%flags) {
6307             foreach my $flag (sort keys %flags) {
6308                 $comment .= <<END;
6309 '$flag' below means that this form is $flags{$flag}.  Consult $pod_file.pod
6310 END
6311             }
6312             $comment .= "\n";
6313         }
6314
6315         $comment .= <<END;
6316 This file returns the $code_points in Unicode Version $string_version that
6317 $match$synonyms:
6318
6319 $matches_comment
6320 $pod_file.pod should be consulted for the rules on using $any_of_these,
6321 including if adding or subtracting white space, underscore, and hyphen
6322 characters matters or doesn't matter, and other permissible syntactic
6323 variants.  Upper/lower case distinctions never matter.
6324 END
6325
6326         if ($compound_name) {
6327             $comment .= <<END;
6328
6329 A colon can be substituted for the equals sign, and
6330 END
6331             if ($properties_with_compound_names > 1) {
6332                 $comment .= <<END;
6333 within each group above,
6334 END
6335             }
6336             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6337
6338             # Note the \b below, it says don't make that line a continuation.
6339             $comment .= <<END;
6340 anything to the left of the equals (or colon) can be combined with anything to
6341 the right.  Thus, for example,
6342 $compound_name
6343 \bis also valid.
6344 END
6345         }
6346
6347         # And append any comment(s) from the actual tables.  They are all
6348         # gathered here, so may not read all that well.
6349         $comment .= "\n" . join "\n\n", @global_comments if @global_comments;
6350
6351         if ($count) {   # The format differs if no code points, and needs no
6352                         # explanation in that case
6353                 $comment.= <<END;
6354
6355 The format of the lines of this file is:
6356 END
6357             $comment.= <<END;
6358 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6359 STOP is the ending point, or if omitted, the range has just one code point.
6360 END
6361             if ($output_range_counts) {
6362                 $comment .= <<END;
6363 Numbers in comments in [brackets] indicate how many code points are in the
6364 range.
6365 END
6366             }
6367         }
6368
6369         $leader->set_comment(main::join_lines($comment));
6370         return;
6371     }
6372
6373     # Accessors for the underlying list
6374     for my $sub qw(
6375                     get_valid_code_point
6376                     get_invalid_code_point
6377                 )
6378     {
6379         no strict "refs";
6380         *$sub = sub {
6381             use strict "refs";
6382             my $self = shift;
6383
6384             return $self->_range_list->$sub(@_);
6385         }
6386     }
6387 } # End closure for Match_Table
6388
6389 package Property;
6390
6391 # The Property class represents a Unicode property, or the $perl
6392 # pseudo-property.  It contains a map table initialized empty at construction
6393 # time, and for properties accessible through regular expressions, various
6394 # match tables, created through the add_match_table() method, and referenced
6395 # by the table('NAME') or tables() methods, the latter returning a list of all
6396 # of the match tables.  Otherwise table operations implicitly are for the map
6397 # table.
6398 #
6399 # Most of the data in the property is actually about its map table, so it
6400 # mostly just uses that table's accessors for most methods.  The two could
6401 # have been combined into one object, but for clarity because of their
6402 # differing semantics, they have been kept separate.  It could be argued that
6403 # the 'file' and 'directory' fields should be kept with the map table.
6404 #
6405 # Each property has a type.  This can be set in the constructor, or in the
6406 # set_type accessor, but mostly it is figured out by the data.  Every property
6407 # starts with unknown type, overridden by a parameter to the constructor, or
6408 # as match tables are added, or ranges added to the map table, the data is
6409 # inspected, and the type changed.  After the table is mostly or entirely
6410 # filled, compute_type() should be called to finalize they analysis.
6411 #
6412 # There are very few operations defined.  One can safely remove a range from
6413 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
6414 # table to this one, replacing any in the intersection of the two.
6415
6416 sub standardize { return main::standardize($_[0]); }
6417 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
6418
6419 {   # Closure
6420
6421     # This hash will contain as keys, all the aliases of all properties, and
6422     # as values, pointers to their respective property objects.  This allows
6423     # quick look-up of a property from any of its names.
6424     my %alias_to_property_of;
6425
6426     sub dump_alias_to_property_of {
6427         # For debugging
6428
6429         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
6430         return;
6431     }
6432
6433     sub property_ref {
6434         # This is a package subroutine, not called as a method.
6435         # If the single parameter is a literal '*' it returns a list of all
6436         # defined properties.
6437         # Otherwise, the single parameter is a name, and it returns a pointer
6438         # to the corresponding property object, or undef if none.
6439         #
6440         # Properties can have several different names.  The 'standard' form of
6441         # each of them is stored in %alias_to_property_of as they are defined.
6442         # But it's possible that this subroutine will be called with some
6443         # variant, so if the initial lookup fails, it is repeated with the
6444         # standarized form of the input name.  If found, besides returning the
6445         # result, the input name is added to the list so future calls won't
6446         # have to do the conversion again.
6447
6448         my $name = shift;
6449
6450         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6451
6452         if (! defined $name) {
6453             Carp::my_carp_bug("Undefined input property.  No action taken.");
6454             return;
6455         }
6456
6457         return main::uniques(values %alias_to_property_of) if $name eq '*';
6458
6459         # Return cached result if have it.
6460         my $result = $alias_to_property_of{$name};
6461         return $result if defined $result;
6462
6463         # Convert the input to standard form.
6464         my $standard_name = standardize($name);
6465
6466         $result = $alias_to_property_of{$standard_name};
6467         return unless defined $result;        # Don't cache undefs
6468
6469         # Cache the result before returning it.
6470         $alias_to_property_of{$name} = $result;
6471         return $result;
6472     }
6473
6474
6475     main::setup_package();
6476
6477     my %map;
6478     # A pointer to the map table object for this property
6479     main::set_access('map', \%map);
6480
6481     my %full_name;
6482     # The property's full name.  This is a duplicate of the copy kept in the
6483     # map table, but is needed because stringify needs it during
6484     # construction of the map table, and then would have a chicken before egg
6485     # problem.
6486     main::set_access('full_name', \%full_name, 'r');
6487
6488     my %table_ref;
6489     # This hash will contain as keys, all the aliases of any match tables
6490     # attached to this property, and as values, the pointers to their
6491     # respective tables.  This allows quick look-up of a table from any of its
6492     # names.
6493     main::set_access('table_ref', \%table_ref);
6494
6495     my %type;
6496     # The type of the property, $ENUM, $BINARY, etc
6497     main::set_access('type', \%type, 'r');
6498
6499     my %file;
6500     # The filename where the map table will go (if actually written).
6501     # Normally defaulted, but can be overridden.
6502     main::set_access('file', \%file, 'r', 's');
6503
6504     my %directory;
6505     # The directory where the map table will go (if actually written).
6506     # Normally defaulted, but can be overridden.
6507     main::set_access('directory', \%directory, 's');
6508
6509     my %pseudo_map_type;
6510     # This is used to affect the calculation of the map types for all the
6511     # ranges in the table.  It should be set to one of the values that signify
6512     # to alter the calculation.
6513     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
6514
6515     my %has_only_code_point_maps;
6516     # A boolean used to help in computing the type of data in the map table.
6517     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
6518
6519     my %unique_maps;
6520     # A list of the first few distinct mappings this property has.  This is
6521     # used to disambiguate between binary and enum property types, so don't
6522     # have to keep more than three.
6523     main::set_access('unique_maps', \%unique_maps);
6524
6525     sub new {
6526         # The only required parameter is the positionally first, name.  All
6527         # other parameters are key => value pairs.  See the documentation just
6528         # above for the meanings of the ones not passed directly on to the map
6529         # table constructor.
6530
6531         my $class = shift;
6532         my $name = shift || "";
6533
6534         my $self = property_ref($name);
6535         if (defined $self) {
6536             my $options_string = join ", ", @_;
6537             $options_string = ".  Ignoring options $options_string" if $options_string;
6538             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
6539             return $self;
6540         }
6541
6542         my %args = @_;
6543
6544         $self = bless \do { my $anonymous_scalar }, $class;
6545         my $addr = main::objaddr $self;
6546
6547         $directory{$addr} = delete $args{'Directory'};
6548         $file{$addr} = delete $args{'File'};
6549         $full_name{$addr} = delete $args{'Full_Name'} || $name;
6550         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
6551         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
6552         # Rest of parameters passed on.
6553
6554         $has_only_code_point_maps{$addr} = 1;
6555         $table_ref{$addr} = { };
6556         $unique_maps{$addr} = { };
6557
6558         $map{$addr} = Map_Table->new($name,
6559                                     Full_Name => $full_name{$addr},
6560                                     _Alias_Hash => \%alias_to_property_of,
6561                                     _Property => $self,
6562                                     %args);
6563         return $self;
6564     }
6565
6566     # See this program's beginning comment block about overloading the copy
6567     # constructor.  Few operations are defined on properties, but a couple are
6568     # useful.  It is safe to take the inverse of a property, and to remove a
6569     # single code point from it.
6570     use overload
6571         fallback => 0,
6572         qw("") => "_operator_stringify",
6573         "." => \&main::_operator_dot,
6574         '==' => \&main::_operator_equal,
6575         '!=' => \&main::_operator_not_equal,
6576         '=' => sub { return shift },
6577         '-=' => "_minus_and_equal",
6578     ;
6579
6580     sub _operator_stringify {
6581         return "Property '" .  shift->full_name . "'";
6582     }
6583
6584     sub _minus_and_equal {
6585         # Remove a single code point from the map table of a property.
6586
6587         my $self = shift;
6588         my $other = shift;
6589         my $reversed = shift;
6590         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6591
6592         if (ref $other) {
6593             Carp::my_carp_bug("Can't cope with a "
6594                         . ref($other)
6595                         . " argument to '-='.  Subtraction ignored.");
6596             return $self;
6597         }
6598         elsif ($reversed) {   # Shouldnt happen in a -=, but just in case
6599             Carp::my_carp_bug("Can't cope with a "
6600             .  __PACKAGE__
6601             . " being the first parameter in a '-='.  Subtraction ignored.");
6602             return $self;
6603         }
6604         else {
6605             $map{main::objaddr $self}->delete_range($other, $other);
6606         }
6607         return $self;
6608     }
6609
6610     sub add_match_table {
6611         # Add a new match table for this property, with name given by the
6612         # parameter.  It returns a pointer to the table.
6613
6614         my $self = shift;
6615         my $name = shift;
6616         my %args = @_;
6617
6618         my $addr = main::objaddr $self;
6619
6620         my $table = $table_ref{$addr}{$name};
6621         my $standard_name = main::standardize($name);
6622         if (defined $table
6623             || (defined ($table = $table_ref{$addr}{$standard_name})))
6624         {
6625             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
6626             $table_ref{$addr}{$name} = $table;
6627             return $table;
6628         }
6629         else {
6630
6631             # See if this is a perl extension, if not passed in.
6632             my $perl_extension = delete $args{'Perl_Extension'};
6633             $perl_extension
6634                         = $self->perl_extension if ! defined $perl_extension;
6635
6636             $table = Match_Table->new(
6637                                 Name => $name,
6638                                 Perl_Extension => $perl_extension,
6639                                 _Alias_Hash => $table_ref{$addr},
6640                                 _Property => $self,
6641
6642                                 # gets property's status by default
6643                                 Status => $self->status,
6644                                 _Status_Info => $self->status_info,
6645                                 %args,
6646                                 Internal_Only_Warning => 1); # Override any
6647                                                              # input param
6648             return unless defined $table;
6649         }
6650
6651         # Save the names for quick look up
6652         $table_ref{$addr}{$standard_name} = $table;
6653         $table_ref{$addr}{$name} = $table;
6654
6655         # Perhaps we can figure out the type of this property based on the
6656         # fact of adding this match table.  First, string properties don't
6657         # have match tables; second, a binary property can't have 3 match
6658         # tables
6659         if ($type{$addr} == $UNKNOWN) {
6660             $type{$addr} = $NON_STRING;
6661         }
6662         elsif ($type{$addr} == $STRING) {
6663             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
6664             $type{$addr} = $NON_STRING;
6665         }
6666         elsif ($type{$addr} != $ENUM) {
6667             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
6668                 && $type{$addr} == $BINARY)
6669             {
6670                 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
6671                 $type{$addr} = $ENUM;
6672             }
6673         }
6674
6675         return $table;
6676     }
6677
6678     sub table {
6679         # Return a pointer to the match table (with name given by the
6680         # parameter) associated with this property; undef if none.
6681
6682         my $self = shift;
6683         my $name = shift;
6684         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6685
6686         my $addr = main::objaddr $self;
6687
6688         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
6689
6690         # If quick look-up failed, try again using the standard form of the
6691         # input name.  If that succeeds, cache the result before returning so
6692         # won't have to standardize this input name again.
6693         my $standard_name = main::standardize($name);
6694         return unless defined $table_ref{$addr}{$standard_name};
6695
6696         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
6697         return $table_ref{$addr}{$name};
6698     }
6699
6700     sub tables {
6701         # Return a list of pointers to all the match tables attached to this
6702         # property
6703
6704         return main::uniques(values %{$table_ref{main::objaddr shift}});
6705     }
6706
6707     sub directory {
6708         # Returns the directory the map table for this property should be
6709         # output in.  If a specific directory has been specified, that has
6710         # priority;  'undef' is returned if the type isn't defined;
6711         # or $map_directory for everything else.
6712
6713         my $addr = main::objaddr shift;
6714
6715         return $directory{$addr} if defined $directory{$addr};
6716         return undef if $type{$addr} == $UNKNOWN;
6717         return $map_directory;
6718     }
6719
6720     sub swash_name {
6721         # Return the name that is used to both:
6722         #   1)  Name the file that the map table is written to.
6723         #   2)  The name of swash related stuff inside that file.
6724         # The reason for this is that the Perl core historically has used
6725         # certain names that aren't the same as the Unicode property names.
6726         # To continue using these, $file is hard-coded in this file for those,
6727         # but otherwise the standard name is used.  This is different from the
6728         # external_name, so that the rest of the files, like in lib can use
6729         # the standard name always, without regard to historical precedent.
6730
6731         my $self = shift;
6732         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6733
6734         my $addr = main::objaddr $self;
6735
6736         return $file{$addr} if defined $file{$addr};
6737         return $map{$addr}->external_name;
6738     }
6739
6740     sub to_create_match_tables {
6741         # Returns a boolean as to whether or not match tables should be
6742         # created for this property.
6743
6744         my $self = shift;
6745         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6746
6747         # The whole point of this pseudo property is match tables.
6748         return 1 if $self == $perl;
6749
6750         my $addr = main::objaddr $self;
6751
6752         # Don't generate tables of code points that match the property values
6753         # of a string property.  Such a list would most likely have many
6754         # property values, each with just one or very few code points mapping
6755         # to it.
6756         return 0 if $type{$addr} == $STRING;
6757
6758         # Don't generate anything for unimplemented properties.
6759         return 0 if grep { $self->complete_name eq $_ }
6760                                                     @unimplemented_properties;
6761         # Otherwise, do.
6762         return 1;
6763     }
6764
6765     sub property_add_or_replace_non_nulls {
6766         # This adds the mappings in the property $other to $self.  Non-null
6767         # mappings from $other override those in $self.  It essentially merges
6768         # the two properties, with the second having priority except for null
6769         # mappings.
6770
6771         my $self = shift;
6772         my $other = shift;
6773         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6774
6775         if (! $other->isa(__PACKAGE__)) {
6776             Carp::my_carp_bug("$other should be a "
6777                             . __PACKAGE__
6778                             . ".  Not a '"
6779                             . ref($other)
6780                             . "'.  Not added;");
6781             return;
6782         }
6783
6784         return $map{main::objaddr $self}->
6785                 map_add_or_replace_non_nulls($map{main::objaddr $other});
6786     }
6787
6788     sub set_type {
6789         # Set the type of the property.  Mostly this is figured out by the
6790         # data in the table.  But this is used to set it explicitly.  The
6791         # reason it is not a standard accessor is that when setting a binary
6792         # property, we need to make sure that all the true/false aliases are
6793         # present, as they were omitted in early Unicode releases.
6794
6795         my $self = shift;
6796         my $type = shift;
6797         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6798
6799         if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
6800             Carp::my_carp("Unrecognized type '$type'.  Type not set");
6801             return;
6802         }
6803
6804         $type{main::objaddr $self} = $type;
6805         return if $type != $BINARY;
6806
6807         my $yes = $self->table('Y');
6808         $yes = $self->table('Yes') if ! defined $yes;
6809         $yes = $self->add_match_table('Y') if ! defined $yes;
6810         $yes->add_alias('Yes');
6811         $yes->add_alias('T');
6812         $yes->add_alias('True');
6813
6814         my $no = $self->table('N');
6815         $no = $self->table('No') if ! defined $no;
6816         $no = $self->add_match_table('N') if ! defined $no;
6817         $no->add_alias('No');
6818         $no->add_alias('F');
6819         $no->add_alias('False');
6820         return;
6821     }
6822
6823     sub add_map {
6824         # Add a map to the property's map table.  This also keeps
6825         # track of the maps so that the property type can be determined from
6826         # its data.
6827
6828         my $self = shift;
6829         my $start = shift;  # First code point in range
6830         my $end = shift;    # Final code point in range
6831         my $map = shift;    # What the range maps to.
6832         # Rest of parameters passed on.
6833
6834         my $addr = main::objaddr $self;
6835
6836         # If haven't the type of the property, gather information to figure it
6837         # out.
6838         if ($type{$addr} == $UNKNOWN) {
6839
6840             # If the map contains an interior blank or dash, or most other
6841             # nonword characters, it will be a string property.  This
6842             # heuristic may actually miss some string properties.  If so, they
6843             # may need to have explicit set_types called for them.  This
6844             # happens in the Unihan properties.
6845             if ($map =~ / (?<= . ) [ -] (?= . ) /x
6846                 || $map =~ / [^\w.\/\ -]  /x)
6847             {
6848                 $self->set_type($STRING);
6849
6850                 # $unique_maps is used for disambiguating between ENUM and
6851                 # BINARY later; since we know the property is not going to be
6852                 # one of those, no point in keeping the data around
6853                 undef $unique_maps{$addr};
6854             }
6855             else {
6856
6857                 # Not necessarily a string.  The final decision has to be
6858                 # deferred until all the data are in.  We keep track of if all
6859                 # the values are code points for that eventual decision.
6860                 $has_only_code_point_maps{$addr} &=
6861                                             $map =~ / ^ $code_point_re $/x;
6862
6863                 # For the purposes of disambiguating between binary and other
6864                 # enumerations at the end, we keep track of the first three
6865                 # distinct property values.  Once we get to three, we know
6866                 # it's not going to be binary, so no need to track more.
6867                 if (scalar keys %{$unique_maps{$addr}} < 3) {
6868                     $unique_maps{$addr}{main::standardize($map)} = 1;
6869                 }
6870             }
6871         }
6872
6873         # Add the mapping by calling our map table's method
6874         return $map{$addr}->add_map($start, $end, $map, @_);
6875     }
6876
6877     sub compute_type {
6878         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
6879         # should be called after the property is mostly filled with its maps.
6880         # We have been keeping track of what the property values have been,
6881         # and now have the necessary information to figure out the type.
6882
6883         my $self = shift;
6884         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6885
6886         my $addr = main::objaddr($self);
6887
6888         my $type = $type{$addr};
6889
6890         # If already have figured these out, no need to do so again, but we do
6891         # a double check on ENUMS to make sure that a string property hasn't
6892         # improperly been classified as an ENUM, so continue on with those.
6893         return if $type == $STRING || $type == $BINARY;
6894
6895         # If every map is to a code point, is a string property.
6896         if ($type == $UNKNOWN
6897             && ($has_only_code_point_maps{$addr}
6898                 || (defined $map{$addr}->default_map
6899                     && $map{$addr}->default_map eq "")))
6900         {
6901             $self->set_type($STRING);
6902         }
6903         else {
6904
6905             # Otherwise, it is to some sort of enumeration.  (The case where
6906             # it is a Unicode miscellaneous property, and treated like a
6907             # string in this program is handled in add_map()).  Distinguish
6908             # between binary and some other enumeration type.  Of course, if
6909             # there are more than two values, it's not binary.  But more
6910             # subtle is the test that the default mapping is defined means it
6911             # isn't binary.  This in fact may change in the future if Unicode
6912             # changes the way its data is structured.  But so far, no binary
6913             # properties ever have @missing lines for them, so the default map
6914             # isn't defined for them.  The few properties that are two-valued
6915             # and aren't considered binary have the default map defined
6916             # starting in Unicode 5.0, when the @missing lines appeared; and
6917             # this program has special code to put in a default map for them
6918             # for earlier than 5.0 releases.
6919             if ($type == $ENUM
6920                 || scalar keys %{$unique_maps{$addr}} > 2
6921                 || defined $self->default_map)
6922             {
6923                 my $tables = $self->tables;
6924                 my $count = $self->count;
6925                 if ($verbosity && $count > 500 && $tables/$count > .1) {
6926                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
6927                 }
6928                 $self->set_type($ENUM);
6929             }
6930             else {
6931                 $self->set_type($BINARY);
6932             }
6933         }
6934         undef $unique_maps{$addr};  # Garbage collect
6935         return;
6936     }
6937
6938     # Most of the accessors for a property actually apply to its map table.
6939     # Setup up accessor functions for those, referring to %map
6940     for my $sub qw(
6941                     add_alias
6942                     add_anomalous_entry
6943                     add_comment
6944                     add_conflicting
6945                     add_description
6946                     add_duplicate
6947                     add_note
6948                     aliases
6949                     comment
6950                     complete_name
6951                     core_access
6952                     count
6953                     default_map
6954                     delete_range
6955                     description
6956                     each_range
6957                     external_name
6958                     file_path
6959                     format
6960                     initialize
6961                     inverse_list
6962                     is_empty
6963                     name
6964                     note
6965                     perl_extension
6966                     property
6967                     range_count
6968                     ranges
6969                     range_size_1
6970                     reset_each_range
6971                     set_comment
6972                     set_core_access
6973                     set_default_map
6974                     set_file_path
6975                     set_final_comment
6976                     set_range_size_1
6977                     set_status
6978                     set_to_output_map
6979                     short_name
6980                     status
6981                     status_info
6982                     to_output_map
6983                     value_of
6984                     write
6985                 )
6986                     # 'property' above is for symmetry, so that one can take
6987                     # the property of a property and get itself, and so don't
6988                     # have to distinguish between properties and tables in
6989                     # calling code
6990     {
6991         no strict "refs";
6992         *$sub = sub {
6993             use strict "refs";
6994             my $self = shift;
6995             return $map{main::objaddr $self}->$sub(@_);
6996         }
6997     }
6998
6999
7000 } # End closure
7001
7002 package main;
7003
7004 sub join_lines($) {
7005     # Returns lines of the input joined together, so that they can be folded
7006     # properly.
7007     # This causes continuation lines to be joined together into one long line
7008     # for folding.  A continuation line is any line that doesn't begin with a
7009     # space or "\b" (the latter is stripped from the output).  This is so
7010     # lines can be be in a HERE document so as to fit nicely in the terminal
7011     # width, but be joined together in one long line, and then folded with
7012     # indents, '#' prefixes, etc, properly handled.
7013     # A blank separates the joined lines except if there is a break; an extra
7014     # blank is inserted after a period ending a line.
7015
7016     # Intialize the return with the first line.
7017     my ($return, @lines) = split "\n", shift;
7018
7019     # If the first line is null, it was an empty line, add the \n back in
7020     $return = "\n" if $return eq "";
7021
7022     # Now join the remainder of the physical lines.
7023     for my $line (@lines) {
7024
7025         # An empty line means wanted a blank line, so add two \n's to get that
7026         # effect, and go to the next line.
7027         if (length $line == 0) {
7028             $return .= "\n\n";
7029             next;
7030         }
7031
7032         # Look at the last character of what we have so far.
7033         my $previous_char = substr($return, -1, 1);
7034
7035         # And at the next char to be output.
7036         my $next_char = substr($line, 0, 1);
7037
7038         if ($previous_char ne "\n") {
7039
7040             # Here didn't end wth a nl.  If the next char a blank or \b, it
7041             # means that here there is a break anyway.  So add a nl to the
7042             # output.
7043             if ($next_char eq " " || $next_char eq "\b") {
7044                 $previous_char = "\n";
7045                 $return .= $previous_char;
7046             }
7047
7048             # Add an extra space after periods.
7049             $return .= " " if $previous_char eq '.';
7050         }
7051
7052         # Here $previous_char is still the latest character to be output.  If
7053         # it isn't a nl, it means that the next line is to be a continuation
7054         # line, with a blank inserted between them.
7055         $return .= " " if $previous_char ne "\n";
7056
7057         # Get rid of any \b
7058         substr($line, 0, 1) = "" if $next_char eq "\b";
7059
7060         # And append this next line.
7061         $return .= $line;
7062     }
7063
7064     return $return;
7065 }
7066
7067 sub simple_fold($;$$$) {
7068     # Returns a string of the input (string or an array of strings) folded
7069     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7070     # a \n
7071     # This is tailored for the kind of text written by this program,
7072     # especially the pod file, which can have very long names with
7073     # underscores in the middle, or words like AbcDefgHij....  We allow
7074     # breaking in the middle of such constructs if the line won't fit
7075     # otherwise.  The break in such cases will come either just after an
7076     # underscore, or just before one of the Capital letters.
7077
7078     local $to_trace = 0 if main::DEBUG;
7079
7080     my $line = shift;
7081     my $prefix = shift;     # Optional string to prepend to each output
7082                             # line
7083     $prefix = "" unless defined $prefix;
7084
7085     my $hanging_indent = shift; # Optional number of spaces to indent
7086                                 # continuation lines
7087     $hanging_indent = 0 unless $hanging_indent;
7088
7089     my $right_margin = shift;   # Optional number of spaces to narrow the
7090                                 # total width by.
7091     $right_margin = 0 unless defined $right_margin;
7092
7093     # Call carp with the 'nofold' option to avoid it from trying to call us
7094     # recursively
7095     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7096
7097     # The space available doesn't include what's automatically prepended
7098     # to each line, or what's reserved on the right.
7099     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7100     # XXX Instead of using the 'nofold' perhaps better to look up the stack
7101
7102     if (DEBUG && $hanging_indent >= $max) {
7103         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
7104         $hanging_indent = 0;
7105     }
7106
7107     # First, split into the current physical lines.
7108     my @line;
7109     if (ref $line) {        # Better be an array, because not bothering to
7110                             # test
7111         foreach my $line (@{$line}) {
7112             push @line, split /\n/, $line;
7113         }
7114     }
7115     else {
7116         @line = split /\n/, $line;
7117     }
7118
7119     #local $to_trace = 1 if main::DEBUG;
7120     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7121
7122     # Look at each current physical line.
7123     for (my $i = 0; $i < @line; $i++) {
7124         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7125         #local $to_trace = 1 if main::DEBUG;
7126         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7127
7128         # Remove prefix, because will be added back anyway, don't want
7129         # doubled prefix
7130         $line[$i] =~ s/^$prefix//;
7131
7132         # Remove trailing space
7133         $line[$i] =~ s/\s+\Z//;
7134
7135         # If the line is too long, fold it.
7136         if (length $line[$i] > $max) {
7137             my $remainder;
7138
7139             # Here needs to fold.  Save the leading space in the line for
7140             # later.
7141             $line[$i] =~ /^ ( \s* )/x;
7142             my $leading_space = $1;
7143             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7144
7145             # If character at final permissible position is white space,
7146             # fold there, which will delete that white space
7147             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7148                 $remainder = substr($line[$i], $max);
7149                 $line[$i] = substr($line[$i], 0, $max - 1);
7150             }
7151             else {
7152
7153                 # Otherwise fold at an acceptable break char closest to
7154                 # the max length.  Look at just the maximal initial
7155                 # segment of the line
7156                 my $segment = substr($line[$i], 0, $max - 1);
7157                 if ($segment =~
7158                     /^ ( .{$hanging_indent}   # Don't look before the
7159                                               #  indent.
7160                         \ *                   # Don't look in leading
7161                                               #  blanks past the indent
7162                             [^ ] .*           # Find the right-most
7163                         (?:                   #  acceptable break:
7164                             [ \s = ]          # space or equal
7165                             | - (?! [.0-9] )  # or non-unary minus.
7166                         )                     # $1 includes the character
7167                     )/x)
7168                 {
7169                     # Split into the initial part that fits, and remaining
7170                     # part of the input
7171                     $remainder = substr($line[$i], length $1);
7172                     $line[$i] = $1;
7173                     trace $line[$i] if DEBUG && $to_trace;
7174                     trace $remainder if DEBUG && $to_trace;
7175                 }
7176
7177                 # If didn't find a good breaking spot, see if there is a
7178                 # not-so-good breaking spot.  These are just after
7179                 # underscores or where the case changes from lower to
7180                 # upper.  Use \a as a soft hyphen, but give up
7181                 # and don't break the line if there is actually a \a
7182                 # already in the input.  We use an ascii character for the
7183                 # soft-hyphen to avoid any attempt by miniperl to try to
7184                 # access the files that this program is creating.
7185                 elsif ($segment !~ /\a/
7186                        && ($segment =~ s/_/_\a/g
7187                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7188                 {
7189                     # Here were able to find at least one place to insert
7190                     # our substitute soft hyphen.  Find the right-most one
7191                     # and replace it by a real hyphen.
7192                     trace $segment if DEBUG && $to_trace;
7193                     substr($segment,
7194                             rindex($segment, "\a"),
7195                             1) = '-';
7196
7197                     # Then remove the soft hyphen substitutes.
7198                     $segment =~ s/\a//g;
7199                     trace $segment if DEBUG && $to_trace;
7200
7201                     # And split into the initial part that fits, and
7202                     # remainder of the line
7203                     my $pos = rindex($segment, '-');
7204                     $remainder = substr($line[$i], $pos);
7205                     trace $remainder if DEBUG && $to_trace;
7206                     $line[$i] = substr($segment, 0, $pos + 1);
7207                 }
7208             }
7209
7210             # Here we know if we can fold or not.  If we can, $remainder
7211             # is what remains to be processed in the next iteration.
7212             if (defined $remainder) {
7213                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7214
7215                 # Insert the folded remainder of the line as a new element
7216                 # of the array.  (It may still be too long, but we will
7217                 # deal with that next time through the loop.)  Omit any
7218                 # leading space in the remainder.
7219                 $remainder =~ s/^\s+//;
7220                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7221
7222                 # But then indent by whichever is larger of:
7223                 # 1) the leading space on the input line;
7224                 # 2) the hanging indent.
7225                 # This preserves indentation in the original line.
7226                 my $lead = ($leading_space)
7227                             ? length $leading_space
7228                             : $hanging_indent;
7229                 $lead = max($lead, $hanging_indent);
7230                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7231             }
7232         }
7233
7234         # Ready to output the line. Get rid of any trailing space
7235         # And prefix by the required $prefix passed in.
7236         $line[$i] =~ s/\s+$//;
7237         $line[$i] = "$prefix$line[$i]\n";
7238     } # End of looping through all the lines.
7239
7240     return join "", @line;
7241 }
7242
7243 sub property_ref {  # Returns a reference to a property object.
7244     return Property::property_ref(@_);
7245 }
7246
7247 sub force_unlink ($) {
7248     my $filename = shift;
7249     return unless file_exists($filename);
7250     return if CORE::unlink($filename);
7251
7252     # We might need write permission
7253     chmod 0777, $filename;
7254     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
7255     return;
7256 }
7257
7258 sub write ($\@) {
7259     # Given a filename and a reference to an array of lines, write the lines
7260     # to the file
7261     # Filename can be given as an arrayref of directory names
7262
7263     my $file  = shift;
7264     my $lines_ref = shift;
7265     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7266
7267     if (! defined $lines_ref) {
7268         Carp::my_carp("Missing lines to write parameter for $file.  Writing skipped;");
7269         return;
7270     }
7271
7272     # Get into a single string if an array, and get rid of, in Unix terms, any
7273     # leading '.'
7274     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7275     $file = File::Spec->canonpath($file);
7276
7277     # If has directories, make sure that they all exist
7278     (undef, my $directories, undef) = File::Spec->splitpath($file);
7279     File::Path::mkpath($directories) if $directories && ! -d $directories;
7280
7281     push @files_actually_output, $file;
7282
7283     my $text;
7284     if (@$lines_ref) {
7285         $text = join "", @$lines_ref;
7286     }
7287     else {
7288         $text = "";
7289         Carp::my_carp("Output file '$file' is empty; writing it anyway;");
7290     }
7291
7292     force_unlink ($file);
7293
7294     my $OUT;
7295     if (not open $OUT, ">", $file) {
7296         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
7297         return;
7298     }
7299     print "$file written.\n" if $verbosity >= $VERBOSE;
7300
7301     print $OUT $text;
7302     close $OUT;
7303     return;
7304 }
7305
7306
7307 sub Standardize($) {
7308     # This converts the input name string into a standardized equivalent to
7309     # use internally.
7310
7311     my $name = shift;
7312     unless (defined $name) {
7313       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
7314       return;
7315     }
7316
7317     # Remove any leading or trailing white space
7318     $name =~ s/^\s+//g;
7319     $name =~ s/\s+$//g;
7320
7321     # Convert interior white space and hypens into underscores.
7322     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7323
7324     # Capitalize the letter following an underscore, and convert a sequence of
7325     # multiple underscores to a single one
7326     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7327
7328     # And capitalize the first letter, but not for the special cjk ones.
7329     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7330     return $name;
7331 }
7332
7333 sub standardize ($) {
7334     # Returns a lower-cased standardized name, without underscores.  This form
7335     # is chosen so that it can distinguish between any real versus superficial
7336     # Unicode name differences.  It relies on the fact that Unicode doesn't
7337     # have interior underscores, white space, nor dashes in any
7338     # stricter-matched name.  It should not be used on Unicode code point
7339     # names (the Name property), as they mostly, but not always follow these
7340     # rules.
7341
7342     my $name = Standardize(shift);
7343     return if !defined $name;
7344
7345     $name =~ s/ (?<= .) _ (?= . ) //xg;
7346     return lc $name;
7347 }
7348
7349 {   # Closure
7350
7351     my $indent_increment = " " x 2;
7352     my %already_output;
7353
7354     $main::simple_dumper_nesting = 0;
7355
7356     sub simple_dumper {
7357         # Like Simple Data::Dumper. Good enough for our needs. We can't use
7358         # the real thing as we have to run under miniperl.
7359
7360         # It is designed so that on input it is at the beginning of a line,
7361         # and the final thing output in any call is a trailing ",\n".
7362
7363         my $item = shift;
7364         my $indent = shift;
7365         $indent = "" if ! defined $indent;
7366
7367         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7368
7369         # nesting level is localized, so that as the call stack pops, it goes
7370         # back to the prior value.
7371         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
7372         undef %already_output if $main::simple_dumper_nesting == 0;
7373         $main::simple_dumper_nesting++;
7374         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
7375
7376         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7377
7378         # Determine the indent for recursive calls.
7379         my $next_indent = $indent . $indent_increment;
7380
7381         my $output;
7382         if (! ref $item) {
7383
7384             # Dump of scalar: just output it in quotes if not a number.  To do
7385             # so we must escape certain characters, and therefore need to
7386             # operate on a copy to avoid changing the original
7387             my $copy = $item;
7388             $copy = $UNDEF unless defined $copy;
7389
7390             # Quote non-numbers (numbers also have optional leading '-' and
7391             # fractions)
7392             if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
7393
7394                 # Escape apostrophe and backslash
7395                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
7396                 $copy = "'$copy'";
7397             }
7398             $output = "$indent$copy,\n";
7399         }
7400         else {
7401
7402             # Keep track of cycles in the input, and refuse to infinitely loop
7403             if (defined $already_output{main::objaddr $item}) {
7404                 return "${indent}ALREADY OUTPUT: $item\n";
7405             }
7406             $already_output{main::objaddr $item} = $item;
7407
7408             if (ref $item eq 'ARRAY') {
7409                 my $using_brackets;
7410                 $output = $indent;
7411                 if ($main::simple_dumper_nesting > 1) {
7412                     $output .= '[';
7413                     $using_brackets = 1;
7414                 }
7415                 else {
7416                     $using_brackets = 0;
7417                 }
7418
7419                 # If the array is empty, put the closing bracket on the same
7420                 # line.  Otherwise, recursively add each array element
7421                 if (@$item == 0) {
7422                     $output .= " ";
7423                 }
7424                 else {
7425                     $output .= "\n";
7426                     for (my $i = 0; $i < @$item; $i++) {
7427
7428                         # Indent array elements one level
7429                         $output .= &simple_dumper($item->[$i], $next_indent);
7430                         $output =~ s/\n$//;      # Remove trailing nl so as to
7431                         $output .= " # [$i]\n";  # add a comment giving the
7432                                                  # array index
7433                     }
7434                     $output .= $indent;     # Indent closing ']' to orig level
7435                 }
7436                 $output .= ']' if $using_brackets;
7437                 $output .= ",\n";
7438             }
7439             elsif (ref $item eq 'HASH') {
7440                 my $is_first_line;
7441                 my $using_braces;
7442                 my $body_indent;
7443
7444                 # No surrounding braces at top level
7445                 $output .= $indent;
7446                 if ($main::simple_dumper_nesting > 1) {
7447                     $output .= "{\n";
7448                     $is_first_line = 0;
7449                     $body_indent = $next_indent;
7450                     $next_indent .= $indent_increment;
7451                     $using_braces = 1;
7452                 }
7453                 else {
7454                     $is_first_line = 1;
7455                     $body_indent = $indent;
7456                     $using_braces = 0;
7457                 }
7458
7459                 # Output hashes sorted alphabetically instead of apparently
7460                 # random.  Use caseless alphabetic sort
7461                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
7462                 {
7463                     if ($is_first_line) {
7464                         $is_first_line = 0;
7465                     }
7466                     else {
7467                         $output .= "$body_indent";
7468                     }
7469
7470                     # The key must be a scalar, but this recursive call quotes
7471                     # it
7472                     $output .= &simple_dumper($key);
7473
7474                     # And change the trailing comma and nl to the hash fat
7475                     # comma for clarity, and so the value can be on the same
7476                     # line
7477                     $output =~ s/,\n$/ => /;
7478
7479                     # Recursively call to get the value's dump.
7480                     my $next = &simple_dumper($item->{$key}, $next_indent);
7481
7482                     # If the value is all on one line, remove its indent, so
7483                     # will follow the => immediately.  If it takes more than
7484                     # one line, start it on a new line.
7485                     if ($next !~ /\n.*\n/) {
7486                         $next =~ s/^ *//;
7487                     }
7488                     else {
7489                         $output .= "\n";
7490                     }
7491                     $output .= $next;
7492                 }
7493
7494                 $output .= "$indent},\n" if $using_braces;
7495             }
7496             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
7497                 $output = $indent . ref($item) . "\n";
7498                 # XXX see if blessed
7499             }
7500             elsif ($item->can('dump')) {
7501
7502                 # By convention in this program, objects furnish a 'dump'
7503                 # method.  Since not doing any output at this level, just pass
7504                 # on the input indent
7505                 $output = $item->dump($indent);
7506             }
7507             else {
7508                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
7509             }
7510         }
7511         return $output;
7512     }
7513 }
7514
7515 sub dump_inside_out {
7516     # Dump inside-out hashes in an object's state by converting them to a
7517     # regular hash and then calling simple_dumper on that.
7518
7519     my $object = shift;
7520     my $fields_ref = shift;
7521     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7522
7523     my $addr = main::objaddr $object;
7524
7525     my %hash;
7526     foreach my $key (keys %$fields_ref) {
7527         $hash{$key} = $fields_ref->{$key}{$addr};
7528     }
7529
7530     return simple_dumper(\%hash, @_);
7531 }
7532
7533 sub _operator_dot {
7534     # Overloaded '.' method that is common to all packages.  It uses the
7535     # package's stringify method.
7536
7537     my $self = shift;
7538     my $other = shift;
7539     my $reversed = shift;
7540     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7541
7542     $other = "" unless defined $other;
7543
7544     foreach my $which (\$self, \$other) {
7545         next unless ref $$which;
7546         if ($$which->can('_operator_stringify')) {
7547             $$which = $$which->_operator_stringify;
7548         }
7549         else {
7550             my $ref = ref $$which;
7551             my $addr = main::objaddr $$which;
7552             $$which = "$ref ($addr)";
7553         }
7554     }
7555     return ($reversed)
7556             ? "$other$self"
7557             : "$self$other";
7558 }
7559
7560 sub _operator_equal {
7561     # Generic overloaded '==' routine.  To be equal, they must be the exact
7562     # same object
7563
7564     my $self = shift;
7565     my $other = shift;
7566
7567     return 0 unless defined $other;
7568     return 0 unless ref $other;
7569     return main::objaddr $self == main::objaddr $other;
7570 }
7571
7572 sub _operator_not_equal {
7573     my $self = shift;
7574     my $other = shift;
7575
7576     return ! _operator_equal($self, $other);
7577 }
7578
7579 sub process_PropertyAliases($) {
7580     # This reads in the PropertyAliases.txt file, which contains almost all
7581     # the character properties in Unicode and their equivalent aliases:
7582     # scf       ; Simple_Case_Folding         ; sfc
7583     #
7584     # Field 0 is the preferred short name for the property.
7585     # Field 1 is the full name.
7586     # Any succeeding ones are other accepted names.
7587
7588     my $file= shift;
7589     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7590
7591     # This whole file was non-existent in early releases, so use our own
7592     # internal one.
7593     $file->insert_lines(get_old_property_aliases())
7594                                                 if ! -e 'PropertyAliases.txt';
7595
7596     # Add any cjk properties that may have been defined.
7597     $file->insert_lines(@cjk_properties);
7598
7599     while ($file->next_line) {
7600
7601         my @data = split /\s*;\s*/;
7602
7603         my $full = $data[1];
7604
7605         my $this = Property->new($data[0], Full_Name => $full);
7606
7607         # Start looking for more aliases after these two.
7608         for my $i (2 .. @data - 1) {
7609             $this->add_alias($data[$i]);
7610         }
7611
7612     }
7613     return;
7614 }
7615
7616 sub finish_property_setup {
7617     # Finishes setting up after PropertyAliases.
7618
7619     my $file = shift;
7620     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7621
7622     # This entry was missing from this file in earlier Unicode versions
7623     if (-e 'Jamo.txt') {
7624         my $jsn = property_ref('JSN');
7625         if (! defined $jsn) {
7626             $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
7627         }
7628     }
7629
7630     # This entry is still missing as of 5.2, perhaps because no short name for
7631     # it.
7632     if (-e 'NameAliases.txt') {
7633         my $aliases = property_ref('Name_Alias');
7634         if (! defined $aliases) {
7635             $aliases = Property->new('Name_Alias');
7636         }
7637     }
7638
7639     # These are used so much, that we set globals for them.
7640     $gc = property_ref('General_Category');
7641     $block = property_ref('Block');
7642
7643     # Perl adds this alias.
7644     $gc->add_alias('Category');
7645
7646     # For backwards compatibility, these property files have particular names.
7647     my $upper = property_ref('Uppercase_Mapping');
7648     $upper->set_core_access('uc()');
7649     $upper->set_file('Upper'); # This is what utf8.c calls it
7650
7651     my $lower = property_ref('Lowercase_Mapping');
7652     $lower->set_core_access('lc()');
7653     $lower->set_file('Lower');
7654
7655     my $title = property_ref('Titlecase_Mapping');
7656     $title->set_core_access('ucfirst()');
7657     $title->set_file('Title');
7658
7659     my $fold = property_ref('Case_Folding');
7660     $fold->set_file('Fold') if defined $fold;
7661
7662     # utf8.c can't currently cope with non range-size-1 for these, and even if
7663     # it were changed to do so, someone else may be using them, expecting the
7664     # old style
7665     foreach my $property (qw {
7666                                 Case_Folding
7667                                 Lowercase_Mapping
7668                                 Titlecase_Mapping
7669                                 Uppercase_Mapping
7670                             })
7671     {
7672         property_ref($property)->set_range_size_1(1);
7673     }
7674
7675     # These two properties aren't actually used in the core, but unfortunately
7676     # the names just above that are in the core interfere with these, so
7677     # choose different names.  These aren't a problem unless the map tables
7678     # for these files get written out.
7679     my $lowercase = property_ref('Lowercase');
7680     $lowercase->set_file('IsLower') if defined $lowercase;
7681     my $uppercase = property_ref('Uppercase');
7682     $uppercase->set_file('IsUpper') if defined $uppercase;
7683
7684     # Set up the hard-coded default mappings, but only on properties defined
7685     # for this release
7686     foreach my $property (keys %default_mapping) {
7687         my $property_object = property_ref($property);
7688         next if ! defined $property_object;
7689         my $default_map = $default_mapping{$property};
7690         $property_object->set_default_map($default_map);
7691
7692         # A map of <code point> implies the property is string.
7693         if ($property_object->type == $UNKNOWN
7694             && $default_map eq $CODE_POINT)
7695         {
7696             $property_object->set_type($STRING);
7697         }
7698     }
7699
7700     # The following use the Multi_Default class to create objects for
7701     # defaults.
7702
7703     # Bidi class has a complicated default, but the derived file takes care of
7704     # the complications, leaving just 'L'.
7705     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
7706         property_ref('Bidi_Class')->set_default_map('L');
7707     }
7708     else {
7709         my $default;
7710
7711         # The derived file was introduced in 3.1.1.  The values below are
7712         # taken from table 3-8, TUS 3.0
7713         my $default_R =
7714             'my $default = Range_List->new;
7715              $default->add_range(0x0590, 0x05FF);
7716              $default->add_range(0xFB1D, 0xFB4F);'
7717         ;
7718
7719         # The defaults apply only to unassigned characters
7720         $default_R .= '$gc->table("Cn") & $default;';
7721
7722         if ($v_version lt v3.0.0) {
7723             $default = Multi_Default->new(R => $default_R, 'L');
7724         }
7725         else {
7726
7727             # AL apparently not introduced until 3.0:  TUS 2.x references are
7728             # not on-line to check it out
7729             my $default_AL =
7730                 'my $default = Range_List->new;
7731                  $default->add_range(0x0600, 0x07BF);
7732                  $default->add_range(0xFB50, 0xFDFF);
7733                  $default->add_range(0xFE70, 0xFEFF);'
7734             ;
7735
7736             # Non-character code points introduced in this release; aren't AL
7737             if ($v_version ge 3.1.0) {
7738                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
7739             }
7740             $default_AL .= '$gc->table("Cn") & $default';
7741             $default = Multi_Default->new(AL => $default_AL,
7742                                           R => $default_R,
7743                                           'L');
7744         }
7745         property_ref('Bidi_Class')->set_default_map($default);
7746     }
7747
7748     # Joining type has a complicated default, but the derived file takes care
7749     # of the complications, leaving just 'U' (or Non_Joining), except the file
7750     # is bad in 3.1.0
7751     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
7752         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
7753             property_ref('Joining_Type')->set_default_map('Non_Joining');
7754         }
7755         else {
7756
7757             # Otherwise, there are not one, but two possibilities for the
7758             # missing defaults: T and U.
7759             # The missing defaults that evaluate to T are given by:
7760             # T = Mn + Cf - ZWNJ - ZWJ
7761             # where Mn and Cf are the general category values. In other words,
7762             # any non-spacing mark or any format control character, except
7763             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
7764             # WIDTH JOINER (joining type C).
7765             my $default = Multi_Default->new(
7766                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
7767                'Non_Joining');
7768             property_ref('Joining_Type')->set_default_map($default);
7769         }
7770     }
7771
7772     # Line break has a complicated default in early releases. It is 'Unknown'
7773     # for non-assigned code points; 'AL' for assigned.
7774     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
7775         my $lb = property_ref('Line_Break');
7776         if ($v_version gt 3.2.0) {
7777             $lb->set_default_map('Unknown');
7778         }
7779         else {
7780             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
7781                                               'AL');
7782             $lb->set_default_map($default);
7783         }
7784
7785         # If has the URS property, make sure that the standard aliases are in
7786         # it, since not in the input tables in some versions.
7787         my $urs = property_ref('Unicode_Radical_Stroke');
7788         if (defined $urs) {
7789             $urs->add_alias('cjkRSUnicode');
7790             $urs->add_alias('kRSUnicode');
7791         }
7792     }
7793     return;
7794 }
7795
7796 sub get_old_property_aliases() {
7797     # Returns what would be in PropertyAliases.txt if it existed in very old
7798     # versions of Unicode.  It was derived from the one in 3.2, and pared
7799     # down based on the data that was actually in the older releases.
7800     # An attempt was made to use the existence of files to mean inclusion or
7801     # not of various aliases, but if this was not sufficient, using version
7802     # numbers was resorted to.
7803
7804     my @return;
7805
7806     # These are to be used in all versions (though some are constructed by
7807     # this program if missing)
7808     push @return, split /\n/, <<'END';
7809 bc        ; Bidi_Class
7810 Bidi_M    ; Bidi_Mirrored
7811 cf        ; Case_Folding
7812 ccc       ; Canonical_Combining_Class
7813 dm        ; Decomposition_Mapping
7814 dt        ; Decomposition_Type
7815 gc        ; General_Category
7816 isc       ; ISO_Comment
7817 lc        ; Lowercase_Mapping
7818 na        ; Name
7819 na1       ; Unicode_1_Name
7820 nt        ; Numeric_Type
7821 nv        ; Numeric_Value
7822 sfc       ; Simple_Case_Folding
7823 slc       ; Simple_Lowercase_Mapping
7824 stc       ; Simple_Titlecase_Mapping
7825 suc       ; Simple_Uppercase_Mapping
7826 tc        ; Titlecase_Mapping
7827 uc        ; Uppercase_Mapping
7828 END
7829
7830     if (-e 'Blocks.txt') {
7831         push @return, "blk       ; Block\n";
7832     }
7833     if (-e 'ArabicShaping.txt') {
7834         push @return, split /\n/, <<'END';
7835 jg        ; Joining_Group
7836 jt        ; Joining_Type
7837 END
7838     }
7839     if (-e 'PropList.txt') {
7840
7841         # This first set is in the original old-style proplist.
7842         push @return, split /\n/, <<'END';
7843 Alpha     ; Alphabetic
7844 Bidi_C    ; Bidi_Control
7845 Dash      ; Dash
7846 Dia       ; Diacritic
7847 Ext       ; Extender
7848 Hex       ; Hex_Digit
7849 Hyphen    ; Hyphen
7850 IDC       ; ID_Continue
7851 Ideo      ; Ideographic
7852 Join_C    ; Join_Control
7853 Math      ; Math
7854 QMark     ; Quotation_Mark
7855 Term      ; Terminal_Punctuation
7856 WSpace    ; White_Space
7857 END
7858         # The next sets were added later
7859         if ($v_version ge v3.0.0) {
7860             push @return, split /\n/, <<'END';
7861 Upper     ; Uppercase
7862 Lower     ; Lowercase
7863 END
7864         }
7865         if ($v_version ge v3.0.1) {
7866             push @return, split /\n/, <<'END';
7867 NChar     ; Noncharacter_Code_Point
7868 END
7869         }
7870         # The next sets were added in the new-style
7871         if ($v_version ge v3.1.0) {
7872             push @return, split /\n/, <<'END';
7873 OAlpha    ; Other_Alphabetic
7874 OLower    ; Other_Lowercase
7875 OMath     ; Other_Math
7876 OUpper    ; Other_Uppercase
7877 END
7878         }
7879         if ($v_version ge v3.1.1) {
7880             push @return, "AHex      ; ASCII_Hex_Digit\n";
7881         }
7882     }
7883     if (-e 'EastAsianWidth.txt') {
7884         push @return, "ea        ; East_Asian_Width\n";
7885     }
7886     if (-e 'CompositionExclusions.txt') {
7887         push @return, "CE        ; Composition_Exclusion\n";
7888     }
7889     if (-e 'LineBreak.txt') {
7890         push @return, "lb        ; Line_Break\n";
7891     }
7892     if (-e 'BidiMirroring.txt') {
7893         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
7894     }
7895     if (-e 'Scripts.txt') {
7896         push @return, "sc        ; Script\n";
7897     }
7898     if (-e 'DNormalizationProps.txt') {
7899         push @return, split /\n/, <<'END';
7900 Comp_Ex   ; Full_Composition_Exclusion
7901 FC_NFKC   ; FC_NFKC_Closure
7902 NFC_QC    ; NFC_Quick_Check
7903 NFD_QC    ; NFD_Quick_Check
7904 NFKC_QC   ; NFKC_Quick_Check
7905 NFKD_QC   ; NFKD_Quick_Check
7906 XO_NFC    ; Expands_On_NFC
7907 XO_NFD    ; Expands_On_NFD
7908 XO_NFKC   ; Expands_On_NFKC
7909 XO_NFKD   ; Expands_On_NFKD
7910 END
7911     }
7912     if (-e 'DCoreProperties.txt') {
7913         push @return, split /\n/, <<'END';
7914 IDS       ; ID_Start
7915 XIDC      ; XID_Continue
7916 XIDS      ; XID_Start
7917 END
7918         # These can also appear in some versions of PropList.txt
7919         push @return, "Lower     ; Lowercase\n"
7920                                     unless grep { $_ =~ /^Lower\b/} @return;
7921         push @return, "Upper     ; Uppercase\n"
7922                                     unless grep { $_ =~ /^Upper\b/} @return;
7923     }
7924
7925     # This flag requires the DAge.txt file to be copied into the directory.
7926     if (DEBUG && $compare_versions) {
7927         push @return, 'age       ; Age';
7928     }
7929
7930     return @return;
7931 }
7932
7933 sub process_PropValueAliases {
7934     # This file contains values that properties look like:
7935     # bc ; AL        ; Arabic_Letter
7936     # blk; n/a       ; Greek_And_Coptic                 ; Greek
7937     #
7938     # Field 0 is the property.
7939     # Field 1 is the short name of a property value or 'n/a' if no
7940     #                short name exists;
7941     # Field 2 is the full property value name;
7942     # Any other fields are more synonyms for the property value.
7943     # Purely numeric property values are omitted from the file; as are some
7944     # others, fewer and fewer in later releases
7945
7946     # Entries for the ccc property have an extra field before the
7947     # abbreviation:
7948     # ccc;   0; NR   ; Not_Reordered
7949     # It is the numeric value that the names are synonyms for.
7950
7951     # There are comment entries for values missing from this file:
7952     # # @missing: 0000..10FFFF; ISO_Comment; <none>
7953     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
7954
7955     my $file= shift;
7956     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7957
7958     # This whole file was non-existent in early releases, so use our own
7959     # internal one if necessary.
7960     if (! -e 'PropValueAliases.txt') {
7961         $file->insert_lines(get_old_property_value_aliases());
7962     }
7963
7964     # Add any explicit cjk values
7965     $file->insert_lines(@cjk_property_values);
7966
7967     # This line is used only for testing the code that checks for name
7968     # conflicts.  There is a script Inherited, and when this line is executed
7969     # it causes there to be a name conflict with the 'Inherited' that this
7970     # program generates for this block property value
7971     #$file->insert_lines('blk; n/a; Herited');
7972
7973
7974     # Process each line of the file ...
7975     while ($file->next_line) {
7976
7977         my ($property, @data) = split /\s*;\s*/;
7978
7979         # The full name for the ccc property value is in field 2 of the
7980         # remaining ones; field 1 for all other properties.  Swap ccc fields 1
7981         # and 2.  (Rightmost splice removes field 2, returning it; left splice
7982         # inserts that into field 1, thus shifting former field 1 to field 2.)
7983         splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
7984
7985         # If there is no short name, use the full one in element 1
7986         $data[0] = $data[1] if $data[0] eq "n/a";
7987
7988         # Earlier releases had the pseudo property 'qc' that should expand to
7989         # the ones that replace it below.
7990         if ($property eq 'qc') {
7991             if (lc $data[0] eq 'y') {
7992                 $file->insert_lines('NFC_QC; Y      ; Yes',
7993                                     'NFD_QC; Y      ; Yes',
7994                                     'NFKC_QC; Y     ; Yes',
7995                                     'NFKD_QC; Y     ; Yes',
7996                                     );
7997             }
7998             elsif (lc $data[0] eq 'n') {
7999                 $file->insert_lines('NFC_QC; N      ; No',
8000                                     'NFD_QC; N      ; No',
8001                                     'NFKC_QC; N     ; No',
8002                                     'NFKD_QC; N     ; No',
8003                                     );
8004             }
8005             elsif (lc $data[0] eq 'm') {
8006                 $file->insert_lines('NFC_QC; M      ; Maybe',
8007                                     'NFKC_QC; M     ; Maybe',
8008                                     );
8009             }
8010             else {
8011                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8012             }
8013             next;
8014         }
8015
8016         # The first field is the short name, 2nd is the full one.
8017         my $property_object = property_ref($property);
8018         my $table = $property_object->add_match_table($data[0],
8019                                                 Full_Name => $data[1]);
8020
8021         # Start looking for more aliases after these two.
8022         for my $i (2 .. @data - 1) {
8023             $table->add_alias($data[$i]);
8024         }
8025     } # End of looping through the file
8026
8027     # As noted in the comments early in the program, it generates tables for
8028     # the default values for all releases, even those for which the concept
8029     # didn't exist at the time.  Here we add those if missing.
8030     my $age = property_ref('age');
8031     if (defined $age && ! defined $age->table('Unassigned')) {
8032         $age->add_match_table('Unassigned');
8033     }
8034     $block->add_match_table('No_Block') if -e 'Blocks.txt'
8035                                     && ! defined $block->table('No_Block');
8036
8037
8038     # Now set the default mappings of the properties from the file.  This is
8039     # done after the loop because a number of properties have only @missings
8040     # entries in the file, and may not show up until the end.
8041     my @defaults = $file->get_missings;
8042     foreach my $default_ref (@defaults) {
8043         my $default = $default_ref->[0];
8044         my $property = property_ref($default_ref->[1]);
8045         $property->set_default_map($default);
8046     }
8047     return;
8048 }
8049
8050 sub get_old_property_value_aliases () {
8051     # Returns what would be in PropValueAliases.txt if it existed in very old
8052     # versions of Unicode.  It was derived from the one in 3.2, and pared
8053     # down.  An attempt was made to use the existence of files to mean
8054     # inclusion or not of various aliases, but if this was not sufficient,
8055     # using version numbers was resorted to.
8056
8057     my @return = split /\n/, <<'END';
8058 bc ; AN        ; Arabic_Number
8059 bc ; B         ; Paragraph_Separator
8060 bc ; CS        ; Common_Separator
8061 bc ; EN        ; European_Number
8062 bc ; ES        ; European_Separator
8063 bc ; ET        ; European_Terminator
8064 bc ; L         ; Left_To_Right
8065 bc ; ON        ; Other_Neutral
8066 bc ; R         ; Right_To_Left
8067 bc ; WS        ; White_Space
8068
8069 # The standard combining classes are very much different in v1, so only use
8070 # ones that look right (not checked thoroughly)
8071 ccc;   0; NR   ; Not_Reordered
8072 ccc;   1; OV   ; Overlay
8073 ccc;   7; NK   ; Nukta
8074 ccc;   8; KV   ; Kana_Voicing
8075 ccc;   9; VR   ; Virama
8076 ccc; 202; ATBL ; Attached_Below_Left
8077 ccc; 216; ATAR ; Attached_Above_Right
8078 ccc; 218; BL   ; Below_Left
8079 ccc; 220; B    ; Below
8080 ccc; 222; BR   ; Below_Right
8081 ccc; 224; L    ; Left
8082 ccc; 228; AL   ; Above_Left
8083 ccc; 230; A    ; Above
8084 ccc; 232; AR   ; Above_Right
8085 ccc; 234; DA   ; Double_Above
8086
8087 dt ; can       ; canonical
8088 dt ; enc       ; circle
8089 dt ; fin       ; final
8090 dt ; font      ; font
8091 dt ; fra       ; fraction
8092 dt ; init      ; initial
8093 dt ; iso       ; isolated
8094 dt ; med       ; medial
8095 dt ; n/a       ; none
8096 dt ; nb        ; noBreak
8097 dt ; sqr       ; square
8098 dt ; sub       ; sub
8099 dt ; sup       ; super
8100
8101 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
8102 gc ; Cc        ; Control
8103 gc ; Cn        ; Unassigned
8104 gc ; Co        ; Private_Use
8105 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
8106 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
8107 gc ; Ll        ; Lowercase_Letter
8108 gc ; Lm        ; Modifier_Letter
8109 gc ; Lo        ; Other_Letter
8110 gc ; Lu        ; Uppercase_Letter
8111 gc ; M         ; Mark                             # Mc | Me | Mn
8112 gc ; Mc        ; Spacing_Mark
8113 gc ; Mn        ; Nonspacing_Mark
8114 gc ; N         ; Number                           # Nd | Nl | No
8115 gc ; Nd        ; Decimal_Number
8116 gc ; No        ; Other_Number
8117 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
8118 gc ; Pd        ; Dash_Punctuation
8119 gc ; Pe        ; Close_Punctuation
8120 gc ; Po        ; Other_Punctuation
8121 gc ; Ps        ; Open_Punctuation
8122 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
8123 gc ; Sc        ; Currency_Symbol
8124 gc ; Sm        ; Math_Symbol
8125 gc ; So        ; Other_Symbol
8126 gc ; Z         ; Separator                        # Zl | Zp | Zs
8127 gc ; Zl        ; Line_Separator
8128 gc ; Zp        ; Paragraph_Separator
8129 gc ; Zs        ; Space_Separator
8130
8131 nt ; de        ; Decimal
8132 nt ; di        ; Digit
8133 nt ; n/a       ; None
8134 nt ; nu        ; Numeric
8135 END
8136
8137     if (-e 'ArabicShaping.txt') {
8138         push @return, split /\n/, <<'END';
8139 jg ; n/a       ; AIN
8140 jg ; n/a       ; ALEF
8141 jg ; n/a       ; DAL
8142 jg ; n/a       ; GAF
8143 jg ; n/a       ; LAM
8144 jg ; n/a       ; MEEM
8145 jg ; n/a       ; NO_JOINING_GROUP
8146 jg ; n/a       ; NOON
8147 jg ; n/a       ; QAF
8148 jg ; n/a       ; SAD
8149 jg ; n/a       ; SEEN
8150 jg ; n/a       ; TAH
8151 jg ; n/a       ; WAW
8152
8153 jt ; C         ; Join_Causing
8154 jt ; D         ; Dual_Joining
8155 jt ; L         ; Left_Joining
8156 jt ; R         ; Right_Joining
8157 jt ; U         ; Non_Joining
8158 jt ; T         ; Transparent
8159 END
8160         if ($v_version ge v3.0.0) {
8161             push @return, split /\n/, <<'END';
8162 jg ; n/a       ; ALAPH
8163 jg ; n/a       ; BEH
8164 jg ; n/a       ; BETH
8165 jg ; n/a       ; DALATH_RISH
8166 jg ; n/a       ; E
8167 jg ; n/a       ; FEH
8168 jg ; n/a       ; FINAL_SEMKATH
8169 jg ; n/a       ; GAMAL
8170 jg ; n/a       ; HAH
8171 jg ; n/a       ; HAMZA_ON_HEH_GOAL
8172 jg ; n/a       ; HE
8173 jg ; n/a       ; HEH
8174 jg ; n/a       ; HEH_GOAL
8175 jg ; n/a       ; HETH
8176 jg ; n/a       ; KAF
8177 jg ; n/a       ; KAPH
8178 jg ; n/a       ; KNOTTED_HEH
8179 jg ; n/a       ; LAMADH
8180 jg ; n/a       ; MIM
8181 jg ; n/a       ; NUN
8182 jg ; n/a       ; PE
8183 jg ; n/a       ; QAPH
8184 jg ; n/a       ; REH
8185 jg ; n/a       ; REVERSED_PE
8186 jg ; n/a       ; SADHE
8187 jg ; n/a       ; SEMKATH
8188 jg ; n/a       ; SHIN
8189 jg ; n/a       ; SWASH_KAF
8190 jg ; n/a       ; TAW
8191 jg ; n/a       ; TEH_MARBUTA
8192 jg ; n/a       ; TETH
8193 jg ; n/a       ; YEH
8194 jg ; n/a       ; YEH_BARREE
8195 jg ; n/a       ; YEH_WITH_TAIL
8196 jg ; n/a       ; YUDH
8197 jg ; n/a       ; YUDH_HE
8198 jg ; n/a       ; ZAIN
8199 END
8200         }
8201     }
8202
8203
8204     if (-e 'EastAsianWidth.txt') {
8205         push @return, split /\n/, <<'END';
8206 ea ; A         ; Ambiguous
8207 ea ; F         ; Fullwidth
8208 ea ; H         ; Halfwidth
8209 ea ; N         ; Neutral
8210 ea ; Na        ; Narrow
8211 ea ; W         ; Wide
8212 END
8213     }
8214
8215     if (-e 'LineBreak.txt') {
8216         push @return, split /\n/, <<'END';
8217 lb ; AI        ; Ambiguous
8218 lb ; AL        ; Alphabetic
8219 lb ; B2        ; Break_Both
8220 lb ; BA        ; Break_After
8221 lb ; BB        ; Break_Before
8222 lb ; BK        ; Mandatory_Break
8223 lb ; CB        ; Contingent_Break
8224 lb ; CL        ; Close_Punctuation
8225 lb ; CM        ; Combining_Mark
8226 lb ; CR        ; Carriage_Return
8227 lb ; EX        ; Exclamation
8228 lb ; GL        ; Glue
8229 lb ; HY        ; Hyphen
8230 lb ; ID        ; Ideographic
8231 lb ; IN        ; Inseperable
8232 lb ; IS        ; Infix_Numeric
8233 lb ; LF        ; Line_Feed
8234 lb ; NS        ; Nonstarter
8235 lb ; NU        ; Numeric
8236 lb ; OP        ; Open_Punctuation
8237 lb ; PO        ; Postfix_Numeric
8238 lb ; PR        ; Prefix_Numeric
8239 lb ; QU        ; Quotation
8240 lb ; SA        ; Complex_Context
8241 lb ; SG        ; Surrogate
8242 lb ; SP        ; Space
8243 lb ; SY        ; Break_Symbols
8244 lb ; XX        ; Unknown
8245 lb ; ZW        ; ZWSpace
8246 END
8247     }
8248
8249     if (-e 'DNormalizationProps.txt') {
8250         push @return, split /\n/, <<'END';
8251 qc ; M         ; Maybe
8252 qc ; N         ; No
8253 qc ; Y         ; Yes
8254 END
8255     }
8256
8257     if (-e 'Scripts.txt') {
8258         push @return, split /\n/, <<'END';
8259 sc ; Arab      ; Arabic
8260 sc ; Armn      ; Armenian
8261 sc ; Beng      ; Bengali
8262 sc ; Bopo      ; Bopomofo
8263 sc ; Cans      ; Canadian_Aboriginal
8264 sc ; Cher      ; Cherokee
8265 sc ; Cyrl      ; Cyrillic
8266 sc ; Deva      ; Devanagari
8267 sc ; Dsrt      ; Deseret
8268 sc ; Ethi      ; Ethiopic
8269 sc ; Geor      ; Georgian
8270 sc ; Goth      ; Gothic
8271 sc ; Grek      ; Greek
8272 sc ; Gujr      ; Gujarati
8273 sc ; Guru      ; Gurmukhi
8274 sc ; Hang      ; Hangul
8275 sc ; Hani      ; Han
8276 sc ; Hebr      ; Hebrew
8277 sc ; Hira      ; Hiragana
8278 sc ; Ital      ; Old_Italic
8279 sc ; Kana      ; Katakana
8280 sc ; Khmr      ; Khmer
8281 sc ; Knda      ; Kannada
8282 sc ; Laoo      ; Lao
8283 sc ; Latn      ; Latin
8284 sc ; Mlym      ; Malayalam
8285 sc ; Mong      ; Mongolian
8286 sc ; Mymr      ; Myanmar
8287 sc ; Ogam      ; Ogham
8288 sc ; Orya      ; Oriya
8289 sc ; Qaai      ; Inherited
8290 sc ; Runr      ; Runic
8291 sc ; Sinh      ; Sinhala
8292 sc ; Syrc      ; Syriac
8293 sc ; Taml      ; Tamil
8294 sc ; Telu      ; Telugu
8295 sc ; Thaa      ; Thaana
8296 sc ; Thai      ; Thai
8297 sc ; Tibt      ; Tibetan
8298 sc ; Yiii      ; Yi
8299 sc ; Zyyy      ; Common
8300 END
8301     }
8302
8303     if ($v_version ge v2.0.0) {
8304         push @return, split /\n/, <<'END';
8305 dt ; com       ; compat
8306 dt ; nar       ; narrow
8307 dt ; sml       ; small
8308 dt ; vert      ; vertical
8309 dt ; wide      ; wide
8310
8311 gc ; Cf        ; Format
8312 gc ; Cs        ; Surrogate
8313 gc ; Lt        ; Titlecase_Letter
8314 gc ; Me        ; Enclosing_Mark
8315 gc ; Nl        ; Letter_Number
8316 gc ; Pc        ; Connector_Punctuation
8317 gc ; Sk        ; Modifier_Symbol
8318 END
8319     }
8320     if ($v_version ge v2.1.2) {
8321         push @return, "bc ; S         ; Segment_Separator\n";
8322     }
8323     if ($v_version ge v2.1.5) {
8324         push @return, split /\n/, <<'END';
8325 gc ; Pf        ; Final_Punctuation
8326 gc ; Pi        ; Initial_Punctuation
8327 END
8328     }
8329     if ($v_version ge v2.1.8) {
8330         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
8331     }
8332
8333     if ($v_version ge v3.0.0) {
8334         push @return, split /\n/, <<'END';
8335 bc ; AL        ; Arabic_Letter
8336 bc ; BN        ; Boundary_Neutral
8337 bc ; LRE       ; Left_To_Right_Embedding
8338 bc ; LRO       ; Left_To_Right_Override
8339 bc ; NSM       ; Nonspacing_Mark
8340 bc ; PDF       ; Pop_Directional_Format
8341 bc ; RLE       ; Right_To_Left_Embedding
8342 bc ; RLO       ; Right_To_Left_Override
8343
8344 ccc; 233; DB   ; Double_Below
8345 END
8346     }
8347
8348     if ($v_version ge v3.1.0) {
8349         push @return, "ccc; 226; R    ; Right\n";
8350     }
8351
8352     return @return;
8353 }
8354
8355 { # Closure
8356     # This is used to store the range list of all the code points usable when
8357     # the little used $compare_versions feature is enabled.
8358     my $compare_versions_range_list;
8359
8360     sub process_generic_property_file {
8361         # This processes a file containing property mappings and puts them
8362         # into internal map tables.  It should be used to handle any property
8363         # files that have mappings from a code point or range thereof to
8364         # something else.  This means almost all the UCD .txt files.
8365         # each_line_handlers() should be set to adjust the lines of these
8366         # files, if necessary, to what this routine understands:
8367         #
8368         # 0374          ; NFD_QC; N
8369         # 003C..003E    ; Math
8370         #
8371         # the fields are: "codepoint range ; property; map"
8372         #
8373         # meaning the codepoints in the range all have the value 'map' under
8374         # 'property'.
8375         # Beginning and trailing white space in each field are not signficant.
8376         # Note there is not a trailing semi-colon in the above.  A trailing
8377         # semi-colon means the map is a null-string.  An omitted map, as
8378         # opposed to a null-string, is assumed to be 'Y', based on Unicode
8379         # table syntax.  (This could have been hidden from this routine by
8380         # doing it in the $file object, but that would require parsing of the
8381         # line there, so would have to parse it twice, or change the interface
8382         # to pass this an array.  So not done.)
8383         #
8384         # The map field may begin with a sequence of commands that apply to
8385         # this range.  Each such command begins and ends with $CMD_DELIM.
8386         # These are used to indicate, for example, that the mapping for a
8387         # range has a non-default type.
8388         #
8389         # This loops through the file, calling it's next_line() method, and
8390         # then taking the map and adding it to the property's table.
8391         # Complications arise because any number of properties can be in the
8392         # file, in any order, interspersed in any way.  The first time a
8393         # property is seen, it gets information about that property and
8394         # caches it for quick retrieval later.  It also normalizes the maps
8395         # so that only one of many synonym is stored.  The Unicode input files
8396         # do use some multiple synonyms.
8397
8398         my $file = shift;
8399         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8400
8401         my %property_info;               # To keep track of what properties
8402                                          # have already had entries in the
8403                                          # current file, and info about each,
8404                                          # so don't have to recompute.
8405         my $property_name;               # property currently being worked on
8406         my $property_type;               # and its type
8407         my $previous_property_name = ""; # name from last time through loop
8408         my $property_object;             # pointer to the current property's
8409                                          # object
8410         my $property_addr;               # the address of that object
8411         my $default_map;                 # the string that code points missing
8412                                          # from the file map to
8413         my $default_table;               # For non-string properties, a
8414                                          # reference to the match table that
8415                                          # will contain the list of code
8416                                          # points that map to $default_map.
8417
8418         # Get the next real non-comment line
8419         LINE:
8420         while ($file->next_line) {
8421
8422             # Default replacement type; means that if parts of the range have
8423             # already been stored in our tables, the new map overrides them if
8424             # they differ more than cosmetically
8425             my $replace = $IF_NOT_EQUIVALENT;
8426             my $map_type;            # Default type for the map of this range
8427
8428             #local $to_trace = 1 if main::DEBUG;
8429             trace $_ if main::DEBUG && $to_trace;
8430
8431             # Split the line into components
8432             my ($range, $property_name, $map, @remainder)
8433                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
8434
8435             # If more or less on the line than we are expecting, warn and skip
8436             # the line
8437             if (@remainder) {
8438                 $file->carp_bad_line('Extra fields');
8439                 next LINE;
8440             }
8441             elsif ( ! defined $property_name) {
8442                 $file->carp_bad_line('Missing property');
8443                 next LINE;
8444             }
8445
8446             # Examine the range.
8447             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
8448             {
8449                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
8450                 next LINE;
8451             }
8452             my $low = hex $1;
8453             my $high = (defined $2) ? hex $2 : $low;
8454
8455             # For the very specialized case of comparing two Unicode
8456             # versions...
8457             if (DEBUG && $compare_versions) {
8458                 if ($property_name eq 'Age') {
8459
8460                     # Only allow code points at least as old as the version
8461                     # specified.
8462                     my $age = pack "C*", split(/\./, $map);        # v string
8463                     next LINE if $age gt $compare_versions;
8464                 }
8465                 else {
8466
8467                     # Again, we throw out code points younger than those of
8468                     # the specified version.  By now, the Age property is
8469                     # populated.  We use the intersection of each input range
8470                     # with this property to find what code points in it are
8471                     # valid.   To do the intersection, we have to convert the
8472                     # Age property map to a Range_list.  We only have to do
8473                     # this once.
8474                     if (! defined $compare_versions_range_list) {
8475                         my $age = property_ref('Age');
8476                         if (! -e 'DAge.txt') {
8477                             croak "Need to have 'DAge.txt' file to do version comparison";
8478                         }
8479                         elsif ($age->count == 0) {
8480                             croak "The 'Age' table is empty, but its file exists";
8481                         }
8482                         $compare_versions_range_list
8483                                         = Range_List->new(Initialize => $age);
8484                     }
8485
8486                     # An undefined map is always 'Y'
8487                     $map = 'Y' if ! defined $map;
8488
8489                     # Calculate the intersection of the input range with the
8490                     # code points that are known in the specified version
8491                     my @ranges = ($compare_versions_range_list
8492                                   & Range->new($low, $high))->ranges;
8493
8494                     # If the intersection is empty, throw away this range
8495                     next LINE unless @ranges;
8496
8497                     # Only examine the first range this time through the loop.
8498                     my $this_range = shift @ranges;
8499
8500                     # Put any remaining ranges in the queue to be processed
8501                     # later.  Note that there is unnecessary work here, as we
8502                     # will do the intersection again for each of these ranges
8503                     # during some future iteration of the LINE loop, but this
8504                     # code is not used in production.  The later intersections
8505                     # are guaranteed to not splinter, so this will not become
8506                     # an infinite loop.
8507                     my $line = join ';', $property_name, $map;
8508                     foreach my $range (@ranges) {
8509                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
8510                                                             $range->start,
8511                                                             $range->end,
8512                                                             $line));
8513                     }
8514
8515                     # And process the first range, like any other.
8516                     $low = $this_range->start;
8517                     $high = $this_range->end;
8518                 }
8519             } # End of $compare_versions
8520
8521             # If changing to a new property, get the things constant per
8522             # property
8523             if ($previous_property_name ne $property_name) {
8524
8525                 $property_object = property_ref($property_name);
8526                 if (! defined $property_object) {
8527                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
8528                     next LINE;
8529                 }
8530                 $property_addr = main::objaddr($property_object);
8531
8532                 # Defer changing names until have a line that is acceptable
8533                 # (the 'next' statement above means is unacceptable)
8534                 $previous_property_name = $property_name;
8535
8536                 # If not the first time for this property, retrieve info about
8537                 # it from the cache
8538                 if (defined ($property_info{$property_addr}{'type'})) {
8539                     $property_type = $property_info{$property_addr}{'type'};
8540                     $default_map = $property_info{$property_addr}{'default'};
8541                     $map_type
8542                         = $property_info{$property_addr}{'pseudo_map_type'};
8543                     $default_table
8544                             = $property_info{$property_addr}{'default_table'};
8545                 }
8546                 else {
8547
8548                     # Here, is the first time for this property.  Set up the
8549                     # cache.
8550                     $property_type = $property_info{$property_addr}{'type'}
8551                                    = $property_object->type;
8552                     $map_type
8553                         = $property_info{$property_addr}{'pseudo_map_type'}
8554                         = $property_object->pseudo_map_type;
8555
8556                     # The Unicode files are set up so that if the map is not
8557                     # defined, it is a binary property
8558                     if (! defined $map && $property_type != $BINARY) {
8559                         if ($property_type != $UNKNOWN
8560                             && $property_type != $NON_STRING)
8561                         {
8562                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
8563                         }
8564                         else {
8565                             $property_object->set_type($BINARY);
8566                             $property_type
8567                                 = $property_info{$property_addr}{'type'}
8568                                 = $BINARY;
8569                         }
8570                     }
8571
8572                     # Get any @missings default for this property.  This
8573                     # should precede the first entry for the property in the
8574                     # input file, and is located in a comment that has been
8575                     # stored by the Input_file class until we access it here.
8576                     # It's possible that there is more than one such line
8577                     # waiting for us; collect them all, and parse
8578                     my @missings_list = $file->get_missings
8579                                             if $file->has_missings_defaults;
8580                     foreach my $default_ref (@missings_list) {
8581                         my $default = $default_ref->[0];
8582                         my $addr = objaddr property_ref($default_ref->[1]);
8583
8584                         # For string properties, the default is just what the
8585                         # file says, but non-string properties should already
8586                         # have set up a table for the default property value;
8587                         # use the table for these, so can resolve synonyms
8588                         # later to a single standard one.
8589                         if ($property_type == $STRING
8590                             || $property_type == $UNKNOWN)
8591                         {
8592                             $property_info{$addr}{'missings'} = $default;
8593                         }
8594                         else {
8595                             $property_info{$addr}{'missings'}
8596                                         = $property_object->table($default);
8597                         }
8598                     }
8599
8600                     # Finished storing all the @missings defaults in the input
8601                     # file so far.  Get the one for the current property.
8602                     my $missings = $property_info{$property_addr}{'missings'};
8603
8604                     # But we likely have separately stored what the default
8605                     # should be.  (This is to accommodate versions of the
8606                     # standard where the @missings lines are absent or
8607                     # incomplete.)  Hopefully the two will match.  But check
8608                     # it out.
8609                     $default_map = $property_object->default_map;
8610
8611                     # If the map is a ref, it means that the default won't be
8612                     # processed until later, so undef it, so next few lines
8613                     # will redefine it to something that nothing will match
8614                     undef $default_map if ref $default_map;
8615
8616                     # Create a $default_map if don't have one; maybe a dummy
8617                     # that won't match anything.
8618                     if (! defined $default_map) {
8619
8620                         # Use any @missings line in the file.
8621                         if (defined $missings) {
8622                             if (ref $missings) {
8623                                 $default_map = $missings->full_name;
8624                                 $default_table = $missings;
8625                             }
8626                             else {
8627                                 $default_map = $missings;
8628                             }
8629
8630                             # And store it with the property for outside use.
8631                             $property_object->set_default_map($default_map);
8632                         }
8633                         else {
8634
8635                             # Neither an @missings nor a default map.  Create
8636                             # a dummy one, so won't have to test definedness
8637                             # in the main loop.
8638                             $default_map = '_Perl This will never be in a file
8639                                             from Unicode';
8640                         }
8641                     }
8642
8643                     # Here, we have $default_map defined, possibly in terms of
8644                     # $missings, but maybe not, and possibly is a dummy one.
8645                     if (defined $missings) {
8646
8647                         # Make sure there is no conflict between the two.
8648                         # $missings has priority.
8649                         if (ref $missings) {
8650                             $default_table
8651                                         = $property_object->table($default_map);
8652                             if (! defined $default_table
8653                                 || $default_table != $missings)
8654                             {
8655                                 if (! defined $default_table) {
8656                                     $default_table = $UNDEF;
8657                                 }
8658                                 $file->carp_bad_line(<<END
8659 The \@missings line for $property_name in $file says that missings default to
8660 $missings, but we expect it to be $default_table.  $missings used.
8661 END
8662                                 );
8663                                 $default_table = $missings;
8664                                 $default_map = $missings->full_name;
8665                             }
8666                             $property_info{$property_addr}{'default_table'}
8667                                                         = $default_table;
8668                         }
8669                         elsif ($default_map ne $missings) {
8670                             $file->carp_bad_line(<<END
8671 The \@missings line for $property_name in $file says that missings default to
8672 $missings, but we expect it to be $default_map.  $missings used.
8673 END
8674                             );
8675                             $default_map = $missings;
8676                         }
8677                     }
8678
8679                     $property_info{$property_addr}{'default'}
8680                                                     = $default_map;
8681
8682                     # If haven't done so already, find the table corresponding
8683                     # to this map for non-string properties.
8684                     if (! defined $default_table
8685                         && $property_type != $STRING
8686                         && $property_type != $UNKNOWN)
8687                     {
8688                         $default_table = $property_info{$property_addr}
8689                                                         {'default_table'}
8690                                     = $property_object->table($default_map);
8691                     }
8692                 } # End of is first time for this property
8693             } # End of switching properties.
8694
8695             # Ready to process the line.
8696             # The Unicode files are set up so that if the map is not defined,
8697             # it is a binary property with value 'Y'
8698             if (! defined $map) {
8699                 $map = 'Y';
8700             }
8701             else {
8702
8703                 # If the map begins with a special command to us (enclosed in
8704                 # delimiters), extract the command(s).
8705                 if (substr($map, 0, 1) eq $CMD_DELIM) {
8706                     while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
8707                         my $command = $1;
8708                         if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
8709                             $replace = $1;
8710                         }
8711                         elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
8712                             $map_type = $1;
8713                         }
8714                         else {
8715                            $file->carp_bad_line("Unknown command line: '$1'");
8716                            next LINE;
8717                         }
8718                     }
8719                 }
8720             }
8721
8722             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
8723             {
8724
8725                 # Here, we have a map to a particular code point, and the
8726                 # default map is to a code point itself.  If the range
8727                 # includes the particular code point, change that portion of
8728                 # the range to the default.  This makes sure that in the final
8729                 # table only the non-defaults are listed.
8730                 my $decimal_map = hex $map;
8731                 if ($low <= $decimal_map && $decimal_map <= $high) {
8732
8733                     # If the range includes stuff before or after the map
8734                     # we're changing, split it and process the split-off parts
8735                     # later.
8736                     if ($low < $decimal_map) {
8737                         $file->insert_adjusted_lines(
8738                                             sprintf("%04X..%04X; %s; %s",
8739                                                     $low,
8740                                                     $decimal_map - 1,
8741                                                     $property_name,
8742                                                     $map));
8743                     }
8744                     if ($high > $decimal_map) {
8745                         $file->insert_adjusted_lines(
8746                                             sprintf("%04X..%04X; %s; %s",
8747                                                     $decimal_map + 1,
8748                                                     $high,
8749                                                     $property_name,
8750                                                     $map));
8751                     }
8752                     $low = $high = $decimal_map;
8753                     $map = $CODE_POINT;
8754                 }
8755             }
8756
8757             # If we can tell that this is a synonym for the default map, use
8758             # the default one instead.
8759             if ($property_type != $STRING
8760                 && $property_type != $UNKNOWN)
8761             {
8762                 my $table = $property_object->table($map);
8763                 if (defined $table && $table == $default_table) {
8764                     $map = $default_map;
8765                 }
8766             }
8767
8768             # And figure out the map type if not known.
8769             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
8770                 if ($map eq "") {   # Nulls are always $NULL map type
8771                     $map_type = $NULL;
8772                 } # Otherwise, non-strings, and those that don't allow
8773                   # $MULTI_CP, and those that aren't multiple code points are
8774                   # 0
8775                 elsif
8776                    (($property_type != $STRING && $property_type != $UNKNOWN)
8777                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
8778                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
8779                 {
8780                     $map_type = 0;
8781                 }
8782                 else {
8783                     $map_type = $MULTI_CP;
8784                 }
8785             }
8786
8787             $property_object->add_map($low, $high,
8788                                         $map,
8789                                         Type => $map_type,
8790                                         Replace => $replace);
8791         } # End of loop through file's lines
8792
8793         return;
8794     }
8795 }
8796
8797 # XXX Unused until revise charnames;
8798 #sub check_and_handle_compound_name {
8799 #    This looks at Name properties for parenthesized components and splits
8800 #    them off.  Thus it finds FF as an equivalent to Form Feed.
8801 #    my $code_point = shift;
8802 #    my $name = shift;
8803 #    if ($name =~ /^ ( .*? ) ( \s* ) \( ( [^)]* ) \) (.*) $/x) {
8804 #        #local $to_trace = 1 if main::DEBUG;
8805 #        trace $1, $2, $3, $4 if main::DEBUG && $to_trace;
8806 #        push @more_Names, "$code_point; $1";
8807 #        push @more_Names, "$code_point; $3";
8808 #        Carp::my_carp_bug("Expecting blank space before left parenthesis in '$_'.  Proceeding and assuming it was there;") if $2 ne " ";
8809 #        Carp::my_carp_bug("Not expecting anything after the right parenthesis in '$_'.  Proceeding and ignoring that;") if $4 ne "";
8810 #    }
8811 #    return;
8812 #}
8813
8814 { # Closure for UnicodeData.txt handling
8815
8816     # This file was the first one in the UCD; its design leads to some
8817     # awkwardness in processing.  Here is a sample line:
8818     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
8819     # The fields in order are:
8820     my $i = 0;            # The code point is in field 0, and is shifted off.
8821     my $NAME = $i++;      # character name (e.g. "LATIN CAPITAL LETTER A")
8822     my $CATEGORY = $i++;  # category (e.g. "Lu")
8823     my $CCC = $i++;       # Canonical combining class (e.g. "230")
8824     my $BIDI = $i++;      # directional class (e.g. "L")
8825     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
8826     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
8827     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
8828                                          # Dual-use in this program; see below
8829     my $NUMERIC = $i++;   # numeric value
8830     my $MIRRORED = $i++;  # ? mirrored
8831     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
8832     my $COMMENT = $i++;   # iso comment
8833     my $UPPER = $i++;     # simple uppercase mapping
8834     my $LOWER = $i++;     # simple lowercase mapping
8835     my $TITLE = $i++;     # simple titlecase mapping
8836     my $input_field_count = $i;
8837
8838     # This routine in addition outputs these extra fields:
8839     my $DECOMP_TYPE = $i++; # Decomposition type
8840     my $DECOMP_MAP = $i++;  # Must be last; another decomposition mapping
8841     my $last_field = $i - 1;
8842
8843     # All these are read into an array for each line, with the indices defined
8844     # above.  The empty fields in the example line above indicate that the
8845     # value is defaulted.  The handler called for each line of the input
8846     # changes these to their defaults.
8847
8848     # Here are the official names of the properties, in a parallel array:
8849     my @field_names;
8850     $field_names[$BIDI] = 'Bidi_Class';
8851     $field_names[$CATEGORY] = 'General_Category';
8852     $field_names[$CCC] = 'Canonical_Combining_Class';
8853     $field_names[$COMMENT] = 'ISO_Comment';
8854     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
8855     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
8856     $field_names[$LOWER] = 'Simple_Lowercase_Mapping';
8857     $field_names[$MIRRORED] = 'Bidi_Mirrored';
8858     $field_names[$NAME] = 'Name';
8859     $field_names[$NUMERIC] = 'Numeric_Value';
8860     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
8861     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
8862     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
8863     $field_names[$TITLE] = 'Simple_Titlecase_Mapping';
8864     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
8865     $field_names[$UPPER] = 'Simple_Uppercase_Mapping';
8866
8867     # Some of these need a little more explanation.  The $PERL_DECIMAL_DIGIT
8868     # field does not lead to an official Unicode property, but is used in
8869     # calculating the Numeric_Type.  Perl however, creates a file from this
8870     # field, so a Perl property is created from it.  Similarly, the Other
8871     # Digit field is used only for calculating the Numeric_Type, and so it can
8872     # be safely re-used as the place to store the value for Numeric_Type;
8873     # hence it is referred to as $NUMERIC_TYPE_OTHER_DIGIT.  The input field
8874     # named $PERL_DECOMPOSITION is a combination of both the decomposition
8875     # mapping and its type.  Perl creates a file containing exactly this
8876     # field, so it is used for that.  The two properties are separated into
8877     # two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
8878
8879     # This file is processed like most in this program.  Control is passed to
8880     # process_generic_property_file() which calls filter_UnicodeData_line()
8881     # for each input line.  This filter converts the input into line(s) that
8882     # process_generic_property_file() understands.  There is also a setup
8883     # routine called before any of the file is processed, and a handler for
8884     # EOF processing, all in this closure.
8885
8886     # A huge speed-up occurred at the cost of some added complexity when these
8887     # routines were altered to buffer the outputs into ranges.  Almost all the
8888     # lines of the input file apply to just one code point, and for most
8889     # properties, the map for the next code point up is the same as the
8890     # current one.  So instead of creating a line for each property for each
8891     # input line, filter_UnicodeData_line() remembers what the previous map
8892     # of a property was, and doesn't generate a line to pass on until it has
8893     # to, as when the map changes; and that passed-on line encompasses the
8894     # whole contiguous range of code points that have the same map for that
8895     # property.  This means a slight amount of extra setup, and having to
8896     # flush these buffers on EOF, testing if the maps have changed, plus
8897     # remembering state information in the closure.  But it means a lot less
8898     # real time in not having to change the data base for each property on
8899     # each line.
8900
8901     # Another complication is that there are already a few ranges designated
8902     # in the input.  There are two lines for each, with the same maps except
8903     # the code point and name on each line.  This was actually the hardest
8904     # thing to design around.  The code points in those ranges may actually
8905     # have real maps not given by these two lines.  These maps will either
8906     # be algorthimically determinable, or in the extracted files furnished
8907     # with the UCD.  In the event of conflicts between these extracted files,
8908     # and this one, Unicode says that this one prevails.  But it shouldn't
8909     # prevail for conflicts that occur in these ranges.  The data from the
8910     # extracted files prevails in those cases.  So, this program is structured
8911     # so that those files are processed first, storing maps.  Then the other
8912     # files are processed, generally overwriting what the extracted files
8913     # stored.  But just the range lines in this input file are processed
8914     # without overwriting.  This is accomplished by adding a special string to
8915     # the lines output to tell process_generic_property_file() to turn off the
8916     # overwriting for just this one line.
8917     # A similar mechanism is used to tell it that the map is of a non-default
8918     # type.
8919
8920     sub setup_UnicodeData { # Called before any lines of the input are read
8921         my $file = shift;
8922         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8923
8924         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
8925                                         Directory => File::Spec->curdir(),
8926                                         File => 'Decomposition',
8927                                         Format => $STRING_FORMAT,
8928                                         Internal_Only_Warning => 1,
8929                                         Perl_Extension => 1,
8930                                         Default_Map => $CODE_POINT,
8931
8932                                         # This is a specially formatted table
8933                                         # explicitly for normalize.pm, which
8934                                         # is expecting a particular format,
8935                                         # which means that mappings containing
8936                                         # multiple code points are in the main
8937                                         # body of the table
8938                                         Map_Type => $COMPUTE_NO_MULTI_CP,
8939                                         Type => $STRING,
8940                                         );
8941         $Perl_decomp->add_comment(join_lines(<<END
8942 This mapping is a combination of the Unicode 'Decomposition_Type' and
8943 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
8944 identical to the official Unicode 'Decomposition_Mapping'  property except for
8945 two things:
8946  1) It omits the algorithmically determinable Hangul syllable decompositions,
8947 which normalize.pm handles algorithmically.
8948  2) It contains the decomposition type as well.  Non-canonical decompositions
8949 begin with a word in angle brackets, like <super>, which denotes the
8950 compatible decomposition type.  If the map does not begin with the <angle
8951 brackets>, the decomposition is canonical.
8952 END
8953         ));
8954
8955         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
8956                                         Default_Map => "",
8957                                         Perl_Extension => 1,
8958                                         File => 'Digit',    # Trad. location
8959                                         Directory => $map_directory,
8960                                         Type => $STRING,
8961                                         Range_Size_1 => 1,
8962                                         );
8963         $Decimal_Digit->add_comment(join_lines(<<END
8964 This file gives the mapping of all code points which represent a single
8965 decimal digit [0-9] to their respective digits.  For example, the code point
8966 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
8967 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
8968 numerals.
8969 END
8970         ));
8971
8972         # This property is not used for generating anything else, and is
8973         # usually not output.  By making it last in the list, we can just
8974         # change the high end of the loop downwards to avoid the work of
8975         # generating a table that is just going to get thrown away.
8976         if (! property_ref('Decomposition_Mapping')->to_output_map) {
8977             $last_field--;
8978         }
8979         return;
8980     }
8981
8982     my $first_time = 1;                 # ? Is this the first line of the file
8983     my $in_range = 0;                   # ? Are we in one of the file's ranges
8984     my $previous_cp;                    # hex code point of previous line
8985     my $decimal_previous_cp = -1;       # And its decimal equivalent
8986     my @start;                          # For each field, the current starting
8987                                         # code point in hex for the range
8988                                         # being accumulated.
8989     my @fields;                         # The input fields;
8990     my @previous_fields;                # And those from the previous call
8991
8992     sub filter_UnicodeData_line {
8993         # Handle a single input line from UnicodeData.txt; see comments above
8994         # Conceptually this takes a single line from the file containing N
8995         # properties, and converts it into N lines with one property per line,
8996         # which is what the final handler expects.  But there are
8997         # complications due to the quirkiness of the input file, and to save
8998         # time, it accumulates ranges where the property values don't change
8999         # and only emits lines when necessary.  This is about an order of
9000         # magnitude fewer lines emitted.
9001
9002         my $file = shift;
9003         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9004
9005         # $_ contains the input line.
9006         # -1 in split means retain trailing null fields
9007         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9008
9009         #local $to_trace = 1 if main::DEBUG;
9010         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9011         if (@fields > $input_field_count) {
9012             $file->carp_bad_line('Extra fields');
9013             $_ = "";
9014             return;
9015         }
9016
9017         my $decimal_cp = hex $cp;
9018
9019         # We have to output all the buffered ranges when the next code point
9020         # is not exactly one after the previous one, which means there is a
9021         # gap in the ranges.
9022         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9023
9024         # The decomposition mapping field requires special handling.  It looks
9025         # like either:
9026         #
9027         # <compat> 0032 0020
9028         # 0041 0300
9029         #
9030         # The decomposition type is enclosed in <brackets>; if missing, it
9031         # means the type is canonical.  There are two decomposition mapping
9032         # tables: the one for use by Perl's normalize.pm has a special format
9033         # which is this field intact; the other, for general use is of
9034         # standard format.  In either case we have to find the decomposition
9035         # type.  Empty fields have None as their type, and map to the code
9036         # point itself
9037         if ($fields[$PERL_DECOMPOSITION] eq "") {
9038             $fields[$DECOMP_TYPE] = 'None';
9039             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9040         }
9041         else {
9042             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9043                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
9044             if (! defined $fields[$DECOMP_TYPE]) {
9045                 $fields[$DECOMP_TYPE] = 'Canonical';
9046                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9047             }
9048             else {
9049                 $fields[$DECOMP_MAP] = $map;
9050             }
9051         }
9052
9053         # The 3 numeric fields also require special handling.  The 2 digit
9054         # fields must be either empty or match the number field.  This means
9055         # that if it is empty, they must be as well, and the numeric type is
9056         # None, and the numeric value is 'Nan'.
9057         # The decimal digit field must be empty or match the other digit
9058         # field.  If the decimal digit field is non-empty, the code point is
9059         # a decimal digit, and the other two fields will have the same value.
9060         # If it is empty, but the other digit field is non-empty, the code
9061         # point is an 'other digit', and the number field will have the same
9062         # value as the other digit field.  If the other digit field is empty,
9063         # but the number field is non-empty, the code point is a generic
9064         # numeric type.
9065         if ($fields[$NUMERIC] eq "") {
9066             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9067                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9068             ) {
9069                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
9070             }
9071             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9072             $fields[$NUMERIC] = 'NaN';
9073         }
9074         else {
9075             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9076             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9077                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9078                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9079             }
9080             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9081                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9082                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9083             }
9084             else {
9085                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9086
9087                 # Rationals require extra effort.
9088                 register_fraction($fields[$NUMERIC])
9089                                                 if $fields[$NUMERIC] =~ qr{/};
9090             }
9091         }
9092
9093         # For the properties that have empty fields in the file, and which
9094         # mean something different from empty, change them to that default.
9095         # Certain fields just haven't been empty so far in any Unicode
9096         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9097         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
9098         # the defaults; which are verly unlikely to ever change.
9099         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9100         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9101
9102         # UAX44 says that if title is empty, it is the same as whatever upper
9103         # is,
9104         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9105
9106         # There are a few pairs of lines like:
9107         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9108         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9109         # that define ranges.  These should be processed after the fields are
9110         # adjusted above, as they may override some of them; but mostly what
9111         # is left is to possibly adjust the $NAME field.  The names of all the
9112         # paired lines start with a '<', but this is also true of '<control>,
9113         # which isn't one of these special ones.
9114         if ($fields[$NAME] eq '<control>') {
9115
9116             # Some code points in this file have the pseudo-name
9117             # '<control>', but the official name for such ones is the null
9118             # string.
9119             $fields[$NAME] = "";
9120
9121             # We had better not be in between range lines.
9122             if ($in_range) {
9123                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'.  Trying anyway");
9124                 $in_range = 0;
9125             }
9126         }
9127         elsif (substr($fields[$NAME], 0, 1) ne '<') {
9128
9129             # Here is a non-range line.  We had better not be in between range
9130             # lines.
9131             if ($in_range) {
9132                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'.  Trying anyway");
9133                 $in_range = 0;
9134             }
9135             # XXX until charnames catches up.
9136 #            if ($fields[$NAME] =~ s/- $cp $//x) {
9137 #
9138 #                # These are code points whose names end in their code points,
9139 #                # which means the names are algorithmically derivable from the
9140 #                # code points.  To shorten the output Name file, the algorithm
9141 #                # for deriving these is placed in the file instead of each
9142 #                # code point, so they have map type $CP_IN_NAME
9143 #                $fields[$NAME] = $CMD_DELIM
9144 #                                 . $MAP_TYPE_CMD
9145 #                                 . '='
9146 #                                 . $CP_IN_NAME
9147 #                                 . $CMD_DELIM
9148 #                                 . $fields[$NAME];
9149 #            }
9150
9151             # Some official names are really two alternate names with one in
9152             # parentheses.  What we do here is use the full official one for
9153             # the standard property (stored just above), but for the charnames
9154             # table, we add two more entries, one for each of the alternate
9155             # ones.
9156             # elsif name ne ""
9157             #check_and_handle_compound_name($cp, $fields[$NAME]);
9158             #check_and_handle_compound_name($cp, $unicode_1_name);
9159             # XXX until charnames catches up.
9160         }
9161         elsif ($fields[$NAME] =~ /^<(.+), First>$/) {
9162             $fields[$NAME] = $1;
9163
9164             # Here we are at the beginning of a range pair.
9165             if ($in_range) {
9166                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$NAME]'.  Trying anyway");
9167             }
9168             $in_range = 1;
9169
9170             # Because the properties in the range do not overwrite any already
9171             # in the db, we must flush the buffers of what's already there, so
9172             # they get handled in the normal scheme.
9173             $force_output = 1;
9174
9175         }
9176         elsif ($fields[$NAME] !~ s/^<(.+), Last>$/$1/) {
9177             $file->carp_bad_line("Unexpected name starting with '<' $fields[$NAME].  Ignoring this line.");
9178             $_ = "";
9179             return;
9180         }
9181         else { # Here, we are at the last line of a range pair.
9182
9183             if (! $in_range) {
9184                 $file->carp_bad_line("Unexpected end of range $fields[$NAME] when not in one.  Ignoring this line.");
9185                 $_ = "";
9186                 return;
9187             }
9188             $in_range = 0;
9189
9190             # Check that the input is valid: that the closing of the range is
9191             # the same as the beginning.
9192             foreach my $i (0 .. $last_field) {
9193                 next if $fields[$i] eq $previous_fields[$i];
9194                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
9195             }
9196
9197             # The processing differs depending on the type of range,
9198             # determined by its $NAME
9199             if ($fields[$NAME] =~ /^Hangul Syllable/) {
9200
9201                 # Check that the data looks right.
9202                 if ($decimal_previous_cp != $SBase) {
9203                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
9204                 }
9205                 if ($decimal_cp != $SBase + $SCount - 1) {
9206                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
9207                 }
9208
9209                 # The Hangul syllable range has a somewhat complicated name
9210                 # generation algorithm.  Each code point in it has a canonical
9211                 # decomposition also computable by an algorithm.  The
9212                 # perl decomposition map table built from these is used only
9213                 # by normalize.pm, which has the algorithm built in it, so the
9214                 # decomposition maps are not needed, and are large, so are
9215                 # omitted from it.  If the full decomposition map table is to
9216                 # be output, the decompositions are generated for it, in the
9217                 # EOF handling code for this input file.
9218
9219                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9220
9221                 # This range is stored in our internal structure with its
9222                 # own map type, different from all others.
9223                 $previous_fields[$NAME] = $CMD_DELIM
9224                                           . $MAP_TYPE_CMD
9225                                           . '='
9226                                           . $HANGUL_SYLLABLE
9227                                           . $CMD_DELIM
9228                                           . $fields[$NAME];
9229             }
9230             elsif ($fields[$NAME] =~ /^CJK/) {
9231
9232                 # The name for these contains the code point itself, and all
9233                 # are defined to have the same base name, regardless of what
9234                 # is in the file.  They are stored in our internal structure
9235                 # with a map type of $CP_IN_NAME
9236                 $previous_fields[$NAME] = $CMD_DELIM
9237                                            . $MAP_TYPE_CMD
9238                                            . '='
9239                                            . $CP_IN_NAME
9240                                            . $CMD_DELIM
9241                                            . 'CJK UNIFIED IDEOGRAPH';
9242
9243             }
9244             elsif ($fields[$CATEGORY] eq 'Co'
9245                      || $fields[$CATEGORY] eq 'Cs')
9246             {
9247                 # The names of all the code points in these ranges are set to
9248                 # null, as there are no names for the private use and
9249                 # surrogate code points.
9250
9251                 $previous_fields[$NAME] = "";
9252             }
9253             else {
9254                 $file->carp_bad_line("Unexpected code point range $fields[$NAME] because category is $fields[$CATEGORY].  Attempting to process it.");
9255             }
9256
9257             # The first line of the range caused everything else to be output,
9258             # and then its values were stored as the beginning values for the
9259             # next set of ranges, which this one ends.  Now, for each value,
9260             # add a command to tell the handler that these values should not
9261             # replace any existing ones in our database.
9262             foreach my $i (0 .. $last_field) {
9263                 $previous_fields[$i] = $CMD_DELIM
9264                                         . $REPLACE_CMD
9265                                         . '='
9266                                         . $NO
9267                                         . $CMD_DELIM
9268                                         . $previous_fields[$i];
9269             }
9270
9271             # And change things so it looks like the entire range has been
9272             # gone through with this being the final part of it.  Adding the
9273             # command above to each field will cause this range to be flushed
9274             # during the next iteration, as it guaranteed that the stored
9275             # field won't match whatever value the next one has.
9276             $previous_cp = $cp;
9277             $decimal_previous_cp = $decimal_cp;
9278
9279             # We are now set up for the next iteration; so skip the remaining
9280             # code in this subroutine that does the same thing, but doesn't
9281             # know about these ranges.
9282             $_ = "";
9283             return;
9284         }
9285
9286         # On the very first line, we fake it so the code below thinks there is
9287         # nothing to output, and initialize so that when it does get output it
9288         # uses the first line's values for the lowest part of the range.
9289         # (One could avoid this by using peek(), but then one would need to
9290         # know the adjustments done above and do the same ones in the setup
9291         # routine; not worth it)
9292         if ($first_time) {
9293             $first_time = 0;
9294             @previous_fields = @fields;
9295             @start = ($cp) x scalar @fields;
9296             $decimal_previous_cp = $decimal_cp - 1;
9297         }
9298
9299         # For each field, output the stored up ranges that this code point
9300         # doesn't fit in.  Earlier we figured out if all ranges should be
9301         # terminated because of changing the replace or map type styles, or if
9302         # there is a gap between this new code point and the previous one, and
9303         # that is stored in $force_output.  But even if those aren't true, we
9304         # need to output the range if this new code point's value for the
9305         # given property doesn't match the stored range's.
9306         #local $to_trace = 1 if main::DEBUG;
9307         foreach my $i (0 .. $last_field) {
9308             my $field = $fields[$i];
9309             if ($force_output || $field ne $previous_fields[$i]) {
9310
9311                 # Flush the buffer of stored values.
9312                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9313
9314                 # Start a new range with this code point and its value
9315                 $start[$i] = $cp;
9316                 $previous_fields[$i] = $field;
9317             }
9318         }
9319
9320         # Set the values for the next time.
9321         $previous_cp = $cp;
9322         $decimal_previous_cp = $decimal_cp;
9323
9324         # The input line has generated whatever adjusted lines are needed, and
9325         # should not be looked at further.
9326         $_ = "";
9327         return;
9328     }
9329
9330     sub EOF_UnicodeData {
9331         # Called upon EOF to flush the buffers, and create the Hangul
9332         # decomposition mappings if needed.
9333
9334         my $file = shift;
9335         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9336
9337         # Flush the buffers.
9338         foreach my $i (1 .. $last_field) {
9339             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9340         }
9341
9342         if (-e 'Jamo.txt') {
9343
9344             # The algorithm is published by Unicode, based on values in
9345             # Jamo.txt, (which should have been processed before this
9346             # subroutine), and the results left in %Jamo
9347             unless (%Jamo) {
9348                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
9349                 return;
9350             }
9351
9352             # If the full decomposition map table is being output, insert
9353             # into it the Hangul syllable mappings.  This is to avoid having
9354             # to publish a subroutine in it to compute them.  (which would
9355             # essentially be this code.)  This uses the algorithm published by
9356             # Unicode.
9357             if (property_ref('Decomposition_Mapping')->to_output_map) {
9358                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
9359                     use integer;
9360                     my $SIndex = $S - $SBase;
9361                     my $L = $LBase + $SIndex / $NCount;
9362                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
9363                     my $T = $TBase + $SIndex % $TCount;
9364
9365                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
9366                     my $decomposition = sprintf("%04X %04X", $L, $V);
9367                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
9368                     $file->insert_adjusted_lines(
9369                                 sprintf("%04X; Decomposition_Mapping; %s",
9370                                         $S,
9371                                         $decomposition));
9372                 }
9373             }
9374         }
9375
9376         return;
9377     }
9378
9379     sub filter_v1_ucd {
9380         # Fix UCD lines in version 1.  This is probably overkill, but this
9381         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
9382         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
9383         #       removed.  This program retains them
9384         # 2)    didn't include ranges, which it should have, and which are now
9385         #       added in @corrected_lines below.  It was hand populated by
9386         #       taking the data from Version 2, verified by analyzing
9387         #       DAge.txt.
9388         # 3)    There is a syntax error in the entry for U+09F8 which could
9389         #       cause problems for utf8_heavy, and so is changed.  It's
9390         #       numeric value was simply a minus sign, without any number.
9391         #       (Eventually Unicode changed the code point to non-numeric.)
9392         # 4)    The decomposition types often don't match later versions
9393         #       exactly, and the whole syntax of that field is different; so
9394         #       the syntax is changed as well as the types to their later
9395         #       terminology.  Otherwise normalize.pm would be very unhappy
9396         # 5)    Many ccc classes are different.  These are left intact.
9397         # 6)    U+FF10 - U+FF19 are missing their numeric values in all three
9398         #       fields.  These are unchanged because it doesn't really cause
9399         #       problems for Perl.
9400         # 7)    A number of code points, such as controls, don't have their
9401         #       Unicode Version 1 Names in this file.  These are unchanged.
9402
9403         my @corrected_lines = split /\n/, <<'END';
9404 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9405 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9406 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
9407 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
9408 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
9409 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9410 END
9411
9412         my $file = shift;
9413         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9414
9415         #local $to_trace = 1 if main::DEBUG;
9416         trace $_ if main::DEBUG && $to_trace;
9417
9418         # -1 => retain trailing null fields
9419         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9420
9421         # At the first place that is wrong in the input, insert all the
9422         # corrections, replacing the wrong line.
9423         if ($code_point eq '4E00') {
9424             my @copy = @corrected_lines;
9425             $_ = shift @copy;
9426             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9427
9428             $file->insert_lines(@copy);
9429         }
9430
9431
9432         if ($fields[$NUMERIC] eq '-') {
9433             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
9434         }
9435
9436         if  ($fields[$PERL_DECOMPOSITION] ne "") {
9437
9438             # Several entries have this change to superscript 2 or 3 in the
9439             # middle.  Convert these to the modern version, which is to use
9440             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
9441             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
9442             # 'HHHH HHHH 00B3 HHHH'.
9443             # It turns out that all of these that don't have another
9444             # decomposition defined at the beginning of the line have the
9445             # <square> decomposition in later releases.
9446             if ($code_point ne '00B2' && $code_point ne '00B3') {
9447                 if  ($fields[$PERL_DECOMPOSITION]
9448                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
9449                 {
9450                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
9451                         $fields[$PERL_DECOMPOSITION] = '<square> '
9452                         . $fields[$PERL_DECOMPOSITION];
9453                     }
9454                 }
9455             }
9456
9457             # If is like '<+circled> 0052 <-circled>', convert to
9458             # '<circled> 0052'
9459             $fields[$PERL_DECOMPOSITION] =~
9460                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
9461
9462             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
9463             $fields[$PERL_DECOMPOSITION] =~
9464                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
9465             or $fields[$PERL_DECOMPOSITION] =~
9466                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
9467             or $fields[$PERL_DECOMPOSITION] =~
9468                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
9469             or $fields[$PERL_DECOMPOSITION] =~
9470                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
9471
9472             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
9473             $fields[$PERL_DECOMPOSITION] =~
9474                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
9475
9476             # Change names to modern form.
9477             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
9478             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
9479             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
9480             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
9481
9482             # One entry has weird braces
9483             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
9484         }
9485
9486         $_ = join ';', $code_point, @fields;
9487         trace $_ if main::DEBUG && $to_trace;
9488         return;
9489     }
9490
9491     sub filter_v2_1_5_ucd {
9492         # A dozen entries in this 2.1.5 file had the mirrored and numeric
9493         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
9494         # column appears to be N, swap it back.
9495
9496         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9497         if ($fields[$NUMERIC] eq 'N') {
9498             $fields[$NUMERIC] = $fields[$MIRRORED];
9499             $fields[$MIRRORED] = 'N';
9500             $_ = join ';', $code_point, @fields;
9501         }
9502         return;
9503     }
9504 } # End closure for UnicodeData
9505
9506 sub process_NamedSequences {
9507     # NamedSequences.txt entries are just added to an array.  Because these
9508     # don't look like the other tables, they have their own handler.
9509     # An example:
9510     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
9511     #
9512     # This just adds the sequence to an array for later handling
9513
9514     return; # XXX Until charnames catches up
9515     my $file = shift;
9516     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9517
9518     while ($file->next_line) {
9519         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
9520         if (@remainder) {
9521             $file->carp_bad_line(
9522                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
9523             next;
9524         }
9525         push @named_sequences, "$sequence\t\t$name";
9526     }
9527     return;
9528 }
9529
9530 { # Closure
9531
9532     my $first_range;
9533
9534     sub  filter_early_ea_lb {
9535         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
9536         # third field be the name of the code point, which can be ignored in
9537         # most cases.  But it can be meaningful if it marks a range:
9538         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
9539         # 3400;W;<CJK Ideograph Extension A, First>
9540         #
9541         # We need to see the First in the example above to know it's a range.
9542         # They did not use the later range syntaxes.  This routine changes it
9543         # to use the modern syntax.
9544         # $1 is the Input_file object.
9545
9546         my @fields = split /\s*;\s*/;
9547         if ($fields[2] =~ /^<.*, First>/) {
9548             $first_range = $fields[0];
9549             $_ = "";
9550         }
9551         elsif ($fields[2] =~ /^<.*, Last>/) {
9552             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
9553         }
9554         else {
9555             undef $first_range;
9556             $_ = "$fields[0]; $fields[1]";
9557         }
9558
9559         return;
9560     }
9561 }
9562
9563 sub filter_old_style_arabic_shaping {
9564     # Early versions used a different term for the later one.
9565
9566     my @fields = split /\s*;\s*/;
9567     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
9568     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
9569     $_ = join ';', @fields;
9570     return;
9571 }
9572
9573 sub filter_arabic_shaping_line {
9574     # ArabicShaping.txt has entries that look like:
9575     # 062A; TEH; D; BEH
9576     # The field containing 'TEH' is not used.  The next field is Joining_Type
9577     # and the last is Joining_Group
9578     # This generates two lines to pass on, one for each property on the input
9579     # line.
9580
9581     my $file = shift;
9582     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9583
9584     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9585
9586     if (@fields > 4) {
9587         $file->carp_bad_line('Extra fields');
9588         $_ = "";
9589         return;
9590     }
9591
9592     $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
9593     $_ = "$fields[0]; Joining_Type; $fields[2]";
9594
9595     return;
9596 }
9597
9598 sub setup_special_casing {
9599     # SpecialCasing.txt contains the non-simple case change mappings.  The
9600     # simple ones are in UnicodeData.txt, and should already have been read
9601     # in.
9602     # This routine initializes the full mappings to the simple, then as each
9603     # line is processed, it overrides the simple ones.
9604
9605     my $file= shift;
9606     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9607
9608     # For each of the case change mappings...
9609     foreach my $case ('lc', 'tc', 'uc') {
9610
9611         # The simple version's name in each mapping merely has an 's' in front
9612         # of the full one's
9613         my $simple = property_ref('s' . $case);
9614         unless (defined $simple && ! $simple->is_empty) {
9615             Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
9616         }
9617
9618         # Initialize the full case mappings with the simple ones.
9619         property_ref($case)->initialize($simple);
9620     }
9621
9622     return;
9623 }
9624
9625 sub filter_special_casing_line {
9626     # Change the format of $_ from SpecialCasing.txt into something that the
9627     # generic handler understands.  Each input line contains three case
9628     # mappings.  This will generate three lines to pass to the generic handler
9629     # for each of those.
9630
9631     # The input syntax (after stripping comments and trailing white space is
9632     # like one of the following (with the final two being entries that we
9633     # ignore):
9634     # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
9635     # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
9636     # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
9637     # Note the trailing semi-colon, unlike many of the input files.  That
9638     # means that there will be an extra null field generated by the split
9639
9640     my $file = shift;
9641     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9642
9643     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9644
9645     # field #4 is when this mapping is conditional.  If any of these get
9646     # implemented, it would be by hard-coding in the casing functions in the
9647     # Perl core, not through tables.  But if there is a new condition we don't
9648     # know about, output a warning.  We know about all the conditions through
9649     # 5.2
9650     if ($fields[4] ne "") {
9651         my @conditions = split ' ', $fields[4];
9652         if ($conditions[0] ne 'tr'  # We know that these languages have
9653                                     # conditions, and some are multiple
9654             && $conditions[0] ne 'az'
9655             && $conditions[0] ne 'lt'
9656
9657             # And, we know about a single condition Final_Sigma, but
9658             # nothing else.
9659             && ($v_version gt v5.2.0
9660                 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
9661         {
9662             $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
9663         }
9664         elsif ($conditions[0] ne 'Final_Sigma') {
9665
9666                 # Don't print out a message for Final_Sigma, because we have
9667                 # hard-coded handling for it.  (But the standard could change
9668                 # what the rule should be, but it wouldn't show up here
9669                 # anyway.
9670
9671                 print "# SKIPPING Special Casing: $_\n"
9672                                                     if $verbosity >= $VERBOSE;
9673         }
9674         $_ = "";
9675         return;
9676     }
9677     elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
9678         $file->carp_bad_line('Extra fields');
9679         $_ = "";
9680         return;
9681     }
9682
9683     $_ = "$fields[0]; lc; $fields[1]";
9684     $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
9685     $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
9686
9687     return;
9688 }
9689
9690 sub filter_old_style_case_folding {
9691     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
9692     # and later style.  Different letters were used in the earlier.
9693
9694     my $file = shift;
9695     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9696
9697     my @fields = split /\s*;\s*/;
9698     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
9699         $fields[1] = 'I';
9700     }
9701     elsif ($fields[1] eq 'L') {
9702         $fields[1] = 'C';             # L => C always
9703     }
9704     elsif ($fields[1] eq 'E') {
9705         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
9706             $fields[1] = 'F'
9707         }
9708         else {
9709             $fields[1] = 'C'
9710         }
9711     }
9712     else {
9713         $file->carp_bad_line("Expecting L or E in second field");
9714         $_ = "";
9715         return;
9716     }
9717     $_ = join("; ", @fields) . ';';
9718     return;
9719 }
9720
9721 { # Closure for case folding
9722
9723     # Create the map for simple only if are going to output it, for otherwise
9724     # it takes no part in anything we do.
9725     my $to_output_simple;
9726
9727     # These are experimental, perhaps will need these to pass to regcomp.c to
9728     # handle the cases where for example the Kelvin sign character folds to k,
9729     # and in regcomp, we need to know which of the characters can have a
9730     # non-latin1 char fold to it, so it doesn't do the optimizations it might
9731     # otherwise.
9732     my @latin1_singly_folded;
9733     my @latin1_folded;
9734
9735     sub setup_case_folding($) {
9736         # Read in the case foldings in CaseFolding.txt.  This handles both
9737         # simple and full case folding.
9738
9739         $to_output_simple
9740                         = property_ref('Simple_Case_Folding')->to_output_map;
9741
9742         return;
9743     }
9744
9745     sub filter_case_folding_line {
9746         # Called for each line in CaseFolding.txt
9747         # Input lines look like:
9748         # 0041; C; 0061; # LATIN CAPITAL LETTER A
9749         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
9750         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
9751         #
9752         # 'C' means that folding is the same for both simple and full
9753         # 'F' that it is only for full folding
9754         # 'S' that it is only for simple folding
9755         # 'T' is locale-dependent, and ignored
9756         # 'I' is a type of 'F' used in some early releases.
9757         # Note the trailing semi-colon, unlike many of the input files.  That
9758         # means that there will be an extra null field generated by the split
9759         # below, which we ignore and hence is not an error.
9760
9761         my $file = shift;
9762         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9763
9764         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
9765         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
9766             $file->carp_bad_line('Extra fields');
9767             $_ = "";
9768             return;
9769         }
9770
9771         if ($type eq 'T') {   # Skip Turkic case folding, is locale dependent
9772             $_ = "";
9773             return;
9774         }
9775
9776         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
9777         # I are all full foldings
9778         if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
9779             $_ = "$range; Case_Folding; $map";
9780         }
9781         else {
9782             $_ = "";
9783             if ($type ne 'S') {
9784                $file->carp_bad_line('Expecting C F I S or T in second field');
9785                return;
9786             }
9787         }
9788
9789         # C and S are simple foldings, but simple case folding is not needed
9790         # unless we explicitly want its map table output.
9791         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
9792             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
9793         }
9794
9795         # Experimental, see comment above
9796         if ($type ne 'S' && hex($range) >= 256) {   # assumes range is 1 point
9797             my @folded = split ' ', $map;
9798             if (hex $folded[0] < 256 && @folded == 1) {
9799                 push @latin1_singly_folded, hex $folded[0];
9800             }
9801             foreach my $folded (@folded) {
9802                 push @latin1_folded, hex $folded if hex $folded < 256;
9803             }
9804         }
9805
9806         return;
9807     }
9808
9809     sub post_fold {
9810         # Experimental, see comment above
9811         return;
9812
9813         #local $to_trace = 1 if main::DEBUG;
9814         @latin1_singly_folded = uniques(@latin1_singly_folded);
9815         @latin1_folded = uniques(@latin1_folded);
9816         trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
9817         trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
9818         return;
9819     }
9820 } # End case fold closure
9821
9822 sub filter_jamo_line {
9823     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
9824     # from this file that is used in generating the Name property for Jamo
9825     # code points.  But, it also is used to convert early versions' syntax
9826     # into the modern form.  Here are two examples:
9827     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
9828     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
9829     #
9830     # The input is $_, the output is $_ filtered.
9831
9832     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
9833
9834     # Let the caller handle unexpected input.  In earlier versions, there was
9835     # a third field which is supposed to be a comment, but did not have a '#'
9836     # before it.
9837     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
9838
9839     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
9840                                 # beginning.
9841
9842     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
9843     $fields[1] = 'R' if $fields[0] eq '1105';
9844
9845     # Add to structure so can generate Names from it.
9846     my $cp = hex $fields[0];
9847     my $short_name = $fields[1];
9848     $Jamo{$cp} = $short_name;
9849     if ($cp <= $LBase + $LCount) {
9850         $Jamo_L{$short_name} = $cp - $LBase;
9851     }
9852     elsif ($cp <= $VBase + $VCount) {
9853         $Jamo_V{$short_name} = $cp - $VBase;
9854     }
9855     elsif ($cp <= $TBase + $TCount) {
9856         $Jamo_T{$short_name} = $cp - $TBase;
9857     }
9858     else {
9859         Carp::my_carp_bug("Unexpected Jamo code point in $_");
9860     }
9861
9862
9863     # Reassemble using just the first two fields to look like a typical
9864     # property file line
9865     $_ = "$fields[0]; $fields[1]";
9866
9867     return;
9868 }
9869
9870 sub register_fraction($) {
9871     # This registers the input rational number so that it can be passed on to
9872     # utf8_heavy.pl, both in rational and floating forms.
9873
9874     my $rational = shift;
9875
9876     my $float = eval $rational;
9877     $nv_floating_to_rational{$float} = $rational;
9878     return;
9879 }
9880
9881 sub filter_numeric_value_line {
9882     # DNumValues contains lines of a different syntax than the typical
9883     # property file:
9884     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
9885     #
9886     # This routine transforms $_ containing the anomalous syntax to the
9887     # typical, by filtering out the extra columns, and convert early version
9888     # decimal numbers to strings that look like rational numbers.
9889
9890     my $file = shift;
9891     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9892
9893     # Starting in 5.1, there is a rational field.  Just use that, omitting the
9894     # extra columns.  Otherwise convert the decimal number in the second field
9895     # to a rational, and omit extraneous columns.
9896     my @fields = split /\s*;\s*/, $_, -1;
9897     my $rational;
9898
9899     if ($v_version ge v5.1.0) {
9900         if (@fields != 4) {
9901             $file->carp_bad_line('Not 4 semi-colon separated fields');
9902             $_ = "";
9903             return;
9904         }
9905         $rational = $fields[3];
9906         $_ = join '; ', @fields[ 0, 3 ];
9907     }
9908     else {
9909
9910         # Here, is an older Unicode file, which has decimal numbers instead of
9911         # rationals in it.  Use the fraction to calculate the denominator and
9912         # convert to rational.
9913
9914         if (@fields != 2 && @fields != 3) {
9915             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
9916             $_ = "";
9917             return;
9918         }
9919
9920         my $codepoints = $fields[0];
9921         my $decimal = $fields[1];
9922         if ($decimal =~ s/\.0+$//) {
9923
9924             # Anything ending with a decimal followed by nothing but 0's is an
9925             # integer
9926             $_ = "$codepoints; $decimal";
9927             $rational = $decimal;
9928         }
9929         else {
9930
9931             my $denominator;
9932             if ($decimal =~ /\.50*$/) {
9933                 $denominator = 2;
9934             }
9935
9936             # Here have the hardcoded repeating decimals in the fraction, and
9937             # the denominator they imply.  There were only a few denominators
9938             # in the older Unicode versions of this file which this code
9939             # handles, so it is easy to convert them.
9940
9941             # The 4 is because of a round-off error in the Unicode 3.2 files
9942             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
9943                 $denominator = 3;
9944             }
9945             elsif ($decimal =~ /\.[27]50*$/) {
9946                 $denominator = 4;
9947             }
9948             elsif ($decimal =~ /\.[2468]0*$/) {
9949                 $denominator = 5;
9950             }
9951             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
9952                 $denominator = 6;
9953             }
9954             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
9955                 $denominator = 8;
9956             }
9957             if ($denominator) {
9958                 my $sign = ($decimal < 0) ? "-" : "";
9959                 my $numerator = int((abs($decimal) * $denominator) + .5);
9960                 $rational = "$sign$numerator/$denominator";
9961                 $_ = "$codepoints; $rational";
9962             }
9963             else {
9964                 $file->carp_bad_line("Can't cope with number '$decimal'.");
9965                 $_ = "";
9966                 return;
9967             }
9968         }
9969     }
9970
9971     register_fraction($rational) if $rational =~ qr{/};
9972     return;
9973 }
9974
9975 { # Closure
9976     my %unihan_properties;
9977     my $iicore;
9978
9979
9980     sub setup_unihan {
9981         # Do any special setup for Unihan properties.
9982
9983         # This property gives the wrong computed type, so override.
9984         my $usource = property_ref('kIRG_USource');
9985         $usource->set_type($STRING) if defined $usource;
9986
9987         # This property is to be considered binary, so change all the values
9988         # to Y.
9989         $iicore = property_ref('kIICore');
9990         if (defined $iicore) {
9991             $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
9992
9993             # We have to change the default map, because the @missing line is
9994             # misleading, given that we are treating it as binary.
9995             $iicore->set_default_map('N');
9996             $iicore->set_type($BINARY);
9997         }
9998
9999         return;
10000     }
10001
10002     sub filter_unihan_line {
10003         # Change unihan db lines to look like the others in the db.  Here is
10004         # an input sample:
10005         #   U+341C        kCangjie        IEKN
10006
10007         # Tabs are used instead of semi-colons to separate fields; therefore
10008         # they may have semi-colons embedded in them.  Change these to periods
10009         # so won't screw up the rest of the code.
10010         s/;/./g;
10011
10012         # Remove lines that don't look like ones we accept.
10013         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10014             $_ = "";
10015             return;
10016         }
10017
10018         # Extract the property, and save a reference to its object.
10019         my $property = $1;
10020         if (! exists $unihan_properties{$property}) {
10021             $unihan_properties{$property} = property_ref($property);
10022         }
10023
10024         # Don't do anything unless the property is one we're handling, which
10025         # we determine by seeing if there is an object defined for it or not
10026         if (! defined $unihan_properties{$property}) {
10027             $_ = "";
10028             return;
10029         }
10030
10031         # The iicore property is supposed to be a boolean, so convert to our
10032         # standard boolean form.
10033         if (defined $iicore && $unihan_properties{$property} == $iicore) {
10034             $_ =~ s/$property.*/$property\tY/
10035         }
10036
10037         # Convert the tab separators to our standard semi-colons, and convert
10038         # the U+HHHH notation to the rest of the standard's HHHH
10039         s/\t/;/g;
10040         s/\b U \+ (?= $code_point_re )//xg;
10041
10042         #local $to_trace = 1 if main::DEBUG;
10043         trace $_ if main::DEBUG && $to_trace;
10044
10045         return;
10046     }
10047 }
10048
10049 sub filter_blocks_lines {
10050     # In the Blocks.txt file, the names of the blocks don't quite match the
10051     # names given in PropertyValueAliases.txt, so this changes them so they
10052     # do match:  Blanks and hyphens are changed into underscores.  Also makes
10053     # early release versions look like later ones
10054     #
10055     # $_ is transformed to the correct value.
10056
10057     my $file = shift;
10058         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10059
10060     if ($v_version lt v3.2.0) {
10061         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10062             $_ = "";
10063             return;
10064         }
10065
10066         # Old versions used a different syntax to mark the range.
10067         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10068     }
10069
10070     my @fields = split /\s*;\s*/, $_, -1;
10071     if (@fields != 2) {
10072         $file->carp_bad_line("Expecting exactly two fields");
10073         $_ = "";
10074         return;
10075     }
10076
10077     # Change hyphens and blanks in the block name field only
10078     $fields[1] =~ s/[ -]/_/g;
10079     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
10080
10081     $_ = join("; ", @fields);
10082     return;
10083 }
10084
10085 { # Closure
10086     my $current_property;
10087
10088     sub filter_old_style_proplist {
10089         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
10090         # was in a completely different syntax.  Ken Whistler of Unicode says
10091         # that it was something he used as an aid for his own purposes, but
10092         # was never an official part of the standard.  However, comments in
10093         # DAge.txt indicate that non-character code points were available in
10094         # the UCD as of 3.1.  It is unclear to me (khw) how they could be
10095         # there except through this file (but on the other hand, they first
10096         # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10097         # not.  But the claim is that it was published as an aid to others who
10098         # might want some more information than was given in the official UCD
10099         # of the time.  Many of the properties in it were incorporated into
10100         # the later PropList.txt, but some were not.  This program uses this
10101         # early file to generate property tables that are otherwise not
10102         # accessible in the early UCD's, and most were probably not really
10103         # official at that time, so one could argue that it should be ignored,
10104         # and you can easily modify things to skip this.  And there are bugs
10105         # in this file in various versions.  (For example, the 2.1.9 version
10106         # removes from Alphabetic the CJK range starting at 4E00, and they
10107         # weren't added back in until 3.1.0.)  Many of this file's properties
10108         # were later sanctioned, so this code generates tables for those
10109         # properties that aren't otherwise in the UCD of the time but
10110         # eventually did become official, and throws away the rest.  Here is a
10111         # list of all the ones that are thrown away:
10112         #   Bidi=*                       duplicates UnicodeData.txt
10113         #   Combining                    never made into official property;
10114         #                                is \P{ccc=0}
10115         #   Composite                    never made into official property.
10116         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
10117         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
10118         #   Delimiter                    never made into official property;
10119         #                                removed in 3.0.1
10120         #   Format Control               never made into official property;
10121         #                                similar to gc=cf
10122         #   High Surrogate               duplicates Blocks.txt
10123         #   Ignorable Control            never made into official property;
10124         #                                similar to di=y
10125         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
10126         #   Left of Pair                 never made into official property;
10127         #   Line Separator               duplicates UnicodeData.txt: gc=zl
10128         #   Low Surrogate                duplicates Blocks.txt
10129         #   Non-break                    was actually listed as a property
10130         #                                in 3.2, but without any code
10131         #                                points.  Unicode denies that this
10132         #                                was ever an official property
10133         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
10134         #   Numeric                      duplicates UnicodeData.txt: gc=cc
10135         #   Paired Punctuation           never made into official property;
10136         #                                appears to be gc=ps + gc=pe
10137         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
10138         #   Private Use                  duplicates UnicodeData.txt: gc=co
10139         #   Private Use High Surrogate   duplicates Blocks.txt
10140         #   Punctuation                  duplicates UnicodeData.txt: gc=p
10141         #   Space                        different definition than eventual
10142         #                                one.
10143         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
10144         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cc
10145         #   Zero-width                   never made into offical property;
10146         #                                subset of gc=cf
10147         # Most of the properties have the same names in this file as in later
10148         # versions, but a couple do not.
10149         #
10150         # This subroutine filters $_, converting it from the old style into
10151         # the new style.  Here's a sample of the old-style
10152         #
10153         #   *******************************************
10154         #
10155         #   Property dump for: 0x100000A0 (Join Control)
10156         #
10157         #   200C..200D  (2 chars)
10158         #
10159         # In the example, the property is "Join Control".  It is kept in this
10160         # closure between calls to the subroutine.  The numbers beginning with
10161         # 0x were internal to Ken's program that generated this file.
10162
10163         # If this line contains the property name, extract it.
10164         if (/^Property dump for: [^(]*\((.*)\)/) {
10165             $_ = $1;
10166
10167             # Convert white space to underscores.
10168             s/ /_/g;
10169
10170             # Convert the few properties that don't have the same name as
10171             # their modern counterparts
10172             s/Identifier_Part/ID_Continue/
10173             or s/Not_a_Character/NChar/;
10174
10175             # If the name matches an existing property, use it.
10176             if (defined property_ref($_)) {
10177                 trace "new property=", $_ if main::DEBUG && $to_trace;
10178                 $current_property = $_;
10179             }
10180             else {        # Otherwise discard it
10181                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10182                 undef $current_property;
10183             }
10184             $_ = "";    # The property is saved for the next lines of the
10185                         # file, but this defining line is of no further use,
10186                         # so clear it so that the caller won't process it
10187                         # further.
10188         }
10189         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10190
10191             # Here, the input line isn't a header defining a property for the
10192             # following section, and either we aren't in such a section, or
10193             # the line doesn't look like one that defines the code points in
10194             # such a section.  Ignore this line.
10195             $_ = "";
10196         }
10197         else {
10198
10199             # Here, we have a line defining the code points for the current
10200             # stashed property.  Anything starting with the first blank is
10201             # extraneous.  Otherwise, it should look like a normal range to
10202             # the caller.  Append the property name so that it looks just like
10203             # a modern PropList entry.
10204
10205             $_ =~ s/\s.*//;
10206             $_ .= "; $current_property";
10207         }
10208         trace $_ if main::DEBUG && $to_trace;
10209         return;
10210     }
10211 } # End closure for old style proplist
10212
10213 sub filter_old_style_normalization_lines {
10214     # For early releases of Unicode, the lines were like:
10215     #        74..2A76    ; NFKD_NO
10216     # For later releases this became:
10217     #        74..2A76    ; NFKD_QC; N
10218     # Filter $_ to look like those in later releases.
10219     # Similarly for MAYBEs
10220
10221     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10222
10223     # Also, the property FC_NFKC was abbreviated to FNC
10224     s/FNC/FC_NFKC/;
10225     return;
10226 }
10227
10228 sub finish_Unicode() {
10229     # This routine should be called after all the Unicode files have been read
10230     # in.  It:
10231     # 1) Adds the mappings for code points missing from the files which have
10232     #    defaults specified for them.
10233     # 2) At this this point all mappings are known, so it computes the type of
10234     #    each property whose type hasn't been determined yet.
10235     # 3) Calculates all the regular expression match tables based on the
10236     #    mappings.
10237     # 3) Calculates and adds the tables which are defined by Unicode, but
10238     #    which aren't derived by them
10239
10240     # For each property, fill in any missing mappings, and calculate the re
10241     # match tables.  If a property has more than one missing mapping, the
10242     # default is a reference to a data structure, and requires data from other
10243     # properties to resolve.  The sort is used to cause these to be processed
10244     # last, after all the other properties have been calculated.
10245     # (Fortunately, the missing properties so far don't depend on each other.)
10246     foreach my $property
10247         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10248         property_ref('*'))
10249     {
10250         # $perl has been defined, but isn't one of the Unicode properties that
10251         # need to be finished up.
10252         next if $property == $perl;
10253
10254         # Handle the properties that have more than one possible default
10255         if (ref $property->default_map) {
10256             my $default_map = $property->default_map;
10257
10258             # These properties have stored in the default_map:
10259             # One or more of:
10260             #   1)  A default map which applies to all code points in a
10261             #       certain class
10262             #   2)  an expression which will evaluate to the list of code
10263             #       points in that class
10264             # And
10265             #   3) the default map which applies to every other missing code
10266             #      point.
10267             #
10268             # Go through each list.
10269             while (my ($default, $eval) = $default_map->get_next_defaults) {
10270
10271                 # Get the class list, and intersect it with all the so-far
10272                 # unspecified code points yielding all the code points
10273                 # in the class that haven't been specified.
10274                 my $list = eval $eval;
10275                 if ($@) {
10276                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10277                     last;
10278                 }
10279
10280                 # Narrow down the list to just those code points we don't have
10281                 # maps for yet.
10282                 $list = $list & $property->inverse_list;
10283
10284                 # Add mappings to the property for each code point in the list
10285                 foreach my $range ($list->ranges) {
10286                     $property->add_map($range->start, $range->end, $default);
10287                 }
10288             }
10289
10290             # All remaining code points have the other mapping.  Set that up
10291             # so the normal single-default mapping code will work on them
10292             $property->set_default_map($default_map->other_default);
10293
10294             # And fall through to do that
10295         }
10296
10297         # We should have enough data now to compute the type of the property.
10298         $property->compute_type;
10299         my $property_type = $property->type;
10300
10301         next if ! $property->to_create_match_tables;
10302
10303         # Here want to create match tables for this property
10304
10305         # The Unicode db always (so far, and they claim into the future) have
10306         # the default for missing entries in binary properties be 'N' (unless
10307         # there is a '@missing' line that specifies otherwise)
10308         if ($property_type == $BINARY && ! defined $property->default_map) {
10309             $property->set_default_map('N');
10310         }
10311
10312         # Add any remaining code points to the mapping, using the default for
10313         # missing code points
10314         if (defined (my $default_map = $property->default_map)) {
10315             foreach my $range ($property->inverse_list->ranges) {
10316                 $property->add_map($range->start, $range->end, $default_map);
10317             }
10318
10319             # Make sure there is a match table for the default
10320             if (! defined $property->table($default_map)) {
10321                 $property->add_match_table($default_map);
10322             }
10323         }
10324
10325         # Have all we need to populate the match tables.
10326         my $property_name = $property->name;
10327         foreach my $range ($property->ranges) {
10328             my $map = $range->value;
10329             my $table = property_ref($property_name)->table($map);
10330             if (! defined $table) {
10331
10332                 # Integral and rational property values are not necessarily
10333                 # defined in PropValueAliases, but all other ones should be,
10334                 # starting in 5.1
10335                 if ($v_version ge v5.1.0
10336                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
10337                 {
10338                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
10339                 }
10340                 $table = property_ref($property_name)->add_match_table($map);
10341             }
10342
10343             $table->add_range($range->start, $range->end);
10344         }
10345
10346         # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
10347         # all properties have this optional prefix.  These do not get a
10348         # separate entry in the pod file, because are covered by a wild-card
10349         # entry
10350         foreach my $alias ($property->aliases) {
10351             my $Is_name = 'Is_' . $alias->name;
10352             if (! defined (my $pre_existing = property_ref($Is_name))) {
10353                 $property->add_alias($Is_name,
10354                                      Pod_Entry => 0,
10355                                      Status => $alias->status,
10356                                      Externally_Ok => 0);
10357             }
10358             else {
10359
10360                 # It seemed too much work to add in these warnings when it
10361                 # appears that Unicode has made a decision never to begin a
10362                 # property name with 'Is_', so this shouldn't happen, but just
10363                 # in case, it is a warning.
10364                 Carp::my_carp(<<END
10365 There is already an alias named $Is_name (from " . $pre_existing . "), so not
10366 creating this alias for $property.  The generated table and pod files do not
10367 warn users of this conflict.
10368 END
10369                 );
10370                 $has_Is_conflicts++;
10371             }
10372         } # End of loop through aliases for this property
10373     } # End of loop through all Unicode properties.
10374
10375     # Fill in the mappings that Unicode doesn't completely furnish.  First the
10376     # single letter major general categories.  If Unicode were to start
10377     # delivering the values, this would be redundant, but better that than to
10378     # try to figure out if should skip and not get it right.  Ths could happen
10379     # if a new major category were to be introduced, and the hard-coded test
10380     # wouldn't know about it.
10381     # This routine depends on the standard names for the general categories
10382     # being what it thinks they are, like 'Cn'.  The major categories are the
10383     # union of all the general category tables which have the same first
10384     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
10385     foreach my $minor_table ($gc->tables) {
10386         my $minor_name = $minor_table->name;
10387         next if length $minor_name == 1;
10388         if (length $minor_name != 2) {
10389             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
10390             next;
10391         }
10392
10393         my $major_name = uc(substr($minor_name, 0, 1));
10394         my $major_table = $gc->table($major_name);
10395         $major_table += $minor_table;
10396     }
10397
10398     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
10399     # defines it as LC)
10400     my $LC = $gc->table('LC');
10401     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
10402     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
10403
10404
10405     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
10406                          # deliver the correct values in it
10407         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
10408
10409         # Lt not in release 1.
10410         $LC += $gc->table('Lt') if defined $gc->table('Lt');
10411     }
10412     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
10413
10414     my $Cs = $gc->table('Cs');
10415     if (defined $Cs) {
10416         $Cs->add_note('Mostly not usable in Perl.');
10417         $Cs->add_comment(join_lines(<<END
10418 Surrogates are used exclusively for I/O in UTF-16, and should not appear in
10419 Unicode text, and hence their use will generate (usually fatal) messages
10420 END
10421         ));
10422     }
10423
10424
10425     # Folding information was introduced later into Unicode data.  To get
10426     # Perl's case ignore (/i) to work at all in releases that don't have
10427     # folding, use the best available alternative, which is lower casing.
10428     my $fold = property_ref('Simple_Case_Folding');
10429     if ($fold->is_empty) {
10430         $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
10431         $fold->add_note(join_lines(<<END
10432 WARNING: This table uses lower case as a substitute for missing fold
10433 information
10434 END
10435         ));
10436     }
10437
10438     # Multiple-character mapping was introduced later into Unicode data.  If
10439     # missing, use the single-characters maps as best available alternative
10440     foreach my $map (qw {   Uppercase_Mapping
10441                             Lowercase_Mapping
10442                             Titlecase_Mapping
10443                             Case_Folding
10444                         } ) {
10445         my $full = property_ref($map);
10446         if ($full->is_empty) {
10447             my $simple = property_ref('Simple_' . $map);
10448             $full->initialize($simple);
10449             $full->add_comment($simple->comment) if ($simple->comment);
10450             $full->add_note(join_lines(<<END
10451 WARNING: This table uses simple mapping (single-character only) as a
10452 substitute for missing multiple-character information
10453 END
10454             ));
10455         }
10456     }
10457     return
10458 }
10459
10460 sub compile_perl() {
10461     # Create perl-defined tables.  Almost all are part of the pseudo-property
10462     # named 'perl' internally to this program.  Many of these are recommended
10463     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
10464     # on those found there.
10465     # Almost all of these are equivalent to some Unicode property.
10466     # A number of these properties have equivalents restricted to the ASCII
10467     # range, with their names prefaced by 'Posix', to signify that these match
10468     # what the Posix standard says they should match.  A couple are
10469     # effectively this, but the name doesn't have 'Posix' in it because there
10470     # just isn't any Posix equivalent.
10471
10472     # 'Any' is all code points.  As an error check, instead of just setting it
10473     # to be that, construct it to be the union of all the major categories
10474     my $Any = $perl->add_match_table('Any',
10475             Description  => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
10476             Matches_All => 1);
10477
10478     foreach my $major_table ($gc->tables) {
10479
10480         # Major categories are the ones with single letter names.
10481         next if length($major_table->name) != 1;
10482
10483         $Any += $major_table;
10484     }
10485
10486     if ($Any->max != $LAST_UNICODE_CODEPOINT) {
10487         Carp::my_carp_bug("Generated highest code point ("
10488            . sprintf("%X", $Any->max)
10489            . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
10490     }
10491     if ($Any->range_count != 1 || $Any->min != 0) {
10492      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
10493     }
10494
10495     $Any->add_alias('All');
10496
10497     # Assigned is the opposite of gc=unassigned
10498     my $Assigned = $perl->add_match_table('Assigned',
10499                                 Description  => "All assigned code points",
10500                                 Initialize => ~ $gc->table('Unassigned'),
10501                                 );
10502
10503     # Our internal-only property should be treated as more than just a
10504     # synonym.
10505     $perl->add_match_table('_CombAbove')
10506             ->set_equivalent_to(property_ref('ccc')->table('Above'),
10507                                                                 Related => 1);
10508
10509     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
10510     if (defined $block) {   # This is equivalent to the block if have it.
10511         my $Unicode_ASCII = $block->table('Basic_Latin');
10512         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
10513             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
10514         }
10515     }
10516
10517     # Very early releases didn't have blocks, so initialize ASCII ourselves if
10518     # necessary
10519     if ($ASCII->is_empty) {
10520         $ASCII->initialize([ 0..127 ]);
10521     }
10522
10523     # A number of the Perl synonyms have a restricted-range synonym whose name
10524     # begins with Posix.  This hash gets filled in with them, so that they can
10525     # be populated in a small loop.
10526     my %posix_equivalent;
10527
10528     # Get the best available case definitions.  Early Unicode versions didn't
10529     # have Uppercase and Lowercase defined, so use the general category
10530     # instead for them.
10531     my $Lower = $perl->add_match_table('Lower');
10532     my $Unicode_Lower = property_ref('Lowercase');
10533     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
10534         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
10535     }
10536     else {
10537         $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
10538                                                                 Related => 1);
10539     }
10540     $posix_equivalent{'Lower'} = $Lower;
10541
10542     my $Upper = $perl->add_match_table('Upper');
10543     my $Unicode_Upper = property_ref('Uppercase');
10544     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
10545         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
10546     }
10547     else {
10548         $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
10549                                                                 Related => 1);
10550     }
10551     $posix_equivalent{'Upper'} = $Upper;
10552
10553     # Earliest releases didn't have title case.  Initialize it to empty if not
10554     # otherwise present
10555     my $Title = $perl->add_match_table('Title');
10556     my $lt = $gc->table('Lt');
10557     if (defined $lt) {
10558         $Title->set_equivalent_to($lt, Related => 1);
10559     }
10560
10561     # If this Unicode version doesn't have Cased, set up our own.  From
10562     # Unicode 5.1: Definition D120: A character C is defined to be cased if
10563     # and only if C has the Lowercase or Uppercase property or has a
10564     # General_Category value of Titlecase_Letter.
10565     unless (defined property_ref('Cased')) {
10566         my $cased = $perl->add_match_table('Cased',
10567                         Initialize => $Lower + $Upper + $Title,
10568                         Description => 'Uppercase or Lowercase or Titlecase',
10569                         );
10570     }
10571
10572     # Similarly, set up our own Case_Ignorable property if this Unicode
10573     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
10574     # C is defined to be case-ignorable if C has the value MidLetter or the
10575     # value MidNumLet for the Word_Break property or its General_Category is
10576     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
10577     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10578
10579     # Perl has long had an internal-only alias for this property.
10580     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
10581     my $case_ignorable = property_ref('Case_Ignorable');
10582     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
10583         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
10584                                                                 Related => 1);
10585     }
10586     else {
10587
10588         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
10589
10590         # The following three properties are not in early releases
10591         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
10592         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
10593         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
10594
10595         # For versions 4.1 - 5.0, there is no MidNumLet property, and
10596         # correspondingly the case-ignorable definition lacks that one.  For
10597         # 4.0, it appears that it was meant to be the same definition, but was
10598         # inadvertently omitted from the standard's text, so add it if the
10599         # property actually is there
10600         my $wb = property_ref('Word_Break');
10601         if (defined $wb) {
10602             my $midlet = $wb->table('MidLetter');
10603             $perl_case_ignorable += $midlet if defined $midlet;
10604             my $midnumlet = $wb->table('MidNumLet');
10605             $perl_case_ignorable += $midnumlet if defined $midnumlet;
10606         }
10607         else {
10608
10609             # In earlier versions of the standard, instead of the above two
10610             # properties , just the following characters were used:
10611             $perl_case_ignorable +=  0x0027  # APOSTROPHE
10612                                 +   0x00AD  # SOFT HYPHEN (SHY)
10613                                 +   0x2019; # RIGHT SINGLE QUOTATION MARK
10614         }
10615     }
10616
10617     # The remaining perl defined tables are mostly based on Unicode TR 18,
10618     # "Annex C: Compatibility Properties".  All of these have two versions,
10619     # one whose name generally begins with Posix that is posix-compliant, and
10620     # one that matches Unicode characters beyond the Posix, ASCII range
10621
10622     my $Alpha = $perl->add_match_table('Alpha',
10623                         Description => '[[:Alpha:]] extended beyond ASCII');
10624
10625     # Alphabetic was not present in early releases
10626     my $Alphabetic = property_ref('Alphabetic');
10627     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
10628         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
10629     }
10630     else {
10631
10632         # For early releases, we don't get it exactly right.  The below
10633         # includes more than it should, which in 5.2 terms is: L + Nl +
10634         # Other_Alphabetic.  Other_Alphabetic contains many characters from
10635         # Mn and Mc.  It's better to match more than we should, than less than
10636         # we should.
10637         $Alpha->initialize($gc->table('Letter')
10638                             + $gc->table('Mn')
10639                             + $gc->table('Mc'));
10640         $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
10641     }
10642     $posix_equivalent{'Alpha'} = $Alpha;
10643
10644     my $Alnum = $perl->add_match_table('Alnum',
10645                         Description => "[[:Alnum:]] extended beyond ASCII",
10646                         Initialize => $Alpha + $gc->table('Decimal_Number'),
10647                         );
10648     $posix_equivalent{'Alnum'} = $Alnum;
10649
10650     my $Word = $perl->add_match_table('Word',
10651                                 Description => '\w, including beyond ASCII',
10652                                 Initialize => $Alnum + $gc->table('Mark'),
10653                                 );
10654     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
10655     $Word += $Pc if defined $Pc;
10656
10657     # There is no [[:Word:]], so the name doesn't begin with Posix.
10658     $perl->add_match_table('PerlWord',
10659                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
10660                     Initialize => $Word & $ASCII,
10661                     );
10662
10663     my $Blank = $perl->add_match_table('Blank',
10664                                 Description => '\h, Horizontal white space',
10665
10666                                 # 200B is Zero Width Space which is for line
10667                                 # break control, and was listed as
10668                                 # Space_Separator in early releases
10669                                 Initialize => $gc->table('Space_Separator')
10670                                             +   0x0009  # TAB
10671                                             -   0x200B, # ZWSP
10672                                 );
10673     $Blank->add_alias('HorizSpace');        # Another name for it.
10674     $posix_equivalent{'Blank'} = $Blank;
10675
10676     my $VertSpace = $perl->add_match_table('VertSpace',
10677                             Description => '\v',
10678                             Initialize => $gc->table('Line_Separator')
10679                                         + $gc->table('Paragraph_Separator')
10680                                         + 0x000A  # LINE FEED
10681                                         + 0x000B  # VERTICAL TAB
10682                                         + 0x000C  # FORM FEED
10683                                         + 0x000D  # CARRIAGE RETURN
10684                                         + 0x0085, # NEL
10685                             );
10686     # No Posix equivalent for vertical space
10687
10688     my $Space = $perl->add_match_table('Space',
10689         Description => '\s including beyond ASCII plus vertical tab = [[:Space:]]',
10690         Initialize => $Blank + $VertSpace,
10691     );
10692     $posix_equivalent{'Space'} = $Space;
10693
10694     # Perl's traditional space doesn't include Vertical Tab
10695     my $SpacePerl = $perl->add_match_table('SpacePerl',
10696                                   Description => '\s, including beyond ASCII',
10697                                   Initialize => $Space - 0x000B,
10698                                 );
10699     $perl->add_match_table('PerlSpace',
10700                             Description => '\s, restricted to ASCII',
10701                             Initialize => $SpacePerl & $ASCII,
10702                             );
10703
10704     my $Cntrl = $perl->add_match_table('Cntrl',
10705                         Description => "[[:Cntrl:]] extended beyond ASCII");
10706     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
10707     $posix_equivalent{'Cntrl'} = $Cntrl;
10708
10709     # $controls is a temporary used to construct Graph.
10710     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
10711                                                 + $gc->table('Control'));
10712     # Cs not in release 1
10713     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
10714
10715     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
10716     my $Graph = $perl->add_match_table('Graph',
10717                         Description => "[[:Graph:]] extended beyond ASCII",
10718                         Initialize => ~ ($Space + $controls),
10719                         );
10720     $posix_equivalent{'Graph'} = $Graph;
10721
10722     my $Print = $perl->add_match_table('Print',
10723                         Description => "[[:Print:]] extended beyond ASCII",
10724                         Initialize => $Space + $Graph - $gc->table('Control'),
10725                         );
10726     $posix_equivalent{'Print'} = $Print;
10727
10728     my $Punct = $perl->add_match_table('Punct');
10729     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
10730
10731     # \p{punct} doesn't include the symbols, which posix does
10732     $perl->add_match_table('PosixPunct',
10733                             Description => "[[:Punct:]]",
10734                             Initialize => $ASCII & ($gc->table('Punctuation')
10735                                                     + $gc->table('Symbol')),
10736                             );
10737
10738     my $Digit = $perl->add_match_table('Digit',
10739                             Description => '\d, extended beyond just [0-9]');
10740     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
10741     $posix_equivalent{'Digit'} = $Digit;
10742
10743     # AHex was not present in early releases
10744     my $Xdigit = $perl->add_match_table('XDigit',
10745                                         Description => '[0-9A-Fa-f]');
10746     my $AHex = property_ref('ASCII_Hex_Digit');
10747     if (defined $AHex && ! $AHex->is_empty) {
10748         $Xdigit->set_equivalent_to($AHex->table('Y'), Related => 1);
10749     }
10750     else {
10751         # (Have to use hex because could be running on an non-ASCII machine,
10752         # and we want the Unicode (ASCII) values)
10753         $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66 ]);
10754     }
10755
10756     # Now, add the ASCII-restricted tables that get uniform treatment
10757     while (my ($name, $table) = each %posix_equivalent) {
10758         $perl->add_match_table("Posix$name",
10759                                 Description => "[[:$name:]]",
10760                                 Initialize => $table & $ASCII,
10761                                 );
10762     }
10763     $perl->table('PosixDigit')->add_description('\d, restricted to ASCII');
10764     $perl->table('PosixDigit')->add_description('[0-9]');
10765
10766
10767     my $dt = property_ref('Decomposition_Type');
10768     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
10769         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
10770         Perl_Extension => 1,
10771         Note => 'Perl extension consisting of the union of all non-canonical decompositions',
10772         );
10773
10774     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
10775     # than SD appeared, construct it ourselves, based on the first release SD
10776     # was in.
10777     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
10778     my $soft_dotted = property_ref('Soft_Dotted');
10779     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
10780         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
10781     }
10782     else {
10783
10784         # This list came from 3.2 Soft_Dotted.
10785         $CanonDCIJ->initialize([ 0x0069,
10786                                  0x006A,
10787                                  0x012F,
10788                                  0x0268,
10789                                  0x0456,
10790                                  0x0458,
10791                                  0x1E2D,
10792                                  0x1ECB,
10793                                ]);
10794         $CanonDCIJ = $CanonDCIJ & $Assigned;
10795     }
10796
10797     # These are used in Unicode's definition of \X
10798     my $gcb = property_ref('Grapheme_Cluster_Break');
10799     #my $extend = $perl->add_match_table('_X_Extend');
10800     my $extend = $perl->add_match_table('_GCB_Extend');
10801     # XXX until decide what todo my $begin = $perl->add_match_table('_X_Begin');
10802     if (defined $gcb) {
10803         $extend += $gcb->table('Extend') + $gcb->table('SpacingMark')
10804         #$begin += ~ ($gcb->table('Control')
10805         #             + $gcb->table('CR')
10806         #             + $gcb->table('LF'));
10807     }
10808     else {    # Old definition, used on early releases.
10809         $extend += $gc->table('Mark')
10810                     + 0x200C    # ZWNJ
10811                     + 0x200D;    # ZWJ
10812         #$begin += ~ $extend;
10813     }
10814
10815     # Create a new property specially located that is a combination of the
10816     # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10817     # Name_Alias properties.  (The final duplicates elements of the first.)  A
10818     # comment for it is constructed based on the actual properties present and
10819     # used
10820     my $perl_charname = Property->new('Perl_Charnames',
10821                                 Core_Access => '\N{...} and charnames.pm',
10822                                 Default_Map => "",
10823                                 Directory => File::Spec->curdir(),
10824                                 File => 'Name',
10825                                 Internal_Only_Warning => 1,
10826                                 Perl_Extension => 1,
10827                                 Range_Size_1 => 1,
10828                                 Type => $STRING,
10829                                 Initialize => property_ref('Unicode_1_Name'),
10830                                 );
10831     # Name overrides Unicode_1_Name
10832     $perl_charname->property_add_or_replace_non_nulls(property_ref('Name'));
10833     my @composition = ('Name', 'Unicode_1_Name');
10834
10835     if (@named_sequences) {
10836         push @composition, 'Named_Sequence';
10837         foreach my $sequence (@named_sequences) {
10838             $perl_charname->add_anomalous_entry($sequence);
10839         }
10840     }
10841
10842     my $alias_sentence = "";
10843     my $alias = property_ref('Name_Alias');
10844     if (defined $alias) {
10845         push @composition, 'Name_Alias';
10846         $alias->reset_each_range;
10847         while (my ($range) = $alias->each_range) {
10848             next if $range->value eq "";
10849             if ($range->start != $range->end) {
10850                 Carp::my_carp("Expecting only one code point in the range $range.  Just to keep going, using just the first code point;");
10851             }
10852             $perl_charname->add_duplicate($range->start, $range->value);
10853         }
10854         $alias_sentence = <<END;
10855 The Name_Alias property adds duplicate code point entries with a corrected
10856 name.  The original (less correct, but still valid) name will be physically
10857 first.
10858 END
10859     }
10860     my $comment;
10861     if (@composition <= 2) { # Always at least 2
10862         $comment = join " and ", @composition;
10863     }
10864     else {
10865         $comment = join ", ", @composition[0 .. scalar @composition - 2];
10866         $comment .= ", and $composition[-1]";
10867     }
10868
10869     # Wait for charnames to catch up
10870 #    foreach my $entry (@more_Names,
10871 #                        split "\n", <<"END"
10872 #000A; LF
10873 #000C; FF
10874 #000D; CR
10875 #0085; NEL
10876 #200C; ZWNJ
10877 #200D; ZWJ
10878 #FEFF; BOM
10879 #FEFF; BYTE ORDER MARK
10880 #END
10881 #    ) {
10882 #        #local $to_trace = 1 if main::DEBUG;
10883 #        trace $entry if main::DEBUG && $to_trace;
10884 #        my ($code_point, $name) = split /\s*;\s*/, $entry;
10885 #        $code_point = hex $code_point;
10886 #        trace $code_point, $name if main::DEBUG && $to_trace;
10887 #        $perl_charname->add_duplicate($code_point, $name);
10888 #    }
10889 #    #$perl_charname->add_comment("This file is for charnames.pm.  It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'.  Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence");
10890     $perl_charname->add_comment(join_lines( <<END
10891 This file is for charnames.pm.  It is the union of the $comment properties.
10892 Unicode_1_Name entries are used only for otherwise nameless code
10893 points.
10894 $alias_sentence
10895 END
10896     ));
10897
10898     # The combining class property used by Perl's normalize.pm is not located
10899     # in the normal mapping directory; create a copy for it.
10900     my $ccc = property_ref('Canonical_Combining_Class');
10901     my $perl_ccc = Property->new('Perl_ccc',
10902                             Default_Map => $ccc->default_map,
10903                             Full_Name => 'Perl_Canonical_Combining_Class',
10904                             Internal_Only_Warning => 1,
10905                             Perl_Extension => 1,
10906                             Pod_Entry =>0,
10907                             Type => $ENUM,
10908                             Initialize => $ccc,
10909                             File => 'CombiningClass',
10910                             Directory => File::Spec->curdir(),
10911                             );
10912     $perl_ccc->set_to_output_map(1);
10913     $perl_ccc->add_comment(join_lines(<<END
10914 This mapping is for normalize.pm.  It is currently identical to the Unicode
10915 Canonical_Combining_Class property.
10916 END
10917     ));
10918
10919     # This one match table for it is needed for calculations on output
10920     my $default = $perl_ccc->add_match_table($ccc->default_map,
10921                         Initialize => $ccc->table($ccc->default_map),
10922                         Status => $SUPPRESSED);
10923
10924     # Construct the Present_In property from the Age property.
10925     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
10926         my $default_map = $age->default_map;
10927         my $in = Property->new('In',
10928                                 Default_Map => $default_map,
10929                                 Full_Name => "Present_In",
10930                                 Internal_Only_Warning => 1,
10931                                 Perl_Extension => 1,
10932                                 Type => $ENUM,
10933                                 Initialize => $age,
10934                                 );
10935         $in->add_comment(join_lines(<<END
10936 This file should not be used for any purpose.  The values in this file are the
10937 same as for $age, and not for what $in really means.  This is because anything
10938 defined in a given release should have multiple values: that release and all
10939 higher ones.  But only one value per code point can be represented in a table
10940 like this.
10941 END
10942         ));
10943
10944         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
10945         # lowest numbered (earliest) come first, with the non-numeric one
10946         # last.
10947         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
10948                                             ? 1
10949                                             : ($b->name !~ /^[\d.]*$/)
10950                                                 ? -1
10951                                                 : $a->name <=> $b->name
10952                                             } $age->tables;
10953
10954         # The Present_In property is the cumulative age properties.  The first
10955         # one hence is identical to the first age one.
10956         my $previous_in = $in->add_match_table($first_age->name);
10957         $previous_in->set_equivalent_to($first_age, Related => 1);
10958
10959         my $description_start = "Code point's usage introduced in version ";
10960         $first_age->add_description($description_start . $first_age->name);
10961
10962         # To construct the accumlated values, for each of the age tables
10963         # starting with the 2nd earliest, merge the earliest with it, to get
10964         # all those code points existing in the 2nd earliest.  Repeat merging
10965         # the new 2nd earliest with the 3rd earliest to get all those existing
10966         # in the 3rd earliest, and so on.
10967         foreach my $current_age (@rest_ages) {
10968             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
10969
10970             my $current_in = $in->add_match_table(
10971                                     $current_age->name,
10972                                     Initialize => $current_age + $previous_in,
10973                                     Description => $description_start
10974                                                     . $current_age->name
10975                                                     . ' or earlier',
10976                                     );
10977             $previous_in = $current_in;
10978
10979             # Add clarifying material for the corresponding age file.  This is
10980             # in part because of the confusing and contradictory information
10981             # given in the Standard's documentation itself, as of 5.2.
10982             $current_age->add_description(
10983                             "Code point's usage was introduced in version "
10984                             . $current_age->name);
10985             $current_age->add_note("See also $in");
10986
10987         }
10988
10989         # And finally the code points whose usages have yet to be decided are
10990         # the same in both properties.  Note that permanently unassigned code
10991         # points actually have their usage assigned (as being permanently
10992         # unassigned), so that these tables are not the same as gc=cn.
10993         my $unassigned = $in->add_match_table($default_map);
10994         my $age_default = $age->table($default_map);
10995         $age_default->add_description(<<END
10996 Code point's usage has not been assigned in any Unicode release thus far.
10997 END
10998         );
10999         $unassigned->set_equivalent_to($age_default, Related => 1);
11000     }
11001
11002
11003     # Finished creating all the perl properties.  All non-internal non-string
11004     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
11005     # an underscore.)  These do not get a separate entry in the pod file
11006     foreach my $table ($perl->tables) {
11007         foreach my $alias ($table->aliases) {
11008             next if $alias->name =~ /^_/;
11009             $table->add_alias('Is_' . $alias->name,
11010                                Pod_Entry => 0,
11011                                Status => $alias->status,
11012                                Externally_Ok => 0);
11013         }
11014     }
11015
11016     return;
11017 }
11018
11019 sub add_perl_synonyms() {
11020     # A number of Unicode tables have Perl synonyms that are expressed in
11021     # the single-form, \p{name}.  These are:
11022     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11023     #       \p{Is_Name} as synonyms
11024     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11025     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11026     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11027     #       conflict, \p{Value} and \p{Is_Value} as well
11028     #
11029     # This routine generates these synonyms, warning of any unexpected
11030     # conflicts.
11031
11032     # Construct the list of tables to get synonyms for.  Start with all the
11033     # binary and the General_Category ones.
11034     my @tables = grep { $_->type == $BINARY } property_ref('*');
11035     push @tables, $gc->tables;
11036
11037     # If the version of Unicode includes the Script property, add its tables
11038     if (defined property_ref('Script')) {
11039         push @tables, property_ref('Script')->tables;
11040     }
11041
11042     # The Block tables are kept separate because they are treated differently.
11043     # And the earliest versions of Unicode didn't include them, so add only if
11044     # there are some.
11045     my @blocks;
11046     push @blocks, $block->tables if defined $block;
11047
11048     # Here, have the lists of tables constructed.  Process blocks last so that
11049     # if there are name collisions with them, blocks have lowest priority.
11050     # Should there ever be other collisions, manual intervention would be
11051     # required.  See the comments at the beginning of the program for a
11052     # possible way to handle those semi-automatically.
11053     foreach my $table (@tables,  @blocks) {
11054
11055         # For non-binary properties, the synonym is just the name of the
11056         # table, like Greek, but for binary properties the synonym is the name
11057         # of the property, and means the code points in its 'Y' table.
11058         my $nominal = $table;
11059         my $nominal_property = $nominal->property;
11060         my $actual;
11061         if (! $nominal->isa('Property')) {
11062             $actual = $table;
11063         }
11064         else {
11065
11066             # Here is a binary property.  Use the 'Y' table.  Verify that is
11067             # there
11068             my $yes = $nominal->table('Y');
11069             unless (defined $yes) {  # Must be defined, but is permissible to
11070                                      # be empty.
11071                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
11072                 next;
11073             }
11074             $actual = $yes;
11075         }
11076
11077         foreach my $alias ($nominal->aliases) {
11078
11079             # Attempt to create a table in the perl directory for the
11080             # candidate table, using whatever aliases in it that don't
11081             # conflict.  Also add non-conflicting aliases for all these
11082             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11083             PREFIX:
11084             foreach my $prefix ("", 'Is_', 'In_') {
11085
11086                 # Only Block properties can have added 'In_' aliases.
11087                 next if $prefix eq 'In_' and $nominal_property != $block;
11088
11089                 my $proposed_name = $prefix . $alias->name;
11090
11091                 # No Is_Is, In_In, nor combinations thereof
11092                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11093                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11094
11095                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11096
11097                 # Get a reference to any existing table in the perl
11098                 # directory with the desired name.
11099                 my $pre_existing = $perl->table($proposed_name);
11100
11101                 if (! defined $pre_existing) {
11102
11103                     # No name collision, so ok to add the perl synonym.
11104
11105                     my $make_pod_entry;
11106                     my $externally_ok;
11107                     my $status = $actual->status;
11108                     if ($nominal_property == $block) {
11109
11110                         # For block properties, the 'In' form is preferred for
11111                         # external use; the pod file contains wild cards for
11112                         # this and the 'Is' form so no entries for those; and
11113                         # we don't want people using the name without the
11114                         # 'In', so discourage that.
11115                         if ($prefix eq "") {
11116                             $make_pod_entry = 1;
11117                             $status = $status || $DISCOURAGED;
11118                             $externally_ok = 0;
11119                         }
11120                         elsif ($prefix eq 'In_') {
11121                             $make_pod_entry = 0;
11122                             $status = $status || $NORMAL;
11123                             $externally_ok = 1;
11124                         }
11125                         else {
11126                             $make_pod_entry = 0;
11127                             $status = $status || $DISCOURAGED;
11128                             $externally_ok = 0;
11129                         }
11130                     }
11131                     elsif ($prefix ne "") {
11132
11133                         # The 'Is' prefix is handled in the pod by a wild
11134                         # card, and we won't use it for an external name
11135                         $make_pod_entry = 0;
11136                         $status = $status || $NORMAL;
11137                         $externally_ok = 0;
11138                     }
11139                     else {
11140
11141                         # Here, is an empty prefix, non block.  This gets its
11142                         # own pod entry and can be used for an external name.
11143                         $make_pod_entry = 1;
11144                         $status = $status || $NORMAL;
11145                         $externally_ok = 1;
11146                     }
11147
11148                     # Here, there isn't a perl pre-existing table with the
11149                     # name.  Look through the list of equivalents of this
11150                     # table to see if one is a perl table.
11151                     foreach my $equivalent ($actual->leader->equivalents) {
11152                         next if $equivalent->property != $perl;
11153
11154                         # Here, have found a table for $perl.  Add this alias
11155                         # to it, and are done with this prefix.
11156                         $equivalent->add_alias($proposed_name,
11157                                         Pod_Entry => $make_pod_entry,
11158                                         Status => $status,
11159                                         Externally_Ok => $externally_ok);
11160                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11161                         next PREFIX;
11162                     }
11163
11164                     # Here, $perl doesn't already have a table that is a
11165                     # synonym for this property, add one.
11166                     my $added_table = $perl->add_match_table($proposed_name,
11167                                             Pod_Entry => $make_pod_entry,
11168                                             Status => $status,
11169                                             Externally_Ok => $externally_ok);
11170                     # And it will be related to the actual table, since it is
11171                     # based on it.
11172                     $added_table->set_equivalent_to($actual, Related => 1);
11173                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11174                     next;
11175                 } # End of no pre-existing.
11176
11177                 # Here, there is a pre-existing table that has the proposed
11178                 # name.  We could be in trouble, but not if this is just a
11179                 # synonym for another table that we have already made a child
11180                 # of the pre-existing one.
11181                 if ($pre_existing->is_equivalent_to($actual)) {
11182                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11183                     $pre_existing->add_alias($proposed_name);
11184                     next;
11185                 }
11186
11187                 # Here, there is a name collision, but it still could be ok if
11188                 # the tables match the identical set of code points, in which
11189                 # case, we can combine the names.  Compare each table's code
11190                 # point list to see if they are identical.
11191                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11192                 if ($pre_existing->matches_identically_to($actual)) {
11193
11194                     # Here, they do match identically.  Not a real conflict.
11195                     # Make the perl version a child of the Unicode one, except
11196                     # in the non-obvious case of where the perl name is
11197                     # already a synonym of another Unicode property.  (This is
11198                     # excluded by the test for it being its own parent.)  The
11199                     # reason for this exclusion is that then the two Unicode
11200                     # properties become related; and we don't really know if
11201                     # they are or not.  We generate documentation based on
11202                     # relatedness, and this would be misleading.  Code
11203                     # later executed in the process will cause the tables to
11204                     # be represented by a single file anyway, without making
11205                     # it look in the pod like they are necessarily related.
11206                     if ($pre_existing->parent == $pre_existing
11207                         && ($pre_existing->property == $perl
11208                             || $actual->property == $perl))
11209                     {
11210                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
11211                         $pre_existing->set_equivalent_to($actual, Related => 1);
11212                     }
11213                     elsif (main::DEBUG && $to_trace) {
11214                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
11215                         trace $pre_existing->parent;
11216                     }
11217                     next PREFIX;
11218                 }
11219
11220                 # Here they didn't match identically, there is a real conflict
11221                 # between our new name and a pre-existing property.
11222                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
11223                 $pre_existing->add_conflicting($nominal->full_name,
11224                                                'p',
11225                                                $actual);
11226
11227                 # Don't output a warning for aliases for the block
11228                 # properties (unless they start with 'In_') as it is
11229                 # expected that there will be conflicts and the block
11230                 # form loses.
11231                 if ($verbosity >= $NORMAL_VERBOSITY
11232                     && ($actual->property != $block || $prefix eq 'In_'))
11233                 {
11234                     print simple_fold(join_lines(<<END
11235 There is already an alias named $proposed_name (from " . $pre_existing . "),
11236 so not creating this alias for " . $actual
11237 END
11238                     ), "", 4);
11239                 }
11240
11241                 # Keep track for documentation purposes.
11242                 $has_In_conflicts++ if $prefix eq 'In_';
11243                 $has_Is_conflicts++ if $prefix eq 'Is_';
11244             }
11245         }
11246     }
11247
11248     # There are some properties which have No and Yes (and N and Y) as
11249     # property values, but aren't binary, and could possibly be confused with
11250     # binary ones.  So create caveats for them.  There are tables that are
11251     # named 'No', and tables that are named 'N', but confusion is not likely
11252     # unless they are the same table.  For example, N meaning Number or
11253     # Neutral is not likely to cause confusion, so don't add caveats to things
11254     # like them.
11255     foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
11256         my $yes = $property->table('Yes');
11257         if (defined $yes) {
11258             my $y = $property->table('Y');
11259             if (defined $y && $yes == $y) {
11260                 foreach my $alias ($property->aliases) {
11261                     $yes->add_conflicting($alias->name);
11262                 }
11263             }
11264         }
11265         my $no = $property->table('No');
11266         if (defined $no) {
11267             my $n = $property->table('N');
11268             if (defined $n && $no == $n) {
11269                 foreach my $alias ($property->aliases) {
11270                     $no->add_conflicting($alias->name, 'P');
11271                 }
11272             }
11273         }
11274     }
11275
11276     return;
11277 }
11278
11279 sub register_file_for_name($$$) {
11280     # Given info about a table and a datafile that it should be associated
11281     # with, register that assocation
11282
11283     my $table = shift;
11284     my $directory_ref = shift;   # Array of the directory path for the file
11285     my $file = shift;            # The file name in the final directory, [-1].
11286     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11287
11288     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
11289
11290     if ($table->isa('Property')) {
11291         $table->set_file_path(@$directory_ref, $file);
11292         push @map_properties, $table
11293                                     if $directory_ref->[0] eq $map_directory;
11294         return;
11295     }
11296
11297     # Do all of the work for all equivalent tables when called with the leader
11298     # table, so skip if isn't the leader.
11299     return if $table->leader != $table;
11300
11301     # Join all the file path components together, using slashes.
11302     my $full_filename = join('/', @$directory_ref, $file);
11303
11304     # All go in the same subdirectory of unicore
11305     if ($directory_ref->[0] ne $matches_directory) {
11306         Carp::my_carp("Unexpected directory in "
11307                 .  join('/', @{$directory_ref}, $file));
11308     }
11309
11310     # For this table and all its equivalents ...
11311     foreach my $table ($table, $table->equivalents) {
11312
11313         # Associate it with its file internally.  Don't include the
11314         # $matches_directory first component
11315         $table->set_file_path(@$directory_ref, $file);
11316         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
11317
11318         my $property = $table->property;
11319         $property = ($property == $perl)
11320                     ? ""                # 'perl' is never explicitly stated
11321                     : standardize($property->name) . '=';
11322
11323         my $deprecated = ($table->status eq $DEPRECATED)
11324                          ? $table->status_info
11325                          : "";
11326
11327         # And for each of the table's aliases...  This inner loop eventually
11328         # goes through all aliases in the UCD that we generate regex match
11329         # files for
11330         foreach my $alias ($table->aliases) {
11331             my $name = $alias->name;
11332
11333             # Generate an entry in either the loose or strict hashes, which
11334             # will translate the property and alias names combination into the
11335             # file where the table for them is stored.
11336             my $standard;
11337             if ($alias->loose_match) {
11338                 $standard = $property . standardize($alias->name);
11339                 if (exists $loose_to_file_of{$standard}) {
11340                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
11341                 }
11342                 else {
11343                     $loose_to_file_of{$standard} = $sub_filename;
11344                 }
11345             }
11346             else {
11347                 $standard = lc ($property . $name);
11348                 if (exists $stricter_to_file_of{$standard}) {
11349                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
11350                 }
11351                 else {
11352                     $stricter_to_file_of{$standard} = $sub_filename;
11353
11354                     # Tightly coupled with how utf8_heavy.pl works, for a
11355                     # floating point number that is a whole number, get rid of
11356                     # the trailing decimal point and 0's, so that utf8_heavy
11357                     # will work.  Also note that this assumes that such a
11358                     # number is matched strictly; so if that were to change,
11359                     # this would be wrong.
11360                     if ((my $integer_name = $name)
11361                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
11362                     {
11363                         $stricter_to_file_of{$property . $integer_name}
11364                             = $sub_filename;
11365                     }
11366                 }
11367             }
11368
11369             # Keep a list of the deprecated properties and their filenames
11370             if ($deprecated) {
11371                 $utf8::why_deprecated{$sub_filename} = $deprecated;
11372             }
11373         }
11374     }
11375
11376     return;
11377 }
11378
11379 {   # Closure
11380     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
11381                      # conflicts
11382     my %full_dir_name_of;   # Full length names of directories used.
11383
11384     sub construct_filename($$$) {
11385         # Return a file name for a table, based on the table name, but perhaps
11386         # changed to get rid of non-portable characters in it, and to make
11387         # sure that it is unique on a file system that allows the names before
11388         # any period to be at most 8 characters (DOS).  While we're at it
11389         # check and complain if there are any directory conflicts.
11390
11391         my $name = shift;       # The name to start with
11392         my $mutable = shift;    # Boolean: can it be changed?  If no, but
11393                                 # yet it must be to work properly, a warning
11394                                 # is given
11395         my $directories_ref = shift;  # A reference to an array containing the
11396                                 # path to the file, with each element one path
11397                                 # component.  This is used because the same
11398                                 # name can be used in different directories.
11399         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11400
11401         my $warn = ! defined wantarray;  # If true, then if the name is
11402                                 # changed, a warning is issued as well.
11403
11404         if (! defined $name) {
11405             Carp::my_carp("Undefined name in directory "
11406                           . File::Spec->join(@$directories_ref)
11407                           . ". '_' used");
11408             return '_';
11409         }
11410
11411         # Make sure that no directory names conflict with each other.  Look at
11412         # each directory in the input file's path.  If it is already in use,
11413         # assume it is correct, and is merely being re-used, but if we
11414         # truncate it to 8 characters, and find that there are two directories
11415         # that are the same for the first 8 characters, but differ after that,
11416         # then that is a problem.
11417         foreach my $directory (@$directories_ref) {
11418             my $short_dir = substr($directory, 0, 8);
11419             if (defined $full_dir_name_of{$short_dir}) {
11420                 next if $full_dir_name_of{$short_dir} eq $directory;
11421                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
11422             }
11423             else {
11424                 $full_dir_name_of{$short_dir} = $directory;
11425             }
11426         }
11427
11428         my $path = join '/', @$directories_ref;
11429         $path .= '/' if $path;
11430
11431         # Remove interior underscores.
11432         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
11433
11434         # Change any non-word character into an underscore, and truncate to 8.
11435         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
11436         substr($filename, 8) = "" if length($filename) > 8;
11437
11438         # Make sure the basename doesn't conflict with something we
11439         # might have already written. If we have, say,
11440         #     InGreekExtended1
11441         #     InGreekExtended2
11442         # they become
11443         #     InGreekE
11444         #     InGreek2
11445         my $warned = 0;
11446         while (my $num = $base_names{$path}{lc $filename}++) {
11447             $num++; # so basenames with numbers start with '2', which
11448                     # just looks more natural.
11449
11450             # Want to append $num, but if it'll make the basename longer
11451             # than 8 characters, pre-truncate $filename so that the result
11452             # is acceptable.
11453             my $delta = length($filename) + length($num) - 8;
11454             if ($delta > 0) {
11455                 substr($filename, -$delta) = $num;
11456             }
11457             else {
11458                 $filename .= $num;
11459             }
11460             if ($warn && ! $warned) {
11461                 $warned = 1;
11462                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
11463             }
11464         }
11465
11466         return $filename if $mutable;
11467
11468         # If not changeable, must return the input name, but warn if needed to
11469         # change it beyond shortening it.
11470         if ($name ne $filename
11471             && substr($name, 0, length($filename)) ne $filename) {
11472             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
11473         }
11474         return $name;
11475     }
11476 }
11477
11478 # The pod file contains a very large table.  Many of the lines in that table
11479 # would exceed a typical output window's size, and so need to be wrapped with
11480 # a hanging indent to make them look good.  The pod language is really
11481 # insufficient here.  There is no general construct to do that in pod, so it
11482 # is done here by beginning each such line with a space to cause the result to
11483 # be output without formatting, and doing all the formatting here.  This leads
11484 # to the result that if the eventual display window is too narrow it won't
11485 # look good, and if the window is too wide, no advantage is taken of that
11486 # extra width.  A further complication is that the output may be indented by
11487 # the formatter so that there is less space than expected.  What I (khw) have
11488 # done is to assume that that indent is a particular number of spaces based on
11489 # what it is in my Linux system;  people can always resize their windows if
11490 # necessary, but this is obviously less than desirable, but the best that can
11491 # be expected.
11492 my $automatic_pod_indent = 8;
11493
11494 # Try to format so that uses fewest lines, but few long left column entries
11495 # slide into the right column.  An experiment on 5.1 data yielded the
11496 # following percentages that didn't cut into the other side along with the
11497 # associated first-column widths
11498 # 69% = 24
11499 # 80% not too bad except for a few blocks
11500 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
11501 # 95% = 37;
11502 my $indent_info_column = 27;    # 75% of lines didn't have overlap
11503
11504 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
11505                     # The 3 is because of:
11506                     #   1   for the leading space to tell the pod formatter to
11507                     #       output as-is
11508                     #   1   for the flag
11509                     #   1   for the space between the flag and the main data
11510
11511 sub format_pod_line ($$$;$$) {
11512     # Take a pod line and return it, formatted properly
11513
11514     my $first_column_width = shift;
11515     my $entry = shift;  # Contents of left column
11516     my $info = shift;   # Contents of right column
11517
11518     my $status = shift || "";   # Any flag
11519
11520     my $loose_match = shift;    # Boolean.
11521     $loose_match = 1 unless defined $loose_match;
11522
11523     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11524
11525     my $flags = "";
11526     $flags .= $STRICTER if ! $loose_match;
11527
11528     $flags .= $status if $status;
11529
11530     # There is a blank in the left column to cause the pod formatter to
11531     # output the line as-is.
11532     return sprintf " %-*s%-*s %s\n",
11533                     # The first * in the format is replaced by this, the -1 is
11534                     # to account for the leading blank.  There isn't a
11535                     # hard-coded blank after this to separate the flags from
11536                     # the rest of the line, so that in the unlikely event that
11537                     # multiple flags are shown on the same line, they both
11538                     # will get displayed at the expense of that separation,
11539                     # but since they are left justified, a blank will be
11540                     # inserted in the normal case.
11541                     $FILLER - 1,
11542                     $flags,
11543
11544                     # The other * in the format is replaced by this number to
11545                     # cause the first main column to right fill with blanks.
11546                     # The -1 is for the guaranteed blank following it.
11547                     $first_column_width - $FILLER - 1,
11548                     $entry,
11549                     $info;
11550 }
11551
11552 my @zero_match_tables;  # List of tables that have no matches in this release
11553
11554 sub make_table_pod_entries($) {
11555     # This generates the entries for the pod file for a given table.
11556     # Also done at this time are any children tables.  The output looks like:
11557     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
11558
11559     my $input_table = shift;        # Table the entry is for
11560     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11561
11562     # Generate parent and all its children at the same time.
11563     return if $input_table->parent != $input_table;
11564
11565     my $property = $input_table->property;
11566     my $type = $property->type;
11567     my $full_name = $property->full_name;
11568
11569     my $count = $input_table->count;
11570     my $string_count = clarify_number($count);
11571     my $status = $input_table->status;
11572     my $status_info = $input_table->status_info;
11573
11574     my $entry_for_first_table; # The entry for the first table output.
11575                            # Almost certainly, it is the parent.
11576
11577     # For each related table (including itself), we will generate a pod entry
11578     # for each name each table goes by
11579     foreach my $table ($input_table, $input_table->children) {
11580
11581         # utf8_heavy.pl cannot deal with null string property values, so don't
11582         # output any.
11583         next if $table->name eq "";
11584
11585         # First, gather all the info that applies to this table as a whole.
11586
11587         push @zero_match_tables, $table if $count == 0;
11588
11589         my $table_property = $table->property;
11590
11591         # The short name has all the underscores removed, while the full name
11592         # retains them.  Later, we decide whether to output a short synonym
11593         # for the full one, we need to compare apples to apples, so we use the
11594         # short name's length including underscores.
11595         my $table_property_short_name_length;
11596         my $table_property_short_name
11597             = $table_property->short_name(\$table_property_short_name_length);
11598         my $table_property_full_name = $table_property->full_name;
11599
11600         # Get how much savings there is in the short name over the full one
11601         # (delta will always be <= 0)
11602         my $table_property_short_delta = $table_property_short_name_length
11603                                          - length($table_property_full_name);
11604         my @table_description = $table->description;
11605         my @table_note = $table->note;
11606
11607         # Generate an entry for each alias in this table.
11608         my $entry_for_first_alias;  # saves the first one encountered.
11609         foreach my $alias ($table->aliases) {
11610
11611             # Skip if not to go in pod.
11612             next unless $alias->make_pod_entry;
11613
11614             # Start gathering all the components for the entry
11615             my $name = $alias->name;
11616
11617             my $entry;      # Holds the left column, may include extras
11618             my $entry_ref;  # To refer to the left column's contents from
11619                             # another entry; has no extras
11620
11621             # First the left column of the pod entry.  Tables for the $perl
11622             # property always use the single form.
11623             if ($table_property == $perl) {
11624                 $entry = "\\p{$name}";
11625                 $entry_ref = "\\p{$name}";
11626             }
11627             else {    # Compound form.
11628
11629                 # Only generate one entry for all the aliases that mean true
11630                 # or false in binary properties.  Append a '*' to indicate
11631                 # some are missing.  (The heading comment notes this.)
11632                 my $wild_card_mark;
11633                 if ($type == $BINARY) {
11634                     next if $name ne 'N' && $name ne 'Y';
11635                     $wild_card_mark = '*';
11636                 }
11637                 else {
11638                     $wild_card_mark = "";
11639                 }
11640
11641                 # Colon-space is used to give a little more space to be easier
11642                 # to read;
11643                 $entry = "\\p{"
11644                         . $table_property_full_name
11645                         . ": $name$wild_card_mark}";
11646
11647                 # But for the reference to this entry, which will go in the
11648                 # right column, where space is at a premium, use equals
11649                 # without a space
11650                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
11651             }
11652
11653             # Then the right (info) column.  This is stored as components of
11654             # an array for the moment, then joined into a string later.  For
11655             # non-internal only properties, begin the info with the entry for
11656             # the first table we encountered (if any), as things are ordered
11657             # so that that one is the most descriptive.  This leads to the
11658             # info column of an entry being a more descriptive version of the
11659             # name column
11660             my @info;
11661             if ($name =~ /^_/) {
11662                 push @info,
11663                         '(For internal use by Perl, not necessarily stable)';
11664             }
11665             elsif ($entry_for_first_alias) {
11666                 push @info, $entry_for_first_alias;
11667             }
11668
11669             # If this entry is equivalent to another, add that to the info,
11670             # using the first such table we encountered
11671             if ($entry_for_first_table) {
11672                 if (@info) {
11673                     push @info, "(= $entry_for_first_table)";
11674                 }
11675                 else {
11676                     push @info, $entry_for_first_table;
11677                 }
11678             }
11679
11680             # If the name is a large integer, add an equivalent with an
11681             # exponent for better readability
11682             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
11683                 push @info, sprintf "(= %.1e)", $name
11684             }
11685
11686             my $parenthesized = "";
11687             if (! $entry_for_first_alias) {
11688
11689                 # This is the first alias for the current table.  The alias
11690                 # array is ordered so that this is the fullest, most
11691                 # descriptive alias, so it gets the fullest info.  The other
11692                 # aliases are mostly merely pointers to this one, using the
11693                 # information already added above.
11694
11695                 # Display any status message, but only on the parent table
11696                 if ($status && ! $entry_for_first_table) {
11697                     push @info, $status_info;
11698                 }
11699
11700                 # Put out any descriptive info
11701                 if (@table_description || @table_note) {
11702                     push @info, join "; ", @table_description, @table_note;
11703                 }
11704
11705                 # Look to see if there is a shorter name we can point people
11706                 # at
11707                 my $standard_name = standardize($name);
11708                 my $short_name;
11709                 my $proposed_short = $table->short_name;
11710                 if (defined $proposed_short) {
11711                     my $standard_short = standardize($proposed_short);
11712
11713                     # If the short name is shorter than the standard one, or
11714                     # even it it's not, but the combination of it and its
11715                     # short property name (as in \p{prop=short} ($perl doesn't
11716                     # have this form)) saves at least two characters, then,
11717                     # cause it to be listed as a shorter synonym.
11718                     if (length $standard_short < length $standard_name
11719                         || ($table_property != $perl
11720                             && (length($standard_short)
11721                                 - length($standard_name)
11722                                 + $table_property_short_delta)  # (<= 0)
11723                                 < -2))
11724                     {
11725                         $short_name = $proposed_short;
11726                         if ($table_property != $perl) {
11727                             $short_name = $table_property_short_name
11728                                           . "=$short_name";
11729                         }
11730                         $short_name = "\\p{$short_name}";
11731                     }
11732                 }
11733
11734                 # And if this is a compound form name, see if there is a
11735                 # single form equivalent
11736                 my $single_form;
11737                 if ($table_property != $perl) {
11738
11739                     # Special case the binary N tables, so that will print
11740                     # \P{single}, but use the Y table values to populate
11741                     # 'single', as we haven't populated the N table.
11742                     my $test_table;
11743                     my $p;
11744                     if ($type == $BINARY
11745                         && $input_table == $property->table('No'))
11746                     {
11747                         $test_table = $property->table('Yes');
11748                         $p = 'P';
11749                     }
11750                     else {
11751                         $test_table = $input_table;
11752                         $p = 'p';
11753                     }
11754
11755                     # Look for a single form amongst all the children.
11756                     foreach my $table ($test_table->children) {
11757                         next if $table->property != $perl;
11758                         my $proposed_name = $table->short_name;
11759                         next if ! defined $proposed_name;
11760
11761                         # Don't mention internal-only properties as a possible
11762                         # single form synonym
11763                         next if substr($proposed_name, 0, 1) eq '_';
11764
11765                         $proposed_name = "\\$p\{$proposed_name}";
11766                         if (! defined $single_form
11767                             || length($proposed_name) < length $single_form)
11768                         {
11769                             $single_form = $proposed_name;
11770
11771                             # The goal here is to find a single form; not the
11772                             # shortest possible one.  We've already found a
11773                             # short name.  So, stop at the first single form
11774                             # found, which is likely to be closer to the
11775                             # original.
11776                             last;
11777                         }
11778                     }
11779                 }
11780
11781                 # Ouput both short and single in the same parenthesized
11782                 # expression, but with only one of 'Single', 'Short' if there
11783                 # are both items.
11784                 if ($short_name || $single_form || $table->conflicting) {
11785                     $parenthesized .= '(';
11786                     $parenthesized .= "Short: $short_name" if $short_name;
11787                     if ($short_name && $single_form) {
11788                         $parenthesized .= ', ';
11789                     }
11790                     elsif ($single_form) {
11791                         $parenthesized .= 'Single: ';
11792                     }
11793                     $parenthesized .= $single_form if $single_form;
11794                 }
11795             }
11796
11797
11798             # Warn if this property isn't the same as one that a
11799             # semi-casual user might expect.  The other components of this
11800             # parenthesized structure are calculated only for the first entry
11801             # for this table, but the conflicting is deemed important enough
11802             # to go on every entry.
11803             my $conflicting = join " NOR ", $table->conflicting;
11804             if ($conflicting) {
11805                 $parenthesized .= '(' if ! $parenthesized;
11806                 $parenthesized .=  '; ' if $parenthesized ne '(';
11807                 $parenthesized .= "NOT $conflicting";
11808             }
11809             $parenthesized .= ')' if $parenthesized;
11810
11811             push @info, $parenthesized if $parenthesized;
11812             push @info, "($string_count)" if $output_range_counts;
11813
11814             # Now, we have both the entry and info so add them to the
11815             # list of all the properties.
11816             push @match_properties,
11817                 format_pod_line($indent_info_column,
11818                                 $entry,
11819                                 join( " ", @info),
11820                                 $alias->status,
11821                                 $alias->loose_match);
11822
11823             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
11824         } # End of looping through the aliases for this table.
11825
11826         if (! $entry_for_first_table) {
11827             $entry_for_first_table = $entry_for_first_alias;
11828         }
11829     } # End of looping through all the related tables
11830     return;
11831 }
11832
11833 sub pod_alphanumeric_sort {
11834     # Sort pod entries alphanumerically.
11835
11836     # The first few character columns are filler, plus the '\p{'; and get rid
11837     # of all the trailing stuff, starting with the trailing '}', so as to sort
11838     # on just 'Name=Value'
11839     (my $a = lc $a) =~ s/^ .*? { //x;
11840     $a =~ s/}.*//;
11841     (my $b = lc $b) =~ s/^ .*? { //x;
11842     $b =~ s/}.*//;
11843
11844     # Determine if the two operands are both internal only or both not.
11845     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
11846     # should be the underscore that begins internal only
11847     my $a_is_internal = (substr($a, 0, 1) eq '_');
11848     my $b_is_internal = (substr($b, 0, 1) eq '_');
11849
11850     # Sort so the internals come last in the table instead of first (which the
11851     # leading underscore would otherwise indicate).
11852     if ($a_is_internal != $b_is_internal) {
11853         return 1 if $a_is_internal;
11854         return -1
11855     }
11856
11857     # Determine if the two operands are numeric property values or not.
11858     # A numeric property will look like xyz: 3.  But the number
11859     # can begin with an optional minus sign, and may have a
11860     # fraction or rational component, like xyz: 3/2.  If either
11861     # isn't numeric, use alphabetic sort.
11862     my ($a_initial, $a_number) =
11863         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
11864     return $a cmp $b unless defined $a_number;
11865     my ($b_initial, $b_number) =
11866         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
11867     return $a cmp $b unless defined $b_number;
11868
11869     # Here they are both numeric, but use alphabetic sort if the
11870     # initial parts don't match
11871     return $a cmp $b if $a_initial ne $b_initial;
11872
11873     # Convert rationals to floating for the comparison.
11874     $a_number = eval $a_number if $a_number =~ qr{/};
11875     $b_number = eval $b_number if $b_number =~ qr{/};
11876
11877     return $a_number <=> $b_number;
11878 }
11879
11880 sub make_pod () {
11881     # Create the .pod file.  This generates the various subsections and then
11882     # combines them in one big HERE document.
11883
11884     return unless defined $pod_directory;
11885     print "Making pod file\n" if $verbosity >= $PROGRESS;
11886
11887     my $exception_message =
11888     '(Any exceptions are individually noted beginning with the word NOT.)';
11889     my @block_warning;
11890     if (-e 'Blocks.txt') {
11891
11892         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
11893         # if the global $has_In_conflicts indicates we have them.
11894         push @match_properties, format_pod_line($indent_info_column,
11895                                                 '\p{In_*}',
11896                                                 '\p{Block: *}'
11897                                                     . (($has_In_conflicts)
11898                                                       ? " $exception_message"
11899                                                       : ""));
11900         @block_warning = << "END";
11901
11902 Matches in the Block property have shortcuts that begin with 'In_'.  For
11903 example, \\p{Block=Latin1} can be written as \\p{In_Latin1}.  For backward
11904 compatibility, if there is no conflict with another shortcut, these may also
11905 be written as \\p{Latin1} or \\p{Is_Latin1}.  But, N.B., there are numerous
11906 such conflicting shortcuts.  Use of these forms for Block is discouraged, and
11907 are flagged as such, not only because of the potential confusion as to what is
11908 meant, but also because a later release of Unicode may preempt the shortcut,
11909 and your program would no longer be correct.  Use the 'In_' form instead to
11910 avoid this, or even more clearly, use the compound form, e.g.,
11911 \\p{blk:latin1}.  See L<perlunicode/"Blocks"> for more information about this.
11912 END
11913     }
11914     my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
11915     $text = "$exception_message $text" if $has_Is_conflicts;
11916
11917     # And the 'Is_ line';
11918     push @match_properties, format_pod_line($indent_info_column,
11919                                             '\p{Is_*}',
11920                                             "\\p{*} $text");
11921
11922     # Sort the properties array for output.  It is sorted alphabetically
11923     # except numerically for numeric properties, and only output unique lines.
11924     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
11925
11926     my $formatted_properties = simple_fold(\@match_properties,
11927                                         "",
11928                                         # indent succeeding lines by two extra
11929                                         # which looks better
11930                                         $indent_info_column + 2,
11931
11932                                         # shorten the line length by how much
11933                                         # the formatter indents, so the folded
11934                                         # line will fit in the space
11935                                         # presumably available
11936                                         $automatic_pod_indent);
11937     # Add column headings, indented to be a little more centered, but not
11938     # exactly
11939     $formatted_properties =  format_pod_line($indent_info_column,
11940                                                     '    NAME',
11941                                                     '           INFO')
11942                                     . "\n"
11943                                     . $formatted_properties;
11944
11945     # Generate pod documentation lines for the tables that match nothing
11946     my $zero_matches;
11947     if (@zero_match_tables) {
11948         @zero_match_tables = uniques(@zero_match_tables);
11949         $zero_matches = join "\n\n",
11950                         map { $_ = '=item \p{' . $_->complete_name . "}" }
11951                             sort { $a->complete_name cmp $b->complete_name }
11952                             uniques(@zero_match_tables);
11953
11954         $zero_matches = <<END;
11955
11956 =head2 Legal \\p{} and \\P{} constructs that match no characters
11957
11958 Unicode has some property-value pairs that currently don't match anything.
11959 This happens generally either because they are obsolete, or for symmetry with
11960 other forms, but no language has yet been encoded that uses them.  In this
11961 version of Unicode, the following match zero code points:
11962
11963 =over 4
11964
11965 $zero_matches
11966
11967 =back
11968
11969 END
11970     }
11971
11972     # Generate list of properties that we don't accept, grouped by the reasons
11973     # why.  This is so only put out the 'why' once, and then list all the
11974     # properties that have that reason under it.
11975
11976     my %why_list;   # The keys are the reasons; the values are lists of
11977                     # properties that have the key as their reason
11978
11979     # For each property, add it to the list that are suppressed for its reason
11980     # The sort will cause the alphabetically first properties to be added to
11981     # each list first, so each list will be sorted.
11982     foreach my $property (sort keys %why_suppressed) {
11983         push @{$why_list{$why_suppressed{$property}}}, $property;
11984     }
11985
11986     # For each reason (sorted by the first property that has that reason)...
11987     my @bad_re_properties;
11988     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
11989                      keys %why_list)
11990     {
11991         # Add to the output, all the properties that have that reason.  Start
11992         # with an empty line.
11993         push @bad_re_properties, "\n\n";
11994
11995         my $has_item = 0;   # Flag if actually output anything.
11996         foreach my $name (@{$why_list{$why}}) {
11997
11998             # Split compound names into $property and $table components
11999             my $property = $name;
12000             my $table;
12001             if ($property =~ / (.*) = (.*) /x) {
12002                 $property = $1;
12003                 $table = $2;
12004             }
12005
12006             # This release of Unicode may not have a property that is
12007             # suppressed, so don't reference a non-existent one.
12008             $property = property_ref($property);
12009             next if ! defined $property;
12010
12011             # And since this list is only for match tables, don't list the
12012             # ones that don't have match tables.
12013             next if ! $property->to_create_match_tables;
12014
12015             # Find any abbreviation, and turn it into a compound name if this
12016             # is a property=value pair.
12017             my $short_name = $property->name;
12018             $short_name .= '=' . $property->table($table)->name if $table;
12019
12020             # And add the property as an item for the reason.
12021             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12022             $has_item = 1;
12023         }
12024
12025         # And add the reason under the list of properties, if such a list
12026         # actually got generated.  Note that the header got added
12027         # unconditionally before.  But pod ignores extra blank lines, so no
12028         # harm.
12029         push @bad_re_properties, "\n$why\n" if $has_item;
12030
12031     } # End of looping through each reason.
12032
12033     # Generate a list of the properties whose map table we output, from the
12034     # global @map_properties.
12035     my @map_tables_actually_output;
12036     my $info_indent = 20;       # Left column is narrower than \p{} table.
12037     foreach my $property (@map_properties) {
12038
12039         # Get the path to the file; don't output any not in the standard
12040         # directory.
12041         my @path = $property->file_path;
12042         next if $path[0] ne $map_directory;
12043         shift @path;    # Remove the standard name
12044
12045         my $file = join '/', @path; # In case is in sub directory
12046         my $info = $property->full_name;
12047         my $short_name = $property->name;
12048         if ($info ne $short_name) {
12049             $info .= " ($short_name)";
12050         }
12051         foreach my $more_info ($property->description,
12052                                $property->note,
12053                                $property->status_info)
12054         {
12055             next unless $more_info;
12056             $info =~ s/\.\Z//;
12057             $info .= ".  $more_info";
12058         }
12059         push @map_tables_actually_output, format_pod_line($info_indent,
12060                                                           $file,
12061                                                           $info,
12062                                                           $property->status);
12063     }
12064
12065     # Sort alphabetically, and fold for output
12066     @map_tables_actually_output = sort
12067                             pod_alphanumeric_sort @map_tables_actually_output;
12068     @map_tables_actually_output
12069                         = simple_fold(\@map_tables_actually_output,
12070                                         ' ',
12071                                         $info_indent,
12072                                         $automatic_pod_indent);
12073
12074     # Generate a list of the formats that can appear in the map tables.
12075     my @map_table_formats;
12076     foreach my $format (sort keys %map_table_formats) {
12077         push @map_table_formats, " $format    $map_table_formats{$format}\n";
12078     }
12079
12080     # Everything is ready to assemble.
12081     my @OUT = << "END";
12082 =begin comment
12083
12084 $HEADER
12085
12086 To change this file, edit $0 instead.
12087
12088 =end comment
12089
12090 =head1 NAME
12091
12092 $pod_file - Complete index of Unicode Version $string_version properties
12093
12094 =head1 DESCRIPTION
12095
12096 There are many properties in Unicode, and Perl provides access to almost all of
12097 them, as well as some additional extensions and short-cut synonyms.
12098
12099 And just about all of the few that aren't accessible through the Perl
12100 core are accessible through the modules: Unicode::Normalize and
12101 Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12102
12103 This document merely lists all available properties and does not attempt to
12104 explain what each property really means.  There is a brief description of each
12105 Perl extension.  There is some detail about Blocks, Scripts, General_Category,
12106 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12107 Unicode properties, refer to the Unicode standard.  A good starting place is
12108 L<$unicode_reference_url>.  More information on the Perl extensions is in
12109 L<perlrecharclass>.
12110
12111 Note that you can define your own properties; see
12112 L<perlunicode/"User-Defined Character Properties">.
12113
12114 =head1 Properties accessible through \\p{} and \\P{}
12115
12116 The Perl regular expression \\p{} and \\P{} constructs give access to most of
12117 the Unicode character properties.  The table below shows all these constructs,
12118 both single and compound forms.
12119
12120 B<Compound forms> consist of two components, separated by an equals sign or a
12121 colon.  The first component is the property name, and the second component is
12122 the particular value of the property to match against, for example,
12123 '\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
12124 whose Script property is Greek.
12125
12126 B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12127 their equivalent compound forms.  The table shows these equivalences.  (In our
12128 example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.)
12129 There are also a few Perl-defined single forms that are not shortcuts for a
12130 compound form.  One such is \\p{Word}.  These are also listed in the table.
12131
12132 In parsing these constructs, Perl always ignores Upper/lower case differences
12133 everywhere within the {braces}.  Thus '\\p{Greek}' means the same thing as
12134 '\\p{greek}'.  But note that changing the case of the 'p' or 'P' before the
12135 left brace completely changes the meaning of the construct, from "match" (for
12136 '\\p{}') to "doesn't match" (for '\\P{}').  Casing in this document is for
12137 improved legibility.
12138
12139 Also, white space, hyphens, and underscores are also normally ignored
12140 everywhere between the {braces}, and hence can be freely added or removed
12141 even if the C</x> modifier hasn't been specified on the regular expression.
12142 But $a_bold_stricter at the beginning of an entry in the table below
12143 means that tighter (stricter) rules are used for that entry:
12144
12145 =over 4
12146
12147 =item Single form (\\p{name}) tighter rules:
12148
12149 White space, hyphens, and underscores ARE significant
12150 except for:
12151
12152 =over 4
12153
12154 =item * white space adjacent to a non-word character
12155
12156 =item * underscores separating digits in numbers
12157
12158 =back
12159
12160 That means, for example, that you can freely add or remove white space
12161 adjacent to (but within) the braces without affecting the meaning.
12162
12163 =item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12164
12165 The tighter rules given above for the single form apply to everything to the
12166 right of the colon or equals; the looser rules still apply to everything to
12167 the left.
12168
12169 That means, for example, that you can freely add or remove white space
12170 adjacent to (but within) the braces and the colon or equal sign.
12171
12172 =back
12173
12174 Some properties are considered obsolete, but still available.  There are
12175 several varieties of obsolesence:
12176
12177 =over 4
12178
12179 =item Obsolete
12180
12181 Properties marked with $a_bold_obsolete in the table are considered
12182 obsolete.  At the time of this writing (Unicode version 5.2) there is no
12183 information in the Unicode standard about the implications of a property being
12184 obsolete.
12185
12186 =item Stabilized
12187
12188 Obsolete properties may be stabilized.  This means that they are not actively
12189 maintained by Unicode, and will not be extended as new characters are added to
12190 the standard.  Such properties are marked with $a_bold_stabilized in the
12191 table.  At the time of this writing (Unicode version 5.2) there is no further
12192 information in the Unicode standard about the implications of a property being
12193 stabilized.
12194
12195 =item Deprecated
12196
12197 Obsolete properties may be deprecated.  This means that their use is strongly
12198 discouraged, so much so that a warning will be issued if used, unless the
12199 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
12200 statement.  $A_bold_deprecated flags each such entry in the table, and
12201 the entry there for the longest, most descriptive version of the property will
12202 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
12203 warning, even for properties that aren't officially deprecated by Unicode,
12204 when there used to be characters or code points that were matched by them, but
12205 no longer.  This is to warn you that your program may not work like it did on
12206 earlier Unicode releases.
12207
12208 A deprecated property may be made unavailable in a future Perl version, so it
12209 is best to move away from them.
12210
12211 =back
12212
12213 Some Perl extensions are present for backwards compatibility and are
12214 discouraged from being used, but not obsolete.  $A_bold_discouraged
12215 flags each such entry in the table.
12216
12217 @block_warning
12218
12219 The table below has two columns.  The left column contains the \\p{}
12220 constructs to look up, possibly preceeded by the flags mentioned above; and
12221 the right column contains information about them, like a description, or
12222 synonyms.  It shows both the single and compound forms for each property that
12223 has them.  If the left column is a short name for a property, the right column
12224 will give its longer, more descriptive name; and if the left column is the
12225 longest name, the right column will show any equivalent shortest name, in both
12226 single and compound forms if applicable.
12227
12228 The right column will also caution you if a property means something different
12229 than what might normally be expected.
12230
12231 Numbers in (parentheses) indicate the total number of code points matched by
12232 the property.  For emphasis, those properties that match no code points at all
12233 are listed as well in a separate section following the table.
12234
12235 There is no description given for most non-Perl defined properties (See
12236 $unicode_reference_url for that).
12237
12238 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
12239 combinations.  For example, entries like:
12240
12241  \\p{Gc: *}                                  \\p{General_Category: *}
12242
12243 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
12244 for the latter is also valid for the former.  Similarly,
12245
12246  \\p{Is_*}                                   \\p{*}
12247
12248 means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
12249 \\p{IsFoo} are also valid and all mean the same thing.  And similarly,
12250 \\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}.  '*' here
12251 is restricted to something not beginning with an underscore.
12252
12253 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
12254 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
12255 'N*' to indicate this, and doesn't have separate entries for the other
12256 possibilities.  Note that not all properties which have values 'Yes' and 'No'
12257 are binary, and they have all their values spelled out without using this wild
12258 card, and a C<NOT> clause in their description that highlights their not being
12259 binary.  These also require the compound form to match them, whereas true
12260 binary properties have both single and compound forms available.
12261
12262 Note that all non-essential underscores are removed in the display of the
12263 short names below.
12264
12265 B<Summary legend:>
12266
12267 =over 4
12268
12269 =item B<*> is a wild-card
12270
12271 =item B<(\\d+)> in the info column gives the number of code points matched by
12272 this property.
12273
12274 =item B<$DEPRECATED> means this is deprecated.
12275
12276 =item B<$OBSOLETE> means this is obsolete.
12277
12278 =item B<$STABILIZED> means this is stabilized.
12279
12280 =item B<$STRICTER> means tighter (stricter) name matching applies.
12281
12282 =item B<$DISCOURAGED> means use of this form is discouraged.
12283
12284 =back
12285
12286 $formatted_properties
12287
12288 $zero_matches
12289
12290 =head1 Properties not accessible through \\p{} and \\P{}
12291
12292 A few properties are accessible in Perl via various function calls only.
12293 These are:
12294  Lowercase_Mapping          lc() and lcfirst()
12295  Titlecase_Mapping          ucfirst()
12296  Uppercase_Mapping          uc()
12297
12298 Case_Folding is accessible through the /i modifier in regular expressions.
12299
12300 The Name property is accessible through the \\N{} interpolation in
12301 double-quoted strings and regular expressions, but both usages require a C<use
12302 charnames;> to be specified, which also contains related functions viacode()
12303 and vianame().
12304
12305 =head1 Unicode regular expression properties that are NOT accepted by Perl
12306
12307 Perl will generate an error for a few character properties in Unicode when
12308 used in a regular expression.  The non-Unihan ones are listed below, with the
12309 reasons they are not accepted, perhaps with work-arounds.  The short names for
12310 the properties are listed enclosed in (parentheses).
12311
12312 =over 4
12313
12314 @bad_re_properties
12315
12316 =back
12317
12318 An installation can choose to allow any of these to be matched by changing the
12319 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12320 and then re-running F<$0>.  (C<\%Config> is available from the Config module).
12321
12322 =head1 Files in the I<To> directory (for serious hackers only)
12323
12324 All Unicode properties are really mappings (in the mathematical sense) from
12325 code points to their respective values.  As part of its build process,
12326 Perl constructs tables containing these mappings for all properties that it
12327 deals with.  But only a few of these are written out into files.
12328 Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
12329 (%Config is available from the Config module).
12330
12331 Those ones written are ones needed by Perl internally during execution, or for
12332 which there is some demand, and those for which there is no access through the
12333 Perl core.  Generally, properties that can be used in regular expression
12334 matching do not have their map tables written, like Script.  Nor are the
12335 simplistic properties that have a better, more complete version, such as
12336 Simple_Uppercase_Mapping  (Uppercase_Mapping is written instead).
12337
12338 None of the properties in the I<To> directory are currently directly
12339 accessible through the Perl core, although some may be accessed indirectly.
12340 For example, the uc() function implements the Uppercase_Mapping property and
12341 uses the F<Upper.pl> file found in this directory.
12342
12343 The available files with their properties (short names in parentheses),
12344 and any flags or comments about them, are:
12345
12346 @map_tables_actually_output
12347
12348 An installation can choose to change which files are generated by changing the
12349 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12350 and then re-running F<$0>.
12351
12352 Each of these files defines two hash entries to help reading programs decipher
12353 it.  One of them looks like this:
12354
12355     \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
12356
12357 where 'NAME' is a name to indicate the property.  For backwards compatibility,
12358 this is not necessarily the property's official Unicode name.  (The 'To' is
12359 also for backwards compatibility.)  The hash entry gives the format of the
12360 mapping fields of the table, currently one of the following:
12361
12362  @map_table_formats
12363
12364 This format applies only to the entries in the main body of the table.
12365 Entries defined in hashes or ones that are missing from the list can have a
12366 different format.
12367
12368 The value that the missing entries have is given by the other SwashInfo hash
12369 entry line; it looks like this:
12370
12371     \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
12372
12373 This example line says that any Unicode code points not explicitly listed in
12374 the file have the value 'NaN' under the property indicated by NAME.  If the
12375 value is the special string C<< <code point> >>, it means that the value for
12376 any missing code point is the code point itself.  This happens, for example,
12377 in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
12378 character 'A', are missing because the uppercase of 'A' is itself.
12379
12380 =head1 SEE ALSO
12381
12382 L<$unicode_reference_url>
12383
12384 L<perlrecharclass>
12385
12386 L<perlunicode>
12387
12388 END
12389
12390     # And write it.
12391     main::write([ $pod_directory, "$pod_file.pod" ], @OUT);
12392     return;
12393 }
12394
12395 sub make_Heavy () {
12396     # Create and write Heavy.pl, which passes info about the tables to
12397     # utf8_heavy.pl
12398
12399     my @heavy = <<END;
12400 $HEADER
12401 $INTERNAL_ONLY
12402
12403 # This file is for the use of utf8_heavy.pl
12404
12405 # Maps property names in loose standard form to its standard name
12406 \%utf8::loose_property_name_of = (
12407 END
12408
12409     push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
12410     push @heavy, <<END;
12411 );
12412
12413 # Maps property, table to file for those using stricter matching
12414 \%utf8::stricter_to_file_of = (
12415 END
12416     push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
12417     push @heavy, <<END;
12418 );
12419
12420 # Maps property, table to file for those using loose matching
12421 \%utf8::loose_to_file_of = (
12422 END
12423     push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
12424     push @heavy, <<END;
12425 );
12426
12427 # Maps floating point to fractional form
12428 \%utf8::nv_floating_to_rational = (
12429 END
12430     push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
12431     push @heavy, <<END;
12432 );
12433
12434 # If a floating point number doesn't have enough digits in it to get this
12435 # close to a fraction, it isn't considered to be that fraction even if all the
12436 # digits it does have match.
12437 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12438
12439 # Deprecated tables to generate a warning for.  The key is the file containing
12440 # the table, so as to avoid duplication, as many property names can map to the
12441 # file, but we only need one entry for all of them.
12442 \%utf8::why_deprecated = (
12443 END
12444
12445     push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
12446     push @heavy, <<END;
12447 );
12448
12449 1;
12450 END
12451
12452     main::write("Heavy.pl", @heavy);
12453     return;
12454 }
12455
12456 sub write_all_tables() {
12457     # Write out all the tables generated by this program to files, as well as
12458     # the supporting data structures, pod file, and .t file.
12459
12460     my @writables;              # List of tables that actually get written
12461     my %match_tables_to_write;  # Used to collapse identical match tables
12462                                 # into one file.  Each key is a hash function
12463                                 # result to partition tables into buckets.
12464                                 # Each value is an array of the tables that
12465                                 # fit in the bucket.
12466
12467     # For each property ...
12468     # (sort so that if there is an immutable file name, it has precedence, so
12469     # some other property can't come in and take over its file name.  If b's
12470     # file name is defined, will return 1, meaning to take it first; don't
12471     # care if both defined, as they had better be different anyway)
12472     PROPERTY:
12473     foreach my $property (sort { defined $b->file } property_ref('*')) {
12474         my $type = $property->type;
12475
12476         # And for each table for that property, starting with the mapping
12477         # table for it ...
12478         TABLE:
12479         foreach my $table($property,
12480
12481                         # and all the match tables for it (if any), sorted so
12482                         # the ones with the shortest associated file name come
12483                         # first.  The length sorting prevents problems of a
12484                         # longer file taking a name that might have to be used
12485                         # by a shorter one.  The alphabetic sorting prevents
12486                         # differences between releases
12487                         sort {  my $ext_a = $a->external_name;
12488                                 return 1 if ! defined $ext_a;
12489                                 my $ext_b = $b->external_name;
12490                                 return -1 if ! defined $ext_b;
12491                                 my $cmp = length $ext_a <=> length $ext_b;
12492
12493                                 # Return result if lengths not equal
12494                                 return $cmp if $cmp;
12495
12496                                 # Alphabetic if lengths equal
12497                                 return $ext_a cmp $ext_b
12498                         } $property->tables
12499                     )
12500         {
12501
12502             # Here we have a table associated with a property.  It could be
12503             # the map table (done first for each property), or one of the
12504             # other tables.  Determine which type.
12505             my $is_property = $table->isa('Property');
12506
12507             my $name = $table->name;
12508             my $complete_name = $table->complete_name;
12509
12510             # See if should suppress the table if is empty, but warn if it
12511             # contains something.
12512             my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
12513                                     keys %why_suppress_if_empty_warn_if_not;
12514
12515             # Calculate if this table should have any code points associated
12516             # with it or not.
12517             my $expected_empty =
12518
12519                 # $perl should be empty, as well as properties that we just
12520                 # don't do anything with
12521                 ($is_property
12522                     && ($table == $perl
12523                         || grep { $complete_name eq $_ }
12524                                                     @unimplemented_properties
12525                     )
12526                 )
12527
12528                 # Match tables in properties we skipped populating should be
12529                 # empty
12530                 || (! $is_property && ! $property->to_create_match_tables)
12531
12532                 # Tables and properties that are expected to have no code
12533                 # points should be empty
12534                 || $suppress_if_empty_warn_if_not
12535             ;
12536
12537             # Set a boolean if this table is the complement of an empty binary
12538             # table
12539             my $is_complement_of_empty_binary =
12540                 $type == $BINARY &&
12541                 (($table == $property->table('Y')
12542                     && $property->table('N')->is_empty)
12543                 || ($table == $property->table('N')
12544                     && $property->table('Y')->is_empty));
12545
12546
12547             # Some tables should match everything
12548             my $expected_full =
12549                 ($is_property)
12550                 ? # All these types of map tables will be full because
12551                   # they will have been populated with defaults
12552                   ($type == $ENUM || $type == $BINARY)
12553
12554                 : # A match table should match everything if its method
12555                   # shows it should
12556                   ($table->matches_all
12557
12558                   # The complement of an empty binary table will match
12559                   # everything
12560                   || $is_complement_of_empty_binary
12561                   )
12562             ;
12563
12564             if ($table->is_empty) {
12565
12566
12567                 if ($suppress_if_empty_warn_if_not) {
12568                     $table->set_status($SUPPRESSED,
12569                         $why_suppress_if_empty_warn_if_not{$complete_name});
12570                 }
12571
12572                 # Suppress expected empty tables.
12573                 next TABLE if $expected_empty;
12574
12575                 # And setup to later output a warning for those that aren't
12576                 # known to be allowed to be empty.  Don't do the warning if
12577                 # this table is a child of another one to avoid duplicating
12578                 # the warning that should come from the parent one.
12579                 if (($table == $property || $table->parent == $table)
12580                     && $table->status ne $SUPPRESSED
12581                     && ! grep { $complete_name =~ /^$_$/ }
12582                                                     @tables_that_may_be_empty)
12583                 {
12584                     push @unhandled_properties, "$table";
12585                 }
12586             }
12587             elsif ($expected_empty) {
12588                 my $because = "";
12589                 if ($suppress_if_empty_warn_if_not) {
12590                     $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
12591                 }
12592
12593                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
12594             }
12595
12596             my $count = $table->count;
12597             if ($expected_full) {
12598                 if ($count != $MAX_UNICODE_CODEPOINTS) {
12599                     Carp::my_carp("$table matches only "
12600                     . clarify_number($count)
12601                     . " Unicode code points but should match "
12602                     . clarify_number($MAX_UNICODE_CODEPOINTS)
12603                     . " (off by "
12604                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
12605                     . ").  Proceeding anyway.");
12606                 }
12607
12608                 # Here is expected to be full.  If it is because it is the
12609                 # complement of an (empty) binary table that is to be
12610                 # suppressed, then suppress this one as well.
12611                 if ($is_complement_of_empty_binary) {
12612                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
12613                     my $opposing = $property->table($opposing_name);
12614                     my $opposing_status = $opposing->status;
12615                     if ($opposing_status) {
12616                         $table->set_status($opposing_status,
12617                                            $opposing->status_info);
12618                     }
12619                 }
12620             }
12621             elsif ($count == $MAX_UNICODE_CODEPOINTS) {
12622                 if ($table == $property || $table->leader == $table) {
12623                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
12624                 }
12625             }
12626
12627             if ($table->status eq $SUPPRESSED) {
12628                 if (! $is_property) {
12629                     my @children = $table->children;
12630                     foreach my $child (@children) {
12631                         if ($child->status ne $SUPPRESSED) {
12632                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
12633                         }
12634                     }
12635                 }
12636                 next TABLE;
12637
12638             }
12639             if (! $is_property) {
12640
12641                 # Several things need to be done just once for each related
12642                 # group of match tables.  Do them on the parent.
12643                 if ($table->parent == $table) {
12644
12645                     # Add an entry in the pod file for the table; it also does
12646                     # the children.
12647                     make_table_pod_entries($table) if defined $pod_directory;
12648
12649                     # See if the the table matches identical code points with
12650                     # something that has already been output.  In that case,
12651                     # no need to have two files with the same code points in
12652                     # them.  We use the table's hash() method to store these
12653                     # in buckets, so that it is quite likely that if two
12654                     # tables are in the same bucket they will be identical, so
12655                     # don't have to compare tables frequently.  The tables
12656                     # have to have the same status to share a file, so add
12657                     # this to the bucket hash.  (The reason for this latter is
12658                     # that Heavy.pl associates a status with a file.)
12659                     my $hash = $table->hash . ';' . $table->status;
12660
12661                     # Look at each table that is in the same bucket as this
12662                     # one would be.
12663                     foreach my $comparison (@{$match_tables_to_write{$hash}})
12664                     {
12665                         if ($table->matches_identically_to($comparison)) {
12666                             $table->set_equivalent_to($comparison,
12667                                                                 Related => 0);
12668                             next TABLE;
12669                         }
12670                     }
12671
12672                     # Here, not equivalent, add this table to the bucket.
12673                     push @{$match_tables_to_write{$hash}}, $table;
12674                 }
12675             }
12676             else {
12677
12678                 # Here is the property itself.
12679                 # Don't write out or make references to the $perl property
12680                 next if $table == $perl;
12681
12682                 if ($type != $STRING) {
12683
12684                     # There is a mapping stored of the various synonyms to the
12685                     # standardized name of the property for utf8_heavy.pl.
12686                     # Also, the pod file contains entries of the form:
12687                     # \p{alias: *}         \p{full: *}
12688                     # rather than show every possible combination of things.
12689
12690                     my @property_aliases = $property->aliases;
12691
12692                     # The full name of this property is stored by convention
12693                     # first in the alias array
12694                     my $full_property_name =
12695                                 '\p{' . $property_aliases[0]->name . ': *}';
12696                     my $standard_property_name = standardize($table->name);
12697
12698                     # For each synonym ...
12699                     for my $i (0 .. @property_aliases - 1)  {
12700                         my $alias = $property_aliases[$i];
12701                         my $alias_name = $alias->name;
12702                         my $alias_standard = standardize($alias_name);
12703
12704                         # Set the mapping for utf8_heavy of the alias to the
12705                         # property
12706                         if (exists ($loose_property_name_of{$alias_standard}))
12707                         {
12708                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
12709                         }
12710                         else {
12711                             $loose_property_name_of{$alias_standard}
12712                                                 = $standard_property_name;
12713                         }
12714
12715                         # Now for the pod entry for this alias.  Skip if not
12716                         # outputting a pod; skip the first one, which is the
12717                         # full name so won't have an entry like: '\p{full: *}
12718                         # \p{full: *}', and skip if don't want an entry for
12719                         # this one.
12720                         next if $i == 0
12721                                 || ! defined $pod_directory
12722                                 || ! $alias->make_pod_entry;
12723
12724                         push @match_properties,
12725                             format_pod_line($indent_info_column,
12726                                         '\p{' . $alias->name . ': *}',
12727                                         $full_property_name,
12728                                         $alias->status);
12729                     }
12730                 } # End of non-string-like property code
12731
12732
12733                 # Don't output a mapping file if not desired.
12734                 next if ! $property->to_output_map;
12735             }
12736
12737             # Here, we know we want to write out the table, but don't do it
12738             # yet because there may be other tables that come along and will
12739             # want to share the file, and the file's comments will change to
12740             # mention them.  So save for later.
12741             push @writables, $table;
12742
12743         } # End of looping through the property and all its tables.
12744     } # End of looping through all properties.
12745
12746     # Now have all the tables that will have files written for them.  Do it.
12747     foreach my $table (@writables) {
12748         my @directory;
12749         my $filename;
12750         my $property = $table->property;
12751         my $is_property = ($table == $property);
12752         if (! $is_property) {
12753
12754             # Match tables for the property go in lib/$subdirectory, which is
12755             # the property's name.  Don't use the standard file name for this,
12756             # as may get an unfamiliar alias
12757             @directory = ($matches_directory, $property->external_name);
12758         }
12759         else {
12760
12761             @directory = $table->directory;
12762             $filename = $table->file;
12763         }
12764
12765         # Use specified filename if avaliable, or default to property's
12766         # shortest name.  We need an 8.3 safe filename (which means "an 8
12767         # safe" filename, since after the dot is only 'pl', which is < 3)
12768         # The 2nd parameter is if the filename shouldn't be changed, and
12769         # it shouldn't iff there is a hard-coded name for this table.
12770         $filename = construct_filename(
12771                                 $filename || $table->external_name,
12772                                 ! $filename,    # mutable if no filename
12773                                 \@directory);
12774
12775         register_file_for_name($table, \@directory, $filename);
12776
12777         # Only need to write one file when shared by more than one
12778         # property
12779         next if ! $is_property && $table->leader != $table;
12780
12781         # Construct a nice comment to add to the file
12782         $table->set_final_comment;
12783
12784         $table->write;
12785     }
12786
12787
12788     # Write out the pod file
12789     make_pod;
12790
12791     # And Heavy.pl
12792     make_Heavy;
12793
12794     make_property_test_script() if $make_test_script;
12795     return;
12796 }
12797
12798 my @white_space_separators = ( # This used only for making the test script.
12799                             "",
12800                             ' ',
12801                             "\t",
12802                             '   '
12803                         );
12804
12805 sub generate_separator($) {
12806     # This used only for making the test script.  It generates the colon or
12807     # equal separator between the property and property value, with random
12808     # white space surrounding the separator
12809
12810     my $lhs = shift;
12811
12812     return "" if $lhs eq "";  # No separator if there's only one (the r) side
12813
12814     # Choose space before and after randomly
12815     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
12816     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
12817
12818     # And return the whole complex, half the time using a colon, half the
12819     # equals
12820     return $spaces_before
12821             . (rand() < 0.5) ? '=' : ':'
12822             . $spaces_after;
12823 }
12824
12825 sub generate_tests($$$$$$) {
12826     # This used only for making the test script.  It generates test cases that
12827     # are expected to compile successfully in perl.  Note that the lhs and
12828     # rhs are assumed to already be as randomized as the caller wants.
12829
12830     my $file_handle = shift;   # Where to output the tests
12831     my $lhs = shift;           # The property: what's to the left of the colon
12832                                #  or equals separator
12833     my $rhs = shift;           # The property value; what's to the right
12834     my $valid_code = shift;    # A code point that's known to be in the
12835                                # table given by lhs=rhs; undef if table is
12836                                # empty
12837     my $invalid_code = shift;  # A code point known to not be in the table;
12838                                # undef if the table is all code points
12839     my $warning = shift;
12840
12841     # Get the colon or equal
12842     my $separator = generate_separator($lhs);
12843
12844     # The whole 'property=value'
12845     my $name = "$lhs$separator$rhs";
12846
12847     # Create a complete set of tests, with complements.
12848     if (defined $valid_code) {
12849         printf $file_handle
12850                     qq/Expect(1, $valid_code, '\\p{$name}', $warning);\n/;
12851         printf $file_handle
12852                     qq/Expect(0, $valid_code, '\\p{^$name}', $warning);\n/;
12853         printf $file_handle
12854                     qq/Expect(0, $valid_code, '\\P{$name}', $warning);\n/;
12855         printf $file_handle
12856                     qq/Expect(1, $valid_code, '\\P{^$name}', $warning);\n/;
12857     }
12858     if (defined $invalid_code) {
12859         printf $file_handle
12860                     qq/Expect(0, $invalid_code, '\\p{$name}', $warning);\n/;
12861         printf $file_handle
12862                     qq/Expect(1, $invalid_code, '\\p{^$name}', $warning);\n/;
12863         printf $file_handle
12864                     qq/Expect(1, $invalid_code, '\\P{$name}', $warning);\n/;
12865         printf $file_handle
12866                     qq/Expect(0, $invalid_code, '\\P{^$name}', $warning);\n/;
12867     }
12868     return;
12869 }
12870
12871 sub generate_error($$$$) {
12872     # This used only for making the test script.  It generates test cases that
12873     # are expected to not only not match, but to be syntax or similar errors
12874
12875     my $file_handle = shift;        # Where to output to.
12876     my $lhs = shift;                # The property: what's to the left of the
12877                                     # colon or equals separator
12878     my $rhs = shift;                # The property value; what's to the right
12879     my $already_in_error = shift;   # Boolean; if true it's known that the
12880                                 # unmodified lhs and rhs will cause an error.
12881                                 # This routine should not force another one
12882     # Get the colon or equal
12883     my $separator = generate_separator($lhs);
12884
12885     # Since this is an error only, don't bother to randomly decide whether to
12886     # put the error on the left or right side; and assume that the rhs is
12887     # loosely matched, again for convenience rather than rigor.
12888     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
12889
12890     my $property = $lhs . $separator . $rhs;
12891
12892     print $file_handle qq/Error('\\p{$property}');\n/;
12893     print $file_handle qq/Error('\\P{$property}');\n/;
12894     return;
12895 }
12896
12897 # These are used only for making the test script
12898 # XXX Maybe should also have a bad strict seps, which includes underscore.
12899
12900 my @good_loose_seps = (
12901             " ",
12902             "-",
12903             "\t",
12904             "",
12905             "_",
12906            );
12907 my @bad_loose_seps = (
12908            "/a/",
12909            ':=',
12910           );
12911
12912 sub randomize_stricter_name {
12913     # This used only for making the test script.  Take the input name and
12914     # return a randomized, but valid version of it under the stricter matching
12915     # rules.
12916
12917     my $name = shift;
12918     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12919
12920     # If the name looks like a number (integer, floating, or rational), do
12921     # some extra work
12922     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
12923         my $sign = $1;
12924         my $number = $2;
12925         my $separator = $3;
12926
12927         # If there isn't a sign, part of the time add a plus
12928         # Note: Not testing having any denominator having a minus sign
12929         if (! $sign) {
12930             $sign = '+' if rand() <= .3;
12931         }
12932
12933         # And add 0 or more leading zeros.
12934         $name = $sign . ('0' x int rand(10)) . $number;
12935
12936         if (defined $separator) {
12937             my $extra_zeros = '0' x int rand(10);
12938
12939             if ($separator eq '.') {
12940
12941                 # Similarly, add 0 or more trailing zeros after a decimal
12942                 # point
12943                 $name .= $extra_zeros;
12944             }
12945             else {
12946
12947                 # Or, leading zeros before the denominator
12948                 $name =~ s,/,/$extra_zeros,;
12949             }
12950         }
12951     }
12952
12953     # For legibility of the test, only change the case of whole sections at a
12954     # time.  To do this, first split into sections.  The split returns the
12955     # delimiters
12956     my @sections;
12957     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
12958         trace $section if main::DEBUG && $to_trace;
12959
12960         if (length $section > 1 && $section !~ /\D/) {
12961
12962             # If the section is a sequence of digits, about half the time
12963             # randomly add underscores between some of them.
12964             if (rand() > .5) {
12965
12966                 # Figure out how many underscores to add.  max is 1 less than
12967                 # the number of digits.  (But add 1 at the end to make sure
12968                 # result isn't 0, and compensate earlier by subtracting 2
12969                 # instead of 1)
12970                 my $num_underscores = int rand(length($section) - 2) + 1;
12971
12972                 # And add them evenly throughout, for convenience, not rigor
12973                 use integer;
12974                 my $spacing = (length($section) - 1)/ $num_underscores;
12975                 my $temp = $section;
12976                 $section = "";
12977                 for my $i (1 .. $num_underscores) {
12978                     $section .= substr($temp, 0, $spacing, "") . '_';
12979                 }
12980                 $section .= $temp;
12981             }
12982             push @sections, $section;
12983         }
12984         else {
12985
12986             # Here not a sequence of digits.  Change the case of the section
12987             # randomly
12988             my $switch = int rand(4);
12989             if ($switch == 0) {
12990                 push @sections, uc $section;
12991             }
12992             elsif ($switch == 1) {
12993                 push @sections, lc $section;
12994             }
12995             elsif ($switch == 2) {
12996                 push @sections, ucfirst $section;
12997             }
12998             else {
12999                 push @sections, $section;
13000             }
13001         }
13002     }
13003     trace "returning", join "", @sections if main::DEBUG && $to_trace;
13004     return join "", @sections;
13005 }
13006
13007 sub randomize_loose_name($;$) {
13008     # This used only for making the test script
13009
13010     my $name = shift;
13011     my $want_error = shift;  # if true, make an error
13012     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13013
13014     $name = randomize_stricter_name($name);
13015
13016     my @parts;
13017     push @parts, $good_loose_seps[rand(@good_loose_seps)];
13018     for my $part (split /[-\s_]+/, $name) {
13019         if (@parts) {
13020             if ($want_error and rand() < 0.3) {
13021                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
13022                 $want_error = 0;
13023             }
13024             else {
13025                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13026             }
13027         }
13028         push @parts, $part;
13029     }
13030     my $new = join("", @parts);
13031     trace "$name => $new" if main::DEBUG && $to_trace;
13032
13033     if ($want_error) {
13034         if (rand() >= 0.5) {
13035             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
13036         }
13037         else {
13038             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
13039         }
13040     }
13041     return $new;
13042 }
13043
13044 # Used to make sure don't generate duplicate test cases.
13045 my %test_generated;
13046
13047 sub make_property_test_script() {
13048     # This used only for making the test script
13049     # this written directly -- it's huge.
13050
13051     print "Making test script\n" if $verbosity >= $PROGRESS;
13052
13053     # This uses randomness to test different possibilities without testing all
13054     # possibilities.  To ensure repeatability, set the seed to 0.  But if
13055     # tests are added, it will perturb all later ones in the .t file
13056     srand 0;
13057
13058     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
13059
13060     force_unlink ($t_path);
13061     push @files_actually_output, $t_path;
13062     my $OUT;
13063     if (not open $OUT, "> $t_path") {
13064         Carp::my_carp("Can't open $t_path.  Skipping: $!");
13065         return;
13066     }
13067
13068     # Keep going down an order of magnitude
13069     # until find that adding this quantity to
13070     # 1 remains 1; but put an upper limit on
13071     # this so in case this algorithm doesn't
13072     # work properly on some platform, that we
13073     # won't loop forever.
13074     my $digits = 0;
13075     my $min_floating_slop = 1;
13076     while (1+ $min_floating_slop != 1
13077             && $digits++ < 50)
13078     {
13079         my $next = $min_floating_slop / 10;
13080         last if $next == 0; # If underflows,
13081                             # use previous one
13082         $min_floating_slop = $next;
13083     }
13084     print $OUT $HEADER, <DATA>;
13085
13086     foreach my $property (property_ref('*')) {
13087         foreach my $table ($property->tables) {
13088
13089             # Find code points that match, and don't match this table.
13090             my $valid = $table->get_valid_code_point;
13091             my $invalid = $table->get_invalid_code_point;
13092             my $warning = ($table->status eq $DEPRECATED)
13093                             ? "'deprecated'"
13094                             : '""';
13095
13096             # Test each possible combination of the property's aliases with
13097             # the table's.  If this gets to be too many, could do what is done
13098             # in the set_final_comment() for Tables
13099             my @table_aliases = $table->aliases;
13100             my @property_aliases = $table->property->aliases;
13101             my $max = max(scalar @table_aliases, scalar @property_aliases);
13102             for my $j (0 .. $max - 1) {
13103
13104                 # The current alias for property is the next one on the list,
13105                 # or if beyond the end, start over.  Similarly for table
13106                 my $property_name
13107                             = $property_aliases[$j % @property_aliases]->name;
13108
13109                 $property_name = "" if $table->property == $perl;
13110                 my $table_alias = $table_aliases[$j % @table_aliases];
13111                 my $table_name = $table_alias->name;
13112                 my $loose_match = $table_alias->loose_match;
13113
13114                 # If the table doesn't have a file, any test for it is
13115                 # already guaranteed to be in error
13116                 my $already_error = ! $table->file_path;
13117
13118                 # Generate error cases for this alias.
13119                 generate_error($OUT,
13120                                 $property_name,
13121                                 $table_name,
13122                                 $already_error);
13123
13124                 # If the table is guaranteed to always generate an error,
13125                 # quit now without generating success cases.
13126                 next if $already_error;
13127
13128                 # Now for the success cases.
13129                 my $random;
13130                 if ($loose_match) {
13131
13132                     # For loose matching, create an extra test case for the
13133                     # standard name.
13134                     my $standard = standardize($table_name);
13135
13136                     # $test_name should be a unique combination for each test
13137                     # case; used just to avoid duplicate tests
13138                     my $test_name = "$property_name=$standard";
13139
13140                     # Don't output duplicate test cases.
13141                     if (! exists $test_generated{$test_name}) {
13142                         $test_generated{$test_name} = 1;
13143                         generate_tests($OUT,
13144                                         $property_name,
13145                                         $standard,
13146                                         $valid,
13147                                         $invalid,
13148                                         $warning,
13149                                     );
13150                     }
13151                     $random = randomize_loose_name($table_name)
13152                 }
13153                 else { # Stricter match
13154                     $random = randomize_stricter_name($table_name);
13155                 }
13156
13157                 # Now for the main test case for this alias.
13158                 my $test_name = "$property_name=$random";
13159                 if (! exists $test_generated{$test_name}) {
13160                     $test_generated{$test_name} = 1;
13161                     generate_tests($OUT,
13162                                     $property_name,
13163                                     $random,
13164                                     $valid,
13165                                     $invalid,
13166                                     $warning,
13167                                 );
13168
13169                     # If the name is a rational number, add tests for the
13170                     # floating point equivalent.
13171                     if ($table_name =~ qr{/}) {
13172
13173                         # Calculate the float, and find just the fraction.
13174                         my $float = eval $table_name;
13175                         my ($whole, $fraction)
13176                                             = $float =~ / (.*) \. (.*) /x;
13177
13178                         # Starting with one digit after the decimal point,
13179                         # create a test for each possible precision (number of
13180                         # digits past the decimal point) until well beyond the
13181                         # native number found on this machine.  (If we started
13182                         # with 0 digits, it would be an integer, which could
13183                         # well match an unrelated table)
13184                         PLACE:
13185                         for my $i (1 .. $min_floating_slop + 3) {
13186                             my $table_name = sprintf("%.*f", $i, $float);
13187                             if ($i < $MIN_FRACTION_LENGTH) {
13188
13189                                 # If the test case has fewer digits than the
13190                                 # minimum acceptable precision, it shouldn't
13191                                 # succeed, so we expect an error for it.
13192                                 # E.g., 2/3 = .7 at one decimal point, and we
13193                                 # shouldn't say it matches .7.  We should make
13194                                 # it be .667 at least before agreeing that the
13195                                 # intent was to match 2/3.  But at the
13196                                 # less-than- acceptable level of precision, it
13197                                 # might actually match an unrelated number.
13198                                 # So don't generate a test case if this
13199                                 # conflating is possible.  In our example, we
13200                                 # don't want 2/3 matching 7/10, if there is
13201                                 # a 7/10 code point.
13202                                 for my $existing
13203                                         (keys %nv_floating_to_rational)
13204                                 {
13205                                     next PLACE
13206                                         if abs($table_name - $existing)
13207                                                 < $MAX_FLOATING_SLOP;
13208                                 }
13209                                 generate_error($OUT,
13210                                             $property_name,
13211                                             $table_name,
13212                                             1   # 1 => already an error
13213                                 );
13214                             }
13215                             else {
13216
13217                                 # Here the number of digits exceeds the
13218                                 # minimum we think is needed.  So generate a
13219                                 # success test case for it.
13220                                 generate_tests($OUT,
13221                                                 $property_name,
13222                                                 $table_name,
13223                                                 $valid,
13224                                                 $invalid,
13225                                                 $warning,
13226                                 );
13227                             }
13228                         }
13229                     }
13230                 }
13231             }
13232         }
13233     }
13234     print $OUT "Finished();\n";
13235     close $OUT;
13236     return;
13237 }
13238
13239 # This is a list of the input files and how to handle them.  The files are
13240 # processed in their order in this list.  Some reordering is possible if
13241 # desired, but the v0 files should be first, and the extracted before the
13242 # others except DAge.txt (as data in an extracted file can be over-ridden by
13243 # the non-extracted.  Some other files depend on data derived from an earlier
13244 # file, like UnicodeData requires data from Jamo, and the case changing and
13245 # folding requires data from Unicode.  Mostly, it safest to order by first
13246 # version releases in (except the Jamo).  DAge.txt is read before the
13247 # extracted ones because of the rarely used feature $compare_versions.  In the
13248 # unlikely event that there were ever an extracted file that contained the Age
13249 # property information, it would have to go in front of DAge.
13250 #
13251 # The version strings allow the program to know whether to expect a file or
13252 # not, but if a file exists in the directory, it will be processed, even if it
13253 # is in a version earlier than expected, so you can copy files from a later
13254 # release into an earlier release's directory.
13255 my @input_file_objects = (
13256     Input_file->new('PropertyAliases.txt', v0,
13257                     Handler => \&process_PropertyAliases,
13258                     ),
13259     Input_file->new(undef, v0,  # No file associated with this
13260                     Progress_Message => 'Finishing property setup',
13261                     Handler => \&finish_property_setup,
13262                     ),
13263     Input_file->new('PropValueAliases.txt', v0,
13264                      Handler => \&process_PropValueAliases,
13265                      Has_Missings_Defaults => $NOT_IGNORED,
13266                      ),
13267     Input_file->new('DAge.txt', v3.2.0,
13268                     Has_Missings_Defaults => $NOT_IGNORED,
13269                     Property => 'Age'
13270                     ),
13271     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
13272                     Property => 'General_Category',
13273                     ),
13274     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
13275                     Property => 'Canonical_Combining_Class',
13276                     Has_Missings_Defaults => $NOT_IGNORED,
13277                     ),
13278     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
13279                     Property => 'Numeric_Type',
13280                     Has_Missings_Defaults => $NOT_IGNORED,
13281                     ),
13282     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
13283                     Property => 'East_Asian_Width',
13284                     Has_Missings_Defaults => $NOT_IGNORED,
13285                     ),
13286     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
13287                     Property => 'Line_Break',
13288                     Has_Missings_Defaults => $NOT_IGNORED,
13289                     ),
13290     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
13291                     Property => 'Bidi_Class',
13292                     Has_Missings_Defaults => $NOT_IGNORED,
13293                     ),
13294     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
13295                     Property => 'Decomposition_Type',
13296                     Has_Missings_Defaults => $NOT_IGNORED,
13297                     ),
13298     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
13299     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
13300                     Property => 'Numeric_Value',
13301                     Each_Line_Handler => \&filter_numeric_value_line,
13302                     Has_Missings_Defaults => $NOT_IGNORED,
13303                     ),
13304     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
13305                     Property => 'Joining_Group',
13306                     Has_Missings_Defaults => $NOT_IGNORED,
13307                     ),
13308
13309     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
13310                     Property => 'Joining_Type',
13311                     Has_Missings_Defaults => $NOT_IGNORED,
13312                     ),
13313     Input_file->new('Jamo.txt', v2.0.0,
13314                     Property => 'Jamo_Short_Name',
13315                     Each_Line_Handler => \&filter_jamo_line,
13316                     ),
13317     Input_file->new('UnicodeData.txt', v1.1.5,
13318                     Pre_Handler => \&setup_UnicodeData,
13319
13320                     # We clean up this file for some early versions.
13321                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
13322                                             ? \&filter_v1_ucd
13323                                             : ($v_version eq v2.1.5)
13324                                                 ? \&filter_v2_1_5_ucd
13325                                                 : undef),
13326
13327                                             # And the main filter
13328                                             \&filter_UnicodeData_line,
13329                                          ],
13330                     EOF_Handler => \&EOF_UnicodeData,
13331                     ),
13332     Input_file->new('ArabicShaping.txt', v2.0.0,
13333                     Each_Line_Handler =>
13334                         [ ($v_version lt 4.1.0)
13335                                     ? \&filter_old_style_arabic_shaping
13336                                     : undef,
13337                         \&filter_arabic_shaping_line,
13338                         ],
13339                     Has_Missings_Defaults => $NOT_IGNORED,
13340                     ),
13341     Input_file->new('Blocks.txt', v2.0.0,
13342                     Property => 'Block',
13343                     Has_Missings_Defaults => $NOT_IGNORED,
13344                     Each_Line_Handler => \&filter_blocks_lines
13345                     ),
13346     Input_file->new('PropList.txt', v2.0.0,
13347                     Each_Line_Handler => (($v_version lt v3.1.0)
13348                                             ? \&filter_old_style_proplist
13349                                             : undef),
13350                     ),
13351     Input_file->new('Unihan.txt', v2.0.0,
13352                     Pre_Handler => \&setup_unihan,
13353                     Optional => 1,
13354                     Each_Line_Handler => \&filter_unihan_line,
13355                         ),
13356     Input_file->new('SpecialCasing.txt', v2.1.8,
13357                     Each_Line_Handler => \&filter_special_casing_line,
13358                     Pre_Handler => \&setup_special_casing,
13359                     ),
13360     Input_file->new(
13361                     'LineBreak.txt', v3.0.0,
13362                     Has_Missings_Defaults => $NOT_IGNORED,
13363                     Property => 'Line_Break',
13364                     # Early versions had problematic syntax
13365                     Each_Line_Handler => (($v_version lt v3.1.0)
13366                                         ? \&filter_early_ea_lb
13367                                         : undef),
13368                     ),
13369     Input_file->new('EastAsianWidth.txt', v3.0.0,
13370                     Property => 'East_Asian_Width',
13371                     Has_Missings_Defaults => $NOT_IGNORED,
13372                     # Early versions had problematic syntax
13373                     Each_Line_Handler => (($v_version lt v3.1.0)
13374                                         ? \&filter_early_ea_lb
13375                                         : undef),
13376                     ),
13377     Input_file->new('CompositionExclusions.txt', v3.0.0,
13378                     Property => 'Composition_Exclusion',
13379                     ),
13380     Input_file->new('BidiMirroring.txt', v3.0.1,
13381                     Property => 'Bidi_Mirroring_Glyph',
13382                     ),
13383     Input_file->new('CaseFolding.txt', v3.0.1,
13384                     Pre_Handler => \&setup_case_folding,
13385                     Each_Line_Handler =>
13386                         [ ($v_version lt v3.1.0)
13387                                  ? \&filter_old_style_case_folding
13388                                  : undef,
13389                            \&filter_case_folding_line
13390                         ],
13391                     Post_Handler => \&post_fold,
13392                     ),
13393     Input_file->new('DCoreProperties.txt', v3.1.0,
13394                     # 5.2 changed this file
13395                     Has_Missings_Defaults => (($v_version ge v5.2.0)
13396                                             ? $NOT_IGNORED
13397                                             : $NO_DEFAULTS),
13398                     ),
13399     Input_file->new('Scripts.txt', v3.1.0,
13400                     Property => 'Script',
13401                     Has_Missings_Defaults => $NOT_IGNORED,
13402                     ),
13403     Input_file->new('DNormalizationProps.txt', v3.1.0,
13404                     Has_Missings_Defaults => $NOT_IGNORED,
13405                     Each_Line_Handler => (($v_version lt v4.0.1)
13406                                       ? \&filter_old_style_normalization_lines
13407                                       : undef),
13408                     ),
13409     Input_file->new('HangulSyllableType.txt', v4.0.0,
13410                     Has_Missings_Defaults => $NOT_IGNORED,
13411                     Property => 'Hangul_Syllable_Type'),
13412     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
13413                     Property => 'Word_Break',
13414                     Has_Missings_Defaults => $NOT_IGNORED,
13415                     ),
13416     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
13417                     Property => 'Grapheme_Cluster_Break',
13418                     Has_Missings_Defaults => $NOT_IGNORED,
13419                     ),
13420     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
13421                     Property => 'Sentence_Break',
13422                     Has_Missings_Defaults => $NOT_IGNORED,
13423                     ),
13424     Input_file->new('NamedSequences.txt', v4.1.0,
13425                     Handler => \&process_NamedSequences
13426                     ),
13427     Input_file->new('NameAliases.txt', v5.0.0,
13428                     Property => 'Name_Alias',
13429                     ),
13430     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
13431                     Optional => 1,
13432                     Each_Line_Handler => \&filter_unihan_line,
13433                     ),
13434     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
13435                     Optional => 1,
13436                     Each_Line_Handler => \&filter_unihan_line,
13437                     ),
13438     Input_file->new('UnihanIRGSources.txt', v5.2.0,
13439                     Optional => 1,
13440                     Pre_Handler => \&setup_unihan,
13441                     Each_Line_Handler => \&filter_unihan_line,
13442                     ),
13443     Input_file->new('UnihanNumericValues.txt', v5.2.0,
13444                     Optional => 1,
13445                     Each_Line_Handler => \&filter_unihan_line,
13446                     ),
13447     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
13448                     Optional => 1,
13449                     Each_Line_Handler => \&filter_unihan_line,
13450                     ),
13451     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
13452                     Optional => 1,
13453                     Each_Line_Handler => \&filter_unihan_line,
13454                     ),
13455     Input_file->new('UnihanReadings.txt', v5.2.0,
13456                     Optional => 1,
13457                     Each_Line_Handler => \&filter_unihan_line,
13458                     ),
13459     Input_file->new('UnihanVariants.txt', v5.2.0,
13460                     Optional => 1,
13461                     Each_Line_Handler => \&filter_unihan_line,
13462                     ),
13463 );
13464
13465 # End of all the preliminaries.
13466 # Do it...
13467
13468 if ($compare_versions) {
13469     Carp::my_carp(<<END
13470 Warning.  \$compare_versions is set.  Output is not suitable for production
13471 END
13472     );
13473 }
13474
13475 # Put into %potential_files a list of all the files in the directory structure
13476 # that could be inputs to this program, excluding those that we should ignore.
13477 # Also don't consider test files.  Use absolute file names because it makes it
13478 # easier across machine types.
13479 my @ignored_files_full_names = map { File::Spec->rel2abs(
13480                                      internal_file_to_platform($_))
13481                                 } keys %ignored_files;
13482 File::Find::find({
13483     wanted=>sub {
13484         return unless /\.txt$/i;
13485         return if /Test\.txt$/i;
13486         my $full = lc(File::Spec->rel2abs($_));
13487         $potential_files{$full} = 1
13488                         if ! grep { $full eq lc($_) } @ignored_files_full_names;
13489         return;
13490     }
13491 }, File::Spec->curdir());
13492
13493 my @mktables_list_output_files;
13494
13495 if ($write_unchanged_files) {
13496     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
13497 }
13498 else {
13499     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
13500     my $file_handle;
13501     if (! open $file_handle, "<", $file_list) {
13502         Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!");
13503         $glob_list = 1;
13504     }
13505     else {
13506         my @input;
13507
13508         # Read and parse mktables.lst, placing the results from the first part
13509         # into @input, and the second part into @mktables_list_output_files
13510         for my $list ( \@input, \@mktables_list_output_files ) {
13511             while (<$file_handle>) {
13512                 s/^ \s+ | \s+ $//xg;
13513                 next if /^ \s* (?: \# .* )? $/x;
13514                 last if /^ =+ $/x;
13515                 my ( $file ) = split /\t/;
13516                 push @$list, $file;
13517             }
13518             @$list = uniques(@$list);
13519             next;
13520         }
13521
13522         # Look through all the input files
13523         foreach my $input (@input) {
13524             next if $input eq 'version'; # Already have checked this.
13525
13526             # Ignore if doesn't exist.  The checking about whether we care or
13527             # not is done via the Input_file object.
13528             next if ! file_exists($input);
13529
13530             # The paths are stored with relative names, and with '/' as the
13531             # delimiter; convert to absolute on this machine
13532             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
13533             $potential_files{$full} = 1
13534                         if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
13535         }
13536     }
13537
13538     close $file_handle;
13539 }
13540
13541 if ($glob_list) {
13542
13543     # Here wants to process all .txt files in the directory structure.
13544     # Convert them to full path names.  They are stored in the platform's
13545     # relative style
13546     my @known_files;
13547     foreach my $object (@input_file_objects) {
13548         my $file = $object->file;
13549         next unless defined $file;
13550         push @known_files, File::Spec->rel2abs($file);
13551     }
13552
13553     my @unknown_input_files;
13554     foreach my $file (keys %potential_files) {
13555         next if grep { lc($file) eq lc($_) } @known_files;
13556
13557         # Here, the file is unknown to us.  Get relative path name
13558         $file = File::Spec->abs2rel($file);
13559         push @unknown_input_files, $file;
13560
13561         # What will happen is we create a data structure for it, and add it to
13562         # the list of input files to process.  First get the subdirectories
13563         # into an array
13564         my (undef, $directories, undef) = File::Spec->splitpath($file);
13565         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
13566         my @directories = File::Spec->splitdir($directories);
13567
13568         # If the file isn't extracted (meaning none of the directories is the
13569         # extracted one), just add it to the end of the list of inputs.
13570         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
13571             push @input_file_objects, Input_file->new($file, v0);
13572         }
13573         else {
13574
13575             # Here, the file is extracted.  It needs to go ahead of most other
13576             # processing.  Search for the first input file that isn't a
13577             # special required property (that is, find one whose first_release
13578             # is non-0), and isn't extracted.  Also, the Age property file is
13579             # processed before the extracted ones, just in case
13580             # $compare_versions is set.
13581             for (my $i = 0; $i < @input_file_objects; $i++) {
13582                 if ($input_file_objects[$i]->first_released ne v0
13583                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
13584                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
13585                 {
13586                     splice @input_file_objects, $i, 0,
13587                                                     Input_file->new($file, v0);
13588                     last;
13589                 }
13590             }
13591
13592         }
13593     }
13594     if (@unknown_input_files) {
13595         print STDERR simple_fold(join_lines(<<END
13596
13597 The following files are unknown as to how to handle.  Assuming they are
13598 typical property files.  You'll know by later error messages if it worked or
13599 not:
13600 END
13601         ) . " " . join(", ", @unknown_input_files) . "\n\n");
13602     }
13603 } # End of looking through directory structure for more .txt files.
13604
13605 # Create the list of input files from the objects we have defined, plus
13606 # version
13607 my @input_files = 'version';
13608 foreach my $object (@input_file_objects) {
13609     my $file = $object->file;
13610     next if ! defined $file;    # Not all objects have files
13611     next if $object->optional && ! -e $file;
13612     push @input_files,  $file;
13613 }
13614
13615 if ( $verbosity >= $VERBOSE ) {
13616     print "Expecting ".scalar( @input_files )." input files. ",
13617          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
13618 }
13619
13620 # We set $youngest to be the most recently changed input file, including this
13621 # program itself (done much earlier in this file)
13622 foreach my $in (@input_files) {
13623     my $age = -M $in;
13624     next unless defined $age;        # Keep going even if missing a file
13625     $youngest = $age if $age < $youngest;
13626
13627     # See that the input files have distinct names, to warn someone if they
13628     # are adding a new one
13629     if ($make_list) {
13630         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
13631         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
13632         my @directories = File::Spec->splitdir($directories);
13633         my $base = $file =~ s/\.txt$//;
13634         construct_filename($file, 'mutable', \@directories);
13635     }
13636 }
13637
13638 my $ok = ! $write_unchanged_files
13639         && scalar @mktables_list_output_files;        # If none known, rebuild
13640
13641 # Now we check to see if any output files are older than youngest, if
13642 # they are, we need to continue on, otherwise we can presumably bail.
13643 if ($ok) {
13644     foreach my $out (@mktables_list_output_files) {
13645         if ( ! file_exists($out)) {
13646             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
13647             $ok = 0;
13648             last;
13649          }
13650         #local $to_trace = 1 if main::DEBUG;
13651         trace $youngest, -M $out if main::DEBUG && $to_trace;
13652         if ( -M $out > $youngest ) {
13653             #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
13654             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
13655             $ok = 0;
13656             last;
13657         }
13658     }
13659 }
13660 if ($ok) {
13661     print "Files seem to be ok, not bothering to rebuild.\n";
13662     exit(0);
13663 }
13664 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
13665
13666 # Ready to do the major processing.  First create the perl pseudo-property.
13667 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
13668
13669 # Process each input file
13670 foreach my $file (@input_file_objects) {
13671     $file->run;
13672 }
13673
13674 # Finish the table generation.
13675
13676 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
13677 finish_Unicode();
13678
13679 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
13680 compile_perl();
13681
13682 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
13683 add_perl_synonyms();
13684
13685 print "Writing tables\n" if $verbosity >= $PROGRESS;
13686 write_all_tables();
13687
13688 # Write mktables.lst
13689 if ( $file_list and $make_list ) {
13690
13691     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
13692     foreach my $file (@input_files, @files_actually_output) {
13693         my (undef, $directories, $file) = File::Spec->splitpath($file);
13694         my @directories = File::Spec->splitdir($directories);
13695         $file = join '/', @directories, $file;
13696     }
13697
13698     my $ofh;
13699     if (! open $ofh,">",$file_list) {
13700         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
13701         return
13702     }
13703     else {
13704         print $ofh <<"END";
13705 #
13706 # $file_list -- File list for $0.
13707 #
13708 #   Autogenerated on @{[scalar localtime]}
13709 #
13710 # - First section is input files
13711 #   ($0 itself is not listed but is automatically considered an input)
13712 # - Section seperator is /^=+\$/
13713 # - Second section is a list of output files.
13714 # - Lines matching /^\\s*#/ are treated as comments
13715 #   which along with blank lines are ignored.
13716 #
13717
13718 # Input files:
13719
13720 END
13721         print $ofh "$_\n" for sort(@input_files);
13722         print $ofh "\n=================================\n# Output files:\n\n";
13723         print $ofh "$_\n" for sort @files_actually_output;
13724         print $ofh "\n# ",scalar(@input_files)," input files\n",
13725                 "# ",scalar(@files_actually_output)+1," output files\n\n",
13726                 "# End list\n";
13727         close $ofh
13728             or Carp::my_carp("Failed to close $ofh: $!");
13729
13730         print "Filelist has ",scalar(@input_files)," input files and ",
13731             scalar(@files_actually_output)+1," output files\n"
13732             if $verbosity >= $VERBOSE;
13733     }
13734 }
13735
13736 # Output these warnings unless -q explicitly specified.
13737 if ($verbosity >= $NORMAL_VERBOSITY) {
13738     if (@unhandled_properties) {
13739         print "\nProperties and tables that unexpectedly have no code points\n";
13740         foreach my $property (sort @unhandled_properties) {
13741             print $property, "\n";
13742         }
13743     }
13744
13745     if (%potential_files) {
13746         print "\nInput files that are not considered:\n";
13747         foreach my $file (sort keys %potential_files) {
13748             print File::Spec->abs2rel($file), "\n";
13749         }
13750     }
13751     print "\nAll done\n" if $verbosity >= $VERBOSE;
13752 }
13753 exit(0);
13754
13755 # TRAILING CODE IS USED BY make_property_test_script()
13756 __DATA__
13757
13758 use strict;
13759 use warnings;
13760
13761 # Test the \p{} regular expression constructs.  This file is constructed by
13762 # mktables from the tables it generates, so if mktables is buggy, this won't
13763 # necessarily catch those bugs.  Tests are generated for all feasible
13764 # properties; a few aren't currently feasible; see is_code_point_usable()
13765 # in mktables for details.
13766
13767 # Standard test packages are not used because this manipulates SIG_WARN.  It
13768 # exits 0 if every non-skipped test succeeded; -1 if any failed.
13769
13770 my $Tests = 0;
13771 my $Fails = 0;
13772 my $Skips = 0;
13773
13774 my $non_ASCII = (ord('A') != 65);
13775
13776 # The first 127 ASCII characters in ordinal order, with the ones that don't
13777 # have Perl names (as of 5.8) replaced by dots.  The 127th is used as the
13778 # string delimiter
13779 my $ascii_to_ebcdic = "\0......\a\b\t\n.\f\r.................. !\"#\$\%&'()*+,-./0123456789:;<=>?\@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
13780 #for my $i (0..126) {
13781 #    print $i, ": ", substr($ascii_to_ebcdic, $i, 1), "\n";
13782 #}
13783
13784 sub Expect($$$$) {
13785     my $expected = shift;
13786     my $ord = shift;
13787     my $regex  = shift;
13788     my $warning_type = shift;   # Type of warning message, like 'deprecated'
13789                                 # or empty if none
13790     my $line   = (caller)[2];
13791
13792     # Convert the code point to hex form
13793     my $string = sprintf "\"\\x{%04X}\"", $ord;
13794
13795     # Convert the non-ASCII code points expressible as characters in Perl 5.8
13796     # to their ASCII equivalents, and skip the others.
13797     if ($non_ASCII && $ord < 255) {
13798
13799         # Dots are used as place holders in the conversion string for the
13800         # non-convertible ones, so check for it first.
13801         if ($ord == 0x2E) {
13802             $ord = ord('.');
13803         }
13804         elsif ($ord < 0x7F
13805                   # Any dots returned are non-convertible.
13806                  && ((my $char = substr($ascii_to_ebcdic, $ord, 1)) ne '.'))
13807         {
13808             #print STDERR "$ord, $char, \n";
13809             $ord = ord($char);
13810         }
13811         else {
13812             $Tests++;
13813             $Skips++;
13814             print "ok $Tests - $string =~ $regex # Skipped: non-ASCII\n";
13815             return;
13816         }
13817     }
13818
13819     # The first time through, use all warnings.
13820     my @tests = "";
13821
13822     # If the input should generate a warning, add another time through with
13823     # them turned off
13824     push @tests, "no warnings '$warning_type';" if $warning_type;
13825
13826     foreach my $no_warnings (@tests) {
13827
13828         # Store any warning messages instead of outputting them
13829         local $SIG{__WARN__} = $SIG{__WARN__};
13830         my $warning_message;
13831         $SIG{__WARN__} = sub { $warning_message = $_[0] };
13832
13833         $Tests++;
13834
13835         # A string eval is needed because of the 'no warnings'.
13836         # Assumes no parens in the regular expression
13837         my $result = eval "$no_warnings
13838                             my \$RegObj = qr($regex);
13839                             $string =~ \$RegObj ? 1 : 0";
13840         if (not defined $result) {
13841             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
13842             $Fails++;
13843         }
13844         elsif ($result ^ $expected) {
13845             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
13846             $Fails++;
13847         }
13848         elsif ($warning_message) {
13849             if (! $warning_type || ($warning_type && $no_warnings)) {
13850                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
13851                 $Fails++;
13852             }
13853             else {
13854                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
13855             }
13856         }
13857         elsif ($warning_type && ! $no_warnings) {
13858             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
13859             $Fails++;
13860         }
13861         else {
13862             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
13863         }
13864     }
13865     return;
13866 }
13867
13868 sub Error($) {
13869     my $regex  = shift;
13870     $Tests++;
13871     if (eval { 'x' =~ qr/$regex/; 1 }) {
13872         $Fails++;
13873         my $line = (caller)[2];
13874         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
13875     }
13876     else {
13877         my $line = (caller)[2];
13878         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
13879     }
13880     return;
13881 }
13882
13883 sub Finished() {
13884     print "1..$Tests\n";
13885     exit($Fails ? -1 : 0);
13886 }
13887
13888 Error('\p{Script=InGreek}');    # Bug #69018