lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 require 5.008;        # Needs pack "U". Probably safest to run on 5.8.x
   8 use strict;
   9 use warnings;
  10 use Carp;
  11 use File::Find;
  12 use File::Path;
  13 use File::Spec;
  14 use Text::Tabs;
  15
  16 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  17
  18 ##########################################################################
  19 #
  20 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  21 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  22 # a pod file and a .t file
  23 #
  24 # The structure of this file is:
  25 #   First these introductory comments; then
  26 #   code needed for everywhere, such as debugging stuff; then
  27 #   code to handle input parameters; then
  28 #   data structures likely to be of external interest (some of which depend on
  29 #       the input parameters, so follows them; then
  30 #   more data structures and subroutine and package (class) definitions; then
  31 #   the small actual loop to process the input files and finish up; then
  32 #   a __DATA__ section, for the .t tests
  33 #
  34 # This program works on all releases of Unicode through at least 5.2.  The
  35 # outputs have been scrutinized most intently for release 5.1.  The others
  36 # have been checked for somewhat more than just sanity.  It can handle all
  37 # existing Unicode character properties in those releases.
  38 #
  39 # This program needs to be able to run under miniperl.  Therefore, it uses a
  40 # minimum of other modules, and hence implements some things itself that could
  41 # be gotten from CPAN
  42 #
  43 # This program uses inputs published by the Unicode Consortium.  These can
  44 # change incompatibly between releases without the Perl maintainers realizing
  45 # it.  Therefore this program is now designed to try to flag these.  It looks
  46 # at the directories where the inputs are, and flags any unrecognized files.
  47 # It keeps track of all the properties in the files it handles, and flags any
  48 # that it doesn't know how to handle.  It also flags any input lines that
  49 # don't match the expected syntax, among other checks.
  50 # It is also designed so if a new input file matches one of the known
  51 # templates, one hopefully just needs to add it to a list to have it
  52 # processed.
  53 #
  54 # It tries to keep fatal errors to a minimum, to generate something usable for
  55 # testing purposes.  It always looks for files that could be inputs, and will
  56 # warn about any that it doesn't know how to handle (the -q option suppresses
  57 # the warning).
  58 #
  59 # This program is mostly about Unicode character (or code point) properties.
  60 # A property describes some attribute or quality of a code point, like if it
  61 # is lowercase or not, its name, what version of Unicode it was first defined
  62 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  63 # possibilities by making all properties into mappings from each code point
  64 # into some corresponding value.  In the case of it being lowercase or not,
  65 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  66 # property maps each Unicode code point to a single value, called a "property
  67 # value".  (Hence each Unicode property is a true mathematical function with
  68 # exactly one value per code point.)
  69 #
  70 # When using a property in a regular expression, what is desired isn't the
  71 # mapping of the code point to its property's value, but the reverse (or the
  72 # mathematical "inverse relation"): starting with the property value, "Does a
  73 # code point map to it?"  These are written in a "compound" form:
  74 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  75 # files containing the lists of code points that map to each such regular
  76 # expression property value, one file per list
  77 #
  78 # There is also a single form shortcut that Perl adds for many of the commonly
  79 # used properties.  This happens for all binary properties, plus script,
  80 # general_category, and block properties.
  81 #
  82 # Thus the outputs of this program are files.  There are map files, mostly in
  83 # the 'To' directory; and there are list files for use in regular expression
  84 # matching, all in subdirectories of the 'lib' directory, with each
  85 # subdirectory being named for the property that the lists in it are for.
  86 # Bookkeeping, test, and documentation files are also generated.
  87
  88 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  89 my $map_directory = 'To';        # Where map files go.
  90
  91 # DATA STRUCTURES
  92 #
  93 # The major data structures of this program are Property, of course, but also
  94 # Table.  There are two kinds of tables, very similar to each other.
  95 # "Match_Table" is the data structure giving the list of code points that have
  96 # a particular property value, mentioned above.  There is also a "Map_Table"
  97 # data structure which gives the property's mapping from code point to value.
  98 # There are two structures because the match tables need to be combined in
  99 # various ways, such as constructing unions, intersections, complements, etc.,
 100 # and the map ones don't.  And there would be problems, perhaps subtle, if
 101 # a map table were inadvertently operated on in some of those ways.
 102 # The use of separate classes with operations defined on one but not the other
 103 # prevents accidentally confusing the two.
 104 #
 105 # At the heart of each table's data structure is a "Range_List", which is just
 106 # an ordered list of "Ranges", plus ancillary information, and methods to
 107 # operate on them.  A Range is a compact way to store property information.
 108 # Each range has a starting code point, an ending code point, and a value that
 109 # is meant to apply to all the code points between the two end points,
 110 # inclusive.  For a map table, this value is the property value for those
 111 # code points.  Two such ranges could be written like this:
 112 #   0x41 .. 0x5A, 'Upper',
 113 #   0x61 .. 0x7A, 'Lower'
 114 #
 115 # Each range also has a type used as a convenience to classify the values.
 116 # Most ranges in this program will be Type 0, or normal, but there are some
 117 # ranges that have a non-zero type.  These are used only in map tables, and
 118 # are for mappings that don't fit into the normal scheme of things.  Mappings
 119 # that require a hash entry to communicate with utf8.c are one example;
 120 # another example is mappings for charnames.pm to use which indicate a name
 121 # that is algorithmically determinable from its code point (and vice-versa).
 122 # These are used to significantly compact these tables, instead of listing
 123 # each one of the tens of thousands individually.
 124 #
 125 # In a match table, the value of a range is irrelevant (and hence the type as
 126 # well, which will always be 0), and arbitrarily set to the null string.
 127 # Using the example above, there would be two match tables for those two
 128 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 129 # named Lower would contain 0x61..0x7A.
 130 #
 131 # Actually, there are two types of range lists, "Range_Map" is the one
 132 # associated with map tables, and "Range_List" with match tables.
 133 # Again, this is so that methods can be defined on one and not the other so as
 134 # to prevent operating on them in incorrect ways.
 135 #
 136 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 137 # in the perl core.  All tables could in theory be written, but some are
 138 # suppressed because there is no current practical use for them.  It is easy
 139 # to change which get written by changing various lists that are near the top
 140 # of the actual code in this file.  The table data structures contain enough
 141 # ancillary information to allow them to be treated as separate entities for
 142 # writing, such as the path to each one's file.  There is a heading in each
 143 # map table that gives the format of its entries, and what the map is for all
 144 # the code points missing from it.  (This allows tables to be more compact.)
 145
 146 # The Property data structure contains one or more tables.  All properties
 147 # contain a map table (except the $perl property which is a
 148 # pseudo-property containing only match tables), and any properties that
 149 # are usable in regular expression matches also contain various matching
 150 # tables, one for each value the property can have.  A binary property can
 151 # have two values, True and False (or Y and N, which are preferred by Unicode
 152 # terminology).  Thus each of these properties will have a map table that
 153 # takes every code point and maps it to Y or N (but having ranges cuts the
 154 # number of entries in that table way down), and two match tables, one
 155 # which has a list of all the code points that map to Y, and one for all the
 156 # code points that map to N.  (For each of these, a third table is also
 157 # generated for the pseudo Perl property.  It contains the identical code
 158 # points as the Y table, but can be written, not in the compound form, but in
 159 # a "single" form like \p{IsUppercase}.)  Many properties are binary, but some
 160 # properties have several possible values, some have many, and properties like
 161 # Name have a different value for every named code point.  Those will not,
 162 # unless the controlling lists are changed, have their match tables written
 163 # out.  But all the ones which can be used in regular expression \p{} and \P{}
 164 # constructs will.  Generally a property will have either its map table or its
 165 # match tables written but not both.  Again, what gets written is controlled
 166 # by lists which can easily be changed.
 167
 168 # For information about the Unicode properties, see Unicode's UAX44 document:
 169
 170 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 171
 172 # As stated earlier, this program will work on any release of Unicode so far.
 173 # Most obvious problems in earlier data have NOT been corrected except when
 174 # necessary to make Perl or this program work reasonably.  For example, no
 175 # folding information was given in early releases, so this program uses the
 176 # substitute of lower case, just so that a regular expression with the /i
 177 # option will do something that actually gives the right results in many
 178 # cases.  There are also a couple other corrections for version 1.1.5,
 179 # commented at the point they are made.  As an example of corrections that
 180 # weren't made (but could be) is this statement from DerivedAge.txt: "The
 181 # supplementary private use code points and the non-character code points were
 182 # assigned in version 2.0, but not specifically listed in the UCD until
 183 # versions 3.0 and 3.1 respectively."  (To be precise it was 3.0.1 not 3.0.0)
 184 # More information on Unicode version glitches is further down in these
 185 # introductory comments.
 186 #
 187 # This program works on all properties as of 5.2, though the files for some
 188 # are suppressed from apparent lack of demand for.  You can change which are
 189 # output by changing lists in this program.
 190
 191 # The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
 192 # loose matchings rules (from Unicode TR18):
 193 #
 194 #    The recommended names for UCD properties and property values are in
 195 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 196 #    [PropValue]. There are both abbreviated names and longer, more
 197 #    descriptive names. It is strongly recommended that both names be
 198 #    recognized, and that loose matching of property names be used,
 199 #    whereby the case distinctions, whitespace, hyphens, and underbar
 200 #    are ignored.
 201 # The program still allows Fuzzy to override its determination of if loose
 202 # matching should be used, but it isn't currently used, as it is no longer
 203 # needed; the calculations it makes are good enough.
 204
 205 # SUMMARY OF HOW IT WORKS:
 206 #
 207 #   Process arguments
 208 #
 209 #   A list is constructed containing each input file that is to be processed
 210 #
 211 #   Each file on the list is processed in a loop, using the associated handler
 212 #   code for each:
 213 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 214 #            first.  These files name the properties and property values.
 215 #            Objects are created of all the property and property value names
 216 #            that the rest of the input should expect, including all synonyms.
 217 #        The other input files give mappings from properties to property
 218 #           values.  That is, they list code points and say what the mapping
 219 #           is under the given property.  Some files give the mappings for
 220 #           just one property; and some for many.  This program goes through
 221 #           each file and populates the properties from them.  Some properties
 222 #           are listed in more than one file, and Unicode has set up a
 223 #           precedence as to which has priority if there is a conflict.  Thus
 224 #           the order of processing matters, and this program handles the
 225 #           conflict possibility by processing the overriding input files
 226 #           last, so that if necessary they replace earlier values.
 227 #        After this is all done, the program creates the property mappings not
 228 #            furnished by Unicode, but derivable from what it does give.
 229 #        The tables of code points that match each property value in each
 230 #            property that is accessible by regular expressions are created.
 231 #        The Perl-defined properties are created and populated.  Many of these
 232 #            require data determined from the earlier steps
 233 #        Any Perl-defined synonyms are created, and name clashes between Perl
 234 #            and Unicode are reconciled.
 235 #        All the properties are written to files
 236 #        Any other files are written, and final warnings issued.
 237
 238 # As mentioned above, some properties are given in more than one file.  In
 239 # particular, the files in the extracted directory are supposedly just
 240 # reformattings of the others.  But they contain information not easily
 241 # derivable from the other files, including results for Unihan, which this
 242 # program doesn't ordinarily look at, and for unassigned code points.  They
 243 # also have historically had errors or been incomplete.  In an attempt to
 244 # create the best possible data, this program thus processes them first to
 245 # glean information missing from the other files; then processes those other
 246 # files to override any errors in the extracted ones.
 247
 248 # For clarity, a number of operators have been overloaded to work on tables:
 249 #   ~ means invert (take all characters not in the set).  The more
 250 #       conventional '!' is not used because of the possibility of confusing
 251 #       it with the actual boolean operation.
 252 #   + means union
 253 #   - means subtraction
 254 #   & means intersection
 255 # The precedence of these is the order listed.  Parentheses should be
 256 # copiously used.  These are not a general scheme.  The operations aren't
 257 # defined for a number of things, deliberately, to avoid getting into trouble.
 258 # Operations are done on references and affect the underlying structures, so
 259 # that the copy constructors for them have been overloaded to not return a new
 260 # clone, but the input object itself.
 261
 262 # The bool operator is deliberately not overloaded to avoid confusion with
 263 # "should it mean if the object merely exists, or also is non-empty?".
 264
 265 #
 266 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 267
 268 # XXX These comments need more work.
 269 #
 270 # Why have files written out for binary 'N' matches?
 271 #   For binary properties, if you know the mapping for either Y or N; the
 272 #   other is trivial to construct, so could be done at Perl run-time instead
 273 #   of having a file for it.  That is, if someone types in \p{foo: N}, Perl
 274 #   could translate that to \P{foo: Y} and not need a file.   The problem is
 275 #   communicating to Perl that a given property is binary.  Perl can't figure
 276 #   it out from looking at the N (or No), as some non-binary properties have
 277 #   these as property values.
 278 # Why
 279 # There are several types of properties, based on what form their values can
 280 # take on.  These are described in more detail below in the DATA STRUCTURES
 281 # section of these comments, but for now, you should know that there are
 282 # string properties, whose values are strings of one or more code points (such
 283 # as the Uppercase_mapping property); every other property maps to some other
 284 # form, like true or false, or a number, or a name, etc.  The reason there are
 285 # two directories for map files is because of the way utf8.c works.  It
 286 # expects that any files there are string properties, that is that the
 287 # mappings are each to one code point, with mappings in multiple code points
 288 # handled specially in an extra hash data structure.  Digit.pl is a table that
 289 # is written there for historical reasons, even though it doesn't fit that
 290 # mold.  Thus it can't currently be looked at by the Perl core.
 291 #
 292 # There are no match tables generated for matches of the null string.  These
 293 # would like like \p{JSN=}.  Perhaps something like them could be added if
 294 # necessary.  The JSN does have a real code point U+110B that maps to the null
 295 # string, but it is a contributory property, and therefore not output by
 296 # default.
 297 #
 298 # FUTURE ISSUES
 299 #
 300 # The program would break if Unicode were to change its names so that
 301 # interior white space, underscores, or dashes differences were significant
 302 # within property and property value names.
 303 #
 304 # It might be easier to use the xml versions of the UCD if this program ever
 305 # would need heavy revision, and the ability to handle old versions was not
 306 # required.
 307 #
 308 # There is the potential for name collisions, in that Perl has chosen names
 309 # that Unicode could decide it also likes.  There have been such collisions in
 310 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 311 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 312 # collisions, which were withdrawn before the final release, because of Perl's
 313 # and other's protests.  These all involved new properties which began with
 314 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 315 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 316 # Unicode document, so they are unlikely to be used by Unicode for another
 317 # purpose.  However, they might try something beginning with 'In', or use any
 318 # of the other Perl-defined properties.  This program will warn you of name
 319 # collisions, and refuse to generate tables with them, but manual intervention
 320 # will be required in this event.  One scheme that could be implemented, if
 321 # necessary, would be to have this program generate another file, or add a
 322 # field to mktables.lst that gives the date of first definition of a property.
 323 # Each new release of Unicode would use that file as a basis for the next
 324 # iteration.  And the Perl synonym addition code could sort based on the age
 325 # of the property, so older properties get priority, and newer ones that clash
 326 # would be refused; hence existing code would not be impacted, and some other
 327 # synonym would have to be used for the new property.  This is ugly, and
 328 # manual intervention would certainly be easier to do in the short run; lets
 329 # hope it never comes to this.
 330
 331 # A NOTE ON UNIHAN
 332 #
 333 # This program can generate tables from the Unihan database.  But it doesn't
 334 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 335 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 336 # database was split into 8 different files, all beginning with the letters
 337 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 338 # know which of the many properties in the file(s) should have tables created
 339 # for them.  It will create tables for any properties listed in
 340 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 341 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 342 # property you want is not in those files of the release you are building
 343 # against, you must add it to those two arrays.  Starting in 4.0, the
 344 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 345 # is present in the directory, a table will be generated for that property.
 346 # In 5.2, several more properties were added.  For your convenience, the two
 347 # arrays are initialized with all the 5.2 listed properties that are also in
 348 # earlier releases.  But these are commented out.  You can just uncomment the
 349 # ones you want, or use them as a template for adding entries for other
 350 # properties.
 351 #
 352 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 353 # and filter_unihan_line() are the functions where this is done.  This program
 354 # already does some adjusting to make the lines look more like the rest of the
 355 # Unicode DB;  You can see what that is in filter_unihan_line()
 356 #
 357 # There is a bug in the 3.2 data file in which some values for the
 358 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 359 # could be added for these; or for a particular installation, the Unihan.txt
 360 # file could be edited to fix them.
 361 # have to be
 362 #
 363 # HOW TO ADD A FILE
 364
 365 # Unicode Versions Notes
 366
 367 # alpha's numbers halve in 2.1.9, answer cjk block at 4E00 were removed from PropList; not changed, could add gc Letter, put back in in 3.1.0
 368 # Some versions of 2.1.x Jamo.txt have the wrong value for 1105, which causes
 369 # real problems for the algorithms for Jamo calculations, so it is changed
 370 # here.
 371 #   White space vs Space.  in 3.2 perl has +205F=medium math space, fixed in 4.0, and ok in 3.1.1 because not there in unicode. synonym introduced in 4.1
 372 # ATBL = 202.  202 changed to ATB, and all code points stayed there.  So if you were useing ATBL you were out of luck.
 373 # Hrkt Katakana_Or_Hiragana came in 4.01, before was Unknown.
 374 #
 375 # The default for missing code points for BidiClass is complicated.  Starting
 376 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 377 # tries to do the best it can for earlier releases.  It is done in
 378 # process_PropertyAliases()
 379 #
 380 ##############################################################################
 381
 382 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 383                         # and errors
 384 my $MAX_LINE_WIDTH = 78;
 385
 386 # Debugging aid to skip most files so as to not be distracted by them when
 387 # concentrating on the ones being debugged.  Add
 388 # non_skip => 1,
 389 # to the constructor for those files you want processed when you set this.
 390 # Files with a first version number of 0 are special: they are always
 391 # processed regardless of the state of this flag.
 392 my $debug_skip = 0;
 393
 394 # Set to 1 to enable tracing.
 395 our $to_trace = 0;
 396
 397 { # Closure for trace: debugging aid
 398     my $print_caller = 1;        # ? Include calling subroutine name
 399     my $main_with_colon = 'main::';
 400     my $main_colon_length = length($main_with_colon);
 401
 402     sub trace {
 403         return unless $to_trace;        # Do nothing if global flag not set
 404
 405         my @input = @_;
 406
 407         local $DB::trace = 0;
 408         $DB::trace = 0;          # Quiet 'used only once' message
 409
 410         my $line_number;
 411
 412         # Loop looking up the stack to get the first non-trace caller
 413         my $caller_line;
 414         my $caller_name;
 415         my $i = 0;
 416         do {
 417             $line_number = $caller_line;
 418             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 419             $caller = $main_with_colon unless defined $caller;
 420
 421             $caller_name = $caller;
 422
 423             # get rid of pkg
 424             $caller_name =~ s/.*:://;
 425             if (substr($caller_name, 0, $main_colon_length)
 426                 eq $main_with_colon)
 427             {
 428                 $caller_name = substr($caller_name, $main_colon_length);
 429             }
 430
 431         } until ($caller_name ne 'trace');
 432
 433         # If the stack was empty, we were called from the top level
 434         $caller_name = 'main' if ($caller_name eq ""
 435                                     || $caller_name eq 'trace');
 436
 437         my $output = "";
 438         foreach my $string (@input) {
 439             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 440             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 441                 $output .= simple_dumper($string);
 442             }
 443             else {
 444                 $string = "$string" if ref $string;
 445                 $string = $UNDEF unless defined $string;
 446                 chomp $string;
 447                 $string = '""' if $string eq "";
 448                 $output .= " " if $output ne ""
 449                                 && $string ne ""
 450                                 && substr($output, -1, 1) ne " "
 451                                 && substr($string, 0, 1) ne " ";
 452                 $output .= $string;
 453             }
 454         }
 455
 456         if ($print_caller) {
 457             if (defined $line_number) {
 458                     print STDERR sprintf "%4d: ", $line_number;
 459             }
 460             else {
 461                     print STDERR "     ";
 462             }
 463             $caller_name .= ": ";
 464             print STDERR $caller_name;
 465         }
 466
 467         print STDERR $output, "\n";
 468         return;
 469     }
 470 }
 471
 472 # This is for a rarely used development feature that allows you to compare two
 473 # versions of the Unicode standard without having to deal with changes caused
 474 # by the code points introduced in the later verson.  Change the 0 to a SINGLE
 475 # dotted Unicode release number (e.g. 2.1).  Only code points introduced in
 476 # that release and earlier will be used; later ones are thrown away.  You use
 477 # the version number of the earliest one you want to compare; then run this
 478 # program on directory structures containing each release, and compare the
 479 # outputs.  These outputs will therefore include only the code points common
 480 # to both releases, and you can see the changes caused just by the underlying
 481 # release semantic changes.  For versions earlier than 3.2, you must copy a
 482 # version of DAge.txt into the directory.
 483 my $string_compare_versions = DEBUG && 0; #  e.g., v2.1;
 484 my $compare_versions = DEBUG
 485                        && $string_compare_versions
 486                        && pack "C*", split /\./, $string_compare_versions;
 487
 488 sub uniques {
 489     # Returns non-duplicated input values.  From "Perl Best Practices:
 490     # Encapsulated Cleverness".  p. 455 in first edition.
 491
 492     my %seen;
 493     return grep { ! $seen{$_}++ } @_;
 494 }
 495
 496 $0 = File::Spec->canonpath($0);
 497
 498 my $make_test_script = 0;      # ? Should we output a test script
 499 my $write_unchanged_files = 0; # ? Should we update the output files even if
 500                                #    we don't think they have changed
 501 my $use_directory = "";        # ? Should we chdir somewhere.
 502 my $pod_directory;             # input directory to store the pod file.
 503 my $pod_file = 'perluniprops';
 504 my $t_path;                     # Path to the .t test file
 505 my $file_list = 'mktables.lst'; # File to store input and output file names.
 506                                # This is used to speed up the build, by not
 507                                # executing the main body of the program if
 508                                # nothing on the list has changed since the
 509                                # previous build
 510 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 511                                # make a list so that when the pumpking is
 512                                # preparing a release, s/he won't have to do
 513                                # special things
 514 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 515                                # in the input.
 516 my $output_range_counts = 1;   # ? Should we include the number of code points
 517                                # in ranges in the output
 518 # Verbosity levels; 0 is quiet
 519 my $NORMAL_VERBOSITY = 1;
 520 my $PROGRESS = 2;
 521 my $VERBOSE = 3;
 522
 523 my $verbosity = $NORMAL_VERBOSITY;
 524
 525 # Process arguments
 526 while (@ARGV) {
 527     my $arg = shift @ARGV;
 528     if ($arg eq '-v') {
 529         $verbosity = $VERBOSE;
 530     }
 531     elsif ($arg eq '-p') {
 532         $verbosity = $PROGRESS;
 533         $| = 1;     # Flush buffers as we go.
 534     }
 535     elsif ($arg eq '-q') {
 536         $verbosity = 0;
 537     }
 538     elsif ($arg eq '-w') {
 539         $write_unchanged_files = 1; # update the files even if havent changed
 540     }
 541     elsif ($arg eq '-check') {
 542         my $this = shift @ARGV;
 543         my $ok = shift @ARGV;
 544         if ($this ne $ok) {
 545             print "Skipping as check params are not the same.\n";
 546             exit(0);
 547         }
 548     }
 549     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 550         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 551     }
 552     elsif ($arg eq '-maketest'
 553              || ($arg eq '-T' && defined ($t_path = shift))) {
 554         $make_test_script = 1;
 555         $t_path = 'TestProp.pl' unless defined $t_path;
 556     }
 557     elsif ($arg eq '-makelist') {
 558         $make_list = 1;
 559     }
 560     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 561         -d $use_directory or croak "Unknown directory '$use_directory'";
 562     }
 563     elsif ($arg eq '-L') {
 564
 565         # Existence not tested until have chdir'd
 566         $file_list = shift;
 567     }
 568     elsif ($arg eq '-globlist') {
 569         $glob_list = 1;
 570     }
 571     elsif ($arg eq '-c') {
 572         $output_range_counts = ! $output_range_counts
 573     }
 574     else {
 575         my $with_c = 'with';
 576         $with_c .= 'out' if $output_range_counts;   # Complements the state
 577         croak <<END;
 578 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 579           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 580           [-check A B ]
 581   -c          : Output comments $with_c number of code points in ranges
 582   -q          : Quiet Mode: Only output serious warnings.
 583   -p          : Set verbosity level to normal plus show progress.
 584   -v          : Set Verbosity level high:  Show progress and non-serious
 585                 warnings
 586   -w          : Write files regardless
 587   -C dir      : Change to this directory before proceeding. All relative paths
 588                 except those specified by the -P and -T options will be done
 589                 with respect to this directory.
 590   -P dir      : Output $pod_file file to directory 'dir'.
 591   -T path     : Create a .t test file as 'path'
 592   -L filelist : Use alternate 'filelist' instead of standard one
 593   -globlist   : Take as input all non-Test *.txt files in current and sub
 594                 directories
 595   -maketest   : Make test script
 596   -makelist   : Rewrite the file list $file_list based on current setup
 597   -check A B  : Executes $0 only if A and B are the same
 598 END
 599     }
 600 }
 601
 602 # Stores the most-recently changed file.  If none have changed, can skip the
 603 # build
 604 my $youngest = -M $0;   # Do this before the chdir!
 605
 606 # Change directories now, because need to read 'version' early.
 607 if ($use_directory) {
 608     if ($pod_directory
 609         && ! File::Spec->file_name_is_absolute($pod_directory))
 610     {
 611         $pod_directory = File::Spec->rel2abs($pod_directory);
 612     }
 613     if ($t_path
 614         && ! File::Spec->file_name_is_absolute($t_path))
 615     {
 616         $t_path = File::Spec->rel2abs($t_path);
 617     }
 618     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 619     if ($pod_directory
 620         && File::Spec->file_name_is_absolute($pod_directory))
 621     {
 622         $pod_directory = File::Spec->abs2rel($pod_directory);
 623     }
 624     if ($t_path
 625         && File::Spec->file_name_is_absolute($t_path))
 626     {
 627         $t_path = File::Spec->abs2rel($t_path);
 628     }
 629 }
 630
 631 # Get Unicode version into regular and v-string.  This is done now because
 632 # various tables below get populated based on it.  These tables are populated
 633 # here to be near the top of the file, and so easily seeable by those needing
 634 # to modify things.
 635 open my $VERSION, "<", "version"
 636                     or croak "$0: can't open required file 'version': $!\n";
 637 my $string_version = <$VERSION>;
 638 close $VERSION;
 639 chomp $string_version;
 640 my $v_version = pack "C*", split /\./, $string_version;        # v string
 641
 642 # The following are the complete names of properties with property values that
 643 # are known to not match any code points in some versions of Unicode, but that
 644 # may change in the future so they should be matchable, hence an empty file is
 645 # generated for them.
 646 my @tables_that_may_be_empty = (
 647                                 'Joining_Type=Left_Joining',
 648                                 );
 649 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 650 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 651 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 652                                                     if $v_version ge v4.1.0;
 653
 654 # The lists below are hashes, so the key is the item in the list, and the
 655 # value is the reason why it is in the list.  This makes generation of
 656 # documentation easier.
 657
 658 my %why_suppressed;  # No file generated for these.
 659
 660 # Files aren't generated for empty extraneous properties.  This is arguable.
 661 # Extraneous properties generally come about because a property is no longer
 662 # used in a newer version of Unicode.  If we generated a file without code
 663 # points, programs that used to work on that property will still execute
 664 # without errors.  It just won't ever match (or will always match, with \P{}).
 665 # This means that the logic is now likely wrong.  I (khw) think its better to
 666 # find this out by getting an error message.  Just move them to the table
 667 # above to change this behavior
 668 my %why_suppress_if_empty_warn_if_not = (
 669
 670    # It is the only property that has ever officially been removed from the
 671    # Standard.  The database never contained any code points for it.
 672    'Special_Case_Condition' => 'Obsolete',
 673
 674    # Apparently never official, but there were code points in some versions of
 675    # old-style PropList.txt
 676    'Non_Break' => 'Obsolete',
 677 );
 678
 679 # These would normally go in the warn table just above, but they were changed
 680 # a long time before this program was written, so warnings about them are
 681 # moot.
 682 if ($v_version gt v3.2.0) {
 683     push @tables_that_may_be_empty,
 684                                 'Canonical_Combining_Class=Attached_Below_Left'
 685 }
 686
 687 # These are listed in the Property aliases file in 5.2, but Unihan is ignored
 688 # unless explicitly added.
 689 if ($v_version ge v5.2.0) {
 690     my $unihan = 'Unihan; remove from list if using Unihan';
 691     foreach my $table qw (
 692                            kAccountingNumeric
 693                            kOtherNumeric
 694                            kPrimaryNumeric
 695                            kCompatibilityVariant
 696                            kIICore
 697                            kIRG_GSource
 698                            kIRG_HSource
 699                            kIRG_JSource
 700                            kIRG_KPSource
 701                            kIRG_MSource
 702                            kIRG_KSource
 703                            kIRG_TSource
 704                            kIRG_USource
 705                            kIRG_VSource
 706                            kRSUnicode
 707                         )
 708     {
 709         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 710     }
 711 }
 712
 713 # Properties that this program ignores.
 714 my @unimplemented_properties = (
 715 'Unicode_Radical_Stroke'    # Remove if changing to handle this one.
 716 );
 717
 718 # There are several types of obsolete properties defined by Unicode.  These
 719 # must be hand-edited for every new Unicode release.
 720 my %why_deprecated;  # Generates a deprecated warning message if used.
 721 my %why_stabilized;  # Documentation only
 722 my %why_obsolete;    # Documentation only
 723
 724 {   # Closure
 725     my $simple = 'Perl uses the more complete version of this property';
 726     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 727
 728     my $other_properties = 'other properties';
 729     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 730     my $why_no_expand  = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
 731
 732     %why_deprecated = (
 733         'Grapheme_Link' => 'Deprecated by Unicode.  Use ccc=vr (Canonical_Combining_Class=Virama) instead',
 734         'Jamo_Short_Name' => $contributory,
 735         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 736         'Other_Alphabetic' => $contributory,
 737         'Other_Default_Ignorable_Code_Point' => $contributory,
 738         'Other_Grapheme_Extend' => $contributory,
 739         'Other_ID_Continue' => $contributory,
 740         'Other_ID_Start' => $contributory,
 741         'Other_Lowercase' => $contributory,
 742         'Other_Math' => $contributory,
 743         'Other_Uppercase' => $contributory,
 744     );
 745
 746     %why_suppressed = (
 747         # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
 748         # contains the same information, but without the algorithmically
 749         # determinable Hangul syllables'.  This file is not published, so it's
 750         # existence is not noted in the comment.
 751         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
 752
 753         'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo.  Obsoleted, and code points for it removed in Unicode 5.2',
 754         'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo.  If there is no later name for a code point, then this one is used instead in charnames",
 755
 756         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold",
 757         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 758         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 759         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 760
 761         'Name' => "Accessible via 'use charnames;'",
 762         'Name_Alias' => "Accessible via 'use charnames;'",
 763
 764         # These are sort of jumping the gun; deprecation is proposed for
 765         # Unicode version 6.0, but they have never been exposed by Perl, and
 766         # likely are soon to be deprecated, so best not to expose them.
 767         FC_NFKC_Closure => 'Use NFKC_Casefold instead',
 768         Expands_On_NFC => $why_no_expand,
 769         Expands_On_NFD => $why_no_expand,
 770         Expands_On_NFKC => $why_no_expand,
 771         Expands_On_NFKD => $why_no_expand,
 772     );
 773
 774     # The following are suppressed because they were made contributory or
 775     # deprecated by Unicode before Perl ever thought about supporting them.
 776     foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
 777         $why_suppressed{$property} = $why_deprecated{$property};
 778     }
 779
 780     # Customize the message for all the 'Other_' properties
 781     foreach my $property (keys %why_deprecated) {
 782         next if (my $main_property = $property) !~ s/^Other_//;
 783         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
 784     }
 785 }
 786
 787 if ($v_version ge 4.0.0) {
 788     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
 789 }
 790 if ($v_version ge 5.2.0) {
 791     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
 792 }
 793
 794 # Probably obsolete forever
 795 if ($v_version ge v4.1.0) {
 796     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common"';
 797 }
 798
 799 # This program can create files for enumerated-like properties, such as
 800 # 'Numeric_Type'.  This file would be the same format as for a string
 801 # property, with a mapping from code point to its value, so you could look up,
 802 # for example, the script a code point is in.  But no one so far wants this
 803 # mapping, or they have found another way to get it since this is a new
 804 # feature.  So no file is generated except if it is in this list.
 805 my @output_mapped_properties = split "\n", <<END;
 806 END
 807
 808 # If you are using the Unihan database, you need to add the properties that
 809 # you want to extract from it to this table.  For your convenience, the
 810 # properties in the 5.2 PropertyAliases.txt file are listed, commented out
 811 my @cjk_properties = split "\n", <<'END';
 812 #cjkAccountingNumeric; kAccountingNumeric
 813 #cjkOtherNumeric; kOtherNumeric
 814 #cjkPrimaryNumeric; kPrimaryNumeric
 815 #cjkCompatibilityVariant; kCompatibilityVariant
 816 #cjkIICore ; kIICore
 817 #cjkIRG_GSource; kIRG_GSource
 818 #cjkIRG_HSource; kIRG_HSource
 819 #cjkIRG_JSource; kIRG_JSource
 820 #cjkIRG_KPSource; kIRG_KPSource
 821 #cjkIRG_KSource; kIRG_KSource
 822 #cjkIRG_TSource; kIRG_TSource
 823 #cjkIRG_USource; kIRG_USource
 824 #cjkIRG_VSource; kIRG_VSource
 825 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
 826 END
 827
 828 # Similarly for the property values.  For your convenience, the lines in the
 829 # 5.2 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
 830 # '#' marks
 831 my @cjk_property_values = split "\n", <<'END';
 832 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
 833 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
 834 ## @missing: 0000..10FFFF; cjkIICore; <none>
 835 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
 836 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
 837 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
 838 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
 839 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
 840 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
 841 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
 842 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
 843 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
 844 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
 845 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
 846 END
 847
 848 # The input files don't list every code point.  Those not listed are to be
 849 # defaulted to some value.  Below are hard-coded what those values are for
 850 # non-binary properties as of 5.1.  Starting in 5.0, there are
 851 # machine-parsable comment lines in the files the give the defaults; so this
 852 # list shouldn't have to be extended.  The claim is that all missing entries
 853 # for binary properties will default to 'N'.  Unicode tried to change that in
 854 # 5.2, but the beta period produced enough protest that they backed off.
 855 #
 856 # The defaults for the fields that appear in UnicodeData.txt in this hash must
 857 # be in the form that it expects.  The others may be synonyms.
 858 my $CODE_POINT = '<code point>';
 859 my %default_mapping = (
 860     Age => "Unassigned",
 861     # Bidi_Class => Complicated; set in code
 862     Bidi_Mirroring_Glyph => "",
 863     Block => 'No_Block',
 864     Canonical_Combining_Class => 0,
 865     Case_Folding => $CODE_POINT,
 866     Decomposition_Mapping => $CODE_POINT,
 867     Decomposition_Type => 'None',
 868     East_Asian_Width => "Neutral",
 869     FC_NFKC_Closure => $CODE_POINT,
 870     General_Category => 'Cn',
 871     Grapheme_Cluster_Break => 'Other',
 872     Hangul_Syllable_Type => 'NA',
 873     ISO_Comment => "",
 874     Jamo_Short_Name => "",
 875     Joining_Group => "No_Joining_Group",
 876     # Joining_Type => Complicated; set in code
 877     kIICore => 'N',   #                       Is converted to binary
 878     #Line_Break => Complicated; set in code
 879     Lowercase_Mapping => $CODE_POINT,
 880     Name => "",
 881     Name_Alias => "",
 882     NFC_QC => 'Yes',
 883     NFD_QC => 'Yes',
 884     NFKC_QC => 'Yes',
 885     NFKD_QC => 'Yes',
 886     Numeric_Type => 'None',
 887     Numeric_Value => 'NaN',
 888     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
 889     Sentence_Break => 'Other',
 890     Simple_Case_Folding => $CODE_POINT,
 891     Simple_Lowercase_Mapping => $CODE_POINT,
 892     Simple_Titlecase_Mapping => $CODE_POINT,
 893     Simple_Uppercase_Mapping => $CODE_POINT,
 894     Titlecase_Mapping => $CODE_POINT,
 895     Unicode_1_Name => "",
 896     Unicode_Radical_Stroke => "",
 897     Uppercase_Mapping => $CODE_POINT,
 898     Word_Break => 'Other',
 899 );
 900
 901 # Below are files that Unicode furnishes, but this program ignores, and why
 902 my %ignored_files = (
 903     'CJKRadicals.txt' => 'Unihan data',
 904     'Index.txt' => 'An index, not actual data',
 905     'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
 906     'NamesList.txt' => 'Just adds commentary',
 907     'NormalizationCorrections.txt' => 'Data is already in other files.',
 908     'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
 909     'ReadMe.txt' => 'Just comments',
 910     'README.TXT' => 'Just comments',
 911     'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property.  Does not fit into current scheme where one code point is mapped',
 912 );
 913
 914 ################ End of externally interesting definitions ###############
 915
 916 my $HEADER=<<"EOF";
 917 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 918 # This file is machine-generated by $0 from the Unicode database,
 919 # Version $string_version.  Any changes made here will be lost!
 920 EOF
 921
 922 my $INTERNAL_ONLY=<<"EOF";
 923
 924 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
 925 # This file is for internal use by the Perl program only.  The format and even
 926 # the name or existence of this file are subject to change without notice.
 927 # Don't use it directly.
 928 EOF
 929
 930 my $DEVELOPMENT_ONLY=<<"EOF";
 931 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
 932 # This file contains information artificially constrained to code points
 933 # present in Unicode release $string_compare_versions.
 934 # IT CANNOT BE RELIED ON.  It is for use during development only and should
 935 # not be used for production.
 936
 937 EOF
 938
 939 my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
 940 my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
 941 my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
 942
 943 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
 944 # two must be 10; if there are 5, the first must not be a 0.  Written this way
 945 # to decrease backtracking
 946 my $code_point_re =
 947         qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
 948
 949 # This matches the beginning of the line in the Unicode db files that give the
 950 # defaults for code points not listed (i.e., missing) in the file.  The code
 951 # depends on this ending with a semi-colon, so it can assume it is a valid
 952 # field when the line is split() by semi-colons
 953 my $missing_defaults_prefix =
 954             qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
 955
 956 # Property types.  Unicode has more types, but these are sufficient for our
 957 # purposes.
 958 my $UNKNOWN = -1;   # initialized to illegal value
 959 my $NON_STRING = 1; # Either binary or enum
 960 my $BINARY = 2;
 961 my $ENUM = 3;       # Include catalog
 962 my $STRING = 4;     # Anything else: string or misc
 963
 964 # Some input files have lines that give default values for code points not
 965 # contained in the file.  Sometimes these should be ignored.
 966 my $NO_DEFAULTS = 0;        # Must evaluate to false
 967 my $NOT_IGNORED = 1;
 968 my $IGNORED = 2;
 969
 970 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
 971 # and will appear in the main body of the tables in the output files, but
 972 # there are other types of ranges as well, listed below, that are specially
 973 # handled.   There are pseudo-types as well that will never be stored as a
 974 # type, but will affect the calculation of the type.
 975
 976 # 0 is for normal, non-specials
 977 my $MULTI_CP = 1;           # Sequence of more than code point
 978 my $HANGUL_SYLLABLE = 2;
 979 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
 980 my $NULL = 4;               # The map is to the null string; utf8.c can't
 981                             # handle these, nor is there an accepted syntax
 982                             # for them in \p{} constructs
 983 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
 984                              # otherwise be $MULTI_CP type are instead type 0
 985
 986 # process_generic_property_file() can accept certain overrides in its input.
 987 # Each of these must begin AND end with $CMD_DELIM.
 988 my $CMD_DELIM = "\a";
 989 my $REPLACE_CMD = 'replace';    # Override the Replace
 990 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
 991
 992 my $NO = 0;
 993 my $YES = 1;
 994
 995 # Values for the Replace argument to add_range.
 996 # $NO                      # Don't replace; add only the code points not
 997                            # already present.
 998 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
 999                            # the comments at the subroutine definition.
1000 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1001 my $MULTIPLE = 4;          # Don't replace, but add a duplicate record if
1002                            # already there
1003
1004 # Flags to give property statuses.  The phrases are to remind maintainers that
1005 # if the flag is changed, the indefinite article referring to it in the
1006 # documentation may need to be as well.
1007 my $NORMAL = "";
1008 my $SUPPRESSED = 'z';   # The character should never actually be seen, since
1009                         # it is suppressed
1010 my $DEPRECATED = 'D';
1011 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1012 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1013 my $DISCOURAGED = 'X';
1014 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1015 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1016 my $STRICTER = 'T';
1017 my $a_bold_stricter = "a 'B<$STRICTER>'";
1018 my $A_bold_stricter = "A 'B<$STRICTER>'";
1019 my $STABILIZED = 'S';
1020 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1021 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1022 my $OBSOLETE = 'O';
1023 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1024 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1025
1026 my %status_past_participles = (
1027     $DISCOURAGED => 'discouraged',
1028     $SUPPRESSED => 'should never be generated',
1029     $STABILIZED => 'stabilized',
1030     $OBSOLETE => 'obsolete',
1031     $DEPRECATED => 'deprecated'
1032 );
1033
1034 # The format of the values of the map tables:
1035 my $BINARY_FORMAT = 'b';
1036 my $DECIMAL_FORMAT = 'd';
1037 my $FLOAT_FORMAT = 'f';
1038 my $INTEGER_FORMAT = 'i';
1039 my $HEX_FORMAT = 'x';
1040 my $RATIONAL_FORMAT = 'r';
1041 my $STRING_FORMAT = 's';
1042
1043 my %map_table_formats = (
1044     $BINARY_FORMAT => 'binary',
1045     $DECIMAL_FORMAT => 'single decimal digit',
1046     $FLOAT_FORMAT => 'floating point number',
1047     $INTEGER_FORMAT => 'integer',
1048     $HEX_FORMAT => 'positive hex whole number; a code point',
1049     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1050     $STRING_FORMAT => 'arbitrary string',
1051 );
1052
1053 # Unicode didn't put such derived files in a separate directory at first.
1054 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1055 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1056 my $AUXILIARY = 'auxiliary';
1057
1058 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1059 my %loose_to_file_of;       # loosely maps table names to their respective
1060                             # files
1061 my %stricter_to_file_of;    # same; but for stricter mapping.
1062 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1063                              # their rational equivalent
1064 my %loose_property_name_of; # Loosely maps property names to standard form
1065
1066 # These constants names and values were taken from the Unicode standard,
1067 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1068 # syllables
1069 my $SBase = 0xAC00;
1070 my $LBase = 0x1100;
1071 my $VBase = 0x1161;
1072 my $TBase = 0x11A7;
1073 my $SCount = 11172;
1074 my $LCount = 19;
1075 my $VCount = 21;
1076 my $TCount = 28;
1077 my $NCount = $VCount * $TCount;
1078
1079 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1080 # with the above published constants.
1081 my %Jamo;
1082 my %Jamo_L;     # Leading consonants
1083 my %Jamo_V;     # Vowels
1084 my %Jamo_T;     # Trailing consonants
1085
1086 my @unhandled_properties;  # Will contain a list of properties found in
1087                            # the input that we didn't process.
1088 my @match_properties;      # Properties that have match tables, to be
1089                            # listed in the pod
1090 my @map_properties;        # Properties that get map files written
1091 my @named_sequences;       # NamedSequences.txt contents.
1092 my %potential_files;       # Generated list of all .txt files in the directory
1093                            # structure so we can warn if something is being
1094                            # ignored.
1095 my @files_actually_output; # List of files we generated.
1096 my @more_Names;            # Some code point names are compound; this is used
1097                            # to store the extra components of them.
1098 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1099                            # the minimum before we consider it equivalent to a
1100                            # candidate rational
1101 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1102
1103 # These store references to certain commonly used property objects
1104 my $gc;
1105 my $perl;
1106 my $block;
1107
1108 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1109 my $has_In_conflicts = 0;
1110 my $has_Is_conflicts = 0;
1111
1112 sub internal_file_to_platform ($) {
1113     # Convert our file paths which have '/' separators to those of the
1114     # platform.
1115
1116     my $file = shift;
1117     return undef unless defined $file;
1118
1119     return File::Spec->join(split '/', $file);
1120 }
1121
1122 sub file_exists ($) {   # platform independent '-e'.  This program internally
1123                         # uses slash as a path separator.
1124     my $file = shift;
1125     return 0 if ! defined $file;
1126     return -e internal_file_to_platform($file);
1127 }
1128
1129 # This 'require' doesn't necessarily work in miniperl, and even if it does,
1130 # the native perl version of it (which is what would operate under miniperl)
1131 # is extremely slow, as it does a string eval every call.
1132 my $has_fast_scalar_util = $\18 !~ /miniperl/
1133                             && defined eval "require Scalar::Util";
1134
1135 sub objaddr($) {
1136     # Returns the address of the blessed input object.  Uses the XS version if
1137     # available.  It doesn't check for blessedness because that would do a
1138     # string eval every call, and the program is structured so that this is
1139     # never called for a non-blessed object.
1140
1141     return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1142
1143     # Check at least that is a ref.
1144     my $pkg = ref($_[0]) or return undef;
1145
1146     # Change to a fake package to defeat any overloaded stringify
1147     bless $_[0], 'main::Fake';
1148
1149     # Numifying a ref gives its address.
1150     my $addr = 0 + $_[0];
1151
1152     # Return to original class
1153     bless $_[0], $pkg;
1154     return $addr;
1155 }
1156
1157 sub max ($$) {
1158     my $a = shift;
1159     my $b = shift;
1160     return $a if $a >= $b;
1161     return $b;
1162 }
1163
1164 sub min ($$) {
1165     my $a = shift;
1166     my $b = shift;
1167     return $a if $a <= $b;
1168     return $b;
1169 }
1170
1171 sub clarify_number ($) {
1172     # This returns the input number with underscores inserted every 3 digits
1173     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1174     # checked.
1175
1176     my $number = shift;
1177     my $pos = length($number) - 3;
1178     return $number if $pos <= 1;
1179     while ($pos > 0) {
1180         substr($number, $pos, 0) = '_';
1181         $pos -= 3;
1182     }
1183     return $number;
1184 }
1185
1186
1187 package Carp;
1188
1189 # These routines give a uniform treatment of messages in this program.  They
1190 # are placed in the Carp package to cause the stack trace to not include them,
1191 # although an alternative would be to use another package and set @CARP_NOT
1192 # for it.
1193
1194 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1195
1196 sub my_carp {
1197     my $message = shift || "";
1198     my $nofold = shift || 0;
1199
1200     if ($message) {
1201         $message = main::join_lines($message);
1202         $message =~ s/^$0: *//;     # Remove initial program name
1203         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1204         $message = "\n$0: $message;";
1205
1206         # Fold the message with program name, semi-colon end punctuation
1207         # (which looks good with the message that carp appends to it), and a
1208         # hanging indent for continuation lines.
1209         $message = main::simple_fold($message, "", 4) unless $nofold;
1210         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1211                                     # appends is to the same line
1212     }
1213
1214     return $message if defined wantarray;   # If a caller just wants the msg
1215
1216     carp $message;
1217     return;
1218 }
1219
1220 sub my_carp_bug {
1221     # This is called when it is clear that the problem is caused by a bug in
1222     # this program.
1223
1224     my $message = shift;
1225     $message =~ s/^$0: *//;
1226     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1227     carp $message;
1228     return;
1229 }
1230
1231 sub carp_too_few_args {
1232     if (@_ != 2) {
1233         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1234         return;
1235     }
1236
1237     my $args_ref = shift;
1238     my $count = shift;
1239
1240     my_carp_bug("Need at least $count arguments to "
1241         . (caller 1)[3]
1242         . ".  Instead got: '"
1243         . join ', ', @$args_ref
1244         . "'.  No action taken.");
1245     return;
1246 }
1247
1248 sub carp_extra_args {
1249     my $args_ref = shift;
1250     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1251
1252     unless (ref $args_ref) {
1253         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1254         return;
1255     }
1256     my ($package, $file, $line) = caller;
1257     my $subroutine = (caller 1)[3];
1258
1259     my $list;
1260     if (ref $args_ref eq 'HASH') {
1261         foreach my $key (keys %$args_ref) {
1262             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1263         }
1264         $list = join ', ', each %{$args_ref};
1265     }
1266     elsif (ref $args_ref eq 'ARRAY') {
1267         foreach my $arg (@$args_ref) {
1268             $arg = $UNDEF unless defined $arg;
1269         }
1270         $list = join ', ', @$args_ref;
1271     }
1272     else {
1273         my_carp_bug("Can't cope with ref "
1274                 . ref($args_ref)
1275                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1276         return;
1277     }
1278
1279     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1280     return;
1281 }
1282
1283 package main;
1284
1285 { # Closure
1286
1287     # This program uses the inside-out method for objects, as recommended in
1288     # "Perl Best Practices".  This closure aids in generating those.  There
1289     # are two routines.  setup_package() is called once per package to set
1290     # things up, and then set_access() is called for each hash representing a
1291     # field in the object.  These routines arrange for the object to be
1292     # properly destroyed when no longer used, and for standard accessor
1293     # functions to be generated.  If you need more complex accessors, just
1294     # write your own and leave those accesses out of the call to set_access().
1295     # More details below.
1296
1297     my %constructor_fields; # fields that are to be used in constructors; see
1298                             # below
1299
1300     # The values of this hash will be the package names as keys to other
1301     # hashes containing the name of each field in the package as keys, and
1302     # references to their respective hashes as values.
1303     my %package_fields;
1304
1305     sub setup_package {
1306         # Sets up the package, creating standard DESTROY and dump methods
1307         # (unless already defined).  The dump method is used in debugging by
1308         # simple_dumper().
1309         # The optional parameters are:
1310         #   a)  a reference to a hash, that gets populated by later
1311         #       set_access() calls with one of the accesses being
1312         #       'constructor'.  The caller can then refer to this, but it is
1313         #       not otherwise used by these two routines.
1314         #   b)  a reference to a callback routine to call during destruction
1315         #       of the object, before any fields are actually destroyed
1316
1317         my %args = @_;
1318         my $constructor_ref = delete $args{'Constructor_Fields'};
1319         my $destroy_callback = delete $args{'Destroy_Callback'};
1320         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1321
1322         my %fields;
1323         my $package = (caller)[0];
1324
1325         $package_fields{$package} = \%fields;
1326         $constructor_fields{$package} = $constructor_ref;
1327
1328         unless ($package->can('DESTROY')) {
1329             my $destroy_name = "${package}::DESTROY";
1330             no strict "refs";
1331
1332             # Use typeglob to give the anonymous subroutine the name we want
1333             *$destroy_name = sub {
1334                 my $self = shift;
1335                 my $addr = main::objaddr($self);
1336
1337                 $self->$destroy_callback if $destroy_callback;
1338                 foreach my $field (keys %{$package_fields{$package}}) {
1339                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1340                     delete $package_fields{$package}{$field}{$addr};
1341                 }
1342                 return;
1343             }
1344         }
1345
1346         unless ($package->can('dump')) {
1347             my $dump_name = "${package}::dump";
1348             no strict "refs";
1349             *$dump_name = sub {
1350                 my $self = shift;
1351                 return dump_inside_out($self, $package_fields{$package}, @_);
1352             }
1353         }
1354         return;
1355     }
1356
1357     sub set_access {
1358         # Arrange for the input field to be garbage collected when no longer
1359         # needed.  Also, creates standard accessor functions for the field
1360         # based on the optional parameters-- none if none of these parameters:
1361         #   'addable'    creates an 'add_NAME()' accessor function.
1362         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1363         #                function.
1364         #   'settable'   creates a 'set_NAME()' accessor function.
1365         #   'constructor' doesn't create an accessor function, but adds the
1366         #                field to the hash that was previously passed to
1367         #                setup_package();
1368         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1369         # 'add' etc. all mean 'addable'.
1370         # The read accessor function will work on both array and scalar
1371         # values.  If another accessor in the parameter list is 'a', the read
1372         # access assumes an array.  You can also force it to be array access
1373         # by specifying 'readable_array' instead of 'readable'
1374         #
1375         # A sort-of 'protected' access can be set-up by preceding the addable,
1376         # readable or settable with some initial portion of 'protected_' (but,
1377         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1378         # "protection" is only by convention.  All that happens is that the
1379         # accessor functions' names begin with an underscore.  So instead of
1380         # calling set_foo, the call is _set_foo.  (Real protection could be
1381         # accomplished by having a new subroutine, end_package called at the
1382         # end of each package, and then storing the __LINE__ ranges and
1383         # checking them on every accessor.  But that is way overkill.)
1384
1385         # We create anonymous subroutines as the accessors and then use
1386         # typeglobs to assign them to the proper package and name
1387
1388         my $name = shift;   # Name of the field
1389         my $field = shift;  # Reference to the inside-out hash containing the
1390                             # field
1391
1392         my $package = (caller)[0];
1393
1394         if (! exists $package_fields{$package}) {
1395             croak "$0: Must call 'setup_package' before 'set_access'";
1396         }
1397
1398         # Stash the field so DESTROY can get it.
1399         $package_fields{$package}{$name} = $field;
1400
1401         # Remaining arguments are the accessors.  For each...
1402         foreach my $access (@_) {
1403             my $access = lc $access;
1404
1405             my $protected = "";
1406
1407             # Match the input as far as it goes.
1408             if ($access =~ /^(p[^_]*)_/) {
1409                 $protected = $1;
1410                 if (substr('protected_', 0, length $protected)
1411                     eq $protected)
1412                 {
1413
1414                     # Add 1 for the underscore not included in $protected
1415                     $access = substr($access, length($protected) + 1);
1416                     $protected = '_';
1417                 }
1418                 else {
1419                     $protected = "";
1420                 }
1421             }
1422
1423             if (substr('addable', 0, length $access) eq $access) {
1424                 my $subname = "${package}::${protected}add_$name";
1425                 no strict "refs";
1426
1427                 # add_ accessor.  Don't add if already there, which we
1428                 # determine using 'eq' for scalars and '==' otherwise.
1429                 *$subname = sub {
1430                     use strict "refs";
1431                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1432                     my $self = shift;
1433                     my $value = shift;
1434                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1435                     if (ref $value) {
1436                         return if grep { $value == $_ }
1437                                             @{$field->{main::objaddr $self}};
1438                     }
1439                     else {
1440                         return if grep { $value eq $_ }
1441                                             @{$field->{main::objaddr $self}};
1442                     }
1443                     push @{$field->{main::objaddr $self}}, $value;
1444                     return;
1445                 }
1446             }
1447             elsif (substr('constructor', 0, length $access) eq $access) {
1448                 if ($protected) {
1449                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1450                 }
1451                 else {
1452                     $constructor_fields{$package}{$name} = $field;
1453                 }
1454             }
1455             elsif (substr('readable_array', 0, length $access) eq $access) {
1456
1457                 # Here has read access.  If one of the other parameters for
1458                 # access is array, or this one specifies array (by being more
1459                 # than just 'readable_'), then create a subroutine that
1460                 # assumes the data is an array.  Otherwise just a scalar
1461                 my $subname = "${package}::${protected}$name";
1462                 if (grep { /^a/i } @_
1463                     or length($access) > length('readable_'))
1464                 {
1465                     no strict "refs";
1466                     *$subname = sub {
1467                         use strict "refs";
1468                         my $self = shift;
1469                         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1470                         my $addr = main::objaddr $self;
1471                         if (ref $field->{$addr} ne 'ARRAY') {
1472                             my $type = ref $field->{$addr};
1473                             $type = 'scalar' unless $type;
1474                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1475                             return;
1476                         }
1477                         return scalar @{$field->{$addr}} unless wantarray;
1478
1479                         # Make a copy; had problems with caller modifying the
1480                         # original otherwise
1481                         my @return = @{$field->{$addr}};
1482                         return @return;
1483                     }
1484                 }
1485                 else {
1486
1487                     # Here not an array value, a simpler function.
1488                     no strict "refs";
1489                     *$subname = sub {
1490                         use strict "refs";
1491                         my $self = shift;
1492                         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1493                         return $field->{main::objaddr $self};
1494                     }
1495                 }
1496             }
1497             elsif (substr('settable', 0, length $access) eq $access) {
1498                 my $subname = "${package}::${protected}set_$name";
1499                 no strict "refs";
1500                 *$subname = sub {
1501                     use strict "refs";
1502                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1503                     my $self = shift;
1504                     my $value = shift;
1505                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1506                     $field->{main::objaddr $self} = $value;
1507                     return;
1508                 }
1509             }
1510             else {
1511                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
1512             }
1513         }
1514         return;
1515     }
1516 }
1517
1518 package Input_file;
1519
1520 # All input files use this object, which stores various attributes about them,
1521 # and provides for convenient, uniform handling.  The run method wraps the
1522 # processing.  It handles all the bookkeeping of opening, reading, and closing
1523 # the file, returning only significant input lines.
1524 #
1525 # Each object gets a handler which processes the body of the file, and is
1526 # called by run().  Most should use the generic, default handler, which has
1527 # code scrubbed to handle things you might not expect.  A handler should
1528 # basically be a while(next_line()) {...} loop.
1529 #
1530 # You can also set up handlers to
1531 #   1) call before the first line is read for pre processing
1532 #   2) call to adjust each line of the input before the main handler gets them
1533 #   3) call upon EOF before the main handler exits its loop
1534 #   4) call at the end for post processing
1535 #
1536 # $_ is used to store the input line, and is to be filtered by the
1537 # each_line_handler()s.  So, if the format of the line is not in the desired
1538 # format for the main handler, these are used to do that adjusting.  They can
1539 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1540 # so the $_ output of one is used as the input to the next.  None of the other
1541 # handlers are stackable, but could easily be changed to be so.
1542 #
1543 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1544 # which insert the parameters as lines to be processed before the next input
1545 # file line is read.  This allows the EOF handler to flush buffers, for
1546 # example.  The difference between the two routines is that the lines inserted
1547 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
1548 # called it from such a handler, you would get infinite recursion.)  Lines
1549 # inserted by insert_adjusted_lines() go directly to the main handler without
1550 # any adjustments.  If the  post-processing handler calls any of these, there
1551 # will be no effect.  Some error checking for these conditions could be added,
1552 # but it hasn't been done.
1553 #
1554 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1555 # to prevent further processing of the line.  This routine will output the
1556 # message as a warning once, and then keep a count of the lines that have the
1557 # same message, and output that count at the end of the file's processing.
1558 # This keeps the number of messages down to a manageable amount.
1559 #
1560 # get_missings() should be called to retrieve any @missing input lines.
1561 # Messages will be raised if this isn't done if the options aren't to ignore
1562 # missings.
1563
1564 sub trace { return main::trace(@_); }
1565
1566
1567 { # Closure
1568     # Keep track of fields that are to be put into the constructor.
1569     my %constructor_fields;
1570
1571     main::setup_package(Constructor_Fields => \%constructor_fields);
1572
1573     my %file; # Input file name, required
1574     main::set_access('file', \%file, qw{ c r });
1575
1576     my %first_released; # Unicode version file was first released in, required
1577     main::set_access('first_released', \%first_released, qw{ c r });
1578
1579     my %handler;    # Subroutine to process the input file, defaults to
1580                     # 'process_generic_property_file'
1581     main::set_access('handler', \%handler, qw{ c });
1582
1583     my %property;
1584     # name of property this file is for.  defaults to none, meaning not
1585     # applicable, or is otherwise determinable, for example, from each line.
1586     main::set_access('property', \%property, qw{ c });
1587
1588     my %optional;
1589     # If this is true, the file is optional.  If not present, no warning is
1590     # output.  If it is present, the string given by this parameter is
1591     # evaluated, and if false the file is not processed.
1592     main::set_access('optional', \%optional, 'c', 'r');
1593
1594     my %non_skip;
1595     # This is used for debugging, to skip processing of all but a few input
1596     # files.  Add 'non_skip => 1' to the constructor for those files you want
1597     # processed when you set the $debug_skip global.
1598     main::set_access('non_skip', \%non_skip, 'c');
1599
1600     my %each_line_handler;
1601     # list of subroutines to look at and filter each non-comment line in the
1602     # file.  defaults to none.  The subroutines are called in order, each is
1603     # to adjust $_ for the next one, and the final one adjusts it for
1604     # 'handler'
1605     main::set_access('each_line_handler', \%each_line_handler, 'c');
1606
1607     my %has_missings_defaults;
1608     # ? Are there lines in the file giving default values for code points
1609     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
1610     # the norm, but IGNORED means it has such lines, but the handler doesn't
1611     # use them.  Having these three states allows us to catch changes to the
1612     # UCD that this program should track
1613     main::set_access('has_missings_defaults',
1614                                         \%has_missings_defaults, qw{ c r });
1615
1616     my %pre_handler;
1617     # Subroutine to call before doing anything else in the file.  If undef, no
1618     # such handler is called.
1619     main::set_access('pre_handler', \%pre_handler, qw{ c });
1620
1621     my %eof_handler;
1622     # Subroutine to call upon getting an EOF on the input file, but before
1623     # that is returned to the main handler.  This is to allow buffers to be
1624     # flushed.  The handler is expected to call insert_lines() or
1625     # insert_adjusted() with the buffered material
1626     main::set_access('eof_handler', \%eof_handler, qw{ c r });
1627
1628     my %post_handler;
1629     # Subroutine to call after all the lines of the file are read in and
1630     # processed.  If undef, no such handler is called.
1631     main::set_access('post_handler', \%post_handler, qw{ c });
1632
1633     my %progress_message;
1634     # Message to print to display progress in lieu of the standard one
1635     main::set_access('progress_message', \%progress_message, qw{ c });
1636
1637     my %handle;
1638     # cache open file handle, internal.  Is undef if file hasn't been
1639     # processed at all, empty if has;
1640     main::set_access('handle', \%handle);
1641
1642     my %added_lines;
1643     # cache of lines added virtually to the file, internal
1644     main::set_access('added_lines', \%added_lines);
1645
1646     my %errors;
1647     # cache of errors found, internal
1648     main::set_access('errors', \%errors);
1649
1650     my %missings;
1651     # storage of '@missing' defaults lines
1652     main::set_access('missings', \%missings);
1653
1654     sub new {
1655         my $class = shift;
1656
1657         my $self = bless \do{ my $anonymous_scalar }, $class;
1658         my $addr = main::objaddr($self);
1659
1660         # Set defaults
1661         $handler{$addr} = \&main::process_generic_property_file;
1662         $non_skip{$addr} = 0;
1663         $has_missings_defaults{$addr} = $NO_DEFAULTS;
1664         $handle{$addr} = undef;
1665         $added_lines{$addr} = [ ];
1666         $each_line_handler{$addr} = [ ];
1667         $errors{$addr} = { };
1668         $missings{$addr} = [ ];
1669
1670         # Two positional parameters.
1671         $file{$addr} = main::internal_file_to_platform(shift);
1672         $first_released{$addr} = shift;
1673
1674         # The rest of the arguments are key => value pairs
1675         # %constructor_fields has been set up earlier to list all possible
1676         # ones.  Either set or push, depending on how the default has been set
1677         # up just above.
1678         my %args = @_;
1679         foreach my $key (keys %args) {
1680             my $argument = $args{$key};
1681
1682             # Note that the fields are the lower case of the constructor keys
1683             my $hash = $constructor_fields{lc $key};
1684             if (! defined $hash) {
1685                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
1686                 next;
1687             }
1688             if (ref $hash->{$addr} eq 'ARRAY') {
1689                 if (ref $argument eq 'ARRAY') {
1690                     foreach my $argument (@{$argument}) {
1691                         next if ! defined $argument;
1692                         push @{$hash->{$addr}}, $argument;
1693                     }
1694                 }
1695                 else {
1696                     push @{$hash->{$addr}}, $argument if defined $argument;
1697                 }
1698             }
1699             else {
1700                 $hash->{$addr} = $argument;
1701             }
1702             delete $args{$key};
1703         };
1704
1705         # If the file has a property for it, it means that the property is not
1706         # listed in the file's entries.  So add a handler to the list of line
1707         # handlers to insert the property name into the lines, to provide a
1708         # uniform interface to the final processing subroutine.
1709         # the final code doesn't have to worry about that.
1710         if ($property{$addr}) {
1711             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
1712         }
1713
1714         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
1715             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
1716         }
1717
1718         return $self;
1719     }
1720
1721
1722     use overload
1723         fallback => 0,
1724         qw("") => "_operator_stringify",
1725         "." => \&main::_operator_dot,
1726     ;
1727
1728     sub _operator_stringify {
1729         my $self = shift;
1730
1731         return __PACKAGE__ . " object for " . $self->file;
1732     }
1733
1734     # flag to make sure extracted files are processed early
1735     my $seen_non_extracted_non_age = 0;
1736
1737     sub run {
1738         # Process the input object $self.  This opens and closes the file and
1739         # calls all the handlers for it.  Currently,  this can only be called
1740         # once per file, as it destroy's the EOF handler
1741
1742         my $self = shift;
1743         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1744
1745         my $addr = main::objaddr $self;
1746
1747         my $file = $file{$addr};
1748
1749         # Don't process if not expecting this file (because released later
1750         # than this Unicode version), and isn't there.  This means if someone
1751         # copies it into an earlier version's directory, we will go ahead and
1752         # process it.
1753         return if $first_released{$addr} gt $v_version && ! -e $file;
1754
1755         # If in debugging mode and this file doesn't have the non-skip
1756         # flag set, and isn't one of the critical files, skip it.
1757         if ($debug_skip
1758             && $first_released{$addr} ne v0
1759             && ! $non_skip{$addr})
1760         {
1761             print "Skipping $file in debugging\n" if $verbosity;
1762             return;
1763         }
1764
1765         # File could be optional
1766         if ($optional{$addr}){
1767             return unless -e $file;
1768             my $result = eval $optional{$addr};
1769             if (! defined $result) {
1770                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
1771                 return;
1772             }
1773             if (! $result) {
1774                 if ($verbosity) {
1775                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
1776                 }
1777                 return;
1778             }
1779         }
1780
1781         if (! defined $file || ! -e $file) {
1782
1783             # If the file doesn't exist, see if have internal data for it
1784             # (based on first_released being 0).
1785             if ($first_released{$addr} eq v0) {
1786                 $handle{$addr} = 'pretend_is_open';
1787             }
1788             else {
1789                 if (! $optional{$addr}  # File could be optional
1790                     && $v_version ge $first_released{$addr})
1791                 {
1792                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
1793                 }
1794                 return;
1795             }
1796         }
1797         else {
1798
1799             # Here, the file exists
1800             if ($seen_non_extracted_non_age) {
1801                 if ($file =~ /$EXTRACTED/) {
1802                     Carp::my_carp_bug(join_lines(<<END
1803 $file should be processed just after the 'Prop..Alias' files, and before
1804 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
1805 have subtle problems
1806 END
1807                     ));
1808                 }
1809             }
1810             elsif ($EXTRACTED_DIR
1811                     && $first_released{$addr} ne v0
1812                     && $file !~ /$EXTRACTED/
1813                     && $file ne 'DAge.txt')
1814             {
1815                 # We don't set this (by the 'if' above) if we have no
1816                 # extracted directory, so if running on an early version,
1817                 # this test won't work.  Not worth worrying about.
1818                 $seen_non_extracted_non_age = 1;
1819             }
1820
1821             # And mark the file as having being processed, and warn if it
1822             # isn't a file we are expecting.  As we process the files,
1823             # they are deleted from the hash, so any that remain at the
1824             # end of the program are files that we didn't process.
1825             Carp::my_carp("Was not expecting '$file'.") if
1826                     ! delete $potential_files{File::Spec->rel2abs($file)}
1827                     && ! defined $handle{$addr};
1828
1829             # Open the file, converting the slashes used in this program
1830             # into the proper form for the OS
1831             my $file_handle;
1832             if (not open $file_handle, "<", $file) {
1833                 Carp::my_carp("Can't open $file.  Skipping: $!");
1834                 return 0;
1835             }
1836             $handle{$addr} = $file_handle; # Cache the open file handle
1837         }
1838
1839         if ($verbosity >= $PROGRESS) {
1840             if ($progress_message{$addr}) {
1841                 print "$progress_message{$addr}\n";
1842             }
1843             else {
1844                 # If using a virtual file, say so.
1845                 print "Processing ", (-e $file)
1846                                        ? $file
1847                                        : "substitute $file",
1848                                      "\n";
1849             }
1850         }
1851
1852
1853         # Call any special handler for before the file.
1854         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
1855
1856         # Then the main handler
1857         &{$handler{$addr}}($self);
1858
1859         # Then any special post-file handler.
1860         &{$post_handler{$addr}}($self) if $post_handler{$addr};
1861
1862         # If any errors have been accumulated, output the counts (as the first
1863         # error message in each class was output when it was encountered).
1864         if ($errors{$addr}) {
1865             my $total = 0;
1866             my $types = 0;
1867             foreach my $error (keys %{$errors{$addr}}) {
1868                 $total += $errors{$addr}->{$error};
1869                 delete $errors{$addr}->{$error};
1870                 $types++;
1871             }
1872             if ($total > 1) {
1873                 my $message
1874                         = "A total of $total lines had errors in $file.  ";
1875
1876                 $message .= ($types == 1)
1877                             ? '(Only the first one was displayed.)'
1878                             : '(Only the first of each type was displayed.)';
1879                 Carp::my_carp($message);
1880             }
1881         }
1882
1883         if (@{$missings{$addr}}) {
1884             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
1885         }
1886
1887         # If a real file handle, close it.
1888         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
1889                                                         ref $handle{$addr};
1890         $handle{$addr} = "";   # Uses empty to indicate that has already seen
1891                                # the file, as opposed to undef
1892         return;
1893     }
1894
1895     sub next_line {
1896         # Sets $_ to be the next logical input line, if any.  Returns non-zero
1897         # if such a line exists.  'logical' means that any lines that have
1898         # been added via insert_lines() will be returned in $_ before the file
1899         # is read again.
1900
1901         my $self = shift;
1902         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1903
1904         my $addr = main::objaddr $self;
1905
1906         # Here the file is open (or if the handle is not a ref, is an open
1907         # 'virtual' file).  Get the next line; any inserted lines get priority
1908         # over the file itself.
1909         my $adjusted;
1910
1911         LINE:
1912         while (1) { # Loop until find non-comment, non-empty line
1913             #local $to_trace = 1 if main::DEBUG;
1914             my $inserted_ref = shift @{$added_lines{$addr}};
1915             if (defined $inserted_ref) {
1916                 ($adjusted, $_) = @{$inserted_ref};
1917                 trace $adjusted, $_ if main::DEBUG && $to_trace;
1918                 return 1 if $adjusted;
1919             }
1920             else {
1921                 last if ! ref $handle{$addr}; # Don't read unless is real file
1922                 last if ! defined ($_ = readline $handle{$addr});
1923             }
1924             chomp;
1925             trace $_ if main::DEBUG && $to_trace;
1926
1927             # See if this line is the comment line that defines what property
1928             # value that code points that are not listed in the file should
1929             # have.  The format or existence of these lines is not guaranteed
1930             # by Unicode since they are comments, but the documentation says
1931             # that this was added for machine-readability, so probably won't
1932             # change.  This works starting in Unicode Version 5.0.  They look
1933             # like:
1934             #
1935             # @missing: 0000..10FFFF; Not_Reordered
1936             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
1937             # @missing: 0000..10FFFF; ; NaN
1938             #
1939             # Save the line for a later get_missings() call.
1940             if (/$missing_defaults_prefix/) {
1941                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
1942                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
1943                 }
1944                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
1945                     my @defaults = split /\s* ; \s*/x, $_;
1946
1947                     # The first field is the @missing, which ends in a
1948                     # semi-colon, so can safely shift.
1949                     shift @defaults;
1950
1951                     # Some of these lines may have empty field placeholders
1952                     # which get in the way.  An example is:
1953                     # @missing: 0000..10FFFF; ; NaN
1954                     # Remove them.  Process starting from the top so the
1955                     # splice doesn't affect things still to be looked at.
1956                     for (my $i = @defaults - 1; $i >= 0; $i--) {
1957                         next if $defaults[$i] ne "";
1958                         splice @defaults, $i, 1;
1959                     }
1960
1961                     # What's left should be just the property (maybe) and the
1962                     # default.  Having only one element means it doesn't have
1963                     # the property.
1964                     my $default;
1965                     my $property;
1966                     if (@defaults >= 1) {
1967                         if (@defaults == 1) {
1968                             $default = $defaults[0];
1969                         }
1970                         else {
1971                             $property = $defaults[0];
1972                             $default = $defaults[1];
1973                         }
1974                     }
1975
1976                     if (@defaults < 1
1977                         || @defaults > 2
1978                         || ($default =~ /^</
1979                             && $default !~ /^<code *point>$/i
1980                             && $default !~ /^<none>$/i))
1981                     {
1982                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
1983                     }
1984                     else {
1985
1986                         # If the property is missing from the line, it should
1987                         # be the one for the whole file
1988                         $property = $property{$addr} if ! defined $property;
1989
1990                         # Change <none> to the null string, which is what it
1991                         # really means.  If the default is the code point
1992                         # itself, set it to <code point>, which is what
1993                         # Unicode uses (but sometimes they've forgotten the
1994                         # space)
1995                         if ($default =~ /^<none>$/i) {
1996                             $default = "";
1997                         }
1998                         elsif ($default =~ /^<code *point>$/i) {
1999                             $default = $CODE_POINT;
2000                         }
2001
2002                         # Store them as a sub-arrays with both components.
2003                         push @{$missings{$addr}}, [ $default, $property ];
2004                     }
2005                 }
2006
2007                 # There is nothing for the caller to process on this comment
2008                 # line.
2009                 next;
2010             }
2011
2012             # Remove comments and trailing space, and skip this line if the
2013             # result is empty
2014             s/#.*//;
2015             s/\s+$//;
2016             next if /^$/;
2017
2018             # Call any handlers for this line, and skip further processing of
2019             # the line if the handler sets the line to null.
2020             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2021                 &{$sub_ref}($self);
2022                 next LINE if /^$/;
2023             }
2024
2025             # Here the line is ok.  return success.
2026             return 1;
2027         } # End of looping through lines.
2028
2029         # If there is an EOF handler, call it (only once) and if it generates
2030         # more lines to process go back in the loop to handle them.
2031         if ($eof_handler{$addr}) {
2032             &{$eof_handler{$addr}}($self);
2033             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2034             goto LINE if $added_lines{$addr};
2035         }
2036
2037         # Return failure -- no more lines.
2038         return 0;
2039
2040     }
2041
2042 #   Not currently used, not fully tested.
2043 #    sub peek {
2044 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2045 #        # record.  Not callable from an each_line_handler(), nor does it call
2046 #        # an each_line_handler() on the line.
2047 #
2048 #        my $self = shift;
2049 #        my $addr = main::objaddr $self;
2050 #
2051 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2052 #            my ($adjusted, $line) = @{$inserted_ref};
2053 #            next if $adjusted;
2054 #
2055 #            # Remove comments and trailing space, and return a non-empty
2056 #            # resulting line
2057 #            $line =~ s/#.*//;
2058 #            $line =~ s/\s+$//;
2059 #            return $line if $line ne "";
2060 #        }
2061 #
2062 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2063 #        while (1) { # Loop until find non-comment, non-empty line
2064 #            local $to_trace = 1 if main::DEBUG;
2065 #            trace $_ if main::DEBUG && $to_trace;
2066 #            return if ! defined (my $line = readline $handle{$addr});
2067 #            chomp $line;
2068 #            push @{$added_lines{$addr}}, [ 0, $line ];
2069 #
2070 #            $line =~ s/#.*//;
2071 #            $line =~ s/\s+$//;
2072 #            return $line if $line ne "";
2073 #        }
2074 #
2075 #        return;
2076 #    }
2077
2078
2079     sub insert_lines {
2080         # Lines can be inserted so that it looks like they were in the input
2081         # file at the place it was when this routine is called.  See also
2082         # insert_adjusted_lines().  Lines inserted via this routine go through
2083         # any each_line_handler()
2084
2085         my $self = shift;
2086
2087         # Each inserted line is an array, with the first element being 0 to
2088         # indicate that this line hasn't been adjusted, and needs to be
2089         # processed.
2090         push @{$added_lines{main::objaddr $self}}, map { [ 0, $_ ] } @_;
2091         return;
2092     }
2093
2094     sub insert_adjusted_lines {
2095         # Lines can be inserted so that it looks like they were in the input
2096         # file at the place it was when this routine is called.  See also
2097         # insert_lines().  Lines inserted via this routine are already fully
2098         # adjusted, ready to be processed; each_line_handler()s handlers will
2099         # not be called.  This means this is not a completely general
2100         # facility, as only the last each_line_handler on the stack should
2101         # call this.  It could be made more general, by passing to each of the
2102         # line_handlers their position on the stack, which they would pass on
2103         # to this routine, and that would replace the boolean first element in
2104         # the anonymous array pushed here, so that the next_line routine could
2105         # use that to call only those handlers whose index is after it on the
2106         # stack.  But this is overkill for what is needed now.
2107
2108         my $self = shift;
2109         trace $_[0] if main::DEBUG && $to_trace;
2110
2111         # Each inserted line is an array, with the first element being 1 to
2112         # indicate that this line has been adjusted
2113         push @{$added_lines{main::objaddr $self}}, map { [ 1, $_ ] } @_;
2114         return;
2115     }
2116
2117     sub get_missings {
2118         # Returns the stored up @missings lines' values, and clears the list.
2119         # The values are in an array, consisting of the default in the first
2120         # element, and the property in the 2nd.  However, since these lines
2121         # can be stacked up, the return is an array of all these arrays.
2122
2123         my $self = shift;
2124         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2125
2126         my $addr = main::objaddr $self;
2127
2128         # If not accepting a list return, just return the first one.
2129         return shift @{$missings{$addr}} unless wantarray;
2130
2131         my @return = @{$missings{$addr}};
2132         undef @{$missings{$addr}};
2133         return @return;
2134     }
2135
2136     sub _insert_property_into_line {
2137         # Add a property field to $_, if this file requires it.
2138
2139         my $property = $property{main::objaddr shift};
2140         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2141
2142         $_ =~ s/(;|$)/; $property$1/;
2143         return;
2144     }
2145
2146     sub carp_bad_line {
2147         # Output consistent error messages, using either a generic one, or the
2148         # one given by the optional parameter.  To avoid gazillions of the
2149         # same message in case the syntax of a  file is way off, this routine
2150         # only outputs the first instance of each message, incrementing a
2151         # count so the totals can be output at the end of the file.
2152
2153         my $self = shift;
2154         my $message = shift;
2155         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2156
2157         my $addr = main::objaddr $self;
2158
2159         $message = 'Unexpected line' unless $message;
2160
2161         # No trailing punctuation so as to fit with our addenda.
2162         $message =~ s/[.:;,]$//;
2163
2164         # If haven't seen this exact message before, output it now.  Otherwise
2165         # increment the count of how many times it has occurred
2166         unless ($errors{$addr}->{$message}) {
2167             Carp::my_carp("$message in '$_' in "
2168                             . $file{main::objaddr $self}
2169                             . " at line $..  Skipping this line;");
2170             $errors{$addr}->{$message} = 1;
2171         }
2172         else {
2173             $errors{$addr}->{$message}++;
2174         }
2175
2176         # Clear the line to prevent any further (meaningful) processing of it.
2177         $_ = "";
2178
2179         return;
2180     }
2181 } # End closure
2182
2183 package Multi_Default;
2184
2185 # Certain properties in early versions of Unicode had more than one possible
2186 # default for code points missing from the files.  In these cases, one
2187 # default applies to everything left over after all the others are applied,
2188 # and for each of the others, there is a description of which class of code
2189 # points applies to it.  This object helps implement this by storing the
2190 # defaults, and for all but that final default, an eval string that generates
2191 # the class that it applies to.
2192
2193
2194 {   # Closure
2195
2196     main::setup_package();
2197
2198     my %class_defaults;
2199     # The defaults structure for the classes
2200     main::set_access('class_defaults', \%class_defaults);
2201
2202     my %other_default;
2203     # The default that applies to everything left over.
2204     main::set_access('other_default', \%other_default, 'r');
2205
2206
2207     sub new {
2208         # The constructor is called with default => eval pairs, terminated by
2209         # the left-over default. e.g.
2210         # Multi_Default->new(
2211         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2212         #               -  0x200D',
2213         #        'R' => 'some other expression that evaluates to code points',
2214         #        .
2215         #        .
2216         #        .
2217         #        'U'));
2218
2219         my $class = shift;
2220
2221         my $self = bless \do{my $anonymous_scalar}, $class;
2222         my $addr = main::objaddr($self);
2223
2224         while (@_ > 1) {
2225             my $default = shift;
2226             my $eval = shift;
2227             $class_defaults{$addr}->{$default} = $eval;
2228         }
2229
2230         $other_default{$addr} = shift;
2231
2232         return $self;
2233     }
2234
2235     sub get_next_defaults {
2236         # Iterates and returns the next class of defaults.
2237         my $self = shift;
2238         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2239
2240         my $addr = main::objaddr $self;
2241
2242         return each %{$class_defaults{$addr}};
2243     }
2244 }
2245
2246 package Alias;
2247
2248 # An alias is one of the names that a table goes by.  This class defines them
2249 # including some attributes.  Everything is currently setup in the
2250 # constructor.
2251
2252
2253 {   # Closure
2254
2255     main::setup_package();
2256
2257     my %name;
2258     main::set_access('name', \%name, 'r');
2259
2260     my %loose_match;
2261     # Determined by the constructor code if this name should match loosely or
2262     # not.  The constructor parameters can override this, but it isn't fully
2263     # implemented, as should have ability to override Unicode one's via
2264     # something like a set_loose_match()
2265     main::set_access('loose_match', \%loose_match, 'r');
2266
2267     my %make_pod_entry;
2268     # Some aliases should not get their own entries because they are covered
2269     # by a wild-card, and some we want to discourage use of.  Binary
2270     main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2271
2272     my %status;
2273     # Aliases have a status, like deprecated, or even suppressed (which means
2274     # they don't appear in documentation).  Enum
2275     main::set_access('status', \%status, 'r');
2276
2277     my %externally_ok;
2278     # Similarly, some aliases should not be considered as usable ones for
2279     # external use, such as file names, or we don't want documentation to
2280     # recommend them.  Boolean
2281     main::set_access('externally_ok', \%externally_ok, 'r');
2282
2283     sub new {
2284         my $class = shift;
2285
2286         my $self = bless \do { my $anonymous_scalar }, $class;
2287         my $addr = main::objaddr($self);
2288
2289         $name{$addr} = shift;
2290         $loose_match{$addr} = shift;
2291         $make_pod_entry{$addr} = shift;
2292         $externally_ok{$addr} = shift;
2293         $status{$addr} = shift;
2294
2295         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2296
2297         # Null names are never ok externally
2298         $externally_ok{$addr} = 0 if $name{$addr} eq "";
2299
2300         return $self;
2301     }
2302 }
2303
2304 package Range;
2305
2306 # A range is the basic unit for storing code points, and is described in the
2307 # comments at the beginning of the program.  Each range has a starting code
2308 # point; an ending code point (not less than the starting one); a value
2309 # that applies to every code point in between the two end-points, inclusive;
2310 # and an enum type that applies to the value.  The type is for the user's
2311 # convenience, and has no meaning here, except that a non-zero type is
2312 # considered to not obey the normal Unicode rules for having standard forms.
2313 #
2314 # The same structure is used for both map and match tables, even though in the
2315 # latter, the value (and hence type) is irrelevant and could be used as a
2316 # comment.  In map tables, the value is what all the code points in the range
2317 # map to.  Type 0 values have the standardized version of the value stored as
2318 # well, so as to not have to recalculate it a lot.
2319
2320 sub trace { return main::trace(@_); }
2321
2322 {   # Closure
2323
2324     main::setup_package();
2325
2326     my %start;
2327     main::set_access('start', \%start, 'r', 's');
2328
2329     my %end;
2330     main::set_access('end', \%end, 'r', 's');
2331
2332     my %value;
2333     main::set_access('value', \%value, 'r');
2334
2335     my %type;
2336     main::set_access('type', \%type, 'r');
2337
2338     my %standard_form;
2339     # The value in internal standard form.  Defined only if the type is 0.
2340     main::set_access('standard_form', \%standard_form);
2341
2342     # Note that if these fields change, the dump() method should as well
2343
2344     sub new {
2345         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2346         my $class = shift;
2347
2348         my $self = bless \do { my $anonymous_scalar }, $class;
2349         my $addr = main::objaddr($self);
2350
2351         $start{$addr} = shift;
2352         $end{$addr} = shift;
2353
2354         my %args = @_;
2355
2356         my $value = delete $args{'Value'};  # Can be 0
2357         $value = "" unless defined $value;
2358         $value{$addr} = $value;
2359
2360         $type{$addr} = delete $args{'Type'} || 0;
2361
2362         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2363
2364         if (! $type{$addr}) {
2365             $standard_form{$addr} = main::standardize($value);
2366         }
2367
2368         return $self;
2369     }
2370
2371     use overload
2372         fallback => 0,
2373         qw("") => "_operator_stringify",
2374         "." => \&main::_operator_dot,
2375     ;
2376
2377     sub _operator_stringify {
2378         my $self = shift;
2379         my $addr = main::objaddr $self;
2380
2381         # Output it like '0041..0065 (value)'
2382         my $return = sprintf("%04X", $start{$addr})
2383                         .  '..'
2384                         . sprintf("%04X", $end{$addr});
2385         my $value = $value{$addr};
2386         my $type = $type{$addr};
2387         $return .= ' (';
2388         $return .= "$value";
2389         $return .= ", Type=$type" if $type != 0;
2390         $return .= ')';
2391
2392         return $return;
2393     }
2394
2395     sub standard_form {
2396         # The standard form is the value itself if the standard form is
2397         # undefined (that is if the value is special)
2398
2399         my $self = shift;
2400         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2401
2402         my $addr = main::objaddr $self;
2403
2404         return $standard_form{$addr} if defined $standard_form{$addr};
2405         return $value{$addr};
2406     }
2407
2408     sub dump {
2409         # Human, not machine readable.  For machine readable, comment out this
2410         # entire routine and let the standard one take effect.
2411         my $self = shift;
2412         my $indent = shift;
2413         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2414
2415         my $addr = main::objaddr $self;
2416
2417         my $return = $indent
2418                     . sprintf("%04X", $start{$addr})
2419                     . '..'
2420                     . sprintf("%04X", $end{$addr})
2421                     . " '$value{$addr}';";
2422         if (! defined $standard_form{$addr}) {
2423             $return .= "(type=$type{$addr})";
2424         }
2425         elsif ($standard_form{$addr} ne $value{$addr}) {
2426             $return .= "(standard '$standard_form{$addr}')";
2427         }
2428         return $return;
2429     }
2430 } # End closure
2431
2432 package _Range_List_Base;
2433
2434 # Base class for range lists.  A range list is simply an ordered list of
2435 # ranges, so that the ranges with the lowest starting numbers are first in it.
2436 #
2437 # When a new range is added that is adjacent to an existing range that has the
2438 # same value and type, it merges with it to form a larger range.
2439 #
2440 # Ranges generally do not overlap, except that there can be multiple entries
2441 # of single code point ranges.  This is because of NameAliases.txt.
2442 #
2443 # In this program, there is a standard value such that if two different
2444 # values, have the same standard value, they are considered equivalent.  This
2445 # value was chosen so that it gives correct results on Unicode data
2446
2447 # There are a number of methods to manipulate range lists, and some operators
2448 # are overloaded to handle them.
2449
2450 # Because of the slowness of pure Perl objaddr() on miniperl, and measurements
2451 # showing this package was using a lot of real time calculating that, the code
2452 # was changed to only calculate it once per call stack.  This is done by
2453 # consistently using the package variable $addr in routines, and only calling
2454 # objaddr() if it isn't defined, and setting that to be local, so that callees
2455 # will have it already.  It would be a good thing to change this. XXX
2456
2457 sub trace { return main::trace(@_); }
2458
2459 { # Closure
2460
2461     our $addr;
2462
2463     main::setup_package();
2464
2465     my %ranges;
2466     # The list of ranges
2467     main::set_access('ranges', \%ranges, 'readable_array');
2468
2469     my %max;
2470     # The highest code point in the list.  This was originally a method, but
2471     # actual measurements said it was used a lot.
2472     main::set_access('max', \%max, 'r');
2473
2474     my %each_range_iterator;
2475     # Iterator position for each_range()
2476     main::set_access('each_range_iterator', \%each_range_iterator);
2477
2478     my %owner_name_of;
2479     # Name of parent this is attached to, if any.  Solely for better error
2480     # messages.
2481     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2482
2483     my %_search_ranges_cache;
2484     # A cache of the previous result from _search_ranges(), for better
2485     # performance
2486     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2487
2488     sub new {
2489         my $class = shift;
2490         my %args = @_;
2491
2492         # Optional initialization data for the range list.
2493         my $initialize = delete $args{'Initialize'};
2494
2495         my $self;
2496
2497         # Use _union() to initialize.  _union() returns an object of this
2498         # class, which means that it will call this constructor recursively.
2499         # But it won't have this $initialize parameter so that it won't
2500         # infinitely loop on this.
2501         return _union($class, $initialize, %args) if defined $initialize;
2502
2503         $self = bless \do { my $anonymous_scalar }, $class;
2504         local $addr = main::objaddr($self);
2505
2506         # Optional parent object, only for debug info.
2507         $owner_name_of{$addr} = delete $args{'Owner'};
2508         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2509
2510         # Stringify, in case it is an object.
2511         $owner_name_of{$addr} = "$owner_name_of{$addr}";
2512
2513         # This is used only for error messages, and so a colon is added
2514         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2515
2516         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2517
2518         # Max is initialized to a negative value that isn't adjacent to 0,
2519         # for simpler tests
2520         $max{$addr} = -2;
2521
2522         $_search_ranges_cache{$addr} = 0;
2523         $ranges{$addr} = [];
2524
2525         return $self;
2526     }
2527
2528     use overload
2529         fallback => 0,
2530         qw("") => "_operator_stringify",
2531         "." => \&main::_operator_dot,
2532     ;
2533
2534     sub _operator_stringify {
2535         my $self = shift;
2536         local $addr = main::objaddr($self) if !defined $addr;
2537
2538         return "Range_List attached to '$owner_name_of{$addr}'"
2539                                                 if $owner_name_of{$addr};
2540         return "anonymous Range_List " . \$self;
2541     }
2542
2543     sub _union {
2544         # Returns the union of the input code points.  It can be called as
2545         # either a constructor or a method.  If called as a method, the result
2546         # will be a new() instance of the calling object, containing the union
2547         # of that object with the other parameter's code points;  if called as
2548         # a constructor, the first parameter gives the class the new object
2549         # should be, and the second parameter gives the code points to go into
2550         # it.
2551         # In either case, there are two parameters looked at by this routine;
2552         # any additional parameters are passed to the new() constructor.
2553         #
2554         # The code points can come in the form of some object that contains
2555         # ranges, and has a conventionally named method to access them; or
2556         # they can be an array of individual code points (as integers); or
2557         # just a single code point.
2558         #
2559         # If they are ranges, this routine doesn't make any effort to preserve
2560         # the range values of one input over the other.  Therefore this base
2561         # class should not allow _union to be called from other than
2562         # initialization code, so as to prevent two tables from being added
2563         # together where the range values matter.  The general form of this
2564         # routine therefore belongs in a derived class, but it was moved here
2565         # to avoid duplication of code.  The failure to overload this in this
2566         # class keeps it safe.
2567         #
2568
2569         my $self;
2570         my @args;   # Arguments to pass to the constructor
2571
2572         my $class = shift;
2573
2574         # If a method call, will start the union with the object itself, and
2575         # the class of the new object will be the same as self.
2576         if (ref $class) {
2577             $self = $class;
2578             $class = ref $self;
2579             push @args, $self;
2580         }
2581
2582         # Add the other required parameter.
2583         push @args, shift;
2584         # Rest of parameters are passed on to the constructor
2585
2586         # Accumulate all records from both lists.
2587         my @records;
2588         for my $arg (@args) {
2589             #local $to_trace = 0 if main::DEBUG;
2590             trace "argument = $arg" if main::DEBUG && $to_trace;
2591             if (! defined $arg) {
2592                 my $message = "";
2593                 if (defined $self) {
2594                     $message .= $owner_name_of{main::objaddr $self};
2595                 }
2596                 Carp::my_carp_bug($message .= "Undefined argument to _union.  No union done.");
2597                 return;
2598             }
2599             $arg = [ $arg ] if ! ref $arg;
2600             my $type = ref $arg;
2601             if ($type eq 'ARRAY') {
2602                 foreach my $element (@$arg) {
2603                     push @records, Range->new($element, $element);
2604                 }
2605             }
2606             elsif ($arg->isa('Range')) {
2607                 push @records, $arg;
2608             }
2609             elsif ($arg->can('ranges')) {
2610                 push @records, $arg->ranges;
2611             }
2612             else {
2613                 my $message = "";
2614                 if (defined $self) {
2615                     $message .= $owner_name_of{main::objaddr $self};
2616                 }
2617                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
2618                 return;
2619             }
2620         }
2621
2622         # Sort with the range containing the lowest ordinal first, but if
2623         # two ranges start at the same code point, sort with the bigger range
2624         # of the two first, because it takes fewer cycles.
2625         @records = sort { ($a->start <=> $b->start)
2626                                       or
2627                                     # if b is shorter than a, b->end will be
2628                                     # less than a->end, and we want to select
2629                                     # a, so want to return -1
2630                                     ($b->end <=> $a->end)
2631                                    } @records;
2632
2633         my $new = $class->new(@_);
2634
2635         # Fold in records so long as they add new information.
2636         for my $set (@records) {
2637             my $start = $set->start;
2638             my $end   = $set->end;
2639             my $value   = $set->value;
2640             if ($start > $new->max) {
2641                 $new->_add_delete('+', $start, $end, $value);
2642             }
2643             elsif ($end > $new->max) {
2644                 $new->_add_delete('+', $new->max +1, $end, $value);
2645             }
2646         }
2647
2648         return $new;
2649     }
2650
2651     sub range_count {        # Return the number of ranges in the range list
2652         my $self = shift;
2653         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2654
2655         local $addr = main::objaddr($self) if ! defined $addr;
2656
2657         return scalar @{$ranges{$addr}};
2658     }
2659
2660     sub min {
2661         # Returns the minimum code point currently in the range list, or if
2662         # the range list is empty, 2 beyond the max possible.  This is a
2663         # method because used so rarely, that not worth saving between calls,
2664         # and having to worry about changing it as ranges are added and
2665         # deleted.
2666
2667         my $self = shift;
2668         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2669
2670         local $addr = main::objaddr($self) if ! defined $addr;
2671
2672         # If the range list is empty, return a large value that isn't adjacent
2673         # to any that could be in the range list, for simpler tests
2674         return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2675         return $ranges{$addr}->[0]->start;
2676     }
2677
2678     sub contains {
2679         # Boolean: Is argument in the range list?  If so returns $i such that:
2680         #   range[$i]->end < $codepoint <= range[$i+1]->end
2681         # which is one beyond what you want; this is so that the 0th range
2682         # doesn't return false
2683         my $self = shift;
2684         my $codepoint = shift;
2685         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2686
2687         local $addr = main::objaddr $self if ! defined $addr;
2688
2689         my $i = $self->_search_ranges($codepoint);
2690         return 0 unless defined $i;
2691
2692         # The search returns $i, such that
2693         #   range[$i-1]->end < $codepoint <= range[$i]->end
2694         # So is in the table if and only iff it is at least the start position
2695         # of range $i.
2696         return 0 if $ranges{$addr}->[$i]->start > $codepoint;
2697         return $i + 1;
2698     }
2699
2700     sub value_of {
2701         # Returns the value associated with the code point, undef if none
2702
2703         my $self = shift;
2704         my $codepoint = shift;
2705         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2706
2707         local $addr = main::objaddr $self if ! defined $addr;
2708
2709         my $i = $self->contains($codepoint);
2710         return unless $i;
2711
2712         # contains() returns 1 beyond where we should look
2713         return $ranges{$addr}->[$i-1]->value;
2714     }
2715
2716     sub _search_ranges {
2717         # Find the range in the list which contains a code point, or where it
2718         # should go if were to add it.  That is, it returns $i, such that:
2719         #   range[$i-1]->end < $codepoint <= range[$i]->end
2720         # Returns undef if no such $i is possible (e.g. at end of table), or
2721         # if there is an error.
2722
2723         my $self = shift;
2724         my $code_point = shift;
2725         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2726
2727         local $addr = main::objaddr $self if ! defined $addr;
2728
2729         return if $code_point > $max{$addr};
2730         my $r = $ranges{$addr};                # The current list of ranges
2731         my $range_list_size = scalar @$r;
2732         my $i;
2733
2734         use integer;        # want integer division
2735
2736         # Use the cached result as the starting guess for this one, because,
2737         # an experiment on 5.1 showed that 90% of the time the cache was the
2738         # same as the result on the next call (and 7% it was one less).
2739         $i = $_search_ranges_cache{$addr};
2740         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
2741                                             # from an intervening deletion
2742         #local $to_trace = 1 if main::DEBUG;
2743         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
2744         return $i if $code_point <= $r->[$i]->end
2745                      && ($i == 0 || $r->[$i-1]->end < $code_point);
2746
2747         # Here the cache doesn't yield the correct $i.  Try adding 1.
2748         if ($i < $range_list_size - 1
2749             && $r->[$i]->end < $code_point &&
2750             $code_point <= $r->[$i+1]->end)
2751         {
2752             $i++;
2753             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
2754             $_search_ranges_cache{$addr} = $i;
2755             return $i;
2756         }
2757
2758         # Here, adding 1 also didn't work.  We do a binary search to
2759         # find the correct position, starting with current $i
2760         my $lower = 0;
2761         my $upper = $range_list_size - 1;
2762         while (1) {
2763             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
2764
2765             if ($code_point <= $r->[$i]->end) {
2766
2767                 # Here we have met the upper constraint.  We can quit if we
2768                 # also meet the lower one.
2769                 last if $i == 0 || $r->[$i-1]->end < $code_point;
2770
2771                 $upper = $i;        # Still too high.
2772
2773             }
2774             else {
2775
2776                 # Here, $r[$i]->end < $code_point, so look higher up.
2777                 $lower = $i;
2778             }
2779
2780             # Split search domain in half to try again.
2781             my $temp = ($upper + $lower) / 2;
2782
2783             # No point in continuing unless $i changes for next time
2784             # in the loop.
2785             if ($temp == $i) {
2786
2787                 # We can't reach the highest element because of the averaging.
2788                 # So if one below the upper edge, force it there and try one
2789                 # more time.
2790                 if ($i == $range_list_size - 2) {
2791
2792                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
2793                     $i = $range_list_size - 1;
2794
2795                     # Change $lower as well so if fails next time through,
2796                     # taking the average will yield the same $i, and we will
2797                     # quit with the error message just below.
2798                     $lower = $i;
2799                     next;
2800                 }
2801                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
2802                 return;
2803             }
2804             $i = $temp;
2805         } # End of while loop
2806
2807         if (main::DEBUG && $to_trace) {
2808             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
2809             trace "i=  [ $i ]", $r->[$i];
2810             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
2811         }
2812
2813         # Here we have found the offset.  Cache it as a starting point for the
2814         # next call.
2815         $_search_ranges_cache{$addr} = $i;
2816         return $i;
2817     }
2818
2819     sub _add_delete {
2820         # Add, replace or delete ranges to or from a list.  The $type
2821         # parameter gives which:
2822         #   '+' => insert or replace a range, returning a list of any changed
2823         #          ranges.
2824         #   '-' => delete a range, returning a list of any deleted ranges.
2825         #
2826         # The next three parameters give respectively the start, end, and
2827         # value associated with the range.  'value' should be null unless the
2828         # operation is '+';
2829         #
2830         # The range list is kept sorted so that the range with the lowest
2831         # starting position is first in the list, and generally, adjacent
2832         # ranges with the same values are merged into single larger one (see
2833         # exceptions below).
2834         #
2835         # There are more parameters, all are key => value pairs:
2836         #   Type    gives the type of the value.  It is only valid for '+'.
2837         #           All ranges have types; if this parameter is omitted, 0 is
2838         #           assumed.  Ranges with type 0 are assumed to obey the
2839         #           Unicode rules for casing, etc; ranges with other types are
2840         #           not.  Otherwise, the type is arbitrary, for the caller's
2841         #           convenience, and looked at only by this routine to keep
2842         #           adjacent ranges of different types from being merged into
2843         #           a single larger range, and when Replace =>
2844         #           $IF_NOT_EQUIVALENT is specified (see just below).
2845         #   Replace  determines what to do if the range list already contains
2846         #            ranges which coincide with all or portions of the input
2847         #            range.  It is only valid for '+':
2848         #       => $NO            means that the new value is not to replace
2849         #                         any existing ones, but any empty gaps of the
2850         #                         range list coinciding with the input range
2851         #                         will be filled in with the new value.
2852         #       => $UNCONDITIONALLY  means to replace the existing values with
2853         #                         this one unconditionally.  However, if the
2854         #                         new and old values are identical, the
2855         #                         replacement is skipped to save cycles
2856         #       => $IF_NOT_EQUIVALENT means to replace the existing values
2857         #                         with this one if they are not equivalent.
2858         #                         Ranges are equivalent if their types are the
2859         #                         same, and they are the same string, or if
2860         #                         both are type 0 ranges, if their Unicode
2861         #                         standard forms are identical.  In this last
2862         #                         case, the routine chooses the more "modern"
2863         #                         one to use.  This is because some of the
2864         #                         older files are formatted with values that
2865         #                         are, for example, ALL CAPs, whereas the
2866         #                         derived files have a more modern style,
2867         #                         which looks better.  By looking for this
2868         #                         style when the pre-existing and replacement
2869         #                         standard forms are the same, we can move to
2870         #                         the modern style
2871         #       => $MULTIPLE      means that if this range duplicates an
2872         #                         existing one, but has a different value,
2873         #                         don't replace the existing one, but insert
2874         #                         this, one so that the same range can occur
2875         #                         multiple times.
2876         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
2877         #
2878         # "same value" means identical for type-0 ranges, and it means having
2879         # the same standard forms for non-type-0 ranges.
2880
2881         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
2882
2883         my $self = shift;
2884         my $operation = shift;   # '+' for add/replace; '-' for delete;
2885         my $start = shift;
2886         my $end   = shift;
2887         my $value = shift;
2888
2889         my %args = @_;
2890
2891         $value = "" if not defined $value;        # warning: $value can be "0"
2892
2893         my $replace = delete $args{'Replace'};
2894         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
2895
2896         my $type = delete $args{'Type'};
2897         $type = 0 unless defined $type;
2898
2899         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2900
2901         local $addr = main::objaddr($self) if ! defined $addr;
2902
2903         if ($operation ne '+' && $operation ne '-') {
2904             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
2905             return;
2906         }
2907         unless (defined $start && defined $end) {
2908             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
2909             return;
2910         }
2911         unless ($end >= $start) {
2912             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
2913             return;
2914         }
2915         #local $to_trace = 1 if main::DEBUG;
2916
2917         if ($operation eq '-') {
2918             if ($replace != $IF_NOT_EQUIVALENT) {
2919                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
2920                 $replace = $IF_NOT_EQUIVALENT;
2921             }
2922             if ($type) {
2923                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
2924                 $type = 0;
2925             }
2926             if ($value ne "") {
2927                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
2928                 $value = "";
2929             }
2930         }
2931
2932         my $r = $ranges{$addr};               # The current list of ranges
2933         my $range_list_size = scalar @$r;     # And its size
2934         my $max = $max{$addr};                # The current high code point in
2935                                               # the list of ranges
2936
2937         # Do a special case requiring fewer machine cycles when the new range
2938         # starts after the current highest point.  The Unicode input data is
2939         # structured so this is common.
2940         if ($start > $max) {
2941
2942             trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
2943             return if $operation eq '-'; # Deleting a non-existing range is a
2944                                          # no-op
2945
2946             # If the new range doesn't logically extend the current final one
2947             # in the range list, create a new range at the end of the range
2948             # list.  (max cleverly is initialized to a negative number not
2949             # adjacent to 0 if the range list is empty, so even adding a range
2950             # to an empty range list starting at 0 will have this 'if'
2951             # succeed.)
2952             if ($start > $max + 1        # non-adjacent means can't extend.
2953                 || @{$r}[-1]->value ne $value # values differ, can't extend.
2954                 || @{$r}[-1]->type != $type # types differ, can't extend.
2955             ) {
2956                 push @$r, Range->new($start, $end,
2957                                      Value => $value,
2958                                      Type => $type);
2959             }
2960             else {
2961
2962                 # Here, the new range starts just after the current highest in
2963                 # the range list, and they have the same type and value.
2964                 # Extend the current range to incorporate the new one.
2965                 @{$r}[-1]->set_end($end);
2966             }
2967
2968             # This becomes the new maximum.
2969             $max{$addr} = $end;
2970
2971             return;
2972         }
2973         #local $to_trace = 0 if main::DEBUG;
2974
2975         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
2976
2977         # Here, the input range isn't after the whole rest of the range list.
2978         # Most likely 'splice' will be needed.  The rest of the routine finds
2979         # the needed splice parameters, and if necessary, does the splice.
2980         # First, find the offset parameter needed by the splice function for
2981         # the input range.  Note that the input range may span multiple
2982         # existing ones, but we'll worry about that later.  For now, just find
2983         # the beginning.  If the input range is to be inserted starting in a
2984         # position not currently in the range list, it must (obviously) come
2985         # just after the range below it, and just before the range above it.
2986         # Slightly less obviously, it will occupy the position currently
2987         # occupied by the range that is to come after it.  More formally, we
2988         # are looking for the position, $i, in the array of ranges, such that:
2989         #
2990         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
2991         #
2992         # (The ordered relationships within existing ranges are also shown in
2993         # the equation above).  However, if the start of the input range is
2994         # within an existing range, the splice offset should point to that
2995         # existing range's position in the list; that is $i satisfies a
2996         # somewhat different equation, namely:
2997         #
2998         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
2999         #
3000         # More briefly, $start can come before or after r[$i]->start, and at
3001         # this point, we don't know which it will be.  However, these
3002         # two equations share these constraints:
3003         #
3004         #   r[$i-1]->end < $start <= r[$i]->end
3005         #
3006         # And that is good enough to find $i.
3007
3008         my $i = $self->_search_ranges($start);
3009         if (! defined $i) {
3010             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3011             return;
3012         }
3013
3014         # The search function returns $i such that:
3015         #
3016         # r[$i-1]->end < $start <= r[$i]->end
3017         #
3018         # That means that $i points to the first range in the range list
3019         # that could possibly be affected by this operation.  We still don't
3020         # know if the start of the input range is within r[$i], or if it
3021         # points to empty space between r[$i-1] and r[$i].
3022         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3023
3024         # Special case the insertion of data that is not to replace any
3025         # existing data.
3026         if ($replace == $NO) {  # If $NO, has to be operation '+'
3027             #local $to_trace = 1 if main::DEBUG;
3028             trace "Doesn't replace" if main::DEBUG && $to_trace;
3029
3030             # Here, the new range is to take effect only on those code points
3031             # that aren't already in an existing range.  This can be done by
3032             # looking through the existing range list and finding the gaps in
3033             # the ranges that this new range affects, and then calling this
3034             # function recursively on each of those gaps, leaving untouched
3035             # anything already in the list.  Gather up a list of the changed
3036             # gaps first so that changes to the internal state as new ranges
3037             # are added won't be a problem.
3038             my @gap_list;
3039
3040             # First, if the starting point of the input range is outside an
3041             # existing one, there is a gap from there to the beginning of the
3042             # existing range -- add a span to fill the part that this new
3043             # range occupies
3044             if ($start < $r->[$i]->start) {
3045                 push @gap_list, Range->new($start,
3046                                            main::min($end,
3047                                                      $r->[$i]->start - 1),
3048                                            Type => $type);
3049                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3050             }
3051
3052             # Then look through the range list for other gaps until we reach
3053             # the highest range affected by the input one.
3054             my $j;
3055             for ($j = $i+1; $j < $range_list_size; $j++) {
3056                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3057                 last if $end < $r->[$j]->start;
3058
3059                 # If there is a gap between when this range starts and the
3060                 # previous one ends, add a span to fill it.  Note that just
3061                 # because there are two ranges doesn't mean there is a
3062                 # non-zero gap between them.  It could be that they have
3063                 # different values or types
3064                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3065                     push @gap_list,
3066                         Range->new($r->[$j-1]->end + 1,
3067                                    $r->[$j]->start - 1,
3068                                    Type => $type);
3069                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3070                 }
3071             }
3072
3073             # Here, we have either found an existing range in the range list,
3074             # beyond the area affected by the input one, or we fell off the
3075             # end of the loop because the input range affects the whole rest
3076             # of the range list.  In either case, $j is 1 higher than the
3077             # highest affected range.  If $j == $i, it means that there are no
3078             # affected ranges, that the entire insertion is in the gap between
3079             # r[$i-1], and r[$i], which we already have taken care of before
3080             # the loop.
3081             # On the other hand, if there are affected ranges, it might be
3082             # that there is a gap that needs filling after the final such
3083             # range to the end of the input range
3084             if ($r->[$j-1]->end < $end) {
3085                     push @gap_list, Range->new(main::max($start,
3086                                                          $r->[$j-1]->end + 1),
3087                                                $end,
3088                                                Type => $type);
3089                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3090             }
3091
3092             # Call recursively to fill in all the gaps.
3093             foreach my $gap (@gap_list) {
3094                 $self->_add_delete($operation,
3095                                    $gap->start,
3096                                    $gap->end,
3097                                    $value,
3098                                    Type => $type);
3099             }
3100
3101             return;
3102         }
3103
3104         # Here, we have taken care of the case where $replace is $NO, which
3105         # means that whatever action we now take is done unconditionally.  It
3106         # still could be that this call will result in a no-op, if duplicates
3107         # aren't allowed, and we are inserting a range that merely duplicates
3108         # data already in the range list; or also if deleting a non-existent
3109         # range.
3110         # $i still points to the first potential affected range.  Now find the
3111         # highest range affected, which will determine the length parameter to
3112         # splice.  (The input range can span multiple existing ones.)  While
3113         # we are looking through the range list, see also if this is an
3114         # insertion that will change the values of at least one of the
3115         # affected ranges.  We don't need to do this check unless this is an
3116         # insertion of non-multiples, and also since this is a boolean, we
3117         # don't need to do it if have already determined that it will make a
3118         # change; just unconditionally change them.  $cdm is created to be 1
3119         # if either of these is true. (The 'c' in the name comes from below)
3120         my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
3121         my $j;        # This will point to the highest affected range
3122
3123         # For non-zero types, the standard form is the value itself;
3124         my $standard_form = ($type) ? $value : main::standardize($value);
3125
3126         for ($j = $i; $j < $range_list_size; $j++) {
3127             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3128
3129             # If find a range that it doesn't overlap into, we can stop
3130             # searching
3131             last if $end < $r->[$j]->start;
3132
3133             # Here, overlaps the range at $j.  If the value's don't match,
3134             # and this is supposedly an insertion, it becomes a change
3135             # instead.  This is what the 'c' stands for in $cdm.
3136             if (! $cdm) {
3137                 if ($r->[$j]->standard_form ne $standard_form) {
3138                     $cdm = 1;
3139                 }
3140                 else {
3141
3142                     # Here, the two values are essentially the same.  If the
3143                     # two are actually identical, replacing wouldn't change
3144                     # anything so skip it.
3145                     my $pre_existing = $r->[$j]->value;
3146                     if ($pre_existing ne $value) {
3147
3148                         # Here the new and old standardized values are the
3149                         # same, but the non-standardized values aren't.  If
3150                         # replacing unconditionally, then replace
3151                         if( $replace == $UNCONDITIONALLY) {
3152                             $cdm = 1;
3153                         }
3154                         else {
3155
3156                             # Here, are replacing conditionally.  Decide to
3157                             # replace or not based on which appears to look
3158                             # the "nicest".  If one is mixed case and the
3159                             # other isn't, choose the mixed case one.
3160                             my $new_mixed = $value =~ /[A-Z]/
3161                                             && $value =~ /[a-z]/;
3162                             my $old_mixed = $pre_existing =~ /[A-Z]/
3163                                             && $pre_existing =~ /[a-z]/;
3164
3165                             if ($old_mixed != $new_mixed) {
3166                                 $cdm = 1 if $new_mixed;
3167                                 if (main::DEBUG && $to_trace) {
3168                                     if ($cdm) {
3169                                         trace "Replacing $pre_existing with $value";
3170                                     }
3171                                     else {
3172                                         trace "Retaining $pre_existing over $value";
3173                                     }
3174                                 }
3175                             }
3176                             else {
3177
3178                                 # Here casing wasn't different between the two.
3179                                 # If one has hyphens or underscores and the
3180                                 # other doesn't, choose the one with the
3181                                 # punctuation.
3182                                 my $new_punct = $value =~ /[-_]/;
3183                                 my $old_punct = $pre_existing =~ /[-_]/;
3184
3185                                 if ($old_punct != $new_punct) {
3186                                     $cdm = 1 if $new_punct;
3187                                     if (main::DEBUG && $to_trace) {
3188                                         if ($cdm) {
3189                                             trace "Replacing $pre_existing with $value";
3190                                         }
3191                                         else {
3192                                             trace "Retaining $pre_existing over $value";
3193                                         }
3194                                     }
3195                                 }   # else existing one is just as "good";
3196                                     # retain it to save cycles.
3197                             }
3198                         }
3199                     }
3200                 }
3201             }
3202         } # End of loop looking for highest affected range.
3203
3204         # Here, $j points to one beyond the highest range that this insertion
3205         # affects (hence to beyond the range list if that range is the final
3206         # one in the range list).
3207
3208         # The splice length is all the affected ranges.  Get it before
3209         # subtracting, for efficiency, so we don't have to later add 1.
3210         my $length = $j - $i;
3211
3212         $j--;        # $j now points to the highest affected range.
3213         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3214
3215         # If inserting a multiple record, this is where it goes, after all the
3216         # existing ones for this range.  This implies an insertion, and no
3217         # change to any existing ranges.  Note that $j can be -1 if this new
3218         # range doesn't actually duplicate any existing, and comes at the
3219         # beginning of the list, in which case we can handle it like any other
3220         # insertion, and is easier to do so.
3221         if ($replace == $MULTIPLE && $j >= 0) {
3222
3223             # This restriction could be remedied with a little extra work, but
3224             # it won't hopefully ever be necessary
3225             if ($r->[$j]->start != $r->[$j]->end) {
3226                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point.  No action taken.");
3227                 return;
3228             }
3229
3230             # Don't add an exact duplicate, as it isn't really a multiple
3231             return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
3232
3233             trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
3234             my @return = splice @$r,
3235                                 $j+1,
3236                                 0,
3237                                 Range->new($start,
3238                                            $end,
3239                                            Value => $value,
3240                                            Type => $type);
3241             if (main::DEBUG && $to_trace) {
3242                 trace "After splice:";
3243                 trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
3244                 trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
3245                 trace "j  =[", $j, "]", $r->[$j] if $j >= 0;
3246                 trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
3247                 trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
3248                 trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
3249             }
3250             return @return;
3251         }
3252
3253         # Here, have taken care of $NO and $MULTIPLE replaces.
3254         # $j points to the highest affected range.  But it can be < $i or even
3255         # -1.  These happen only if the insertion is entirely in the gap
3256         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
3257         # above exited first time through with $end < $r->[$i]->start.  (And
3258         # then we subtracted one from j)  This implies also that $start <
3259         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3260         # $start, so the entire input range is in the gap.
3261         if ($j < $i) {
3262
3263             # Here the entire input range is in the gap before $i.
3264
3265             if (main::DEBUG && $to_trace) {
3266                 if ($i) {
3267                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
3268                 }
3269                 else {
3270                     trace "Entire range is before $r->[$i]";
3271                 }
3272             }
3273             return if $operation ne '+'; # Deletion of a non-existent range is
3274                                          # a no-op
3275         }
3276         else {
3277
3278             # Here the entire input range is not in the gap before $i.  There
3279             # is an affected one, and $j points to the highest such one.
3280
3281             # At this point, here is the situation:
3282             # This is not an insertion of a multiple, nor of tentative ($NO)
3283             # data.
3284             #   $i  points to the first element in the current range list that
3285             #            may be affected by this operation.  In fact, we know
3286             #            that the range at $i is affected because we are in
3287             #            the else branch of this 'if'
3288             #   $j  points to the highest affected range.
3289             # In other words,
3290             #   r[$i-1]->end < $start <= r[$i]->end
3291             # And:
3292             #   r[$i-1]->end < $start <= $end <= r[$j]->end
3293             #
3294             # Also:
3295             #   $cdm is a boolean which is set true if and only if this is a
3296             #        change or deletion (multiple was handled above).  In
3297             #        other words, it could be renamed to be just $cd.
3298
3299             # We now have enough information to decide if this call is a no-op
3300             # or not.  It is a no-op if it is a deletion of a non-existent
3301             # range, or an insertion of already existing data.
3302
3303             if (main::DEBUG && $to_trace && ! $cdm
3304                                          && $i == $j
3305                                          && $start >= $r->[$i]->start)
3306             {
3307                     trace "no-op";
3308             }
3309             return if ! $cdm      # change or delete => not no-op
3310                       && $i == $j # more than one affected range => not no-op
3311
3312                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3313                       # Further, $start and/or $end is >= r[$i]->start
3314                       # The test below hence guarantees that
3315                       #     r[$i]->start < $start <= $end <= r[$i]->end
3316                       # This means the input range is contained entirely in
3317                       # the one at $i, so is a no-op
3318                       && $start >= $r->[$i]->start;
3319         }
3320
3321         # Here, we know that some action will have to be taken.  We have
3322         # calculated the offset and length (though adjustments may be needed)
3323         # for the splice.  Now start constructing the replacement list.
3324         my @replacement;
3325         my $splice_start = $i;
3326
3327         my $extends_below;
3328         my $extends_above;
3329
3330         # See if should extend any adjacent ranges.
3331         if ($operation eq '-') { # Don't extend deletions
3332             $extends_below = $extends_above = 0;
3333         }
3334         else {  # Here, should extend any adjacent ranges.  See if there are
3335                 # any.
3336             $extends_below = ($i > 0
3337                             # can't extend unless adjacent
3338                             && $r->[$i-1]->end == $start -1
3339                             # can't extend unless are same standard value
3340                             && $r->[$i-1]->standard_form eq $standard_form
3341                             # can't extend unless share type
3342                             && $r->[$i-1]->type == $type);
3343             $extends_above = ($j+1 < $range_list_size
3344                             && $r->[$j+1]->start == $end +1
3345                             && $r->[$j+1]->standard_form eq $standard_form
3346                             && $r->[$j-1]->type == $type);
3347         }
3348         if ($extends_below && $extends_above) { # Adds to both
3349             $splice_start--;     # start replace at element below
3350             $length += 2;        # will replace on both sides
3351             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3352
3353             # The result will fill in any gap, replacing both sides, and
3354             # create one large range.
3355             @replacement = Range->new($r->[$i-1]->start,
3356                                       $r->[$j+1]->end,
3357                                       Value => $value,
3358                                       Type => $type);
3359         }
3360         else {
3361
3362             # Here we know that the result won't just be the conglomeration of
3363             # a new range with both its adjacent neighbors.  But it could
3364             # extend one of them.
3365
3366             if ($extends_below) {
3367
3368                 # Here the new element adds to the one below, but not to the
3369                 # one above.  If inserting, and only to that one range,  can
3370                 # just change its ending to include the new one.
3371                 if ($length == 0 && ! $cdm) {
3372                     $r->[$i-1]->set_end($end);
3373                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3374                     return;
3375                 }
3376                 else {
3377                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3378                     $splice_start--;        # start replace at element below
3379                     $length++;              # will replace the element below
3380                     $start = $r->[$i-1]->start;
3381                 }
3382             }
3383             elsif ($extends_above) {
3384
3385                 # Here the new element adds to the one above, but not below.
3386                 # Mirror the code above
3387                 if ($length == 0 && ! $cdm) {
3388                     $r->[$j+1]->set_start($start);
3389                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3390                     return;
3391                 }
3392                 else {
3393                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3394                     $length++;        # will replace the element above
3395                     $end = $r->[$j+1]->end;
3396                 }
3397             }
3398
3399             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3400
3401             # Finally, here we know there will have to be a splice.
3402             # If the change or delete affects only the highest portion of the
3403             # first affected range, the range will have to be split.  The
3404             # splice will remove the whole range, but will replace it by a new
3405             # range containing just the unaffected part.  So, in this case,
3406             # add to the replacement list just this unaffected portion.
3407             if (! $extends_below
3408                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3409             {
3410                 push @replacement,
3411                     Range->new($r->[$i]->start,
3412                                $start - 1,
3413                                Value => $r->[$i]->value,
3414                                Type => $r->[$i]->type);
3415             }
3416
3417             # In the case of an insert or change, but not a delete, we have to
3418             # put in the new stuff;  this comes next.
3419             if ($operation eq '+') {
3420                 push @replacement, Range->new($start,
3421                                               $end,
3422                                               Value => $value,
3423                                               Type => $type);
3424             }
3425
3426             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3427             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3428
3429             # And finally, if we're changing or deleting only a portion of the
3430             # highest affected range, it must be split, as the lowest one was.
3431             if (! $extends_above
3432                 && $j >= 0  # Remember that j can be -1 if before first
3433                             # current element
3434                 && $end >= $r->[$j]->start
3435                 && $end < $r->[$j]->end)
3436             {
3437                 push @replacement,
3438                     Range->new($end + 1,
3439                                $r->[$j]->end,
3440                                Value => $r->[$j]->value,
3441                                Type => $r->[$j]->type);
3442             }
3443         }
3444
3445         # And do the splice, as calculated above
3446         if (main::DEBUG && $to_trace) {
3447             trace "replacing $length element(s) at $i with ";
3448             foreach my $replacement (@replacement) {
3449                 trace "    $replacement";
3450             }
3451             trace "Before splice:";
3452             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3453             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3454             trace "i  =[", $i, "]", $r->[$i];
3455             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3456             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3457         }
3458
3459         my @return = splice @$r, $splice_start, $length, @replacement;
3460
3461         if (main::DEBUG && $to_trace) {
3462             trace "After splice:";
3463             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3464             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3465             trace "i  =[", $i, "]", $r->[$i];
3466             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3467             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3468             trace "removed @return";
3469         }
3470
3471         # An actual deletion could have changed the maximum in the list.
3472         # There was no deletion if the splice didn't return something, but
3473         # otherwise recalculate it.  This is done too rarely to worry about
3474         # performance.
3475         if ($operation eq '-' && @return) {
3476             $max{$addr} = $r->[-1]->end;
3477         }
3478         return @return;
3479     }
3480
3481     sub reset_each_range {  # reset the iterator for each_range();
3482         my $self = shift;
3483         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3484
3485         local $addr = main::objaddr $self if ! defined $addr;
3486
3487         undef $each_range_iterator{$addr};
3488         return;
3489     }
3490
3491     sub each_range {
3492         # Iterate over each range in a range list.  Results are undefined if
3493         # the range list is changed during the iteration.
3494
3495         my $self = shift;
3496         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3497
3498         local $addr = main::objaddr($self) if ! defined $addr;
3499
3500         return if $self->is_empty;
3501
3502         $each_range_iterator{$addr} = -1
3503                                 if ! defined $each_range_iterator{$addr};
3504         $each_range_iterator{$addr}++;
3505         return $ranges{$addr}->[$each_range_iterator{$addr}]
3506                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
3507         undef $each_range_iterator{$addr};
3508         return;
3509     }
3510
3511     sub count {        # Returns count of code points in range list
3512         my $self = shift;
3513         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3514
3515         local $addr = main::objaddr($self) if ! defined $addr;
3516
3517         my $count = 0;
3518         foreach my $range (@{$ranges{$addr}}) {
3519             $count += $range->end - $range->start + 1;
3520         }
3521         return $count;
3522     }
3523
3524     sub delete_range {    # Delete a range
3525         my $self = shift;
3526         my $start = shift;
3527         my $end = shift;
3528
3529         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3530
3531         return $self->_add_delete('-', $start, $end, "");
3532     }
3533
3534     sub is_empty { # Returns boolean as to if a range list is empty
3535         my $self = shift;
3536         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3537
3538         local $addr = main::objaddr($self) if ! defined $addr;
3539         return scalar @{$ranges{$addr}} == 0;
3540     }
3541
3542     sub hash {
3543         # Quickly returns a scalar suitable for separating tables into
3544         # buckets, i.e. it is a hash function of the contents of a table, so
3545         # there are relatively few conflicts.
3546
3547         my $self = shift;
3548         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3549
3550         local $addr = main::objaddr($self) if ! defined $addr;
3551
3552         # These are quickly computable.  Return looks like 'min..max;count'
3553         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3554     }
3555 } # End closure for _Range_List_Base
3556
3557 package Range_List;
3558 use base '_Range_List_Base';
3559
3560 # A Range_List is a range list for match tables; i.e. the range values are
3561 # not significant.  Thus a number of operations can be safely added to it,
3562 # such as inversion, intersection.  Note that union is also an unsafe
3563 # operation when range values are cared about, and that method is in the base
3564 # class, not here.  But things are set up so that that method is callable only
3565 # during initialization.  Only in this derived class, is there an operation
3566 # that combines two tables.  A Range_Map can thus be used to initialize a
3567 # Range_List, and its mappings will be in the list, but are not significant to
3568 # this class.
3569
3570 sub trace { return main::trace(@_); }
3571
3572 { # Closure
3573
3574     use overload
3575         fallback => 0,
3576         '+' => sub { my $self = shift;
3577                     my $other = shift;
3578
3579                     return $self->_union($other)
3580                 },
3581         '&' => sub { my $self = shift;
3582                     my $other = shift;
3583
3584                     return $self->_intersect($other, 0);
3585                 },
3586         '~' => "_invert",
3587         '-' => "_subtract",
3588     ;
3589
3590     sub _invert {
3591         # Returns a new Range_List that gives all code points not in $self.
3592
3593         my $self = shift;
3594
3595         my $new = Range_List->new;
3596
3597         # Go through each range in the table, finding the gaps between them
3598         my $max = -1;   # Set so no gap before range beginning at 0
3599         for my $range ($self->ranges) {
3600             my $start = $range->start;
3601             my $end   = $range->end;
3602
3603             # If there is a gap before this range, the inverse will contain
3604             # that gap.
3605             if ($start > $max + 1) {
3606                 $new->add_range($max + 1, $start - 1);
3607             }
3608             $max = $end;
3609         }
3610
3611         # And finally, add the gap from the end of the table to the max
3612         # possible code point
3613         if ($max < $LAST_UNICODE_CODEPOINT) {
3614             $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3615         }
3616         return $new;
3617     }
3618
3619     sub _subtract {
3620         # Returns a new Range_List with the argument deleted from it.  The
3621         # argument can be a single code point, a range, or something that has
3622         # a range, with the _range_list() method on it returning them
3623
3624         my $self = shift;
3625         my $other = shift;
3626         my $reversed = shift;
3627         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3628
3629         if ($reversed) {
3630             Carp::my_carp_bug("Can't cope with a "
3631              .  __PACKAGE__
3632              . " being the second parameter in a '-'.  Subtraction ignored.");
3633             return $self;
3634         }
3635
3636         my $new = Range_List->new(Initialize => $self);
3637
3638         if (! ref $other) { # Single code point
3639             $new->delete_range($other, $other);
3640         }
3641         elsif ($other->isa('Range')) {
3642             $new->delete_range($other->start, $other->end);
3643         }
3644         elsif ($other->can('_range_list')) {
3645             foreach my $range ($other->_range_list->ranges) {
3646                 $new->delete_range($range->start, $range->end);
3647             }
3648         }
3649         else {
3650             Carp::my_carp_bug("Can't cope with a "
3651                         . ref($other)
3652                         . " argument to '-'.  Subtraction ignored."
3653                         );
3654             return $self;
3655         }
3656
3657         return $new;
3658     }
3659
3660     sub _intersect {
3661         # Returns either a boolean giving whether the two inputs' range lists
3662         # intersect (overlap), or a new Range_List containing the intersection
3663         # of the two lists.  The optional final parameter being true indicates
3664         # to do the check instead of the intersection.
3665
3666         my $a_object = shift;
3667         my $b_object = shift;
3668         my $check_if_overlapping = shift;
3669         $check_if_overlapping = 0 unless defined $check_if_overlapping;
3670         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3671
3672         if (! defined $b_object) {
3673             my $message = "";
3674             $message .= $a_object->_owner_name_of if defined $a_object;
3675             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
3676             return;
3677         }
3678
3679         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
3680         # Thus the intersection could be much more simply be written:
3681         #   return ~(~$a_object + ~$b_object);
3682         # But, this is slower, and when taking the inverse of a large
3683         # range_size_1 table, back when such tables were always stored that
3684         # way, it became prohibitively slow, hence the code was changed to the
3685         # below
3686
3687         if ($b_object->isa('Range')) {
3688             $b_object = Range_List->new(Initialize => $b_object,
3689                                         Owner => $a_object->_owner_name_of);
3690         }
3691         $b_object = $b_object->_range_list if $b_object->can('_range_list');
3692
3693         my @a_ranges = $a_object->ranges;
3694         my @b_ranges = $b_object->ranges;
3695
3696         #local $to_trace = 1 if main::DEBUG;
3697         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
3698
3699         # Start with the first range in each list
3700         my $a_i = 0;
3701         my $range_a = $a_ranges[$a_i];
3702         my $b_i = 0;
3703         my $range_b = $b_ranges[$b_i];
3704
3705         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
3706                                                 if ! $check_if_overlapping;
3707
3708         # If either list is empty, there is no intersection and no overlap
3709         if (! defined $range_a || ! defined $range_b) {
3710             return $check_if_overlapping ? 0 : $new;
3711         }
3712         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3713
3714         # Otherwise, must calculate the intersection/overlap.  Start with the
3715         # very first code point in each list
3716         my $a = $range_a->start;
3717         my $b = $range_b->start;
3718
3719         # Loop through all the ranges of each list; in each iteration, $a and
3720         # $b are the current code points in their respective lists
3721         while (1) {
3722
3723             # If $a and $b are the same code point, ...
3724             if ($a == $b) {
3725
3726                 # it means the lists overlap.  If just checking for overlap
3727                 # know the answer now,
3728                 return 1 if $check_if_overlapping;
3729
3730                 # The intersection includes this code point plus anything else
3731                 # common to both current ranges.
3732                 my $start = $a;
3733                 my $end = main::min($range_a->end, $range_b->end);
3734                 if (! $check_if_overlapping) {
3735                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
3736                     $new->add_range($start, $end);
3737                 }
3738
3739                 # Skip ahead to the end of the current intersect
3740                 $a = $b = $end;
3741
3742                 # If the current intersect ends at the end of either range (as
3743                 # it must for at least one of them), the next possible one
3744                 # will be the beginning code point in it's list's next range.
3745                 if ($a == $range_a->end) {
3746                     $range_a = $a_ranges[++$a_i];
3747                     last unless defined $range_a;
3748                     $a = $range_a->start;
3749                 }
3750                 if ($b == $range_b->end) {
3751                     $range_b = $b_ranges[++$b_i];
3752                     last unless defined $range_b;
3753                     $b = $range_b->start;
3754                 }
3755
3756                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3757             }
3758             elsif ($a < $b) {
3759
3760                 # Not equal, but if the range containing $a encompasses $b,
3761                 # change $a to be the middle of the range where it does equal
3762                 # $b, so the next iteration will get the intersection
3763                 if ($range_a->end >= $b) {
3764                     $a = $b;
3765                 }
3766                 else {
3767
3768                     # Here, the current range containing $a is entirely below
3769                     # $b.  Go try to find a range that could contain $b.
3770                     $a_i = $a_object->_search_ranges($b);
3771
3772                     # If no range found, quit.
3773                     last unless defined $a_i;
3774
3775                     # The search returns $a_i, such that
3776                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
3777                     # Set $a to the beginning of this new range, and repeat.
3778                     $range_a = $a_ranges[$a_i];
3779                     $a = $range_a->start;
3780                 }
3781             }
3782             else { # Here, $b < $a.
3783
3784                 # Mirror image code to the leg just above
3785                 if ($range_b->end >= $a) {
3786                     $b = $a;
3787                 }
3788                 else {
3789                     $b_i = $b_object->_search_ranges($a);
3790                     last unless defined $b_i;
3791                     $range_b = $b_ranges[$b_i];
3792                     $b = $range_b->start;
3793                 }
3794             }
3795         } # End of looping through ranges.
3796
3797         # Intersection fully computed, or now know that there is no overlap
3798         return $check_if_overlapping ? 0 : $new;
3799     }
3800
3801     sub overlaps {
3802         # Returns boolean giving whether the two arguments overlap somewhere
3803
3804         my $self = shift;
3805         my $other = shift;
3806         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3807
3808         return $self->_intersect($other, 1);
3809     }
3810
3811     sub add_range {
3812         # Add a range to the list.
3813
3814         my $self = shift;
3815         my $start = shift;
3816         my $end = shift;
3817         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3818
3819         return $self->_add_delete('+', $start, $end, "");
3820     }
3821
3822     my $non_ASCII = (ord('A') == 65);   # Assumes test on same platform
3823
3824     sub is_code_point_usable {
3825         # This used only for making the test script.  See if the input
3826         # proposed trial code point is one that Perl will handle.  If second
3827         # parameter is 0, it won't select some code points for various
3828         # reasons, noted below.
3829
3830         my $code = shift;
3831         my $try_hard = shift;
3832         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3833
3834         return 0 if $code < 0;                # Never use a negative
3835
3836         # For non-ASCII, we shun the characters that don't have Perl encoding-
3837         # independent symbols for them.  'A' is such a symbol, so is "\n".
3838         # Note, this program hopefully will work on 5.8 Perls, and \v is not
3839         # such a symbol in them.
3840         return $try_hard if $non_ASCII
3841                             && $code <= 0xFF
3842                             && ($code >= 0x7F
3843                                 || ($code >= 0x0E && $code <= 0x1F)
3844                                 || ($code >= 0x01 && $code <= 0x06)
3845                                 || $code == 0x0B);  # \v introduced after 5.8
3846
3847         # shun null.  I'm (khw) not sure why this was done, but NULL would be
3848         # the character very frequently used.
3849         return $try_hard if $code == 0x0000;
3850
3851         return 0 if $try_hard;  # XXX Temporary until fix utf8.c
3852
3853         # shun non-character code points.
3854         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
3855         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
3856
3857         return $try_hard if $code > $LAST_UNICODE_CODEPOINT;   # keep in range
3858         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
3859
3860         return 1;
3861     }
3862
3863     sub get_valid_code_point {
3864         # Return a code point that's part of the range list.  Returns nothing
3865         # if the table is empty or we can't find a suitable code point.  This
3866         # used only for making the test script.
3867
3868         my $self = shift;
3869         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3870
3871         my $addr = main::objaddr($self);
3872
3873         # On first pass, don't choose less desirable code points; if no good
3874         # one is found, repeat, allowing a less desirable one to be selected.
3875         for my $try_hard (0, 1) {
3876
3877             # Look through all the ranges for a usable code point.
3878             for my $set ($self->ranges) {
3879
3880                 # Try the edge cases first, starting with the end point of the
3881                 # range.
3882                 my $end = $set->end;
3883                 return $end if is_code_point_usable($end, $try_hard);
3884
3885                 # End point didn't, work.  Start at the beginning and try
3886                 # every one until find one that does work.
3887                 for my $trial ($set->start .. $end - 1) {
3888                     return $trial if is_code_point_usable($trial, $try_hard);
3889                 }
3890             }
3891         }
3892         return ();  # If none found, give up.
3893     }
3894
3895     sub get_invalid_code_point {
3896         # Return a code point that's not part of the table.  Returns nothing
3897         # if the table covers all code points or a suitable code point can't
3898         # be found.  This used only for making the test script.
3899
3900         my $self = shift;
3901         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3902
3903         # Just find a valid code point of the inverse, if any.
3904         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
3905     }
3906 } # end closure for Range_List
3907
3908 package Range_Map;
3909 use base '_Range_List_Base';
3910
3911 # A Range_Map is a range list in which the range values (called maps) are
3912 # significant, and hence shouldn't be manipulated by our other code, which
3913 # could be ambiguous or lose things.  For example, in taking the union of two
3914 # lists, which share code points, but which have differing values, which one
3915 # has precedence in the union?
3916 # It turns out that these operations aren't really necessary for map tables,
3917 # and so this class was created to make sure they aren't accidentally
3918 # applied to them.
3919
3920 { # Closure
3921
3922     sub add_map {
3923         # Add a range containing a mapping value to the list
3924
3925         my $self = shift;
3926         # Rest of parameters passed on
3927
3928         return $self->_add_delete('+', @_);
3929     }
3930
3931     sub add_duplicate {
3932         # Adds entry to a range list which can duplicate an existing entry
3933
3934         my $self = shift;
3935         my $code_point = shift;
3936         my $value = shift;
3937         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3938
3939         return $self->add_map($code_point, $code_point,
3940                                 $value, Replace => $MULTIPLE);
3941     }
3942 } # End of closure for package Range_Map
3943
3944 package _Base_Table;
3945
3946 # A table is the basic data structure that gets written out into a file for
3947 # use by the Perl core.  This is the abstract base class implementing the
3948 # common elements from the derived ones.  A list of the methods to be
3949 # furnished by an implementing class is just after the constructor.
3950
3951 sub standardize { return main::standardize($_[0]); }
3952 sub trace { return main::trace(@_); }
3953
3954 { # Closure
3955
3956     main::setup_package();
3957
3958     my %range_list;
3959     # Object containing the ranges of the table.
3960     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
3961
3962     my %full_name;
3963     # The full table name.
3964     main::set_access('full_name', \%full_name, 'r');
3965
3966     my %name;
3967     # The table name, almost always shorter
3968     main::set_access('name', \%name, 'r');
3969
3970     my %short_name;
3971     # The shortest of all the aliases for this table, with underscores removed
3972     main::set_access('short_name', \%short_name);
3973
3974     my %nominal_short_name_length;
3975     # The length of short_name before removing underscores
3976     main::set_access('nominal_short_name_length',
3977                     \%nominal_short_name_length);
3978
3979     my %property;
3980     # Parent property this table is attached to.
3981     main::set_access('property', \%property, 'r');
3982
3983     my %aliases;
3984     # Ordered list of aliases of the table's name.  The first ones in the list
3985     # are output first in comments
3986     main::set_access('aliases', \%aliases, 'readable_array');
3987
3988     my %comment;
3989     # A comment associated with the table for human readers of the files
3990     main::set_access('comment', \%comment, 's');
3991
3992     my %description;
3993     # A comment giving a short description of the table's meaning for human
3994     # readers of the files.
3995     main::set_access('description', \%description, 'readable_array');
3996
3997     my %note;
3998     # A comment giving a short note about the table for human readers of the
3999     # files.
4000     main::set_access('note', \%note, 'readable_array');
4001
4002     my %internal_only;
4003     # Boolean; if set means any file that contains this table is marked as for
4004     # internal-only use.
4005     main::set_access('internal_only', \%internal_only);
4006
4007     my %find_table_from_alias;
4008     # The parent property passes this pointer to a hash which this class adds
4009     # all its aliases to, so that the parent can quickly take an alias and
4010     # find this table.
4011     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4012
4013     my %locked;
4014     # After this table is made equivalent to another one; we shouldn't go
4015     # changing the contents because that could mean it's no longer equivalent
4016     main::set_access('locked', \%locked, 'r');
4017
4018     my %file_path;
4019     # This gives the final path to the file containing the table.  Each
4020     # directory in the path is an element in the array
4021     main::set_access('file_path', \%file_path, 'readable_array');
4022
4023     my %status;
4024     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4025     main::set_access('status', \%status, 'r');
4026
4027     my %status_info;
4028     # A comment about its being obsolete, or whatever non normal status it has
4029     main::set_access('status_info', \%status_info, 'r');
4030
4031     my %range_size_1;
4032     # Is the table to be output with each range only a single code point?
4033     # This is done to avoid breaking existing code that may have come to rely
4034     # on this behavior in previous versions of this program.)
4035     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4036
4037     my %perl_extension;
4038     # A boolean set iff this table is a Perl extension to the Unicode
4039     # standard.
4040     main::set_access('perl_extension', \%perl_extension, 'r');
4041
4042     sub new {
4043         # All arguments are key => value pairs, which you can see below, most
4044         # of which match fields documented above.  Otherwise: Pod_Entry,
4045         # Externally_Ok, and Fuzzy apply to the names of the table, and are
4046         # documented in the Alias package
4047
4048         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4049
4050         my $class = shift;
4051
4052         my $self = bless \do { my $anonymous_scalar }, $class;
4053         my $addr = main::objaddr($self);
4054
4055         my %args = @_;
4056
4057         $name{$addr} = delete $args{'Name'};
4058         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4059         $full_name{$addr} = delete $args{'Full_Name'};
4060         $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
4061         $perl_extension{$addr} = delete $args{'Perl_Extension'} || 0;
4062         $property{$addr} = delete $args{'_Property'};
4063         $range_list{$addr} = delete $args{'_Range_List'};
4064         $status{$addr} = delete $args{'Status'} || $NORMAL;
4065         $status_info{$addr} = delete $args{'_Status_Info'} || "";
4066         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4067
4068         my $description = delete $args{'Description'};
4069         my $externally_ok = delete $args{'Externally_Ok'};
4070         my $loose_match = delete $args{'Fuzzy'};
4071         my $note = delete $args{'Note'};
4072         my $make_pod_entry = delete $args{'Pod_Entry'};
4073
4074         # Shouldn't have any left over
4075         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4076
4077         # Can't use || above because conceivably the name could be 0, and
4078         # can't use // operator in case this program gets used in Perl 5.8
4079         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4080
4081         $aliases{$addr} = [ ];
4082         $comment{$addr} = [ ];
4083         $description{$addr} = [ ];
4084         $note{$addr} = [ ];
4085         $file_path{$addr} = [ ];
4086         $locked{$addr} = "";
4087
4088         push @{$description{$addr}}, $description if $description;
4089         push @{$note{$addr}}, $note if $note;
4090
4091         # If hasn't set its status already, see if it is on one of the lists
4092         # of properties or tables that have particular statuses; if not, is
4093         # normal.  The lists are prioritized so the most serious ones are
4094         # checked first
4095         my $complete_name = $self->complete_name;
4096         if (! $status{$addr}) {
4097             if (exists $why_suppressed{$complete_name}) {
4098                 $status{$addr} = $SUPPRESSED;
4099             }
4100             elsif (exists $why_deprecated{$complete_name}) {
4101                 $status{$addr} = $DEPRECATED;
4102             }
4103             elsif (exists $why_stabilized{$complete_name}) {
4104                 $status{$addr} = $STABILIZED;
4105             }
4106             elsif (exists $why_obsolete{$complete_name}) {
4107                 $status{$addr} = $OBSOLETE;
4108             }
4109
4110             # Existence above doesn't necessarily mean there is a message
4111             # associated with it.  Use the most serious message.
4112             if ($status{$addr}) {
4113                 if ($why_suppressed{$complete_name}) {
4114                     $status_info{$addr}
4115                                 = $why_suppressed{$complete_name};
4116                 }
4117                 elsif ($why_deprecated{$complete_name}) {
4118                     $status_info{$addr}
4119                                 = $why_deprecated{$complete_name};
4120                 }
4121                 elsif ($why_stabilized{$complete_name}) {
4122                     $status_info{$addr}
4123                                 = $why_stabilized{$complete_name};
4124                 }
4125                 elsif ($why_obsolete{$complete_name}) {
4126                     $status_info{$addr}
4127                                 = $why_obsolete{$complete_name};
4128                 }
4129             }
4130         }
4131
4132         # By convention what typically gets printed only or first is what's
4133         # first in the list, so put the full name there for good output
4134         # clarity.  Other routines rely on the full name being first on the
4135         # list
4136         $self->add_alias($full_name{$addr},
4137                             Externally_Ok => $externally_ok,
4138                             Fuzzy => $loose_match,
4139                             Pod_Entry => $make_pod_entry,
4140                             Status => $status{$addr},
4141                             );
4142
4143         # Then comes the other name, if meaningfully different.
4144         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4145             $self->add_alias($name{$addr},
4146                             Externally_Ok => $externally_ok,
4147                             Fuzzy => $loose_match,
4148                             Pod_Entry => $make_pod_entry,
4149                             Status => $status{$addr},
4150                             );
4151         }
4152
4153         return $self;
4154     }
4155
4156     # Here are the methods that are required to be defined by any derived
4157     # class
4158     for my $sub qw(
4159                     append_to_body
4160                     complete_name
4161                     pre_body
4162                 )
4163                 # append_to_body and pre_body are called in the write() method
4164                 # to add stuff after the main body of the table, but before
4165                 # its close; and to prepend stuff before the beginning of the
4166                 # table.
4167                 # complete_name returns the complete name of the property and
4168                 # table, like Script=Latin
4169     {
4170         no strict "refs";
4171         *$sub = sub {
4172             Carp::my_carp_bug( __LINE__
4173                               . ": Must create method '$sub()' for "
4174                               . ref shift);
4175             return;
4176         }
4177     }
4178
4179     use overload
4180         fallback => 0,
4181         "." => \&main::_operator_dot,
4182         '!=' => \&main::_operator_not_equal,
4183         '==' => \&main::_operator_equal,
4184     ;
4185
4186     sub ranges {
4187         # Returns the array of ranges associated with this table.
4188
4189         return $range_list{main::objaddr shift}->ranges;
4190     }
4191
4192     sub add_alias {
4193         # Add a synonym for this table.
4194
4195         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4196
4197         my $self = shift;
4198         my $name = shift;       # The name to add.
4199         my $pointer = shift;    # What the alias hash should point to.  For
4200                                 # map tables, this is the parent property;
4201                                 # for match tables, it is the table itself.
4202
4203         my %args = @_;
4204         my $loose_match = delete $args{'Fuzzy'};
4205
4206         my $make_pod_entry = delete $args{'Pod_Entry'};
4207         $make_pod_entry = $YES unless defined $make_pod_entry;
4208
4209         my $externally_ok = delete $args{'Externally_Ok'};
4210         $externally_ok = 1 unless defined $externally_ok;
4211
4212         my $status = delete $args{'Status'};
4213         $status = $NORMAL unless defined $status;
4214
4215         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4216
4217         # Capitalize the first letter of the alias unless it is one of the CJK
4218         # ones which specifically begins with a lower 'k'.  Do this because
4219         # Unicode has varied whether they capitalize first letters or not, and
4220         # have later changed their minds and capitalized them, but not the
4221         # other way around.  So do it always and avoid changes from release to
4222         # release
4223         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4224
4225         my $addr = main::objaddr $self;
4226
4227         # Figure out if should be loosely matched if not already specified.
4228         if (! defined $loose_match) {
4229
4230             # Is a loose_match if isn't null, and doesn't begin with an
4231             # underscore and isn't just a number
4232             if ($name ne ""
4233                 && substr($name, 0, 1) ne '_'
4234                 && $name !~ qr{^[0-9_.+-/]+$})
4235             {
4236                 $loose_match = 1;
4237             }
4238             else {
4239                 $loose_match = 0;
4240             }
4241         }
4242
4243         # If this alias has already been defined, do nothing.
4244         return if defined $find_table_from_alias{$addr}->{$name};
4245
4246         # That includes if it is standardly equivalent to an existing alias,
4247         # in which case, add this name to the list, so won't have to search
4248         # for it again.
4249         my $standard_name = main::standardize($name);
4250         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4251             $find_table_from_alias{$addr}->{$name}
4252                         = $find_table_from_alias{$addr}->{$standard_name};
4253             return;
4254         }
4255
4256         # Set the index hash for this alias for future quick reference.
4257         $find_table_from_alias{$addr}->{$name} = $pointer;
4258         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4259         local $to_trace = 0 if main::DEBUG;
4260         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4261         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4262
4263
4264         # Put the new alias at the end of the list of aliases unless the final
4265         # element begins with an underscore (meaning it is for internal perl
4266         # use) or is all numeric, in which case, put the new one before that
4267         # one.  This floats any all-numeric or underscore-beginning aliases to
4268         # the end.  This is done so that they are listed last in output lists,
4269         # to encourage the user to use a better name (either more descriptive
4270         # or not an internal-only one) instead.  This ordering is relied on
4271         # implicitly elsewhere in this program, like in short_name()
4272         my $list = $aliases{$addr};
4273         my $insert_position = (@$list == 0
4274                                 || (substr($list->[-1]->name, 0, 1) ne '_'
4275                                     && $list->[-1]->name =~ /\D/))
4276                             ? @$list
4277                             : @$list - 1;
4278         splice @$list,
4279                 $insert_position,
4280                 0,
4281                 Alias->new($name, $loose_match, $make_pod_entry,
4282                                                     $externally_ok, $status);
4283
4284         # This name may be shorter than any existing ones, so clear the cache
4285         # of the shortest, so will have to be recalculated.
4286         undef $short_name{main::objaddr $self};
4287         return;
4288     }
4289
4290     sub short_name {
4291         # Returns a name suitable for use as the base part of a file name.
4292         # That is, shorter wins.  It can return undef if there is no suitable
4293         # name.  The name has all non-essential underscores removed.
4294
4295         # The optional second parameter is a reference to a scalar in which
4296         # this routine will store the length the returned name had before the
4297         # underscores were removed, or undef if the return is undef.
4298
4299         # The shortest name can change if new aliases are added.  So using
4300         # this should be deferred until after all these are added.  The code
4301         # that does that should clear this one's cache.
4302         # Any name with alphabetics is preferred over an all numeric one, even
4303         # if longer.
4304
4305         my $self = shift;
4306         my $nominal_length_ptr = shift;
4307         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4308
4309         my $addr = main::objaddr $self;
4310
4311         # For efficiency, don't recalculate, but this means that adding new
4312         # aliases could change what the shortest is, so the code that does
4313         # that needs to undef this.
4314         if (defined $short_name{$addr}) {
4315             if ($nominal_length_ptr) {
4316                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4317             }
4318             return $short_name{$addr};
4319         }
4320
4321         # Look at each alias
4322         foreach my $alias ($self->aliases()) {
4323
4324             # Don't use an alias that isn't ok to use for an external name.
4325             next if ! $alias->externally_ok;
4326
4327             my $name = main::Standardize($alias->name);
4328             trace $self, $name if main::DEBUG && $to_trace;
4329
4330             # Take the first one, or a shorter one that isn't numeric.  This
4331             # relies on numeric aliases always being last in the array
4332             # returned by aliases().  Any alpha one will have precedence.
4333             if (! defined $short_name{$addr}
4334                 || ($name =~ /\D/
4335                     && length($name) < length($short_name{$addr})))
4336             {
4337                 # Remove interior underscores.
4338                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4339
4340                 $nominal_short_name_length{$addr} = length $name;
4341             }
4342         }
4343
4344         # If no suitable external name return undef
4345         if (! defined $short_name{$addr}) {
4346             $$nominal_length_ptr = undef if $nominal_length_ptr;
4347             return;
4348         }
4349
4350         # Don't allow a null external name.
4351         if ($short_name{$addr} eq "") {
4352             $short_name{$addr} = '_';
4353             $nominal_short_name_length{$addr} = 1;
4354         }
4355
4356         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4357
4358         if ($nominal_length_ptr) {
4359             $$nominal_length_ptr = $nominal_short_name_length{$addr};
4360         }
4361         return $short_name{$addr};
4362     }
4363
4364     sub external_name {
4365         # Returns the external name that this table should be known by.  This
4366         # is usually the short_name, but not if the short_name is undefined.
4367
4368         my $self = shift;
4369         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4370
4371         my $short = $self->short_name;
4372         return $short if defined $short;
4373
4374         return '_';
4375     }
4376
4377     sub add_description { # Adds the parameter as a short description.
4378
4379         my $self = shift;
4380         my $description = shift;
4381         chomp $description;
4382         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4383
4384         push @{$description{main::objaddr $self}}, $description;
4385
4386         return;
4387     }
4388
4389     sub add_note { # Adds the parameter as a short note.
4390
4391         my $self = shift;
4392         my $note = shift;
4393         chomp $note;
4394         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4395
4396         push @{$note{main::objaddr $self}}, $note;
4397
4398         return;
4399     }
4400
4401     sub add_comment { # Adds the parameter as a comment.
4402
4403         my $self = shift;
4404         my $comment = shift;
4405         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4406
4407         chomp $comment;
4408         push @{$comment{main::objaddr $self}}, $comment;
4409
4410         return;
4411     }
4412
4413     sub comment {
4414         # Return the current comment for this table.  If called in list
4415         # context, returns the array of comments.  In scalar, returns a string
4416         # of each element joined together with a period ending each.
4417
4418         my $self = shift;
4419         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4420
4421         my @list = @{$comment{main::objaddr $self}};
4422         return @list if wantarray;
4423         my $return = "";
4424         foreach my $sentence (@list) {
4425             $return .= '.  ' if $return;
4426             $return .= $sentence;
4427             $return =~ s/\.$//;
4428         }
4429         $return .= '.' if $return;
4430         return $return;
4431     }
4432
4433     sub initialize {
4434         # Initialize the table with the argument which is any valid
4435         # initialization for range lists.
4436
4437         my $self = shift;
4438         my $initialization = shift;
4439         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4440
4441         # Replace the current range list with a new one of the same exact
4442         # type.
4443         my $class = ref $range_list{main::objaddr $self};
4444         $range_list{main::objaddr $self} = $class->new(Owner => $self,
4445                                         Initialize => $initialization);
4446         return;
4447
4448     }
4449
4450     sub header {
4451         # The header that is output for the table in the file it is written
4452         # in.
4453
4454         my $self = shift;
4455         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4456
4457         my $return = "";
4458         $return .= $DEVELOPMENT_ONLY if $compare_versions;
4459         $return .= $HEADER;
4460         $return .= $INTERNAL_ONLY if $internal_only{main::objaddr $self};
4461         return $return;
4462     }
4463
4464     sub write {
4465         # Write a representation of the table to its file.
4466
4467         my $self = shift;
4468         my $tab_stops = shift;       # The number of tab stops over to put any
4469                                      # comment.
4470         my $suppress_value = shift;  # Optional, if the value associated with
4471                                      # a range equals this one, don't write
4472                                      # the range
4473         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4474
4475         my $addr = main::objaddr($self);
4476
4477         # Start with the header
4478         my @OUT = $self->header;
4479
4480         # Then the comments
4481         push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4482                                                         if $comment{$addr};
4483
4484         # Then any pre-body stuff.
4485         my $pre_body = $self->pre_body;
4486         push @OUT, $pre_body, "\n" if $pre_body;
4487
4488         # The main body looks like a 'here' document
4489         push @OUT, "return <<'END';\n";
4490
4491         if ($range_list{$addr}->is_empty) {
4492
4493             # This is a kludge for empty tables to silence a warning in
4494             # utf8.c, which can't really deal with empty tables, but it can
4495             # deal with a table that matches nothing, as the inverse of 'Any'
4496             # does.
4497             push @OUT, "!utf8::IsAny\n";
4498         }
4499         else {
4500             my $range_size_1 = $range_size_1{$addr};
4501
4502             # Output each range as part of the here document.
4503             for my $set ($range_list{$addr}->ranges) {
4504                 my $start = $set->start;
4505                 my $end   = $set->end;
4506                 my $value  = $set->value;
4507
4508                 # Don't output ranges whose value is the one to suppress
4509                 next if defined $suppress_value && $value eq $suppress_value;
4510
4511                 # If has or wants a single point range output
4512                 if ($start == $end || $range_size_1) {
4513                     for my $i ($start .. $end) {
4514                         push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
4515                     }
4516                 }
4517                 else  {
4518                     push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
4519
4520                     # Add a comment with the size of the range, if requested.
4521                     # Expand Tabs to make sure they all start in the same
4522                     # column, and then unexpand to use mostly tabs.
4523                     if (! $output_range_counts) {
4524                         $OUT[-1] .= "\n";
4525                     }
4526                     else {
4527                         $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4528                         my $count = main::clarify_number($end - $start + 1);
4529                         use integer;
4530
4531                         my $width = $tab_stops * 8 - 1;
4532                         $OUT[-1] = sprintf("%-*s # [%s]\n",
4533                                             $width,
4534                                             $OUT[-1],
4535                                             $count);
4536                         $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4537                     }
4538                 }
4539             } # End of loop through all the table's ranges
4540         }
4541
4542         # Add anything that goes after the main body, but within the here
4543         # document,
4544         my $append_to_body = $self->append_to_body;
4545         push @OUT, $append_to_body if $append_to_body;
4546
4547         # And finish the here document.
4548         push @OUT, "END\n";
4549
4550         # All these files have a .pl suffix
4551         $file_path{$addr}->[-1] .= '.pl';
4552
4553         main::write($file_path{$addr}, \@OUT);
4554         return;
4555     }
4556
4557     sub set_status {    # Set the table's status
4558         my $self = shift;
4559         my $status = shift; # The status enum value
4560         my $info = shift;   # Any message associated with it.
4561         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4562
4563         my $addr = main::objaddr($self);
4564
4565         $status{$addr} = $status;
4566         $status_info{$addr} = $info;
4567         return;
4568     }
4569
4570     sub lock {
4571         # Don't allow changes to the table from now on.  This stores a stack
4572         # trace of where it was called, so that later attempts to modify it
4573         # can immediately show where it got locked.
4574
4575         my $self = shift;
4576         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4577
4578         my $addr = main::objaddr $self;
4579
4580         $locked{$addr} = "";
4581
4582         my $line = (caller(0))[2];
4583         my $i = 1;
4584
4585         # Accumulate the stack trace
4586         while (1) {
4587             my ($pkg, $file, $caller_line, $caller) = caller $i++;
4588
4589             last unless defined $caller;
4590
4591             $locked{$addr} .= "    called from $caller() at line $line\n";
4592             $line = $caller_line;
4593         }
4594         $locked{$addr} .= "    called from main at line $line\n";
4595
4596         return;
4597     }
4598
4599     sub carp_if_locked {
4600         # Return whether a table is locked or not, and, by the way, complain
4601         # if is locked
4602
4603         my $self = shift;
4604         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4605
4606         my $addr = main::objaddr $self;
4607
4608         return 0 if ! $locked{$addr};
4609         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
4610         return 1;
4611     }
4612
4613     sub set_file_path { # Set the final directory path for this table
4614         my $self = shift;
4615         # Rest of parameters passed on
4616
4617         @{$file_path{main::objaddr $self}} = @_;
4618         return
4619     }
4620
4621     # Accessors for the range list stored in this table.  First for
4622     # unconditional
4623     for my $sub qw(
4624                     contains
4625                     count
4626                     each_range
4627                     hash
4628                     is_empty
4629                     max
4630                     min
4631                     range_count
4632                     reset_each_range
4633                     value_of
4634                 )
4635     {
4636         no strict "refs";
4637         *$sub = sub {
4638             use strict "refs";
4639             my $self = shift;
4640             return $range_list{main::objaddr $self}->$sub(@_);
4641         }
4642     }
4643
4644     # Then for ones that should fail if locked
4645     for my $sub qw(
4646                     delete_range
4647                 )
4648     {
4649         no strict "refs";
4650         *$sub = sub {
4651             use strict "refs";
4652             my $self = shift;
4653
4654             return if $self->carp_if_locked;
4655             return $range_list{main::objaddr $self}->$sub(@_);
4656         }
4657     }
4658
4659 } # End closure
4660
4661 package Map_Table;
4662 use base '_Base_Table';
4663
4664 # A Map Table is a table that contains the mappings from code points to
4665 # values.  There are two weird cases:
4666 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
4667 #    are written in the table's file at the end of the table nonetheless.  It
4668 #    requires specially constructed code to handle these; utf8.c can not read
4669 #    these in, so they should not go in $map_directory.  As of this writing,
4670 #    the only case that these happen is for named sequences used in
4671 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
4672 #    something else could come along that uses it.
4673 # 2) Specials are anything that doesn't fit syntactically into the body of the
4674 #    table.  The ranges for these have a map type of non-zero.  The code below
4675 #    knows about and handles each possible type.   In most cases, these are
4676 #    written as part of the header.
4677 #
4678 # A map table deliberately can't be manipulated at will unlike match tables.
4679 # This is because of the ambiguities having to do with what to do with
4680 # overlapping code points.  And there just isn't a need for those things;
4681 # what one wants to do is just query, add, replace, or delete mappings, plus
4682 # write the final result.
4683 # However, there is a method to get the list of possible ranges that aren't in
4684 # this table to use for defaulting missing code point mappings.  And,
4685 # map_add_or_replace_non_nulls() does allow one to add another table to this
4686 # one, but it is clearly very specialized, and defined that the other's
4687 # non-null values replace this one's if there is any overlap.
4688
4689 sub trace { return main::trace(@_); }
4690
4691 { # Closure
4692
4693     main::setup_package();
4694
4695     my %default_map;
4696     # Many input files omit some entries; this gives what the mapping for the
4697     # missing entries should be
4698     main::set_access('default_map', \%default_map, 'r');
4699
4700     my %anomalous_entries;
4701     # Things that go in the body of the table which don't fit the normal
4702     # scheme of things, like having a range.  Not much can be done with these
4703     # once there except to output them.  This was created to handle named
4704     # sequences.
4705     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
4706     main::set_access('anomalous_entries',       # Append singular, read plural
4707                     \%anomalous_entries,
4708                     'readable_array');
4709
4710     my %format;
4711     # The format of the entries of the table.  This is calculated from the
4712     # data in the table (or passed in the constructor).  This is an enum e.g.,
4713     # $STRING_FORMAT
4714     main::set_access('format', \%format);
4715
4716     my %core_access;
4717     # This is a string, solely for documentation, indicating how one can get
4718     # access to this property via the Perl core.
4719     main::set_access('core_access', \%core_access, 'r', 's');
4720
4721     my %has_specials;
4722     # Boolean set when non-zero map-type ranges are added to this table,
4723     # which happens in only a few tables.  This is purely for performance, to
4724     # avoid having to search through every table upon output, so if all the
4725     # non-zero maps got deleted before output, this would remain set, and the
4726     # only penalty would be performance.  Currently, most map tables that get
4727     # output have specials in them, so this doesn't help that much anyway.
4728     main::set_access('has_specials', \%has_specials);
4729
4730     my %to_output_map;
4731     # Boolean as to whether or not to write out this map table
4732     main::set_access('to_output_map', \%to_output_map, 's');
4733
4734
4735     sub new {
4736         my $class = shift;
4737         my $name = shift;
4738
4739         my %args = @_;
4740
4741         # Optional initialization data for the table.
4742         my $initialize = delete $args{'Initialize'};
4743
4744         my $core_access = delete $args{'Core_Access'};
4745         my $default_map = delete $args{'Default_Map'};
4746         my $format = delete $args{'Format'};
4747         my $property = delete $args{'_Property'};
4748         # Rest of parameters passed on
4749
4750         my $range_list = Range_Map->new(Owner => $property);
4751
4752         my $self = $class->SUPER::new(
4753                                     Name => $name,
4754                                     _Property => $property,
4755                                     _Range_List => $range_list,
4756                                     %args);
4757
4758         my $addr = main::objaddr $self;
4759
4760         $anomalous_entries{$addr} = [];
4761         $core_access{$addr} = $core_access;
4762         $default_map{$addr} = $default_map;
4763         $format{$addr} = $format;
4764
4765         $self->initialize($initialize) if defined $initialize;
4766
4767         return $self;
4768     }
4769
4770     use overload
4771         fallback => 0,
4772         qw("") => "_operator_stringify",
4773     ;
4774
4775     sub _operator_stringify {
4776         my $self = shift;
4777
4778         my $name = $self->property->full_name;
4779         $name = '""' if $name eq "";
4780         return "Map table for Property '$name'";
4781     }
4782
4783     sub complete_name {
4784         # The complete name for a map table is just its full name, as that
4785         # completely identifies the property it represents
4786
4787         return shift->full_name;
4788     }
4789
4790     sub add_alias {
4791         # Add a synonym for this table (which means the property itself)
4792         my $self = shift;
4793         my $name = shift;
4794         # Rest of parameters passed on.
4795
4796         $self->SUPER::add_alias($name, $self->property, @_);
4797         return;
4798     }
4799
4800     sub add_map {
4801         # Add a range of code points to the list of specially-handled code
4802         # points.  $MULTI_CP is assumed if the type of special is not passed
4803         # in.
4804
4805         my $self = shift;
4806         my $lower = shift;
4807         my $upper = shift;
4808         my $string = shift;
4809         my %args = @_;
4810
4811         my $type = delete $args{'Type'} || 0;
4812         # Rest of parameters passed on
4813
4814         # Can't change the table if locked.
4815         return if $self->carp_if_locked;
4816
4817         my $addr = main::objaddr $self;
4818
4819         $has_specials{$addr} = 1 if $type;
4820
4821         $self->_range_list->add_map($lower, $upper,
4822                                     $string,
4823                                     @_,
4824                                     Type => $type);
4825         return;
4826     }
4827
4828     sub append_to_body {
4829         # Adds to the written HERE document of the table's body any anomalous
4830         # entries in the table..
4831
4832         my $self = shift;
4833         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4834
4835         my $addr = main::objaddr $self;
4836
4837         return "" unless @{$anomalous_entries{$addr}};
4838         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
4839     }
4840
4841     sub map_add_or_replace_non_nulls {
4842         # This adds the mappings in the table $other to $self.  Non-null
4843         # mappings from $other override those in $self.  It essentially merges
4844         # the two tables, with the second having priority except for null
4845         # mappings.
4846
4847         my $self = shift;
4848         my $other = shift;
4849         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4850
4851         return if $self->carp_if_locked;
4852
4853         if (! $other->isa(__PACKAGE__)) {
4854             Carp::my_carp_bug("$other should be a "
4855                         . __PACKAGE__
4856                         . ".  Not a '"
4857                         . ref($other)
4858                         . "'.  Not added;");
4859             return;
4860         }
4861
4862         my $addr = main::objaddr $self;
4863         my $other_addr = main::objaddr $other;
4864
4865         local $to_trace = 0 if main::DEBUG;
4866
4867         my $self_range_list = $self->_range_list;
4868         my $other_range_list = $other->_range_list;
4869         foreach my $range ($other_range_list->ranges) {
4870             my $value = $range->value;
4871             next if $value eq "";
4872             $self_range_list->_add_delete('+',
4873                                           $range->start,
4874                                           $range->end,
4875                                           $value,
4876                                           Type => $range->type,
4877                                           Replace => $UNCONDITIONALLY);
4878         }
4879
4880         # Copy the specials information from the other table to $self
4881         if ($has_specials{$other_addr}) {
4882             $has_specials{$addr} = 1;
4883         }
4884
4885         return;
4886     }
4887
4888     sub set_default_map {
4889         # Define what code points that are missing from the input files should
4890         # map to
4891
4892         my $self = shift;
4893         my $map = shift;
4894         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4895
4896         my $addr = main::objaddr $self;
4897
4898         # Convert the input to the standard equivalent, if any (won't have any
4899         # for $STRING properties)
4900         my $standard = $self->_find_table_from_alias->{$map};
4901         $map = $standard->name if defined $standard;
4902
4903         # Warn if there already is a non-equivalent default map for this
4904         # property.  Note that a default map can be a ref, which means that
4905         # what it actually means is delayed until later in the program, and it
4906         # IS permissible to override it here without a message.
4907         my $default_map = $default_map{$addr};
4908         if (defined $default_map
4909             && ! ref($default_map)
4910             && $default_map ne $map
4911             && main::Standardize($map) ne $default_map)
4912         {
4913             my $property = $self->property;
4914             my $map_table = $property->table($map);
4915             my $default_table = $property->table($default_map);
4916             if (defined $map_table
4917                 && defined $default_table
4918                 && $map_table != $default_table)
4919             {
4920                 Carp::my_carp("Changing the default mapping for "
4921                             . $property
4922                             . " from $default_map to $map'");
4923             }
4924         }
4925
4926         $default_map{$addr} = $map;
4927
4928         # Don't also create any missing table for this map at this point,
4929         # because if we did, it could get done before the main table add is
4930         # done for PropValueAliases.txt; instead the caller will have to make
4931         # sure it exists, if desired.
4932         return;
4933     }
4934
4935     sub to_output_map {
4936         # Returns boolean: should we write this map table?
4937
4938         my $self = shift;
4939         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4940
4941         my $addr = main::objaddr $self;
4942
4943         # If overridden, use that
4944         return $to_output_map{$addr} if defined $to_output_map{$addr};
4945
4946         my $full_name = $self->full_name;
4947
4948         # If table says to output, do so; if says to suppress it, do do.
4949         return 1 if grep { $_ eq $full_name } @output_mapped_properties;
4950         return 0 if $self->status eq $SUPPRESSED;
4951
4952         my $type = $self->property->type;
4953
4954         # Don't want to output binary map tables even for debugging.
4955         return 0 if $type == $BINARY;
4956
4957         # But do want to output string ones.
4958         return 1 if $type == $STRING;
4959
4960         # Otherwise is an $ENUM, don't output it
4961         return 0;
4962     }
4963
4964     sub inverse_list {
4965         # Returns a Range_List that is gaps of the current table.  That is,
4966         # the inversion
4967
4968         my $self = shift;
4969         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4970
4971         my $current = Range_List->new(Initialize => $self->_range_list,
4972                                 Owner => $self->property);
4973         return ~ $current;
4974     }
4975
4976     sub set_final_comment {
4977         # Just before output, create the comment that heads the file
4978         # containing this table.
4979
4980         my $self = shift;
4981         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4982
4983         # No sense generating a comment if aren't going to write it out.
4984         return if ! $self->to_output_map;
4985
4986         my $addr = main::objaddr $self;
4987
4988         my $property = $self->property;
4989
4990         # Get all the possible names for this property.  Don't use any that
4991         # aren't ok for use in a file name, etc.  This is perhaps causing that
4992         # flag to do double duty, and may have to be changed in the future to
4993         # have our own flag for just this purpose; but it works now to exclude
4994         # Perl generated synonyms from the lists for properties, where the
4995         # name is always the proper Unicode one.
4996         my @property_aliases = grep { $_->externally_ok } $self->aliases;
4997
4998         my $count = $self->count;
4999         my $default_map = $default_map{$addr};
5000
5001         # The ranges that map to the default aren't output, so subtract that
5002         # to get those actually output.  A property with matching tables
5003         # already has the information calculated.
5004         if ($property->type != $STRING) {
5005             $count -= $property->table($default_map)->count;
5006         }
5007         elsif (defined $default_map) {
5008
5009             # But for $STRING properties, must calculate now.  Subtract the
5010             # count from each range that maps to the default.
5011             foreach my $range ($self->_range_list->ranges) {
5012         local $to_trace = 1 if main::DEBUG;
5013         trace $self, $range;
5014                 if ($range->value eq $default_map) {
5015                     $count -= $range->end +1 - $range->start;
5016                 }
5017             }
5018
5019         }
5020
5021         # Get a  string version of $count with underscores in large numbers,
5022         # for clarity.
5023         my $string_count = main::clarify_number($count);
5024
5025         my $code_points = ($count == 1)
5026                         ? 'single code point'
5027                         : "$string_count code points";
5028
5029         my $mapping;
5030         my $these_mappings;
5031         my $are;
5032         if (@property_aliases <= 1) {
5033             $mapping = 'mapping';
5034             $these_mappings = 'this mapping';
5035             $are = 'is'
5036         }
5037         else {
5038             $mapping = 'synonymous mappings';
5039             $these_mappings = 'these mappings';
5040             $are = 'are'
5041         }
5042         my $cp;
5043         if ($count >= $MAX_UNICODE_CODEPOINTS) {
5044             $cp = "any code point in Unicode Version $string_version";
5045         }
5046         else {
5047             my $map_to;
5048             if ($default_map eq "") {
5049                 $map_to = 'the null string';
5050             }
5051             elsif ($default_map eq $CODE_POINT) {
5052                 $map_to = "itself";
5053             }
5054             else {
5055                 $map_to = "'$default_map'";
5056             }
5057             if ($count == 1) {
5058                 $cp = "the single code point";
5059             }
5060             else {
5061                 $cp = "one of the $code_points";
5062             }
5063             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5064         }
5065
5066         my $comment = "";
5067
5068         my $status = $self->status;
5069         if ($status) {
5070             my $warn = uc $status_past_participles{$status};
5071             $comment .= <<END;
5072
5073 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
5074  All property or property=value combinations contained in this file are $warn.
5075  See $unicode_reference_url for what this means.
5076
5077 END
5078         }
5079         $comment .= "This file returns the $mapping:\n";
5080
5081         for my $i (0 .. @property_aliases - 1) {
5082             $comment .= sprintf("%-8s%s\n",
5083                                 " ",
5084                                 $property_aliases[$i]->name . '(cp)'
5085                                 );
5086         }
5087         $comment .=
5088                 "\nwhere 'cp' is $cp.  Note that $these_mappings $are ";
5089
5090         my $access = $core_access{$addr};
5091         if ($access) {
5092             $comment .= "accessible through the Perl core via $access.";
5093         }
5094         else {
5095             $comment .= "not accessible through the Perl core directly.";
5096         }
5097
5098         # And append any commentary already set from the actual property.
5099         $comment .= "\n\n" . $self->comment if $self->comment;
5100         if ($self->description) {
5101             $comment .= "\n\n" . join " ", $self->description;
5102         }
5103         if ($self->note) {
5104             $comment .= "\n\n" . join " ", $self->note;
5105         }
5106         $comment .= "\n";
5107
5108         if (! $self->perl_extension) {
5109             $comment .= <<END;
5110
5111 For information about what this property really means, see:
5112 $unicode_reference_url
5113 END
5114         }
5115
5116         if ($count) {        # Format differs for empty table
5117                 $comment.= "\nThe format of the ";
5118             if ($self->range_size_1) {
5119                 $comment.= <<END;
5120 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5121 is in hex; MAPPING is what CODE_POINT maps to.
5122 END
5123             }
5124             else {
5125
5126                 # There are tables which end up only having one element per
5127                 # range, but it is not worth keeping track of for making just
5128                 # this comment a little better.
5129                 $comment.= <<END;
5130 non-comment portions of the main body of lines of this file is:
5131 START\\tSTOP\\tMAPPING where START is the starting code point of the
5132 range, in hex; STOP is the ending point, or if omitted, the range has just one
5133 code point; MAPPING is what each code point between START and STOP maps to.
5134 END
5135                 if ($output_range_counts) {
5136                     $comment .= <<END;
5137 Numbers in comments in [brackets] indicate how many code points are in the
5138 range (omitted when the range is a single code point or if the mapping is to
5139 the null string).
5140 END
5141                 }
5142             }
5143         }
5144         $self->set_comment(main::join_lines($comment));
5145         return;
5146     }
5147
5148     my %swash_keys; # Makes sure don't duplicate swash names.
5149
5150     sub pre_body {
5151         # Returns the string that should be output in the file before the main
5152         # body of this table.  This includes some hash entries identifying the
5153         # format of the body, and what the single value should be for all
5154         # ranges missing from it.  It also includes any code points which have
5155         # map_types that don't go in the main table.
5156
5157         my $self = shift;
5158         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5159
5160         my $addr = main::objaddr $self;
5161
5162         my $name = $self->property->swash_name;
5163
5164         if (defined $swash_keys{$name}) {
5165             Carp::my_carp(join_lines(<<END
5166 Already created a swash name '$name' for $swash_keys{$name}.  This means that
5167 the same name desired for $self shouldn't be used.  Bad News.  This must be
5168 fixed before production use, but proceeding anyway
5169 END
5170             ));
5171         }
5172         $swash_keys{$name} = "$self";
5173
5174         my $default_map = $default_map{$addr};
5175
5176         my $pre_body = "";
5177         if ($has_specials{$addr}) {
5178
5179             # Here, some maps with non-zero type have been added to the table.
5180             # Go through the table and handle each of them.  None will appear
5181             # in the body of the table, so delete each one as we go.  The
5182             # code point count has already been calculated, so ok to delete
5183             # now.
5184
5185             my @multi_code_point_maps;
5186             my $has_hangul_syllables = 0;
5187
5188             # The key is the base name of the code point, and the value is an
5189             # array giving all the ranges that use this base name.  Each range
5190             # is actually a hash giving the 'low' and 'high' values of it.
5191             my %names_ending_in_code_point;
5192
5193             # Inverse mapping.  The list of ranges that have these kinds of
5194             # names.  Each element contains the low, high, and base names in a
5195             # hash.
5196             my @code_points_ending_in_code_point;
5197
5198             my $range_map = $self->_range_list;
5199             foreach my $range ($range_map->ranges) {
5200                 next unless $range->type != 0;
5201                 my $low = $range->start;
5202                 my $high = $range->end;
5203                 my $map = $range->value;
5204                 my $type = $range->type;
5205
5206                 # No need to output the range if it maps to the default.  And
5207                 # the write method won't output it either, so no need to
5208                 # delete it to keep it from being output, and is faster to
5209                 # skip than to delete anyway.
5210                 next if $map eq $default_map;
5211
5212                 # Delete the range to keep write() from trying to output it
5213                 $range_map->delete_range($low, $high);
5214
5215                 # Switch based on the map type...
5216                 if ($type == $HANGUL_SYLLABLE) {
5217
5218                     # These are entirely algorithmically determinable based on
5219                     # some constants furnished by Unicode; for now, just set a
5220                     # flag to indicate that have them.  Below we will output
5221                     # the code that does the algorithm.
5222                     $has_hangul_syllables = 1;
5223                 }
5224                 elsif ($type == $CP_IN_NAME) {
5225
5226                     # If the name ends in the code point it represents, are
5227                     # also algorithmically determinable, but need information
5228                     # about the map to do so.  Both the map and its inverse
5229                     # are stored in data structures output in the file.
5230                     push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5231                     push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5232
5233                     push @code_points_ending_in_code_point, { low => $low,
5234                                                               high => $high,
5235                                                               name => $map
5236                                                             };
5237                 }
5238                 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5239
5240                     # Multi-code point maps and null string maps have an entry
5241                     # for each code point in the range.  They use the same
5242                     # output format.
5243                     for my $code_point ($low .. $high) {
5244
5245                         # The pack() below can't cope with surrogates.
5246                         if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5247                             Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self.  No map created");
5248                             next;
5249                         }
5250
5251                         # Generate the hash entries for these in the form that
5252                         # utf8.c understands.
5253                         my $tostr = "";
5254                         foreach my $to (split " ", $map) {
5255                             if ($to !~ /^$code_point_re$/) {
5256                                 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
5257                                 next;
5258                             }
5259                             $tostr .= sprintf "\\x{%s}", $to;
5260                         }
5261
5262                         # I (khw) have never waded through this line to
5263                         # understand it well enough to comment it.
5264                         my $utf8 = sprintf(qq["%s" => "$tostr",],
5265                                 join("", map { sprintf "\\x%02X", $_ }
5266                                     unpack("U0C*", pack("U", $code_point))));
5267
5268                         # Add a comment so that a human reader can more easily
5269                         # see what's going on.
5270                         push @multi_code_point_maps,
5271                                 sprintf("%-45s # U+%04X => %s", $utf8,
5272                                                                 $code_point,
5273                                                                 $map);
5274                     }
5275                 }
5276                 else {
5277                     Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Using type 0 instead");
5278                     $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0);
5279                 }
5280             } # End of loop through all ranges
5281
5282             # Here have gone through the whole file.  If actually generated
5283             # anything for each map type, add its respective header and
5284             # trailer
5285             if (@multi_code_point_maps) {
5286                 $pre_body .= <<END;
5287
5288 # Some code points require special handling because their mappings are each to
5289 # multiple code points.  These do not appear in the main body, but are defined
5290 # in the hash below.
5291
5292 # The key: UTF-8 _bytes_, the value: UTF-8 (speed hack)
5293 %utf8::ToSpec$name = (
5294 END
5295                 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5296             }
5297
5298             if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5299
5300                 # Convert these structures to output format.
5301                 my $code_points_ending_in_code_point =
5302                     main::simple_dumper(\@code_points_ending_in_code_point,
5303                                         ' ' x 8);
5304                 my $names = main::simple_dumper(\%names_ending_in_code_point,
5305                                                 ' ' x 8);
5306
5307                 # Do the same with the Hangul names,
5308                 my $jamo;
5309                 my $jamo_l;
5310                 my $jamo_v;
5311                 my $jamo_t;
5312                 my $jamo_re;
5313                 if ($has_hangul_syllables) {
5314
5315                     # Construct a regular expression of all the possible
5316                     # combinations of the Hangul syllables.
5317                     my @L_re;   # Leading consonants
5318                     for my $i ($LBase .. $LBase + $LCount - 1) {
5319                         push @L_re, $Jamo{$i}
5320                     }
5321                     my @V_re;   # Middle vowels
5322                     for my $i ($VBase .. $VBase + $VCount - 1) {
5323                         push @V_re, $Jamo{$i}
5324                     }
5325                     my @T_re;   # Trailing consonants
5326                     for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5327                         push @T_re, $Jamo{$i}
5328                     }
5329
5330                     # The whole re is made up of the L V T combination.
5331                     $jamo_re = '('
5332                                . join ('|', sort @L_re)
5333                                . ')('
5334                                . join ('|', sort @V_re)
5335                                . ')('
5336                                . join ('|', sort @T_re)
5337                                . ')?';
5338
5339                     # These hashes needed by the algorithm were generated
5340                     # during reading of the Jamo.txt file
5341                     $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5342                     $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5343                     $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5344                     $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5345                 }
5346
5347                 $pre_body .= <<END;
5348
5349 # To achieve significant memory savings when this file is read in,
5350 # algorithmically derivable code points are omitted from the main body below.
5351 # Instead, the following routines can be used to translate between name and
5352 # code point and vice versa
5353
5354 { # Closure
5355
5356     # Matches legal code point.  4-6 hex numbers, If there are 6, the
5357     # first two must be '10'; if there are 5, the first must not be a '0'.
5358     my \$code_point_re = qr/$code_point_re/;
5359
5360     # In the following hash, the keys are the bases of names which includes
5361     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The values
5362     # of each key is another hash which is used to get the low and high ends
5363     # for each range of code points that apply to the name
5364     my %names_ending_in_code_point = (
5365 $names
5366     );
5367
5368     # And the following array gives the inverse mapping from code points to
5369     # names.  Lowest code points are first
5370     my \@code_points_ending_in_code_point = (
5371 $code_points_ending_in_code_point
5372     );
5373 END
5374                 # Earlier releases didn't have Jamos.  No sense outputting
5375                 # them unless will be used.
5376                 if ($has_hangul_syllables) {
5377                     $pre_body .= <<END;
5378
5379     # Convert from code point to Jamo short name for use in composing Hangul
5380     # syllable names
5381     my %Jamo = (
5382 $jamo
5383     );
5384
5385     # Leading consonant (can be null)
5386     my %Jamo_L = (
5387 $jamo_l
5388     );
5389
5390     # Vowel
5391     my %Jamo_V = (
5392 $jamo_v
5393     );
5394
5395     # Optional trailing consonant
5396     my %Jamo_T = (
5397 $jamo_t
5398     );
5399
5400     # Computed re that splits up a Hangul name into LVT or LV syllables
5401     my \$syllable_re = qr/$jamo_re/;
5402
5403     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
5404     my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
5405
5406     # These constants names and values were taken from the Unicode standard,
5407     # version 5.1, section 3.12.  They are used in conjunction with Hangul
5408     # syllables
5409     my \$SBase = 0xAC00;
5410     my \$LBase = 0x1100;
5411     my \$VBase = 0x1161;
5412     my \$TBase = 0x11A7;
5413     my \$SCount = 11172;
5414     my \$LCount = 19;
5415     my \$VCount = 21;
5416     my \$TCount = 28;
5417     my \$NCount = \$VCount * \$TCount;
5418 END
5419                 } # End of has Jamos
5420
5421                 $pre_body .= << 'END';
5422
5423     sub name_to_code_point_special {
5424         my $name = shift;
5425
5426         # Returns undef if not one of the specially handled names; otherwise
5427         # returns the code point equivalent to the input name
5428 END
5429                 if ($has_hangul_syllables) {
5430                     $pre_body .= << 'END';
5431
5432         if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
5433             $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
5434             return if $name !~ qr/^$syllable_re$/;
5435             my $L = $Jamo_L{$1};
5436             my $V = $Jamo_V{$2};
5437             my $T = (defined $3) ? $Jamo_T{$3} : 0;
5438             return ($L * $VCount + $V) * $TCount + $T + $SBase;
5439         }
5440 END
5441                 }
5442                 $pre_body .= << 'END';
5443
5444         # Name must end in '-code_point' for this to handle.
5445         if ($name !~ /^ (.*) - ($code_point_re) $/x) {
5446             return;
5447         }
5448
5449         my $base = $1;
5450         my $code_point = CORE::hex $2;
5451
5452         # Name must be one of the ones which has the code point in it.
5453         return if ! $names_ending_in_code_point{$base};
5454
5455         # Look through the list of ranges that apply to this name to see if
5456         # the code point is in one of them.
5457         for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
5458             return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
5459             next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
5460
5461             # Here, the code point is in the range.
5462             return $code_point;
5463         }
5464
5465         # Here, looked like the name had a code point number in it, but
5466         # did not match one of the valid ones.
5467         return;
5468     }
5469
5470     sub code_point_to_name_special {
5471         my $code_point = shift;
5472
5473         # Returns the name of a code point if algorithmically determinable;
5474         # undef if not
5475 END
5476                 if ($has_hangul_syllables) {
5477                     $pre_body .= << 'END';
5478
5479         # If in the Hangul range, calculate the name based on Unicode's
5480         # algorithm
5481         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
5482             use integer;
5483             my $SIndex = $code_point - $SBase;
5484             my $L = $LBase + $SIndex / $NCount;
5485             my $V = $VBase + ($SIndex % $NCount) / $TCount;
5486             my $T = $TBase + $SIndex % $TCount;
5487             $name = "$HANGUL_SYLLABLE $Jamo{$L}$Jamo{$V}";
5488             $name .= $Jamo{$T} if $T != $TBase;
5489             return $name;
5490         }
5491 END
5492                 }
5493                 $pre_body .= << 'END';
5494
5495         # Look through list of these code points for one in range.
5496         foreach my $hash (@code_points_ending_in_code_point) {
5497             return if $code_point < $hash->{'low'};
5498             if ($code_point <= $hash->{'high'}) {
5499                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
5500             }
5501         }
5502         return;            # None found
5503     }
5504 } # End closure
5505
5506 END
5507             } # End of has hangul or code point in name maps.
5508         } # End of has specials
5509
5510         # Calculate the format of the table if not already done.
5511         my $format = $format{$addr};
5512         my $property = $self->property;
5513         my $type = $property->type;
5514         if (! defined $format) {
5515             if ($type == $BINARY) {
5516
5517                 # Don't bother checking the values, because we elsewhere
5518                 # verify that a binary table has only 2 values.
5519                 $format = $BINARY_FORMAT;
5520             }
5521             else {
5522                 my @ranges = $self->_range_list->ranges;
5523
5524                 # default an empty table based on its type and default map
5525                 if (! @ranges) {
5526
5527                     # But it turns out that the only one we can say is a
5528                     # non-string (besides binary, handled above) is when the
5529                     # table is a string and the default map is to a code point
5530                     if ($type == $STRING && $default_map eq $CODE_POINT) {
5531                         $format = $HEX_FORMAT;
5532                     }
5533                     else {
5534                         $format = $STRING_FORMAT;
5535                     }
5536                 }
5537                 else {
5538
5539                     # Start with the most restrictive format, and as we find
5540                     # something that doesn't fit with that, change to the next
5541                     # most restrictive, and so on.
5542                     $format = $DECIMAL_FORMAT;
5543                     foreach my $range (@ranges) {
5544                         my $map = $range->value;
5545                         if ($map ne $default_map) {
5546                             last if $format eq $STRING_FORMAT;  # already at
5547                                                                 # least
5548                                                                 # restrictive
5549                             $format = $INTEGER_FORMAT
5550                                                 if $format eq $DECIMAL_FORMAT
5551                                                     && $map !~ / ^ [0-9] $ /x;
5552                             $format = $FLOAT_FORMAT
5553                                             if $format eq $INTEGER_FORMAT
5554                                                 && $map !~ / ^ -? [0-9]+ $ /x;
5555                             $format = $RATIONAL_FORMAT
5556                                 if $format eq $FLOAT_FORMAT
5557                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
5558                             $format = $HEX_FORMAT
5559                             if $format eq $RATIONAL_FORMAT
5560                                 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
5561                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
5562                                                        && $map =~ /[^0-9A-F]/;
5563                         }
5564                     }
5565                 }
5566             }
5567         } # end of calculating format
5568
5569         my $return = <<END;
5570 # The name this swash is to be known by, with the format of the mappings in
5571 # the main body of the table, and what all code points missing from this file
5572 # map to.
5573 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
5574 END
5575         my $missing = $default_map;
5576         if ($missing eq $CODE_POINT
5577             && $format ne $HEX_FORMAT
5578             && ! defined $format{$addr})    # Is expected if was manually set
5579         {
5580             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
5581         }
5582         $format{$addr} = $format;
5583         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$missing';";
5584         if ($missing eq $CODE_POINT) {
5585             $return .= ' # code point maps to itself';
5586         }
5587         elsif ($missing eq "") {
5588             $return .= ' # code point maps to the null string';
5589         }
5590         $return .= "\n";
5591
5592         $return .= $pre_body;
5593
5594         return $return;
5595     }
5596
5597     sub write {
5598         # Write the table to the file.
5599
5600         my $self = shift;
5601         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5602
5603         my $addr = main::objaddr $self;
5604
5605         return $self->SUPER::write(
5606             ($self->property == $block)
5607                 ? 7     # block file needs more tab stops
5608                 : 3,
5609             $default_map{$addr});   # don't write defaulteds
5610     }
5611
5612     # Accessors for the underlying list that should fail if locked.
5613     for my $sub qw(
5614                     add_duplicate
5615                 )
5616     {
5617         no strict "refs";
5618         *$sub = sub {
5619             use strict "refs";
5620             my $self = shift;
5621
5622             return if $self->carp_if_locked;
5623             return $self->_range_list->$sub(@_);
5624         }
5625     }
5626 } # End closure for Map_Table
5627
5628 package Match_Table;
5629 use base '_Base_Table';
5630
5631 # A Match table is one which is a list of all the code points that have
5632 # the same property and property value, for use in \p{property=value}
5633 # constructs in regular expressions.  It adds very little data to the base
5634 # structure, but many methods, as these lists can be combined in many ways to
5635 # form new ones.
5636 # There are only a few concepts added:
5637 # 1) Equivalents and Relatedness.
5638 #    Two tables can match the identical code points, but have different names.
5639 #    This always happens when there is a perl single form extension
5640 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
5641 #    tables are set to be related, with the Perl extension being a child, and
5642 #    the Unicode property being the parent.
5643 #
5644 #    It may be that two tables match the identical code points and we don't
5645 #    know if they are related or not.  This happens most frequently when the
5646 #    Block and Script properties have the exact range.  But note that a
5647 #    revision to Unicode could add new code points to the script, which would
5648 #    now have to be in a different block (as the block was filled, or there
5649 #    would have been 'Unknown' script code points in it and they wouldn't have
5650 #    been identical).  So we can't rely on any two properties from Unicode
5651 #    always matching the same code points from release to release, and thus
5652 #    these tables are considered coincidentally equivalent--not related.  When
5653 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
5654 #    'leader', and the others are 'equivalents'.  This concept is useful
5655 #    to minimize the number of tables written out.  Only one file is used for
5656 #    any identical set of code points, with entries in Heavy.pl mapping all
5657 #    the involved tables to it.
5658 #
5659 #    Related tables will always be identical; we set them up to be so.  Thus
5660 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
5661 #    unrelated tables.  Relatedness makes generating the documentation easier.
5662 #
5663 # 2) Conflicting.  It may be that there will eventually be name clashes, with
5664 #    the same name meaning different things.  For a while, there actually were
5665 #    conflicts, but they have so far been resolved by changing Perl's or
5666 #    Unicode's definitions to match the other, but when this code was written,
5667 #    it wasn't clear that that was what was going to happen.  (Unicode changed
5668 #    because of protests during their beta period.)  Name clashes are warned
5669 #    about during compilation, and the documentation.  The generated tables
5670 #    are sane, free of name clashes, because the code suppresses the Perl
5671 #    version.  But manual intervention to decide what the actual behavior
5672 #    should be may be required should this happen.  The introductory comments
5673 #    have more to say about this.
5674
5675 sub standardize { return main::standardize($_[0]); }
5676 sub trace { return main::trace(@_); }
5677
5678
5679 { # Closure
5680
5681     main::setup_package();
5682
5683     my %leader;
5684     # The leader table of this one; initially $self.
5685     main::set_access('leader', \%leader, 'r');
5686
5687     my %equivalents;
5688     # An array of any tables that have this one as their leader
5689     main::set_access('equivalents', \%equivalents, 'readable_array');
5690
5691     my %parent;
5692     # The parent table to this one, initially $self.  This allows us to
5693     # distinguish between equivalent tables that are related, and those which
5694     # may not be, but share the same output file because they match the exact
5695     # same set of code points in the current Unicode release.
5696     main::set_access('parent', \%parent, 'r');
5697
5698     my %children;
5699     # An array of any tables that have this one as their parent
5700     main::set_access('children', \%children, 'readable_array');
5701
5702     my %conflicting;
5703     # Array of any tables that would have the same name as this one with
5704     # a different meaning.  This is used for the generated documentation.
5705     main::set_access('conflicting', \%conflicting, 'readable_array');
5706
5707     my %matches_all;
5708     # Set in the constructor for tables that are expected to match all code
5709     # points.
5710     main::set_access('matches_all', \%matches_all, 'r');
5711
5712     sub new {
5713         my $class = shift;
5714
5715         my %args = @_;
5716
5717         # The property for which this table is a listing of property values.
5718         my $property = delete $args{'_Property'};
5719
5720         # Optional
5721         my $initialize = delete $args{'Initialize'};
5722         my $matches_all = delete $args{'Matches_All'} || 0;
5723         # Rest of parameters passed on.
5724
5725         my $range_list = Range_List->new(Initialize => $initialize,
5726                                          Owner => $property);
5727
5728         my $self = $class->SUPER::new(%args,
5729                                       _Property => $property,
5730                                       _Range_List => $range_list,
5731                                       );
5732         my $addr = main::objaddr $self;
5733
5734         $conflicting{$addr} = [ ];
5735         $equivalents{$addr} = [ ];
5736         $children{$addr} = [ ];
5737         $matches_all{$addr} = $matches_all;
5738         $leader{$addr} = $self;
5739         $parent{$addr} = $self;
5740
5741         return $self;
5742     }
5743
5744     # See this program's beginning comment block about overloading these.
5745     use overload
5746         fallback => 0,
5747         qw("") => "_operator_stringify",
5748         '=' => sub {
5749                     my $self = shift;
5750
5751                     return if $self->carp_if_locked;
5752                     return $self;
5753                 },
5754
5755         '+' => sub {
5756                         my $self = shift;
5757                         my $other = shift;
5758
5759                         return $self->_range_list + $other;
5760                     },
5761         '&' => sub {
5762                         my $self = shift;
5763                         my $other = shift;
5764
5765                         return $self->_range_list & $other;
5766                     },
5767         '+=' => sub {
5768                         my $self = shift;
5769                         my $other = shift;
5770
5771                         return if $self->carp_if_locked;
5772
5773                         my $addr = main::objaddr $self;
5774
5775                         if (ref $other) {
5776
5777                             # Change the range list of this table to be the
5778                             # union of the two.
5779                             $self->_set_range_list($self->_range_list
5780                                                     + $other);
5781                         }
5782                         else {    # $other is just a simple value
5783                             $self->add_range($other, $other);
5784                         }
5785                         return $self;
5786                     },
5787         '-' => sub { my $self = shift;
5788                     my $other = shift;
5789                     my $reversed = shift;
5790
5791                     if ($reversed) {
5792                         Carp::my_carp_bug("Can't cope with a "
5793                             .  __PACKAGE__
5794                             . " being the first parameter in a '-'.  Subtraction ignored.");
5795                         return;
5796                     }
5797
5798                     return $self->_range_list - $other;
5799                 },
5800         '~' => sub { my $self = shift;
5801                     return ~ $self->_range_list;
5802                 },
5803     ;
5804
5805     sub _operator_stringify {
5806         my $self = shift;
5807
5808         my $name= $self->complete_name;
5809         return "Table '$name'";
5810     }
5811
5812     sub add_alias {
5813         # Add a synonym for this table.  See the comments in the base class
5814
5815         my $self = shift;
5816         my $name = shift;
5817         # Rest of parameters passed on.
5818
5819         $self->SUPER::add_alias($name, $self, @_);
5820         return;
5821     }
5822
5823     sub add_conflicting {
5824         # Add the name of some other object to the list of ones that name
5825         # clash with this match table.
5826
5827         my $self = shift;
5828         my $conflicting_name = shift;   # The name of the conflicting object
5829         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
5830         my $conflicting_object = shift; # Optional, the conflicting object
5831                                         # itself.  This is used to
5832                                         # disambiguate the text if the input
5833                                         # name is identical to any of the
5834                                         # aliases $self is known by.
5835                                         # Sometimes the conflicting object is
5836                                         # merely hypothetical, so this has to
5837                                         # be an optional parameter.
5838         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5839
5840         my $addr = main::objaddr $self;
5841
5842         # Check if the conflicting name is exactly the same as any existing
5843         # alias in this table (as long as there is a real object there to
5844         # disambiguate with).
5845         if (defined $conflicting_object) {
5846             foreach my $alias ($self->aliases) {
5847                 if ($alias->name eq $conflicting_name) {
5848
5849                     # Here, there is an exact match.  This results in
5850                     # ambiguous comments, so disambiguate by changing the
5851                     # conflicting name to its object's complete equivalent.
5852                     $conflicting_name = $conflicting_object->complete_name;
5853                     last;
5854                 }
5855             }
5856         }
5857
5858         # Convert to the \p{...} final name
5859         $conflicting_name = "\\$p" . "{$conflicting_name}";
5860
5861         # Only add once
5862         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
5863
5864         push @{$conflicting{$addr}}, $conflicting_name;
5865
5866         return;
5867     }
5868
5869     sub is_equivalent_to {
5870         # Return boolean of whether or not the other object is a table of this
5871         # type and has been marked equivalent to this one.
5872
5873         my $self = shift;
5874         my $other = shift;
5875         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5876
5877         return 0 if ! defined $other; # Can happen for incomplete early
5878                                       # releases
5879         unless ($other->isa(__PACKAGE__)) {
5880             my $ref_other = ref $other;
5881             my $ref_self = ref $self;
5882             Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
5883             return 0;
5884         }
5885
5886         # Two tables are equivalent if they have the same leader.
5887         return $leader{main::objaddr $self}
5888                 == $leader{main::objaddr $other};
5889         return;
5890     }
5891
5892     sub matches_identically_to {
5893         # Return a boolean as to whether or not two tables match identical
5894         # sets of code points.
5895
5896         my $self = shift;
5897         my $other = shift;
5898         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5899
5900         unless ($other->isa(__PACKAGE__)) {
5901             my $ref_other = ref $other;
5902             my $ref_self = ref $self;
5903             Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
5904             return 0;
5905         }
5906
5907         # These are ordered in increasing real time to figure out (at least
5908         # until a patch changes that and doesn't change this)
5909         return 0 if $self->max != $other->max;
5910         return 0 if $self->min != $other->min;
5911         return 0 if $self->range_count != $other->range_count;
5912         return 0 if $self->count != $other->count;
5913
5914         # Here they could be identical because all the tests above passed.
5915         # The loop below is somewhat simpler since we know they have the same
5916         # number of elements.  Compare range by range, until reach the end or
5917         # find something that differs.
5918         my @a_ranges = $self->_range_list->ranges;
5919         my @b_ranges = $other->_range_list->ranges;
5920         for my $i (0 .. @a_ranges - 1) {
5921             my $a = $a_ranges[$i];
5922             my $b = $b_ranges[$i];
5923             trace "self $a; other $b" if main::DEBUG && $to_trace;
5924             return 0 if $a->start != $b->start || $a->end != $b->end;
5925         }
5926         return 1;
5927     }
5928
5929     sub set_equivalent_to {
5930         # Set $self equivalent to the parameter table.
5931         # The required Related => 'x' parameter is a boolean indicating
5932         # whether these tables are related or not.  If related, $other becomes
5933         # the 'parent' of $self; if unrelated it becomes the 'leader'
5934         #
5935         # Related tables share all characteristics except names; equivalents
5936         # not quite so many.
5937         # If they are related, one must be a perl extension.  This is because
5938         # we can't guarantee that Unicode won't change one or the other in a
5939         # later release even if they are idential now.
5940
5941         my $self = shift;
5942         my $other = shift;
5943
5944         my %args = @_;
5945         my $related = delete $args{'Related'};
5946
5947         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5948
5949         return if ! defined $other;     # Keep on going; happens in some early
5950                                         # Unicode releases.
5951
5952         if (! defined $related) {
5953             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
5954             $related = 0;
5955         }
5956
5957         # If already are equivalent, no need to re-do it;  if subroutine
5958         # returns null, it found an error, also do nothing
5959         my $are_equivalent = $self->is_equivalent_to($other);
5960         return if ! defined $are_equivalent || $are_equivalent;
5961
5962         my $current_leader = ($related)
5963                              ? $parent{main::objaddr $self}
5964                              : $leader{main::objaddr $self};
5965
5966         if ($related &&
5967             ! $other->perl_extension
5968             && ! $current_leader->perl_extension)
5969         {
5970             Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
5971             $related = 0;
5972         }
5973
5974         my $leader = main::objaddr $current_leader;
5975         my $other_addr = main::objaddr $other;
5976
5977         # Any tables that are equivalent to or children of this table must now
5978         # instead be equivalent to or (children) to the new leader (parent),
5979         # still equivalent.  The equivalency includes their matches_all info,
5980         # and for related tables, their status
5981         # All related tables are of necessity equivalent, but the converse
5982         # isn't necessarily true
5983         my $status = $other->status;
5984         my $status_info = $other->status_info;
5985         my $matches_all = $matches_all{other_addr};
5986         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
5987             next if $table == $other;
5988             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
5989
5990             my $table_addr = main::objaddr $table;
5991             $leader{$table_addr} = $other;
5992             $matches_all{$table_addr} = $matches_all;
5993             $self->_set_range_list($other->_range_list);
5994             push @{$equivalents{$other_addr}}, $table;
5995             if ($related) {
5996                 $parent{$table_addr} = $other;
5997                 push @{$children{$other_addr}}, $table;
5998                 $table->set_status($status, $status_info);
5999             }
6000         }
6001
6002         # Now that we've declared these to be equivalent, any changes to one
6003         # of the tables would invalidate that equivalency.
6004         $self->lock;
6005         $other->lock;
6006         return;
6007     }
6008
6009     sub add_range { # Add a range to the list for this table.
6010         my $self = shift;
6011         # Rest of parameters passed on
6012
6013         return if $self->carp_if_locked;
6014         return $self->_range_list->add_range(@_);
6015     }
6016
6017     sub complete_name {
6018         # The complete name for a match table includes it's property in a
6019         # compound form 'property=table', except if the property is the
6020         # pseudo-property, perl, in which case it is just the single form,
6021         # 'table'
6022
6023         my $self = shift;
6024         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6025
6026         my $name = $self->full_name;
6027         my $property = $self->property;
6028         $name = '""' if $name eq "";  # A null name shouldn't happen, but this
6029                                       # helps debug if it does
6030         return $name if $property == $perl;
6031
6032         # (If change the '=' must also change the ':' in set_final_comment(),
6033         # and the references to colon in its text)
6034         return $property->full_name . '=' . $name;
6035     }
6036
6037     sub pre_body {  # Does nothing for match tables.
6038         return
6039     }
6040
6041     sub append_to_body {  # Does nothing for match tables.
6042         return
6043     }
6044
6045     sub write {
6046         my $self = shift;
6047         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6048
6049         return $self->SUPER::write(2); # 2 tab stops
6050     }
6051
6052     sub set_final_comment {
6053         # This creates a comment for the file that is to hold the match table
6054         # $self.  It is somewhat convoluted to make the English read nicely,
6055         # but, heh, it's just a comment.
6056         # This should be called only with the leader match table of all the
6057         # ones that share the same file.  It lists all such tables, ordered so
6058         # that related ones are together.
6059
6060         my $leader = shift;   # Should only be called on the leader table of
6061                               # an equivalent group
6062         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6063
6064         my $addr = main::objaddr $leader;
6065
6066         if ($leader{$addr} != $leader) {
6067             Carp::my_carp_bug(<<END
6068 set_final_comment() must be called on a leader table, which $leader is not.
6069 It is equivalent to $leader{$addr}.  No comment created
6070 END
6071             );
6072             return;
6073         }
6074
6075         # Get the number of code points matched by each of the tables in this
6076         # file, and add underscores for clarity.
6077         my $count = $leader->count;
6078         my $string_count = main::clarify_number($count);
6079
6080         my $loose_count = 0;        # how many aliases loosely matched
6081         my $compound_name = "";     # ? Are any names compound?, and if so, an
6082                                     # example
6083         my $properties_with_compound_names = 0;    # count of these
6084
6085
6086         my %flags;              # The status flags used in the file
6087         my $total_entries = 0;  # number of entries written in the comment
6088         my $matches_comment = ""; # The portion of the comment about the
6089                                   # \p{}'s
6090         my @global_comments;    # List of all the tables' comments that are
6091                                 # there before this routine was called.
6092
6093         # Get list of all the parent tables that are equivalent to this one
6094         # (including itself).
6095         my @parents = grep { $parent{main::objaddr $_} == $_ }
6096                             main::uniques($leader, @{$equivalents{$addr}});
6097         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
6098                                               # tables
6099
6100         for my $parent (@parents) {
6101
6102             my $property = $parent->property;
6103
6104             # Special case 'N' tables in properties with two match tables when
6105             # the other is a 'Y' one.  These are likely to be binary tables,
6106             # but not necessarily.  In either case, \P{} will match the
6107             # complement of \p{}, and so if something is a synonym of \p, the
6108             # complement of that something will be the synonym of \P.  This
6109             # would be true of any property with just two match tables, not
6110             # just those whose values are Y and N; but that would require a
6111             # little extra work, and there are none such so far in Unicode.
6112             my $perl_p = 'p';        # which is it?  \p{} or \P{}
6113             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
6114
6115             if (scalar $property->tables == 2
6116                 && $parent == $property->table('N')
6117                 && defined (my $yes = $property->table('Y')))
6118             {
6119                 my $yes_addr = main::objaddr $yes;
6120                 @yes_perl_synonyms
6121                     = grep { $_->property == $perl }
6122                                     main::uniques($yes,
6123                                                 $parent{$yes_addr},
6124                                                 $parent{$yes_addr}->children);
6125
6126                 # But these synonyms are \P{} ,not \p{}
6127                 $perl_p = 'P';
6128             }
6129
6130             my @description;        # Will hold the table description
6131             my @note;               # Will hold the table notes.
6132             my @conflicting;        # Will hold the table conflicts.
6133
6134             # Look at the parent, any yes synonyms, and all the children
6135             for my $table ($parent,
6136                            @yes_perl_synonyms,
6137                            @{$children{main::objaddr $parent}})
6138             {
6139                 my $table_addr = main::objaddr $table;
6140                 my $table_property = $table->property;
6141
6142                 # Tables are separated by a blank line to create a grouping.
6143                 $matches_comment .= "\n" if $matches_comment;
6144
6145                 # The table is named based on the property and value
6146                 # combination it is for, like script=greek.  But there may be
6147                 # a number of synonyms for each side, like 'sc' for 'script',
6148                 # and 'grek' for 'greek'.  Any combination of these is a valid
6149                 # name for this table.  In this case, there are three more,
6150                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
6151                 # listing all possible combinations in the comment, we make
6152                 # sure that each synonym occurs at least once, and add
6153                 # commentary that the other combinations are possible.
6154                 my @property_aliases = $table_property->aliases;
6155                 my @table_aliases = $table->aliases;
6156
6157                 Carp::my_carp_bug("$table doesn't have any names.  Proceeding anyway.") unless @table_aliases;
6158
6159                 # The alias lists above are already ordered in the order we
6160                 # want to output them.  To ensure that each synonym is listed,
6161                 # we must use the max of the two numbers.
6162                 my $listed_combos = main::max(scalar @table_aliases,
6163                                                 scalar @property_aliases);
6164                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6165
6166                 my $property_had_compound_name = 0;
6167
6168                 for my $i (0 .. $listed_combos - 1) {
6169                     $total_entries++;
6170
6171                     # The current alias for the property is the next one on
6172                     # the list, or if beyond the end, start over.  Similarly
6173                     # for the table (\p{prop=table})
6174                     my $property_alias = $property_aliases
6175                                             [$i % @property_aliases]->name;
6176                     my $table_alias_object = $table_aliases
6177                                                         [$i % @table_aliases];
6178                     my $table_alias = $table_alias_object->name;
6179                     my $loose_match = $table_alias_object->loose_match;
6180
6181                     if ($table_alias !~ /\D/) { # Clarify large numbers.
6182                         $table_alias = main::clarify_number($table_alias)
6183                     }
6184
6185                     # Add a comment for this alias combination
6186                     my $current_match_comment;
6187                     if ($table_property == $perl) {
6188                         $current_match_comment = "\\$perl_p"
6189                                                     . "{$table_alias}";
6190                     }
6191                     else {
6192                         $current_match_comment
6193                                         = "\\p{$property_alias=$table_alias}";
6194                         $property_had_compound_name = 1;
6195                     }
6196
6197                     # Flag any abnormal status for this table.
6198                     my $flag = $property->status
6199                                 || $table->status
6200                                 || $table_alias_object->status;
6201                     $flags{$flag} = $status_past_participles{$flag} if $flag;
6202
6203                     $loose_count++;
6204
6205                     # Pretty up the comment.  Note the \b; it says don't make
6206                     # this line a continuation.
6207                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
6208                                         $flag,
6209                                         " " x 7,
6210                                         $current_match_comment);
6211                 } # End of generating the entries for this table.
6212
6213                 # Save these for output after this group of related tables.
6214                 push @description, $table->description;
6215                 push @note, $table->note;
6216                 push @conflicting, $table->conflicting;
6217
6218                 # Compute an alternate compound name using the final property
6219                 # synonym and the first table synonym with a colon instead of
6220                 # the equal sign used elsewhere.
6221                 if ($property_had_compound_name) {
6222                     $properties_with_compound_names ++;
6223                     if (! $compound_name || @property_aliases > 1) {
6224                         $compound_name = $property_aliases[-1]->name
6225                                         . ': '
6226                                         . $table_aliases[0]->name;
6227                     }
6228                 }
6229             } # End of looping through all children of this table
6230
6231             # Here have assembled in $matches_comment all the related tables
6232             # to the current parent (preceded by the same info for all the
6233             # previous parents).  Put out information that applies to all of
6234             # the current family.
6235             if (@conflicting) {
6236
6237                 # But output the conflicting information now, as it applies to
6238                 # just this table.
6239                 my $conflicting = join ", ", @conflicting;
6240                 if ($conflicting) {
6241                     $matches_comment .= <<END;
6242
6243     Note that contrary to what you might expect, the above is NOT the same as
6244 END
6245                     $matches_comment .= "any of: " if @conflicting > 1;
6246                     $matches_comment .= "$conflicting\n";
6247                 }
6248             }
6249             if (@description) {
6250                 $matches_comment .= "\n    Meaning: "
6251                                     . join('; ', @description)
6252                                     . "\n";
6253             }
6254             if (@note) {
6255                 $matches_comment .= "\n    Note: "
6256                                     . join("\n    ", @note)
6257                                     . "\n";
6258             }
6259         } # End of looping through all tables
6260
6261
6262         my $code_points;
6263         my $match;
6264         my $any_of_these;
6265         if ($count == 1) {
6266             $match = 'matches';
6267             $code_points = 'single code point';
6268         }
6269         else {
6270             $match = 'match';
6271             $code_points = "$string_count code points";
6272         }
6273
6274         my $synonyms;
6275         my $entries;
6276         if ($total_entries <= 1) {
6277             $synonyms = "";
6278             $entries = 'entry';
6279             $any_of_these = 'this'
6280         }
6281         else {
6282             $synonyms = " any of the following regular expression constructs";
6283             $entries = 'entries';
6284             $any_of_these = 'any of these'
6285         }
6286
6287         my $comment = "";
6288         if ($has_unrelated) {
6289             $comment .= <<END;
6290 This file is for tables that are not necessarily related:  To conserve
6291 resources, every table that matches the identical set of code points in this
6292 version of Unicode uses this file.  Each one is listed in a separate group
6293 below.  It could be that the tables will match the same set of code points in
6294 other Unicode releases, or it could be purely coincidence that they happen to
6295 be the same in Unicode $string_version, and hence may not in other versions.
6296
6297 END
6298         }
6299
6300         if (%flags) {
6301             foreach my $flag (sort keys %flags) {
6302                 $comment .= <<END;
6303 '$flag' below means that this form is $flags{$flag}.  Consult $pod_file.pod
6304 END
6305             }
6306             $comment .= "\n";
6307         }
6308
6309         $comment .= <<END;
6310 This file returns the $code_points in Unicode Version $string_version that
6311 $match$synonyms:
6312
6313 $matches_comment
6314 $pod_file.pod should be consulted for the rules on using $any_of_these,
6315 including if adding or subtracting white space, underscore, and hyphen
6316 characters matters or doesn't matter, and other permissible syntactic
6317 variants.  Upper/lower case distinctions never matter.
6318 END
6319
6320         if ($compound_name) {
6321             $comment .= <<END;
6322
6323 A colon can be substituted for the equals sign, and
6324 END
6325             if ($properties_with_compound_names > 1) {
6326                 $comment .= <<END;
6327 within each group above,
6328 END
6329             }
6330             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6331
6332             # Note the \b below, it says don't make that line a continuation.
6333             $comment .= <<END;
6334 anything to the left of the equals (or colon) can be combined with anything to
6335 the right.  Thus, for example,
6336 $compound_name
6337 \bis also valid.
6338 END
6339         }
6340
6341         # And append any comment(s) from the actual tables.  They are all
6342         # gathered here, so may not read all that well.
6343         $comment .= "\n" . join "\n\n", @global_comments if @global_comments;
6344
6345         if ($count) {   # The format differs if no code points, and needs no
6346                         # explanation in that case
6347                 $comment.= <<END;
6348
6349 The format of the lines of this file is:
6350 END
6351             $comment.= <<END;
6352 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6353 STOP is the ending point, or if omitted, the range has just one code point.
6354 END
6355             if ($output_range_counts) {
6356                 $comment .= <<END;
6357 Numbers in comments in [brackets] indicate how many code points are in the
6358 range.
6359 END
6360             }
6361         }
6362
6363         $leader->set_comment(main::join_lines($comment));
6364         return;
6365     }
6366
6367     # Accessors for the underlying list
6368     for my $sub qw(
6369                     get_valid_code_point
6370                     get_invalid_code_point
6371                 )
6372     {
6373         no strict "refs";
6374         *$sub = sub {
6375             use strict "refs";
6376             my $self = shift;
6377
6378             return $self->_range_list->$sub(@_);
6379         }
6380     }
6381 } # End closure for Match_Table
6382
6383 package Property;
6384
6385 # The Property class represents a Unicode property, or the $perl
6386 # pseudo-property.  It contains a map table initialized empty at construction
6387 # time, and for properties accessible through regular expressions, various
6388 # match tables, created through the add_match_table() method, and referenced
6389 # by the table('NAME') or tables() methods, the latter returning a list of all
6390 # of the match tables.  Otherwise table operations implicitly are for the map
6391 # table.
6392 #
6393 # Most of the data in the property is actually about its map table, so it
6394 # mostly just uses that table's accessors for most methods.  The two could
6395 # have been combined into one object, but for clarity because of their
6396 # differing semantics, they have been kept separate.  It could be argued that
6397 # the 'file' and 'directory' fields should be kept with the map table.
6398 #
6399 # Each property has a type.  This can be set in the constructor, or in the
6400 # set_type accessor, but mostly it is figured out by the data.  Every property
6401 # starts with unknown type, overridden by a parameter to the constructor, or
6402 # as match tables are added, or ranges added to the map table, the data is
6403 # inspected, and the type changed.  After the table is mostly or entirely
6404 # filled, compute_type() should be called to finalize they analysis.
6405 #
6406 # There are very few operations defined.  One can safely remove a range from
6407 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
6408 # table to this one, replacing any in the intersection of the two.
6409
6410 sub standardize { return main::standardize($_[0]); }
6411 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
6412
6413 {   # Closure
6414
6415     # This hash will contain as keys, all the aliases of all properties, and
6416     # as values, pointers to their respective property objects.  This allows
6417     # quick look-up of a property from any of its names.
6418     my %alias_to_property_of;
6419
6420     sub dump_alias_to_property_of {
6421         # For debugging
6422
6423         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
6424         return;
6425     }
6426
6427     sub property_ref {
6428         # This is a package subroutine, not called as a method.
6429         # If the single parameter is a literal '*' it returns a list of all
6430         # defined properties.
6431         # Otherwise, the single parameter is a name, and it returns a pointer
6432         # to the corresponding property object, or undef if none.
6433         #
6434         # Properties can have several different names.  The 'standard' form of
6435         # each of them is stored in %alias_to_property_of as they are defined.
6436         # But it's possible that this subroutine will be called with some
6437         # variant, so if the initial lookup fails, it is repeated with the
6438         # standarized form of the input name.  If found, besides returning the
6439         # result, the input name is added to the list so future calls won't
6440         # have to do the conversion again.
6441
6442         my $name = shift;
6443
6444         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6445
6446         if (! defined $name) {
6447             Carp::my_carp_bug("Undefined input property.  No action taken.");
6448             return;
6449         }
6450
6451         return main::uniques(values %alias_to_property_of) if $name eq '*';
6452
6453         # Return cached result if have it.
6454         my $result = $alias_to_property_of{$name};
6455         return $result if defined $result;
6456
6457         # Convert the input to standard form.
6458         my $standard_name = standardize($name);
6459
6460         $result = $alias_to_property_of{$standard_name};
6461         return unless defined $result;        # Don't cache undefs
6462
6463         # Cache the result before returning it.
6464         $alias_to_property_of{$name} = $result;
6465         return $result;
6466     }
6467
6468
6469     main::setup_package();
6470
6471     my %map;
6472     # A pointer to the map table object for this property
6473     main::set_access('map', \%map);
6474
6475     my %full_name;
6476     # The property's full name.  This is a duplicate of the copy kept in the
6477     # map table, but is needed because stringify needs it during
6478     # construction of the map table, and then would have a chicken before egg
6479     # problem.
6480     main::set_access('full_name', \%full_name, 'r');
6481
6482     my %table_ref;
6483     # This hash will contain as keys, all the aliases of any match tables
6484     # attached to this property, and as values, the pointers to their
6485     # respective tables.  This allows quick look-up of a table from any of its
6486     # names.
6487     main::set_access('table_ref', \%table_ref);
6488
6489     my %type;
6490     # The type of the property, $ENUM, $BINARY, etc
6491     main::set_access('type', \%type, 'r');
6492
6493     my %file;
6494     # The filename where the map table will go (if actually written).
6495     # Normally defaulted, but can be overridden.
6496     main::set_access('file', \%file, 'r', 's');
6497
6498     my %directory;
6499     # The directory where the map table will go (if actually written).
6500     # Normally defaulted, but can be overridden.
6501     main::set_access('directory', \%directory, 's');
6502
6503     my %pseudo_map_type;
6504     # This is used to affect the calculation of the map types for all the
6505     # ranges in the table.  It should be set to one of the values that signify
6506     # to alter the calculation.
6507     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
6508
6509     my %has_only_code_point_maps;
6510     # A boolean used to help in computing the type of data in the map table.
6511     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
6512
6513     my %unique_maps;
6514     # A list of the first few distinct mappings this property has.  This is
6515     # used to disambiguate between binary and enum property types, so don't
6516     # have to keep more than three.
6517     main::set_access('unique_maps', \%unique_maps);
6518
6519     sub new {
6520         # The only required parameter is the positionally first, name.  All
6521         # other parameters are key => value pairs.  See the documentation just
6522         # above for the meanings of the ones not passed directly on to the map
6523         # table constructor.
6524
6525         my $class = shift;
6526         my $name = shift || "";
6527
6528         my $self = property_ref($name);
6529         if (defined $self) {
6530             my $options_string = join ", ", @_;
6531             $options_string = ".  Ignoring options $options_string" if $options_string;
6532             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
6533             return $self;
6534         }
6535
6536         my %args = @_;
6537
6538         $self = bless \do { my $anonymous_scalar }, $class;
6539         my $addr = main::objaddr $self;
6540
6541         $directory{$addr} = delete $args{'Directory'};
6542         $file{$addr} = delete $args{'File'};
6543         $full_name{$addr} = delete $args{'Full_Name'} || $name;
6544         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
6545         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
6546         # Rest of parameters passed on.
6547
6548         $has_only_code_point_maps{$addr} = 1;
6549         $table_ref{$addr} = { };
6550         $unique_maps{$addr} = { };
6551
6552         $map{$addr} = Map_Table->new($name,
6553                                     Full_Name => $full_name{$addr},
6554                                     _Alias_Hash => \%alias_to_property_of,
6555                                     _Property => $self,
6556                                     %args);
6557         return $self;
6558     }
6559
6560     # See this program's beginning comment block about overloading the copy
6561     # constructor.  Few operations are defined on properties, but a couple are
6562     # useful.  It is safe to take the inverse of a property, and to remove a
6563     # single code point from it.
6564     use overload
6565         fallback => 0,
6566         qw("") => "_operator_stringify",
6567         "." => \&main::_operator_dot,
6568         '==' => \&main::_operator_equal,
6569         '!=' => \&main::_operator_not_equal,
6570         '=' => sub { return shift },
6571         '-=' => "_minus_and_equal",
6572     ;
6573
6574     sub _operator_stringify {
6575         return "Property '" .  shift->full_name . "'";
6576     }
6577
6578     sub _minus_and_equal {
6579         # Remove a single code point from the map table of a property.
6580
6581         my $self = shift;
6582         my $other = shift;
6583         my $reversed = shift;
6584         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6585
6586         if (ref $other) {
6587             Carp::my_carp_bug("Can't cope with a "
6588                         . ref($other)
6589                         . " argument to '-='.  Subtraction ignored.");
6590             return $self;
6591         }
6592         elsif ($reversed) {   # Shouldnt happen in a -=, but just in case
6593             Carp::my_carp_bug("Can't cope with a "
6594             .  __PACKAGE__
6595             . " being the first parameter in a '-='.  Subtraction ignored.");
6596             return $self;
6597         }
6598         else {
6599             $map{main::objaddr $self}->delete_range($other, $other);
6600         }
6601         return $self;
6602     }
6603
6604     sub add_match_table {
6605         # Add a new match table for this property, with name given by the
6606         # parameter.  It returns a pointer to the table.
6607
6608         my $self = shift;
6609         my $name = shift;
6610         my %args = @_;
6611
6612         my $addr = main::objaddr $self;
6613
6614         my $table = $table_ref{$addr}{$name};
6615         my $standard_name = main::standardize($name);
6616         if (defined $table
6617             || (defined ($table = $table_ref{$addr}{$standard_name})))
6618         {
6619             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
6620             $table_ref{$addr}{$name} = $table;
6621             return $table;
6622         }
6623         else {
6624
6625             # See if this is a perl extension, if not passed in.
6626             my $perl_extension = delete $args{'Perl_Extension'};
6627             $perl_extension
6628                         = $self->perl_extension if ! defined $perl_extension;
6629
6630             $table = Match_Table->new(
6631                                 Name => $name,
6632                                 Perl_Extension => $perl_extension,
6633                                 _Alias_Hash => $table_ref{$addr},
6634                                 _Property => $self,
6635
6636                                 # gets property's status by default
6637                                 Status => $self->status,
6638                                 _Status_Info => $self->status_info,
6639                                 %args,
6640                                 Internal_Only_Warning => 1); # Override any
6641                                                              # input param
6642             return unless defined $table;
6643         }
6644
6645         # Save the names for quick look up
6646         $table_ref{$addr}{$standard_name} = $table;
6647         $table_ref{$addr}{$name} = $table;
6648
6649         # Perhaps we can figure out the type of this property based on the
6650         # fact of adding this match table.  First, string properties don't
6651         # have match tables; second, a binary property can't have 3 match
6652         # tables
6653         if ($type{$addr} == $UNKNOWN) {
6654             $type{$addr} = $NON_STRING;
6655         }
6656         elsif ($type{$addr} == $STRING) {
6657             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
6658             $type{$addr} = $NON_STRING;
6659         }
6660         elsif ($type{$addr} != $ENUM) {
6661             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
6662                 && $type{$addr} == $BINARY)
6663             {
6664                 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
6665                 $type{$addr} = $ENUM;
6666             }
6667         }
6668
6669         return $table;
6670     }
6671
6672     sub table {
6673         # Return a pointer to the match table (with name given by the
6674         # parameter) associated with this property; undef if none.
6675
6676         my $self = shift;
6677         my $name = shift;
6678         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6679
6680         my $addr = main::objaddr $self;
6681
6682         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
6683
6684         # If quick look-up failed, try again using the standard form of the
6685         # input name.  If that succeeds, cache the result before returning so
6686         # won't have to standardize this input name again.
6687         my $standard_name = main::standardize($name);
6688         return unless defined $table_ref{$addr}{$standard_name};
6689
6690         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
6691         return $table_ref{$addr}{$name};
6692     }
6693
6694     sub tables {
6695         # Return a list of pointers to all the match tables attached to this
6696         # property
6697
6698         return main::uniques(values %{$table_ref{main::objaddr shift}});
6699     }
6700
6701     sub directory {
6702         # Returns the directory the map table for this property should be
6703         # output in.  If a specific directory has been specified, that has
6704         # priority;  'undef' is returned if the type isn't defined;
6705         # or $map_directory for everything else.
6706
6707         my $addr = main::objaddr shift;
6708
6709         return $directory{$addr} if defined $directory{$addr};
6710         return undef if $type{$addr} == $UNKNOWN;
6711         return $map_directory;
6712     }
6713
6714     sub swash_name {
6715         # Return the name that is used to both:
6716         #   1)  Name the file that the map table is written to.
6717         #   2)  The name of swash related stuff inside that file.
6718         # The reason for this is that the Perl core historically has used
6719         # certain names that aren't the same as the Unicode property names.
6720         # To continue using these, $file is hard-coded in this file for those,
6721         # but otherwise the standard name is used.  This is different from the
6722         # external_name, so that the rest of the files, like in lib can use
6723         # the standard name always, without regard to historical precedent.
6724
6725         my $self = shift;
6726         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6727
6728         my $addr = main::objaddr $self;
6729
6730         return $file{$addr} if defined $file{$addr};
6731         return $map{$addr}->external_name;
6732     }
6733
6734     sub to_create_match_tables {
6735         # Returns a boolean as to whether or not match tables should be
6736         # created for this property.
6737
6738         my $self = shift;
6739         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6740
6741         # The whole point of this pseudo property is match tables.
6742         return 1 if $self == $perl;
6743
6744         my $addr = main::objaddr $self;
6745
6746         # Don't generate tables of code points that match the property values
6747         # of a string property.  Such a list would most likely have many
6748         # property values, each with just one or very few code points mapping
6749         # to it.
6750         return 0 if $type{$addr} == $STRING;
6751
6752         # Don't generate anything for unimplemented properties.
6753         return 0 if grep { $self->complete_name eq $_ }
6754                                                     @unimplemented_properties;
6755         # Otherwise, do.
6756         return 1;
6757     }
6758
6759     sub property_add_or_replace_non_nulls {
6760         # This adds the mappings in the property $other to $self.  Non-null
6761         # mappings from $other override those in $self.  It essentially merges
6762         # the two properties, with the second having priority except for null
6763         # mappings.
6764
6765         my $self = shift;
6766         my $other = shift;
6767         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6768
6769         if (! $other->isa(__PACKAGE__)) {
6770             Carp::my_carp_bug("$other should be a "
6771                             . __PACKAGE__
6772                             . ".  Not a '"
6773                             . ref($other)
6774                             . "'.  Not added;");
6775             return;
6776         }
6777
6778         return $map{main::objaddr $self}->
6779                 map_add_or_replace_non_nulls($map{main::objaddr $other});
6780     }
6781
6782     sub set_type {
6783         # Set the type of the property.  Mostly this is figured out by the
6784         # data in the table.  But this is used to set it explicitly.  The
6785         # reason it is not a standard accessor is that when setting a binary
6786         # property, we need to make sure that all the true/false aliases are
6787         # present, as they were omitted in early Unicode releases.
6788
6789         my $self = shift;
6790         my $type = shift;
6791         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6792
6793         if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
6794             Carp::my_carp("Unrecognized type '$type'.  Type not set");
6795             return;
6796         }
6797
6798         $type{main::objaddr $self} = $type;
6799         return if $type != $BINARY;
6800
6801         my $yes = $self->table('Y');
6802         $yes = $self->table('Yes') if ! defined $yes;
6803         $yes = $self->add_match_table('Y') if ! defined $yes;
6804         $yes->add_alias('Yes');
6805         $yes->add_alias('T');
6806         $yes->add_alias('True');
6807
6808         my $no = $self->table('N');
6809         $no = $self->table('No') if ! defined $no;
6810         $no = $self->add_match_table('N') if ! defined $no;
6811         $no->add_alias('No');
6812         $no->add_alias('F');
6813         $no->add_alias('False');
6814         return;
6815     }
6816
6817     sub add_map {
6818         # Add a map to the property's map table.  This also keeps
6819         # track of the maps so that the property type can be determined from
6820         # its data.
6821
6822         my $self = shift;
6823         my $start = shift;  # First code point in range
6824         my $end = shift;    # Final code point in range
6825         my $map = shift;    # What the range maps to.
6826         # Rest of parameters passed on.
6827
6828         my $addr = main::objaddr $self;
6829
6830         # If haven't the type of the property, gather information to figure it
6831         # out.
6832         if ($type{$addr} == $UNKNOWN) {
6833
6834             # If the map contains an interior blank or dash, or most other
6835             # nonword characters, it will be a string property.  This
6836             # heuristic may actually miss some string properties.  If so, they
6837             # may need to have explicit set_types called for them.  This
6838             # happens in the Unihan properties.
6839             if ($map =~ / (?<= . ) [ -] (?= . ) /x
6840                 || $map =~ / [^\w.\/\ -]  /x)
6841             {
6842                 $self->set_type($STRING);
6843
6844                 # $unique_maps is used for disambiguating between ENUM and
6845                 # BINARY later; since we know the property is not going to be
6846                 # one of those, no point in keeping the data around
6847                 undef $unique_maps{$addr};
6848             }
6849             else {
6850
6851                 # Not necessarily a string.  The final decision has to be
6852                 # deferred until all the data are in.  We keep track of if all
6853                 # the values are code points for that eventual decision.
6854                 $has_only_code_point_maps{$addr} &=
6855                                             $map =~ / ^ $code_point_re $/x;
6856
6857                 # For the purposes of disambiguating between binary and other
6858                 # enumerations at the end, we keep track of the first three
6859                 # distinct property values.  Once we get to three, we know
6860                 # it's not going to be binary, so no need to track more.
6861                 if (scalar keys %{$unique_maps{$addr}} < 3) {
6862                     $unique_maps{$addr}{main::standardize($map)} = 1;
6863                 }
6864             }
6865         }
6866
6867         # Add the mapping by calling our map table's method
6868         return $map{$addr}->add_map($start, $end, $map, @_);
6869     }
6870
6871     sub compute_type {
6872         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
6873         # should be called after the property is mostly filled with its maps.
6874         # We have been keeping track of what the property values have been,
6875         # and now have the necessary information to figure out the type.
6876
6877         my $self = shift;
6878         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6879
6880         my $addr = main::objaddr($self);
6881
6882         my $type = $type{$addr};
6883
6884         # If already have figured these out, no need to do so again, but we do
6885         # a double check on ENUMS to make sure that a string property hasn't
6886         # improperly been classified as an ENUM, so continue on with those.
6887         return if $type == $STRING || $type == $BINARY;
6888
6889         # If every map is to a code point, is a string property.
6890         if ($type == $UNKNOWN
6891             && ($has_only_code_point_maps{$addr}
6892                 || (defined $map{$addr}->default_map
6893                     && $map{$addr}->default_map eq "")))
6894         {
6895             $self->set_type($STRING);
6896         }
6897         else {
6898
6899             # Otherwise, it is to some sort of enumeration.  (The case where
6900             # it is a Unicode miscellaneous property, and treated like a
6901             # string in this program is handled in add_map()).  Distinguish
6902             # between binary and some other enumeration type.  Of course, if
6903             # there are more than two values, it's not binary.  But more
6904             # subtle is the test that the default mapping is defined means it
6905             # isn't binary.  This in fact may change in the future if Unicode
6906             # changes the way its data is structured.  But so far, no binary
6907             # properties ever have @missing lines for them, so the default map
6908             # isn't defined for them.  The few properties that are two-valued
6909             # and aren't considered binary have the default map defined
6910             # starting in Unicode 5.0, when the @missing lines appeared; and
6911             # this program has special code to put in a default map for them
6912             # for earlier than 5.0 releases.
6913             if ($type == $ENUM
6914                 || scalar keys %{$unique_maps{$addr}} > 2
6915                 || defined $self->default_map)
6916             {
6917                 my $tables = $self->tables;
6918                 my $count = $self->count;
6919                 if ($verbosity && $count > 500 && $tables/$count > .1) {
6920                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
6921                 }
6922                 $self->set_type($ENUM);
6923             }
6924             else {
6925                 $self->set_type($BINARY);
6926             }
6927         }
6928         undef $unique_maps{$addr};  # Garbage collect
6929         return;
6930     }
6931
6932     # Most of the accessors for a property actually apply to its map table.
6933     # Setup up accessor functions for those, referring to %map
6934     for my $sub qw(
6935                     add_alias
6936                     add_anomalous_entry
6937                     add_comment
6938                     add_conflicting
6939                     add_description
6940                     add_duplicate
6941                     add_note
6942                     aliases
6943                     comment
6944                     complete_name
6945                     core_access
6946                     count
6947                     default_map
6948                     delete_range
6949                     description
6950                     each_range
6951                     external_name
6952                     file_path
6953                     format
6954                     initialize
6955                     inverse_list
6956                     is_empty
6957                     name
6958                     note
6959                     perl_extension
6960                     property
6961                     range_count
6962                     ranges
6963                     range_size_1
6964                     reset_each_range
6965                     set_comment
6966                     set_core_access
6967                     set_default_map
6968                     set_file_path
6969                     set_final_comment
6970                     set_range_size_1
6971                     set_status
6972                     set_to_output_map
6973                     short_name
6974                     status
6975                     status_info
6976                     to_output_map
6977                     value_of
6978                     write
6979                 )
6980                     # 'property' above is for symmetry, so that one can take
6981                     # the property of a property and get itself, and so don't
6982                     # have to distinguish between properties and tables in
6983                     # calling code
6984     {
6985         no strict "refs";
6986         *$sub = sub {
6987             use strict "refs";
6988             my $self = shift;
6989             return $map{main::objaddr $self}->$sub(@_);
6990         }
6991     }
6992
6993
6994 } # End closure
6995
6996 package main;
6997
6998 sub join_lines($) {
6999     # Returns lines of the input joined together, so that they can be folded
7000     # properly.
7001     # This causes continuation lines to be joined together into one long line
7002     # for folding.  A continuation line is any line that doesn't begin with a
7003     # space or "\b" (the latter is stripped from the output).  This is so
7004     # lines can be be in a HERE document so as to fit nicely in the terminal
7005     # width, but be joined together in one long line, and then folded with
7006     # indents, '#' prefixes, etc, properly handled.
7007     # A blank separates the joined lines except if there is a break; an extra
7008     # blank is inserted after a period ending a line.
7009
7010     # Intialize the return with the first line.
7011     my ($return, @lines) = split "\n", shift;
7012
7013     # If the first line is null, it was an empty line, add the \n back in
7014     $return = "\n" if $return eq "";
7015
7016     # Now join the remainder of the physical lines.
7017     for my $line (@lines) {
7018
7019         # An empty line means wanted a blank line, so add two \n's to get that
7020         # effect, and go to the next line.
7021         if (length $line == 0) {
7022             $return .= "\n\n";
7023             next;
7024         }
7025
7026         # Look at the last character of what we have so far.
7027         my $previous_char = substr($return, -1, 1);
7028
7029         # And at the next char to be output.
7030         my $next_char = substr($line, 0, 1);
7031
7032         if ($previous_char ne "\n") {
7033
7034             # Here didn't end wth a nl.  If the next char a blank or \b, it
7035             # means that here there is a break anyway.  So add a nl to the
7036             # output.
7037             if ($next_char eq " " || $next_char eq "\b") {
7038                 $previous_char = "\n";
7039                 $return .= $previous_char;
7040             }
7041
7042             # Add an extra space after periods.
7043             $return .= " " if $previous_char eq '.';
7044         }
7045
7046         # Here $previous_char is still the latest character to be output.  If
7047         # it isn't a nl, it means that the next line is to be a continuation
7048         # line, with a blank inserted between them.
7049         $return .= " " if $previous_char ne "\n";
7050
7051         # Get rid of any \b
7052         substr($line, 0, 1) = "" if $next_char eq "\b";
7053
7054         # And append this next line.
7055         $return .= $line;
7056     }
7057
7058     return $return;
7059 }
7060
7061 sub simple_fold($;$$$) {
7062     # Returns a string of the input (string or an array of strings) folded
7063     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7064     # a \n
7065     # This is tailored for the kind of text written by this program,
7066     # especially the pod file, which can have very long names with
7067     # underscores in the middle, or words like AbcDefgHij....  We allow
7068     # breaking in the middle of such constructs if the line won't fit
7069     # otherwise.  The break in such cases will come either just after an
7070     # underscore, or just before one of the Capital letters.
7071
7072     local $to_trace = 0 if main::DEBUG;
7073
7074     my $line = shift;
7075     my $prefix = shift;     # Optional string to prepend to each output
7076                             # line
7077     $prefix = "" unless defined $prefix;
7078
7079     my $hanging_indent = shift; # Optional number of spaces to indent
7080                                 # continuation lines
7081     $hanging_indent = 0 unless $hanging_indent;
7082
7083     my $right_margin = shift;   # Optional number of spaces to narrow the
7084                                 # total width by.
7085     $right_margin = 0 unless defined $right_margin;
7086
7087     # Call carp with the 'nofold' option to avoid it from trying to call us
7088     # recursively
7089     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7090
7091     # The space available doesn't include what's automatically prepended
7092     # to each line, or what's reserved on the right.
7093     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7094     # XXX Instead of using the 'nofold' perhaps better to look up the stack
7095
7096     if (DEBUG && $hanging_indent >= $max) {
7097         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
7098         $hanging_indent = 0;
7099     }
7100
7101     # First, split into the current physical lines.
7102     my @line;
7103     if (ref $line) {        # Better be an array, because not bothering to
7104                             # test
7105         foreach my $line (@{$line}) {
7106             push @line, split /\n/, $line;
7107         }
7108     }
7109     else {
7110         @line = split /\n/, $line;
7111     }
7112
7113     #local $to_trace = 1 if main::DEBUG;
7114     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7115
7116     # Look at each current physical line.
7117     for (my $i = 0; $i < @line; $i++) {
7118         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7119         #local $to_trace = 1 if main::DEBUG;
7120         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7121
7122         # Remove prefix, because will be added back anyway, don't want
7123         # doubled prefix
7124         $line[$i] =~ s/^$prefix//;
7125
7126         # Remove trailing space
7127         $line[$i] =~ s/\s+\Z//;
7128
7129         # If the line is too long, fold it.
7130         if (length $line[$i] > $max) {
7131             my $remainder;
7132
7133             # Here needs to fold.  Save the leading space in the line for
7134             # later.
7135             $line[$i] =~ /^ ( \s* )/x;
7136             my $leading_space = $1;
7137             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7138
7139             # If character at final permissible position is white space,
7140             # fold there, which will delete that white space
7141             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7142                 $remainder = substr($line[$i], $max);
7143                 $line[$i] = substr($line[$i], 0, $max - 1);
7144             }
7145             else {
7146
7147                 # Otherwise fold at an acceptable break char closest to
7148                 # the max length.  Look at just the maximal initial
7149                 # segment of the line
7150                 my $segment = substr($line[$i], 0, $max - 1);
7151                 if ($segment =~
7152                     /^ ( .{$hanging_indent}   # Don't look before the
7153                                               #  indent.
7154                         \ *                   # Don't look in leading
7155                                               #  blanks past the indent
7156                             [^ ] .*           # Find the right-most
7157                         (?:                   #  acceptable break:
7158                             [ \s = ]          # space or equal
7159                             | - (?! [.0-9] )  # or non-unary minus.
7160                         )                     # $1 includes the character
7161                     )/x)
7162                 {
7163                     # Split into the initial part that fits, and remaining
7164                     # part of the input
7165                     $remainder = substr($line[$i], length $1);
7166                     $line[$i] = $1;
7167                     trace $line[$i] if DEBUG && $to_trace;
7168                     trace $remainder if DEBUG && $to_trace;
7169                 }
7170
7171                 # If didn't find a good breaking spot, see if there is a
7172                 # not-so-good breaking spot.  These are just after
7173                 # underscores or where the case changes from lower to
7174                 # upper.  Use \a as a soft hyphen, but give up
7175                 # and don't break the line if there is actually a \a
7176                 # already in the input.  We use an ascii character for the
7177                 # soft-hyphen to avoid any attempt by miniperl to try to
7178                 # access the files that this program is creating.
7179                 elsif ($segment !~ /\a/
7180                        && ($segment =~ s/_/_\a/g
7181                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7182                 {
7183                     # Here were able to find at least one place to insert
7184                     # our substitute soft hyphen.  Find the right-most one
7185                     # and replace it by a real hyphen.
7186                     trace $segment if DEBUG && $to_trace;
7187                     substr($segment,
7188                             rindex($segment, "\a"),
7189                             1) = '-';
7190
7191                     # Then remove the soft hyphen substitutes.
7192                     $segment =~ s/\a//g;
7193                     trace $segment if DEBUG && $to_trace;
7194
7195                     # And split into the initial part that fits, and
7196                     # remainder of the line
7197                     my $pos = rindex($segment, '-');
7198                     $remainder = substr($line[$i], $pos);
7199                     trace $remainder if DEBUG && $to_trace;
7200                     $line[$i] = substr($segment, 0, $pos + 1);
7201                 }
7202             }
7203
7204             # Here we know if we can fold or not.  If we can, $remainder
7205             # is what remains to be processed in the next iteration.
7206             if (defined $remainder) {
7207                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7208
7209                 # Insert the folded remainder of the line as a new element
7210                 # of the array.  (It may still be too long, but we will
7211                 # deal with that next time through the loop.)  Omit any
7212                 # leading space in the remainder.
7213                 $remainder =~ s/^\s+//;
7214                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7215
7216                 # But then indent by whichever is larger of:
7217                 # 1) the leading space on the input line;
7218                 # 2) the hanging indent.
7219                 # This preserves indentation in the original line.
7220                 my $lead = ($leading_space)
7221                             ? length $leading_space
7222                             : $hanging_indent;
7223                 $lead = max($lead, $hanging_indent);
7224                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7225             }
7226         }
7227
7228         # Ready to output the line. Get rid of any trailing space
7229         # And prefix by the required $prefix passed in.
7230         $line[$i] =~ s/\s+$//;
7231         $line[$i] = "$prefix$line[$i]\n";
7232     } # End of looping through all the lines.
7233
7234     return join "", @line;
7235 }
7236
7237 sub property_ref {  # Returns a reference to a property object.
7238     return Property::property_ref(@_);
7239 }
7240
7241 sub force_unlink ($) {
7242     my $filename = shift;
7243     return unless file_exists($filename);
7244     return if CORE::unlink($filename);
7245
7246     # We might need write permission
7247     chmod 0777, $filename;
7248     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
7249     return;
7250 }
7251
7252 sub write ($\@) {
7253     # Given a filename and a reference to an array of lines, write the lines
7254     # to the file
7255     # Filename can be given as an arrayref of directory names
7256
7257     my $file  = shift;
7258     my $lines_ref = shift;
7259     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7260
7261     if (! defined $lines_ref) {
7262         Carp::my_carp("Missing lines to write parameter for $file.  Writing skipped;");
7263         return;
7264     }
7265
7266     # Get into a single string if an array, and get rid of, in Unix terms, any
7267     # leading '.'
7268     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7269     $file = File::Spec->canonpath($file);
7270
7271     # If has directories, make sure that they all exist
7272     (undef, my $directories, undef) = File::Spec->splitpath($file);
7273     File::Path::mkpath($directories) if $directories && ! -d $directories;
7274
7275     push @files_actually_output, $file;
7276
7277     my $text;
7278     if (@$lines_ref) {
7279         $text = join "", @$lines_ref;
7280     }
7281     else {
7282         $text = "";
7283         Carp::my_carp("Output file '$file' is empty; writing it anyway;");
7284     }
7285
7286     force_unlink ($file);
7287
7288     my $OUT;
7289     if (not open $OUT, ">", $file) {
7290         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
7291         return;
7292     }
7293     print "$file written.\n" if $verbosity >= $VERBOSE;
7294
7295     print $OUT $text;
7296     close $OUT;
7297     return;
7298 }
7299
7300
7301 sub Standardize($) {
7302     # This converts the input name string into a standardized equivalent to
7303     # use internally.
7304
7305     my $name = shift;
7306     unless (defined $name) {
7307       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
7308       return;
7309     }
7310
7311     # Remove any leading or trailing white space
7312     $name =~ s/^\s+//g;
7313     $name =~ s/\s+$//g;
7314
7315     # Convert interior white space and hypens into underscores.
7316     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7317
7318     # Capitalize the letter following an underscore, and convert a sequence of
7319     # multiple underscores to a single one
7320     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7321
7322     # And capitalize the first letter, but not for the special cjk ones.
7323     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7324     return $name;
7325 }
7326
7327 sub standardize ($) {
7328     # Returns a lower-cased standardized name, without underscores.  This form
7329     # is chosen so that it can distinguish between any real versus superficial
7330     # Unicode name differences.  It relies on the fact that Unicode doesn't
7331     # have interior underscores, white space, nor dashes in any
7332     # stricter-matched name.  It should not be used on Unicode code point
7333     # names (the Name property), as they mostly, but not always follow these
7334     # rules.
7335
7336     my $name = Standardize(shift);
7337     return if !defined $name;
7338
7339     $name =~ s/ (?<= .) _ (?= . ) //xg;
7340     return lc $name;
7341 }
7342
7343 {   # Closure
7344
7345     my $indent_increment = " " x 2;
7346     my %already_output;
7347
7348     $main::simple_dumper_nesting = 0;
7349
7350     sub simple_dumper {
7351         # Like Simple Data::Dumper. Good enough for our needs. We can't use
7352         # the real thing as we have to run under miniperl.
7353
7354         # It is designed so that on input it is at the beginning of a line,
7355         # and the final thing output in any call is a trailing ",\n".
7356
7357         my $item = shift;
7358         my $indent = shift;
7359         $indent = "" if ! defined $indent;
7360
7361         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7362
7363         # nesting level is localized, so that as the call stack pops, it goes
7364         # back to the prior value.
7365         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
7366         undef %already_output if $main::simple_dumper_nesting == 0;
7367         $main::simple_dumper_nesting++;
7368         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
7369
7370         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7371
7372         # Determine the indent for recursive calls.
7373         my $next_indent = $indent . $indent_increment;
7374
7375         my $output;
7376         if (! ref $item) {
7377
7378             # Dump of scalar: just output it in quotes if not a number.  To do
7379             # so we must escape certain characters, and therefore need to
7380             # operate on a copy to avoid changing the original
7381             my $copy = $item;
7382             $copy = $UNDEF unless defined $copy;
7383
7384             # Quote non-numbers (numbers also have optional leading '-' and
7385             # fractions)
7386             if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
7387
7388                 # Escape apostrophe and backslash
7389                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
7390                 $copy = "'$copy'";
7391             }
7392             $output = "$indent$copy,\n";
7393         }
7394         else {
7395
7396             # Keep track of cycles in the input, and refuse to infinitely loop
7397             if (defined $already_output{main::objaddr $item}) {
7398                 return "${indent}ALREADY OUTPUT: $item\n";
7399             }
7400             $already_output{main::objaddr $item} = $item;
7401
7402             if (ref $item eq 'ARRAY') {
7403                 my $using_brackets;
7404                 $output = $indent;
7405                 if ($main::simple_dumper_nesting > 1) {
7406                     $output .= '[';
7407                     $using_brackets = 1;
7408                 }
7409                 else {
7410                     $using_brackets = 0;
7411                 }
7412
7413                 # If the array is empty, put the closing bracket on the same
7414                 # line.  Otherwise, recursively add each array element
7415                 if (@$item == 0) {
7416                     $output .= " ";
7417                 }
7418                 else {
7419                     $output .= "\n";
7420                     for (my $i = 0; $i < @$item; $i++) {
7421
7422                         # Indent array elements one level
7423                         $output .= &simple_dumper($item->[$i], $next_indent);
7424                         $output =~ s/\n$//;      # Remove trailing nl so as to
7425                         $output .= " # [$i]\n";  # add a comment giving the
7426                                                  # array index
7427                     }
7428                     $output .= $indent;     # Indent closing ']' to orig level
7429                 }
7430                 $output .= ']' if $using_brackets;
7431                 $output .= ",\n";
7432             }
7433             elsif (ref $item eq 'HASH') {
7434                 my $is_first_line;
7435                 my $using_braces;
7436                 my $body_indent;
7437
7438                 # No surrounding braces at top level
7439                 $output .= $indent;
7440                 if ($main::simple_dumper_nesting > 1) {
7441                     $output .= "{\n";
7442                     $is_first_line = 0;
7443                     $body_indent = $next_indent;
7444                     $next_indent .= $indent_increment;
7445                     $using_braces = 1;
7446                 }
7447                 else {
7448                     $is_first_line = 1;
7449                     $body_indent = $indent;
7450                     $using_braces = 0;
7451                 }
7452
7453                 # Output hashes sorted alphabetically instead of apparently
7454                 # random.  Use caseless alphabetic sort
7455                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
7456                 {
7457                     if ($is_first_line) {
7458                         $is_first_line = 0;
7459                     }
7460                     else {
7461                         $output .= "$body_indent";
7462                     }
7463
7464                     # The key must be a scalar, but this recursive call quotes
7465                     # it
7466                     $output .= &simple_dumper($key);
7467
7468                     # And change the trailing comma and nl to the hash fat
7469                     # comma for clarity, and so the value can be on the same
7470                     # line
7471                     $output =~ s/,\n$/ => /;
7472
7473                     # Recursively call to get the value's dump.
7474                     my $next = &simple_dumper($item->{$key}, $next_indent);
7475
7476                     # If the value is all on one line, remove its indent, so
7477                     # will follow the => immediately.  If it takes more than
7478                     # one line, start it on a new line.
7479                     if ($next !~ /\n.*\n/) {
7480                         $next =~ s/^ *//;
7481                     }
7482                     else {
7483                         $output .= "\n";
7484                     }
7485                     $output .= $next;
7486                 }
7487
7488                 $output .= "$indent},\n" if $using_braces;
7489             }
7490             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
7491                 $output = $indent . ref($item) . "\n";
7492                 # XXX see if blessed
7493             }
7494             elsif ($item->can('dump')) {
7495
7496                 # By convention in this program, objects furnish a 'dump'
7497                 # method.  Since not doing any output at this level, just pass
7498                 # on the input indent
7499                 $output = $item->dump($indent);
7500             }
7501             else {
7502                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
7503             }
7504         }
7505         return $output;
7506     }
7507 }
7508
7509 sub dump_inside_out {
7510     # Dump inside-out hashes in an object's state by converting them to a
7511     # regular hash and then calling simple_dumper on that.
7512
7513     my $object = shift;
7514     my $fields_ref = shift;
7515     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7516
7517     my $addr = main::objaddr $object;
7518
7519     my %hash;
7520     foreach my $key (keys %$fields_ref) {
7521         $hash{$key} = $fields_ref->{$key}{$addr};
7522     }
7523
7524     return simple_dumper(\%hash, @_);
7525 }
7526
7527 sub _operator_dot {
7528     # Overloaded '.' method that is common to all packages.  It uses the
7529     # package's stringify method.
7530
7531     my $self = shift;
7532     my $other = shift;
7533     my $reversed = shift;
7534     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7535
7536     $other = "" unless defined $other;
7537
7538     foreach my $which (\$self, \$other) {
7539         next unless ref $$which;
7540         if ($$which->can('_operator_stringify')) {
7541             $$which = $$which->_operator_stringify;
7542         }
7543         else {
7544             my $ref = ref $$which;
7545             my $addr = main::objaddr $$which;
7546             $$which = "$ref ($addr)";
7547         }
7548     }
7549     return ($reversed)
7550             ? "$other$self"
7551             : "$self$other";
7552 }
7553
7554 sub _operator_equal {
7555     # Generic overloaded '==' routine.  To be equal, they must be the exact
7556     # same object
7557
7558     my $self = shift;
7559     my $other = shift;
7560
7561     return 0 unless defined $other;
7562     return 0 unless ref $other;
7563     return main::objaddr $self == main::objaddr $other;
7564 }
7565
7566 sub _operator_not_equal {
7567     my $self = shift;
7568     my $other = shift;
7569
7570     return ! _operator_equal($self, $other);
7571 }
7572
7573 sub process_PropertyAliases($) {
7574     # This reads in the PropertyAliases.txt file, which contains almost all
7575     # the character properties in Unicode and their equivalent aliases:
7576     # scf       ; Simple_Case_Folding         ; sfc
7577     #
7578     # Field 0 is the preferred short name for the property.
7579     # Field 1 is the full name.
7580     # Any succeeding ones are other accepted names.
7581
7582     my $file= shift;
7583     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7584
7585     # This whole file was non-existent in early releases, so use our own
7586     # internal one.
7587     $file->insert_lines(get_old_property_aliases())
7588                                                 if ! -e 'PropertyAliases.txt';
7589
7590     # Add any cjk properties that may have been defined.
7591     $file->insert_lines(@cjk_properties);
7592
7593     while ($file->next_line) {
7594
7595         my @data = split /\s*;\s*/;
7596
7597         my $full = $data[1];
7598
7599         my $this = Property->new($data[0], Full_Name => $full);
7600
7601         # Start looking for more aliases after these two.
7602         for my $i (2 .. @data - 1) {
7603             $this->add_alias($data[$i]);
7604         }
7605
7606     }
7607     return;
7608 }
7609
7610 sub finish_property_setup {
7611     # Finishes setting up after PropertyAliases.
7612
7613     my $file = shift;
7614     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7615
7616     # This entry was missing from this file in earlier Unicode versions
7617     if (-e 'Jamo.txt') {
7618         my $jsn = property_ref('JSN');
7619         if (! defined $jsn) {
7620             $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
7621         }
7622     }
7623
7624     # This entry is still missing as of 5.2, perhaps because no short name for
7625     # it.
7626     if (-e 'NameAliases.txt') {
7627         my $aliases = property_ref('Name_Alias');
7628         if (! defined $aliases) {
7629             $aliases = Property->new('Name_Alias');
7630         }
7631     }
7632
7633     # These are used so much, that we set globals for them.
7634     $gc = property_ref('General_Category');
7635     $block = property_ref('Block');
7636
7637     # Perl adds this alias.
7638     $gc->add_alias('Category');
7639
7640     # For backwards compatibility, these property files have particular names.
7641     my $upper = property_ref('Uppercase_Mapping');
7642     $upper->set_core_access('uc()');
7643     $upper->set_file('Upper'); # This is what utf8.c calls it
7644
7645     my $lower = property_ref('Lowercase_Mapping');
7646     $lower->set_core_access('lc()');
7647     $lower->set_file('Lower');
7648
7649     my $title = property_ref('Titlecase_Mapping');
7650     $title->set_core_access('ucfirst()');
7651     $title->set_file('Title');
7652
7653     my $fold = property_ref('Case_Folding');
7654     $fold->set_file('Fold') if defined $fold;
7655
7656     # utf8.c can't currently cope with non range-size-1 for these, and even if
7657     # it were changed to do so, someone else may be using them, expecting the
7658     # old style
7659     foreach my $property (qw {
7660                                 Case_Folding
7661                                 Lowercase_Mapping
7662                                 Titlecase_Mapping
7663                                 Uppercase_Mapping
7664                             })
7665     {
7666         property_ref($property)->set_range_size_1(1);
7667     }
7668
7669     # These two properties aren't actually used in the core, but unfortunately
7670     # the names just above that are in the core interfere with these, so
7671     # choose different names.  These aren't a problem unless the map tables
7672     # for these files get written out.
7673     my $lowercase = property_ref('Lowercase');
7674     $lowercase->set_file('IsLower') if defined $lowercase;
7675     my $uppercase = property_ref('Uppercase');
7676     $uppercase->set_file('IsUpper') if defined $uppercase;
7677
7678     # Set up the hard-coded default mappings, but only on properties defined
7679     # for this release
7680     foreach my $property (keys %default_mapping) {
7681         my $property_object = property_ref($property);
7682         next if ! defined $property_object;
7683         my $default_map = $default_mapping{$property};
7684         $property_object->set_default_map($default_map);
7685
7686         # A map of <code point> implies the property is string.
7687         if ($property_object->type == $UNKNOWN
7688             && $default_map eq $CODE_POINT)
7689         {
7690             $property_object->set_type($STRING);
7691         }
7692     }
7693
7694     # The following use the Multi_Default class to create objects for
7695     # defaults.
7696
7697     # Bidi class has a complicated default, but the derived file takes care of
7698     # the complications, leaving just 'L'.
7699     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
7700         property_ref('Bidi_Class')->set_default_map('L');
7701     }
7702     else {
7703         my $default;
7704
7705         # The derived file was introduced in 3.1.1.  The values below are
7706         # taken from table 3-8, TUS 3.0
7707         my $default_R =
7708             'my $default = Range_List->new;
7709              $default->add_range(0x0590, 0x05FF);
7710              $default->add_range(0xFB1D, 0xFB4F);'
7711         ;
7712
7713         # The defaults apply only to unassigned characters
7714         $default_R .= '$gc->table("Cn") & $default;';
7715
7716         if ($v_version lt v3.0.0) {
7717             $default = Multi_Default->new(R => $default_R, 'L');
7718         }
7719         else {
7720
7721             # AL apparently not introduced until 3.0:  TUS 2.x references are
7722             # not on-line to check it out
7723             my $default_AL =
7724                 'my $default = Range_List->new;
7725                  $default->add_range(0x0600, 0x07BF);
7726                  $default->add_range(0xFB50, 0xFDFF);
7727                  $default->add_range(0xFE70, 0xFEFF);'
7728             ;
7729
7730             # Non-character code points introduced in this release; aren't AL
7731             if ($v_version ge 3.1.0) {
7732                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
7733             }
7734             $default_AL .= '$gc->table("Cn") & $default';
7735             $default = Multi_Default->new(AL => $default_AL,
7736                                           R => $default_R,
7737                                           'L');
7738         }
7739         property_ref('Bidi_Class')->set_default_map($default);
7740     }
7741
7742     # Joining type has a complicated default, but the derived file takes care
7743     # of the complications, leaving just 'U' (or Non_Joining), except the file
7744     # is bad in 3.1.0
7745     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
7746         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
7747             property_ref('Joining_Type')->set_default_map('Non_Joining');
7748         }
7749         else {
7750
7751             # Otherwise, there are not one, but two possibilities for the
7752             # missing defaults: T and U.
7753             # The missing defaults that evaluate to T are given by:
7754             # T = Mn + Cf - ZWNJ - ZWJ
7755             # where Mn and Cf are the general category values. In other words,
7756             # any non-spacing mark or any format control character, except
7757             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
7758             # WIDTH JOINER (joining type C).
7759             my $default = Multi_Default->new(
7760                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
7761                'Non_Joining');
7762             property_ref('Joining_Type')->set_default_map($default);
7763         }
7764     }
7765
7766     # Line break has a complicated default in early releases. It is 'Unknown'
7767     # for non-assigned code points; 'AL' for assigned.
7768     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
7769         my $lb = property_ref('Line_Break');
7770         if ($v_version gt 3.2.0) {
7771             $lb->set_default_map('Unknown');
7772         }
7773         else {
7774             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
7775                                               'AL');
7776             $lb->set_default_map($default);
7777         }
7778
7779         # If has the URS property, make sure that the standard aliases are in
7780         # it, since not in the input tables in some versions.
7781         my $urs = property_ref('Unicode_Radical_Stroke');
7782         if (defined $urs) {
7783             $urs->add_alias('cjkRSUnicode');
7784             $urs->add_alias('kRSUnicode');
7785         }
7786     }
7787     return;
7788 }
7789
7790 sub get_old_property_aliases() {
7791     # Returns what would be in PropertyAliases.txt if it existed in very old
7792     # versions of Unicode.  It was derived from the one in 3.2, and pared
7793     # down based on the data that was actually in the older releases.
7794     # An attempt was made to use the existence of files to mean inclusion or
7795     # not of various aliases, but if this was not sufficient, using version
7796     # numbers was resorted to.
7797
7798     my @return;
7799
7800     # These are to be used in all versions (though some are constructed by
7801     # this program if missing)
7802     push @return, split /\n/, <<'END';
7803 bc        ; Bidi_Class
7804 Bidi_M    ; Bidi_Mirrored
7805 cf        ; Case_Folding
7806 ccc       ; Canonical_Combining_Class
7807 dm        ; Decomposition_Mapping
7808 dt        ; Decomposition_Type
7809 gc        ; General_Category
7810 isc       ; ISO_Comment
7811 lc        ; Lowercase_Mapping
7812 na        ; Name
7813 na1       ; Unicode_1_Name
7814 nt        ; Numeric_Type
7815 nv        ; Numeric_Value
7816 sfc       ; Simple_Case_Folding
7817 slc       ; Simple_Lowercase_Mapping
7818 stc       ; Simple_Titlecase_Mapping
7819 suc       ; Simple_Uppercase_Mapping
7820 tc        ; Titlecase_Mapping
7821 uc        ; Uppercase_Mapping
7822 END
7823
7824     if (-e 'Blocks.txt') {
7825         push @return, "blk       ; Block\n";
7826     }
7827     if (-e 'ArabicShaping.txt') {
7828         push @return, split /\n/, <<'END';
7829 jg        ; Joining_Group
7830 jt        ; Joining_Type
7831 END
7832     }
7833     if (-e 'PropList.txt') {
7834
7835         # This first set is in the original old-style proplist.
7836         push @return, split /\n/, <<'END';
7837 Alpha     ; Alphabetic
7838 Bidi_C    ; Bidi_Control
7839 Dash      ; Dash
7840 Dia       ; Diacritic
7841 Ext       ; Extender
7842 Hex       ; Hex_Digit
7843 Hyphen    ; Hyphen
7844 IDC       ; ID_Continue
7845 Ideo      ; Ideographic
7846 Join_C    ; Join_Control
7847 Math      ; Math
7848 QMark     ; Quotation_Mark
7849 Term      ; Terminal_Punctuation
7850 WSpace    ; White_Space
7851 END
7852         # The next sets were added later
7853         if ($v_version ge v3.0.0) {
7854             push @return, split /\n/, <<'END';
7855 Upper     ; Uppercase
7856 Lower     ; Lowercase
7857 END
7858         }
7859         if ($v_version ge v3.0.1) {
7860             push @return, split /\n/, <<'END';
7861 NChar     ; Noncharacter_Code_Point
7862 END
7863         }
7864         # The next sets were added in the new-style
7865         if ($v_version ge v3.1.0) {
7866             push @return, split /\n/, <<'END';
7867 OAlpha    ; Other_Alphabetic
7868 OLower    ; Other_Lowercase
7869 OMath     ; Other_Math
7870 OUpper    ; Other_Uppercase
7871 END
7872         }
7873         if ($v_version ge v3.1.1) {
7874             push @return, "AHex      ; ASCII_Hex_Digit\n";
7875         }
7876     }
7877     if (-e 'EastAsianWidth.txt') {
7878         push @return, "ea        ; East_Asian_Width\n";
7879     }
7880     if (-e 'CompositionExclusions.txt') {
7881         push @return, "CE        ; Composition_Exclusion\n";
7882     }
7883     if (-e 'LineBreak.txt') {
7884         push @return, "lb        ; Line_Break\n";
7885     }
7886     if (-e 'BidiMirroring.txt') {
7887         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
7888     }
7889     if (-e 'Scripts.txt') {
7890         push @return, "sc        ; Script\n";
7891     }
7892     if (-e 'DNormalizationProps.txt') {
7893         push @return, split /\n/, <<'END';
7894 Comp_Ex   ; Full_Composition_Exclusion
7895 FC_NFKC   ; FC_NFKC_Closure
7896 NFC_QC    ; NFC_Quick_Check
7897 NFD_QC    ; NFD_Quick_Check
7898 NFKC_QC   ; NFKC_Quick_Check
7899 NFKD_QC   ; NFKD_Quick_Check
7900 XO_NFC    ; Expands_On_NFC
7901 XO_NFD    ; Expands_On_NFD
7902 XO_NFKC   ; Expands_On_NFKC
7903 XO_NFKD   ; Expands_On_NFKD
7904 END
7905     }
7906     if (-e 'DCoreProperties.txt') {
7907         push @return, split /\n/, <<'END';
7908 IDS       ; ID_Start
7909 XIDC      ; XID_Continue
7910 XIDS      ; XID_Start
7911 END
7912         # These can also appear in some versions of PropList.txt
7913         push @return, "Lower     ; Lowercase\n"
7914                                     unless grep { $_ =~ /^Lower\b/} @return;
7915         push @return, "Upper     ; Uppercase\n"
7916                                     unless grep { $_ =~ /^Upper\b/} @return;
7917     }
7918
7919     # This flag requires the DAge.txt file to be copied into the directory.
7920     if (DEBUG && $compare_versions) {
7921         push @return, 'age       ; Age';
7922     }
7923
7924     return @return;
7925 }
7926
7927 sub process_PropValueAliases {
7928     # This file contains values that properties look like:
7929     # bc ; AL        ; Arabic_Letter
7930     # blk; n/a       ; Greek_And_Coptic                 ; Greek
7931     #
7932     # Field 0 is the property.
7933     # Field 1 is the short name of a property value or 'n/a' if no
7934     #                short name exists;
7935     # Field 2 is the full property value name;
7936     # Any other fields are more synonyms for the property value.
7937     # Purely numeric property values are omitted from the file; as are some
7938     # others, fewer and fewer in later releases
7939
7940     # Entries for the ccc property have an extra field before the
7941     # abbreviation:
7942     # ccc;   0; NR   ; Not_Reordered
7943     # It is the numeric value that the names are synonyms for.
7944
7945     # There are comment entries for values missing from this file:
7946     # # @missing: 0000..10FFFF; ISO_Comment; <none>
7947     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
7948
7949     my $file= shift;
7950     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7951
7952     # This whole file was non-existent in early releases, so use our own
7953     # internal one if necessary.
7954     if (! -e 'PropValueAliases.txt') {
7955         $file->insert_lines(get_old_property_value_aliases());
7956     }
7957
7958     # Add any explicit cjk values
7959     $file->insert_lines(@cjk_property_values);
7960
7961     # This line is used only for testing the code that checks for name
7962     # conflicts.  There is a script Inherited, and when this line is executed
7963     # it causes there to be a name conflict with the 'Inherited' that this
7964     # program generates for this block property value
7965     #$file->insert_lines('blk; n/a; Herited');
7966
7967
7968     # Process each line of the file ...
7969     while ($file->next_line) {
7970
7971         my ($property, @data) = split /\s*;\s*/;
7972
7973         # The full name for the ccc property value is in field 2 of the
7974         # remaining ones; field 1 for all other properties.  Swap ccc fields 1
7975         # and 2.  (Rightmost splice removes field 2, returning it; left splice
7976         # inserts that into field 1, thus shifting former field 1 to field 2.)
7977         splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
7978
7979         # If there is no short name, use the full one in element 1
7980         $data[0] = $data[1] if $data[0] eq "n/a";
7981
7982         # Earlier releases had the pseudo property 'qc' that should expand to
7983         # the ones that replace it below.
7984         if ($property eq 'qc') {
7985             if (lc $data[0] eq 'y') {
7986                 $file->insert_lines('NFC_QC; Y      ; Yes',
7987                                     'NFD_QC; Y      ; Yes',
7988                                     'NFKC_QC; Y     ; Yes',
7989                                     'NFKD_QC; Y     ; Yes',
7990                                     );
7991             }
7992             elsif (lc $data[0] eq 'n') {
7993                 $file->insert_lines('NFC_QC; N      ; No',
7994                                     'NFD_QC; N      ; No',
7995                                     'NFKC_QC; N     ; No',
7996                                     'NFKD_QC; N     ; No',
7997                                     );
7998             }
7999             elsif (lc $data[0] eq 'm') {
8000                 $file->insert_lines('NFC_QC; M      ; Maybe',
8001                                     'NFKC_QC; M     ; Maybe',
8002                                     );
8003             }
8004             else {
8005                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8006             }
8007             next;
8008         }
8009
8010         # The first field is the short name, 2nd is the full one.
8011         my $property_object = property_ref($property);
8012         my $table = $property_object->add_match_table($data[0],
8013                                                 Full_Name => $data[1]);
8014
8015         # Start looking for more aliases after these two.
8016         for my $i (2 .. @data - 1) {
8017             $table->add_alias($data[$i]);
8018         }
8019     } # End of looping through the file
8020
8021     # As noted in the comments early in the program, it generates tables for
8022     # the default values for all releases, even those for which the concept
8023     # didn't exist at the time.  Here we add those if missing.
8024     my $age = property_ref('age');
8025     if (defined $age && ! defined $age->table('Unassigned')) {
8026         $age->add_match_table('Unassigned');
8027     }
8028     $block->add_match_table('No_Block') if -e 'Blocks.txt'
8029                                     && ! defined $block->table('No_Block');
8030
8031
8032     # Now set the default mappings of the properties from the file.  This is
8033     # done after the loop because a number of properties have only @missings
8034     # entries in the file, and may not show up until the end.
8035     my @defaults = $file->get_missings;
8036     foreach my $default_ref (@defaults) {
8037         my $default = $default_ref->[0];
8038         my $property = property_ref($default_ref->[1]);
8039         $property->set_default_map($default);
8040     }
8041     return;
8042 }
8043
8044 sub get_old_property_value_aliases () {
8045     # Returns what would be in PropValueAliases.txt if it existed in very old
8046     # versions of Unicode.  It was derived from the one in 3.2, and pared
8047     # down.  An attempt was made to use the existence of files to mean
8048     # inclusion or not of various aliases, but if this was not sufficient,
8049     # using version numbers was resorted to.
8050
8051     my @return = split /\n/, <<'END';
8052 bc ; AN        ; Arabic_Number
8053 bc ; B         ; Paragraph_Separator
8054 bc ; CS        ; Common_Separator
8055 bc ; EN        ; European_Number
8056 bc ; ES        ; European_Separator
8057 bc ; ET        ; European_Terminator
8058 bc ; L         ; Left_To_Right
8059 bc ; ON        ; Other_Neutral
8060 bc ; R         ; Right_To_Left
8061 bc ; WS        ; White_Space
8062
8063 # The standard combining classes are very much different in v1, so only use
8064 # ones that look right (not checked thoroughly)
8065 ccc;   0; NR   ; Not_Reordered
8066 ccc;   1; OV   ; Overlay
8067 ccc;   7; NK   ; Nukta
8068 ccc;   8; KV   ; Kana_Voicing
8069 ccc;   9; VR   ; Virama
8070 ccc; 202; ATBL ; Attached_Below_Left
8071 ccc; 216; ATAR ; Attached_Above_Right
8072 ccc; 218; BL   ; Below_Left
8073 ccc; 220; B    ; Below
8074 ccc; 222; BR   ; Below_Right
8075 ccc; 224; L    ; Left
8076 ccc; 228; AL   ; Above_Left
8077 ccc; 230; A    ; Above
8078 ccc; 232; AR   ; Above_Right
8079 ccc; 234; DA   ; Double_Above
8080
8081 dt ; can       ; canonical
8082 dt ; enc       ; circle
8083 dt ; fin       ; final
8084 dt ; font      ; font
8085 dt ; fra       ; fraction
8086 dt ; init      ; initial
8087 dt ; iso       ; isolated
8088 dt ; med       ; medial
8089 dt ; n/a       ; none
8090 dt ; nb        ; noBreak
8091 dt ; sqr       ; square
8092 dt ; sub       ; sub
8093 dt ; sup       ; super
8094
8095 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
8096 gc ; Cc        ; Control
8097 gc ; Cn        ; Unassigned
8098 gc ; Co        ; Private_Use
8099 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
8100 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
8101 gc ; Ll        ; Lowercase_Letter
8102 gc ; Lm        ; Modifier_Letter
8103 gc ; Lo        ; Other_Letter
8104 gc ; Lu        ; Uppercase_Letter
8105 gc ; M         ; Mark                             # Mc | Me | Mn
8106 gc ; Mc        ; Spacing_Mark
8107 gc ; Mn        ; Nonspacing_Mark
8108 gc ; N         ; Number                           # Nd | Nl | No
8109 gc ; Nd        ; Decimal_Number
8110 gc ; No        ; Other_Number
8111 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
8112 gc ; Pd        ; Dash_Punctuation
8113 gc ; Pe        ; Close_Punctuation
8114 gc ; Po        ; Other_Punctuation
8115 gc ; Ps        ; Open_Punctuation
8116 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
8117 gc ; Sc        ; Currency_Symbol
8118 gc ; Sm        ; Math_Symbol
8119 gc ; So        ; Other_Symbol
8120 gc ; Z         ; Separator                        # Zl | Zp | Zs
8121 gc ; Zl        ; Line_Separator
8122 gc ; Zp        ; Paragraph_Separator
8123 gc ; Zs        ; Space_Separator
8124
8125 nt ; de        ; Decimal
8126 nt ; di        ; Digit
8127 nt ; n/a       ; None
8128 nt ; nu        ; Numeric
8129 END
8130
8131     if (-e 'ArabicShaping.txt') {
8132         push @return, split /\n/, <<'END';
8133 jg ; n/a       ; AIN
8134 jg ; n/a       ; ALEF
8135 jg ; n/a       ; DAL
8136 jg ; n/a       ; GAF
8137 jg ; n/a       ; LAM
8138 jg ; n/a       ; MEEM
8139 jg ; n/a       ; NO_JOINING_GROUP
8140 jg ; n/a       ; NOON
8141 jg ; n/a       ; QAF
8142 jg ; n/a       ; SAD
8143 jg ; n/a       ; SEEN
8144 jg ; n/a       ; TAH
8145 jg ; n/a       ; WAW
8146
8147 jt ; C         ; Join_Causing
8148 jt ; D         ; Dual_Joining
8149 jt ; L         ; Left_Joining
8150 jt ; R         ; Right_Joining
8151 jt ; U         ; Non_Joining
8152 jt ; T         ; Transparent
8153 END
8154         if ($v_version ge v3.0.0) {
8155             push @return, split /\n/, <<'END';
8156 jg ; n/a       ; ALAPH
8157 jg ; n/a       ; BEH
8158 jg ; n/a       ; BETH
8159 jg ; n/a       ; DALATH_RISH
8160 jg ; n/a       ; E
8161 jg ; n/a       ; FEH
8162 jg ; n/a       ; FINAL_SEMKATH
8163 jg ; n/a       ; GAMAL
8164 jg ; n/a       ; HAH
8165 jg ; n/a       ; HAMZA_ON_HEH_GOAL
8166 jg ; n/a       ; HE
8167 jg ; n/a       ; HEH
8168 jg ; n/a       ; HEH_GOAL
8169 jg ; n/a       ; HETH
8170 jg ; n/a       ; KAF
8171 jg ; n/a       ; KAPH
8172 jg ; n/a       ; KNOTTED_HEH
8173 jg ; n/a       ; LAMADH
8174 jg ; n/a       ; MIM
8175 jg ; n/a       ; NUN
8176 jg ; n/a       ; PE
8177 jg ; n/a       ; QAPH
8178 jg ; n/a       ; REH
8179 jg ; n/a       ; REVERSED_PE
8180 jg ; n/a       ; SADHE
8181 jg ; n/a       ; SEMKATH
8182 jg ; n/a       ; SHIN
8183 jg ; n/a       ; SWASH_KAF
8184 jg ; n/a       ; TAW
8185 jg ; n/a       ; TEH_MARBUTA
8186 jg ; n/a       ; TETH
8187 jg ; n/a       ; YEH
8188 jg ; n/a       ; YEH_BARREE
8189 jg ; n/a       ; YEH_WITH_TAIL
8190 jg ; n/a       ; YUDH
8191 jg ; n/a       ; YUDH_HE
8192 jg ; n/a       ; ZAIN
8193 END
8194         }
8195     }
8196
8197
8198     if (-e 'EastAsianWidth.txt') {
8199         push @return, split /\n/, <<'END';
8200 ea ; A         ; Ambiguous
8201 ea ; F         ; Fullwidth
8202 ea ; H         ; Halfwidth
8203 ea ; N         ; Neutral
8204 ea ; Na        ; Narrow
8205 ea ; W         ; Wide
8206 END
8207     }
8208
8209     if (-e 'LineBreak.txt') {
8210         push @return, split /\n/, <<'END';
8211 lb ; AI        ; Ambiguous
8212 lb ; AL        ; Alphabetic
8213 lb ; B2        ; Break_Both
8214 lb ; BA        ; Break_After
8215 lb ; BB        ; Break_Before
8216 lb ; BK        ; Mandatory_Break
8217 lb ; CB        ; Contingent_Break
8218 lb ; CL        ; Close_Punctuation
8219 lb ; CM        ; Combining_Mark
8220 lb ; CR        ; Carriage_Return
8221 lb ; EX        ; Exclamation
8222 lb ; GL        ; Glue
8223 lb ; HY        ; Hyphen
8224 lb ; ID        ; Ideographic
8225 lb ; IN        ; Inseperable
8226 lb ; IS        ; Infix_Numeric
8227 lb ; LF        ; Line_Feed
8228 lb ; NS        ; Nonstarter
8229 lb ; NU        ; Numeric
8230 lb ; OP        ; Open_Punctuation
8231 lb ; PO        ; Postfix_Numeric
8232 lb ; PR        ; Prefix_Numeric
8233 lb ; QU        ; Quotation
8234 lb ; SA        ; Complex_Context
8235 lb ; SG        ; Surrogate
8236 lb ; SP        ; Space
8237 lb ; SY        ; Break_Symbols
8238 lb ; XX        ; Unknown
8239 lb ; ZW        ; ZWSpace
8240 END
8241     }
8242
8243     if (-e 'DNormalizationProps.txt') {
8244         push @return, split /\n/, <<'END';
8245 qc ; M         ; Maybe
8246 qc ; N         ; No
8247 qc ; Y         ; Yes
8248 END
8249     }
8250
8251     if (-e 'Scripts.txt') {
8252         push @return, split /\n/, <<'END';
8253 sc ; Arab      ; Arabic
8254 sc ; Armn      ; Armenian
8255 sc ; Beng      ; Bengali
8256 sc ; Bopo      ; Bopomofo
8257 sc ; Cans      ; Canadian_Aboriginal
8258 sc ; Cher      ; Cherokee
8259 sc ; Cyrl      ; Cyrillic
8260 sc ; Deva      ; Devanagari
8261 sc ; Dsrt      ; Deseret
8262 sc ; Ethi      ; Ethiopic
8263 sc ; Geor      ; Georgian
8264 sc ; Goth      ; Gothic
8265 sc ; Grek      ; Greek
8266 sc ; Gujr      ; Gujarati
8267 sc ; Guru      ; Gurmukhi
8268 sc ; Hang      ; Hangul
8269 sc ; Hani      ; Han
8270 sc ; Hebr      ; Hebrew
8271 sc ; Hira      ; Hiragana
8272 sc ; Ital      ; Old_Italic
8273 sc ; Kana      ; Katakana
8274 sc ; Khmr      ; Khmer
8275 sc ; Knda      ; Kannada
8276 sc ; Laoo      ; Lao
8277 sc ; Latn      ; Latin
8278 sc ; Mlym      ; Malayalam
8279 sc ; Mong      ; Mongolian
8280 sc ; Mymr      ; Myanmar
8281 sc ; Ogam      ; Ogham
8282 sc ; Orya      ; Oriya
8283 sc ; Qaai      ; Inherited
8284 sc ; Runr      ; Runic
8285 sc ; Sinh      ; Sinhala
8286 sc ; Syrc      ; Syriac
8287 sc ; Taml      ; Tamil
8288 sc ; Telu      ; Telugu
8289 sc ; Thaa      ; Thaana
8290 sc ; Thai      ; Thai
8291 sc ; Tibt      ; Tibetan
8292 sc ; Yiii      ; Yi
8293 sc ; Zyyy      ; Common
8294 END
8295     }
8296
8297     if ($v_version ge v2.0.0) {
8298         push @return, split /\n/, <<'END';
8299 dt ; com       ; compat
8300 dt ; nar       ; narrow
8301 dt ; sml       ; small
8302 dt ; vert      ; vertical
8303 dt ; wide      ; wide
8304
8305 gc ; Cf        ; Format
8306 gc ; Cs        ; Surrogate
8307 gc ; Lt        ; Titlecase_Letter
8308 gc ; Me        ; Enclosing_Mark
8309 gc ; Nl        ; Letter_Number
8310 gc ; Pc        ; Connector_Punctuation
8311 gc ; Sk        ; Modifier_Symbol
8312 END
8313     }
8314     if ($v_version ge v2.1.2) {
8315         push @return, "bc ; S         ; Segment_Separator\n";
8316     }
8317     if ($v_version ge v2.1.5) {
8318         push @return, split /\n/, <<'END';
8319 gc ; Pf        ; Final_Punctuation
8320 gc ; Pi        ; Initial_Punctuation
8321 END
8322     }
8323     if ($v_version ge v2.1.8) {
8324         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
8325     }
8326
8327     if ($v_version ge v3.0.0) {
8328         push @return, split /\n/, <<'END';
8329 bc ; AL        ; Arabic_Letter
8330 bc ; BN        ; Boundary_Neutral
8331 bc ; LRE       ; Left_To_Right_Embedding
8332 bc ; LRO       ; Left_To_Right_Override
8333 bc ; NSM       ; Nonspacing_Mark
8334 bc ; PDF       ; Pop_Directional_Format
8335 bc ; RLE       ; Right_To_Left_Embedding
8336 bc ; RLO       ; Right_To_Left_Override
8337
8338 ccc; 233; DB   ; Double_Below
8339 END
8340     }
8341
8342     if ($v_version ge v3.1.0) {
8343         push @return, "ccc; 226; R    ; Right\n";
8344     }
8345
8346     return @return;
8347 }
8348
8349 { # Closure
8350     # This is used to store the range list of all the code points usable when
8351     # the little used $compare_versions feature is enabled.
8352     my $compare_versions_range_list;
8353
8354     sub process_generic_property_file {
8355         # This processes a file containing property mappings and puts them
8356         # into internal map tables.  It should be used to handle any property
8357         # files that have mappings from a code point or range thereof to
8358         # something else.  This means almost all the UCD .txt files.
8359         # each_line_handlers() should be set to adjust the lines of these
8360         # files, if necessary, to what this routine understands:
8361         #
8362         # 0374          ; NFD_QC; N
8363         # 003C..003E    ; Math
8364         #
8365         # the fields are: "codepoint range ; property; map"
8366         #
8367         # meaning the codepoints in the range all have the value 'map' under
8368         # 'property'.
8369         # Beginning and trailing white space in each field are not signficant.
8370         # Note there is not a trailing semi-colon in the above.  A trailing
8371         # semi-colon means the map is a null-string.  An omitted map, as
8372         # opposed to a null-string, is assumed to be 'Y', based on Unicode
8373         # table syntax.  (This could have been hidden from this routine by
8374         # doing it in the $file object, but that would require parsing of the
8375         # line there, so would have to parse it twice, or change the interface
8376         # to pass this an array.  So not done.)
8377         #
8378         # The map field may begin with a sequence of commands that apply to
8379         # this range.  Each such command begins and ends with $CMD_DELIM.
8380         # These are used to indicate, for example, that the mapping for a
8381         # range has a non-default type.
8382         #
8383         # This loops through the file, calling it's next_line() method, and
8384         # then taking the map and adding it to the property's table.
8385         # Complications arise because any number of properties can be in the
8386         # file, in any order, interspersed in any way.  The first time a
8387         # property is seen, it gets information about that property and
8388         # caches it for quick retrieval later.  It also normalizes the maps
8389         # so that only one of many synonym is stored.  The Unicode input files
8390         # do use some multiple synonyms.
8391
8392         my $file = shift;
8393         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8394
8395         my %property_info;               # To keep track of what properties
8396                                          # have already had entries in the
8397                                          # current file, and info about each,
8398                                          # so don't have to recompute.
8399         my $property_name;               # property currently being worked on
8400         my $property_type;               # and its type
8401         my $previous_property_name = ""; # name from last time through loop
8402         my $property_object;             # pointer to the current property's
8403                                          # object
8404         my $property_addr;               # the address of that object
8405         my $default_map;                 # the string that code points missing
8406                                          # from the file map to
8407         my $default_table;               # For non-string properties, a
8408                                          # reference to the match table that
8409                                          # will contain the list of code
8410                                          # points that map to $default_map.
8411
8412         # Get the next real non-comment line
8413         LINE:
8414         while ($file->next_line) {
8415
8416             # Default replacement type; means that if parts of the range have
8417             # already been stored in our tables, the new map overrides them if
8418             # they differ more than cosmetically
8419             my $replace = $IF_NOT_EQUIVALENT;
8420             my $map_type;            # Default type for the map of this range
8421
8422             #local $to_trace = 1 if main::DEBUG;
8423             trace $_ if main::DEBUG && $to_trace;
8424
8425             # Split the line into components
8426             my ($range, $property_name, $map, @remainder)
8427                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
8428
8429             # If more or less on the line than we are expecting, warn and skip
8430             # the line
8431             if (@remainder) {
8432                 $file->carp_bad_line('Extra fields');
8433                 next LINE;
8434             }
8435             elsif ( ! defined $property_name) {
8436                 $file->carp_bad_line('Missing property');
8437                 next LINE;
8438             }
8439
8440             # Examine the range.
8441             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
8442             {
8443                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
8444                 next LINE;
8445             }
8446             my $low = hex $1;
8447             my $high = (defined $2) ? hex $2 : $low;
8448
8449             # For the very specialized case of comparing two Unicode
8450             # versions...
8451             if (DEBUG && $compare_versions) {
8452                 if ($property_name eq 'Age') {
8453
8454                     # Only allow code points at least as old as the version
8455                     # specified.
8456                     my $age = pack "C*", split(/\./, $map);        # v string
8457                     next LINE if $age gt $compare_versions;
8458                 }
8459                 else {
8460
8461                     # Again, we throw out code points younger than those of
8462                     # the specified version.  By now, the Age property is
8463                     # populated.  We use the intersection of each input range
8464                     # with this property to find what code points in it are
8465                     # valid.   To do the intersection, we have to convert the
8466                     # Age property map to a Range_list.  We only have to do
8467                     # this once.
8468                     if (! defined $compare_versions_range_list) {
8469                         my $age = property_ref('Age');
8470                         if (! -e 'DAge.txt') {
8471                             croak "Need to have 'DAge.txt' file to do version comparison";
8472                         }
8473                         elsif ($age->count == 0) {
8474                             croak "The 'Age' table is empty, but its file exists";
8475                         }
8476                         $compare_versions_range_list
8477                                         = Range_List->new(Initialize => $age);
8478                     }
8479
8480                     # An undefined map is always 'Y'
8481                     $map = 'Y' if ! defined $map;
8482
8483                     # Calculate the intersection of the input range with the
8484                     # code points that are known in the specified version
8485                     my @ranges = ($compare_versions_range_list
8486                                   & Range->new($low, $high))->ranges;
8487
8488                     # If the intersection is empty, throw away this range
8489                     next LINE unless @ranges;
8490
8491                     # Only examine the first range this time through the loop.
8492                     my $this_range = shift @ranges;
8493
8494                     # Put any remaining ranges in the queue to be processed
8495                     # later.  Note that there is unnecessary work here, as we
8496                     # will do the intersection again for each of these ranges
8497                     # during some future iteration of the LINE loop, but this
8498                     # code is not used in production.  The later intersections
8499                     # are guaranteed to not splinter, so this will not become
8500                     # an infinite loop.
8501                     my $line = join ';', $property_name, $map;
8502                     foreach my $range (@ranges) {
8503                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
8504                                                             $range->start,
8505                                                             $range->end,
8506                                                             $line));
8507                     }
8508
8509                     # And process the first range, like any other.
8510                     $low = $this_range->start;
8511                     $high = $this_range->end;
8512                 }
8513             } # End of $compare_versions
8514
8515             # If changing to a new property, get the things constant per
8516             # property
8517             if ($previous_property_name ne $property_name) {
8518
8519                 $property_object = property_ref($property_name);
8520                 if (! defined $property_object) {
8521                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
8522                     next LINE;
8523                 }
8524                 $property_addr = main::objaddr($property_object);
8525
8526                 # Defer changing names until have a line that is acceptable
8527                 # (the 'next' statement above means is unacceptable)
8528                 $previous_property_name = $property_name;
8529
8530                 # If not the first time for this property, retrieve info about
8531                 # it from the cache
8532                 if (defined ($property_info{$property_addr}{'type'})) {
8533                     $property_type = $property_info{$property_addr}{'type'};
8534                     $default_map = $property_info{$property_addr}{'default'};
8535                     $map_type
8536                         = $property_info{$property_addr}{'pseudo_map_type'};
8537                     $default_table
8538                             = $property_info{$property_addr}{'default_table'};
8539                 }
8540                 else {
8541
8542                     # Here, is the first time for this property.  Set up the
8543                     # cache.
8544                     $property_type = $property_info{$property_addr}{'type'}
8545                                    = $property_object->type;
8546                     $map_type
8547                         = $property_info{$property_addr}{'pseudo_map_type'}
8548                         = $property_object->pseudo_map_type;
8549
8550                     # The Unicode files are set up so that if the map is not
8551                     # defined, it is a binary property
8552                     if (! defined $map && $property_type != $BINARY) {
8553                         if ($property_type != $UNKNOWN
8554                             && $property_type != $NON_STRING)
8555                         {
8556                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
8557                         }
8558                         else {
8559                             $property_object->set_type($BINARY);
8560                             $property_type
8561                                 = $property_info{$property_addr}{'type'}
8562                                 = $BINARY;
8563                         }
8564                     }
8565
8566                     # Get any @missings default for this property.  This
8567                     # should precede the first entry for the property in the
8568                     # input file, and is located in a comment that has been
8569                     # stored by the Input_file class until we access it here.
8570                     # It's possible that there is more than one such line
8571                     # waiting for us; collect them all, and parse
8572                     my @missings_list = $file->get_missings
8573                                             if $file->has_missings_defaults;
8574                     foreach my $default_ref (@missings_list) {
8575                         my $default = $default_ref->[0];
8576                         my $addr = objaddr property_ref($default_ref->[1]);
8577
8578                         # For string properties, the default is just what the
8579                         # file says, but non-string properties should already
8580                         # have set up a table for the default property value;
8581                         # use the table for these, so can resolve synonyms
8582                         # later to a single standard one.
8583                         if ($property_type == $STRING
8584                             || $property_type == $UNKNOWN)
8585                         {
8586                             $property_info{$addr}{'missings'} = $default;
8587                         }
8588                         else {
8589                             $property_info{$addr}{'missings'}
8590                                         = $property_object->table($default);
8591                         }
8592                     }
8593
8594                     # Finished storing all the @missings defaults in the input
8595                     # file so far.  Get the one for the current property.
8596                     my $missings = $property_info{$property_addr}{'missings'};
8597
8598                     # But we likely have separately stored what the default
8599                     # should be.  (This is to accommodate versions of the
8600                     # standard where the @missings lines are absent or
8601                     # incomplete.)  Hopefully the two will match.  But check
8602                     # it out.
8603                     $default_map = $property_object->default_map;
8604
8605                     # If the map is a ref, it means that the default won't be
8606                     # processed until later, so undef it, so next few lines
8607                     # will redefine it to something that nothing will match
8608                     undef $default_map if ref $default_map;
8609
8610                     # Create a $default_map if don't have one; maybe a dummy
8611                     # that won't match anything.
8612                     if (! defined $default_map) {
8613
8614                         # Use any @missings line in the file.
8615                         if (defined $missings) {
8616                             if (ref $missings) {
8617                                 $default_map = $missings->full_name;
8618                                 $default_table = $missings;
8619                             }
8620                             else {
8621                                 $default_map = $missings;
8622                             }
8623
8624                             # And store it with the property for outside use.
8625                             $property_object->set_default_map($default_map);
8626                         }
8627                         else {
8628
8629                             # Neither an @missings nor a default map.  Create
8630                             # a dummy one, so won't have to test definedness
8631                             # in the main loop.
8632                             $default_map = '_Perl This will never be in a file
8633                                             from Unicode';
8634                         }
8635                     }
8636
8637                     # Here, we have $default_map defined, possibly in terms of
8638                     # $missings, but maybe not, and possibly is a dummy one.
8639                     if (defined $missings) {
8640
8641                         # Make sure there is no conflict between the two.
8642                         # $missings has priority.
8643                         if (ref $missings) {
8644                             $default_table
8645                                 = $property_object->table($default_map);
8646                             if (! defined $default_table
8647                                 || $default_table != $missings)
8648                             {
8649                                 if (! defined $default_table) {
8650                                     $default_table = $UNDEF;
8651                                 }
8652                                 $file->carp_bad_line(<<END
8653 The \@missings line for $property_name in $file says that missings default to
8654 $missings, but we expect it to be $default_table.  $missings used.
8655 END
8656                                 );
8657                                 $default_table = $missings;
8658                                 $default_map = $missings->full_name;
8659                             }
8660                             $property_info{$property_addr}{'default_table'}
8661                                                         = $default_table;
8662                         }
8663                         elsif ($default_map ne $missings) {
8664                             $file->carp_bad_line(<<END
8665 The \@missings line for $property_name in $file says that missings default to
8666 $missings, but we expect it to be $default_map.  $missings used.
8667 END
8668                             );
8669                             $default_map = $missings;
8670                         }
8671                     }
8672
8673                     $property_info{$property_addr}{'default'}
8674                                                     = $default_map;
8675
8676                     # If haven't done so already, find the table corresponding
8677                     # to this map for non-string properties.
8678                     if (! defined $default_table
8679                         && $property_type != $STRING
8680                         && $property_type != $UNKNOWN)
8681                     {
8682                         $default_table = $property_info{$property_addr}
8683                                                         {'default_table'}
8684                                     = $property_object->table($default_map);
8685                     }
8686                 } # End of is first time for this property
8687             } # End of switching properties.
8688
8689             # Ready to process the line.
8690             # The Unicode files are set up so that if the map is not defined,
8691             # it is a binary property with value 'Y'
8692             if (! defined $map) {
8693                 $map = 'Y';
8694             }
8695             else {
8696
8697                 # If the map begins with a special command to us (enclosed in
8698                 # delimiters), extract the command(s).
8699                 if (substr($map, 0, 1) eq $CMD_DELIM) {
8700                     while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
8701                         my $command = $1;
8702                         if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
8703                             $replace = $1;
8704                         }
8705                         elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
8706                             $map_type = $1;
8707                         }
8708                         else {
8709                            $file->carp_bad_line("Unknown command line: '$1'");
8710                            next LINE;
8711                         }
8712                     }
8713                 }
8714             }
8715
8716             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
8717             {
8718
8719                 # Here, we have a map to a particular code point, and the
8720                 # default map is to a code point itself.  If the range
8721                 # includes the particular code point, change that portion of
8722                 # the range to the default.  This makes sure that in the final
8723                 # table only the non-defaults are listed.
8724                 my $decimal_map = hex $map;
8725                 if ($low <= $decimal_map && $decimal_map <= $high) {
8726
8727                     # If the range includes stuff before or after the map
8728                     # we're changing, split it and process the split-off parts
8729                     # later.
8730                     if ($low < $decimal_map) {
8731                         $file->insert_adjusted_lines(
8732                                             sprintf("%04X..%04X; %s; %s",
8733                                                     $low,
8734                                                     $decimal_map - 1,
8735                                                     $property_name,
8736                                                     $map));
8737                     }
8738                     if ($high > $decimal_map) {
8739                         $file->insert_adjusted_lines(
8740                                             sprintf("%04X..%04X; %s; %s",
8741                                                     $decimal_map + 1,
8742                                                     $high,
8743                                                     $property_name,
8744                                                     $map));
8745                     }
8746                     $low = $high = $decimal_map;
8747                     $map = $CODE_POINT;
8748                 }
8749             }
8750
8751             # If we can tell that this is a synonym for the default map, use
8752             # the default one instead.
8753             if ($property_type != $STRING
8754                 && $property_type != $UNKNOWN)
8755             {
8756                 my $table = $property_object->table($map);
8757                 if (defined $table && $table == $default_table) {
8758                     $map = $default_map;
8759                 }
8760             }
8761
8762             # And figure out the map type if not known.
8763             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
8764                 if ($map eq "") {   # Nulls are always $NULL map type
8765                     $map_type = $NULL;
8766                 } # Otherwise, non-strings, and those that don't allow
8767                   # $MULTI_CP, and those that aren't multiple code points are
8768                   # 0
8769                 elsif
8770                    (($property_type != $STRING && $property_type != $UNKNOWN)
8771                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
8772                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
8773                 {
8774                     $map_type = 0;
8775                 }
8776                 else {
8777                     $map_type = $MULTI_CP;
8778                 }
8779             }
8780
8781             $property_object->add_map($low, $high,
8782                                         $map,
8783                                         Type => $map_type,
8784                                         Replace => $replace);
8785         } # End of loop through file's lines
8786
8787         return;
8788     }
8789 }
8790
8791 # XXX Unused until revise charnames;
8792 #sub check_and_handle_compound_name {
8793 #    This looks at Name properties for parenthesized components and splits
8794 #    them off.  Thus it finds FF as an equivalent to Form Feed.
8795 #    my $code_point = shift;
8796 #    my $name = shift;
8797 #    if ($name =~ /^ ( .*? ) ( \s* ) \( ( [^)]* ) \) (.*) $/x) {
8798 #        #local $to_trace = 1 if main::DEBUG;
8799 #        trace $1, $2, $3, $4 if main::DEBUG && $to_trace;
8800 #        push @more_Names, "$code_point; $1";
8801 #        push @more_Names, "$code_point; $3";
8802 #        Carp::my_carp_bug("Expecting blank space before left parenthesis in '$_'.  Proceeding and assuming it was there;") if $2 ne " ";
8803 #        Carp::my_carp_bug("Not expecting anything after the right parenthesis in '$_'.  Proceeding and ignoring that;") if $4 ne "";
8804 #    }
8805 #    return;
8806 #}
8807
8808 { # Closure for UnicodeData.txt handling
8809
8810     # This file was the first one in the UCD; its design leads to some
8811     # awkwardness in processing.  Here is a sample line:
8812     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
8813     # The fields in order are:
8814     my $i = 0;            # The code point is in field 0, and is shifted off.
8815     my $NAME = $i++;      # character name (e.g. "LATIN CAPITAL LETTER A")
8816     my $CATEGORY = $i++;  # category (e.g. "Lu")
8817     my $CCC = $i++;       # Canonical combining class (e.g. "230")
8818     my $BIDI = $i++;      # directional class (e.g. "L")
8819     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
8820     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
8821     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
8822                                          # Dual-use in this program; see below
8823     my $NUMERIC = $i++;   # numeric value
8824     my $MIRRORED = $i++;  # ? mirrored
8825     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
8826     my $COMMENT = $i++;   # iso comment
8827     my $UPPER = $i++;     # simple uppercase mapping
8828     my $LOWER = $i++;     # simple lowercase mapping
8829     my $TITLE = $i++;     # simple titlecase mapping
8830     my $input_field_count = $i;
8831
8832     # This routine in addition outputs these extra fields:
8833     my $DECOMP_TYPE = $i++; # Decomposition type
8834     my $DECOMP_MAP = $i++;  # Must be last; another decomposition mapping
8835     my $last_field = $i - 1;
8836
8837     # All these are read into an array for each line, with the indices defined
8838     # above.  The empty fields in the example line above indicate that the
8839     # value is defaulted.  The handler called for each line of the input
8840     # changes these to their defaults.
8841
8842     # Here are the official names of the properties, in a parallel array:
8843     my @field_names;
8844     $field_names[$BIDI] = 'Bidi_Class';
8845     $field_names[$CATEGORY] = 'General_Category';
8846     $field_names[$CCC] = 'Canonical_Combining_Class';
8847     $field_names[$COMMENT] = 'ISO_Comment';
8848     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
8849     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
8850     $field_names[$LOWER] = 'Simple_Lowercase_Mapping';
8851     $field_names[$MIRRORED] = 'Bidi_Mirrored';
8852     $field_names[$NAME] = 'Name';
8853     $field_names[$NUMERIC] = 'Numeric_Value';
8854     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
8855     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
8856     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
8857     $field_names[$TITLE] = 'Simple_Titlecase_Mapping';
8858     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
8859     $field_names[$UPPER] = 'Simple_Uppercase_Mapping';
8860
8861     # Some of these need a little more explanation.  The $PERL_DECIMAL_DIGIT
8862     # field does not lead to an official Unicode property, but is used in
8863     # calculating the Numeric_Type.  Perl however, creates a file from this
8864     # field, so a Perl property is created from it.  Similarly, the Other
8865     # Digit field is used only for calculating the Numeric_Type, and so it can
8866     # be safely re-used as the place to store the value for Numeric_Type;
8867     # hence it is referred to as $NUMERIC_TYPE_OTHER_DIGIT.  The input field
8868     # named $PERL_DECOMPOSITION is a combination of both the decomposition
8869     # mapping and its type.  Perl creates a file containing exactly this
8870     # field, so it is used for that.  The two properties are separated into
8871     # two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
8872
8873     # This file is processed like most in this program.  Control is passed to
8874     # process_generic_property_file() which calls filter_UnicodeData_line()
8875     # for each input line.  This filter converts the input into line(s) that
8876     # process_generic_property_file() understands.  There is also a setup
8877     # routine called before any of the file is processed, and a handler for
8878     # EOF processing, all in this closure.
8879
8880     # A huge speed-up occurred at the cost of some added complexity when these
8881     # routines were altered to buffer the outputs into ranges.  Almost all the
8882     # lines of the input file apply to just one code point, and for most
8883     # properties, the map for the next code point up is the same as the
8884     # current one.  So instead of creating a line for each property for each
8885     # input line, filter_UnicodeData_line() remembers what the previous map
8886     # of a property was, and doesn't generate a line to pass on until it has
8887     # to, as when the map changes; and that passed-on line encompasses the
8888     # whole contiguous range of code points that have the same map for that
8889     # property.  This means a slight amount of extra setup, and having to
8890     # flush these buffers on EOF, testing if the maps have changed, plus
8891     # remembering state information in the closure.  But it means a lot less
8892     # real time in not having to change the data base for each property on
8893     # each line.
8894
8895     # Another complication is that there are already a few ranges designated
8896     # in the input.  There are two lines for each, with the same maps except
8897     # the code point and name on each line.  This was actually the hardest
8898     # thing to design around.  The code points in those ranges may actually
8899     # have real maps not given by these two lines.  These maps will either
8900     # be algorthimically determinable, or in the extracted files furnished
8901     # with the UCD.  In the event of conflicts between these extracted files,
8902     # and this one, Unicode says that this one prevails.  But it shouldn't
8903     # prevail for conflicts that occur in these ranges.  The data from the
8904     # extracted files prevails in those cases.  So, this program is structured
8905     # so that those files are processed first, storing maps.  Then the other
8906     # files are processed, generally overwriting what the extracted files
8907     # stored.  But just the range lines in this input file are processed
8908     # without overwriting.  This is accomplished by adding a special string to
8909     # the lines output to tell process_generic_property_file() to turn off the
8910     # overwriting for just this one line.
8911     # A similar mechanism is used to tell it that the map is of a non-default
8912     # type.
8913
8914     sub setup_UnicodeData { # Called before any lines of the input are read
8915         my $file = shift;
8916         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8917
8918         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
8919                                         Directory => '.',
8920                                         File => 'Decomposition',
8921                                         Format => $STRING_FORMAT,
8922                                         Internal_Only_Warning => 1,
8923                                         Perl_Extension => 1,
8924                                         Default_Map => $CODE_POINT,
8925
8926                                         # This is a specially formatted table
8927                                         # explicitly for normalize.pm, which
8928                                         # is expecting a particular format,
8929                                         # which means that mappings containing
8930                                         # multiple code points are in the main
8931                                         # body of the table
8932                                         Map_Type => $COMPUTE_NO_MULTI_CP,
8933                                         Type => $STRING,
8934                                         );
8935         $Perl_decomp->add_comment(join_lines(<<END
8936 This mapping is a combination of the Unicode 'Decomposition_Type' and
8937 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
8938 identical to the official Unicode 'Decomposition_Mapping'  property except for
8939 two things:
8940  1) It omits the algorithmically determinable Hangul syllable decompositions,
8941 which normalize.pm handles algorithmically.
8942  2) It contains the decomposition type as well.  Non-canonical decompositions
8943 begin with a word in angle brackets, like <super>, which denotes the
8944 compatible decomposition type.  If the map does not begin with the <angle
8945 brackets>, the decomposition is canonical.
8946 END
8947         ));
8948
8949         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
8950                                         Default_Map => "",
8951                                         Perl_Extension => 1,
8952                                         File => 'Digit',    # Trad. location
8953                                         Directory => $map_directory,
8954                                         Type => $STRING,
8955                                         Range_Size_1 => 1,
8956                                         );
8957         $Decimal_Digit->add_comment(join_lines(<<END
8958 This file gives the mapping of all code points which represent a single
8959 decimal digit [0-9] to their respective digits.  For example, the code point
8960 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
8961 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
8962 numerals.
8963 END
8964         ));
8965
8966         # This property is not used for generating anything else, and is
8967         # usually not output.  By making it last in the list, we can just
8968         # change the high end of the loop downwards to avoid the work of
8969         # generating a table that is just going to get thrown away.
8970         if (! property_ref('Decomposition_Mapping')->to_output_map) {
8971             $last_field--;
8972         }
8973         return;
8974     }
8975
8976     my $first_time = 1;                 # ? Is this the first line of the file
8977     my $in_range = 0;                   # ? Are we in one of the file's ranges
8978     my $previous_cp;                    # hex code point of previous line
8979     my $decimal_previous_cp = -1;       # And its decimal equivalent
8980     my @start;                          # For each field, the current starting
8981                                         # code point in hex for the range
8982                                         # being accumulated.
8983     my @fields;                         # The input fields;
8984     my @previous_fields;                # And those from the previous call
8985
8986     sub filter_UnicodeData_line {
8987         # Handle a single input line from UnicodeData.txt; see comments above
8988         # Conceptually this takes a single line from the file containing N
8989         # properties, and converts it into N lines with one property per line,
8990         # which is what the final handler expects.  But there are
8991         # complications due to the quirkiness of the input file, and to save
8992         # time, it accumulates ranges where the property values don't change
8993         # and only emits lines when necessary.  This is about an order of
8994         # magnitude fewer lines emitted.
8995
8996         my $file = shift;
8997         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8998
8999         # $_ contains the input line.
9000         # -1 in split means retain trailing null fields
9001         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9002
9003         #local $to_trace = 1 if main::DEBUG;
9004         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9005         if (@fields > $input_field_count) {
9006             $file->carp_bad_line('Extra fields');
9007             $_ = "";
9008             return;
9009         }
9010
9011         my $decimal_cp = hex $cp;
9012
9013         # We have to output all the buffered ranges when the next code point
9014         # is not exactly one after the previous one, which means there is a
9015         # gap in the ranges.
9016         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9017
9018         # The decomposition mapping field requires special handling.  It looks
9019         # like either:
9020         #
9021         # <compat> 0032 0020
9022         # 0041 0300
9023         #
9024         # The decomposition type is enclosed in <brackets>; if missing, it
9025         # means the type is canonical.  There are two decomposition mapping
9026         # tables: the one for use by Perl's normalize.pm has a special format
9027         # which is this field intact; the other, for general use is of
9028         # standard format.  In either case we have to find the decomposition
9029         # type.  Empty fields have None as their type, and map to the code
9030         # point itself
9031         if ($fields[$PERL_DECOMPOSITION] eq "") {
9032             $fields[$DECOMP_TYPE] = 'None';
9033             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9034         }
9035         else {
9036             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9037                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
9038             if (! defined $fields[$DECOMP_TYPE]) {
9039                 $fields[$DECOMP_TYPE] = 'Canonical';
9040                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9041             }
9042             else {
9043                 $fields[$DECOMP_MAP] = $map;
9044             }
9045         }
9046
9047         # The 3 numeric fields also require special handling.  The 2 digit
9048         # fields must be either empty or match the number field.  This means
9049         # that if it is empty, they must be as well, and the numeric type is
9050         # None, and the numeric value is 'Nan'.
9051         # The decimal digit field must be empty or match the other digit
9052         # field.  If the decimal digit field is non-empty, the code point is
9053         # a decimal digit, and the other two fields will have the same value.
9054         # If it is empty, but the other digit field is non-empty, the code
9055         # point is an 'other digit', and the number field will have the same
9056         # value as the other digit field.  If the other digit field is empty,
9057         # but the number field is non-empty, the code point is a generic
9058         # numeric type.
9059         if ($fields[$NUMERIC] eq "") {
9060             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9061                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9062             ) {
9063                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
9064             }
9065             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9066             $fields[$NUMERIC] = 'NaN';
9067         }
9068         else {
9069             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9070             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9071                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9072                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9073             }
9074             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9075                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9076                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9077             }
9078             else {
9079                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9080
9081                 # Rationals require extra effort.
9082                 register_fraction($fields[$NUMERIC])
9083                                                 if $fields[$NUMERIC] =~ qr{/};
9084             }
9085         }
9086
9087         # For the properties that have empty fields in the file, and which
9088         # mean something different from empty, change them to that default.
9089         # Certain fields just haven't been empty so far in any Unicode
9090         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9091         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
9092         # the defaults; which are verly unlikely to ever change.
9093         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9094         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9095
9096         # UAX44 says that if title is empty, it is the same as whatever upper
9097         # is,
9098         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9099
9100         # There are a few pairs of lines like:
9101         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9102         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9103         # that define ranges.  These should be processed after the fields are
9104         # adjusted above, as they may override some of them; but mostly what
9105         # is left is to possibly adjust the $NAME field.  The names of all the
9106         # paired lines start with a '<', but this is also true of '<control>,
9107         # which isn't one of these special ones.
9108         if ($fields[$NAME] eq '<control>') {
9109
9110             # Some code points in this file have the pseudo-name
9111             # '<control>', but the official name for such ones is the null
9112             # string.
9113             $fields[$NAME] = "";
9114
9115             # We had better not be in between range lines.
9116             if ($in_range) {
9117                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'.  Trying anyway");
9118                 $in_range = 0;
9119             }
9120         }
9121         elsif (substr($fields[$NAME], 0, 1) ne '<') {
9122
9123             # Here is a non-range line.  We had better not be in between range
9124             # lines.
9125             if ($in_range) {
9126                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'.  Trying anyway");
9127                 $in_range = 0;
9128             }
9129             # XXX until charnames catches up.
9130 #            if ($fields[$NAME] =~ s/- $cp $//x) {
9131 #
9132 #                # These are code points whose names end in their code points,
9133 #                # which means the names are algorithmically derivable from the
9134 #                # code points.  To shorten the output Name file, the algorithm
9135 #                # for deriving these is placed in the file instead of each
9136 #                # code point, so they have map type $CP_IN_NAME
9137 #                $fields[$NAME] = $CMD_DELIM
9138 #                                 . $MAP_TYPE_CMD
9139 #                                 . '='
9140 #                                 . $CP_IN_NAME
9141 #                                 . $CMD_DELIM
9142 #                                 . $fields[$NAME];
9143 #            }
9144
9145             # Some official names are really two alternate names with one in
9146             # parentheses.  What we do here is use the full official one for
9147             # the standard property (stored just above), but for the charnames
9148             # table, we add two more entries, one for each of the alternate
9149             # ones.
9150             # elsif name ne ""
9151             #check_and_handle_compound_name($cp, $fields[$NAME]);
9152             #check_and_handle_compound_name($cp, $unicode_1_name);
9153             # XXX until charnames catches up.
9154         }
9155         elsif ($fields[$NAME] =~ /^<(.+), First>$/) {
9156             $fields[$NAME] = $1;
9157
9158             # Here we are at the beginning of a range pair.
9159             if ($in_range) {
9160                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$NAME]'.  Trying anyway");
9161             }
9162             $in_range = 1;
9163
9164             # Because the properties in the range do not overwrite any already
9165             # in the db, we must flush the buffers of what's already there, so
9166             # they get handled in the normal scheme.
9167             $force_output = 1;
9168
9169         }
9170         elsif ($fields[$NAME] !~ s/^<(.+), Last>$/$1/) {
9171             $file->carp_bad_line("Unexpected name starting with '<' $fields[$NAME].  Ignoring this line.");
9172             $_ = "";
9173             return;
9174         }
9175         else { # Here, we are at the last line of a range pair.
9176
9177             if (! $in_range) {
9178                 $file->carp_bad_line("Unexpected end of range $fields[$NAME] when not in one.  Ignoring this line.");
9179                 $_ = "";
9180                 return;
9181             }
9182             $in_range = 0;
9183
9184             # Check that the input is valid: that the closing of the range is
9185             # the same as the beginning.
9186             foreach my $i (0 .. $last_field) {
9187                 next if $fields[$i] eq $previous_fields[$i];
9188                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
9189             }
9190
9191             # The processing differs depending on the type of range,
9192             # determined by its $NAME
9193             if ($fields[$NAME] =~ /^Hangul Syllable/) {
9194
9195                 # Check that the data looks right.
9196                 if ($decimal_previous_cp != $SBase) {
9197                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
9198                 }
9199                 if ($decimal_cp != $SBase + $SCount - 1) {
9200                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
9201                 }
9202
9203                 # The Hangul syllable range has a somewhat complicated name
9204                 # generation algorithm.  Each code point in it has a canonical
9205                 # decomposition also computable by an algorithm.  The
9206                 # perl decomposition map table built from these is used only
9207                 # by normalize.pm, which has the algorithm built in it, so the
9208                 # decomposition maps are not needed, and are large, so are
9209                 # omitted from it.  If the full decomposition map table is to
9210                 # be output, the decompositions are generated for it, in the
9211                 # EOF handling code for this input file.
9212
9213                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9214
9215                 # This range is stored in our internal structure with its
9216                 # own map type, different from all others.
9217                 $previous_fields[$NAME] = $CMD_DELIM
9218                                           . $MAP_TYPE_CMD
9219                                           . '='
9220                                           . $HANGUL_SYLLABLE
9221                                           . $CMD_DELIM
9222                                           . $fields[$NAME];
9223             }
9224             elsif ($fields[$NAME] =~ /^CJK/) {
9225
9226                 # The name for these contains the code point itself, and all
9227                 # are defined to have the same base name, regardless of what
9228                 # is in the file.  They are stored in our internal structure
9229                 # with a map type of $CP_IN_NAME
9230                 $previous_fields[$NAME] = $CMD_DELIM
9231                                            . $MAP_TYPE_CMD
9232                                            . '='
9233                                            . $CP_IN_NAME
9234                                            . $CMD_DELIM
9235                                            . 'CJK UNIFIED IDEOGRAPH';
9236
9237             }
9238             elsif ($fields[$CATEGORY] eq 'Co'
9239                      || $fields[$CATEGORY] eq 'Cs')
9240             {
9241                 # The names of all the code points in these ranges are set to
9242                 # null, as there are no names for the private use and
9243                 # surrogate code points.
9244
9245                 $previous_fields[$NAME] = "";
9246             }
9247             else {
9248                 $file->carp_bad_line("Unexpected code point range $fields[$NAME] because category is $fields[$CATEGORY].  Attempting to process it.");
9249             }
9250
9251             # The first line of the range caused everything else to be output,
9252             # and then its values were stored as the beginning values for the
9253             # next set of ranges, which this one ends.  Now, for each value,
9254             # add a command to tell the handler that these values should not
9255             # replace any existing ones in our database.
9256             foreach my $i (0 .. $last_field) {
9257                 $previous_fields[$i] = $CMD_DELIM
9258                                         . $REPLACE_CMD
9259                                         . '='
9260                                         . $NO
9261                                         . $CMD_DELIM
9262                                         . $previous_fields[$i];
9263             }
9264
9265             # And change things so it looks like the entire range has been
9266             # gone through with this being the final part of it.  Adding the
9267             # command above to each field will cause this range to be flushed
9268             # during the next iteration, as it guaranteed that the stored
9269             # field won't match whatever value the next one has.
9270             $previous_cp = $cp;
9271             $decimal_previous_cp = $decimal_cp;
9272
9273             # We are now set up for the next iteration; so skip the remaining
9274             # code in this subroutine that does the same thing, but doesn't
9275             # know about these ranges.
9276             $_ = "";
9277             return;
9278         }
9279
9280         # On the very first line, we fake it so the code below thinks there is
9281         # nothing to output, and initialize so that when it does get output it
9282         # uses the first line's values for the lowest part of the range.
9283         # (One could avoid this by using peek(), but then one would need to
9284         # know the adjustments done above and do the same ones in the setup
9285         # routine; not worth it)
9286         if ($first_time) {
9287             $first_time = 0;
9288             @previous_fields = @fields;
9289             @start = ($cp) x scalar @fields;
9290             $decimal_previous_cp = $decimal_cp - 1;
9291         }
9292
9293         # For each field, output the stored up ranges that this code point
9294         # doesn't fit in.  Earlier we figured out if all ranges should be
9295         # terminated because of changing the replace or map type styles, or if
9296         # there is a gap between this new code point and the previous one, and
9297         # that is stored in $force_output.  But even if those aren't true, we
9298         # need to output the range if this new code point's value for the
9299         # given property doesn't match the stored range's.
9300         #local $to_trace = 1 if main::DEBUG;
9301         foreach my $i (0 .. $last_field) {
9302             my $field = $fields[$i];
9303             if ($force_output || $field ne $previous_fields[$i]) {
9304
9305                 # Flush the buffer of stored values.
9306                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9307
9308                 # Start a new range with this code point and its value
9309                 $start[$i] = $cp;
9310                 $previous_fields[$i] = $field;
9311             }
9312         }
9313
9314         # Set the values for the next time.
9315         $previous_cp = $cp;
9316         $decimal_previous_cp = $decimal_cp;
9317
9318         # The input line has generated whatever adjusted lines are needed, and
9319         # should not be looked at further.
9320         $_ = "";
9321         return;
9322     }
9323
9324     sub EOF_UnicodeData {
9325         # Called upon EOF to flush the buffers, and create the Hangul
9326         # decomposition mappings if needed.
9327
9328         my $file = shift;
9329         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9330
9331         # Flush the buffers.
9332         foreach my $i (1 .. $last_field) {
9333             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9334         }
9335
9336         if (-e 'Jamo.txt') {
9337
9338             # The algorithm is published by Unicode, based on values in
9339             # Jamo.txt, (which should have been processed before this
9340             # subroutine), and the results left in %Jamo
9341             unless (%Jamo) {
9342                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
9343                 return;
9344             }
9345
9346             # If the full decomposition map table is being output, insert
9347             # into it the Hangul syllable mappings.  This is to avoid having
9348             # to publish a subroutine in it to compute them.  (which would
9349             # essentially be this code.)  This uses the algorithm published by
9350             # Unicode.
9351             if (property_ref('Decomposition_Mapping')->to_output_map) {
9352                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
9353                     use integer;
9354                     my $SIndex = $S - $SBase;
9355                     my $L = $LBase + $SIndex / $NCount;
9356                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
9357                     my $T = $TBase + $SIndex % $TCount;
9358
9359                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
9360                     my $decomposition = sprintf("%04X %04X", $L, $V);
9361                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
9362                     $file->insert_adjusted_lines(
9363                                 sprintf("%04X; Decomposition_Mapping; %s",
9364                                         $S,
9365                                         $decomposition));
9366                 }
9367             }
9368         }
9369
9370         return;
9371     }
9372
9373     sub filter_v1_ucd {
9374         # Fix UCD lines in version 1.  This is probably overkill, but this
9375         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
9376         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
9377         #       removed.  This program retains them
9378         # 2)    didn't include ranges, which it should have, and which are now
9379         #       added in @corrected_lines below.  It was hand populated by
9380         #       taking the data from Version 2, verified by analyzing
9381         #       DAge.txt.
9382         # 3)    There is a syntax error in the entry for U+09F8 which could
9383         #       cause problems for utf8_heavy, and so is changed.  It's
9384         #       numeric value was simply a minus sign, without any number.
9385         #       (Eventually Unicode changed the code point to non-numeric.)
9386         # 4)    The decomposition types often don't match later versions
9387         #       exactly, and the whole syntax of that field is different; so
9388         #       the syntax is changed as well as the types to their later
9389         #       terminology.  Otherwise normalize.pm would be very unhappy
9390         # 5)    Many ccc classes are different.  These are left intact.
9391         # 6)    U+FF10 - U+FF19 are missing their numeric values in all three
9392         #       fields.  These are unchanged because it doesn't really cause
9393         #       problems for Perl.
9394         # 7)    A number of code points, such as controls, don't have their
9395         #       Unicode Version 1 Names in this file.  These are unchanged.
9396
9397         my @corrected_lines = split /\n/, <<'END';
9398 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9399 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9400 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
9401 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
9402 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
9403 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9404 END
9405
9406         my $file = shift;
9407         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9408
9409         #local $to_trace = 1 if main::DEBUG;
9410         trace $_ if main::DEBUG && $to_trace;
9411
9412         # -1 => retain trailing null fields
9413         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9414
9415         # At the first place that is wrong in the input, insert all the
9416         # corrections, replacing the wrong line.
9417         if ($code_point eq '4E00') {
9418             my @copy = @corrected_lines;
9419             $_ = shift @copy;
9420             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9421
9422             $file->insert_lines(@copy);
9423         }
9424
9425
9426         if ($fields[$NUMERIC] eq '-') {
9427             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
9428         }
9429
9430         if  ($fields[$PERL_DECOMPOSITION] ne "") {
9431
9432             # Several entries have this change to superscript 2 or 3 in the
9433             # middle.  Convert these to the modern version, which is to use
9434             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
9435             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
9436             # 'HHHH HHHH 00B3 HHHH'.
9437             # It turns out that all of these that don't have another
9438             # decomposition defined at the beginning of the line have the
9439             # <square> decomposition in later releases.
9440             if ($code_point ne '00B2' && $code_point ne '00B3') {
9441                 if  ($fields[$PERL_DECOMPOSITION]
9442                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
9443                 {
9444                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
9445                         $fields[$PERL_DECOMPOSITION] = '<square> '
9446                         . $fields[$PERL_DECOMPOSITION];
9447                     }
9448                 }
9449             }
9450
9451             # If is like '<+circled> 0052 <-circled>', convert to
9452             # '<circled> 0052'
9453             $fields[$PERL_DECOMPOSITION] =~
9454                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
9455
9456             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
9457             $fields[$PERL_DECOMPOSITION] =~
9458                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
9459             or $fields[$PERL_DECOMPOSITION] =~
9460                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
9461             or $fields[$PERL_DECOMPOSITION] =~
9462                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
9463             or $fields[$PERL_DECOMPOSITION] =~
9464                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
9465
9466             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
9467             $fields[$PERL_DECOMPOSITION] =~
9468                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
9469
9470             # Change names to modern form.
9471             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
9472             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
9473             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
9474             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
9475
9476             # One entry has weird braces
9477             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
9478         }
9479
9480         $_ = join ';', $code_point, @fields;
9481         trace $_ if main::DEBUG && $to_trace;
9482         return;
9483     }
9484
9485     sub filter_v2_1_5_ucd {
9486         # A dozen entries in this 2.1.5 file had the mirrored and numeric
9487         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
9488         # column appears to be N, swap it back.
9489
9490         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9491         if ($fields[$NUMERIC] eq 'N') {
9492             $fields[$NUMERIC] = $fields[$MIRRORED];
9493             $fields[$MIRRORED] = 'N';
9494             $_ = join ';', $code_point, @fields;
9495         }
9496         return;
9497     }
9498 } # End closure for UnicodeData
9499
9500 sub process_NamedSequences {
9501     # NamedSequences.txt entries are just added to an array.  Because these
9502     # don't look like the other tables, they have their own handler.
9503     # An example:
9504     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
9505     #
9506     # This just adds the sequence to an array for later handling
9507
9508     return; # XXX Until charnames catches up
9509     my $file = shift;
9510     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9511
9512     while ($file->next_line) {
9513         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
9514         if (@remainder) {
9515             $file->carp_bad_line(
9516                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
9517             next;
9518         }
9519         push @named_sequences, "$sequence\t\t$name";
9520     }
9521     return;
9522 }
9523
9524 { # Closure
9525
9526     my $first_range;
9527
9528     sub  filter_early_ea_lb {
9529         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
9530         # third field be the name of the code point, which can be ignored in
9531         # most cases.  But it can be meaningful if it marks a range:
9532         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
9533         # 3400;W;<CJK Ideograph Extension A, First>
9534         #
9535         # We need to see the First in the example above to know it's a range.
9536         # They did not use the later range syntaxes.  This routine changes it
9537         # to use the modern syntax.
9538         # $1 is the Input_file object.
9539
9540         my @fields = split /\s*;\s*/;
9541         if ($fields[2] =~ /^<.*, First>/) {
9542             $first_range = $fields[0];
9543             $_ = "";
9544         }
9545         elsif ($fields[2] =~ /^<.*, Last>/) {
9546             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
9547         }
9548         else {
9549             undef $first_range;
9550             $_ = "$fields[0]; $fields[1]";
9551         }
9552
9553         return;
9554     }
9555 }
9556
9557 sub filter_old_style_arabic_shaping {
9558     # Early versions used a different term for the later one.
9559
9560     my @fields = split /\s*;\s*/;
9561     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
9562     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
9563     $_ = join ';', @fields;
9564     return;
9565 }
9566
9567 sub filter_arabic_shaping_line {
9568     # ArabicShaping.txt has entries that look like:
9569     # 062A; TEH; D; BEH
9570     # The field containing 'TEH' is not used.  The next field is Joining_Type
9571     # and the last is Joining_Group
9572     # This generates two lines to pass on, one for each property on the input
9573     # line.
9574
9575     my $file = shift;
9576     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9577
9578     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9579
9580     if (@fields > 4) {
9581         $file->carp_bad_line('Extra fields');
9582         $_ = "";
9583         return;
9584     }
9585
9586     $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
9587     $_ = "$fields[0]; Joining_Type; $fields[2]";
9588
9589     return;
9590 }
9591
9592 sub setup_special_casing {
9593     # SpecialCasing.txt contains the non-simple case change mappings.  The
9594     # simple ones are in UnicodeData.txt, and should already have been read
9595     # in.
9596     # This routine initializes the full mappings to the simple, then as each
9597     # line is processed, it overrides the simple ones.
9598
9599     my $file= shift;
9600     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9601
9602     # For each of the case change mappings...
9603     foreach my $case ('lc', 'tc', 'uc') {
9604
9605         # The simple version's name in each mapping merely has an 's' in front
9606         # of the full one's
9607         my $simple = property_ref('s' . $case);
9608         unless (defined $simple && ! $simple->is_empty) {
9609             Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
9610         }
9611
9612         # Initialize the full case mappings with the simple ones.
9613         property_ref($case)->initialize($simple);
9614     }
9615
9616     return;
9617 }
9618
9619 sub filter_special_casing_line {
9620     # Change the format of $_ from SpecialCasing.txt into something that the
9621     # generic handler understands.  Each input line contains three case
9622     # mappings.  This will generate three lines to pass to the generic handler
9623     # for each of those.
9624
9625     # The input syntax (after stripping comments and trailing white space is
9626     # like one of the following (with the final two being entries that we
9627     # ignore):
9628     # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
9629     # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
9630     # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
9631     # Note the trailing semi-colon, unlike many of the input files.  That
9632     # means that there will be an extra null field generated by the split
9633
9634     my $file = shift;
9635     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9636
9637     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9638
9639     # field #4 is when this mapping is conditional.  If any of these get
9640     # implemented, it would be by hard-coding in the casing functions in the
9641     # Perl core, not through tables.  But if there is a new condition we don't
9642     # know about, output a warning.  We know about all the conditions through
9643     # 5.2
9644     if ($fields[4] ne "") {
9645         my @conditions = split ' ', $fields[4];
9646         if ($conditions[0] ne 'tr'  # We know that these languages have
9647                                     # conditions, and some are multiple
9648             && $conditions[0] ne 'az'
9649             && $conditions[0] ne 'lt'
9650
9651             # And, we know about a single condition Final_Sigma, but
9652             # nothing else.
9653             && ($v_version gt v5.2.0
9654                 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
9655         {
9656             $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
9657         }
9658         elsif ($conditions[0] ne 'Final_Sigma') {
9659
9660                 # Don't print out a message for Final_Sigma, because we have
9661                 # hard-coded handling for it.  (But the standard could change
9662                 # what the rule should be, but it wouldn't show up here
9663                 # anyway.
9664
9665                 print "# SKIPPING Special Casing: $_\n"
9666                                                     if $verbosity >= $VERBOSE;
9667         }
9668         $_ = "";
9669         return;
9670     }
9671     elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
9672         $file->carp_bad_line('Extra fields');
9673         $_ = "";
9674         return;
9675     }
9676
9677     $_ = "$fields[0]; lc; $fields[1]";
9678     $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
9679     $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
9680
9681     return;
9682 }
9683
9684 sub filter_old_style_case_folding {
9685     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
9686     # and later style.  Different letters were used in the earlier.
9687
9688     my $file = shift;
9689     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9690
9691     my @fields = split /\s*;\s*/;
9692     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
9693         $fields[1] = 'I';
9694     }
9695     elsif ($fields[1] eq 'L') {
9696         $fields[1] = 'C';             # L => C always
9697     }
9698     elsif ($fields[1] eq 'E') {
9699         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
9700             $fields[1] = 'F'
9701         }
9702         else {
9703             $fields[1] = 'C'
9704         }
9705     }
9706     else {
9707         $file->carp_bad_line("Expecting L or E in second field");
9708         $_ = "";
9709         return;
9710     }
9711     $_ = join("; ", @fields) . ';';
9712     return;
9713 }
9714
9715 { # Closure for case folding
9716
9717     # Create the map for simple only if are going to output it, for otherwise
9718     # it takes no part in anything we do.
9719     my $to_output_simple;
9720
9721     # These are experimental, perhaps will need these to pass to regcomp.c to
9722     # handle the cases where for example the Kelvin sign character folds to k,
9723     # and in regcomp, we need to know which of the characters can have a
9724     # non-latin1 char fold to it, so it doesn't do the optimizations it might
9725     # otherwise.
9726     my @latin1_singly_folded;
9727     my @latin1_folded;
9728
9729     sub setup_case_folding($) {
9730         # Read in the case foldings in CaseFolding.txt.  This handles both
9731         # simple and full case folding.
9732
9733         $to_output_simple
9734                         = property_ref('Simple_Case_Folding')->to_output_map;
9735
9736         return;
9737     }
9738
9739     sub filter_case_folding_line {
9740         # Called for each line in CaseFolding.txt
9741         # Input lines look like:
9742         # 0041; C; 0061; # LATIN CAPITAL LETTER A
9743         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
9744         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
9745         #
9746         # 'C' means that folding is the same for both simple and full
9747         # 'F' that it is only for full folding
9748         # 'S' that it is only for simple folding
9749         # 'T' is locale-dependent, and ignored
9750         # 'I' is a type of 'F' used in some early releases.
9751         # Note the trailing semi-colon, unlike many of the input files.  That
9752         # means that there will be an extra null field generated by the split
9753         # below, which we ignore and hence is not an error.
9754
9755         my $file = shift;
9756         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9757
9758         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
9759         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
9760             $file->carp_bad_line('Extra fields');
9761             $_ = "";
9762             return;
9763         }
9764
9765         if ($type eq 'T') {   # Skip Turkic case folding, is locale dependent
9766             $_ = "";
9767             return;
9768         }
9769
9770         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
9771         # I are all full foldings
9772         if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
9773             $_ = "$range; Case_Folding; $map";
9774         }
9775         else {
9776             $_ = "";
9777             if ($type ne 'S') {
9778                $file->carp_bad_line('Expecting C F I S or T in second field');
9779                return;
9780             }
9781         }
9782
9783         # C and S are simple foldings, but simple case folding is not needed
9784         # unless we explicitly want its map table output.
9785         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
9786             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
9787         }
9788
9789         # Experimental, see comment above
9790         if ($type ne 'S' && hex($range) >= 256) {   # assumes range is 1 point
9791             my @folded = split ' ', $map;
9792             if (hex $folded[0] < 256 && @folded == 1) {
9793                 push @latin1_singly_folded, hex $folded[0];
9794             }
9795             foreach my $folded (@folded) {
9796                 push @latin1_folded, hex $folded if hex $folded < 256;
9797             }
9798         }
9799
9800         return;
9801     }
9802
9803     sub post_fold {
9804         # Experimental, see comment above
9805         return;
9806
9807         #local $to_trace = 1 if main::DEBUG;
9808         @latin1_singly_folded = uniques(@latin1_singly_folded);
9809         @latin1_folded = uniques(@latin1_folded);
9810         trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
9811         trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
9812         return;
9813     }
9814 } # End case fold closure
9815
9816 sub filter_jamo_line {
9817     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
9818     # from this file that is used in generating the Name property for Jamo
9819     # code points.  But, it also is used to convert early versions' syntax
9820     # into the modern form.  Here are two examples:
9821     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
9822     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
9823     #
9824     # The input is $_, the output is $_ filtered.
9825
9826     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
9827
9828     # Let the caller handle unexpected input.  In earlier versions, there was
9829     # a third field which is supposed to be a comment, but did not have a '#'
9830     # before it.
9831     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
9832
9833     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
9834                                 # beginning.
9835
9836     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
9837     $fields[1] = 'R' if $fields[0] eq '1105';
9838
9839     # Add to structure so can generate Names from it.
9840     my $cp = hex $fields[0];
9841     my $short_name = $fields[1];
9842     $Jamo{$cp} = $short_name;
9843     if ($cp <= $LBase + $LCount) {
9844         $Jamo_L{$short_name} = $cp - $LBase;
9845     }
9846     elsif ($cp <= $VBase + $VCount) {
9847         $Jamo_V{$short_name} = $cp - $VBase;
9848     }
9849     elsif ($cp <= $TBase + $TCount) {
9850         $Jamo_T{$short_name} = $cp - $TBase;
9851     }
9852     else {
9853         Carp::my_carp_bug("Unexpected Jamo code point in $_");
9854     }
9855
9856
9857     # Reassemble using just the first two fields to look like a typical
9858     # property file line
9859     $_ = "$fields[0]; $fields[1]";
9860
9861     return;
9862 }
9863
9864 sub register_fraction($) {
9865     # This registers the input rational number so that it can be passed on to
9866     # utf8_heavy.pl, both in rational and floating forms.
9867
9868     my $rational = shift;
9869
9870     my $float = eval $rational;
9871     $nv_floating_to_rational{$float} = $rational;
9872     return;
9873 }
9874
9875 sub filter_numeric_value_line {
9876     # DNumValues contains lines of a different syntax than the typical
9877     # property file:
9878     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
9879     #
9880     # This routine transforms $_ containing the anomalous syntax to the
9881     # typical, by filtering out the extra columns, and convert early version
9882     # decimal numbers to strings that look like rational numbers.
9883
9884     my $file = shift;
9885     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9886
9887     # Starting in 5.1, there is a rational field.  Just use that, omitting the
9888     # extra columns.  Otherwise convert the decimal number in the second field
9889     # to a rational, and omit extraneous columns.
9890     my @fields = split /\s*;\s*/, $_, -1;
9891     my $rational;
9892
9893     if ($v_version ge v5.1.0) {
9894         if (@fields != 4) {
9895             $file->carp_bad_line('Not 4 semi-colon separated fields');
9896             $_ = "";
9897             return;
9898         }
9899         $rational = $fields[3];
9900         $_ = join '; ', @fields[ 0, 3 ];
9901     }
9902     else {
9903
9904         # Here, is an older Unicode file, which has decimal numbers instead of
9905         # rationals in it.  Use the fraction to calculate the denominator and
9906         # convert to rational.
9907
9908         if (@fields != 2 && @fields != 3) {
9909             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
9910             $_ = "";
9911             return;
9912         }
9913
9914         my $codepoints = $fields[0];
9915         my $decimal = $fields[1];
9916         if ($decimal =~ s/\.0+$//) {
9917
9918             # Anything ending with a decimal followed by nothing but 0's is an
9919             # integer
9920             $_ = "$codepoints; $decimal";
9921             $rational = $decimal;
9922         }
9923         else {
9924
9925             my $denominator;
9926             if ($decimal =~ /\.50*$/) {
9927                 $denominator = 2;
9928             }
9929
9930             # Here have the hardcoded repeating decimals in the fraction, and
9931             # the denominator they imply.  There were only a few denominators
9932             # in the older Unicode versions of this file which this code
9933             # handles, so it is easy to convert them.
9934
9935             # The 4 is because of a round-off error in the Unicode 3.2 files
9936             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
9937                 $denominator = 3;
9938             }
9939             elsif ($decimal =~ /\.[27]50*$/) {
9940                 $denominator = 4;
9941             }
9942             elsif ($decimal =~ /\.[2468]0*$/) {
9943                 $denominator = 5;
9944             }
9945             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
9946                 $denominator = 6;
9947             }
9948             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
9949                 $denominator = 8;
9950             }
9951             if ($denominator) {
9952                 my $sign = ($decimal < 0) ? "-" : "";
9953                 my $numerator = int((abs($decimal) * $denominator) + .5);
9954                 $rational = "$sign$numerator/$denominator";
9955                 $_ = "$codepoints; $rational";
9956             }
9957             else {
9958                 $file->carp_bad_line("Can't cope with number '$decimal'.");
9959                 $_ = "";
9960                 return;
9961             }
9962         }
9963     }
9964
9965     register_fraction($rational) if $rational =~ qr{/};
9966     return;
9967 }
9968
9969 { # Closure
9970     my %unihan_properties;
9971     my $iicore;
9972
9973
9974     sub setup_unihan {
9975         # Do any special setup for Unihan properties.
9976
9977         # This property gives the wrong computed type, so override.
9978         my $usource = property_ref('kIRG_USource');
9979         $usource->set_type($STRING) if defined $usource;
9980
9981         # This property is to be considered binary, so change all the values
9982         # to Y.
9983         $iicore = property_ref('kIICore');
9984         if (defined $iicore) {
9985             $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
9986
9987             # We have to change the default map, because the @missing line is
9988             # misleading, given that we are treating it as binary.
9989             $iicore->set_default_map('N');
9990             $iicore->set_type($BINARY);
9991         }
9992
9993         return;
9994     }
9995
9996     sub filter_unihan_line {
9997         # Change unihan db lines to look like the others in the db.  Here is
9998         # an input sample:
9999         #   U+341C        kCangjie        IEKN
10000
10001         # Tabs are used instead of semi-colons to separate fields; therefore
10002         # they may have semi-colons embedded in them.  Change these to periods
10003         # so won't screw up the rest of the code.
10004         s/;/./g;
10005
10006         # Remove lines that don't look like ones we accept.
10007         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10008             $_ = "";
10009             return;
10010         }
10011
10012         # Extract the property, and save a reference to its object.
10013         my $property = $1;
10014         if (! exists $unihan_properties{$property}) {
10015             $unihan_properties{$property} = property_ref($property);
10016         }
10017
10018         # Don't do anything unless the property is one we're handling, which
10019         # we determine by seeing if there is an object defined for it or not
10020         if (! defined $unihan_properties{$property}) {
10021             $_ = "";
10022             return;
10023         }
10024
10025         # The iicore property is supposed to be a boolean, so convert to our
10026         # standard boolean form.
10027         if (defined $iicore && $unihan_properties{$property} == $iicore) {
10028             $_ =~ s/$property.*/$property\tY/
10029         }
10030
10031         # Convert the tab separators to our standard semi-colons, and convert
10032         # the U+HHHH notation to the rest of the standard's HHHH
10033         s/\t/;/g;
10034         s/\b U \+ (?= $code_point_re )//xg;
10035
10036         #local $to_trace = 1 if main::DEBUG;
10037         trace $_ if main::DEBUG && $to_trace;
10038
10039         return;
10040     }
10041 }
10042
10043 sub filter_blocks_lines {
10044     # In the Blocks.txt file, the names of the blocks don't quite match the
10045     # names given in PropertyValueAliases.txt, so this changes them so they
10046     # do match:  Blanks and hyphens are changed into underscores.  Also makes
10047     # early release versions look like later ones
10048     #
10049     # $_ is transformed to the correct value.
10050
10051     my $file = shift;
10052         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10053
10054     if ($v_version lt v3.2.0) {
10055         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10056             $_ = "";
10057             return;
10058         }
10059
10060         # Old versions used a different syntax to mark the range.
10061         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10062     }
10063
10064     my @fields = split /\s*;\s*/, $_, -1;
10065     if (@fields != 2) {
10066         $file->carp_bad_line("Expecting exactly two fields");
10067         $_ = "";
10068         return;
10069     }
10070
10071     # Change hyphens and blanks in the block name field only
10072     $fields[1] =~ s/[ -]/_/g;
10073     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
10074
10075     $_ = join("; ", @fields);
10076     return;
10077 }
10078
10079 { # Closure
10080     my $current_property;
10081
10082     sub filter_old_style_proplist {
10083         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
10084         # was in a completely different syntax.  Ken Whistler of Unicode says
10085         # that it was something he used as an aid for his own purposes, but
10086         # was never an official part of the standard.  However, comments in
10087         # DAge.txt indicate that non-character code points were available in
10088         # the UCD as of 3.1.  It is unclear to me (khw) how they could be
10089         # there except through this file (but on the other hand, they first
10090         # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10091         # not.  But the claim is that it was published as an aid to others who
10092         # might want some more information than was given in the official UCD
10093         # of the time.  Many of the properties in it were incorporated into
10094         # the later PropList.txt, but some were not.  This program uses this
10095         # early file to generate property tables that are otherwise not
10096         # accessible in the early UCD's, and most were probably not really
10097         # official at that time, so one could argue that it should be ignored,
10098         # and you can easily modify things to skip this.  And there are bugs
10099         # in this file in various versions.  (For example, the 2.1.9 version
10100         # removes from Alphabetic the CJK range starting at 4E00, and they
10101         # weren't added back in until 3.1.0.)  Many of this file's properties
10102         # were later sanctioned, so this code generates tables for those
10103         # properties that aren't otherwise in the UCD of the time but
10104         # eventually did become official, and throws away the rest.  Here is a
10105         # list of all the ones that are thrown away:
10106         #   Bidi=*                       duplicates UnicodeData.txt
10107         #   Combining                    never made into official property;
10108         #                                is \P{ccc=0}
10109         #   Composite                    never made into official property.
10110         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
10111         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
10112         #   Delimiter                    never made into official property;
10113         #                                removed in 3.0.1
10114         #   Format Control               never made into official property;
10115         #                                similar to gc=cf
10116         #   High Surrogate               duplicates Blocks.txt
10117         #   Ignorable Control            never made into official property;
10118         #                                similar to di=y
10119         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
10120         #   Left of Pair                 never made into official property;
10121         #   Line Separator               duplicates UnicodeData.txt: gc=zl
10122         #   Low Surrogate                duplicates Blocks.txt
10123         #   Non-break                    was actually listed as a property
10124         #                                in 3.2, but without any code
10125         #                                points.  Unicode denies that this
10126         #                                was ever an official property
10127         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
10128         #   Numeric                      duplicates UnicodeData.txt: gc=cc
10129         #   Paired Punctuation           never made into official property;
10130         #                                appears to be gc=ps + gc=pe
10131         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
10132         #   Private Use                  duplicates UnicodeData.txt: gc=co
10133         #   Private Use High Surrogate   duplicates Blocks.txt
10134         #   Punctuation                  duplicates UnicodeData.txt: gc=p
10135         #   Space                        different definition than eventual
10136         #                                one.
10137         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
10138         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cc
10139         #   Zero-width                   never made into offical property;
10140         #                                subset of gc=cf
10141         # Most of the properties have the same names in this file as in later
10142         # versions, but a couple do not.
10143         #
10144         # This subroutine filters $_, converting it from the old style into
10145         # the new style.  Here's a sample of the old-style
10146         #
10147         #   *******************************************
10148         #
10149         #   Property dump for: 0x100000A0 (Join Control)
10150         #
10151         #   200C..200D  (2 chars)
10152         #
10153         # In the example, the property is "Join Control".  It is kept in this
10154         # closure between calls to the subroutine.  The numbers beginning with
10155         # 0x were internal to Ken's program that generated this file.
10156
10157         # If this line contains the property name, extract it.
10158         if (/^Property dump for: [^(]*\((.*)\)/) {
10159             $_ = $1;
10160
10161             # Convert white space to underscores.
10162             s/ /_/g;
10163
10164             # Convert the few properties that don't have the same name as
10165             # their modern counterparts
10166             s/Identifier_Part/ID_Continue/
10167             or s/Not_a_Character/NChar/;
10168
10169             # If the name matches an existing property, use it.
10170             if (defined property_ref($_)) {
10171                 trace "new property=", $_ if main::DEBUG && $to_trace;
10172                 $current_property = $_;
10173             }
10174             else {        # Otherwise discard it
10175                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10176                 undef $current_property;
10177             }
10178             $_ = "";    # The property is saved for the next lines of the
10179                         # file, but this defining line is of no further use,
10180                         # so clear it so that the caller won't process it
10181                         # further.
10182         }
10183         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10184
10185             # Here, the input line isn't a header defining a property for the
10186             # following section, and either we aren't in such a section, or
10187             # the line doesn't look like one that defines the code points in
10188             # such a section.  Ignore this line.
10189             $_ = "";
10190         }
10191         else {
10192
10193             # Here, we have a line defining the code points for the current
10194             # stashed property.  Anything starting with the first blank is
10195             # extraneous.  Otherwise, it should look like a normal range to
10196             # the caller.  Append the property name so that it looks just like
10197             # a modern PropList entry.
10198
10199             $_ =~ s/\s.*//;
10200             $_ .= "; $current_property";
10201         }
10202         trace $_ if main::DEBUG && $to_trace;
10203         return;
10204     }
10205 } # End closure for old style proplist
10206
10207 sub filter_old_style_normalization_lines {
10208     # For early releases of Unicode, the lines were like:
10209     #        74..2A76    ; NFKD_NO
10210     # For later releases this became:
10211     #        74..2A76    ; NFKD_QC; N
10212     # Filter $_ to look like those in later releases.
10213     # Similarly for MAYBEs
10214
10215     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10216
10217     # Also, the property FC_NFKC was abbreviated to FNC
10218     s/FNC/FC_NFKC/;
10219     return;
10220 }
10221
10222 sub finish_Unicode() {
10223     # This routine should be called after all the Unicode files have been read
10224     # in.  It:
10225     # 1) Adds the mappings for code points missing from the files which have
10226     #    defaults specified for them.
10227     # 2) At this this point all mappings are known, so it computes the type of
10228     #    each property whose type hasn't been determined yet.
10229     # 3) Calculates all the regular expression match tables based on the
10230     #    mappings.
10231     # 3) Calculates and adds the tables which are defined by Unicode, but
10232     #    which aren't derived by them
10233
10234     # For each property, fill in any missing mappings, and calculate the re
10235     # match tables.  If a property has more than one missing mapping, the
10236     # default is a reference to a data structure, and requires data from other
10237     # properties to resolve.  The sort is used to cause these to be processed
10238     # last, after all the other properties have been calculated.
10239     # (Fortunately, the missing properties so far don't depend on each other.)
10240     foreach my $property
10241         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10242         property_ref('*'))
10243     {
10244         # $perl has been defined, but isn't one of the Unicode properties that
10245         # need to be finished up.
10246         next if $property == $perl;
10247
10248         # Handle the properties that have more than one possible default
10249         if (ref $property->default_map) {
10250             my $default_map = $property->default_map;
10251
10252             # These properties have stored in the default_map:
10253             # One or more of:
10254             #   1)  A default map which applies to all code points in a
10255             #       certain class
10256             #   2)  an expression which will evaluate to the list of code
10257             #       points in that class
10258             # And
10259             #   3) the default map which applies to every other missing code
10260             #      point.
10261             #
10262             # Go through each list.
10263             while (my ($default, $eval) = $default_map->get_next_defaults) {
10264
10265                 # Get the class list, and intersect it with all the so-far
10266                 # unspecified code points yielding all the code points
10267                 # in the class that haven't been specified.
10268                 my $list = eval $eval;
10269                 if ($@) {
10270                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10271                     last;
10272                 }
10273
10274                 # Narrow down the list to just those code points we don't have
10275                 # maps for yet.
10276                 $list = $list & $property->inverse_list;
10277
10278                 # Add mappings to the property for each code point in the list
10279                 foreach my $range ($list->ranges) {
10280                     $property->add_map($range->start, $range->end, $default);
10281                 }
10282             }
10283
10284             # All remaining code points have the other mapping.  Set that up
10285             # so the normal single-default mapping code will work on them
10286             $property->set_default_map($default_map->other_default);
10287
10288             # And fall through to do that
10289         }
10290
10291         # We should have enough data now to compute the type of the property.
10292         $property->compute_type;
10293         my $property_type = $property->type;
10294
10295         next if ! $property->to_create_match_tables;
10296
10297         # Here want to create match tables for this property
10298
10299         # The Unicode db always (so far, and they claim into the future) have
10300         # the default for missing entries in binary properties be 'N' (unless
10301         # there is a '@missing' line that specifies otherwise)
10302         if ($property_type == $BINARY && ! defined $property->default_map) {
10303             $property->set_default_map('N');
10304         }
10305
10306         # Add any remaining code points to the mapping, using the default for
10307         # missing code points
10308         if (defined (my $default_map = $property->default_map)) {
10309             foreach my $range ($property->inverse_list->ranges) {
10310                 $property->add_map($range->start, $range->end, $default_map);
10311             }
10312
10313             # Make sure there is a match table for the default
10314             if (! defined $property->table($default_map)) {
10315                 $property->add_match_table($default_map);
10316             }
10317         }
10318
10319         # Have all we need to populate the match tables.
10320         my $property_name = $property->name;
10321         foreach my $range ($property->ranges) {
10322             my $map = $range->value;
10323             my $table = property_ref($property_name)->table($map);
10324             if (! defined $table) {
10325
10326                 # Integral and rational property values are not necessarily
10327                 # defined in PropValueAliases, but all other ones should be,
10328                 # starting in 5.1
10329                 if ($v_version ge v5.1.0
10330                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
10331                 {
10332                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
10333                 }
10334                 $table = property_ref($property_name)->add_match_table($map);
10335             }
10336
10337             $table->add_range($range->start, $range->end);
10338         }
10339
10340         # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
10341         # all properties have this optional prefix.  These do not get a
10342         # separate entry in the pod file, because are covered by a wild-card
10343         # entry
10344         foreach my $alias ($property->aliases) {
10345             my $Is_name = 'Is_' . $alias->name;
10346             if (! defined (my $pre_existing = property_ref($Is_name))) {
10347                 $property->add_alias($Is_name,
10348                                      Pod_Entry => 0,
10349                                      Status => $alias->status,
10350                                      Externally_Ok => 0);
10351             }
10352             else {
10353
10354                 # It seemed too much work to add in these warnings when it
10355                 # appears that Unicode has made a decision never to begin a
10356                 # property name with 'Is_', so this shouldn't happen, but just
10357                 # in case, it is a warning.
10358                 Carp::my_carp(<<END
10359 There is already an alias named $Is_name (from " . $pre_existing . "), so not
10360 creating this alias for $property.  The generated table and pod files do not
10361 warn users of this conflict.
10362 END
10363                 );
10364                 $has_Is_conflicts++;
10365             }
10366         } # End of loop through aliases for this property
10367     } # End of loop through all Unicode properties.
10368
10369     # Fill in the mappings that Unicode doesn't completely furnish.  First the
10370     # single letter major general categories.  If Unicode were to start
10371     # delivering the values, this would be redundant, but better that than to
10372     # try to figure out if should skip and not get it right.  Ths could happen
10373     # if a new major category were to be introduced, and the hard-coded test
10374     # wouldn't know about it.
10375     # This routine depends on the standard names for the general categories
10376     # being what it thinks they are, like 'Cn'.  The major categories are the
10377     # union of all the general category tables which have the same first
10378     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
10379     foreach my $minor_table ($gc->tables) {
10380         my $minor_name = $minor_table->name;
10381         next if length $minor_name == 1;
10382         if (length $minor_name != 2) {
10383             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
10384             next;
10385         }
10386
10387         my $major_name = uc(substr($minor_name, 0, 1));
10388         my $major_table = $gc->table($major_name);
10389         $major_table += $minor_table;
10390     }
10391
10392     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
10393     # defines it as LC)
10394     my $LC = $gc->table('LC');
10395     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
10396     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
10397
10398
10399     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
10400                          # deliver the correct values in it
10401         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
10402
10403         # Lt not in release 1.
10404         $LC += $gc->table('Lt') if defined $gc->table('Lt');
10405     }
10406     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
10407
10408     my $Cs = $gc->table('Cs');
10409     if (defined $Cs) {
10410         $Cs->add_note('Mostly not usable in Perl.');
10411         $Cs->add_comment(join_lines(<<END
10412 Surrogates are used exclusively for I/O in UTF-16, and should not appear in
10413 Unicode text, and hence their use will generate (usually fatal) messages
10414 END
10415         ));
10416     }
10417
10418
10419     # Folding information was introduced later into Unicode data.  To get
10420     # Perl's case ignore (/i) to work at all in releases that don't have
10421     # folding, use the best available alternative, which is lower casing.
10422     my $fold = property_ref('Simple_Case_Folding');
10423     if ($fold->is_empty) {
10424         $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
10425         $fold->add_note(join_lines(<<END
10426 WARNING: This table uses lower case as a substitute for missing fold
10427 information
10428 END
10429         ));
10430     }
10431
10432     # Multiple-character mapping was introduced later into Unicode data.  If
10433     # missing, use the single-characters maps as best available alternative
10434     foreach my $map (qw {   Uppercase_Mapping
10435                             Lowercase_Mapping
10436                             Titlecase_Mapping
10437                             Case_Folding
10438                         } ) {
10439         my $full = property_ref($map);
10440         if ($full->is_empty) {
10441             my $simple = property_ref('Simple_' . $map);
10442             $full->initialize($simple);
10443             $full->add_comment($simple->comment) if ($simple->comment);
10444             $full->add_note(join_lines(<<END
10445 WARNING: This table uses simple mapping (single-character only) as a
10446 substitute for missing multiple-character information
10447 END
10448             ));
10449         }
10450     }
10451     return
10452 }
10453
10454 sub compile_perl() {
10455     # Create perl-defined tables.  Almost all are part of the pseudo-property
10456     # named 'perl' internally to this program.  Many of these are recommended
10457     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
10458     # on those found there.
10459     # Almost all of these are equivalent to some Unicode property.
10460     # A number of these properties have equivalents restricted to the ASCII
10461     # range, with their names prefaced by 'Posix', to signify that these match
10462     # what the Posix standard says they should match.  A couple are
10463     # effectively this, but the name doesn't have 'Posix' in it because there
10464     # just isn't any Posix equivalent.
10465
10466     # 'Any' is all code points.  As an error check, instead of just setting it
10467     # to be that, construct it to be the union of all the major categories
10468     my $Any = $perl->add_match_table('Any',
10469             Description  => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
10470             Matches_All => 1);
10471
10472     foreach my $major_table ($gc->tables) {
10473
10474         # Major categories are the ones with single letter names.
10475         next if length($major_table->name) != 1;
10476
10477         $Any += $major_table;
10478     }
10479
10480     if ($Any->max != $LAST_UNICODE_CODEPOINT) {
10481         Carp::my_carp_bug("Generated highest code point ("
10482            . sprintf("%X", $Any->max)
10483            . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
10484     }
10485     if ($Any->range_count != 1 || $Any->min != 0) {
10486      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
10487     }
10488
10489     $Any->add_alias('All');
10490
10491     # Assigned is the opposite of gc=unassigned
10492     my $Assigned = $perl->add_match_table('Assigned',
10493                                 Description  => "All assigned code points",
10494                                 Initialize => ~ $gc->table('Unassigned'),
10495                                 );
10496
10497     # Our internal-only property should be treated as more than just a
10498     # synonym.
10499     $perl->add_match_table('_CombAbove')
10500             ->set_equivalent_to(property_ref('ccc')->table('Above'),
10501                                                                 Related => 1);
10502
10503     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
10504     if (defined $block) {   # This is equivalent to the block if have it.
10505         my $Unicode_ASCII = $block->table('Basic_Latin');
10506         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
10507             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
10508         }
10509     }
10510
10511     # Very early releases didn't have blocks, so initialize ASCII ourselves if
10512     # necessary
10513     if ($ASCII->is_empty) {
10514         $ASCII->initialize([ 0..127 ]);
10515     }
10516
10517     # A number of the Perl synonyms have a restricted-range synonym whose name
10518     # begins with Posix.  This hash gets filled in with them, so that they can
10519     # be populated in a small loop.
10520     my %posix_equivalent;
10521
10522     # Get the best available case definitions.  Early Unicode versions didn't
10523     # have Uppercase and Lowercase defined, so use the general category
10524     # instead for them.
10525     my $Lower = $perl->add_match_table('Lower');
10526     my $Unicode_Lower = property_ref('Lowercase');
10527     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
10528         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
10529     }
10530     else {
10531         $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
10532                                                                 Related => 1);
10533     }
10534     $posix_equivalent{'Lower'} = $Lower;
10535
10536     my $Upper = $perl->add_match_table('Upper');
10537     my $Unicode_Upper = property_ref('Uppercase');
10538     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
10539         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
10540     }
10541     else {
10542         $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
10543                                                                 Related => 1);
10544     }
10545     $posix_equivalent{'Upper'} = $Upper;
10546
10547     # Earliest releases didn't have title case.  Initialize it to empty if not
10548     # otherwise present
10549     my $Title = $perl->add_match_table('Title');
10550     my $lt = $gc->table('Lt');
10551     if (defined $lt) {
10552         $Title->set_equivalent_to($lt, Related => 1);
10553     }
10554
10555     # If this Unicode version doesn't have Cased, set up our own.  From
10556     # Unicode 5.1: Definition D120: A character C is defined to be cased if
10557     # and only if C has the Lowercase or Uppercase property or has a
10558     # General_Category value of Titlecase_Letter.
10559     unless (defined property_ref('Cased')) {
10560         my $cased = $perl->add_match_table('Cased',
10561                         Initialize => $Lower + $Upper + $Title,
10562                         Description => 'Uppercase or Lowercase or Titlecase',
10563                         );
10564     }
10565
10566     # Similarly, set up our own Case_Ignorable property if this Unicode
10567     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
10568     # C is defined to be case-ignorable if C has the value MidLetter or the
10569     # value MidNumLet for the Word_Break property or its General_Category is
10570     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
10571     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10572
10573     # Perl has long had an internal-only alias for this property.
10574     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
10575     my $case_ignorable = property_ref('Case_Ignorable');
10576     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
10577         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
10578                                                                 Related => 1);
10579     }
10580     else {
10581
10582         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
10583
10584         # The following three properties are not in early releases
10585         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
10586         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
10587         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
10588
10589         # For versions 4.1 - 5.0, there is no MidNumLet property, and
10590         # correspondingly the case-ignorable definition lacks that one.  For
10591         # 4.0, it appears that it was meant to be the same definition, but was
10592         # inadvertently omitted from the standard's text, so add it if the
10593         # property actually is there
10594         my $wb = property_ref('Word_Break');
10595         if (defined $wb) {
10596             my $midlet = $wb->table('MidLetter');
10597             $perl_case_ignorable += $midlet if defined $midlet;
10598             my $midnumlet = $wb->table('MidNumLet');
10599             $perl_case_ignorable += $midnumlet if defined $midnumlet;
10600         }
10601         else {
10602
10603             # In earlier versions of the standard, instead of the above two
10604             # properties , just the following characters were used:
10605             $perl_case_ignorable +=  0x0027  # APOSTROPHE
10606                                 +   0x00AD  # SOFT HYPHEN (SHY)
10607                                 +   0x2019; # RIGHT SINGLE QUOTATION MARK
10608         }
10609     }
10610
10611     # The remaining perl defined tables are mostly based on Unicode TR 18,
10612     # "Annex C: Compatibility Properties".  All of these have two versions,
10613     # one whose name generally begins with Posix that is posix-compliant, and
10614     # one that matches Unicode characters beyond the Posix, ASCII range
10615
10616     my $Alpha = $perl->add_match_table('Alpha',
10617                         Description => '[[:Alpha:]] extended beyond ASCII');
10618
10619     # Alphabetic was not present in early releases
10620     my $Alphabetic = property_ref('Alphabetic');
10621     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
10622         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
10623     }
10624     else {
10625
10626         # For early releases, we don't get it exactly right.  The below
10627         # includes more than it should, which in 5.2 terms is: L + Nl +
10628         # Other_Alphabetic.  Other_Alphabetic contains many characters from
10629         # Mn and Mc.  It's better to match more than we should, than less than
10630         # we should.
10631         $Alpha->initialize($gc->table('Letter')
10632                             + $gc->table('Mn')
10633                             + $gc->table('Mc'));
10634         $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
10635     }
10636     $posix_equivalent{'Alpha'} = $Alpha;
10637
10638     my $Alnum = $perl->add_match_table('Alnum',
10639                         Description => "[[:Alnum:]] extended beyond ASCII",
10640                         Initialize => $Alpha + $gc->table('Decimal_Number'),
10641                         );
10642     $posix_equivalent{'Alnum'} = $Alnum;
10643
10644     my $Word = $perl->add_match_table('Word',
10645                                 Description => '\w, including beyond ASCII',
10646                                 Initialize => $Alnum + $gc->table('Mark'),
10647                                 );
10648     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
10649     $Word += $Pc if defined $Pc;
10650
10651     # There is no [[:Word:]], so the name doesn't begin with Posix.
10652     $perl->add_match_table('PerlWord',
10653                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
10654                     Initialize => $Word & $ASCII,
10655                     );
10656
10657     my $Blank = $perl->add_match_table('Blank',
10658                                 Description => '\h, Horizontal white space',
10659
10660                                 # 200B is Zero Width Space which is for line
10661                                 # break control, and was listed as
10662                                 # Space_Separator in early releases
10663                                 Initialize => $gc->table('Space_Separator')
10664                                             +   0x0009  # TAB
10665                                             -   0x200B, # ZWSP
10666                                 );
10667     $Blank->add_alias('HorizSpace');        # Another name for it.
10668     $posix_equivalent{'Blank'} = $Blank;
10669
10670     my $VertSpace = $perl->add_match_table('VertSpace',
10671                             Description => '\v',
10672                             Initialize => $gc->table('Line_Separator')
10673                                         + $gc->table('Paragraph_Separator')
10674                                         + 0x000A  # LINE FEED
10675                                         + 0x000B  # VERTICAL TAB
10676                                         + 0x000C  # FORM FEED
10677                                         + 0x000D  # CARRIAGE RETURN
10678                                         + 0x0085, # NEL
10679                             );
10680     # No Posix equivalent for vertical space
10681
10682     my $Space = $perl->add_match_table('Space',
10683         Description => '\s including beyond ASCII plus vertical tab = [[:Space:]]',
10684         Initialize => $Blank + $VertSpace,
10685     );
10686     $posix_equivalent{'Space'} = $Space;
10687
10688     # Perl's traditional space doesn't include Vertical Tab
10689     my $SpacePerl = $perl->add_match_table('SpacePerl',
10690                                   Description => '\s, including beyond ASCII',
10691                                   Initialize => $Space - 0x000B,
10692                                 );
10693     $perl->add_match_table('PerlSpace',
10694                             Description => '\s, restricted to ASCII',
10695                             Initialize => $SpacePerl & $ASCII,
10696                             );
10697
10698     my $Cntrl = $perl->add_match_table('Cntrl',
10699                         Description => "[[:Cntrl:]] extended beyond ASCII");
10700     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
10701     $posix_equivalent{'Cntrl'} = $Cntrl;
10702
10703     # $controls is a temporary used to construct Graph.
10704     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
10705                                                 + $gc->table('Control'));
10706     # Cs not in release 1
10707     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
10708
10709     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
10710     my $Graph = $perl->add_match_table('Graph',
10711                         Description => "[[:Graph:]] extended beyond ASCII",
10712                         Initialize => ~ ($Space + $controls),
10713                         );
10714     $posix_equivalent{'Graph'} = $Graph;
10715
10716     my $Print = $perl->add_match_table('Print',
10717                         Description => "[[:Print:]] extended beyond ASCII",
10718                         Initialize => $Space + $Graph - $gc->table('Control'),
10719                         );
10720     $posix_equivalent{'Print'} = $Print;
10721
10722     my $Punct = $perl->add_match_table('Punct');
10723     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
10724
10725     # \p{punct} doesn't include the symbols, which posix does
10726     $perl->add_match_table('PosixPunct',
10727                             Description => "[[:Punct:]]",
10728                             Initialize => $ASCII & ($gc->table('Punctuation')
10729                                                     + $gc->table('Symbol')),
10730                             );
10731
10732     my $Digit = $perl->add_match_table('Digit',
10733                             Description => '\d, extended beyond just [0-9]');
10734     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
10735     $posix_equivalent{'Digit'} = $Digit;
10736
10737     # AHex was not present in early releases
10738     my $Xdigit = $perl->add_match_table('XDigit',
10739                                         Description => '[0-9A-Fa-f]');
10740     my $AHex = property_ref('ASCII_Hex_Digit');
10741     if (defined $AHex && ! $AHex->is_empty) {
10742         $Xdigit->set_equivalent_to($AHex->table('Y'), Related => 1);
10743     }
10744     else {
10745         # (Have to use hex because could be running on an non-ASCII machine,
10746         # and we want the Unicode (ASCII) values)
10747         $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66 ]);
10748     }
10749
10750     # Now, add the ASCII-restricted tables that get uniform treatment
10751     while (my ($name, $table) = each %posix_equivalent) {
10752         $perl->add_match_table("Posix$name",
10753                                 Description => "[[:$name:]]",
10754                                 Initialize => $table & $ASCII,
10755                                 );
10756     }
10757     $perl->table('PosixDigit')->add_description('\d, restricted to ASCII');
10758     $perl->table('PosixDigit')->add_description('[0-9]');
10759
10760
10761     my $dt = property_ref('Decomposition_Type');
10762     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
10763         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
10764         Perl_Extension => 1,
10765         Note => 'Perl extension consisting of the union of all non-canonical decompositions',
10766         );
10767
10768     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
10769     # than SD appeared, construct it ourselves, based on the first release SD
10770     # was in.
10771     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
10772     my $soft_dotted = property_ref('Soft_Dotted');
10773     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
10774         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
10775     }
10776     else {
10777
10778         # This list came from 3.2 Soft_Dotted.
10779         $CanonDCIJ->initialize([ 0x0069,
10780                                  0x006A,
10781                                  0x012F,
10782                                  0x0268,
10783                                  0x0456,
10784                                  0x0458,
10785                                  0x1E2D,
10786                                  0x1ECB,
10787                                ]);
10788         $CanonDCIJ = $CanonDCIJ & $Assigned;
10789     }
10790
10791     # These are used in Unicode's definition of \X
10792     my $gcb = property_ref('Grapheme_Cluster_Break');
10793     #my $extend = $perl->add_match_table('_X_Extend');
10794     my $extend = $perl->add_match_table('_GCB_Extend');
10795     # XXX until decide what todo my $begin = $perl->add_match_table('_X_Begin');
10796     if (defined $gcb) {
10797         $extend += $gcb->table('Extend') + $gcb->table('SpacingMark')
10798         #$begin += ~ ($gcb->table('Control')
10799         #             + $gcb->table('CR')
10800         #             + $gcb->table('LF'));
10801     }
10802     else {    # Old definition, used on early releases.
10803         $extend += $gc->table('Mark')
10804                     + 0x200C    # ZWNJ
10805                     + 0x200D;    # ZWJ
10806         #$begin += ~ $extend;
10807     }
10808
10809     # Create a new property specially located that is a combination of the
10810     # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10811     # Name_Alias properties.  (The final duplicates elements of the first.)  A
10812     # comment for it is constructed based on the actual properties present and
10813     # used
10814     my $perl_charname = Property->new('Perl_Charnames',
10815                                 Core_Access => '\N{...} and charnames.pm',
10816                                 Default_Map => "",
10817                                 Directory => '.',
10818                                 File => 'Name',
10819                                 Internal_Only_Warning => 1,
10820                                 Perl_Extension => 1,
10821                                 Range_Size_1 => 1,
10822                                 Type => $STRING,
10823                                 Initialize => property_ref('Unicode_1_Name'),
10824                                 );
10825     # Name overrides Unicode_1_Name
10826     $perl_charname->property_add_or_replace_non_nulls(property_ref('Name'));
10827     my @composition = ('Name', 'Unicode_1_Name');
10828
10829     if (@named_sequences) {
10830         push @composition, 'Named_Sequence';
10831         foreach my $sequence (@named_sequences) {
10832             $perl_charname->add_anomalous_entry($sequence);
10833         }
10834     }
10835
10836     my $alias_sentence = "";
10837     my $alias = property_ref('Name_Alias');
10838     if (defined $alias) {
10839         push @composition, 'Name_Alias';
10840         $alias->reset_each_range;
10841         while (my ($range) = $alias->each_range) {
10842             next if $range->value eq "";
10843             if ($range->start != $range->end) {
10844                 Carp::my_carp("Expecting only one code point in the range $range.  Just to keep going, using just the first code point;");
10845             }
10846             $perl_charname->add_duplicate($range->start, $range->value);
10847         }
10848         $alias_sentence = <<END;
10849 The Name_Alias property adds duplicate code point entries with a corrected
10850 name.  The original (less correct, but still valid) name will be physically
10851 first.
10852 END
10853     }
10854     my $comment;
10855     if (@composition <= 2) { # Always at least 2
10856         $comment = join " and ", @composition;
10857     }
10858     else {
10859         $comment = join ", ", @composition[0 .. scalar @composition - 2];
10860         $comment .= ", and $composition[-1]";
10861     }
10862
10863     # Wait for charnames to catch up
10864 #    foreach my $entry (@more_Names,
10865 #                        split "\n", <<"END"
10866 #000A; LF
10867 #000C; FF
10868 #000D; CR
10869 #0085; NEL
10870 #200C; ZWNJ
10871 #200D; ZWJ
10872 #FEFF; BOM
10873 #FEFF; BYTE ORDER MARK
10874 #END
10875 #    ) {
10876 #        #local $to_trace = 1 if main::DEBUG;
10877 #        trace $entry if main::DEBUG && $to_trace;
10878 #        my ($code_point, $name) = split /\s*;\s*/, $entry;
10879 #        $code_point = hex $code_point;
10880 #        trace $code_point, $name if main::DEBUG && $to_trace;
10881 #        $perl_charname->add_duplicate($code_point, $name);
10882 #    }
10883 #    #$perl_charname->add_comment("This file is for charnames.pm.  It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'.  Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence");
10884     $perl_charname->add_comment(join_lines( <<END
10885 This file is for charnames.pm.  It is the union of the $comment properties.
10886 Unicode_1_Name entries are used only for otherwise nameless code
10887 points.
10888 $alias_sentence
10889 END
10890     ));
10891
10892     # The combining class property used by Perl's normalize.pm is not located
10893     # in the normal mapping directory; create a copy for it.
10894     my $ccc = property_ref('Canonical_Combining_Class');
10895     my $perl_ccc = Property->new('Perl_ccc',
10896                             Default_Map => $ccc->default_map,
10897                             Full_Name => 'Perl_Canonical_Combining_Class',
10898                             Internal_Only_Warning => 1,
10899                             Perl_Extension => 1,
10900                             Pod_Entry =>0,
10901                             Type => $ENUM,
10902                             Initialize => $ccc,
10903                             File => 'CombiningClass',
10904                             Directory => '.',
10905                             );
10906     $perl_ccc->set_to_output_map(1);
10907     $perl_ccc->add_comment(join_lines(<<END
10908 This mapping is for normalize.pm.  It is currently identical to the Unicode
10909 Canonical_Combining_Class property.
10910 END
10911     ));
10912
10913     # This one match table for it is needed for calculations on output
10914     my $default = $perl_ccc->add_match_table($ccc->default_map,
10915                         Initialize => $ccc->table($ccc->default_map),
10916                         Status => $SUPPRESSED);
10917
10918     # Construct the Present_In property from the Age property.
10919     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
10920         my $default_map = $age->default_map;
10921         my $in = Property->new('In',
10922                                 Default_Map => $default_map,
10923                                 Full_Name => "Present_In",
10924                                 Internal_Only_Warning => 1,
10925                                 Perl_Extension => 1,
10926                                 Type => $ENUM,
10927                                 Initialize => $age,
10928                                 );
10929         $in->add_comment(join_lines(<<END
10930 This file should not be used for any purpose.  The values in this file are the
10931 same as for $age, and not for what $in really means.  This is because anything
10932 defined in a given release should have multiple values: that release and all
10933 higher ones.  But only one value per code point can be represented in a table
10934 like this.
10935 END
10936         ));
10937
10938         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
10939         # lowest numbered (earliest) come first, with the non-numeric one
10940         # last.
10941         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
10942                                             ? 1
10943                                             : ($b->name !~ /^[\d.]*$/)
10944                                                 ? -1
10945                                                 : $a->name <=> $b->name
10946                                             } $age->tables;
10947
10948         # The Present_In property is the cumulative age properties.  The first
10949         # one hence is identical to the first age one.
10950         my $previous_in = $in->add_match_table($first_age->name);
10951         $previous_in->set_equivalent_to($first_age, Related => 1);
10952
10953         my $description_start = "Code point's usage introduced in version ";
10954         $first_age->add_description($description_start . $first_age->name);
10955
10956         # To construct the accumlated values, for each of the age tables
10957         # starting with the 2nd earliest, merge the earliest with it, to get
10958         # all those code points existing in the 2nd earliest.  Repeat merging
10959         # the new 2nd earliest with the 3rd earliest to get all those existing
10960         # in the 3rd earliest, and so on.
10961         foreach my $current_age (@rest_ages) {
10962             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
10963
10964             my $current_in = $in->add_match_table(
10965                                     $current_age->name,
10966                                     Initialize => $current_age + $previous_in,
10967                                     Description => $description_start
10968                                                     . $current_age->name
10969                                                     . ' or earlier',
10970                                     );
10971             $previous_in = $current_in;
10972
10973             # Add clarifying material for the corresponding age file.  This is
10974             # in part because of the confusing and contradictory information
10975             # given in the Standard's documentation itself, as of 5.2.
10976             $current_age->add_description(
10977                             "Code point's usage was introduced in version "
10978                             . $current_age->name);
10979             $current_age->add_note("See also $in");
10980
10981         }
10982
10983         # And finally the code points whose usages have yet to be decided are
10984         # the same in both properties.  Note that permanently unassigned code
10985         # points actually have their usage assigned (as being permanently
10986         # unassigned), so that these tables are not the same as gc=cn.
10987         my $unassigned = $in->add_match_table($default_map);
10988         my $age_default = $age->table($default_map);
10989         $age_default->add_description(<<END
10990 Code point's usage has not been assigned in any Unicode release thus far.
10991 END
10992         );
10993         $unassigned->set_equivalent_to($age_default, Related => 1);
10994     }
10995
10996
10997     # Finished creating all the perl properties.  All non-internal non-string
10998     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
10999     # an underscore.)  These do not get a separate entry in the pod file
11000     foreach my $table ($perl->tables) {
11001         foreach my $alias ($table->aliases) {
11002             next if $alias->name =~ /^_/;
11003             $table->add_alias('Is_' . $alias->name,
11004                                Pod_Entry => 0,
11005                                Status => $alias->status,
11006                                Externally_Ok => 0);
11007         }
11008     }
11009
11010     return;
11011 }
11012
11013 sub add_perl_synonyms() {
11014     # A number of Unicode tables have Perl synonyms that are expressed in
11015     # the single-form, \p{name}.  These are:
11016     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11017     #       \p{Is_Name} as synonyms
11018     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11019     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11020     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11021     #       conflict, \p{Value} and \p{Is_Value} as well
11022     #
11023     # This routine generates these synonyms, warning of any unexpected
11024     # conflicts.
11025
11026     # Construct the list of tables to get synonyms for.  Start with all the
11027     # binary and the General_Category ones.
11028     my @tables = grep { $_->type == $BINARY } property_ref('*');
11029     push @tables, $gc->tables;
11030
11031     # If the version of Unicode includes the Script property, add its tables
11032     if (defined property_ref('Script')) {
11033         push @tables, property_ref('Script')->tables;
11034     }
11035
11036     # The Block tables are kept separate because they are treated differently.
11037     # And the earliest versions of Unicode didn't include them, so add only if
11038     # there are some.
11039     my @blocks;
11040     push @blocks, $block->tables if defined $block;
11041
11042     # Here, have the lists of tables constructed.  Process blocks last so that
11043     # if there are name collisions with them, blocks have lowest priority.
11044     # Should there ever be other collisions, manual intervention would be
11045     # required.  See the comments at the beginning of the program for a
11046     # possible way to handle those semi-automatically.
11047     foreach my $table (@tables,  @blocks) {
11048
11049         # For non-binary properties, the synonym is just the name of the
11050         # table, like Greek, but for binary properties the synonym is the name
11051         # of the property, and means the code points in its 'Y' table.
11052         my $nominal = $table;
11053         my $nominal_property = $nominal->property;
11054         my $actual;
11055         if (! $nominal->isa('Property')) {
11056             $actual = $table;
11057         }
11058         else {
11059
11060             # Here is a binary property.  Use the 'Y' table.  Verify that is
11061             # there
11062             my $yes = $nominal->table('Y');
11063             unless (defined $yes) {  # Must be defined, but is permissible to
11064                                      # be empty.
11065                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
11066                 next;
11067             }
11068             $actual = $yes;
11069         }
11070
11071         foreach my $alias ($nominal->aliases) {
11072
11073             # Attempt to create a table in the perl directory for the
11074             # candidate table, using whatever aliases in it that don't
11075             # conflict.  Also add non-conflicting aliases for all these
11076             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11077             PREFIX:
11078             foreach my $prefix ("", 'Is_', 'In_') {
11079
11080                 # Only Block properties can have added 'In_' aliases.
11081                 next if $prefix eq 'In_' and $nominal_property != $block;
11082
11083                 my $proposed_name = $prefix . $alias->name;
11084
11085                 # No Is_Is, In_In, nor combinations thereof
11086                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11087                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11088
11089                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11090
11091                 # Get a reference to any existing table in the perl
11092                 # directory with the desired name.
11093                 my $pre_existing = $perl->table($proposed_name);
11094
11095                 if (! defined $pre_existing) {
11096
11097                     # No name collision, so ok to add the perl synonym.
11098
11099                     my $make_pod_entry;
11100                     my $externally_ok;
11101                     my $status = $actual->status;
11102                     if ($nominal_property == $block) {
11103
11104                         # For block properties, the 'In' form is preferred for
11105                         # external use; the pod file contains wild cards for
11106                         # this and the 'Is' form so no entries for those; and
11107                         # we don't want people using the name without the
11108                         # 'In', so discourage that.
11109                         if ($prefix eq "") {
11110                             $make_pod_entry = 1;
11111                             $status = $status || $DISCOURAGED;
11112                             $externally_ok = 0;
11113                         }
11114                         elsif ($prefix eq 'In_') {
11115                             $make_pod_entry = 0;
11116                             $status = $status || $NORMAL;
11117                             $externally_ok = 1;
11118                         }
11119                         else {
11120                             $make_pod_entry = 0;
11121                             $status = $status || $DISCOURAGED;
11122                             $externally_ok = 0;
11123                         }
11124                     }
11125                     elsif ($prefix ne "") {
11126
11127                         # The 'Is' prefix is handled in the pod by a wild
11128                         # card, and we won't use it for an external name
11129                         $make_pod_entry = 0;
11130                         $status = $status || $NORMAL;
11131                         $externally_ok = 0;
11132                     }
11133                     else {
11134
11135                         # Here, is an empty prefix, non block.  This gets its
11136                         # own pod entry and can be used for an external name.
11137                         $make_pod_entry = 1;
11138                         $status = $status || $NORMAL;
11139                         $externally_ok = 1;
11140                     }
11141
11142                     # Here, there isn't a perl pre-existing table with the
11143                     # name.  Look through the list of equivalents of this
11144                     # table to see if one is a perl table.
11145                     foreach my $equivalent ($actual->leader->equivalents) {
11146                         next if $equivalent->property != $perl;
11147
11148                         # Here, have found a table for $perl.  Add this alias
11149                         # to it, and are done with this prefix.
11150                         $equivalent->add_alias($proposed_name,
11151                                         Pod_Entry => $make_pod_entry,
11152                                         Status => $status,
11153                                         Externally_Ok => $externally_ok);
11154                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11155                         next PREFIX;
11156                     }
11157
11158                     # Here, $perl doesn't already have a table that is a
11159                     # synonym for this property, add one.
11160                     my $added_table = $perl->add_match_table($proposed_name,
11161                                             Pod_Entry => $make_pod_entry,
11162                                             Status => $status,
11163                                             Externally_Ok => $externally_ok);
11164                     # And it will be related to the actual table, since it is
11165                     # based on it.
11166                     $added_table->set_equivalent_to($actual, Related => 1);
11167                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11168                     next;
11169                 } # End of no pre-existing.
11170
11171                 # Here, there is a pre-existing table that has the proposed
11172                 # name.  We could be in trouble, but not if this is just a
11173                 # synonym for another table that we have already made a child
11174                 # of the pre-existing one.
11175                 if ($pre_existing->is_equivalent_to($actual)) {
11176                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11177                     $pre_existing->add_alias($proposed_name);
11178                     next;
11179                 }
11180
11181                 # Here, there is a name collision, but it still could be ok if
11182                 # the tables match the identical set of code points, in which
11183                 # case, we can combine the names.  Compare each table's code
11184                 # point list to see if they are identical.
11185                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11186                 if ($pre_existing->matches_identically_to($actual)) {
11187
11188                     # Here, they do match identically.  Not a real conflict.
11189                     # Make the perl version a child of the Unicode one, except
11190                     # in the non-obvious case of where the perl name is
11191                     # already a synonym of another Unicode property.  (This is
11192                     # excluded by the test for it being its own parent.)  The
11193                     # reason for this exclusion is that then the two Unicode
11194                     # properties become related; and we don't really know if
11195                     # they are or not.  We generate documentation based on
11196                     # relatedness, and this would be misleading.  Code
11197                     # later executed in the process will cause the tables to
11198                     # be represented by a single file anyway, without making
11199                     # it look in the pod like they are necessarily related.
11200                     if ($pre_existing->parent == $pre_existing
11201                         && ($pre_existing->property == $perl
11202                             || $actual->property == $perl))
11203                     {
11204                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
11205                         $pre_existing->set_equivalent_to($actual, Related => 1);
11206                     }
11207                     elsif (main::DEBUG && $to_trace) {
11208                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
11209                         trace $pre_existing->parent;
11210                     }
11211                     next PREFIX;
11212                 }
11213
11214                 # Here they didn't match identically, there is a real conflict
11215                 # between our new name and a pre-existing property.
11216                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
11217                 $pre_existing->add_conflicting($nominal->full_name,
11218                                                'p',
11219                                                $actual);
11220
11221                 # Don't output a warning for aliases for the block
11222                 # properties (unless they start with 'In_') as it is
11223                 # expected that there will be conflicts and the block
11224                 # form loses.
11225                 if ($verbosity >= $NORMAL_VERBOSITY
11226                     && ($actual->property != $block || $prefix eq 'In_'))
11227                 {
11228                     print simple_fold(join_lines(<<END
11229 There is already an alias named $proposed_name (from " . $pre_existing . "),
11230 so not creating this alias for " . $actual
11231 END
11232                     ), "", 4);
11233                 }
11234
11235                 # Keep track for documentation purposes.
11236                 $has_In_conflicts++ if $prefix eq 'In_';
11237                 $has_Is_conflicts++ if $prefix eq 'Is_';
11238             }
11239         }
11240     }
11241
11242     # There are some properties which have No and Yes (and N and Y) as
11243     # property values, but aren't binary, and could possibly be confused with
11244     # binary ones.  So create caveats for them.  There are tables that are
11245     # named 'No', and tables that are named 'N', but confusion is not likely
11246     # unless they are the same table.  For example, N meaning Number or
11247     # Neutral is not likely to cause confusion, so don't add caveats to things
11248     # like them.
11249     foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
11250         my $yes = $property->table('Yes');
11251         if (defined $yes) {
11252             my $y = $property->table('Y');
11253             if (defined $y && $yes == $y) {
11254                 foreach my $alias ($property->aliases) {
11255                     $yes->add_conflicting($alias->name);
11256                 }
11257             }
11258         }
11259         my $no = $property->table('No');
11260         if (defined $no) {
11261             my $n = $property->table('N');
11262             if (defined $n && $no == $n) {
11263                 foreach my $alias ($property->aliases) {
11264                     $no->add_conflicting($alias->name, 'P');
11265                 }
11266             }
11267         }
11268     }
11269
11270     return;
11271 }
11272
11273 sub register_file_for_name($$$) {
11274     # Given info about a table and a datafile that it should be associated
11275     # with, register that assocation
11276
11277     my $table = shift;
11278     my $directory_ref = shift;   # Array of the directory path for the file
11279     my $file = shift;            # The file name in the final directory, [-1].
11280     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11281
11282     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
11283
11284     if ($table->isa('Property')) {
11285         $table->set_file_path(@$directory_ref, $file);
11286         push @map_properties, $table
11287                                     if $directory_ref->[0] eq $map_directory;
11288         return;
11289     }
11290
11291     # Do all of the work for all equivalent tables when called with the leader
11292     # table, so skip if isn't the leader.
11293     return if $table->leader != $table;
11294
11295     # Join all the file path components together, using slashes.
11296     my $full_filename = join('/', @$directory_ref, $file);
11297
11298     # All go in the same subdirectory of unicore
11299     if ($directory_ref->[0] ne $matches_directory) {
11300         Carp::my_carp("Unexpected directory in "
11301                 .  join('/', @{$directory_ref}, $file));
11302     }
11303
11304     # For this table and all its equivalents ...
11305     foreach my $table ($table, $table->equivalents) {
11306
11307         # Associate it with its file internally.  Don't include the
11308         # $matches_directory first component
11309         $table->set_file_path(@$directory_ref, $file);
11310         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
11311
11312         my $property = $table->property;
11313         $property = ($property == $perl)
11314                     ? ""                # 'perl' is never explicitly stated
11315                     : standardize($property->name) . '=';
11316
11317         my $deprecated = ($table->status eq $DEPRECATED)
11318                          ? $table->status_info
11319                          : "";
11320
11321         # And for each of the table's aliases...  This inner loop eventually
11322         # goes through all aliases in the UCD that we generate regex match
11323         # files for
11324         foreach my $alias ($table->aliases) {
11325             my $name = $alias->name;
11326
11327             # Generate an entry in either the loose or strict hashes, which
11328             # will translate the property and alias names combination into the
11329             # file where the table for them is stored.
11330             my $standard;
11331             if ($alias->loose_match) {
11332                 $standard = $property . standardize($alias->name);
11333                 if (exists $loose_to_file_of{$standard}) {
11334                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
11335                 }
11336                 else {
11337                     $loose_to_file_of{$standard} = $sub_filename;
11338                 }
11339             }
11340             else {
11341                 $standard = lc ($property . $name);
11342                 if (exists $stricter_to_file_of{$standard}) {
11343                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
11344                 }
11345                 else {
11346                     $stricter_to_file_of{$standard} = $sub_filename;
11347
11348                     # Tightly coupled with how utf8_heavy.pl works, for a
11349                     # floating point number that is a whole number, get rid of
11350                     # the trailing decimal point and 0's, so that utf8_heavy
11351                     # will work.  Also note that this assumes that such a
11352                     # number is matched strictly; so if that were to change,
11353                     # this would be wrong.
11354                     if ((my $integer_name = $name)
11355                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
11356                     {
11357                         $stricter_to_file_of{$property . $integer_name}
11358                             = $sub_filename;
11359                     }
11360                 }
11361             }
11362
11363             # Keep a list of the deprecated properties and their filenames
11364             if ($deprecated) {
11365                 $utf8::why_deprecated{$sub_filename} = $deprecated;
11366             }
11367         }
11368     }
11369
11370     return;
11371 }
11372
11373 {   # Closure
11374     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
11375                      # conflicts
11376     my %full_dir_name_of;   # Full length names of directories used.
11377
11378     sub construct_filename($$$) {
11379         # Return a file name for a table, based on the table name, but perhaps
11380         # changed to get rid of non-portable characters in it, and to make
11381         # sure that it is unique on a file system that allows the names before
11382         # any period to be at most 8 characters (DOS).  While we're at it
11383         # check and complain if there are any directory conflicts.
11384
11385         my $name = shift;       # The name to start with
11386         my $mutable = shift;    # Boolean: can it be changed?  If no, but
11387                                 # yet it must be to work properly, a warning
11388                                 # is given
11389         my $directories_ref = shift;  # A reference to an array containing the
11390                                 # path to the file, with each element one path
11391                                 # component.  This is used because the same
11392                                 # name can be used in different directories.
11393         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11394
11395         my $warn = ! defined wantarray;  # If true, then if the name is
11396                                 # changed, a warning is issued as well.
11397
11398         if (! defined $name) {
11399             Carp::my_carp("Undefined name in directory "
11400                           . File::Spec->join(@$directories_ref)
11401                           . ". '_' used");
11402             return '_';
11403         }
11404
11405         # Make sure that no directory names conflict with each other.  Look at
11406         # each directory in the input file's path.  If it is already in use,
11407         # assume it is correct, and is merely being re-used, but if we
11408         # truncate it to 8 characters, and find that there are two directories
11409         # that are the same for the first 8 characters, but differ after that,
11410         # then that is a problem.
11411         foreach my $directory (@$directories_ref) {
11412             my $short_dir = substr($directory, 0, 8);
11413             if (defined $full_dir_name_of{$short_dir}) {
11414                 next if $full_dir_name_of{$short_dir} eq $directory;
11415                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
11416             }
11417             else {
11418                 $full_dir_name_of{$short_dir} = $directory;
11419             }
11420         }
11421
11422         my $path = join '/', @$directories_ref;
11423         $path .= '/' if $path;
11424
11425         # Remove interior underscores.
11426         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
11427
11428         # Change any non-word character into an underscore, and truncate to 8.
11429         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
11430         substr($filename, 8) = "" if length($filename) > 8;
11431
11432         # Make sure the basename doesn't conflict with something we
11433         # might have already written. If we have, say,
11434         #     InGreekExtended1
11435         #     InGreekExtended2
11436         # they become
11437         #     InGreekE
11438         #     InGreek2
11439         my $warned = 0;
11440         while (my $num = $base_names{$path}{lc $filename}++) {
11441             $num++; # so basenames with numbers start with '2', which
11442                     # just looks more natural.
11443
11444             # Want to append $num, but if it'll make the basename longer
11445             # than 8 characters, pre-truncate $filename so that the result
11446             # is acceptable.
11447             my $delta = length($filename) + length($num) - 8;
11448             if ($delta > 0) {
11449                 substr($filename, -$delta) = $num;
11450             }
11451             else {
11452                 $filename .= $num;
11453             }
11454             if ($warn && ! $warned) {
11455                 $warned = 1;
11456                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
11457             }
11458         }
11459
11460         return $filename if $mutable;
11461
11462         # If not changeable, must return the input name, but warn if needed to
11463         # change it beyond shortening it.
11464         if ($name ne $filename
11465             && substr($name, 0, length($filename)) ne $filename) {
11466             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
11467         }
11468         return $name;
11469     }
11470 }
11471
11472 # The pod file contains a very large table.  Many of the lines in that table
11473 # would exceed a typical output window's size, and so need to be wrapped with
11474 # a hanging indent to make them look good.  The pod language is really
11475 # insufficient here.  There is no general construct to do that in pod, so it
11476 # is done here by beginning each such line with a space to cause the result to
11477 # be output without formatting, and doing all the formatting here.  This leads
11478 # to the result that if the eventual display window is too narrow it won't
11479 # look good, and if the window is too wide, no advantage is taken of that
11480 # extra width.  A further complication is that the output may be indented by
11481 # the formatter so that there is less space than expected.  What I (khw) have
11482 # done is to assume that that indent is a particular number of spaces based on
11483 # what it is in my Linux system;  people can always resize their windows if
11484 # necessary, but this is obviously less than desirable, but the best that can
11485 # be expected.
11486 my $automatic_pod_indent = 8;
11487
11488 # Try to format so that uses fewest lines, but few long left column entries
11489 # slide into the right column.  An experiment on 5.1 data yielded the
11490 # following percentages that didn't cut into the other side along with the
11491 # associated first-column widths
11492 # 69% = 24
11493 # 80% not too bad except for a few blocks
11494 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
11495 # 95% = 37;
11496 my $indent_info_column = 27;    # 75% of lines didn't have overlap
11497
11498 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
11499                     # The 3 is because of:
11500                     #   1   for the leading space to tell the pod formatter to
11501                     #       output as-is
11502                     #   1   for the flag
11503                     #   1   for the space between the flag and the main data
11504
11505 sub format_pod_line ($$$;$$) {
11506     # Take a pod line and return it, formatted properly
11507
11508     my $first_column_width = shift;
11509     my $entry = shift;  # Contents of left column
11510     my $info = shift;   # Contents of right column
11511
11512     my $status = shift || "";   # Any flag
11513
11514     my $loose_match = shift;    # Boolean.
11515     $loose_match = 1 unless defined $loose_match;
11516
11517     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11518
11519     my $flags = "";
11520     $flags .= $STRICTER if ! $loose_match;
11521
11522     $flags .= $status if $status;
11523
11524     # There is a blank in the left column to cause the pod formatter to
11525     # output the line as-is.
11526     return sprintf " %-*s%-*s %s\n",
11527                     # The first * in the format is replaced by this, the -1 is
11528                     # to account for the leading blank.  There isn't a
11529                     # hard-coded blank after this to separate the flags from
11530                     # the rest of the line, so that in the unlikely event that
11531                     # multiple flags are shown on the same line, they both
11532                     # will get displayed at the expense of that separation,
11533                     # but since they are left justified, a blank will be
11534                     # inserted in the normal case.
11535                     $FILLER - 1,
11536                     $flags,
11537
11538                     # The other * in the format is replaced by this number to
11539                     # cause the first main column to right fill with blanks.
11540                     # The -1 is for the guaranteed blank following it.
11541                     $first_column_width - $FILLER - 1,
11542                     $entry,
11543                     $info;
11544 }
11545
11546 my @zero_match_tables;  # List of tables that have no matches in this release
11547
11548 sub make_table_pod_entries($) {
11549     # This generates the entries for the pod file for a given table.
11550     # Also done at this time are any children tables.  The output looks like:
11551     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
11552
11553     my $input_table = shift;        # Table the entry is for
11554     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11555
11556     # Generate parent and all its children at the same time.
11557     return if $input_table->parent != $input_table;
11558
11559     my $property = $input_table->property;
11560     my $type = $property->type;
11561     my $full_name = $property->full_name;
11562
11563     my $count = $input_table->count;
11564     my $string_count = clarify_number($count);
11565     my $status = $input_table->status;
11566     my $status_info = $input_table->status_info;
11567
11568     my $entry_for_first_table; # The entry for the first table output.
11569                            # Almost certainly, it is the parent.
11570
11571     # For each related table (including itself), we will generate a pod entry
11572     # for each name each table goes by
11573     foreach my $table ($input_table, $input_table->children) {
11574
11575         # utf8_heavy.pl cannot deal with null string property values, so don't
11576         # output any.
11577         next if $table->name eq "";
11578
11579         # First, gather all the info that applies to this table as a whole.
11580
11581         push @zero_match_tables, $table if $count == 0;
11582
11583         my $table_property = $table->property;
11584
11585         # The short name has all the underscores removed, while the full name
11586         # retains them.  Later, we decide whether to output a short synonym
11587         # for the full one, we need to compare apples to apples, so we use the
11588         # short name's length including underscores.
11589         my $table_property_short_name_length;
11590         my $table_property_short_name
11591             = $table_property->short_name(\$table_property_short_name_length);
11592         my $table_property_full_name = $table_property->full_name;
11593
11594         # Get how much savings there is in the short name over the full one
11595         # (delta will always be <= 0)
11596         my $table_property_short_delta = $table_property_short_name_length
11597                                          - length($table_property_full_name);
11598         my @table_description = $table->description;
11599         my @table_note = $table->note;
11600
11601         # Generate an entry for each alias in this table.
11602         my $entry_for_first_alias;  # saves the first one encountered.
11603         foreach my $alias ($table->aliases) {
11604
11605             # Skip if not to go in pod.
11606             next unless $alias->make_pod_entry;
11607
11608             # Start gathering all the components for the entry
11609             my $name = $alias->name;
11610
11611             my $entry;      # Holds the left column, may include extras
11612             my $entry_ref;  # To refer to the left column's contents from
11613                             # another entry; has no extras
11614
11615             # First the left column of the pod entry.  Tables for the $perl
11616             # property always use the single form.
11617             if ($table_property == $perl) {
11618                 $entry = "\\p{$name}";
11619                 $entry_ref = "\\p{$name}";
11620             }
11621             else {    # Compound form.
11622
11623                 # Only generate one entry for all the aliases that mean true
11624                 # or false in binary properties.  Append a '*' to indicate
11625                 # some are missing.  (The heading comment notes this.)
11626                 my $wild_card_mark;
11627                 if ($type == $BINARY) {
11628                     next if $name ne 'N' && $name ne 'Y';
11629                     $wild_card_mark = '*';
11630                 }
11631                 else {
11632                     $wild_card_mark = "";
11633                 }
11634
11635                 # Colon-space is used to give a little more space to be easier
11636                 # to read;
11637                 $entry = "\\p{"
11638                         . $table_property_full_name
11639                         . ": $name$wild_card_mark}";
11640
11641                 # But for the reference to this entry, which will go in the
11642                 # right column, where space is at a premium, use equals
11643                 # without a space
11644                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
11645             }
11646
11647             # Then the right (info) column.  This is stored as components of
11648             # an array for the moment, then joined into a string later.  For
11649             # non-internal only properties, begin the info with the entry for
11650             # the first table we encountered (if any), as things are ordered
11651             # so that that one is the most descriptive.  This leads to the
11652             # info column of an entry being a more descriptive version of the
11653             # name column
11654             my @info;
11655             if ($name =~ /^_/) {
11656                 push @info,
11657                         '(For internal use by Perl, not necessarily stable)';
11658             }
11659             elsif ($entry_for_first_alias) {
11660                 push @info, $entry_for_first_alias;
11661             }
11662
11663             # If this entry is equivalent to another, add that to the info,
11664             # using the first such table we encountered
11665             if ($entry_for_first_table) {
11666                 if (@info) {
11667                     push @info, "(= $entry_for_first_table)";
11668                 }
11669                 else {
11670                     push @info, $entry_for_first_table;
11671                 }
11672             }
11673
11674             # If the name is a large integer, add an equivalent with an
11675             # exponent for better readability
11676             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
11677                 push @info, sprintf "(= %.1e)", $name
11678             }
11679
11680             my $parenthesized = "";
11681             if (! $entry_for_first_alias) {
11682
11683                 # This is the first alias for the current table.  The alias
11684                 # array is ordered so that this is the fullest, most
11685                 # descriptive alias, so it gets the fullest info.  The other
11686                 # aliases are mostly merely pointers to this one, using the
11687                 # information already added above.
11688
11689                 # Display any status message, but only on the parent table
11690                 if ($status && ! $entry_for_first_table) {
11691                     push @info, $status_info;
11692                 }
11693
11694                 # Put out any descriptive info
11695                 if (@table_description || @table_note) {
11696                     push @info, join "; ", @table_description, @table_note;
11697                 }
11698
11699                 # Look to see if there is a shorter name we can point people
11700                 # at
11701                 my $standard_name = standardize($name);
11702                 my $short_name;
11703                 my $proposed_short = $table->short_name;
11704                 if (defined $proposed_short) {
11705                     my $standard_short = standardize($proposed_short);
11706
11707                     # If the short name is shorter than the standard one, or
11708                     # even it it's not, but the combination of it and its
11709                     # short property name (as in \p{prop=short} ($perl doesn't
11710                     # have this form)) saves at least two characters, then,
11711                     # cause it to be listed as a shorter synonym.
11712                     if (length $standard_short < length $standard_name
11713                         || ($table_property != $perl
11714                             && (length($standard_short)
11715                                 - length($standard_name)
11716                                 + $table_property_short_delta)  # (<= 0)
11717                                 < -2))
11718                     {
11719                         $short_name = $proposed_short;
11720                         if ($table_property != $perl) {
11721                             $short_name = $table_property_short_name
11722                                           . "=$short_name";
11723                         }
11724                         $short_name = "\\p{$short_name}";
11725                     }
11726                 }
11727
11728                 # And if this is a compound form name, see if there is a
11729                 # single form equivalent
11730                 my $single_form;
11731                 if ($table_property != $perl) {
11732
11733                     # Special case the binary N tables, so that will print
11734                     # \P{single}, but use the Y table values to populate
11735                     # 'single', as we haven't populated the N table.
11736                     my $test_table;
11737                     my $p;
11738                     if ($type == $BINARY
11739                         && $input_table == $property->table('No'))
11740                     {
11741                         $test_table = $property->table('Yes');
11742                         $p = 'P';
11743                     }
11744                     else {
11745                         $test_table = $input_table;
11746                         $p = 'p';
11747                     }
11748
11749                     # Look for a single form amongst all the children.
11750                     foreach my $table ($test_table->children) {
11751                         next if $table->property != $perl;
11752                         my $proposed_name = $table->short_name;
11753                         next if ! defined $proposed_name;
11754
11755                         # Don't mention internal-only properties as a possible
11756                         # single form synonym
11757                         next if substr($proposed_name, 0, 1) eq '_';
11758
11759                         $proposed_name = "\\$p\{$proposed_name}";
11760                         if (! defined $single_form
11761                             || length($proposed_name) < length $single_form)
11762                         {
11763                             $single_form = $proposed_name;
11764
11765                             # The goal here is to find a single form; not the
11766                             # shortest possible one.  We've already found a
11767                             # short name.  So, stop at the first single form
11768                             # found, which is likely to be closer to the
11769                             # original.
11770                             last;
11771                         }
11772                     }
11773                 }
11774
11775                 # Ouput both short and single in the same parenthesized
11776                 # expression, but with only one of 'Single', 'Short' if there
11777                 # are both items.
11778                 if ($short_name || $single_form || $table->conflicting) {
11779                     $parenthesized .= '(';
11780                     $parenthesized .= "Short: $short_name" if $short_name;
11781                     if ($short_name && $single_form) {
11782                         $parenthesized .= ', ';
11783                     }
11784                     elsif ($single_form) {
11785                         $parenthesized .= 'Single: ';
11786                     }
11787                     $parenthesized .= $single_form if $single_form;
11788                 }
11789             }
11790
11791
11792             # Warn if this property isn't the same as one that a
11793             # semi-casual user might expect.  The other components of this
11794             # parenthesized structure are calculated only for the first entry
11795             # for this table, but the conflicting is deemed important enough
11796             # to go on every entry.
11797             my $conflicting = join " NOR ", $table->conflicting;
11798             if ($conflicting) {
11799                 $parenthesized .= '(' if ! $parenthesized;
11800                 $parenthesized .=  '; ' if $parenthesized ne '(';
11801                 $parenthesized .= "NOT $conflicting";
11802             }
11803             $parenthesized .= ')' if $parenthesized;
11804
11805             push @info, $parenthesized if $parenthesized;
11806             push @info, "($string_count)" if $output_range_counts;
11807
11808             # Now, we have both the entry and info so add them to the
11809             # list of all the properties.
11810             push @match_properties,
11811                 format_pod_line($indent_info_column,
11812                                 $entry,
11813                                 join( " ", @info),
11814                                 $alias->status,
11815                                 $alias->loose_match);
11816
11817             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
11818         } # End of looping through the aliases for this table.
11819
11820         if (! $entry_for_first_table) {
11821             $entry_for_first_table = $entry_for_first_alias;
11822         }
11823     } # End of looping through all the related tables
11824     return;
11825 }
11826
11827 sub pod_alphanumeric_sort {
11828     # Sort pod entries alphanumerically.
11829
11830     # The first few character columns are filler; and get rid of all the
11831     # trailing stuff, starting with the trailing '}', so as to sort on just
11832     # '\p{Name=Value'
11833     my $a = lc substr($a, $FILLER);
11834     $a =~ s/}.*//;
11835     my $b = lc substr($b, $FILLER);
11836     $b =~ s/}.*//;
11837
11838     # Determine if the two operands are numeric property values or not.
11839     # A numeric property will look like \p{xyz: 3}.  But the number
11840     # can begin with an optional minus sign, and may have a
11841     # fraction or rational component, like \p{xyz: 3/2}.  If either
11842     # isn't numeric, use alphabetic sort.
11843     my ($a_initial, $a_number) =
11844         ($a =~ /^\\p{ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
11845     return $a cmp $b unless defined $a_number;
11846     my ($b_initial, $b_number) =
11847         ($b =~ /^\\p{ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
11848     return $a cmp $b unless defined $b_number;
11849
11850     # Here they are both numeric, but use alphabetic sort if the
11851     # initial parts don't match
11852     return $a cmp $b if $a_initial ne $b_initial;
11853
11854     # Convert rationals to floating for the comparison.
11855     $a_number = eval $a_number if $a_number =~ qr{/};
11856     $b_number = eval $b_number if $b_number =~ qr{/};
11857
11858     return $a_number <=> $b_number;
11859 }
11860
11861 sub make_pod () {
11862     # Create the .pod file.  This generates the various subsections and then
11863     # combines them in one big HERE document.
11864
11865     return unless defined $pod_directory;
11866     print "Making pod file\n" if $verbosity >= $PROGRESS;
11867
11868     my $exception_message =
11869     '(Any exceptions are individually noted beginning with the word NOT.)';
11870     my @block_warning;
11871     if (-e 'Blocks.txt') {
11872
11873         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
11874         # if the global $has_In_conflicts indicates we have them.
11875         push @match_properties, format_pod_line($indent_info_column,
11876                                                 '\p{In_*}',
11877                                                 '\p{Block: *}'
11878                                                     . (($has_In_conflicts)
11879                                                       ? " $exception_message"
11880                                                       : ""));
11881         @block_warning = << "END";
11882
11883 Matches in the Block property have shortcuts that begin with 'In_'.  For
11884 example, \\p{Block=Latin1} can be written as \\p{In_Latin1}.  For backward
11885 compatibility, if there is no conflict with another shortcut, these may also
11886 be written as \\p{Latin1} or \\p{Is_Latin1}.  But, N.B., there are numerous
11887 such conflicting shortcuts.  Use of these forms for Block is discouraged, and
11888 are flagged as such, not only because of the potential confusion as to what is
11889 meant, but also because a later release of Unicode may preempt the shortcut,
11890 and your program would no longer be correct.  Use the 'In_' form instead to
11891 avoid this, or even more clearly, use the compound form, e.g.,
11892 \\p{blk:latin1}.  See L<perlunicode/"Blocks"> for more information about this.
11893 END
11894     }
11895     my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
11896     $text = "$exception_message $text" if $has_Is_conflicts;
11897
11898     # And the 'Is_ line';
11899     push @match_properties, format_pod_line($indent_info_column,
11900                                             '\p{Is_*}',
11901                                             "\\p{*} $text");
11902
11903     # Sort the properties array for output.  It is sorted alphabetically
11904     # except numerically for numeric properties, and only output unique lines.
11905     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
11906
11907     my $formatted_properties = simple_fold(\@match_properties,
11908                                         "",
11909                                         # indent succeeding lines by two extra
11910                                         # which looks better
11911                                         $indent_info_column + 2,
11912
11913                                         # shorten the line length by how much
11914                                         # the formatter indents, so the folded
11915                                         # line will fit in the space
11916                                         # presumably available
11917                                         $automatic_pod_indent);
11918     # Add column headings, indented to be a little more centered, but not
11919     # exactly
11920     $formatted_properties =  format_pod_line($indent_info_column,
11921                                                     '    NAME',
11922                                                     '           INFO')
11923                                     . "\n"
11924                                     . $formatted_properties;
11925
11926     # Generate pod documentation lines for the tables that match nothing
11927     my $zero_matches;
11928     if (@zero_match_tables) {
11929         @zero_match_tables = uniques(@zero_match_tables);
11930         $zero_matches = join "\n\n",
11931                         map { $_ = '=item \p{' . $_->complete_name . "}" }
11932                             sort { $a->complete_name cmp $b->complete_name }
11933                             uniques(@zero_match_tables);
11934
11935         $zero_matches = <<END;
11936
11937 =head2 Legal \\p{} and \\P{} constructs that match no characters
11938
11939 Unicode has some property-value pairs that currently don't match anything.
11940 This happens generally either because they are obsolete, or for symmetry with
11941 other forms, but no language has yet been encoded that uses them.  In this
11942 version of Unicode, the following match zero code points:
11943
11944 =over 4
11945
11946 $zero_matches
11947
11948 =back
11949
11950 END
11951     }
11952
11953     # Generate list of properties that we don't accept, grouped by the reasons
11954     # why.  This is so only put out the 'why' once, and then list all the
11955     # properties that have that reason under it.
11956
11957     my %why_list;   # The keys are the reasons; the values are lists of
11958                     # properties that have the key as their reason
11959
11960     # For each property, add it to the list that are suppressed for its reason
11961     # The sort will cause the alphabetically first properties to be added to
11962     # each list first, so each list will be sorted.
11963     foreach my $property (sort keys %why_suppressed) {
11964         push @{$why_list{$why_suppressed{$property}}}, $property;
11965     }
11966
11967     # For each reason (sorted by the first property that has that reason)...
11968     my @bad_re_properties;
11969     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
11970                      keys %why_list)
11971     {
11972         # Add to the output, all the properties that have that reason.  Start
11973         # with an empty line.
11974         push @bad_re_properties, "\n\n";
11975
11976         my $has_item = 0;   # Flag if actually output anything.
11977         foreach my $name (@{$why_list{$why}}) {
11978
11979             # Split compound names into $property and $table components
11980             my $property = $name;
11981             my $table;
11982             if ($property =~ / (.*) = (.*) /x) {
11983                 $property = $1;
11984                 $table = $2;
11985             }
11986
11987             # This release of Unicode may not have a property that is
11988             # suppressed, so don't reference a non-existent one.
11989             $property = property_ref($property);
11990             next if ! defined $property;
11991
11992             # And since this list is only for match tables, don't list the
11993             # ones that don't have match tables.
11994             next if ! $property->to_create_match_tables;
11995
11996             # Find any abbreviation, and turn it into a compound name if this
11997             # is a property=value pair.
11998             my $short_name = $property->name;
11999             $short_name .= '=' . $property->table($table)->name if $table;
12000
12001             # And add the property as an item for the reason.
12002             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12003             $has_item = 1;
12004         }
12005
12006         # And add the reason under the list of properties, if such a list
12007         # actually got generated.  Note that the header got added
12008         # unconditionally before.  But pod ignores extra blank lines, so no
12009         # harm.
12010         push @bad_re_properties, "\n$why\n" if $has_item;
12011
12012     } # End of looping through each reason.
12013
12014     # Generate a list of the properties whose map table we output, from the
12015     # global @map_properties.
12016     my @map_tables_actually_output;
12017     my $info_indent = 20;       # Left column is narrower than \p{} table.
12018     foreach my $property (@map_properties) {
12019
12020         # Get the path to the file; don't output any not in the standard
12021         # directory.
12022         my @path = $property->file_path;
12023         next if $path[0] ne $map_directory;
12024         shift @path;    # Remove the standard name
12025
12026         my $file = join '/', @path; # In case is in sub directory
12027         my $info = $property->full_name;
12028         my $short_name = $property->name;
12029         if ($info ne $short_name) {
12030             $info .= " ($short_name)";
12031         }
12032         foreach my $more_info ($property->description,
12033                                $property->note,
12034                                $property->status_info)
12035         {
12036             next unless $more_info;
12037             $info =~ s/\.\Z//;
12038             $info .= ".  $more_info";
12039         }
12040         push @map_tables_actually_output, format_pod_line($info_indent,
12041                                                           $file,
12042                                                           $info,
12043                                                           $property->status);
12044     }
12045
12046     # Sort alphabetically, and fold for output
12047     @map_tables_actually_output = sort
12048                             pod_alphanumeric_sort @map_tables_actually_output;
12049     @map_tables_actually_output
12050                         = simple_fold(\@map_tables_actually_output,
12051                                         ' ',
12052                                         $info_indent,
12053                                         $automatic_pod_indent);
12054
12055     # Generate a list of the formats that can appear in the map tables.
12056     my @map_table_formats;
12057     foreach my $format (sort keys %map_table_formats) {
12058         push @map_table_formats, " $format    $map_table_formats{$format}\n";
12059     }
12060
12061     # Everything is ready to assemble.
12062     my @OUT = << "END";
12063 =begin comment
12064
12065 $HEADER
12066
12067 To change this file, edit $0 instead.
12068
12069 =end comment
12070
12071 =head1 NAME
12072
12073 $pod_file - Complete index of Unicode Version $string_version properties in the Perl core.
12074
12075 =head1 DESCRIPTION
12076
12077 There are many properties in Unicode, and Perl provides access to almost all of
12078 them, as well as some additional extensions and short-cut synonyms.
12079
12080 And just about all of the few that aren't accessible through the Perl
12081 core are accessible through the modules: Unicode::Normalize and
12082 Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12083
12084 This document merely lists all available properties and does not attempt to
12085 explain what each property really means.  There is a brief description of each
12086 Perl extension.  There is some detail about Blocks, Scripts, General_Category,
12087 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12088 Unicode properties, refer to the Unicode standard.  A good starting place is
12089 L<$unicode_reference_url>.  More information on the Perl extensions is in
12090 L<perlrecharclass>.
12091
12092 Note that you can define your own properties; see
12093 L<perlunicode/"User-Defined Character Properties">.
12094
12095 =head1 Properties accessible through \\p{} and \\P{}
12096
12097 The Perl regular expression \\p{} and \\P{} constructs give access to most of
12098 the Unicode character properties.  The table below shows all these constructs,
12099 both single and compound forms.
12100
12101 B<Compound forms> consist of two components, separated by an equals sign or a
12102 colon.  The first component is the property name, and the second component is
12103 the particular value of the property to match against, for example,
12104 '\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
12105 whose Script property is Greek.
12106
12107 B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12108 their equivalent compound forms.  The table shows these equivalences.  (In our
12109 example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.)
12110 There are also a few Perl-defined single forms that are not shortcuts for a
12111 compound form.  One such is \\p{Word}.  These are also listed in the table.
12112
12113 In parsing these constructs, Perl always ignores Upper/lower case differences
12114 everywhere within the {braces}.  Thus '\\p{Greek}' means the same thing as
12115 '\\p{greek}'.  But note that changing the case of the 'p' or 'P' before the
12116 left brace completely changes the meaning of the construct, from "match" (for
12117 '\\p{}') to "doesn't match" (for '\\P{}').  Casing in this document is for
12118 improved legibility.
12119
12120 Also, white space, hyphens, and underscores are also normally ignored
12121 everywhere between the {braces}, and hence can be freely added or removed
12122 even if the C</x> modifier hasn't been specified on the regular expression.
12123 But $a_bold_stricter at the beginning of an entry in the table below
12124 means that tighter (stricter) rules are used for that entry:
12125
12126 =over 4
12127
12128 =item Single form (\\p{name}) tighter rules:
12129
12130 White space, hyphens, and underscores ARE significant
12131 except for:
12132
12133 =over 4
12134
12135 =item * white space adjacent to a non-word character
12136
12137 =item * underscores separating digits in numbers
12138
12139 =back
12140
12141 That means, for example, that you can freely add or remove white space
12142 adjacent to (but within) the braces without affecting the meaning.
12143
12144 =item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12145
12146 The tighter rules given above for the single form apply to everything to the
12147 right of the colon or equals; the looser rules still apply to everything to
12148 the left.
12149
12150 That means, for example, that you can freely add or remove white space
12151 adjacent to (but within) the braces and the colon or equal sign.
12152
12153 =back
12154
12155 Some properties are considered obsolete, but still available.  There are
12156 several varieties of obsolesence:
12157
12158 =over 4
12159
12160 =item Obsolete
12161
12162 Properties marked with $a_bold_obsolete in the table are considered
12163 obsolete.  At the time of this writing (Unicode version 5.2) there is no
12164 information in the Unicode standard about the implications of a property being
12165 obsolete.
12166
12167 =item Stabilized
12168
12169 Obsolete properties may be stabilized.  This means that they are not actively
12170 maintained by Unicode, and will not be extended as new characters are added to
12171 the standard.  Such properties are marked with $a_bold_stabilized in the
12172 table.  At the time of this writing (Unicode version 5.2) there is no further
12173 information in the Unicode standard about the implications of a property being
12174 stabilized.
12175
12176 =item Deprecated
12177
12178 Obsolete properties may be deprecated.  This means that their use is strongly
12179 discouraged, so much so that a warning will be issued if used, unless the
12180 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
12181 statement.  $A_bold_deprecated flags each such entry in the table, and
12182 the entry there for the longest, most descriptive version of the property will
12183 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
12184 warning, even for properties that aren't officially deprecated by Unicode,
12185 when there used to be characters or code points that were matched by them, but
12186 no longer.  This is to warn you that your program may not work like it did on
12187 earlier Unicode releases.
12188
12189 A deprecated property may be made unavailable in a future Perl version, so it
12190 is best to move away from them.
12191
12192 =back
12193
12194 Some Perl extensions are present for backwards compatibility and are
12195 discouraged from being used, but not obsolete.  $A_bold_discouraged
12196 flags each such entry in the table.
12197
12198 @block_warning
12199
12200 The table below has two columns.  The left column contains the \\p{}
12201 constructs to look up, possibly preceeded by the flags mentioned above; and
12202 the right column contains information about them, like a description, or
12203 synonyms.  It shows both the single and compound forms for each property that
12204 has them.  If the left column is a short name for a property, the right column
12205 will give its longer, more descriptive name; and if the left column is the
12206 longest name, the right column will show any equivalent shortest name, in both
12207 single and compound forms if applicable.
12208
12209 The right column will also caution you if a property means something different
12210 than what might normally be expected.
12211
12212 Numbers in (parentheses) indicate the total number of code points matched by
12213 the property.  For emphasis, those properties that match no code points at all
12214 are listed as well in a separate section following the table.
12215
12216 There is no description given for most non-Perl defined properties (See
12217 $unicode_reference_url for that).
12218
12219 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
12220 combinations.  For example, entries like:
12221
12222  \\p{Gc: *}                                  \\p{General_Category: *}
12223
12224 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
12225 for the latter is also valid for the former.  Similarly,
12226
12227  \\p{Is_*}                                   \\p{*}
12228
12229 means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
12230 \\p{IsFoo} are also valid and all mean the same thing.  And similarly,
12231 \\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}.  '*' here
12232 is restricted to something not beginning with an underscore.
12233
12234 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
12235 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
12236 'N*' to indicate this, and doesn't have separate entries for the other
12237 possibilities.  Note that not all properties which have values 'Yes' and 'No'
12238 are binary, and they have all their values spelled out without using this wild
12239 card, and a C<NOT> clause in their description that highlights their not being
12240 binary.  These also require the compound form to match them, whereas true
12241 binary properties have both single and compound forms available.
12242
12243 Note that all non-essential underscores are removed in the display of the
12244 short names below.
12245
12246 B<Summary legend:>
12247
12248 =over 4
12249
12250 =item B<*> is a wild-card
12251
12252 =item B<(\\d+)> in the info column gives the number of code points matched by
12253 this property.
12254
12255 =item B<$DEPRECATED> means this is deprecated.
12256
12257 =item B<$OBSOLETE> means this is obsolete.
12258
12259 =item B<$STABILIZED> means this is stabilized.
12260
12261 =item B<$STRICTER> means tighter (stricter) name matching applies.
12262
12263 =item B<$DISCOURAGED> means use of this form is discouraged.
12264
12265 =back
12266
12267 $formatted_properties
12268
12269 $zero_matches
12270
12271 =head1 Properties not accessible through \\p{} and \\P{}
12272
12273 A few properties are accessible in Perl via various function calls only.
12274 These are:
12275  Lowercase_Mapping          lc() and lcfirst()
12276  Titlecase_Mapping          ucfirst()
12277  Uppercase_Mapping          uc()
12278
12279 Case_Folding is accessible through the /i modifier in regular expressions.
12280
12281 The Name property is accessible through the \\N{} interpolation in
12282 double-quoted strings and regular expressions, but both usages require a C<use
12283 charnames;> to be specified, which also contains related functions viacode()
12284 and vianame().
12285
12286 =head1 Unicode regular expression properties that are NOT accepted by Perl
12287
12288 Perl will generate an error for a few character properties in Unicode when
12289 used in a regular expression.  The non-Unihan ones are listed below, with the
12290 reasons they are not accepted, perhaps with work-arounds.  The short names for
12291 the properties are listed enclosed in (parentheses).
12292
12293 =over 4
12294
12295 @bad_re_properties
12296
12297 =back
12298
12299 An installation can choose to allow any of these to be matched by changing the
12300 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12301 and then re-running F<$0>.  (C<\%Config> is available from the Config module).
12302
12303 =head1 Files in the I<To> directory (for serious hackers only)
12304
12305 All Unicode properties are really mappings (in the mathematical sense) from
12306 code points to their respective values.  As part of its build process,
12307 Perl constructs tables containing these mappings for all properties that it
12308 deals with.  But only a few of these are written out into files.
12309 Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
12310 (%Config is available from the Config module).
12311
12312 Those ones written are ones needed by Perl internally during execution, or for
12313 which there is some demand, and those for which there is no access through the
12314 Perl core.  Generally, properties that can be used in regular expression
12315 matching do not have their map tables written, like Script.  Nor are the
12316 simplistic properties that have a better, more complete version, such as
12317 Simple_Uppercase_Mapping  (Uppercase_Mapping is written instead).
12318
12319 None of the properties in the I<To> directory are currently directly
12320 accessible through the Perl core, although some may be accessed indirectly.
12321 For example, the uc() function implements the Uppercase_Mapping property and
12322 uses the F<Upper.pl> file found in this directory.
12323
12324 The available files with their properties (short names in parentheses),
12325 and any flags or comments about them, are:
12326
12327 @map_tables_actually_output
12328
12329 An installation can choose to change which files are generated by changing the
12330 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12331 and then re-running F<$0>.
12332
12333 Each of these files defines two hash entries to help reading programs decipher
12334 it.  One of them looks like this:
12335
12336     \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
12337
12338 where 'NAME' is a name to indicate the property.  For backwards compatibility,
12339 this is not necessarily the property's official Unicode name.  (The 'To' is
12340 also for backwards compatibility.)  The hash entry gives the format of the
12341 mapping fields of the table, currently one of the following:
12342
12343  @map_table_formats
12344
12345 This format applies only to the entries in the main body of the table.
12346 Entries defined in hashes or ones that are missing from the list can have a
12347 different format.
12348
12349 The value that the missing entries have is given by the other SwashInfo hash
12350 entry line; it looks like this:
12351
12352     \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
12353
12354 This example line says that any Unicode code points not explicitly listed in
12355 the file have the value 'NaN' under the property indicated by NAME.  If the
12356 value is the special string C<< <code point> >>, it means that the value for
12357 any missing code point is the code point itself.  This happens, for example,
12358 in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
12359 character 'A', are missing because the uppercase of 'A' is itself.
12360
12361 =head1 SEE ALSO
12362
12363 L<$unicode_reference_url>
12364
12365 L<perlrecharclass>
12366
12367 L<perlunicode>
12368
12369 END
12370
12371     # And write it.
12372     main::write([ $pod_directory, "$pod_file.pod" ], @OUT);
12373     return;
12374 }
12375
12376 sub make_Heavy () {
12377     # Create and write Heavy.pl, which passes info about the tables to
12378     # utf8_heavy.pl
12379
12380     my @heavy = <<END;
12381 $HEADER
12382 $INTERNAL_ONLY
12383
12384 # This file is for the use of utf8_heavy.pl
12385
12386 # Maps property names in loose standard form to its standard name
12387 \%utf8::loose_property_name_of = (
12388 END
12389
12390     push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
12391     push @heavy, <<END;
12392 );
12393
12394 # Maps property, table to file for those using stricter matching
12395 \%utf8::stricter_to_file_of = (
12396 END
12397     push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
12398     push @heavy, <<END;
12399 );
12400
12401 # Maps property, table to file for those using loose matching
12402 \%utf8::loose_to_file_of = (
12403 END
12404     push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
12405     push @heavy, <<END;
12406 );
12407
12408 # Maps floating point to fractional form
12409 \%utf8::nv_floating_to_rational = (
12410 END
12411     push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
12412     push @heavy, <<END;
12413 );
12414
12415 # If a floating point number doesn't have enough digits in it to get this
12416 # close to a fraction, it isn't considered to be that fraction even if all the
12417 # digits it does have match.
12418 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12419
12420 # Deprecated tables to generate a warning for.  The key is the file containing
12421 # the table, so as to avoid duplication, as many property names can map to the
12422 # file, but we only need one entry for all of them.
12423 \%utf8::why_deprecated = (
12424 END
12425
12426     push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
12427     push @heavy, <<END;
12428 );
12429
12430 1;
12431 END
12432
12433     main::write("Heavy.pl", @heavy);
12434     return;
12435 }
12436
12437 sub write_all_tables() {
12438     # Write out all the tables generated by this program to files, as well as
12439     # the supporting data structures, pod file, and .t file.
12440
12441     my @writables;              # List of tables that actually get written
12442     my %match_tables_to_write;  # Used to collapse identical match tables
12443                                 # into one file.  Each key is a hash function
12444                                 # result to partition tables into buckets.
12445                                 # Each value is an array of the tables that
12446                                 # fit in the bucket.
12447
12448     # For each property ...
12449     # (sort so that if there is an immutable file name, it has precedence, so
12450     # some other property can't come in and take over its file name.  If b's
12451     # file name is defined, will return 1, meaning to take it first; don't
12452     # care if both defined, as they had better be different anyway)
12453     PROPERTY:
12454     foreach my $property (sort { defined $b->file } property_ref('*')) {
12455         my $type = $property->type;
12456
12457         # And for each table for that property, starting with the mapping
12458         # table for it ...
12459         TABLE:
12460         foreach my $table($property,
12461
12462                         # and all the match tables for it (if any), sorted so
12463                         # the ones with the shortest associated file name come
12464                         # first.  The length sorting prevents problems of a
12465                         # longer file taking a name that might have to be used
12466                         # by a shorter one.  The alphabetic sorting prevents
12467                         # differences between releases
12468                         sort {  my $ext_a = $a->external_name;
12469                                 return 1 if ! defined $ext_a;
12470                                 my $ext_b = $b->external_name;
12471                                 return -1 if ! defined $ext_b;
12472                                 my $cmp = length $ext_a <=> length $ext_b;
12473
12474                                 # Return result if lengths not equal
12475                                 return $cmp if $cmp;
12476
12477                                 # Alphabetic if lengths equal
12478                                 return $ext_a cmp $ext_b
12479                         } $property->tables
12480                     )
12481         {
12482
12483             # Here we have a table associated with a property.  It could be
12484             # the map table (done first for each property), or one of the
12485             # other tables.  Determine which type.
12486             my $is_property = $table->isa('Property');
12487
12488             my $name = $table->name;
12489             my $complete_name = $table->complete_name;
12490
12491             # See if should suppress the table if is empty, but warn if it
12492             # contains something.
12493             my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
12494                                     keys %why_suppress_if_empty_warn_if_not;
12495
12496             # Calculate if this table should have any code points associated
12497             # with it or not.
12498             my $expected_empty =
12499
12500                 # $perl should be empty, as well as properties that we just
12501                 # don't do anything with
12502                 ($is_property
12503                     && ($table == $perl
12504                         || grep { $complete_name eq $_ }
12505                                                     @unimplemented_properties
12506                     )
12507                 )
12508
12509                 # Match tables in properties we skipped populating should be
12510                 # empty
12511                 || (! $is_property && ! $property->to_create_match_tables)
12512
12513                 # Tables and properties that are expected to have no code
12514                 # points should be empty
12515                 || $suppress_if_empty_warn_if_not
12516             ;
12517
12518             # Set a boolean if this table is the complement of an empty binary
12519             # table
12520             my $is_complement_of_empty_binary =
12521                 $type == $BINARY &&
12522                 (($table == $property->table('Y')
12523                     && $property->table('N')->is_empty)
12524                 || ($table == $property->table('N')
12525                     && $property->table('Y')->is_empty));
12526
12527
12528             # Some tables should match everything
12529             my $expected_full =
12530                 ($is_property)
12531                 ? # All these types of map tables will be full because
12532                   # they will have been populated with defaults
12533                   ($type == $ENUM || $type == $BINARY)
12534
12535                 : # A match table should match everything if its method
12536                   # shows it should
12537                   ($table->matches_all
12538
12539                   # The complement of an empty binary table will match
12540                   # everything
12541                   || $is_complement_of_empty_binary
12542                   )
12543             ;
12544
12545             if ($table->is_empty) {
12546
12547
12548                 if ($suppress_if_empty_warn_if_not) {
12549                     $table->set_status($SUPPRESSED,
12550                         $why_suppress_if_empty_warn_if_not{$complete_name});
12551                 }
12552
12553                 # Suppress expected empty tables.
12554                 next TABLE if $expected_empty;
12555
12556                 # And setup to later output a warning for those that aren't
12557                 # known to be allowed to be empty.  Don't do the warning if
12558                 # this table is a child of another one to avoid duplicating
12559                 # the warning that should come from the parent one.
12560                 if (($table == $property || $table->parent == $table)
12561                     && $table->status ne $SUPPRESSED
12562                     && ! grep { $complete_name =~ /^$_$/ }
12563                                                     @tables_that_may_be_empty)
12564                 {
12565                     push @unhandled_properties, "$table";
12566                 }
12567             }
12568             elsif ($expected_empty) {
12569                 my $because = "";
12570                 if ($suppress_if_empty_warn_if_not) {
12571                     $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
12572                 }
12573
12574                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
12575             }
12576
12577             my $count = $table->count;
12578             if ($expected_full) {
12579                 if ($count != $MAX_UNICODE_CODEPOINTS) {
12580                     Carp::my_carp("$table matches only "
12581                     . clarify_number($count)
12582                     . " Unicode code points but should match "
12583                     . clarify_number($MAX_UNICODE_CODEPOINTS)
12584                     . " (off by "
12585                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
12586                     . ").  Proceeding anyway.");
12587                 }
12588
12589                 # Here is expected to be full.  If it is because it is the
12590                 # complement of an (empty) binary table that is to be
12591                 # suppressed, then suppress this one as well.
12592                 if ($is_complement_of_empty_binary) {
12593                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
12594                     my $opposing = $property->table($opposing_name);
12595                     my $opposing_status = $opposing->status;
12596                     if ($opposing_status) {
12597                         $table->set_status($opposing_status,
12598                                            $opposing->status_info);
12599                     }
12600                 }
12601             }
12602             elsif ($count == $MAX_UNICODE_CODEPOINTS) {
12603                 if ($table == $property || $table->leader == $table) {
12604                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
12605                 }
12606             }
12607
12608             if ($table->status eq $SUPPRESSED) {
12609                 if (! $is_property) {
12610                     my @children = $table->children;
12611                     foreach my $child (@children) {
12612                         if ($child->status ne $SUPPRESSED) {
12613                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
12614                         }
12615                     }
12616                 }
12617                 next TABLE;
12618
12619             }
12620             if (! $is_property) {
12621
12622                 # Several things need to be done just once for each related
12623                 # group of match tables.  Do them on the parent.
12624                 if ($table->parent == $table) {
12625
12626                     # Add an entry in the pod file for the table; it also does
12627                     # the children.
12628                     make_table_pod_entries($table);
12629
12630                     # See if the the table matches identical code points with
12631                     # something that has already been output.  In that case,
12632                     # no need to have two files with the same code points in
12633                     # them.  We use the table's hash() method to store these
12634                     # in buckets, so that it is quite likely that if two
12635                     # tables are in the same bucket they will be identical, so
12636                     # don't have to compare tables frequently.  The tables
12637                     # have to have the same status to share a file, so add
12638                     # this to the bucket hash.  (The reason for this latter is
12639                     # that Heavy.pl associates a status with a file.)
12640                     my $hash = $table->hash . ';' . $table->status;
12641
12642                     # Look at each table that is in the same bucket as this
12643                     # one would be.
12644                     foreach my $comparison (@{$match_tables_to_write{$hash}})
12645                     {
12646                         if ($table->matches_identically_to($comparison)) {
12647                             $table->set_equivalent_to($comparison,
12648                                                                 Related => 0);
12649                             next TABLE;
12650                         }
12651                     }
12652
12653                     # Here, not equivalent, add this table to the bucket.
12654                     push @{$match_tables_to_write{$hash}}, $table;
12655                 }
12656             }
12657             else {
12658
12659                 # Here is the property itself.
12660                 # Don't write out or make references to the $perl property
12661                 next if $table == $perl;
12662
12663                 if ($type != $STRING) {
12664
12665                     # There is a mapping stored of the various synonyms to the
12666                     # standardized name of the property for utf8_heavy.pl.
12667                     # Also, the pod file contains entries of the form:
12668                     # \p{alias: *}         \p{full: *}
12669                     # rather than show every possible combination of things.
12670
12671                     my @property_aliases = $property->aliases;
12672
12673                     # The full name of this property is stored by convention
12674                     # first in the alias array
12675                     my $full_property_name =
12676                                 '\p{' . $property_aliases[0]->name . ': *}';
12677                     my $standard_property_name = standardize($table->name);
12678
12679                     # For each synonym ...
12680                     for my $i (0 .. @property_aliases - 1)  {
12681                         my $alias = $property_aliases[$i];
12682                         my $alias_name = $alias->name;
12683                         my $alias_standard = standardize($alias_name);
12684
12685                         # Set the mapping for utf8_heavy of the alias to the
12686                         # property
12687                         if (exists ($loose_property_name_of{$alias_standard}))
12688                         {
12689                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
12690                         }
12691                         else {
12692                             $loose_property_name_of{$alias_standard}
12693                                                 = $standard_property_name;
12694                         }
12695
12696                         # Now for the pod entry for this alias.  Skip
12697                         # the first one, which is the full name so won't have
12698                         # an entry like: '\p{full: *}   \p{full: *}', and skip
12699                         # if don't want an entry for this one.
12700                         next if $i == 0 || ! $alias->make_pod_entry;
12701
12702                         push @match_properties,
12703                             format_pod_line($indent_info_column,
12704                                         '\p{' . $alias->name . ': *}',
12705                                         $full_property_name,
12706                                         $alias->status);
12707                     }
12708                 } # End of non-string-like property code
12709
12710
12711                 # Don't output a mapping file if not desired.
12712                 next if ! $property->to_output_map;
12713             }
12714
12715             # Here, we know we want to write out the table, but don't do it
12716             # yet because there may be other tables that come along and will
12717             # want to share the file, and the file's comments will change to
12718             # mention them.  So save for later.
12719             push @writables, $table;
12720
12721         } # End of looping through the property and all its tables.
12722     } # End of looping through all properties.
12723
12724     # Now have all the tables that will have files written for them.  Do it.
12725     foreach my $table (@writables) {
12726         my @directory;
12727         my $filename;
12728         my $property = $table->property;
12729         my $is_property = ($table == $property);
12730         if (! $is_property) {
12731
12732             # Match tables for the property go in lib/$subdirectory, which is
12733             # the property's name.  Don't use the standard file name for this,
12734             # as may get an unfamiliar alias
12735             @directory = ($matches_directory, $property->external_name);
12736         }
12737         else {
12738
12739             @directory = $table->directory;
12740             $filename = $table->file;
12741         }
12742
12743         # Use specified filename if avaliable, or default to property's
12744         # shortest name.  We need an 8.3 safe filename (which means "an 8
12745         # safe" filename, since after the dot is only 'pl', which is < 3)
12746         # The 2nd parameter is if the filename shouldn't be changed, and
12747         # it shouldn't iff there is a hard-coded name for this table.
12748         $filename = construct_filename(
12749                                 $filename || $table->external_name,
12750                                 ! $filename,    # mutable if no filename
12751                                 \@directory);
12752
12753         register_file_for_name($table, \@directory, $filename);
12754
12755         # Only need to write one file when shared by more than one
12756         # property
12757         next if ! $is_property && $table->leader != $table;
12758
12759         # Construct a nice comment to add to the file
12760         $table->set_final_comment;
12761
12762         $table->write;
12763     }
12764
12765
12766     # Write out the pod file
12767     make_pod;
12768
12769     # And Heavy.pl
12770     make_Heavy;
12771
12772     make_property_test_script() if $make_test_script;
12773     return;
12774 }
12775
12776 my @white_space_separators = ( # This used only for making the test script.
12777                             "",
12778                             ' ',
12779                             "\t",
12780                             '   '
12781                         );
12782
12783 sub generate_separator($) {
12784     # This used only for making the test script.  It generates the colon or
12785     # equal separator between the property and property value, with random
12786     # white space surrounding the separator
12787
12788     my $lhs = shift;
12789
12790     return "" if $lhs eq "";  # No separator if there's only one (the r) side
12791
12792     # Choose space before and after randomly
12793     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
12794     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
12795
12796     # And return the whole complex, half the time using a colon, half the
12797     # equals
12798     return $spaces_before
12799             . (rand() < 0.5) ? '=' : ':'
12800             . $spaces_after;
12801 }
12802
12803 sub generate_tests($$$$$$) {
12804     # This used only for making the test script.  It generates test cases that
12805     # are expected to compile successfully in perl.  Note that the lhs and
12806     # rhs are assumed to already be as randomized as the caller wants.
12807
12808     my $file_handle = shift;   # Where to output the tests
12809     my $lhs = shift;           # The property: what's to the left of the colon
12810                                #  or equals separator
12811     my $rhs = shift;           # The property value; what's to the right
12812     my $valid_code = shift;    # A code point that's known to be in the
12813                                # table given by lhs=rhs; undef if table is
12814                                # empty
12815     my $invalid_code = shift;  # A code point known to not be in the table;
12816                                # undef if the table is all code points
12817     my $warning = shift;
12818
12819     # Get the colon or equal
12820     my $separator = generate_separator($lhs);
12821
12822     # The whole 'property=value'
12823     my $name = "$lhs$separator$rhs";
12824
12825     # Create a complete set of tests, with complements.
12826     if (defined $valid_code) {
12827         printf $file_handle
12828                     qq/Expect(1, $valid_code, '\\p{$name}', $warning);\n/;
12829         printf $file_handle
12830                     qq/Expect(0, $valid_code, '\\p{^$name}', $warning);\n/;
12831         printf $file_handle
12832                     qq/Expect(0, $valid_code, '\\P{$name}', $warning);\n/;
12833         printf $file_handle
12834                     qq/Expect(1, $valid_code, '\\P{^$name}', $warning);\n/;
12835     }
12836     if (defined $invalid_code) {
12837         printf $file_handle
12838                     qq/Expect(0, $invalid_code, '\\p{$name}', $warning);\n/;
12839         printf $file_handle
12840                     qq/Expect(1, $invalid_code, '\\p{^$name}', $warning);\n/;
12841         printf $file_handle
12842                     qq/Expect(1, $invalid_code, '\\P{$name}', $warning);\n/;
12843         printf $file_handle
12844                     qq/Expect(0, $invalid_code, '\\P{^$name}', $warning);\n/;
12845     }
12846     return;
12847 }
12848
12849 sub generate_error($$$$) {
12850     # This used only for making the test script.  It generates test cases that
12851     # are expected to not only not match, but to be syntax or similar errors
12852
12853     my $file_handle = shift;        # Where to output to.
12854     my $lhs = shift;                # The property: what's to the left of the
12855                                     # colon or equals separator
12856     my $rhs = shift;                # The property value; what's to the right
12857     my $already_in_error = shift;   # Boolean; if true it's known that the
12858                                 # unmodified lhs and rhs will cause an error.
12859                                 # This routine should not force another one
12860     # Get the colon or equal
12861     my $separator = generate_separator($lhs);
12862
12863     # Since this is an error only, don't bother to randomly decide whether to
12864     # put the error on the left or right side; and assume that the rhs is
12865     # loosely matched, again for convenience rather than rigor.
12866     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
12867
12868     my $property = $lhs . $separator . $rhs;
12869
12870     print $file_handle qq/Error('\\p{$property}');\n/;
12871     print $file_handle qq/Error('\\P{$property}');\n/;
12872     return;
12873 }
12874
12875 # These are used only for making the test script
12876 # XXX Maybe should also have a bad strict seps, which includes underscore.
12877
12878 my @good_loose_seps = (
12879             " ",
12880             "-",
12881             "\t",
12882             "",
12883             "_",
12884            );
12885 my @bad_loose_seps = (
12886            "/a/",
12887            ':=',
12888           );
12889
12890 sub randomize_stricter_name {
12891     # This used only for making the test script.  Take the input name and
12892     # return a randomized, but valid version of it under the stricter matching
12893     # rules.
12894
12895     my $name = shift;
12896     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12897
12898     # If the name looks like a number (integer, floating, or rational), do
12899     # some extra work
12900     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
12901         my $sign = $1;
12902         my $number = $2;
12903         my $separator = $3;
12904
12905         # If there isn't a sign, part of the time add a plus
12906         # Note: Not testing having any denominator having a minus sign
12907         if (! $sign) {
12908             $sign = '+' if rand() <= .3;
12909         }
12910
12911         # And add 0 or more leading zeros.
12912         $name = $sign . ('0' x int rand(10)) . $number;
12913
12914         if (defined $separator) {
12915             my $extra_zeros = '0' x int rand(10);
12916
12917             if ($separator eq '.') {
12918
12919                 # Similarly, add 0 or more trailing zeros after a decimal
12920                 # point
12921                 $name .= $extra_zeros;
12922             }
12923             else {
12924
12925                 # Or, leading zeros before the denominator
12926                 $name =~ s,/,/$extra_zeros,;
12927             }
12928         }
12929     }
12930
12931     # For legibility of the test, only change the case of whole sections at a
12932     # time.  To do this, first split into sections.  The split returns the
12933     # delimiters
12934     my @sections;
12935     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
12936         trace $section if main::DEBUG && $to_trace;
12937
12938         if (length $section > 1 && $section !~ /\D/) {
12939
12940             # If the section is a sequence of digits, about half the time
12941             # randomly add underscores between some of them.
12942             if (rand() > .5) {
12943
12944                 # Figure out how many underscores to add.  max is 1 less than
12945                 # the number of digits.  (But add 1 at the end to make sure
12946                 # result isn't 0, and compensate earlier by subtracting 2
12947                 # instead of 1)
12948                 my $num_underscores = int rand(length($section) - 2) + 1;
12949
12950                 # And add them evenly throughout, for convenience, not rigor
12951                 use integer;
12952                 my $spacing = (length($section) - 1)/ $num_underscores;
12953                 my $temp = $section;
12954                 $section = "";
12955                 for my $i (1 .. $num_underscores) {
12956                     $section .= substr($temp, 0, $spacing, "") . '_';
12957                 }
12958                 $section .= $temp;
12959             }
12960             push @sections, $section;
12961         }
12962         else {
12963
12964             # Here not a sequence of digits.  Change the case of the section
12965             # randomly
12966             my $switch = int rand(4);
12967             if ($switch == 0) {
12968                 push @sections, uc $section;
12969             }
12970             elsif ($switch == 1) {
12971                 push @sections, lc $section;
12972             }
12973             elsif ($switch == 2) {
12974                 push @sections, ucfirst $section;
12975             }
12976             else {
12977                 push @sections, $section;
12978             }
12979         }
12980     }
12981     trace "returning", join "", @sections if main::DEBUG && $to_trace;
12982     return join "", @sections;
12983 }
12984
12985 sub randomize_loose_name($;$) {
12986     # This used only for making the test script
12987
12988     my $name = shift;
12989     my $want_error = shift;  # if true, make an error
12990     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12991
12992     $name = randomize_stricter_name($name);
12993
12994     my @parts;
12995     push @parts, $good_loose_seps[rand(@good_loose_seps)];
12996     for my $part (split /[-\s_]+/, $name) {
12997         if (@parts) {
12998             if ($want_error and rand() < 0.3) {
12999                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
13000                 $want_error = 0;
13001             }
13002             else {
13003                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13004             }
13005         }
13006         push @parts, $part;
13007     }
13008     my $new = join("", @parts);
13009     trace "$name => $new" if main::DEBUG && $to_trace;
13010
13011     if ($want_error) {
13012         if (rand() >= 0.5) {
13013             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
13014         }
13015         else {
13016             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
13017         }
13018     }
13019     return $new;
13020 }
13021
13022 # Used to make sure don't generate duplicate test cases.
13023 my %test_generated;
13024
13025 sub make_property_test_script() {
13026     # This used only for making the test script
13027     # this written directly -- it's huge.
13028
13029     print "Making test script\n" if $verbosity >= $PROGRESS;
13030
13031     # This uses randomness to test different possibilities without testing all
13032     # possibilities.  To ensure repeatability, set the seed to 0.  But if
13033     # tests are added, it will perturb all later ones in the .t file
13034     srand 0;
13035
13036     force_unlink ($t_path);
13037     push @files_actually_output, $t_path;
13038     my $OUT;
13039     if (not open $OUT, "> $t_path") {
13040         Carp::my_carp("Can't open $t_path.  Skipping: $!");
13041         return;
13042     }
13043
13044     # Keep going down an order of magnitude
13045     # until find that adding this quantity to
13046     # 1 remains 1; but put an upper limit on
13047     # this so in case this algorithm doesn't
13048     # work properly on some platform, that we
13049     # won't loop forever.
13050     my $digits = 0;
13051     my $min_floating_slop = 1;
13052     while (1+ $min_floating_slop != 1
13053             && $digits++ < 50)
13054     {
13055         my $next = $min_floating_slop / 10;
13056         last if $next == 0; # If underflows,
13057                             # use previous one
13058         $min_floating_slop = $next;
13059     }
13060     print $OUT $HEADER, <DATA>;
13061
13062     foreach my $property (property_ref('*')) {
13063         foreach my $table ($property->tables) {
13064
13065             # Find code points that match, and don't match this table.
13066             my $valid = $table->get_valid_code_point;
13067             my $invalid = $table->get_invalid_code_point;
13068             my $warning = ($table->status eq $DEPRECATED)
13069                             ? "'deprecated'"
13070                             : '""';
13071
13072             # Test each possible combination of the property's aliases with
13073             # the table's.  If this gets to be too many, could do what is done
13074             # in the set_final_comment() for Tables
13075             my @table_aliases = $table->aliases;
13076             my @property_aliases = $table->property->aliases;
13077             my $max = max(scalar @table_aliases, scalar @property_aliases);
13078             for my $j (0 .. $max - 1) {
13079
13080                 # The current alias for property is the next one on the list,
13081                 # or if beyond the end, start over.  Similarly for table
13082                 my $property_name
13083                             = $property_aliases[$j % @property_aliases]->name;
13084
13085                 $property_name = "" if $table->property == $perl;
13086                 my $table_alias = $table_aliases[$j % @table_aliases];
13087                 my $table_name = $table_alias->name;
13088                 my $loose_match = $table_alias->loose_match;
13089
13090                 # If the table doesn't have a file, any test for it is
13091                 # already guaranteed to be in error
13092                 my $already_error = ! $table->file_path;
13093
13094                 # Generate error cases for this alias.
13095                 generate_error($OUT,
13096                                 $property_name,
13097                                 $table_name,
13098                                 $already_error);
13099
13100                 # If the table is guaranteed to always generate an error,
13101                 # quit now without generating success cases.
13102                 next if $already_error;
13103
13104                 # Now for the success cases.
13105                 my $random;
13106                 if ($loose_match) {
13107
13108                     # For loose matching, create an extra test case for the
13109                     # standard name.
13110                     my $standard = standardize($table_name);
13111
13112                     # $test_name should be a unique combination for each test
13113                     # case; used just to avoid duplicate tests
13114                     my $test_name = "$property_name=$standard";
13115
13116                     # Don't output duplicate test cases.
13117                     if (! exists $test_generated{$test_name}) {
13118                         $test_generated{$test_name} = 1;
13119                         generate_tests($OUT,
13120                                         $property_name,
13121                                         $standard,
13122                                         $valid,
13123                                         $invalid,
13124                                         $warning,
13125                                     );
13126                     }
13127                     $random = randomize_loose_name($table_name)
13128                 }
13129                 else { # Stricter match
13130                     $random = randomize_stricter_name($table_name);
13131                 }
13132
13133                 # Now for the main test case for this alias.
13134                 my $test_name = "$property_name=$random";
13135                 if (! exists $test_generated{$test_name}) {
13136                     $test_generated{$test_name} = 1;
13137                     generate_tests($OUT,
13138                                     $property_name,
13139                                     $random,
13140                                     $valid,
13141                                     $invalid,
13142                                     $warning,
13143                                 );
13144
13145                     # If the name is a rational number, add tests for the
13146                     # floating point equivalent.
13147                     if ($table_name =~ qr{/}) {
13148
13149                         # Calculate the float, and find just the fraction.
13150                         my $float = eval $table_name;
13151                         my ($whole, $fraction)
13152                                             = $float =~ / (.*) \. (.*) /x;
13153
13154                         # Starting with one digit after the decimal point,
13155                         # create a test for each possible precision (number of
13156                         # digits past the decimal point) until well beyond the
13157                         # native number found on this machine.  (If we started
13158                         # with 0 digits, it would be an integer, which could
13159                         # well match an unrelated table)
13160                         PLACE:
13161                         for my $i (1 .. $min_floating_slop + 3) {
13162                             my $table_name = sprintf("%.*f", $i, $float);
13163                             if ($i < $MIN_FRACTION_LENGTH) {
13164
13165                                 # If the test case has fewer digits than the
13166                                 # minimum acceptable precision, it shouldn't
13167                                 # succeed, so we expect an error for it.
13168                                 # E.g., 2/3 = .7 at one decimal point, and we
13169                                 # shouldn't say it matches .7.  We should make
13170                                 # it be .667 at least before agreeing that the
13171                                 # intent was to match 2/3.  But at the
13172                                 # less-than- acceptable level of precision, it
13173                                 # might actually match an unrelated number.
13174                                 # So don't generate a test case if this
13175                                 # conflating is possible.  In our example, we
13176                                 # don't want 2/3 matching 7/10, if there is
13177                                 # a 7/10 code point.
13178                                 for my $existing
13179                                         (keys %nv_floating_to_rational)
13180                                 {
13181                                     next PLACE
13182                                         if abs($table_name - $existing)
13183                                                 < $MAX_FLOATING_SLOP;
13184                                 }
13185                                 generate_error($OUT,
13186                                             $property_name,
13187                                             $table_name,
13188                                             1   # 1 => already an error
13189                                 );
13190                             }
13191                             else {
13192
13193                                 # Here the number of digits exceeds the
13194                                 # minimum we think is needed.  So generate a
13195                                 # success test case for it.
13196                                 generate_tests($OUT,
13197                                                 $property_name,
13198                                                 $table_name,
13199                                                 $valid,
13200                                                 $invalid,
13201                                                 $warning,
13202                                 );
13203                             }
13204                         }
13205                     }
13206                 }
13207             }
13208         }
13209     }
13210     print $OUT "Finished();\n";
13211     close $OUT;
13212     return;
13213 }
13214
13215 # This is a list of the input files and how to handle them.  The files are
13216 # processed in their order in this list.  Some reordering is possible if
13217 # desired, but the v0 files should be first, and the extracted before the
13218 # others except DAge.txt (as data in an extracted file can be over-ridden by
13219 # the non-extracted.  Some other files depend on data derived from an earlier
13220 # file, like UnicodeData requires data from Jamo, and the case changing and
13221 # folding requires data from Unicode.  Mostly, it safest to order by first
13222 # version releases in (except the Jamo).  DAge.txt is read before the
13223 # extracted ones because of the rarely used feature $compare_versions.  In the
13224 # unlikely event that there were ever an extracted file that contained the Age
13225 # property information, it would have to go in front of DAge.
13226 #
13227 # The version strings allow the program to know whether to expect a file or
13228 # not, but if a file exists in the directory, it will be processed, even if it
13229 # is in a version earlier than expected, so you can copy files from a later
13230 # release into an earlier release's directory.
13231 my @input_file_objects = (
13232     Input_file->new('PropertyAliases.txt', v0,
13233                     Handler => \&process_PropertyAliases,
13234                     ),
13235     Input_file->new(undef, v0,  # No file associated with this
13236                     Progress_Message => 'Finishing Property Setup',
13237                     Handler => \&finish_property_setup,
13238                     ),
13239     Input_file->new('PropValueAliases.txt', v0,
13240                      Handler => \&process_PropValueAliases,
13241                      Has_Missings_Defaults => $NOT_IGNORED,
13242                      ),
13243     Input_file->new('DAge.txt', v3.2.0,
13244                     Has_Missings_Defaults => $NOT_IGNORED,
13245                     Property => 'Age'
13246                     ),
13247     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
13248                     Property => 'General_Category',
13249                     ),
13250     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
13251                     Property => 'Canonical_Combining_Class',
13252                     Has_Missings_Defaults => $NOT_IGNORED,
13253                     ),
13254     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
13255                     Property => 'Numeric_Type',
13256                     Has_Missings_Defaults => $NOT_IGNORED,
13257                     ),
13258     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
13259                     Property => 'East_Asian_Width',
13260                     Has_Missings_Defaults => $NOT_IGNORED,
13261                     ),
13262     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
13263                     Property => 'Line_Break',
13264                     Has_Missings_Defaults => $NOT_IGNORED,
13265                     ),
13266     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
13267                     Property => 'Bidi_Class',
13268                     Has_Missings_Defaults => $NOT_IGNORED,
13269                     ),
13270     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
13271                     Property => 'Decomposition_Type',
13272                     Has_Missings_Defaults => $NOT_IGNORED,
13273                     ),
13274     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
13275     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
13276                     Property => 'Numeric_Value',
13277                     Each_Line_Handler => \&filter_numeric_value_line,
13278                     Has_Missings_Defaults => $NOT_IGNORED,
13279                     ),
13280     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
13281                     Property => 'Joining_Group',
13282                     Has_Missings_Defaults => $NOT_IGNORED,
13283                     ),
13284
13285     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
13286                     Property => 'Joining_Type',
13287                     Has_Missings_Defaults => $NOT_IGNORED,
13288                     ),
13289     Input_file->new('Jamo.txt', v2.0.0,
13290                     Property => 'Jamo_Short_Name',
13291                     Each_Line_Handler => \&filter_jamo_line,
13292                     ),
13293     Input_file->new('UnicodeData.txt', v1.1.5,
13294                     Pre_Handler => \&setup_UnicodeData,
13295
13296                     # We clean up this file for some early versions.
13297                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
13298                                             ? \&filter_v1_ucd
13299                                             : ($v_version eq v2.1.5)
13300                                                 ? \&filter_v2_1_5_ucd
13301                                                 : undef),
13302
13303                                             # And the main filter
13304                                             \&filter_UnicodeData_line,
13305                                          ],
13306                     EOF_Handler => \&EOF_UnicodeData,
13307                     ),
13308     Input_file->new('ArabicShaping.txt', v2.0.0,
13309                     Each_Line_Handler =>
13310                         [ ($v_version lt 4.1.0)
13311                                     ? \&filter_old_style_arabic_shaping
13312                                     : undef,
13313                         \&filter_arabic_shaping_line,
13314                         ],
13315                     Has_Missings_Defaults => $NOT_IGNORED,
13316                     ),
13317     Input_file->new('Blocks.txt', v2.0.0,
13318                     Property => 'Block',
13319                     Has_Missings_Defaults => $NOT_IGNORED,
13320                     Each_Line_Handler => \&filter_blocks_lines
13321                     ),
13322     Input_file->new('PropList.txt', v2.0.0,
13323                     Each_Line_Handler => (($v_version lt v3.1.0)
13324                                             ? \&filter_old_style_proplist
13325                                             : undef),
13326                     ),
13327     Input_file->new('Unihan.txt', v2.0.0,
13328                     Pre_Handler => \&setup_unihan,
13329                     Optional => 1,
13330                     Each_Line_Handler => \&filter_unihan_line,
13331                         ),
13332     Input_file->new('SpecialCasing.txt', v2.1.8,
13333                     Each_Line_Handler => \&filter_special_casing_line,
13334                     Pre_Handler => \&setup_special_casing,
13335                     ),
13336     Input_file->new(
13337                     'LineBreak.txt', v3.0.0,
13338                     Has_Missings_Defaults => $NOT_IGNORED,
13339                     Property => 'Line_Break',
13340                     # Early versions had problematic syntax
13341                     Each_Line_Handler => (($v_version lt v3.1.0)
13342                                         ? \&filter_early_ea_lb
13343                                         : undef),
13344                     ),
13345     Input_file->new('EastAsianWidth.txt', v3.0.0,
13346                     Property => 'East_Asian_Width',
13347                     Has_Missings_Defaults => $NOT_IGNORED,
13348                     # Early versions had problematic syntax
13349                     Each_Line_Handler => (($v_version lt v3.1.0)
13350                                         ? \&filter_early_ea_lb
13351                                         : undef),
13352                     ),
13353     Input_file->new('CompositionExclusions.txt', v3.0.0,
13354                     Property => 'Composition_Exclusion',
13355                     ),
13356     Input_file->new('BidiMirroring.txt', v3.0.1,
13357                     Property => 'Bidi_Mirroring_Glyph',
13358                     ),
13359     Input_file->new('CaseFolding.txt', v3.0.1,
13360                     Pre_Handler => \&setup_case_folding,
13361                     Each_Line_Handler =>
13362                         [ ($v_version lt v3.1.0)
13363                                  ? \&filter_old_style_case_folding
13364                                  : undef,
13365                            \&filter_case_folding_line
13366                         ],
13367                     Post_Handler => \&post_fold,
13368                     ),
13369     Input_file->new('DCoreProperties.txt', v3.1.0,
13370                     # 5.2 changed this file
13371                     Has_Missings_Defaults => (($v_version ge v5.2.0)
13372                                             ? $NOT_IGNORED
13373                                             : $NO_DEFAULTS),
13374                     ),
13375     Input_file->new('Scripts.txt', v3.1.0,
13376                     Property => 'Script',
13377                     Has_Missings_Defaults => $NOT_IGNORED,
13378                     ),
13379     Input_file->new('DNormalizationProps.txt', v3.1.0,
13380                     Has_Missings_Defaults => $NOT_IGNORED,
13381                     Each_Line_Handler => (($v_version lt v4.0.1)
13382                                       ? \&filter_old_style_normalization_lines
13383                                       : undef),
13384                     ),
13385     Input_file->new('HangulSyllableType.txt', v4.0.0,
13386                     Has_Missings_Defaults => $NOT_IGNORED,
13387                     Property => 'Hangul_Syllable_Type'),
13388     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
13389                     Property => 'Word_Break',
13390                     Has_Missings_Defaults => $NOT_IGNORED,
13391                     ),
13392     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
13393                     Property => 'Grapheme_Cluster_Break',
13394                     Has_Missings_Defaults => $NOT_IGNORED,
13395                     ),
13396     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
13397                     Property => 'Sentence_Break',
13398                     Has_Missings_Defaults => $NOT_IGNORED,
13399                     ),
13400     Input_file->new('NamedSequences.txt', v4.1.0,
13401                     Handler => \&process_NamedSequences
13402                     ),
13403     Input_file->new('NameAliases.txt', v5.0.0,
13404                     Property => 'Name_Alias',
13405                     ),
13406     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
13407                     Optional => 1,
13408                     Each_Line_Handler => \&filter_unihan_line,
13409                     ),
13410     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
13411                     Optional => 1,
13412                     Each_Line_Handler => \&filter_unihan_line,
13413                     ),
13414     Input_file->new('UnihanIRGSources.txt', v5.2.0,
13415                     Optional => 1,
13416                     Pre_Handler => \&setup_unihan,
13417                     Each_Line_Handler => \&filter_unihan_line,
13418                     ),
13419     Input_file->new('UnihanNumericValues.txt', v5.2.0,
13420                     Optional => 1,
13421                     Each_Line_Handler => \&filter_unihan_line,
13422                     ),
13423     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
13424                     Optional => 1,
13425                     Each_Line_Handler => \&filter_unihan_line,
13426                     ),
13427     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
13428                     Optional => 1,
13429                     Each_Line_Handler => \&filter_unihan_line,
13430                     ),
13431     Input_file->new('UnihanReadings.txt', v5.2.0,
13432                     Optional => 1,
13433                     Each_Line_Handler => \&filter_unihan_line,
13434                     ),
13435     Input_file->new('UnihanVariants.txt', v5.2.0,
13436                     Optional => 1,
13437                     Each_Line_Handler => \&filter_unihan_line,
13438                     ),
13439 );
13440
13441 # End of all the preliminaries.
13442 # Do it...
13443
13444 if ($compare_versions) {
13445     Carp::my_carp(<<END
13446 Warning.  \$compare_versions is set.  Output is not suitable for production
13447 END
13448     );
13449 }
13450
13451 # Put into %potential_files a list of all the files in the directory structure
13452 # that could be inputs to this program, excluding those that we should ignore.
13453 # Also don't consider test files.  Use absolute file names because it makes it
13454 # easier across machine types.
13455 my @ignored_files_full_names = map { File::Spec->rel2abs(
13456                                      internal_file_to_platform($_))
13457                                 } keys %ignored_files;
13458 File::Find::find({
13459     wanted=>sub {
13460         return unless /\.txt$/i;
13461         return if /Test\.txt$/i;
13462         my $full = File::Spec->rel2abs($_);
13463         $potential_files{$full} = 1
13464                         if ! grep { $full eq $_ } @ignored_files_full_names;
13465         return;
13466     }
13467 }, File::Spec->curdir());
13468
13469 my @mktables_list_output_files;
13470
13471 if ($write_unchanged_files) {
13472     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
13473 }
13474 else {
13475     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
13476     my $file_handle;
13477     if (! open $file_handle,"<",$file_list) {
13478         Carp::my_carp("Failed to open '$file_list', turning on -globlist option instead: $!");
13479         $glob_list = 1;
13480     }
13481     else {
13482         my @input;
13483
13484         # Read and parse mktables.lst, placing the results from the first part
13485         # into @input, and the second part into @mktables_list_output_files
13486         for my $list ( \@input, \@mktables_list_output_files ) {
13487             while (<$file_handle>) {
13488                 s/^ \s+ | \s+ $//xg;
13489                 next if /^ \s* (?: \# .* )? $/x;
13490                 last if /^ =+ $/x;
13491                 my ( $file ) = split /\t/;
13492                 push @$list, $file;
13493             }
13494             @$list = uniques(@$list);
13495             next;
13496         }
13497
13498         # Look through all the input files
13499         foreach my $input (@input) {
13500             next if $input eq 'version'; # Already have checked this.
13501
13502             # Ignore if doesn't exist.  The checking about whether we care or
13503             # not is done via the Input_file object.
13504             next if ! file_exists($input);
13505
13506             # The paths are stored with relative names, and with '/' as the
13507             # delimiter; convert to absolute on this machine
13508             my $full = File::Spec->rel2abs(internal_file_to_platform($input));
13509             $potential_files{$full} = 1
13510                         if ! grep { $full eq $_ } @ignored_files_full_names;
13511         }
13512     }
13513
13514     close $file_handle;
13515 }
13516
13517 if ($glob_list) {
13518
13519     # Here wants to process all .txt files in the directory structure.
13520     # Convert them to full path names.  They are stored in the platform's
13521     # relative style
13522     my @known_files;
13523     foreach my $object (@input_file_objects) {
13524         my $file = $object->file;
13525         next unless defined $file;
13526         push @known_files, File::Spec->rel2abs($file);
13527     }
13528
13529     my @unknown_input_files;
13530     foreach my $file (keys %potential_files) {
13531         next if grep { $file eq $_ } @known_files;
13532
13533         # Here, the file is unknown to us.  Get relative path name
13534         $file = File::Spec->abs2rel($file);
13535         push @unknown_input_files, $file;
13536
13537         # What will happen is we create a data structure for it, and add it to
13538         # the list of input files to process.  First get the subdirectories
13539         # into an array
13540         my (undef, $directories, undef) = File::Spec->splitpath($file);
13541         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
13542         my @directories = File::Spec->splitdir($directories);
13543
13544         # If the file isn't extracted (meaning none of the directories is the
13545         # extracted one), just add it to the end of the list of inputs.
13546         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
13547             push @input_file_objects, Input_file->new($file);
13548         }
13549         else {
13550
13551             # Here, the file is extracted.  It needs to go ahead of most other
13552             # processing.  Search for the first input file that isn't a
13553             # special required property (that is, find one whose first_release
13554             # is non-0), and isn't extracted.  Also, the Age property file is
13555             # processed before the extracted ones, just in case
13556             # $compare_versions is set.
13557             for (my $i = 0; $i < @input_file_objects; $i++) {
13558                 if ($input_file_objects[$i]->first_released ne v0
13559                     && $input_file_objects[$i]->file ne 'DAge.txt'
13560                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/)
13561                 {
13562                     splice @input_file_objects, $i, 0, Input_file->new($file);
13563                     last;
13564                 }
13565             }
13566
13567         }
13568     }
13569     if (@unknown_input_files) {
13570         print STDERR simple_fold(join_line(<<END
13571
13572 The following files are unknown as to how to handle.  Assuming they are
13573 typical property files.  You'll know by later error messages if it worked or
13574 not:
13575 END
13576         ) . join(", ", @unknown_input_files) . "\n\n");
13577     }
13578 } # End of looking through directory structure for more .txt files.
13579
13580 # Create the list of input files from the objects we have defined, plus
13581 # version
13582 my @input_files = 'version';
13583 foreach my $object (@input_file_objects) {
13584     my $file = $object->file;
13585     next if ! defined $file;    # Not all objects have files
13586     next if $object->optional && ! -e $file;
13587     push @input_files,  $file;
13588 }
13589
13590 if ( $verbosity >= $VERBOSE ) {
13591     print "Expecting ".scalar( @input_files )." input files. ",
13592          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
13593 }
13594
13595 # We set $youngest to be the most recently changed input file, including this
13596 # program itself (done much earlier in this file)
13597 foreach my $in (@input_files) {
13598     my $age = -M $in;
13599     next unless defined $age;        # Keep going even if missing a file
13600     $youngest = $age if $age < $youngest;
13601
13602     # See that the input files have distinct names, to warn someone if they
13603     # are adding a new one
13604     if ($make_list) {
13605         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
13606         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
13607         my @directories = File::Spec->splitdir($directories);
13608         my $base = $file =~ s/\.txt$//;
13609         construct_filename($file, 'mutable', \@directories);
13610     }
13611 }
13612
13613 my $ok = ! $write_unchanged_files
13614         && scalar @mktables_list_output_files;        # If none known, rebuild
13615
13616 # Now we check to see if any output files are older than youngest, if
13617 # they are, we need to continue on, otherwise we can presumably bail.
13618 if ($ok) {
13619     foreach my $out (@mktables_list_output_files) {
13620         if ( ! file_exists($out)) {
13621             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
13622             $ok = 0;
13623             last;
13624          }
13625         #local $to_trace = 1 if main::DEBUG;
13626         trace $youngest, -M $out if main::DEBUG && $to_trace;
13627         if ( -M $out > $youngest ) {
13628             #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
13629             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
13630             $ok = 0;
13631             last;
13632         }
13633     }
13634 }
13635 if ($ok) {
13636     print "Files seem to be ok, not bothering to rebuild.\n";
13637     exit(0);
13638 }
13639 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
13640
13641 # Ready to do the major processing.  First create the perl pseudo-property.
13642 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
13643
13644 # Process each input file
13645 foreach my $file (@input_file_objects) {
13646     $file->run;
13647 }
13648
13649 # Finish the table generation.
13650
13651 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
13652 finish_Unicode();
13653
13654 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
13655 compile_perl();
13656
13657 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
13658 add_perl_synonyms();
13659
13660 print "Writing tables\n" if $verbosity >= $PROGRESS;
13661 write_all_tables();
13662
13663 # Write mktables.lst
13664 if ( $file_list and $make_list ) {
13665
13666     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
13667     foreach my $file (@input_files, @files_actually_output) {
13668         my (undef, $directories, $file) = File::Spec->splitpath($file);
13669         my @directories = File::Spec->splitdir($directories);
13670         $file = join '/', @directories, $file;
13671     }
13672
13673     my $ofh;
13674     if (! open $ofh,">",$file_list) {
13675         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
13676         return
13677     }
13678     else {
13679         print $ofh <<"END";
13680 #
13681 # $file_list -- File list for $0.
13682 #
13683 #   Autogenerated on @{[scalar localtime]}
13684 #
13685 # - First section is input files
13686 #   ($0 itself is not listed but is automatically considered an input)
13687 # - Section seperator is /^=+\$/
13688 # - Second section is a list of output files.
13689 # - Lines matching /^\\s*#/ are treated as comments
13690 #   which along with blank lines are ignored.
13691 #
13692
13693 # Input files:
13694
13695 END
13696         print $ofh "$_\n" for sort(@input_files);
13697         print $ofh "\n=================================\n# Output files:\n\n";
13698         print $ofh "$_\n" for sort @files_actually_output;
13699         print $ofh "\n# ",scalar(@input_files)," input files\n",
13700                 "# ",scalar(@files_actually_output)+1," output files\n\n",
13701                 "# End list\n";
13702         close $ofh
13703             or Carp::my_carp("Failed to close $ofh: $!");
13704
13705         print "Filelist has ",scalar(@input_files)," input files and ",
13706             scalar(@files_actually_output)+1," output files\n"
13707             if $verbosity >= $VERBOSE;
13708     }
13709 }
13710
13711 # Output these warnings unless -q explicitly specified.
13712 if ($verbosity >= $NORMAL_VERBOSITY) {
13713     if (@unhandled_properties) {
13714         print "\nProperties and tables that unexpectedly have no code points\n";
13715         foreach my $property (sort @unhandled_properties) {
13716             print $property, "\n";
13717         }
13718     }
13719
13720     if (%potential_files) {
13721         print "\nInput files that are not considered:\n";
13722         foreach my $file (sort keys %potential_files) {
13723             print File::Spec->abs2rel($file), "\n";
13724         }
13725     }
13726     print "\nAll done\n" if $verbosity >= $VERBOSE;
13727 }
13728 exit(0);
13729
13730 # TRAILING CODE IS USED BY make_property_test_script()
13731 __DATA__
13732
13733 use strict;
13734 use warnings;
13735
13736 # Test the \p{} regular expression constructs.  This file is constructed by
13737 # mktables from the tables it generates, so if mktables is buggy, this won't
13738 # necessarily catch those bugs.  Tests are generated for all feasible
13739 # properties; a few aren't currently feasible; see is_code_point_usable()
13740 # in mktables for details.
13741
13742 # Standard test packages are not used because this manipulates SIG_WARN.  It
13743 # exits 0 if every non-skipped test succeeded; -1 if any failed.
13744
13745 my $Tests = 0;
13746 my $Fails = 0;
13747 my $Skips = 0;
13748
13749 my $non_ASCII = (ord('A') == 65);
13750
13751 # The first 127 ASCII characters in ordinal order, with the ones that don't
13752 # have Perl names (as of 5.8) replaced by dots.  The 127th is used as the
13753 # string delimiter
13754 my $ascii_to_ebcdic = "\0......\a\b\t\n.\f\r.................. !\"#\$\%&'()*+,-./0123456789:;<=>?\@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
13755 #for my $i (0..126) {
13756 #    print $i, ": ", substr($ascii_to_ebcdic, $i, 1), "\n";
13757 #}
13758
13759 sub Expect($$$$) {
13760     my $expected = shift;
13761     my $ord = shift;
13762     my $regex  = shift;
13763     my $warning_type = shift;   # Type of warning message, like 'deprecated'
13764                                 # or empty if none
13765     my $line   = (caller)[2];
13766
13767     # Convert the code point to hex form
13768     my $string = sprintf "\"\\x{%04X}\"", $ord;
13769
13770     # Convert the non-ASCII code points expressible as characters in Perl 5.8
13771     # to their ASCII equivalents, and skip the others.
13772     if ($non_ASCII && $ord < 255) {
13773
13774         # Dots are used as place holders in the conversion string for the
13775         # non-convertible ones, so check for it first.
13776         if ($ord == 0x2E) {
13777             $ord = ord('.');
13778         }
13779         elsif ($ord < 0x7F
13780                   # Any dots returned are non-convertible.
13781                  && ((my $char = substr($ascii_to_ebcdic, $ord, 1)) ne '.'))
13782         {
13783             #print STDERR "$ord, $char, \n";
13784             $ord = ord($char);
13785         }
13786         else {
13787             $Tests++;
13788             $Skips++;
13789             print "ok $Tests - $string =~ $regex # Skipped: non-ASCII\n";
13790             return;
13791         }
13792     }
13793
13794     # The first time through, use all warnings.
13795     my @tests = "";
13796
13797     # If the input should generate a warning, add another time through with
13798     # them turned off
13799     push @tests, "no warnings '$warning_type';" if $warning_type;
13800
13801     foreach my $no_warnings (@tests) {
13802
13803         # Store any warning messages instead of outputting them
13804         local $SIG{__WARN__} = $SIG{__WARN__};
13805         my $warning_message;
13806         $SIG{__WARN__} = sub { $warning_message = $_[0] };
13807
13808         $Tests++;
13809
13810         # A string eval is needed because of the 'no warnings'.
13811         # Assumes no parens in the regular expression
13812         my $result = eval "$no_warnings
13813                             my \$RegObj = qr($regex);
13814                             $string =~ \$RegObj ? 1 : 0";
13815         if (not defined $result) {
13816             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
13817             $Fails++;
13818         }
13819         elsif ($result ^ $expected) {
13820             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
13821             $Fails++;
13822         }
13823         elsif ($warning_message) {
13824             if (! $warning_type || ($warning_type && $no_warnings)) {
13825                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
13826                 $Fails++;
13827             }
13828             else {
13829                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
13830             }
13831         }
13832         elsif ($warning_type && ! $no_warnings) {
13833             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
13834             $Fails++;
13835         }
13836         else {
13837             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
13838         }
13839     }
13840     return;
13841 }
13842
13843 sub Error($) {
13844     my $regex  = shift;
13845     $Tests++;
13846     if (eval { 'x' =~ qr/$regex/; 1 }) {
13847         $Fails++;
13848         my $line = (caller)[2];
13849         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
13850     }
13851     else {
13852         my $line = (caller)[2];
13853         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
13854     }
13855     return;
13856 }
13857
13858 sub Finished() {
13859     print "1..$Tests\n";
13860     exit($Fails ? -1 : 0);
13861 }
13862
13863 Error('\p{Script=InGreek}');    # Bug #69018