X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2Funicore%2Fmktables;h=c61a3f4709e43245b77b55cab99b20d42391ada5;hb=6182169b72782336c6202161aa4cde16ac88296e;hp=b268623cd7af211aeb6fddd7156d76527e684428;hpb=997e7b23827e884e717eba50697f2e5714034828;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/unicore/mktables b/lib/unicore/mktables index b268623..c61a3f4 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -4,7 +4,10 @@ # Any files created or read by this program should be listed in 'mktables.lst' # Use -makelist to regenerate it. -require 5.008; # Needs pack "U". Probably safest to run on 5.8.x +# Needs 'no overloading' to run faster on miniperl. Code commented out at the +# subroutine objaddr can be used instead to work as far back (untested) as +# 5.8: needs pack "U". +require 5.010_001; use strict; use warnings; use Carp; @@ -295,6 +298,11 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # string, but it is a contributory property, and therefore not output by # default. # +# DEBUGGING +# +# XXX Add more stuff here. use perl instead of miniperl to find problems with +# Scalar::Util + # FUTURE ISSUES # # The program would break if Unicode were to change its names so that @@ -453,17 +461,8 @@ our $to_trace = 0; } } - if ($print_caller) { - if (defined $line_number) { - print STDERR sprintf "%4d: ", $line_number; - } - else { - print STDERR " "; - } - $caller_name .= ": "; - print STDERR $caller_name; - } - + print STDERR sprintf "%4d: ", $line_number if defined $line_number; + print STDERR "$caller_name: " if $print_caller; print STDERR $output, "\n"; return; } @@ -549,10 +548,9 @@ while (@ARGV) { elsif ($arg eq '-P' && defined ($pod_directory = shift)) { -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; } - elsif ($arg eq '-maketest' - || ($arg eq '-T' && defined ($t_path = shift))) { + elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) + { $make_test_script = 1; - $t_path = 'TestProp.pl' unless defined $t_path; } elsif ($arg eq '-makelist') { $make_list = 1; @@ -588,11 +586,12 @@ usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] except those specified by the -P and -T options will be done with respect to this directory. -P dir : Output $pod_file file to directory 'dir'. - -T path : Create a .t test file as 'path' + -T path : Create a test script as 'path'; overrides -maketest -L filelist : Use alternate 'filelist' instead of standard one -globlist : Take as input all non-Test *.txt files in current and sub directories - -maketest : Make test script + -maketest : Make test script 'TestProp.pl' in current (or -C directory), + overrides -T -makelist : Rewrite the file list $file_list based on current setup -check A B : Executes $0 only if A and B are the same END @@ -605,25 +604,17 @@ my $youngest = -M $0; # Do this before the chdir! # Change directories now, because need to read 'version' early. if ($use_directory) { - if ($pod_directory - && ! File::Spec->file_name_is_absolute($pod_directory)) - { + if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { $pod_directory = File::Spec->rel2abs($pod_directory); } - if ($t_path - && ! File::Spec->file_name_is_absolute($t_path)) - { + if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { $t_path = File::Spec->rel2abs($t_path); } chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; - if ($pod_directory - && File::Spec->file_name_is_absolute($pod_directory)) - { + if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { $pod_directory = File::Spec->abs2rel($pod_directory); } - if ($t_path - && File::Spec->file_name_is_absolute($t_path)) - { + if ($t_path && File::Spec->file_name_is_absolute($t_path)) { $t_path = File::Spec->abs2rel($t_path); } } @@ -688,7 +679,7 @@ if ($v_version gt v3.2.0) { # unless explicitly added. if ($v_version ge v5.2.0) { my $unihan = 'Unihan; remove from list if using Unihan'; - foreach my $table qw ( + foreach my $table qw ( kAccountingNumeric kOtherNumeric kPrimaryNumeric @@ -915,8 +906,8 @@ my %ignored_files = ( my $HEADER=<<"EOF"; # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! -# This file is machine-generated by $0 from the Unicode database, -# Version $string_version. Any changes made here will be lost! +# This file is machine-generated by $0 from the Unicode +# database, Version $string_version. Any changes made here will be lost! EOF my $INTERNAL_ONLY=<<"EOF"; @@ -932,7 +923,7 @@ my $DEVELOPMENT_ONLY=<<"EOF"; # This file contains information artificially constrained to code points # present in Unicode release $string_compare_versions. # IT CANNOT BE RELIED ON. It is for use during development only and should -# not be used for production. +# not be used for production. EOF @@ -980,7 +971,7 @@ my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. my $NULL = 4; # The map is to the null string; utf8.c can't # handle these, nor is there an accepted syntax # for them in \p{} constructs -my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; meanst that ranges that would +my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would # otherwise be $MULTI_CP type are instead type 0 # process_generic_property_file() can accept certain overrides in its input. @@ -1007,6 +998,7 @@ my $MULTIPLE = 4; # Don't replace, but add a duplicate record if my $NORMAL = ""; my $SUPPRESSED = 'z'; # The character should never actually be seen, since # it is suppressed +my $PLACEHOLDER = 'P'; # Implies no pod entry generated my $DEPRECATED = 'D'; my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; @@ -1028,7 +1020,7 @@ my %status_past_participles = ( $SUPPRESSED => 'should never be generated', $STABILIZED => 'stabilized', $OBSOLETE => 'obsolete', - $DEPRECATED => 'deprecated' + $DEPRECATED => 'deprecated', ); # The format of the values of the map tables: @@ -1083,9 +1075,10 @@ my %Jamo_L; # Leading consonants my %Jamo_V; # Vowels my %Jamo_T; # Trailing consonants +my @backslash_X_tests; # List of tests read in for testing \X my @unhandled_properties; # Will contain a list of properties found in # the input that we didn't process. -my @match_properties; # properties that have match tables, to be +my @match_properties; # Properties that have match tables, to be # listed in the pod my @map_properties; # Properties that get map files written my @named_sequences; # NamedSequences.txt contents. @@ -1126,34 +1119,47 @@ sub file_exists ($) { # platform independent '-e'. This program internally return -e internal_file_to_platform($file); } -# This 'require' doesn't necessarily work in miniperl, and even if it does, -# the native perl version of it (which is what would operate under miniperl) -# is extremely slow, as it does a string eval every call. -my $has_fast_scalar_util = $ !~ /miniperl/ - && defined eval "require Scalar::Util"; - sub objaddr($) { - # Returns the address of the blessed input object. Uses the XS version if - # available. It doesn't check for blessedness because that would do a - # string eval every call, and the program is structured so that this is - # never called for a non-blessed object. - - return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util; - - # Check at least that is a ref. - my $pkg = ref($_[0]) or return undef; + # Returns the address of the blessed input object. + # It doesn't check for blessedness because that would do a string eval + # every call, and the program is structured so that this is never called + # for a non-blessed object. - # Change to a fake package to defeat any overloaded stringify - bless $_[0], 'main::Fake'; + no overloading; # If overloaded, numifying below won't work. # Numifying a ref gives its address. - my $addr = 0 + $_[0]; - - # Return to original class - bless $_[0], $pkg; - return $addr; + return 0 + $_[0]; } +# Commented code below should work on Perl 5.8. +## This 'require' doesn't necessarily work in miniperl, and even if it does, +## the native perl version of it (which is what would operate under miniperl) +## is extremely slow, as it does a string eval every call. +#my $has_fast_scalar_util = $ !~ /miniperl/ +# && defined eval "require Scalar::Util"; +# +#sub objaddr($) { +# # Returns the address of the blessed input object. Uses the XS version if +# # available. It doesn't check for blessedness because that would do a +# # string eval every call, and the program is structured so that this is +# # never called for a non-blessed object. +# +# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util; +# +# # Check at least that is a ref. +# my $pkg = ref($_[0]) or return undef; +# +# # Change to a fake package to defeat any overloaded stringify +# bless $_[0], 'main::Fake'; +# +# # Numifying a ref gives its address. +# my $addr = 0 + $_[0]; +# +# # Return to original class +# bless $_[0], $pkg; +# return $addr; +#} + sub max ($$) { my $a = shift; my $b = shift; @@ -1193,6 +1199,11 @@ package Carp; our $Verbose = 1 if main::DEBUG; # Useful info when debugging +# This is a work-around suggested by Nicholas Clark to fix a problem with Carp +# and overload trying to load Scalar:Util under miniperl. See +# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html +undef $overload::VERSION; + sub my_carp { my $message = shift || ""; my $nofold = shift || 0; @@ -1465,9 +1476,8 @@ package main; no strict "refs"; *$subname = sub { use strict "refs"; - my $self = shift; - Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr = main::objaddr $self; + Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; + my $addr = main::objaddr $_[0]; if (ref $field->{$addr} ne 'ARRAY') { my $type = ref $field->{$addr}; $type = 'scalar' unless $type; @@ -1488,9 +1498,8 @@ package main; no strict "refs"; *$subname = sub { use strict "refs"; - my $self = shift; - Carp::carp_extra_args(\@_) if main::DEBUG && @_; - return $field->{main::objaddr $self}; + Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; + return $field->{main::objaddr $_[0]}; } } } @@ -1499,11 +1508,12 @@ package main; no strict "refs"; *$subname = sub { use strict "refs"; - return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; - my $self = shift; - my $value = shift; - Carp::carp_extra_args(\@_) if main::DEBUG && @_; - $field->{main::objaddr $self} = $value; + if (main::DEBUG) { + return Carp::carp_too_few_args(\@_, 2) if @_ < 2; + Carp::carp_extra_args(\@_) if @_ > 2; + } + # $self is $_[0]; $value is $_[1] + $field->{main::objaddr $_[0]} = $_[1]; return; } } @@ -1563,7 +1573,6 @@ package Input_file; sub trace { return main::trace(@_); } - { # Closure # Keep track of fields that are to be put into the constructor. my %constructor_fields; @@ -1597,6 +1606,15 @@ sub trace { return main::trace(@_); } # processed when you set the $debug_skip global. main::set_access('non_skip', \%non_skip, 'c'); + my %skip; + # This is used to skip processing of this input file semi-permanently. + # It is used for files that we aren't planning to process anytime soon, + # but want to allow to be in the directory and not raise a message that we + # are not handling. Mostly for test files. This is in contrast to the + # non_skip element, which is supposed to be used very temporarily for + # debugging. Sets 'optional' to 1 + main::set_access('skip', \%skip, 'c'); + my %each_line_handler; # list of subroutines to look at and filter each non-comment line in the # file. defaults to none. The subroutines are called in order, each is @@ -1660,6 +1678,7 @@ sub trace { return main::trace(@_); } # Set defaults $handler{$addr} = \&main::process_generic_property_file; $non_skip{$addr} = 0; + $skip{$addr} = 0; $has_missings_defaults{$addr} = $NO_DEFAULTS; $handle{$addr} = undef; $added_lines{$addr} = [ ]; @@ -1668,6 +1687,7 @@ sub trace { return main::trace(@_); } $missings{$addr} = [ ]; # Two positional parameters. + return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; $file{$addr} = main::internal_file_to_platform(shift); $first_released{$addr} = shift; @@ -1715,6 +1735,8 @@ sub trace { return main::trace(@_); } print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n"; } + $optional{$addr} = 1 if $skip{$addr}; + return $self; } @@ -1763,7 +1785,7 @@ sub trace { return main::trace(@_); } } # File could be optional - if ($optional{$addr}){ + if ($optional{$addr}) { return unless -e $file; my $result = eval $optional{$addr}; if (! defined $result) { @@ -1796,11 +1818,12 @@ sub trace { return main::trace(@_); } } else { - # Here, the file exists + # Here, the file exists. Some platforms may change the case of + # its name if ($seen_non_extracted_non_age) { - if ($file =~ /$EXTRACTED/) { + if ($file =~ /$EXTRACTED/i) { Carp::my_carp_bug(join_lines(<rel2abs($file)} + my $fkey = File::Spec->rel2abs($file); + my $expecting = delete $potential_files{$fkey}; + $expecting = delete $potential_files{lc($fkey)} unless defined $expecting; + Carp::my_carp("Was not expecting '$file'.") if + ! $expecting && ! defined $handle{$addr}; + # Having deleted from expected files, we can quit if not to do + # anything. Don't print progress unless really want verbosity + if ($skip{$addr}) { + print "Skipping $file.\n" if $verbosity >= $VERBOSE; + return; + } + # Open the file, converting the slashes used in this program # into the proper form for the OS my $file_handle; @@ -3819,7 +3852,7 @@ sub trace { return main::trace(@_); } return $self->_add_delete('+', $start, $end, ""); } - my $non_ASCII = (ord('A') == 65); # Assumes test on same platform + my $non_ASCII = (ord('A') != 65); # Assumes test on same platform sub is_code_point_usable { # This used only for making the test script. See if the input @@ -3835,14 +3868,12 @@ sub trace { return main::trace(@_); } # For non-ASCII, we shun the characters that don't have Perl encoding- # independent symbols for them. 'A' is such a symbol, so is "\n". - # Note, this program hopefully will work on 5.8 Perls, and \v is not - # such a symbol in them. return $try_hard if $non_ASCII && $code <= 0xFF && ($code >= 0x7F || ($code >= 0x0E && $code <= 0x1F) || ($code >= 0x01 && $code <= 0x06) - || $code == 0x0B); # \v introduced after 5.8 + || $code == 0x0B); # shun null. I'm (khw) not sure why this was done, but NULL would be # the character very frequently used. @@ -3976,6 +4007,10 @@ sub trace { return main::trace(@_); } main::set_access('nominal_short_name_length', \%nominal_short_name_length); + my %complete_name; + # The complete name, including property. + main::set_access('complete_name', \%complete_name, 'r'); + my %property; # Parent property this table is attached to. main::set_access('property', \%property, 'r'); @@ -4057,8 +4092,9 @@ sub trace { return main::trace(@_); } $name{$addr} = delete $args{'Name'}; $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'}; $full_name{$addr} = delete $args{'Full_Name'}; + my $complete_name = $complete_name{$addr} + = delete $args{'Complete_Name'}; $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0; - $perl_extension{$addr} = delete $args{'Perl_Extension'} || 0; $property{$addr} = delete $args{'_Property'}; $range_list{$addr} = delete $args{'_Range_List'}; $status{$addr} = delete $args{'Status'} || $NORMAL; @@ -4070,6 +4106,7 @@ sub trace { return main::trace(@_); } my $loose_match = delete $args{'Fuzzy'}; my $note = delete $args{'Note'}; my $make_pod_entry = delete $args{'Pod_Entry'}; + my $perl_extension = delete $args{'Perl_Extension'}; # Shouldn't have any left over Carp::carp_extra_args(\%args) if main::DEBUG && %args; @@ -4088,12 +4125,20 @@ sub trace { return main::trace(@_); } push @{$description{$addr}}, $description if $description; push @{$note{$addr}}, $note if $note; - # If hasn't set its status already, see if it is on one of the lists - # of properties or tables that have particular statuses; if not, is - # normal. The lists are prioritized so the most serious ones are - # checked first - my $complete_name = $self->complete_name; - if (! $status{$addr}) { + if ($status{$addr} eq $PLACEHOLDER) { + + # A placeholder table doesn't get documented, is a perl extension, + # and quite likely will be empty + $make_pod_entry = 0 if ! defined $make_pod_entry; + $perl_extension = 1 if ! defined $perl_extension; + push @tables_that_may_be_empty, $complete_name{$addr}; + } + elsif (! $status{$addr}) { + + # If hasn't set its status already, see if it is on one of the + # lists of properties or tables that have particular statuses; if + # not, is normal. The lists are prioritized so the most serious + # ones are checked first if (exists $why_suppressed{$complete_name}) { $status{$addr} = $SUPPRESSED; } @@ -4129,6 +4174,8 @@ sub trace { return main::trace(@_); } } } + $perl_extension{$addr} = $perl_extension || 0; + # By convention what typically gets printed only or first is what's # first in the list, so put the full name there for good output # clarity. Other routines rely on the full name being first on the @@ -4157,15 +4204,12 @@ sub trace { return main::trace(@_); } # class for my $sub qw( append_to_body - complete_name pre_body ) # append_to_body and pre_body are called in the write() method # to add stuff after the main body of the table, but before # its close; and to prepend stuff before the beginning of the # table. - # complete_name returns the complete name of the property and - # table, like Script=Latin { no strict "refs"; *$sub = sub { @@ -4745,12 +4789,15 @@ sub trace { return main::trace(@_); } my $default_map = delete $args{'Default_Map'}; my $format = delete $args{'Format'}; my $property = delete $args{'_Property'}; + my $full_name = delete $args{'Full_Name'}; # Rest of parameters passed on my $range_list = Range_Map->new(Owner => $property); my $self = $class->SUPER::new( Name => $name, + Complete_Name => $full_name, + Full_Name => $full_name, _Property => $property, _Range_List => $range_list, %args); @@ -4780,13 +4827,6 @@ sub trace { return main::trace(@_); } return "Map table for Property '$name'"; } - sub complete_name { - # The complete name for a map table is just its full name, as that - # completely identifies the property it represents - - return shift->full_name; - } - sub add_alias { # Add a synonym for this table (which means the property itself) my $self = shift; @@ -5009,8 +5049,6 @@ sub trace { return main::trace(@_); } # But for $STRING properties, must calculate now. Subtract the # count from each range that maps to the default. foreach my $range ($self->_range_list->ranges) { - local $to_trace = 1 if main::DEBUG; - trace $self, $range; if ($range->value eq $default_map) { $count -= $range->end +1 - $range->start; } @@ -5717,6 +5755,10 @@ sub trace { return main::trace(@_); } # The property for which this table is a listing of property values. my $property = delete $args{'_Property'}; + my $name = delete $args{'Name'}; + my $full_name = delete $args{'Full_Name'}; + $full_name = $name if ! defined $full_name; + # Optional my $initialize = delete $args{'Initialize'}; my $matches_all = delete $args{'Matches_All'} || 0; @@ -5725,7 +5767,22 @@ sub trace { return main::trace(@_); } my $range_list = Range_List->new(Initialize => $initialize, Owner => $property); + my $complete = $full_name; + $complete = '""' if $complete eq ""; # A null name shouldn't happen, + # but this helps debug if it + # does + # The complete name for a match table includes it's property in a + # compound form 'property=table', except if the property is the + # pseudo-property, perl, in which case it is just the single form, + # 'table' (If you change the '=' must also change the ':' in lots of + # places in this program that assume an equal sign) + $complete = $property->full_name . "=$complete" if $property != $perl; + + my $self = $class->SUPER::new(%args, + Name => $name, + Complete_Name => $complete, + Full_Name => $full_name, _Property => $property, _Range_List => $range_list, ); @@ -5805,7 +5862,7 @@ sub trace { return main::trace(@_); } sub _operator_stringify { my $self = shift; - my $name= $self->complete_name; + my $name = $self->complete_name; return "Table '$name'"; } @@ -6014,26 +6071,6 @@ sub trace { return main::trace(@_); } return $self->_range_list->add_range(@_); } - sub complete_name { - # The complete name for a match table includes it's property in a - # compound form 'property=table', except if the property is the - # pseudo-property, perl, in which case it is just the single form, - # 'table' - - my $self = shift; - Carp::carp_extra_args(\@_) if main::DEBUG && @_; - - my $name = $self->full_name; - my $property = $self->property; - $name = '""' if $name eq ""; # A null name shouldn't happen, but this - # helps debug if it does - return $name if $property == $perl; - - # (If change the '=' must also change the ':' in set_final_comment(), - # and the references to colon in its text) - return $property->full_name . '=' . $name; - } - sub pre_body { # Does nothing for match tables. return } @@ -6198,7 +6235,17 @@ END my $flag = $property->status || $table->status || $table_alias_object->status; - $flags{$flag} = $status_past_participles{$flag} if $flag; + if ($flag) { + if ($flag ne $PLACEHOLDER) { + $flags{$flag} = $status_past_participles{$flag}; + } else { + $flags{$flag} = <note; push @conflicting, $table->conflicting; + # And this for output after all the tables. + push @global_comments, $table->comment; + # Compute an alternate compound name using the final property # synonym and the first table synonym with a colon instead of # the equal sign used elsewhere. @@ -6300,8 +6350,10 @@ END if (%flags) { foreach my $flag (sort keys %flags) { $comment .= <table($default_map); + $default_table + = $property_object->table($default_map); if (! defined $default_table || $default_table != $missings) { @@ -8788,7 +8842,7 @@ END } } -# Unused until revise charnames; +# XXX Unused until revise charnames; #sub check_and_handle_compound_name { # This looks at Name properties for parenthesized components and splits # them off. Thus it finds FF as an equivalent to Form Feed. @@ -8916,7 +8970,7 @@ END Carp::carp_extra_args(\@_) if main::DEBUG && @_; my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', - Directory => '.', + Directory => File::Spec->curdir(), File => 'Decomposition', Format => $STRING_FORMAT, Internal_Only_Warning => 1, @@ -9497,6 +9551,18 @@ END } } # End closure for UnicodeData +sub process_GCB_test { + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + while ($file->next_line) { + push @backslash_X_tests, $_; + } + + return; +} + sub process_NamedSequences { # NamedSequences.txt entries are just added to an array. Because these # don't look like the other tables, they have their own handler. @@ -9683,8 +9749,7 @@ sub filter_special_casing_line { sub filter_old_style_case_folding { # This transforms $_ containing the case folding style of 3.0.1, to 3.1 - # and later style, then calls the handler for the later style. Different - # letters were used. + # and later style. Different letters were used in the earlier. my $file = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -9862,13 +9927,6 @@ sub filter_jamo_line { return; } -sub filter_numeric_value_comment_missing_line { - # Filters out the extra column in DNumValues.txt of this line - - s/^($missing_defaults_prefix)\s*;\s*/$1/; - return; -} - sub register_fraction($) { # This registers the input rational number so that it can be passed on to # utf8_heavy.pl, both in rational and floating forms. @@ -10796,19 +10854,79 @@ sub compile_perl() { $CanonDCIJ = $CanonDCIJ & $Assigned; } - # This is used in Unicode's definition of \X + # These are used in Unicode's definition of \X + my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1); + my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1); + my $gcb = property_ref('Grapheme_Cluster_Break'); - if (defined $gcb) { - my $extend = $perl->add_match_table('_GCB_Extend', - Initialize => $gcb->table('Extend')); - $extend += $gcb->table('SpacingMark') - if defined $gcb->table('SpacingMark'); + + # The 'extended' grapheme cluster came in 5.1. The non-extended + # definition differs too much from the traditional Perl one to use. + if (defined $gcb && defined $gcb->table('SpacingMark')) { + + # Note that assumes HST is defined; it came in an earlier release than + # GCB. In the line below, two negatives means: yes hangul + $begin += ~ property_ref('Hangul_Syllable_Type') + ->table('Not_Applicable') + + ~ ($gcb->table('Control') + + $gcb->table('CR') + + $gcb->table('LF')); + $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control'); + + $extend += $gcb->table('Extend') + $gcb->table('SpacingMark'); + $extend->add_comment('For use in \X; matches: Extend | SpacingMark'); } else { # Old definition, used on early releases. - $perl->add_match_table('_X_Extend', Initialize => $gc->table('Mark') - + 0x200C # ZWNJ - + 0x200D # ZWJ - ); + $extend += $gc->table('Mark') + + 0x200C # ZWNJ + + 0x200D; # ZWJ + $begin += ~ $extend; + + # Here we may have a release that has the regular grapheme cluster + # defined, or a release that doesn't have anything defined. + # We set things up so the Perl core degrades gracefully, possibly with + # placeholders that match nothing. + + if (! defined $gcb) { + $gcb = Property->new('GCB', Status => $PLACEHOLDER); + } + my $hst = property_ref('HST'); + if (!defined $hst) { + $hst = Property->new('HST', Status => $PLACEHOLDER); + $hst->add_match_table('Not_Applicable', + Initialize => $Any, + Matches_All => 1); + } + + # On some releases, here we may not have the needed tables for the + # perl core, in some releases we may. + foreach my $name (qw{ L LV LVT T V prepend }) { + my $table = $gcb->table($name); + if (! defined $table) { + $table = $gcb->add_match_table($name); + push @tables_that_may_be_empty, $table->complete_name; + } + + # The HST property predates the GCB one, and has identical tables + # for some of them, so use it if we can. + if ($table->is_empty + && defined $hst + && defined $hst->table($name)) + { + $table += $hst->table($name); + } + } + } + + # More GCB. If we found some hangul syllables, populate a combined + # table. + my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V'); + my $LV = $gcb->table('LV'); + if ($LV->is_empty) { + push @tables_that_may_be_empty, $lv_lvt_v->complete_name; + } else { + $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V'); + $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V'); } # Create a new property specially located that is a combination of the @@ -10819,7 +10937,7 @@ sub compile_perl() { my $perl_charname = Property->new('Perl_Charnames', Core_Access => '\N{...} and charnames.pm', Default_Map => "", - Directory => '.', + Directory => File::Spec->curdir(), File => 'Name', Internal_Only_Warning => 1, Perl_Extension => 1, @@ -10906,7 +11024,7 @@ END Type => $ENUM, Initialize => $ccc, File => 'CombiningClass', - Directory => '.', + Directory => File::Spec->curdir(), ); $perl_ccc->set_to_output_map(1); $perl_ccc->add_comment(join_lines(<make_pod_entry; + # Now for the pod entry for this alias. Skip if not + # outputting a pod; skip the first one, which is the + # full name so won't have an entry like: '\p{full: *} + # \p{full: *}', and skip if don't want an entry for + # this one. + next if $i == 0 + || ! defined $pod_directory + || ! $alias->make_pod_entry; push @match_properties, format_pod_line($indent_info_column, @@ -13038,6 +13172,8 @@ sub make_property_test_script() { # tests are added, it will perturb all later ones in the .t file srand 0; + $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name + force_unlink ($t_path); push @files_actually_output, $t_path; my $OUT; @@ -13212,6 +13348,11 @@ sub make_property_test_script() { } } } + + foreach my $test (@backslash_X_tests) { + print $OUT "Test_X('$test');\n"; + } + print $OUT "Finished();\n"; close $OUT; return; @@ -13238,7 +13379,7 @@ my @input_file_objects = ( Handler => \&process_PropertyAliases, ), Input_file->new(undef, v0, # No file associated with this - Progress_Message => 'Finishing Property Setup', + Progress_Message => 'Finishing property setup', Handler => \&finish_property_setup, ), Input_file->new('PropValueAliases.txt', v0, @@ -13361,6 +13502,9 @@ my @input_file_objects = ( Input_file->new('BidiMirroring.txt', v3.0.1, Property => 'Bidi_Mirroring_Glyph', ), + Input_file->new("NormalizationTest.txt", v3.0.1, + Skip => 1, + ), Input_file->new('CaseFolding.txt', v3.0.1, Pre_Handler => \&setup_case_folding, Each_Line_Handler => @@ -13398,6 +13542,18 @@ my @input_file_objects = ( Property => 'Grapheme_Cluster_Break', Has_Missings_Defaults => $NOT_IGNORED, ), + Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, + Handler => \&process_GCB_test, + ), + Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0, + Skip => 1, + ), + Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, + Skip => 1, + ), + Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, + Skip => 1, + ), Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, Property => 'Sentence_Break', Has_Missings_Defaults => $NOT_IGNORED, @@ -13408,6 +13564,9 @@ my @input_file_objects = ( Input_file->new('NameAliases.txt', v5.0.0, Property => 'Name_Alias', ), + Input_file->new("BidiTest.txt", v5.2.0, + Skip => 1, + ), Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, Optional => 1, Each_Line_Handler => \&filter_unihan_line, @@ -13455,18 +13614,16 @@ END # Put into %potential_files a list of all the files in the directory structure # that could be inputs to this program, excluding those that we should ignore. -# Also don't consider test files. Use absolute file names because it makes it -# easier across machine types. +# Use absolute file names because it makes it easier across machine types. my @ignored_files_full_names = map { File::Spec->rel2abs( internal_file_to_platform($_)) } keys %ignored_files; File::Find::find({ wanted=>sub { - return unless /\.txt$/i; - return if /Test\.txt$/i; - my $full = File::Spec->rel2abs($_); + return unless /\.txt$/i; # Some platforms change the name's case + my $full = lc(File::Spec->rel2abs($_)); $potential_files{$full} = 1 - if ! grep { $full eq $_ } @ignored_files_full_names; + if ! grep { $full eq lc($_) } @ignored_files_full_names; return; } }, File::Spec->curdir()); @@ -13479,8 +13636,8 @@ if ($write_unchanged_files) { else { print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; my $file_handle; - if (! open $file_handle,"<",$file_list) { - Carp::my_carp("Failed to open '$file_list', turning on -globlist option instead: $!"); + if (! open $file_handle, "<", $file_list) { + Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!"); $glob_list = 1; } else { @@ -13510,9 +13667,9 @@ else { # The paths are stored with relative names, and with '/' as the # delimiter; convert to absolute on this machine - my $full = File::Spec->rel2abs(internal_file_to_platform($input)); + my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); $potential_files{$full} = 1 - if ! grep { $full eq $_ } @ignored_files_full_names; + if ! grep { lc($full) eq lc($_) } @ignored_files_full_names; } } @@ -13524,12 +13681,16 @@ if ($glob_list) { # Here wants to process all .txt files in the directory structure. # Convert them to full path names. They are stored in the platform's # relative style - my @known_files = map { File::Spec->rel2abs($_->file) } - @input_file_objects; + my @known_files; + foreach my $object (@input_file_objects) { + my $file = $object->file; + next unless defined $file; + push @known_files, File::Spec->rel2abs($file); + } my @unknown_input_files; foreach my $file (keys %potential_files) { - next if grep { $file eq $_ } @known_files; + next if grep { lc($file) eq lc($_) } @known_files; # Here, the file is unknown to us. Get relative path name $file = File::Spec->abs2rel($file); @@ -13545,7 +13706,7 @@ if ($glob_list) { # If the file isn't extracted (meaning none of the directories is the # extracted one), just add it to the end of the list of inputs. if (! grep { $EXTRACTED_DIR eq $_ } @directories) { - push @input_file_objects, Input_file->new($file); + push @input_file_objects, Input_file->new($file, v0); } else { @@ -13557,10 +13718,11 @@ if ($glob_list) { # $compare_versions is set. for (my $i = 0; $i < @input_file_objects; $i++) { if ($input_file_objects[$i]->first_released ne v0 - && $input_file_objects[$i]->file ne 'DAge.txt' - && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/) + && lc($input_file_objects[$i]->file) ne 'dage.txt' + && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) { - splice @input_file_objects, $i, 0, Input_file->new($file); + splice @input_file_objects, $i, 0, + Input_file->new($file, v0); last; } } @@ -13568,22 +13730,16 @@ if ($glob_list) { } } if (@unknown_input_files) { - print STDERR simple_fold(join_line(<", "?", "@", + "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", + "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", + "[", "\\", "]", "^", "_", "`", + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", + "{", "|", "}", "~", + (-1) x 129 +); -# The first 127 ASCII characters in ordinal order, with the ones that don't -# have Perl names (as of 5.8) replaced by dots. The 127th is used as the -# string delimiter -my $ascii_to_ebcdic = "\0......\a\b\t\n.\f\r.................. !\"#\$\%&'()*+,-./0123456789:;<=>?\@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; -#for my $i (0..126) { -# print $i, ": ", substr($ascii_to_ebcdic, $i, 1), "\n"; -#} +sub ASCII_ord_to_native ($) { + # Converts input ordinal number to the native one, if can be done easily. + # Returns -1 otherwise. + + my $ord = shift; + + return $ord if $ord > 255 || ! $non_ASCII; + my $result = $ascii_ordered_chars[$ord]; + return $result if $result eq '-1'; + return ord($result); +} sub Expect($$$$) { my $expected = shift; @@ -13771,38 +13952,24 @@ sub Expect($$$$) { # or empty if none my $line = (caller)[2]; - # Convert the code point to hex form - my $string = sprintf "\"\\x{%04X}\"", $ord; - - # Convert the non-ASCII code points expressible as characters in Perl 5.8 - # to their ASCII equivalents, and skip the others. - if ($non_ASCII && $ord < 255) { - - # Dots are used as place holders in the conversion string for the - # non-convertible ones, so check for it first. - if ($ord == 0x2E) { - $ord = ord('.'); - } - elsif ($ord < 0x7F - # Any dots returned are non-convertible. - && ((my $char = substr($ascii_to_ebcdic, $ord, 1)) ne '.')) - { - #print STDERR "$ord, $char, \n"; - $ord = ord($char); - } - else { - $Tests++; - $Skips++; - print "ok $Tests - $string =~ $regex # Skipped: non-ASCII\n"; - return; - } + # Convert the non-ASCII code points expressible as characters to their + # ASCII equivalents, and skip the others. + $ord = ASCII_ord_to_native($ord); + if ($ord < 0) { + $Tests++; + print "ok $Tests - " + . sprintf("\"\\x{%04X}\"", $ord) + . " =~ $regex # Skipped: non-ASCII\n"; + return; } - # The first time through, use all warnings. + # Convert the code point to hex form + my $string = sprintf "\"\\x{%04X}\"", $ord; + my @tests = ""; - # If the input should generate a warning, add another time through with - # them turned off + # The first time through, use all warnings. If the input should generate + # a warning, add another time through with them turned off push @tests, "no warnings '$warning_type';" if $warning_type; foreach my $no_warnings (@tests) { @@ -13862,9 +14029,142 @@ sub Error($) { return; } +# GCBTest.txt character that separates grapheme clusters +my $breakable_utf8 = my $breakable = chr(0xF7); +utf8::upgrade($breakable_utf8); + +# GCBTest.txt character that indicates that the adjoining code points are part +# of the same grapheme cluster +my $nobreak_utf8 = my $nobreak = chr(0xD7); +utf8::upgrade($nobreak_utf8); + +sub Test_X($) { + # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt + # Each such line is a sequence of code points given by their hex numbers, + # separated by the two characters defined just before this subroutine that + # indicate that either there can or cannot be a break between the adjacent + # code points. If there isn't a break, that means the sequence forms an + # extended grapheme cluster, which means that \X should match the whole + # thing. If there is a break, \X should stop there. This is all + # converted by this routine into a match: + # $string =~ /(\X)/, + # Each \X should match the next cluster; and that is what is checked. + + my $template = shift; + + my $line = (caller)[2]; + + # The line contains characters above the ASCII range, but in Latin1. It + # may or may not be in utf8, and if it is, it may or may not know it. So, + # convert these characters to 8 bits. If knows is in utf8, simply + # downgrade. + if (utf8::is_utf8($template)) { + utf8::downgrade($template); + } else { + + # Otherwise, if it is in utf8, but doesn't know it, the next lines + # convert the two problematic characters to their 8-bit equivalents. + # If it isn't in utf8, they don't harm anything. + use bytes; + $template =~ s/$nobreak_utf8/$nobreak/g; + $template =~ s/$breakable_utf8/$breakable/g; + } + + # Get rid of the leading and trailing breakables + $template =~ s/^ \s* $breakable \s* //x; + $template =~ s/ \s* $breakable \s* $ //x; + + # And no-breaks become just a space. + $template =~ s/ \s* $nobreak \s* / /xg; + + # Split the input into segments that are breakable between them. + my @segments = split /\s*$breakable\s*/, $template; + + my $string = ""; + my $display_string = ""; + my @should_match; + my @should_display; + + # Convert the code point sequence in each segment into a Perl string of + # characters + foreach my $segment (@segments) { + my @code_points = split /\s+/, $segment; + my $this_string = ""; + my $this_display = ""; + foreach my $code_point (@code_points) { + my $ord = ASCII_ord_to_native(hex $code_point); + if ($ord < 0) { + $Tests++; + print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n"; + return; + } + $this_string .= chr $ord; + $this_display .= "\\x{$code_point}"; + } + + # The next cluster should match the string in this segment. + push @should_match, $this_string; + push @should_display, $this_display; + $string .= $this_string; + $display_string .= $this_display; + } + + # If a string can be represented in both non-ut8 and utf8, test both cases + UPGRADE: + for my $to_upgrade (0 .. 1) { + + if ($to_upgrade) { + + # If already in utf8, would just be a repeat + next UPGRADE if utf8::is_utf8($string); + + utf8::upgrade($string); + } + + # Finally, do the \X match. + my @matches = $string =~ /(\X)/g; + + # Look through each matched cluster to verify that it matches what we + # expect. + my $min = (@matches < @should_match) ? @matches : @should_match; + for my $i (0 .. $min - 1) { + $Tests++; + if ($matches[$i] eq $should_match[$i]) { + print "ok $Tests - "; + if ($i == 0) { + print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; + } else { + print "And \\X #", $i + 1, + } + print " correctly matched $should_display[$i]; line $line\n"; + } else { + $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ } + unpack("U*", $matches[$i])); + print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #", + $i + 1, + " should have matched $should_display[$i]", + " but instead matched $matches[$i]", + ". Abandoning rest of line $line\n"; + next UPGRADE; + } + } + + # And the number of matches should equal the number of expected matches. + $Tests++; + if (@matches == @should_match) { + print "ok $Tests - Nothing was left over; line $line\n"; + } else { + print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n"; + } + } + + return; +} + sub Finished() { - print "1..$Tests.\n"; + print "1..$Tests\n"; exit($Fails ? -1 : 0); } Error('\p{Script=InGreek}'); # Bug #69018 +Test_X("1100 $nobreak 1161"); # Bug #70940