# Any files created or read by this program should be listed in 'mktables.lst'
# Use -makelist to regenerate it.
-require 5.008; # Needs pack "U". Probably safest to run on 5.8.x
+# Needs 'no overloading' to run faster on miniperl. Code commented out at the
+# subroutine objaddr can be used instead to work as far back (untested) as
+# 5.8: needs pack "U".
+require 5.010_001;
use strict;
use warnings;
use Carp;
# string, but it is a contributory property, and therefore not output by
# default.
#
+# DEBUGGING
+#
+# XXX Add more stuff here. use perl instead of miniperl to find problems with
+# Scalar::Util
+
# FUTURE ISSUES
#
# The program would break if Unicode were to change its names so that
}
}
- if ($print_caller) {
- if (defined $line_number) {
- print STDERR sprintf "%4d: ", $line_number;
- }
- else {
- print STDERR " ";
- }
- $caller_name .= ": ";
- print STDERR $caller_name;
- }
-
+ print STDERR sprintf "%4d: ", $line_number if defined $line_number;
+ print STDERR "$caller_name: " if $print_caller;
print STDERR $output, "\n";
return;
}
elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
-d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
}
- elsif ($arg eq '-maketest'
- || ($arg eq '-T' && defined ($t_path = shift))) {
+ elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
+ {
$make_test_script = 1;
- $t_path = 'TestProp.pl' unless defined $t_path;
}
elsif ($arg eq '-makelist') {
$make_list = 1;
except those specified by the -P and -T options will be done
with respect to this directory.
-P dir : Output $pod_file file to directory 'dir'.
- -T path : Create a .t test file as 'path'
+ -T path : Create a test script as 'path'; overrides -maketest
-L filelist : Use alternate 'filelist' instead of standard one
-globlist : Take as input all non-Test *.txt files in current and sub
directories
- -maketest : Make test script
+ -maketest : Make test script 'TestProp.pl' in current (or -C directory),
+ overrides -T
-makelist : Rewrite the file list $file_list based on current setup
-check A B : Executes $0 only if A and B are the same
END
# Change directories now, because need to read 'version' early.
if ($use_directory) {
- if ($pod_directory
- && ! File::Spec->file_name_is_absolute($pod_directory))
- {
+ if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
$pod_directory = File::Spec->rel2abs($pod_directory);
}
- if ($t_path
- && ! File::Spec->file_name_is_absolute($t_path))
- {
+ if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
$t_path = File::Spec->rel2abs($t_path);
}
chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
- if ($pod_directory
- && File::Spec->file_name_is_absolute($pod_directory))
- {
+ if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
$pod_directory = File::Spec->abs2rel($pod_directory);
}
- if ($t_path
- && File::Spec->file_name_is_absolute($t_path))
- {
+ if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
$t_path = File::Spec->abs2rel($t_path);
}
}
# unless explicitly added.
if ($v_version ge v5.2.0) {
my $unihan = 'Unihan; remove from list if using Unihan';
- foreach my $table qw (
+ foreach my $table qw (
kAccountingNumeric
kOtherNumeric
kPrimaryNumeric
my $HEADER=<<"EOF";
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
-# This file is machine-generated by $0 from the Unicode database,
-# Version $string_version. Any changes made here will be lost!
+# This file is machine-generated by $0 from the Unicode
+# database, Version $string_version. Any changes made here will be lost!
EOF
my $INTERNAL_ONLY=<<"EOF";
# This file contains information artificially constrained to code points
# present in Unicode release $string_compare_versions.
# IT CANNOT BE RELIED ON. It is for use during development only and should
-# not be used for production.
+# not be used for production.
EOF
my $NULL = 4; # The map is to the null string; utf8.c can't
# handle these, nor is there an accepted syntax
# for them in \p{} constructs
-my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; meanst that ranges that would
+my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
# otherwise be $MULTI_CP type are instead type 0
# process_generic_property_file() can accept certain overrides in its input.
my $NORMAL = "";
my $SUPPRESSED = 'z'; # The character should never actually be seen, since
# it is suppressed
+my $PLACEHOLDER = 'P'; # Implies no pod entry generated
my $DEPRECATED = 'D';
my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
$SUPPRESSED => 'should never be generated',
$STABILIZED => 'stabilized',
$OBSOLETE => 'obsolete',
- $DEPRECATED => 'deprecated'
+ $DEPRECATED => 'deprecated',
);
# The format of the values of the map tables:
my %Jamo_V; # Vowels
my %Jamo_T; # Trailing consonants
+my @backslash_X_tests; # List of tests read in for testing \X
my @unhandled_properties; # Will contain a list of properties found in
# the input that we didn't process.
-my @match_properties; # properties that have match tables, to be
+my @match_properties; # Properties that have match tables, to be
# listed in the pod
my @map_properties; # Properties that get map files written
my @named_sequences; # NamedSequences.txt contents.
return -e internal_file_to_platform($file);
}
-# This 'require' doesn't necessarily work in miniperl, and even if it does,
-# the native perl version of it (which is what would operate under miniperl)
-# is extremely slow, as it does a string eval every call.
-my $has_fast_scalar_util = $\18 !~ /miniperl/
- && defined eval "require Scalar::Util";
-
sub objaddr($) {
- # Returns the address of the blessed input object. Uses the XS version if
- # available. It doesn't check for blessedness because that would do a
- # string eval every call, and the program is structured so that this is
- # never called for a non-blessed object.
-
- return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
-
- # Check at least that is a ref.
- my $pkg = ref($_[0]) or return undef;
+ # Returns the address of the blessed input object.
+ # It doesn't check for blessedness because that would do a string eval
+ # every call, and the program is structured so that this is never called
+ # for a non-blessed object.
- # Change to a fake package to defeat any overloaded stringify
- bless $_[0], 'main::Fake';
+ no overloading; # If overloaded, numifying below won't work.
# Numifying a ref gives its address.
- my $addr = 0 + $_[0];
-
- # Return to original class
- bless $_[0], $pkg;
- return $addr;
+ return 0 + $_[0];
}
+# Commented code below should work on Perl 5.8.
+## This 'require' doesn't necessarily work in miniperl, and even if it does,
+## the native perl version of it (which is what would operate under miniperl)
+## is extremely slow, as it does a string eval every call.
+#my $has_fast_scalar_util = $\18 !~ /miniperl/
+# && defined eval "require Scalar::Util";
+#
+#sub objaddr($) {
+# # Returns the address of the blessed input object. Uses the XS version if
+# # available. It doesn't check for blessedness because that would do a
+# # string eval every call, and the program is structured so that this is
+# # never called for a non-blessed object.
+#
+# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
+#
+# # Check at least that is a ref.
+# my $pkg = ref($_[0]) or return undef;
+#
+# # Change to a fake package to defeat any overloaded stringify
+# bless $_[0], 'main::Fake';
+#
+# # Numifying a ref gives its address.
+# my $addr = 0 + $_[0];
+#
+# # Return to original class
+# bless $_[0], $pkg;
+# return $addr;
+#}
+
sub max ($$) {
my $a = shift;
my $b = shift;
our $Verbose = 1 if main::DEBUG; # Useful info when debugging
+# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
+# and overload trying to load Scalar:Util under miniperl. See
+# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
+undef $overload::VERSION;
+
sub my_carp {
my $message = shift || "";
my $nofold = shift || 0;
no strict "refs";
*$subname = sub {
use strict "refs";
- my $self = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
+ my $addr = main::objaddr $_[0];
if (ref $field->{$addr} ne 'ARRAY') {
my $type = ref $field->{$addr};
$type = 'scalar' unless $type;
no strict "refs";
*$subname = sub {
use strict "refs";
- my $self = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- return $field->{main::objaddr $self};
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
+ return $field->{main::objaddr $_[0]};
}
}
}
no strict "refs";
*$subname = sub {
use strict "refs";
- return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
- my $self = shift;
- my $value = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- $field->{main::objaddr $self} = $value;
+ if (main::DEBUG) {
+ return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
+ Carp::carp_extra_args(\@_) if @_ > 2;
+ }
+ # $self is $_[0]; $value is $_[1]
+ $field->{main::objaddr $_[0]} = $_[1];
return;
}
}
sub trace { return main::trace(@_); }
-
{ # Closure
# Keep track of fields that are to be put into the constructor.
my %constructor_fields;
# processed when you set the $debug_skip global.
main::set_access('non_skip', \%non_skip, 'c');
+ my %skip;
+ # This is used to skip processing of this input file semi-permanently.
+ # It is used for files that we aren't planning to process anytime soon,
+ # but want to allow to be in the directory and not raise a message that we
+ # are not handling. Mostly for test files. This is in contrast to the
+ # non_skip element, which is supposed to be used very temporarily for
+ # debugging. Sets 'optional' to 1
+ main::set_access('skip', \%skip, 'c');
+
my %each_line_handler;
# list of subroutines to look at and filter each non-comment line in the
# file. defaults to none. The subroutines are called in order, each is
# Set defaults
$handler{$addr} = \&main::process_generic_property_file;
$non_skip{$addr} = 0;
+ $skip{$addr} = 0;
$has_missings_defaults{$addr} = $NO_DEFAULTS;
$handle{$addr} = undef;
$added_lines{$addr} = [ ];
$missings{$addr} = [ ];
# Two positional parameters.
+ return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
$file{$addr} = main::internal_file_to_platform(shift);
$first_released{$addr} = shift;
print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
}
+ $optional{$addr} = 1 if $skip{$addr};
+
return $self;
}
}
# File could be optional
- if ($optional{$addr}){
+ if ($optional{$addr}) {
return unless -e $file;
my $result = eval $optional{$addr};
if (! defined $result) {
}
else {
- # Here, the file exists
+ # Here, the file exists. Some platforms may change the case of
+ # its name
if ($seen_non_extracted_non_age) {
- if ($file =~ /$EXTRACTED/) {
+ if ($file =~ /$EXTRACTED/i) {
Carp::my_carp_bug(join_lines(<<END
-$file should be processed just after the 'Prop..Alias' files, and before
+$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
have subtle problems
END
}
elsif ($EXTRACTED_DIR
&& $first_released{$addr} ne v0
- && $file !~ /$EXTRACTED/
- && $file ne 'DAge.txt')
+ && $file !~ /$EXTRACTED/i
+ && lc($file) ne 'dage.txt')
{
# We don't set this (by the 'if' above) if we have no
# extracted directory, so if running on an early version,
# isn't a file we are expecting. As we process the files,
# they are deleted from the hash, so any that remain at the
# end of the program are files that we didn't process.
- Carp::my_carp("Was not expecting '$file'.") if
- ! delete $potential_files{File::Spec->rel2abs($file)}
+ my $fkey = File::Spec->rel2abs($file);
+ my $expecting = delete $potential_files{$fkey};
+ $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
+ Carp::my_carp("Was not expecting '$file'.") if
+ ! $expecting
&& ! defined $handle{$addr};
+ # Having deleted from expected files, we can quit if not to do
+ # anything. Don't print progress unless really want verbosity
+ if ($skip{$addr}) {
+ print "Skipping $file.\n" if $verbosity >= $VERBOSE;
+ return;
+ }
+
# Open the file, converting the slashes used in this program
# into the proper form for the OS
my $file_handle;
return $self->_add_delete('+', $start, $end, "");
}
- my $non_ASCII = (ord('A') == 65); # Assumes test on same platform
+ my $non_ASCII = (ord('A') != 65); # Assumes test on same platform
sub is_code_point_usable {
# This used only for making the test script. See if the input
# For non-ASCII, we shun the characters that don't have Perl encoding-
# independent symbols for them. 'A' is such a symbol, so is "\n".
- # Note, this program hopefully will work on 5.8 Perls, and \v is not
- # such a symbol in them.
return $try_hard if $non_ASCII
&& $code <= 0xFF
&& ($code >= 0x7F
|| ($code >= 0x0E && $code <= 0x1F)
|| ($code >= 0x01 && $code <= 0x06)
- || $code == 0x0B); # \v introduced after 5.8
+ || $code == 0x0B);
# shun null. I'm (khw) not sure why this was done, but NULL would be
# the character very frequently used.
main::set_access('nominal_short_name_length',
\%nominal_short_name_length);
+ my %complete_name;
+ # The complete name, including property.
+ main::set_access('complete_name', \%complete_name, 'r');
+
my %property;
# Parent property this table is attached to.
main::set_access('property', \%property, 'r');
$name{$addr} = delete $args{'Name'};
$find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
$full_name{$addr} = delete $args{'Full_Name'};
+ my $complete_name = $complete_name{$addr}
+ = delete $args{'Complete_Name'};
$internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
- $perl_extension{$addr} = delete $args{'Perl_Extension'} || 0;
$property{$addr} = delete $args{'_Property'};
$range_list{$addr} = delete $args{'_Range_List'};
$status{$addr} = delete $args{'Status'} || $NORMAL;
my $loose_match = delete $args{'Fuzzy'};
my $note = delete $args{'Note'};
my $make_pod_entry = delete $args{'Pod_Entry'};
+ my $perl_extension = delete $args{'Perl_Extension'};
# Shouldn't have any left over
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
push @{$description{$addr}}, $description if $description;
push @{$note{$addr}}, $note if $note;
- # If hasn't set its status already, see if it is on one of the lists
- # of properties or tables that have particular statuses; if not, is
- # normal. The lists are prioritized so the most serious ones are
- # checked first
- my $complete_name = $self->complete_name;
- if (! $status{$addr}) {
+ if ($status{$addr} eq $PLACEHOLDER) {
+
+ # A placeholder table doesn't get documented, is a perl extension,
+ # and quite likely will be empty
+ $make_pod_entry = 0 if ! defined $make_pod_entry;
+ $perl_extension = 1 if ! defined $perl_extension;
+ push @tables_that_may_be_empty, $complete_name{$addr};
+ }
+ elsif (! $status{$addr}) {
+
+ # If hasn't set its status already, see if it is on one of the
+ # lists of properties or tables that have particular statuses; if
+ # not, is normal. The lists are prioritized so the most serious
+ # ones are checked first
if (exists $why_suppressed{$complete_name}) {
$status{$addr} = $SUPPRESSED;
}
}
}
+ $perl_extension{$addr} = $perl_extension || 0;
+
# By convention what typically gets printed only or first is what's
# first in the list, so put the full name there for good output
# clarity. Other routines rely on the full name being first on the
# class
for my $sub qw(
append_to_body
- complete_name
pre_body
)
# append_to_body and pre_body are called in the write() method
# to add stuff after the main body of the table, but before
# its close; and to prepend stuff before the beginning of the
# table.
- # complete_name returns the complete name of the property and
- # table, like Script=Latin
{
no strict "refs";
*$sub = sub {
my $default_map = delete $args{'Default_Map'};
my $format = delete $args{'Format'};
my $property = delete $args{'_Property'};
+ my $full_name = delete $args{'Full_Name'};
# Rest of parameters passed on
my $range_list = Range_Map->new(Owner => $property);
my $self = $class->SUPER::new(
Name => $name,
+ Complete_Name => $full_name,
+ Full_Name => $full_name,
_Property => $property,
_Range_List => $range_list,
%args);
return "Map table for Property '$name'";
}
- sub complete_name {
- # The complete name for a map table is just its full name, as that
- # completely identifies the property it represents
-
- return shift->full_name;
- }
-
sub add_alias {
# Add a synonym for this table (which means the property itself)
my $self = shift;
# But for $STRING properties, must calculate now. Subtract the
# count from each range that maps to the default.
foreach my $range ($self->_range_list->ranges) {
- local $to_trace = 1 if main::DEBUG;
- trace $self, $range;
if ($range->value eq $default_map) {
$count -= $range->end +1 - $range->start;
}
# The property for which this table is a listing of property values.
my $property = delete $args{'_Property'};
+ my $name = delete $args{'Name'};
+ my $full_name = delete $args{'Full_Name'};
+ $full_name = $name if ! defined $full_name;
+
# Optional
my $initialize = delete $args{'Initialize'};
my $matches_all = delete $args{'Matches_All'} || 0;
my $range_list = Range_List->new(Initialize => $initialize,
Owner => $property);
+ my $complete = $full_name;
+ $complete = '""' if $complete eq ""; # A null name shouldn't happen,
+ # but this helps debug if it
+ # does
+ # The complete name for a match table includes it's property in a
+ # compound form 'property=table', except if the property is the
+ # pseudo-property, perl, in which case it is just the single form,
+ # 'table' (If you change the '=' must also change the ':' in lots of
+ # places in this program that assume an equal sign)
+ $complete = $property->full_name . "=$complete" if $property != $perl;
+
+
my $self = $class->SUPER::new(%args,
+ Name => $name,
+ Complete_Name => $complete,
+ Full_Name => $full_name,
_Property => $property,
_Range_List => $range_list,
);
sub _operator_stringify {
my $self = shift;
- my $name= $self->complete_name;
+ my $name = $self->complete_name;
return "Table '$name'";
}
return $self->_range_list->add_range(@_);
}
- sub complete_name {
- # The complete name for a match table includes it's property in a
- # compound form 'property=table', except if the property is the
- # pseudo-property, perl, in which case it is just the single form,
- # 'table'
-
- my $self = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
-
- my $name = $self->full_name;
- my $property = $self->property;
- $name = '""' if $name eq ""; # A null name shouldn't happen, but this
- # helps debug if it does
- return $name if $property == $perl;
-
- # (If change the '=' must also change the ':' in set_final_comment(),
- # and the references to colon in its text)
- return $property->full_name . '=' . $name;
- }
-
sub pre_body { # Does nothing for match tables.
return
}
my $flag = $property->status
|| $table->status
|| $table_alias_object->status;
- $flags{$flag} = $status_past_participles{$flag} if $flag;
+ if ($flag) {
+ if ($flag ne $PLACEHOLDER) {
+ $flags{$flag} = $status_past_participles{$flag};
+ } else {
+ $flags{$flag} = <<END;
+a placeholder because it is not in Version $string_version of Unicode, but is
+needed by the Perl core to work gracefully. Because it is not in this version
+of Unicode, it will not be listed in $pod_file.pod
+END
+ }
+ }
$loose_count++;
push @note, $table->note;
push @conflicting, $table->conflicting;
+ # And this for output after all the tables.
+ push @global_comments, $table->comment;
+
# Compute an alternate compound name using the final property
# synonym and the first table synonym with a colon instead of
# the equal sign used elsewhere.
if (%flags) {
foreach my $flag (sort keys %flags) {
$comment .= <<END;
-'$flag' below means that this form is $flags{$flag}. Consult $pod_file.pod
+'$flag' below means that this form is $flags{$flag}.
END
+ next if $flag eq $PLACEHOLDER;
+ $comment .= "Consult $pod_file.pod\n";
}
$comment .= "\n";
}
$match$synonyms:
$matches_comment
-$pod_file.pod should be consulted for the rules on using $any_of_these,
+$pod_file.pod should be consulted for the syntax rules for $any_of_these,
including if adding or subtracting white space, underscore, and hyphen
characters matters or doesn't matter, and other permissible syntactic
variants. Upper/lower case distinctions never matter.
# And append any comment(s) from the actual tables. They are all
# gathered here, so may not read all that well.
- $comment .= "\n" . join "\n\n", @global_comments if @global_comments;
+ if (@global_comments) {
+ $comment .= "\n" . join("\n\n", @global_comments) . "\n";
+ }
if ($count) { # The format differs if no code points, and needs no
# explanation in that case
# Complications arise because any number of properties can be in the
# file, in any order, interspersed in any way. The first time a
# property is seen, it gets information about that property and
- # cache's it for quick retrieval later. It also normalizes the maps
+ # caches it for quick retrieval later. It also normalizes the maps
# so that only one of many synonym is stored. The Unicode input files
# do use some multiple synonyms.
# Make sure there is no conflict between the two.
# $missings has priority.
if (ref $missings) {
- $default_table
- = $property_object->table($default_map);
+ $default_table
+ = $property_object->table($default_map);
if (! defined $default_table
|| $default_table != $missings)
{
}
}
-# Unused until revise charnames;
+# XXX Unused until revise charnames;
#sub check_and_handle_compound_name {
# This looks at Name properties for parenthesized components and splits
# them off. Thus it finds FF as an equivalent to Form Feed.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
- Directory => '.',
+ Directory => File::Spec->curdir(),
File => 'Decomposition',
Format => $STRING_FORMAT,
Internal_Only_Warning => 1,
}
} # End closure for UnicodeData
+sub process_GCB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @backslash_X_tests, $_;
+ }
+
+ return;
+}
+
sub process_NamedSequences {
# NamedSequences.txt entries are just added to an array. Because these
# don't look like the other tables, they have their own handler.
sub filter_old_style_case_folding {
# This transforms $_ containing the case folding style of 3.0.1, to 3.1
- # and later style, then calls the handler for the later style. Different
- # letters were used.
+ # and later style. Different letters were used in the earlier.
my $file = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
return;
}
-sub filter_numeric_value_comment_missing_line {
- # Filters out the extra column in DNumValues.txt of this line
-
- s/^($missing_defaults_prefix)\s*;\s*/$1/;
- return;
-}
-
sub register_fraction($) {
# This registers the input rational number so that it can be passed on to
# utf8_heavy.pl, both in rational and floating forms.
$CanonDCIJ = $CanonDCIJ & $Assigned;
}
- # This is used in Unicode's definition of \X
+ # These are used in Unicode's definition of \X
+ my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
+ my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
+
my $gcb = property_ref('Grapheme_Cluster_Break');
- if (defined $gcb) {
- my $extend = $perl->add_match_table('_GCB_Extend',
- Initialize => $gcb->table('Extend'));
- $extend += $gcb->table('SpacingMark')
- if defined $gcb->table('SpacingMark');
+
+ # The 'extended' grapheme cluster came in 5.1. The non-extended
+ # definition differs too much from the traditional Perl one to use.
+ if (defined $gcb && defined $gcb->table('SpacingMark')) {
+
+ # Note that assumes HST is defined; it came in an earlier release than
+ # GCB. In the line below, two negatives means: yes hangul
+ $begin += ~ property_ref('Hangul_Syllable_Type')
+ ->table('Not_Applicable')
+ + ~ ($gcb->table('Control')
+ + $gcb->table('CR')
+ + $gcb->table('LF'));
+ $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
+
+ $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
+ $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
}
else { # Old definition, used on early releases.
- $perl->add_match_table('_X_Extend', Initialize => $gc->table('Mark')
- + 0x200C # ZWNJ
- + 0x200D # ZWJ
- );
+ $extend += $gc->table('Mark')
+ + 0x200C # ZWNJ
+ + 0x200D; # ZWJ
+ $begin += ~ $extend;
+
+ # Here we may have a release that has the regular grapheme cluster
+ # defined, or a release that doesn't have anything defined.
+ # We set things up so the Perl core degrades gracefully, possibly with
+ # placeholders that match nothing.
+
+ if (! defined $gcb) {
+ $gcb = Property->new('GCB', Status => $PLACEHOLDER);
+ }
+ my $hst = property_ref('HST');
+ if (!defined $hst) {
+ $hst = Property->new('HST', Status => $PLACEHOLDER);
+ $hst->add_match_table('Not_Applicable',
+ Initialize => $Any,
+ Matches_All => 1);
+ }
+
+ # On some releases, here we may not have the needed tables for the
+ # perl core, in some releases we may.
+ foreach my $name (qw{ L LV LVT T V prepend }) {
+ my $table = $gcb->table($name);
+ if (! defined $table) {
+ $table = $gcb->add_match_table($name);
+ push @tables_that_may_be_empty, $table->complete_name;
+ }
+
+ # The HST property predates the GCB one, and has identical tables
+ # for some of them, so use it if we can.
+ if ($table->is_empty
+ && defined $hst
+ && defined $hst->table($name))
+ {
+ $table += $hst->table($name);
+ }
+ }
+ }
+
+ # More GCB. If we found some hangul syllables, populate a combined
+ # table.
+ my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
+ my $LV = $gcb->table('LV');
+ if ($LV->is_empty) {
+ push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
+ } else {
+ $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
+ $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
}
# Create a new property specially located that is a combination of the
my $perl_charname = Property->new('Perl_Charnames',
Core_Access => '\N{...} and charnames.pm',
Default_Map => "",
- Directory => '.',
+ Directory => File::Spec->curdir(),
File => 'Name',
Internal_Only_Warning => 1,
Perl_Extension => 1,
Type => $ENUM,
Initialize => $ccc,
File => 'CombiningClass',
- Directory => '.',
+ Directory => File::Spec->curdir(),
);
$perl_ccc->set_to_output_map(1);
$perl_ccc->add_comment(join_lines(<<END
sub pod_alphanumeric_sort {
# Sort pod entries alphanumerically.
- # The first few character columns are filler; and get rid of all the
- # trailing stuff, starting with the trailing '}', so as to sort on just
- # '\p{Name=Value'
- my $a = lc substr($a, $FILLER);
+ # The first few character columns are filler, plus the '\p{'; and get rid
+ # of all the trailing stuff, starting with the trailing '}', so as to sort
+ # on just 'Name=Value'
+ (my $a = lc $a) =~ s/^ .*? { //x;
$a =~ s/}.*//;
- my $b = lc substr($b, $FILLER);
+ (my $b = lc $b) =~ s/^ .*? { //x;
$b =~ s/}.*//;
+ # Determine if the two operands are both internal only or both not.
+ # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
+ # should be the underscore that begins internal only
+ my $a_is_internal = (substr($a, 0, 1) eq '_');
+ my $b_is_internal = (substr($b, 0, 1) eq '_');
+
+ # Sort so the internals come last in the table instead of first (which the
+ # leading underscore would otherwise indicate).
+ if ($a_is_internal != $b_is_internal) {
+ return 1 if $a_is_internal;
+ return -1
+ }
+
# Determine if the two operands are numeric property values or not.
- # A numeric property will look like \p{xyz: 3}. But the number
+ # A numeric property will look like xyz: 3. But the number
# can begin with an optional minus sign, and may have a
- # fraction or rational component, like \p{xyz: 3/2}. If either
+ # fraction or rational component, like xyz: 3/2. If either
# isn't numeric, use alphabetic sort.
my ($a_initial, $a_number) =
- ($a =~ /^\\p{ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
+ ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
return $a cmp $b unless defined $a_number;
my ($b_initial, $b_number) =
- ($b =~ /^\\p{ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
+ ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
return $a cmp $b unless defined $b_number;
# Here they are both numeric, but use alphabetic sort if the
# Add an entry in the pod file for the table; it also does
# the children.
- make_table_pod_entries($table);
+ make_table_pod_entries($table) if defined $pod_directory;
# See if the the table matches identical code points with
# something that has already been output. In that case,
= $standard_property_name;
}
- # Now for the pod entry for this alias. Skip
- # the first one, which is the full name so won't have
- # an entry like: '\p{full: *} \p{full: *}', and skip
- # if don't want an entry for this one.
- next if $i == 0 || ! $alias->make_pod_entry;
+ # Now for the pod entry for this alias. Skip if not
+ # outputting a pod; skip the first one, which is the
+ # full name so won't have an entry like: '\p{full: *}
+ # \p{full: *}', and skip if don't want an entry for
+ # this one.
+ next if $i == 0
+ || ! defined $pod_directory
+ || ! $alias->make_pod_entry;
push @match_properties,
format_pod_line($indent_info_column,
# tests are added, it will perturb all later ones in the .t file
srand 0;
+ $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
+
force_unlink ($t_path);
push @files_actually_output, $t_path;
my $OUT;
}
}
}
+
+ foreach my $test (@backslash_X_tests) {
+ print $OUT "Test_X('$test');\n";
+ }
+
print $OUT "Finished();\n";
close $OUT;
return;
Handler => \&process_PropertyAliases,
),
Input_file->new(undef, v0, # No file associated with this
- Progress_Message => 'Finishing Property Setup',
+ Progress_Message => 'Finishing property setup',
Handler => \&finish_property_setup,
),
Input_file->new('PropValueAliases.txt', v0,
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
),
+ Input_file->new("NormalizationTest.txt", v3.0.1,
+ Skip => 1,
+ ),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
Each_Line_Handler =>
Property => 'Grapheme_Cluster_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
+ Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
+ Handler => \&process_GCB_test,
+ ),
+ Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
+ Skip => 1,
+ ),
+ Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
+ Skip => 1,
+ ),
+ Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
+ Skip => 1,
+ ),
Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
Property => 'Sentence_Break',
Has_Missings_Defaults => $NOT_IGNORED,
Input_file->new('NameAliases.txt', v5.0.0,
Property => 'Name_Alias',
),
+ Input_file->new("BidiTest.txt", v5.2.0,
+ Skip => 1,
+ ),
Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
# Put into %potential_files a list of all the files in the directory structure
# that could be inputs to this program, excluding those that we should ignore.
-# Also don't consider test files. Use absolute file names because it makes it
-# easier across machine types.
+# Use absolute file names because it makes it easier across machine types.
my @ignored_files_full_names = map { File::Spec->rel2abs(
internal_file_to_platform($_))
} keys %ignored_files;
File::Find::find({
wanted=>sub {
- return unless /\.txt$/i;
- return if /Test\.txt$/i;
- my $full = File::Spec->rel2abs($_);
+ return unless /\.txt$/i; # Some platforms change the name's case
+ my $full = lc(File::Spec->rel2abs($_));
$potential_files{$full} = 1
- if ! grep { $full eq $_ } @ignored_files_full_names;
+ if ! grep { $full eq lc($_) } @ignored_files_full_names;
return;
}
}, File::Spec->curdir());
else {
print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
my $file_handle;
- if (! open $file_handle,"<",$file_list) {
- Carp::my_carp("Failed to open '$file_list', turning on -globlist option instead: $!");
+ if (! open $file_handle, "<", $file_list) {
+ Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!");
$glob_list = 1;
}
else {
# The paths are stored with relative names, and with '/' as the
# delimiter; convert to absolute on this machine
- my $full = File::Spec->rel2abs(internal_file_to_platform($input));
+ my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
$potential_files{$full} = 1
- if ! grep { $full eq $_ } @ignored_files_full_names;
+ if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
}
}
# Here wants to process all .txt files in the directory structure.
# Convert them to full path names. They are stored in the platform's
# relative style
- my @known_files = map { File::Spec->rel2abs($_->file) }
- @input_file_objects;
+ my @known_files;
+ foreach my $object (@input_file_objects) {
+ my $file = $object->file;
+ next unless defined $file;
+ push @known_files, File::Spec->rel2abs($file);
+ }
my @unknown_input_files;
foreach my $file (keys %potential_files) {
- next if grep { $file eq $_ } @known_files;
+ next if grep { lc($file) eq lc($_) } @known_files;
# Here, the file is unknown to us. Get relative path name
$file = File::Spec->abs2rel($file);
# If the file isn't extracted (meaning none of the directories is the
# extracted one), just add it to the end of the list of inputs.
if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
- push @input_file_objects, Input_file->new($file);
+ push @input_file_objects, Input_file->new($file, v0);
}
else {
# $compare_versions is set.
for (my $i = 0; $i < @input_file_objects; $i++) {
if ($input_file_objects[$i]->first_released ne v0
- && $input_file_objects[$i]->file ne 'DAge.txt'
- && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/)
+ && lc($input_file_objects[$i]->file) ne 'dage.txt'
+ && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
{
- splice @input_file_objects, $i, 0, Input_file->new($file);
+ splice @input_file_objects, $i, 0,
+ Input_file->new($file, v0);
last;
}
}
}
}
if (@unknown_input_files) {
- print STDERR simple_fold(join_line(<<END
+ print STDERR simple_fold(join_lines(<<END
The following files are unknown as to how to handle. Assuming they are
typical property files. You'll know by later error messages if it worked or
not:
END
- ) . join(", ", @unknown_input_files) . "\n\n");
+ ) . " " . join(", ", @unknown_input_files) . "\n\n");
}
} # End of looking through directory structure for more .txt files.
-if ( $make_list ) {
- foreach my $file (@mktables_list_output_files) {
- unlink internal_file_to_platform($file);
- }
-}
-
# Create the list of input files from the objects we have defined, plus
# version
my @input_files = 'version';
use strict;
use warnings;
-# Test the \p{} regular expression constructs. This file is constructed by
-# mktables from the tables it generates, so if mktables is buggy, this won't
-# necessarily catch those bugs. Tests are generated for all feasible
-# properties; a few aren't currently feasible; see is_code_point_usable()
-# in mktables for details.
+# Test qr/\X/ and the \p{} regular expression constructs. This file is
+# constructed by mktables from the tables it generates, so if mktables is
+# buggy, this won't necessarily catch those bugs. Tests are generated for all
+# feasible properties; a few aren't currently feasible; see
+# is_code_point_usable() in mktables for details.
# Standard test packages are not used because this manipulates SIG_WARN. It
# exits 0 if every non-skipped test succeeded; -1 if any failed.
my $Tests = 0;
my $Fails = 0;
-my $Skips = 0;
-my $non_ASCII = (ord('A') == 65);
+my $non_ASCII = (ord('A') != 65);
+
+# The 256 8-bit characters in ASCII ordinal order, with the ones that don't
+# have Perl names replaced by -1
+my @ascii_ordered_chars = (
+ "\0",
+ (-1) x 6,
+ "\a", "\b", "\t", "\n",
+ -1, # No Vt
+ "\f", "\r",
+ (-1) x 18,
+ " ", "!", "\"", "#", '$', "%", "&", "'",
+ "(", ")", "*", "+", ",", "-", ".", "/",
+ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+ ":", ";", "<", "=", ">", "?", "@",
+ "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
+ "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
+ "[", "\\", "]", "^", "_", "`",
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
+ "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
+ "{", "|", "}", "~",
+ (-1) x 129
+);
-# The first 127 ASCII characters in ordinal order, with the ones that don't
-# have Perl names (as of 5.8) replaced by dots. The 127th is used as the
-# string delimiter
-my $ascii_to_ebcdic = "\0......\a\b\t\n.\f\r.................. !\"#\$\%&'()*+,-./0123456789:;<=>?\@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
-#for my $i (0..126) {
-# print $i, ": ", substr($ascii_to_ebcdic, $i, 1), "\n";
-#}
+sub ASCII_ord_to_native ($) {
+ # Converts input ordinal number to the native one, if can be done easily.
+ # Returns -1 otherwise.
+
+ my $ord = shift;
+
+ return $ord if $ord > 255 || ! $non_ASCII;
+ my $result = $ascii_ordered_chars[$ord];
+ return $result if $result eq '-1';
+ return ord($result);
+}
sub Expect($$$$) {
my $expected = shift;
# or empty if none
my $line = (caller)[2];
- # Convert the code point to hex form
- my $string = sprintf "\"\\x{%04X}\"", $ord;
-
- # Convert the non-ASCII code points expressible as characters in Perl 5.8
- # to their ASCII equivalents, and skip the others.
- if ($non_ASCII && $ord < 255) {
-
- # Dots are used as place holders in the conversion string for the
- # non-convertible ones, so check for it first.
- if ($ord == 0x2E) {
- $ord = ord('.');
- }
- elsif ($ord < 0x7F
- # Any dots returned are non-convertible.
- && ((my $char = substr($ascii_to_ebcdic, $ord, 1)) ne '.'))
- {
- #print STDERR "$ord, $char, \n";
- $ord = ord($char);
- }
- else {
- $Tests++;
- $Skips++;
- print "ok $Tests - $string =~ $regex # Skipped: non-ASCII\n";
- return;
- }
+ # Convert the non-ASCII code points expressible as characters to their
+ # ASCII equivalents, and skip the others.
+ $ord = ASCII_ord_to_native($ord);
+ if ($ord < 0) {
+ $Tests++;
+ print "ok $Tests - "
+ . sprintf("\"\\x{%04X}\"", $ord)
+ . " =~ $regex # Skipped: non-ASCII\n";
+ return;
}
- # The first time through, use all warnings.
+ # Convert the code point to hex form
+ my $string = sprintf "\"\\x{%04X}\"", $ord;
+
my @tests = "";
- # If the input should generate a warning, add another time through with
- # them turned off
+ # The first time through, use all warnings. If the input should generate
+ # a warning, add another time through with them turned off
push @tests, "no warnings '$warning_type';" if $warning_type;
foreach my $no_warnings (@tests) {
return;
}
+# GCBTest.txt character that separates grapheme clusters
+my $breakable_utf8 = my $breakable = chr(0xF7);
+utf8::upgrade($breakable_utf8);
+
+# GCBTest.txt character that indicates that the adjoining code points are part
+# of the same grapheme cluster
+my $nobreak_utf8 = my $nobreak = chr(0xD7);
+utf8::upgrade($nobreak_utf8);
+
+sub Test_X($) {
+ # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
+ # Each such line is a sequence of code points given by their hex numbers,
+ # separated by the two characters defined just before this subroutine that
+ # indicate that either there can or cannot be a break between the adjacent
+ # code points. If there isn't a break, that means the sequence forms an
+ # extended grapheme cluster, which means that \X should match the whole
+ # thing. If there is a break, \X should stop there. This is all
+ # converted by this routine into a match:
+ # $string =~ /(\X)/,
+ # Each \X should match the next cluster; and that is what is checked.
+
+ my $template = shift;
+
+ my $line = (caller)[2];
+
+ # The line contains characters above the ASCII range, but in Latin1. It
+ # may or may not be in utf8, and if it is, it may or may not know it. So,
+ # convert these characters to 8 bits. If knows is in utf8, simply
+ # downgrade.
+ if (utf8::is_utf8($template)) {
+ utf8::downgrade($template);
+ } else {
+
+ # Otherwise, if it is in utf8, but doesn't know it, the next lines
+ # convert the two problematic characters to their 8-bit equivalents.
+ # If it isn't in utf8, they don't harm anything.
+ use bytes;
+ $template =~ s/$nobreak_utf8/$nobreak/g;
+ $template =~ s/$breakable_utf8/$breakable/g;
+ }
+
+ # Get rid of the leading and trailing breakables
+ $template =~ s/^ \s* $breakable \s* //x;
+ $template =~ s/ \s* $breakable \s* $ //x;
+
+ # And no-breaks become just a space.
+ $template =~ s/ \s* $nobreak \s* / /xg;
+
+ # Split the input into segments that are breakable between them.
+ my @segments = split /\s*$breakable\s*/, $template;
+
+ my $string = "";
+ my $display_string = "";
+ my @should_match;
+ my @should_display;
+
+ # Convert the code point sequence in each segment into a Perl string of
+ # characters
+ foreach my $segment (@segments) {
+ my @code_points = split /\s+/, $segment;
+ my $this_string = "";
+ my $this_display = "";
+ foreach my $code_point (@code_points) {
+ my $ord = ASCII_ord_to_native(hex $code_point);
+ if ($ord < 0) {
+ $Tests++;
+ print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n";
+ return;
+ }
+ $this_string .= chr $ord;
+ $this_display .= "\\x{$code_point}";
+ }
+
+ # The next cluster should match the string in this segment.
+ push @should_match, $this_string;
+ push @should_display, $this_display;
+ $string .= $this_string;
+ $display_string .= $this_display;
+ }
+
+ # If a string can be represented in both non-ut8 and utf8, test both cases
+ UPGRADE:
+ for my $to_upgrade (0 .. 1) {
+
+ if ($to_upgrade) {
+
+ # If already in utf8, would just be a repeat
+ next UPGRADE if utf8::is_utf8($string);
+
+ utf8::upgrade($string);
+ }
+
+ # Finally, do the \X match.
+ my @matches = $string =~ /(\X)/g;
+
+ # Look through each matched cluster to verify that it matches what we
+ # expect.
+ my $min = (@matches < @should_match) ? @matches : @should_match;
+ for my $i (0 .. $min - 1) {
+ $Tests++;
+ if ($matches[$i] eq $should_match[$i]) {
+ print "ok $Tests - ";
+ if ($i == 0) {
+ print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
+ } else {
+ print "And \\X #", $i + 1,
+ }
+ print " correctly matched $should_display[$i]; line $line\n";
+ } else {
+ $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
+ unpack("U*", $matches[$i]));
+ print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
+ $i + 1,
+ " should have matched $should_display[$i]",
+ " but instead matched $matches[$i]",
+ ". Abandoning rest of line $line\n";
+ next UPGRADE;
+ }
+ }
+
+ # And the number of matches should equal the number of expected matches.
+ $Tests++;
+ if (@matches == @should_match) {
+ print "ok $Tests - Nothing was left over; line $line\n";
+ } else {
+ print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
+ }
+ }
+
+ return;
+}
+
sub Finished() {
- print "1..$Tests.\n";
+ print "1..$Tests\n";
exit($Fails ? -1 : 0);
}
Error('\p{Script=InGreek}'); # Bug #69018
+Test_X("1100 $nobreak 1161"); # Bug #70940