X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2Funicore%2Fmktables;h=e4020ecf3d24d2623d3056e86355588a663d8ea1;hb=12ac2576dfc10fd43d91903e7602870c10b4f00f;hp=067af24b6abe19bc6482feb5af2e808d518f8251;hpb=d8f6a7325d6b2ec46e8cdc1ec4b5e1ad4a86abd0;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 067af24..e4020ec 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -1,882 +1,1952 @@ #!/usr/bin/perl -w - -# -# mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl) -# from the Unicode database files (lib/unicore/*.txt). -# - use strict; +use Carp; + +## +## mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl) +## from the Unicode database files (lib/unicore/*.txt). +## + +mkdir("lib", 0755); +mkdir("lib/gc_sc", 0755); +mkdir("To", 0755); + +## +## Process any args. +## +my $Verbose = 0; +my $MakeTestScript = 0; + +while (@ARGV) +{ + my $arg = shift @ARGV; + if ($arg eq '-v') { + $Verbose = 1; + } elsif ($arg eq '-q') { + $Verbose = 0; + } elsif ($arg eq '-maketest') { + $MakeTestScript = 1; + } else { + die "usage: $0 [-v|-q] [-maketest]"; + } +} my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1. -mkdir("In", 0755); -mkdir("Is", 0755); -mkdir("To", 0755); +my $HEADER=<<"EOF"; +# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! +# This file is built by $0 from e.g. UnicodeData.txt. +# Any changes made here will be lost! -sub extend { - my ($table, $last) = @_; +EOF + + +## +## Given a filename and a reference to an array of lines, +## write the lines to the file only if the contents have not changed. +## +sub WriteIfChanged($\@) +{ + my $file = shift; + my $lines = shift; + + my $TextToWrite = join '', @$lines; + if (open IN, $file) { + local($/) = undef; + my $PreviousText = ; + close IN; + if ($PreviousText eq $TextToWrite) { + print "$file unchanged.\n" if $Verbose; + return; + } + } + if (not open OUT, ">$file") { + die "$0: can't open $file for output: $!\n"; + } + print "$file written.\n" if $Verbose; - $table->[-1]->[1] = $last; + print OUT $TextToWrite; + close OUT; } -sub append { - my ($table, $code, $name) = @_; - if (@$table && - hex($table->[-1]->[1]) == hex($code) - 1 && - (!defined $name || $table->[-1]->[2] eq $name)) { - extend($table, $code); - } else { - push @$table, [$code, $code, $name]; - } +## +## The main datastructure (a "Table") represents a set of code points that +## are part of a particular quality (that are part of \pL, \p{InGreek}, +## etc.). They are kept as ranges of code points (starting and ending of +## each range). +## +## For example, a range ASCII LETTERS would be represented as: +## [ [ 0x41 => 0x5A, 'UPPER' ], +## [ 0x61 => 0x7A, 'LOWER, ] ] +## +sub RANGE_START() { 0 } ## index into range element +sub RANGE_END() { 1 } ## index into range element +sub RANGE_NAME() { 2 } ## index into range element + +## Conceptually, these should really be folded into the 'Table' objects +my %TableInfo; +my %TableDesc; +my %FuzzyNames; +my %AliasInfo; +my %CanonicalToOrig; + +## +## Turn something like +## OLD-ITALIC +## into +## OldItalic +## +sub CanonicalName($) +{ + my $orig = shift; + my $name = lc $orig; + $name =~ s/(?) { + s/#.*//; + s/\s+$//; + next if /^$/; + + my ($abbrev, $name) = split /\s*;\s*/; + next if $abbrev eq "n/a"; + $PropertyAlias{$abbrev} = $name; + $PA_reverse{$name} = $abbrev; + } + close PA; + + open PVA, "< PropValueAliases.txt" + or confess "Can't open PropValueAliases.txt: $!"; + while () { + s/#.*//; + s/\s+$//; + next if /^$/; + + my ($prop, @data) = split /\s*;\s*/; + + if ($prop eq 'ccc') { + $PropValueAlias{$prop}{$data[1]} = [ @data[0,2] ]; + $PVA_reverse{$prop}{$data[2]} = [ @data[0,1] ]; + } + else { + next if $data[0] eq "n/a"; + $PropValueAlias{$prop}{$data[0]} = $data[1]; + $PVA_reverse{$prop}{$data[1]} = $data[0]; + } + } + close PVA; } -sub inverse { - my ($table) = @_; - my $inverse = []; - my ($first, $last); - if ($table->[0]->[0]) { - $last = hex($table->[0]->[0]); - push @$inverse, [ "0000", - sprintf("%04X", $last - 1) ]; + +## +## Associates a property ("Greek", "Lu", "Assigned",...) with a Table. +## +## Called like: +## New_Prop(In => 'Greek', $Table, Desc => 'Greek Block', Fuzzy => 1); +## +## Normally, these parameters are set when the Table is created (when the +## Table->New constructor is called), but there are times when it needs to +## be done after-the-fact...) +## +sub New_Prop($$$@) +{ + my $Type = shift; ## "Is" or "In"; + my $Name = shift; + my $Table = shift; + + ## remaining args are optional key/val + my %Args = @_; + + my $Fuzzy = delete $Args{Fuzzy}; + my $Desc = delete $Args{Desc}; # description + + $Name = CanonicalName($Name) if $Fuzzy; + + ## sanity check a few args + if (%Args or ($Type ne 'Is' and $Type ne 'In') or not ref $Table) { + confess "$0: bad args to New_Prop" } - for my $i (0..$#$table-1) { - $first = defined $table->[$i ]->[1] ? - hex($table->[$i ]->[1]) : 0; - $last = defined $table->[$i + 1]->[0] ? - hex($table->[$i + 1]->[0]) : $first; - push @$inverse, [ sprintf("%04X", $first + 1), - sprintf("%04X", $last - 1) ] - unless $first + 1 == $last; + + if (not $TableInfo{$Type}->{$Name}) + { + $TableInfo{$Type}->{$Name} = $Table; + $TableDesc{$Type}->{$Name} = $Desc; + if ($Fuzzy) { + $FuzzyNames{$Type}->{$Name} = $Name; + } } - return $inverse; } -sub header { - my $fh = shift; - print $fh < Name -- Name of "In" property to be associated with +## Is => Name -- Name of "Is" property to be associated with +## Fuzzy => Boolean -- True if name can be accessed "fuzzily" +## Desc => String -- Description of the property +## +## No args are required. +## +sub Table::New +{ + my $class = shift; + my %Args = @_; + + my $Table = bless [], $class; + + my $Fuzzy = delete $Args{Fuzzy}; + my $Desc = delete $Args{Desc}; + + for my $Type ('Is', 'In') + { + if (my $Name = delete $Args{$Type}) { + New_Prop($Type => $Name, $Table, Desc => $Desc, Fuzzy => $Fuzzy); + } + } + + ## shouldn't have any left over + if (%Args) { + confess "$0: bad args to Table->New" + } + + return $Table; } -sub begin { - my $fh = shift; +## +## Returns true if the Table has no code points +## +sub Table::IsEmpty +{ + my $Table = shift; #self + return not @$Table; +} - print $fh <$file")) { - header($fh); - begin($fh); - for my $i (@$table) { - print $fh $i->[0], "\t", - $i->[1] ne $i->[0] ? $i->[1] : "", "\t", - defined $i->[2] ? $i->[2] : "", "\n"; - } - end($fh); - close($fh); - } else { - die "$0: $file: $!\n"; - } +## +## Returns the maximum code point currently in the table. +## +sub Table::Max +{ + my $Table = shift; #self + confess "oops" if $Table->IsEmpty; ## must have code points to have a max + return $Table->[-1]->[RANGE_END]; } -# -# The %In contains the mapping of the script/block name into a number. -# +## +## Replaces the codepoints in the Table with those in the Table given +## as an arg. (NOTE: this is not a "deep copy"). +## +sub Table::Replace($$) +{ + my $Table = shift; #self + my $New = shift; -my %In; -my $InId = 0; -my %InIn; + @$Table = @$New; +} -my %InScript; -my %InBlock; +## +## Given a new code point, make the last range of the Table extend to +## include the new (and all intervening) code points. +## +sub Table::Extend +{ + my $Table = shift; #self + my $codepoint = shift; -# -# Read in the Unicode.txt, the main Unicode database. -# + my $PrevMax = $Table->Max; -my %Cat; -my %General; -my @General; - -sub gencat { - my ($Name, $GeneralH, $GeneralA, $Cat, - $name, $cat, $code, $op) = @_; - - $op->($Name, $code, $name); - $op->($GeneralA, $code, $cat); - - $op->($GeneralH->{$name} ||= [], $code, $name); - - $op->($Cat->{$cat} ||= [], $code); - $op->($Cat->{substr($cat, 0, 1)} - ||= [], $code); - # 005F: SPACING UNDERSCORE - $op->($Cat->{Word} ||= [], $code) - if $cat =~ /^[LMN]/ || $code eq "005F"; - $op->($Cat->{Alnum} ||= [], $code) - if $cat =~ /^[LMN]/; - $op->($Cat->{Alpha} ||= [], $code) - if $cat =~ /^[LM]/; - # 0009: HORIZONTAL TABULATION - # 000A: LINE FEED - # 000B: VERTICAL TABULATION - # 000C: FORM FEED - # 000D: CARRIAGE RETURN - # 0020: SPACE - $op->($Cat->{Space} ||= [], $code) - if $cat =~ /^Z/ || - $code =~ /^(0009|000A|000B|000C|000D)$/; - $op->($Cat->{SpacePerl} ||= [], $code) - if $cat =~ /^Z/ || $code =~ /^(?:0085|2028|2029)$/ || - $code =~ /^(0009|000A|000C|000D)$/; - $op->($Cat->{Blank} ||= [], $code) - if $code =~ /^(0020|0009)$/ || - $cat =~ /^Z[^lp]$/; - $op->($Cat->{Digit} ||= [], $code) if $cat eq "Nd"; - $op->($Cat->{Upper} ||= [], $code) if $cat eq "Lu"; - $op->($Cat->{Lower} ||= [], $code) if $cat eq "Ll"; - $op->($Cat->{Title} ||= [], $code) if $cat eq "Lt"; - $op->($Cat->{ASCII} ||= [], $code) if $code le "007F"; - $op->($Cat->{Cntrl} ||= [], $code) if $cat =~ /^C/; - $op->($Cat->{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/; - $op->($Cat->{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/; - $op->($Cat->{Punct} ||= [], $code) if $cat =~ /^P/; - # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f - $op->($Cat->{XDigit} ||= [], $code) - if $code =~ /^00(3[0-9]|[46][1-6])$/; - -} - -if (open(my $Unicode, "Unicode.txt")) { - my @Name; - my @Bidi; - my %Bidi; - my @Comb; - my @Deco; - my %Deco; - my %DC; - my @Number; - my @Mirrored; - my %To; + confess "oops ($codepoint <= $PrevMax)" if $codepoint <= $PrevMax; + $Table->[-1]->[RANGE_END] = $codepoint; +} - my $LastCodeInt = -1; # a numeric, not a hexadecimal string. - - # UnicodeData-3.1.0.html says - # no characters in the file have the property, Cn, Not Assigned. - - sub check_no_characters { # in the scope of my $LastCodeInt; - my $code = shift; - my $diff_from_last = hex($code) - $LastCodeInt; - my $code_ini = sprintf("%04X", $LastCodeInt + 1); - $LastCodeInt = hex($code); - if ($diff_from_last == 1) { - return; - } elsif ($diff_from_last == 2) { - append($Cat{Cn} ||= [], $code_ini); - append($Cat{C} ||= [], $code_ini); - } else { - my $code_fin = sprintf("%04X", hex($code) - 1); - append_range($Cat{Cn} ||= [], $code_ini, $code_fin); - append_range($Cat{C} ||= [], $code_ini, $code_fin); - } +## +## Given a code point range start and end (and optional name), blindly +## append them to the list of ranges for the Table. +## +## NOTE: Code points must be added in strictly ascending numeric order. +## +sub Table::RawAppendRange +{ + my $Table = shift; #self + my $start = shift; + my $end = shift; + my $name = shift; + $name = "" if not defined $name; ## warning: $name can be "0" + + push @$Table, [ $start, # RANGE_START + $end, # RANGE_END + $name ]; # RANGE_NAME +} + +## +## Given a code point (and optional name), add it to the Table. +## +## NOTE: Code points must be added in strictly ascending numeric order. +## +sub Table::Append +{ + my $Table = shift; #self + my $codepoint = shift; + my $name = shift; + $name = "" if not defined $name; ## warning: $name can be "0" + + ## + ## If we've already got a range working, and this code point is the next + ## one in line, and if the name is the same, just extend the current range. + ## + if ($Table->NotEmpty + and + $Table->Max == $codepoint - 1 + and + $Table->[-1]->[RANGE_NAME] eq $name) + { + $Table->Extend($codepoint); } + else + { + $Table->RawAppendRange($codepoint, $codepoint, $name); + } +} - while (<$Unicode>) { - next unless /^[0-9A-Fa-f]+;/; - s/\s+$//; +## +## Given a code point range starting value and ending value (and name), +## Add the range to teh Table. +## +## NOTE: Code points must be added in strictly ascending numeric order. +## +sub Table::AppendRange +{ + my $Table = shift; #self + my $start = shift; + my $end = shift; + my $name = shift; + $name = "" if not defined $name; ## warning: $name can be "0" + + $Table->Append($start, $name); + $Table->Extend($end) if $end > $start; +} - my ($code, $name, $cat, $comb, $bidi, $deco, - $decimal, $digit, $number, - $mirrored, $unicode10, $comment, - $upper, $lower, $title) = split(/\s*;\s*/); - - if ($name =~ /^<(.+), (First|Last)>$/) { - if($2 eq 'First') { - check_no_characters($code); - } else { - $LastCodeInt = hex($code); - } - - $name = $1; - gencat(\@Name, \%General, \@General, \%Cat, - $name, $cat, $code, - $2 eq 'First' ? \&append : \&extend); - unless (defined $In{$name}) { - $In{$name} = $InId++; - $InIn{$name} = $General{$name}; - } - } else { - check_no_characters($code); - - gencat(\@Name, \%General, \@General, \%Cat, - $name, $cat, $code, \&append); - - append($To{Upper} ||= [], $code, $upper) if $upper; - append($To{Lower} ||= [], $code, $lower) if $lower; - append($To{Title} ||= [], $code, $title) if $title; - append($To{Digit} ||= [], $code, $decimal) if $decimal; - - append(\@Bidi, $code, $bidi); - append($Bidi{$bidi} ||= [], $code); - - append(\@Comb, $code, $comb) if $comb; - - if ($deco) { - append(\@Deco, $code, $deco); - if ($deco =~/^<(\w+)>/) { - append($Deco{Compat} ||= [], $code); - append($DC{$1} ||= [], $code); - } else { - append($Deco{Canon} ||= [], $code); - } - } - - append(\@Number, $code, $number) if $number; - - append(\@Mirrored, $code) if $mirrored eq "Y"; - } +## +## Return a new Table that represents all code points not in the Table. +## +sub Table::Invert +{ + my $Table = shift; #self + + my $New = Table->New(); + my $max = -1; + for my $range (@$Table) + { + my $start = $range->[RANGE_START]; + my $end = $range->[RANGE_END]; + if ($start-1 >= $max+1) { + $New->AppendRange($max+1, $start-1, ""); + } + $max = $end; + } + if ($max+1 < $LastUnicodeCodepoint) { + $New->AppendRange($max+1, $LastUnicodeCodepoint); + } + return $New; +} + +## +## Merges any number of other tables with $self, returning the new table. +## (existing tables are not modified) +## +## +## Args may be Tables, or individual code points (as integers). +## +## Can be called as either a constructor or a method. +## +sub Table::Merge +{ + shift(@_) if not ref $_[0]; ## if called as a constructor, lose the class + my @Tables = @_; + + ## Accumulate all records from all tables + my @Records; + for my $Arg (@Tables) + { + if (ref $Arg) { + ## arg is a table -- get its ranges + push @Records, @$Arg; + } else { + ## arg is a codepoint, make a range + push @Records, [ $Arg, $Arg ] + } } - check_no_characters(sprintf("%X", $LastUnicodeCodepoint + 1)); + ## sort by range start, with longer ranges coming first. + my ($first, @Rest) = sort { + ($a->[RANGE_START] <=> $b->[RANGE_START]) + or + ($b->[RANGE_END] <=> $b->[RANGE_END]) + } @Records; + + my $New = Table->New(); + + ## Ensuring the first range is there makes the subsequent loop easier + $New->AppendRange($first->[RANGE_START], + $first->[RANGE_END]); + + ## Fold in records so long as they add new information. + for my $set (@Rest) + { + my $start = $set->[RANGE_START]; + my $end = $set->[RANGE_END]; + if ($start > $New->Max) { + $New->AppendRange($start, $end); + } elsif ($end > $New->Max) { + $New->Extend($end); + } + } - flush(\@Name, "Name.pl"); + return $New; +} - foreach my $cat (sort keys %Cat) { - flush($Cat{$cat}, "Is/$cat.pl"); +## +## Given a filename, write a representation of the Table to a file. +## May have an optional comment as a 2nd arg. +## +sub Table::Write +{ + my $Table = shift; #self + my $filename = shift; + my $comment = shift; + + my @OUT = $HEADER; + if (defined $comment) { + $comment =~ s/\s+\Z//; + $comment =~ s/^/# /gm; + push @OUT, "#\n$comment\n#\n"; + } + push @OUT, "return <<'END';\n"; + + for my $set (@$Table) + { + my $start = $set->[RANGE_START]; + my $end = $set->[RANGE_END]; + my $name = $set->[RANGE_NAME]; + + if ($start == $end) { + push @OUT, sprintf "%04X\t\t%s\n", $start, $name; + } else { + push @OUT, sprintf "%04X\t%04X\t%s\n", $start, $end, $name; + } } - foreach my $to (sort keys %To) { - flush($To{$to}, "To/$to.pl"); + push @OUT, "END\n"; + + WriteIfChanged($filename, @OUT); +} + +## This used only for making the test script. +## helper function +sub IsUsable($) +{ + my $code = shift; + return 0 if $code <= 0x0000; ## don't use null + return 0 if $code >= $LastUnicodeCodepoint; ## keep in range + return 0 if ($code >= 0xD800 and $code <= 0xDFFF); ## no surrogates + return 0 if ($code >= 0xFDD0 and $code <= 0xFDEF); ## utf8.c says no good + return 0 if (($code & 0xFFFF) == 0xFFFE); ## utf8.c says no good + return 0 if (($code & 0xFFFF) == 0xFFFF); ## utf8.c says no good + return 1; +} + +## Return a code point that's part of the table. +## Returns nothing if the table is empty (or covers only surrogates). +## This used only for making the test script. +sub Table::ValidCode +{ + my $Table = shift; #self + for my $set (@$Table) { + return $set->[RANGE_END] if IsUsable($set->[RANGE_END]); } + return (); +} + +## Return a code point that's not part of the table +## Returns nothing if the table covers all code points. +## This used only for making the test script. +sub Table::InvalidCode +{ + my $Table = shift; #self + + return 0x1234 if $Table->IsEmpty(); + + for my $set (@$Table) + { + if (IsUsable($set->[RANGE_END] + 1)) + { + return $set->[RANGE_END] + 1; + } + + if (IsUsable($set->[RANGE_START] - 1)) + { + return $set->[RANGE_START] - 1; + } + } + return (); +} - flush(\@Bidi, "Bidirectional.pl"); - foreach my $bidi (sort keys %Bidi) { - flush($Bidi{$bidi}, "Is/Bidi$bidi.pl"); +########################################################################### +########################################################################### +########################################################################### + + +## +## Called like: +## New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 1); +## +## The args must be in that order, although the Fuzzy pair may be omitted. +## +## This creates 'IsAll' as an alias for 'IsAny' +## +sub New_Alias($$$@) +{ + my $Type = shift; ## "Is" or "In" + my $Alias = shift; + my $SameAs = shift; # expecting "SameAs" -- just ignored + my $Name = shift; + + ## remaining args are optional key/val + my %Args = @_; + + my $Fuzzy = delete $Args{Fuzzy}; + + ## sanity check a few args + if (%Args or ($Type ne 'Is' and $Type ne 'In') or $SameAs ne 'SameAs') { + confess "$0: bad args to New_Alias" } - flush(\@Comb, "CombiningClass.pl"); + $Alias = CanonicalName($Alias) if $Fuzzy; - flush(\@Deco, "Decomposition.pl"); - foreach my $deco (sort keys %Deco) { - flush($Deco{$deco}, "Is/Deco$deco.pl"); + if (not $TableInfo{$Type}->{$Name}) + { + my $CName = CanonicalName($Name); + if ($TableInfo{$Type}->{$CName}) { + confess "$0: Use canonical form '$CName' instead of '$Name' for alias."; + } else { + confess "$0: don't have original $Type => $Name to make alias\n"; + } } - foreach my $dc (sort keys %DC) { - flush($DC{$dc}, "Is/DC$dc.pl"); + if ($TableInfo{$Alias}) { + confess "$0: already have original $Type => $Alias; can't make alias"; + } + $AliasInfo{$Type}->{$Name} = $Alias; + if ($Fuzzy) { + $FuzzyNames{$Type}->{$Alias} = $Name; } - flush(\@Number, "Number.pl"); - - flush(\@Mirrored, "Is/Mirrored.pl"); -} else { - die "$0: Unicode.txt: $!\n"; } -# The general cateory can be written out already now. -flush(\@General, "Category.pl"); +## All assigned code points +my $Assigned = Table->New(Is => 'Assigned', + Desc => "All assigned code points", + Fuzzy => 0); -# -# Read in the LineBrk.txt. -# +my $Name = Table->New(); ## all characters, individually by name +my $General = Table->New(); ## all characters, grouped by category +my %General; +my %Cat; -if (open(my $LineBrk, "LineBrk.txt")) { - my @Lbrk; - my %Lbrk; +## +## Process UnicodeData.txt (Categories, etc.) +## +sub UnicodeData_Txt() +{ + my $Bidi = Table->New(); + my $Deco = Table->New(); + my $Comb = Table->New(); + my $Number = Table->New(); + my $Mirrored = Table->New();#Is => 'Mirrored', + #Desc => "Mirrored in bidirectional text", + #Fuzzy => 0); - while (<$LineBrk>) { - next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; + my %DC; + my %Bidi; + my %Number; + $DC{can} = Table->New(); + $DC{com} = Table->New(); + + ## Initialize Perl-generated categories + ## (Categories from UnicodeData.txt are auto-initialized in gencat) + $Cat{Alnum} = + Table->New(Is => 'Alnum', Desc => "[[:Alnum:]]", Fuzzy => 0); + $Cat{Alpha} = + Table->New(Is => 'Alpha', Desc => "[[:Alpha:]]", Fuzzy => 0); + $Cat{ASCII} = + Table->New(Is => 'ASCII', Desc => "[[:ASCII:]]", Fuzzy => 0); + $Cat{Blank} = + Table->New(Is => 'Blank', Desc => "[[:Blank:]]", Fuzzy => 0); + $Cat{Cntrl} = + Table->New(Is => 'Cntrl', Desc => "[[:Cntrl:]]", Fuzzy => 0); + $Cat{Digit} = + Table->New(Is => 'Digit', Desc => "[[:Digit:]]", Fuzzy => 0); + $Cat{Graph} = + Table->New(Is => 'Graph', Desc => "[[:Graph:]]", Fuzzy => 0); + $Cat{Lower} = + Table->New(Is => 'Lower', Desc => "[[:Lower:]]", Fuzzy => 0); + $Cat{Print} = + Table->New(Is => 'Print', Desc => "[[:Print:]]", Fuzzy => 0); + $Cat{Punct} = + Table->New(Is => 'Punct', Desc => "[[:Punct:]]", Fuzzy => 0); + $Cat{Space} = + Table->New(Is => 'Space', Desc => "[[:Space:]]", Fuzzy => 0); + $Cat{Title} = + Table->New(Is => 'Title', Desc => "[[:Title:]]", Fuzzy => 0); + $Cat{Upper} = + Table->New(Is => 'Upper', Desc => "[[:Upper:]]", Fuzzy => 0); + $Cat{XDigit} = + Table->New(Is => 'XDigit', Desc => "[[:XDigit:]]", Fuzzy => 0); + $Cat{Word} = + Table->New(Is => 'Word', Desc => "[[:Word:]]", Fuzzy => 0); + $Cat{SpacePerl} = + Table->New(Is => 'SpacePerl', Desc => '\s', Fuzzy => 0); - my ($first, $last, $lbrk) = ($1, $2, $3); + my %To; + $To{Upper} = Table->New(); + $To{Lower} = Table->New(); + $To{Title} = Table->New(); + $To{Digit} = Table->New(); + + sub gencat($$$$) + { + my ($name, ## Name ("LATIN CAPITAL LETTER A") + $cat, ## Category ("Lu", "Zp", "Nd", etc.) + $code, ## Code point (as an integer) + $op) = @_; + + my $MajorCat = substr($cat, 0, 1); ## L, M, Z, S, etc + + $Assigned->$op($code); + $Name->$op($code, $name); + $General->$op($code, $cat); + + ## add to the sub category (e.g. "Lu", "Nd", "Cf", ..) + $Cat{$cat} ||= Table->New(Is => $cat, + Desc => "General Category '$cat'", + Fuzzy => 0); + $Cat{$cat}->$op($code); + + ## add to the major category (e.g. "L", "N", "C", ...) + $Cat{$MajorCat} ||= Table->New(Is => $MajorCat, + Desc => "Major Category '$MajorCat'", + Fuzzy => 0); + $Cat{$MajorCat}->$op($code); + + ($General{$name} ||= Table->New)->$op($code, $name); + + # 005F: SPACING UNDERSCORE + $Cat{Word}->$op($code) if $cat =~ /^[LMN]|Pc/; + $Cat{Alnum}->$op($code) if $cat =~ /^[LM]|Nd/; + $Cat{Alpha}->$op($code) if $cat =~ /^[LM]/; + + my $isspace = + ($cat =~ /Zs|Zl|Zp/ && + $code != 0x200B) # 200B is ZWSP which is for line break control + # and therefore it is not part of "space" even while it is "Zs". + || $code == 0x0009 # 0009: HORIZONTAL TAB + || $code == 0x000A # 000A: LINE FEED + || $code == 0x000B # 000B: VERTICAL TAB + || $code == 0x000C # 000C: FORM FEED + || $code == 0x000D # 000D: CARRIAGE RETURN + || $code == 0x0085 # 0085: NEL + + ; + + $Cat{Space}->$op($code) if $isspace; + + $Cat{SpacePerl}->$op($code) if $isspace + && $code != 0x000B; # Backward compat. + + $Cat{Blank}->$op($code) if $isspace + && !($code == 0x000A || + $code == 0x000B || + $code == 0x000C || + $code == 0x000D || + $code == 0x0085 || + $cat =~ /^Z[lp]/); + + $Cat{Digit}->$op($code) if $cat eq "Nd"; + $Cat{Upper}->$op($code) if $cat eq "Lu"; + $Cat{Lower}->$op($code) if $cat eq "Ll"; + $Cat{Title}->$op($code) if $cat eq "Lt"; + $Cat{ASCII}->$op($code) if $code <= 0x007F; + $Cat{Cntrl}->$op($code) if $cat =~ /^C/; + my $isgraph = !$isspace && $cat !~ /Cc|Cs|Cn/; + $Cat{Graph}->$op($code) if $isgraph; + $Cat{Print}->$op($code) if $isgraph || $isspace; + $Cat{Punct}->$op($code) if $cat =~ /^P/; + + $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9 + || ($code >= 0x41 && $code <= 0x46) ## A..F + || ($code >= 0x61 && $code <= 0x66); ## a..f + } - append(\@Lbrk, $first, $lbrk); - append($Lbrk{$lbrk} ||= [], $first); - if (defined $last) { - extend(\@Lbrk, $last); - extend($Lbrk{$lbrk}, $last); - } + ## open ane read file..... + if (not open IN, "UnicodeData.txt") { + die "$0: UnicodeData.txt: $!\n"; } - flush(\@Lbrk, "Lbrk.pl"); - foreach my $lbrk (sort keys %Lbrk) { - flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl"); + ## + ## For building \p{_CombAbove} and \p{_CanonDCIJ} + ## + my %_Above_HexCodes; ## Hexcodes for chars with $comb == 230 ("ABOVE") + + my %CodeToDeco; ## Maps code to decomp. list for chars with first + ## decomp. char an "i" or "j" (for \p{_CanonDCIJ}) + + ## This is filled in as we go.... + my $CombAbove = Table->New(Is => '_CombAbove', + Desc => '(for internal casefolding use)', + Fuzzy => 0); + + while () + { + next unless /^[0-9A-Fa-f]+;/; + s/\s+$//; + + my ($hexcode, ## code point in hex (e.g. "0041") + $name, ## character name (e.g. "LATIN CAPITAL LETTER A") + $cat, ## category (e.g. "Lu") + $comb, ## Canonical combining class (e.t. "230") + $bidi, ## directional category (e.g. "L") + $deco, ## decomposition mapping + $decimal, ## decimal digit value + $digit, ## digit value + $number, ## numeric value + $mirrored, ## mirrored + $unicode10, ## name in Unicode 1.0 + $comment, ## comment field + $upper, ## uppercase mapping + $lower, ## lowercase mapping + $title, ## titlecase mapping + ) = split(/\s*;\s*/); + + # Note that in Unicode 3.2 there will be names like + # LINE FEED (LF), which probably means that \N{} needs + # to cope also with LINE FEED and LF. + $name = $unicode10 if $name eq '' && $unicode10 ne ''; + + my $code = hex($hexcode); + + if ($comb and $comb == 230) { + $CombAbove->Append($code); + $_Above_HexCodes{$hexcode} = 1; + } + + ## Used in building \p{_CanonDCIJ} + if ($deco and $deco =~ m/^006[9A]\b/) { + $CodeToDeco{$code} = $deco; + } + + ## + ## There are a few pairs of lines like: + ## AC00;;Lo;0;L;;;;;N;;;;; + ## D7A3;;Lo;0;L;;;;;N;;;;; + ## that define ranges. + ## + if ($name =~ /^<(.+), (First|Last)>$/) + { + $name = $1; + gencat($name, $cat, $code, $2 eq 'First' ? 'Append' : 'Extend'); + #New_Prop(In => $name, $General{$name}, Fuzzy => 1); + } + else + { + ## normal (single-character) lines + gencat($name, $cat, $code, 'Append'); + + # No Append() here since since several codes may map into one. + $To{Upper}->RawAppendRange($code, $code, $upper) if $upper; + $To{Lower}->RawAppendRange($code, $code, $lower) if $lower; + $To{Title}->RawAppendRange($code, $code, $title) if $title; + $To{Digit}->Append($code, $decimal) if length $decimal; + + $Bidi->Append($code, $bidi); + $Comb->Append($code, $comb) if $comb; + $Number->Append($code, $number) if length $number; + + length($decimal) and ($Number{De} ||= Table->New())->Append($code) + or + length($digit) and ($Number{Di} ||= Table->New())->Append($code) + or + length($number) and ($Number{Nu} ||= Table->New())->Append($code); + + $Mirrored->Append($code) if $mirrored eq "Y"; + + $Bidi{$bidi} ||= Table->New();#Is => "bt/$bidi", + #Desc => "Bi-directional category '$bidi'", + #Fuzzy => 0); + $Bidi{$bidi}->Append($code); + + if ($deco) + { + $Deco->Append($code, $deco); + if ($deco =~/^<(\w+)>/) + { + my $dshort = $PVA_reverse{dt}{ucfirst lc $1}; + $DC{com}->Append($code); + + $DC{$dshort} ||= Table->New(); + $DC{$dshort}->Append($code); + } + else + { + $DC{can}->Append($code); + } + } + } + } + close IN; + + ## + ## Tidy up a few special cases.... + ## + + $Cat{Cn} = $Assigned->Invert; ## Cn is everything that doesn't exist + New_Prop(Is => 'Cn', + $Cat{Cn}, + Desc => "General Category 'Cn' [not functional in Perl]", + Fuzzy => 0); + + ## Unassigned is the same as 'Cn' + New_Alias(Is => 'Unassigned', SameAs => 'Cn', Fuzzy => 0); + + $Cat{C}->Replace($Cat{C}->Merge($Cat{Cn})); ## Now merge in Cn into C + + + # LC is Ll, Lu, and Lt. + # (used to be L& or L_, but PropValueAliases.txt defines it as LC) + New_Prop(Is => 'LC', + Table->Merge(@Cat{qw[Ll Lu Lt]}), + Desc => '[\p{Ll}\p{Lu}\p{Lt}]', + Fuzzy => 0); + + ## Any and All are all code points. + my $Any = Table->New(Is => 'Any', + Desc => sprintf("[\\x{0000}-\\x{%X}]", + $LastUnicodeCodepoint), + Fuzzy => 0); + $Any->RawAppendRange(0, $LastUnicodeCodepoint); + + New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 0); + + ## + ## Build special properties for Perl's internal case-folding needs: + ## \p{_CaseIgnorable} + ## \p{_CanonDCIJ} + ## \p{_CombAbove} + ## _CombAbove was built above. Others are built here.... + ## + + ## \p{_CaseIgnorable} is [\p{Mn}\0x00AD\x2010] + New_Prop(Is => '_CaseIgnorable', + Table->Merge($Cat{Mn}, + 0x00AD, #SOFT HYPHEN + 0x2010), #HYPHEN + Desc => '(for internal casefolding use)', + Fuzzy => 0); + + + ## \p{_CanonDCIJ} is fairly complex... + my $CanonCDIJ = Table->New(Is => '_CanonDCIJ', + Desc => '(for internal casefolding use)', + Fuzzy => 0); + ## It contains the ASCII 'i' and 'j'.... + $CanonCDIJ->Append(0x0069); # ASCII ord("i") + $CanonCDIJ->Append(0x006A); # ASCII ord("j") + ## ...and any character with a decomposition that starts with either of + ## those code points, but only if the decomposition does not have any + ## combining character with the "ABOVE" canonical combining class. + for my $code (sort { $a <=> $b} keys %CodeToDeco) + { + ## Need to ensure that all decomposition characters do not have + ## a %HexCodeToComb in %AboveCombClasses. + my $want = 1; + for my $deco_hexcode (split / /, $CodeToDeco{$code}) + { + if (exists $_Above_HexCodes{$deco_hexcode}) { + ## one of the decmposition chars has an ABOVE combination + ## class, so we're not interested in this one + $want = 0; + last; + } + } + if ($want) { + $CanonCDIJ->Append($code); + } } -} else { - die "$0: LineBrk.txt: $!\n"; -} -# -# Read in the ArabShap.txt. -# -if (open(my $ArabShap, "ArabShap.txt")) { - my @ArabLink; - my @ArabLinkGroup; - while (<$ArabShap>) { - next unless /^[0-9A-Fa-f]+;/; - s/\s+$//; + ## + ## Now dump the files. + ## + $Name->Write("Name.pl"); - my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/); + # $Bidi->Write("Bidirectional.pl"); + mkdir("lib/bc", 0755); + for (keys %Bidi) { + $Bidi{$_}->Write( + "lib/bc/$_.pl", + "BidiClass category '$PropValueAlias{bc}{$_}'" + ); + } - append(\@ArabLink, $code, $link); - append(\@ArabLinkGroup, $code, $linkgroup); + $Comb->Write("CombiningClass.pl"); + mkdir("lib/ccc", 0755); + for (keys %{ $PropValueAlias{ccc} }) { + my ($code, $name) = @{ $PropValueAlias{ccc}{$_} }; + (my $c = Table->New())->Append($code); + $c->Write( + "lib/ccc/$_.pl", + "CombiningClass category '$name'" + ); } - flush(\@ArabLink, "ArabLink.pl"); - flush(\@ArabLinkGroup, "ArabLnkGrp.pl"); -} else { - die "$0: ArabShap.txt: $!\n"; -} + $Deco->Write("Decomposition.pl"); + mkdir("lib/dt", 0755); + for (keys %DC) { + $DC{$_}->Write( + "lib/dt/$_.pl", + "DecompositionType category '$PropValueAlias{dt}{$_}'" + ); + } -# -# Read in the Jamo.txt. -# + # $Number->Write("Number.pl"); + mkdir("lib/nt", 0755); + for (keys %Number) { + $Number{$_}->Write( + "lib/nt/$_.pl", + "NumericType category '$PropValueAlias{nt}{$_}'" + ); + } -if (open(my $Jamo, "Jamo.txt")) { - my @Short; + # $General->Write("Category.pl"); - while (<$Jamo>) { - next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/; + for my $to (sort keys %To) { + $To{$to}->Write("To/$to.pl"); + } - my ($code, $short) = ($1, $2); + for (keys %{ $PropValueAlias{gc} }) { + New_Alias(Is => $PropValueAlias{gc}{$_}, SameAs => $_, Fuzzy => 1); + } +} - append(\@Short, $code, $short); +## +## Process LineBreak.txt +## +sub LineBreak_Txt() +{ + if (not open IN, "LineBreak.txt") { + die "$0: LineBreak.txt: $!\n"; } - flush(\@Short, "JamoShort.pl"); -} else { - die "$0: Jamo.txt: $!\n"; -} + my $Lbrk = Table->New(); + my %Lbrk; -# -# Read in the Scripts.txt. -# + while () + { + next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; -my @Scripts; + my ($first, $last, $lbrk) = (hex($1), hex($2||""), $3); -if (open(my $Scripts, "Scripts.txt")) { - while (<$Scripts>) { - next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; + $Lbrk->Append($first, $lbrk); - # Wait until all the scripts have been read since - # they are not listed in numeric order. - push @Scripts, [ hex($1), $1, $2, $3 ]; + $Lbrk{$lbrk} ||= Table->New(); + $Lbrk{$lbrk}->Append($first); + + if ($last) { + $Lbrk->Extend($last); + $Lbrk{$lbrk}->Extend($last); + } } -} else { - die "$0: Scripts.txt: $!\n"; -} + close IN; -# Now append the scripts properties in their code point order. + # $Lbrk->Write("Lbrk.pl"); -my %Script; -my $Scripts = []; + mkdir("lib/lb", 0755); -for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) { - my ($code, $first, $last, $name) = @$script; - append($Scripts, $first, $name); - append($Script{$name} ||= [], $first, $name); - if (defined $last) { - extend($Scripts, $last); - extend($Script{$name}, $last); - } - unless (defined $In{$name}) { - $InScript{$InId} = $name; - $In{$name} = $InId++; - $InIn{$name} = $Script{$name}; + for (keys %Lbrk) { + $Lbrk{$_}->Write( + "lib/lb/$_.pl", + "Linebreak category '$PropValueAlias{lb}{$_}'" + ); } } -# Scripts.pl can be written out already now. +## +## Process ArabicShaping.txt. +## +sub ArabicShaping_txt() +{ + if (not open IN, "ArabicShaping.txt") { + die "$0: ArabicShaping.txt: $!\n"; + } + + my $ArabLink = Table->New(); + my $ArabLinkGroup = Table->New(); -flush(\@Scripts, "Scripts.pl"); + my %JoinType; -# Common is everything not explicitly assigned to a Script + while () + { + next unless /^[0-9A-Fa-f]+;/; + s/\s+$//; -$In{Common} = $InId++; -my $Common = inverse($Scripts); -$InIn{Common} = $Common; + my ($hexcode, $name, $link, $linkgroup) = split(/\s*;\s*/); + my $code = hex($hexcode); + $ArabLink->Append($code, $link); + $ArabLinkGroup->Append($code, $linkgroup); -# -# Read in the Blocks.txt. -# + $JoinType{$link} ||= Table->New(Is => "JoinType$link"); + $JoinType{$link}->Append($code); + } + close IN; -my @Blocks; -my %Blocks; - -if (open(my $Blocks, "Blocks.txt")) { - while (<$Blocks>) { - next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; - - my ($first, $last, $name) = ($1, $2, $3); - my $origname = $name; - - # If there's a naming conflict (the script names are - # in uppercase), the name of the block has " Block" - # appended to it. - my $pat = $name; - $pat =~ s/([- _])/(?:[-_]|\\s+)?/g; - for my $i (values %InScript) { - if ($i =~ /^$pat$/i) { - $name .= " Block"; - last; - } - } + # $ArabLink->Write("ArabLink.pl"); + # $ArabLinkGroup->Write("ArabLnkGrp.pl"); - append(\@Blocks, $first, $name); - append($Blocks{$name} ||= [], $first, $name); - if (defined $last) { - extend(\@Blocks, $last); - extend($Blocks{$name}, $last); - } - unless (defined $In{$name}) { - $InBlock{$InId} = $origname; - $In{$name} = $InId++; - $InIn{$name} = $Blocks{$name}; - } + mkdir("lib/jt", 0755); + + for (keys %JoinType) { + $JoinType{$_}->Write( + "lib/jt/$_.pl", + "JoiningType category '$PropValueAlias{jt}{$_}'" + ); } -} else { - die "$0: Blocks.txt: $!\n"; } -# Blocks.pl can be written out already now. +## +## Process EastAsianWidth.txt. +## +sub EastAsianWidth_txt() +{ + if (not open IN, "EastAsianWidth.txt") { + die "$0: EastAsianWidth.txt: $!\n"; + } -flush(\@Blocks, "Blocks.pl"); + my %EAW; -# -# Read in the PropList.txt. It contains extended properties not -# listed in the Unicode.txt, such as 'Other_Alphabetic': -# alphabetic but not of the general category L; many modifiers -# belong to this extended property category: while they are not -# alphabets, they are alphabetic in nature. -# + while () + { + next unless /^[0-9A-Fa-f]+;/; + s/#.*//; + s/\s+$//; -my @Props; + my ($hexcode, $pv) = split(/\s*;\s*/); + my $code = hex($hexcode); + $EAW{$pv} ||= Table->New(Is => "EastAsianWidth$pv"); + $EAW{$pv}->Append($code); + } + close IN; -if (open(my $Props, "PropList.txt")) { - while (<$Props>) { - next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; + mkdir("lib/ea", 0755); - # Wait until all the extended properties have been read since - # they are not listed in numeric order. - push @Props, [ hex($1), $1, $2, $3 ]; + for (keys %EAW) { + $EAW{$_}->Write( + "lib/ea/$_.pl", + "EastAsianWidth category '$PropValueAlias{ea}{$_}'" + ); } -} else { - die "$0: PropList.txt: $!\n"; } -# Now append the extended properties in their code point order. +## +## Process HangulSyllableType.txt. +## +sub HangulSyllableType_txt() +{ + if (not open IN, "HangulSyllableType.txt") { + die "$0: HangulSyllableType.txt: $!\n"; + } + + my %HST; + + while () + { + next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; + my ($first, $last, $pv) = (hex($1), hex($2||""), $3); -my %Prop; -my $Props = []; + $HST{$pv} ||= Table->New(Is => "HangulSyllableType$pv"); + $HST{$pv}->Append($first); -for my $prop (sort { $a->[0] <=> $b->[0] } @Props) { - my ($code, $first, $last, $name) = @$prop; - append($Props, $first, $name); - append($Prop{$name} ||= [], $first, $name); - if (defined $last) { - extend($Props, $last); - extend($Prop{$name}, $last); + if ($last) { $HST{$pv}->Extend($last) } } - unless (defined $In{$name}) { - $In{$name} = $InId++; - $InIn{$name} = $Prop{$name}; + close IN; + + mkdir("lib/hst", 0755); + + for (keys %HST) { + $HST{$_}->Write( + "lib/hst/$_.pl", + "HangulSyllableType category '$PropValueAlias{hst}{$_}'" + ); } } -# Assigned is everything not Cn +## +## Process Jamo.txt. +## +sub Jamo_txt() +{ + if (not open IN, "Jamo.txt") { + die "$0: Jamo.txt: $!\n"; + } + my $Short = Table->New(); -$In{Assigned} = $InId++; -my $Assigned = inverse($Cat{Cn}); -$InIn{Assigned} = $Assigned; + while () + { + next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/; + my ($code, $short) = (hex($1), $2); + + $Short->Append($code, $short); + } + close IN; + # $Short->Write("JamoShort.pl"); +} -# Unassigned is everything not Assigned +## +## Process Scripts.txt. +## +sub Scripts_txt() +{ + my @ScriptInfo; -$In{Unassigned} = $InId++; -my $Unassigned = $Cat{Cn}; -$InIn{Unassigned} = $Unassigned; + if (not open(IN, "Scripts.txt")) { + die "$0: Scripts.txt: $!\n"; + } + while () { + next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; -# Unassigned is everything not Assigned -sub merge_general_and_extended { - my ($name, $general, $extended) = @_; - my $merged; + # Wait until all the scripts have been read since + # they are not listed in numeric order. + push @ScriptInfo, [ hex($1), hex($2||""), $3 ]; + } + close IN; - push @$merged, - map { pop @{$_}; $_ } - sort { $a->[2] <=> $b->[2] } - map { [ $_->[0], $_->[1], hex($_->[0]) ] } - ($general ? - map { ref $_ ? @$_ : $_ } - @Cat {ref $general ? @$general : $general } : - (), - $extended ? - map { ref $_ ? @$_ : $_ } - @Prop{ref $extended ? @$extended : $extended} : - ()); + # Now append the scripts properties in their code point order. - $In{$name} = $InId++; - $InIn{$name} = $merged; - - return $merged; -} + my %Script; + my $Scripts = Table->New(); -# Alphabetic is L and Other_Alphabetic. + for my $script (sort { $a->[0] <=> $b->[0] } @ScriptInfo) + { + my ($first, $last, $name) = @$script; + $Scripts->Append($first, $name); -my $Alphabetic = - merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic'); + $Script{$name} ||= Table->New(Is => $name, + Desc => "Script '$name'", + Fuzzy => 1); + $Script{$name}->Append($first, $name); -# Lowercase is Ll and Other_Lowercase. + if ($last) { + $Scripts->Extend($last); + $Script{$name}->Extend($last); + } + } -my $Lowercase = - merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase'); + # $Scripts->Write("Scripts.pl"); -# Uppercase is Lu and Other_Uppercase. + ## Common is everything not explicitly assigned to a Script + ## + ## ***shouldn't this be intersected with \p{Assigned}? ****** + ## + New_Prop(Is => 'Common', + $Scripts->Invert, + Desc => 'Pseudo-Script of codepoints not in other Unicode scripts', + Fuzzy => 1); +} -my $Uppercase = - merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase'); +## +## Given a name like "Close Punctuation", return a regex (that when applied +## with /i) matches any valid form of that name (e.g. "ClosePunctuation", +## "Close-Punctuation", etc.) +## +## Accept any space, dash, or underbar where in the official name there is +## space or a dash (or underbar, but there never is). +## +## +sub NameToRegex($) +{ + my $Name = shift; + $Name =~ s/[- _]/(?:[-_]|\\s+)?/g; + return $Name; +} -# Math is Sm and Other_Math. +## +## Process Blocks.txt. +## +sub Blocks_txt() +{ + my $Blocks = Table->New(); + my %Blocks; -my $Math = - merge_general_and_extended('Math', 'Sm', 'Other_Math'); + if (not open IN, "Blocks.txt") { + die "$0: Blocks.txt: $!\n"; + } -# Lampersand is Ll, Lu, and Lt. + while () + { + #next if not /Private Use$/; + next if not /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; -my $Lampersand = - merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]); + my ($first, $last, $name) = (hex($1), hex($2), $3); -# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl. + $Blocks->Append($first, $name); -my $ID_Start = - merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]); + $Blocks{$name} ||= Table->New(In => $name, + Desc => "Block '$name'", + Fuzzy => 1); + $Blocks{$name}->Append($first, $name); -# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc. + if ($last and $last != $first) { + $Blocks->Extend($last); + $Blocks{$name}->Extend($last); + } + } + close IN; -my $ID_Continue = - merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl - Mn Mc Nd Pc) ]); + # $Blocks->Write("Blocks.pl"); +} -# -# Any is any. -# +## +## Read in the PropList.txt. It contains extended properties not +## listed in the UnicodeData.txt, such as 'Other_Alphabetic': +## alphabetic but not of the general category L; many modifiers +## belong to this extended property category: while they are not +## alphabets, they are alphabetic in nature. +## +sub PropList_txt() +{ + my @PropInfo; + + if (not open IN, "PropList.txt") { + die "$0: PropList.txt: $!\n"; + } -$In{Any} = $InId++; -my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ]; -$InIn{Any} = $Any; + while () + { + next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; -# -# All is any, too. -# + # Wait until all the extended properties have been read since + # they are not listed in numeric order. + push @PropInfo, [ hex($1), hex($2||""), $3 ]; + } + close IN; + + # Now append the extended properties in their code point order. + my $Props = Table->New(); + my %Prop; + + for my $prop (sort { $a->[0] <=> $b->[0] } @PropInfo) + { + my ($first, $last, $name) = @$prop; + $Props->Append($first, $name); + + $Prop{$name} ||= Table->New(Is => $name, + Desc => "Extended property '$name'", + Fuzzy => 1); + $Prop{$name}->Append($first, $name); + + if ($last) { + $Props->Extend($last); + $Prop{$name}->Extend($last); + } + } -$In{All} = $InId++; -$InIn{All} = $Any; + for (keys %Prop) { + (my $file = $PA_reverse{$_}) =~ tr/_//d; + $Prop{$_}->Write( + "lib/gc_sc/$file.pl", + "Binary property '$_'" + ); + } -# -# mapping() will be used to write out the In and Is virtual mappings. -# + # Alphabetic is L and Other_Alphabetic. + New_Prop(Is => 'Alphabetic', + Table->Merge($Cat{L}, $Prop{Other_Alphabetic}), + Desc => '[\p{L}\p{OtherAlphabetic}]', # use canonical names here + Fuzzy => 1); + + # Lowercase is Ll and Other_Lowercase. + New_Prop(Is => 'Lowercase', + Table->Merge($Cat{Ll}, $Prop{Other_Lowercase}), + Desc => '[\p{Ll}\p{OtherLowercase}]', # use canonical names here + Fuzzy => 1); + + # Uppercase is Lu and Other_Uppercase. + New_Prop(Is => 'Uppercase', + Table->Merge($Cat{Lu}, $Prop{Other_Uppercase}), + Desc => '[\p{Lu}\p{Other_Uppercase}]', # use canonical names here + Fuzzy => 1); + + # Math is Sm and Other_Math. + New_Prop(Is => 'Math', + Table->Merge($Cat{Sm}, $Prop{Other_Math}), + Desc => '[\p{Sm}\p{OtherMath}]', # use canonical names here + Fuzzy => 1); + + # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl. + New_Prop(Is => 'ID_Start', + Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl]}), + Desc => '[\p{Ll}\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]', + Fuzzy => 1); + + # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc. + New_Prop(Is => 'ID_Continue', + Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl Mn Mc Nd Pc ]}), + Desc => '[\p{ID_Start}\p{Mn}\p{Mc}\p{Nd}\p{Pc}]', + Fuzzy => 1); +} -sub mapping { - my ($map, $name) = @_; - - if (open(my $fh, ">$name.pl")) { - print "$name.pl\n"; - header($fh); - - # The %pat will hold a hash that maps the first two - # lowercased letters of a class to a 'fuzzified' regular - # expression that points to the real mapping. - - my %pat; - - # But first write out the offical name to real name - # (the filename) mapping. - - print $fh < '$map->{$i}',\n", "'$i'"; - } - print $fh < {\n"; - foreach my $ipat (@{$pat{$prefix}}) { - my ($i, $pat) = @$ipat; - print $fh "\t'$pat' => '$map->{$i}',\n"; - } - print $fh "},\n"; - } - print $fh <= 0.5) { + $new .= $BadSeps[rand(@BadSeps)]; + } else { + $new = $BadSeps[rand(@BadSeps)] . $new; + } + } + return $new; +} -if (open(my $In, ">>In.pl")) { - print $In <TestProp.pl") { + die "$0: TestProp.pl: $!\n"; + } + print OUT ; -%utf8::InScript = -( -EOT - for my $i (sort { $a <=> $b } keys %InScript) { - printf $In "%4d => '$InScript{$i}',\n", $i; + while (my ($Name, $Table) = each %ExactNameToTest) + { + GenTests(*OUT, $Name, $Table->ValidCode, $Table->InvalidCode); + ExpectError(*OUT, uc $Name) if uc $Name ne $Name; + ExpectError(*OUT, lc $Name) if lc $Name ne $Name; } - print $In < $b } keys %InBlock) { - printf $In "%4d => '$InBlock{$i}',\n", $i; - } - print $In < 1, + $Orig => 1, + RandomlyFuzzifyName($Orig) => 1 + ); -# -# Write out the real In mappings -# (the In.pl written out just above has the virtual In mappings) -# + for my $N (keys %Names) { + GenTests(*OUT, $N, $Table->ValidCode, $Table->InvalidCode); + } + + ExpectError(*OUT, RandomlyFuzzifyName($Orig, 'ERROR')); + } -foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) { - flush($InIn{$in}, "In/$In{$in}.pl"); + print OUT "Finished();\n"; + close OUT; } -# -# The mapping from General Category long forms to short forms is -# currently hardwired here since no simple data file in the UCD -# seems to do that. Unicode 3.2 will assumedly correct this. -# -my %Is = ( - 'Letter' => 'L', - 'Uppercase_Letter' => 'Lu', - 'Lowercase_Letter' => 'Ll', - 'Titlecase_Letter' => 'Lt', - 'Modifier_Letter' => 'Lm', - 'Other_Letter' => 'Lo', - - 'Mark' => 'M', - 'Non_Spacing_Mark' => 'Mn', - 'Spacing_Mark' => 'Mc', - 'Enclosing_Mark' => 'Me', - - 'Separator' => 'Z', - 'Space_Separator' => 'Zs', - 'Line_Separator' => 'Zl', - 'Paragraph_Separator' => 'Zp', - - 'Number' => 'N', - 'Decimal_Number' => 'Nd', - 'Letter_Number' => 'Nl', - 'Other_Number' => 'No', - - 'Punctuation' => 'P', - 'Connector_Punctuation' => 'Pc', - 'Dash_Punctuation' => 'Pd', - 'Open_Punctuation' => 'Ps', - 'Close_Punctuation' => 'Pe', - 'Initial_Punctuation' => 'Pi', - 'Final_Punctuation' => 'Pf', - 'Other_Punctuation' => 'Po', - - 'Symbol' => 'S', - 'Math_Symbol' => 'Sm', - 'Currency_Symbol' => 'Sc', - 'Modifier_Symbol' => 'Sk', - 'Other_Symbol' => 'So', - - 'Other' => 'C', - 'Control' => 'Cc', - 'Format' => 'Cf', - 'Surrogate' => 'Cs', - 'Private Use' => 'Co', - 'Unassigned' => 'Cn', -); +## +## These are used only in: +## RegisterFileForName() +## WriteAllMappings() +## +my %Exact; ## will become %utf8::Exact; +my %Canonical; ## will become %utf8::Canonical; +my %CaComment; ## Comment for %Canonical entry of same key + +## +## Given info about a name and a datafile that it should be associated with, +## register that assocation in %Exact and %Canonical. +sub RegisterFileForName($$$$) +{ + my $Type = shift; + my $Name = shift; + my $IsFuzzy = shift; + my $filename = shift; + + ## + ## Now in details for the mapping. $Type eq 'Is' has the + ## Is removed, as it will be removed in utf8_heavy when this + ## data is being checked. In keeps its "In", but a second + ## sans-In record is written if it doesn't conflict with + ## anything already there. + ## + if (not $IsFuzzy) + { + if ($Type eq 'Is') { + die "oops[$Name]" if $Exact{$Name}; + $Exact{$Name} = $filename; + } else { + die "oops[$Type$Name]" if $Exact{"$Type$Name"}; + $Exact{"$Type$Name"} = $filename; + $Exact{$Name} = $filename if not $Exact{$Name}; + } + } + else + { + my $CName = lc $Name; + if ($Type eq 'Is') { + die "oops[$CName]" if $Canonical{$CName}; + $Canonical{$CName} = $filename; + $CaComment{$CName} = $Name if $Name =~ tr/A-Z// >= 2; + } else { + die "oops[$Type$CName]" if $Canonical{lc "$Type$CName"}; + $Canonical{lc "$Type$CName"} = $filename; + $CaComment{lc "$Type$CName"} = "$Type$Name"; + if (not $Canonical{$CName}) { + $Canonical{$CName} = $filename; + $CaComment{$CName} = "$Type$Name"; + } + } + } +} -# -# Write out the virtual Is mappings. -# +## +## Writes the info accumulated in +## +## %TableInfo; +## %FuzzyNames; +## %AliasInfo; +## +## +sub WriteAllMappings() +{ + my @MAP; + + my %BaseNames; ## Base names already used (for avoiding 8.3 conflicts) + + ## 'Is' *MUST* come first, so its names have precidence over 'In's + for my $Type ('Is', 'In') + { + my %RawNameToFile; ## a per-$Type cache + + for my $Name (sort {length $a <=> length $b} keys %{$TableInfo{$Type}}) + { + ## Note: $Name is already canonical + my $Table = $TableInfo{$Type}->{$Name}; + my $IsFuzzy = $FuzzyNames{$Type}->{$Name}; + + ## Need an 8.3 safe filename (which means "an 8 safe" $filename) + my $filename; + { + ## 'Is' items lose 'Is' from the basename. + $filename = $Type eq 'Is' ? + ($PVA_reverse{sc}{$Name} || $Name) : + "$Type$Name"; + + $filename =~ s/[^\w_]+/_/g; # "L&" -> "L_" + substr($filename, 8) = '' if length($filename) > 8; + + ## + ## Make sure the basename doesn't conflict with something we + ## might have already written. If we have, say, + ## InGreekExtended1 + ## InGreekExtended2 + ## they become + ## InGreekE + ## InGreek2 + ## + while (my $num = $BaseNames{lc $filename}++) + { + $num++; ## so basenames with numbers start with '2', which + ## just looks more natural. + ## Want to append $num, but if it'll make the basename longer + ## than 8 characters, pre-truncate $filename so that the result + ## is acceptable. + my $delta = length($filename) + length($num) - 8; + if ($delta > 0) { + substr($filename, -$delta) = $num; + } else { + $filename .= $num; + } + } + }; + + ## + ## Construct a nice comment to add to the file, and build data + ## for the "./Properties" file along the way. + ## + my $Comment; + { + my $Desc = $TableDesc{$Type}->{$Name} || ""; + ## get list of names this table is reference by + my @Supported = $Name; + while (my ($Orig, $Alias) = each %{ $AliasInfo{$Type} }) + { + if ($Orig eq $Name) { + push @Supported, $Alias; + } + } + + my $TypeToShow = $Type eq 'Is' ? "" : $Type; + my $OrigProp; + + $Comment = "This file supports:\n"; + for my $N (@Supported) + { + my $IsFuzzy = $FuzzyNames{$Type}->{$N}; + my $Prop = "\\p{$TypeToShow$Name}"; + $OrigProp = $Prop if not $OrigProp; #cache for aliases + if ($IsFuzzy) { + $Comment .= "\t$Prop (and fuzzy permutations)\n"; + } else { + $Comment .= "\t$Prop\n"; + } + my $MyDesc = ($N eq $Name) ? $Desc : "Alias for $OrigProp ($Desc)"; + + push @MAP, sprintf("%s %-42s %s\n", + $IsFuzzy ? '*' : ' ', $Prop, $MyDesc); + } + if ($Desc) { + $Comment .= "\nMeaning: $Desc\n"; + } + + } + ## + ## Okay, write the file... + ## + $Table->Write("lib/gc_sc/$filename.pl", $Comment); + + ## and register it + $RawNameToFile{$Name} = $filename; + RegisterFileForName($Type => $Name, $IsFuzzy, $filename); + + if ($IsFuzzy) + { + my $CName = CanonicalName($Type . '_'. $Name); + $FuzzyNameToTest{$Name} = $Table if !$FuzzyNameToTest{$Name}; + $FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName}; + } else { + $ExactNameToTest{$Name} = $Table; + } + + } + + ## Register aliase info + for my $Name (sort {length $a <=> length $b} keys %{$AliasInfo{$Type}}) + { + my $Alias = $AliasInfo{$Type}->{$Name}; + my $IsFuzzy = $FuzzyNames{$Type}->{$Alias}; + my $filename = $RawNameToFile{$Name}; + die "oops [$Alias]->[$Name]" if not $filename; + RegisterFileForName($Type => $Alias, $IsFuzzy, $filename); + + my $Table = $TableInfo{$Type}->{$Name}; + die "oops" if not $Table; + if ($IsFuzzy) + { + my $CName = CanonicalName($Type .'_'. $Alias); + $FuzzyNameToTest{$Alias} = $Table if !$FuzzyNameToTest{$Alias}; + $FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName}; + } else { + $ExactNameToTest{$Alias} = $Table; + } + } + } -mapping(\%Is, "Is"); + ## + ## Write out the property list + ## + { + my @OUT = ( + "##\n", + "## This file created by $0\n", + "## List of built-in \\p{...}/\\P{...} properties.\n", + "##\n", + "## '*' means name may be 'fuzzy'\n", + "##\n\n", + sort { substr($a,2) cmp substr($b, 2) } @MAP, + ); + WriteIfChanged('Properties', @OUT); + } -# -# Read in the special cases. -# + use Text::Tabs (); ## using this makes the files about half the size + + ## Write Exact.pl + { + my @OUT = ( + $HEADER, + "##\n", + "## Data in this file used by ../utf8_heavy.pl\n", + "##\n\n", + "## Mapping from name to filename in ./lib/gc_sc\n", + "%utf8::Exact = (\n", + ); + + $Exact{InGreek} = 'InGreekA'; # this is evil kludge + for my $Name (sort keys %Exact) + { + my $File = $Exact{$Name}; + $Name = $Name =~ m/\W/ ? qq/'$Name'/ : " $Name "; + my $Text = sprintf("%-15s => %s,\n", $Name, qq/'$File'/); + push @OUT, Text::Tabs::unexpand($Text); + } + push @OUT, ");\n1;\n"; + + WriteIfChanged('Exact.pl', @OUT); + } -my %Case; + ## Write Canonical.pl + { + my @OUT = ( + $HEADER, + "##\n", + "## Data in this file used by ../utf8_heavy.pl\n", + "##\n\n", + "## Mapping from lc(canonical name) to filename in ./lib\n", + "%utf8::Canonical = (\n", + ); + my $Trail = ""; ## used just to keep the spacing pretty + for my $Name (sort keys %Canonical) + { + my $File = $Canonical{$Name}; + if ($CaComment{$Name}) { + push @OUT, "\n" if not $Trail; + push @OUT, " # $CaComment{$Name}\n"; + $Trail = "\n"; + } else { + $Trail = ""; + } + $Name = $Name =~ m/\W/ ? qq/'$Name'/ : " $Name "; + my $Text = sprintf(" %-41s => %s,\n$Trail", $Name, qq/'$File'/); + push @OUT, Text::Tabs::unexpand($Text); + } + push @OUT, ");\n1\n"; + WriteIfChanged('Canonical.pl', @OUT); + } -if (open(my $SpecCase, "SpecCase.txt")) { - while (<$SpecCase>) { - next unless /^[0-9A-Fa-f]+;/; - s/\#.*//; - s/\s+$//; + MakePropTestScript() if $MakeTestScript; +} - my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/); - if ($condition) { # not implemented yet - print "# SKIPPING $_\n"; - next; - } +sub SpecialCasing_txt() +{ + # + # Read in the special cases. + # - # Wait until all the special cases have been read since - # they are not listed in numeric order. - my $ix = hex($code); - push @{$Case{Lower}}, [ $ix, $code, $lower ]; - push @{$Case{Title}}, [ $ix, $code, $title ]; - push @{$Case{Upper}}, [ $ix, $code, $upper ]; - } -} else { - die "$0: SpecCase.txt: $!\n"; -} - -# Now write out the special cases properties in their code point order. -# Prepend them to the To/{Upper,Lower,Title}.pl. - -for my $case (qw(Lower Title Upper)) { - my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n"; - if (open(my $Case, ">To/$case.pl")) { - header($Case); - print $Case <[0] <=> $b->[0] } @{$Case{$case}}) { - my ($ix, $code, $to) = @$prop; - my $tostr = - join "", map { sprintf "\\x{%s}", $_ } split ' ', $to; - printf $Case qq['%04X' => "$tostr",\n], $ix; - } - print $Case <) { + next unless /^[0-9A-Fa-f]+;/; + s/\#.*//; + s/\s+$//; + + my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/); + + if ($condition) { # not implemented yet + print "# SKIPPING $_\n" if $Verbose; + next; + } + + # Wait until all the special cases have been read since + # they are not listed in numeric order. + my $ix = hex($code); + push @{$CaseInfo{Lower}}, [ $ix, $code, $lower ] + unless $code eq $lower; + push @{$CaseInfo{Title}}, [ $ix, $code, $title ] + unless $code eq $title; + push @{$CaseInfo{Upper}}, [ $ix, $code, $upper ] + unless $code eq $upper; + } + close IN; + + # Now write out the special cases properties in their code point order. + # Prepend them to the To/{Upper,Lower,Title}.pl. + + for my $case (qw(Lower Title Upper)) + { + my $NormalCase = do "To/$case.pl" || die "$0: $@\n"; + + my @OUT = + ( + $HEADER, "\n", + "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n", + "%utf8::ToSpec$case =\n(\n", + ); + + for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) { + my ($ix, $code, $to) = @$prop; + my $tostr = + join "", map { sprintf "\\x{%s}", $_ } split ' ', $to; + push @OUT, sprintf qq["%s" => "$tostr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $ix))); + # Remove any single-character mappings for + # the same character since we are going for + # the special casing rules. + $NormalCase =~ s/^$code\t\t\w+\n//m; + } + push @OUT, ( + ");\n\n", + "return <<'END';\n", + $NormalCase, + "END\n" + ); + WriteIfChanged("To/$case.pl", @OUT); } } # # Read in the case foldings. # -# We will do full case folding, C + F + I (see CaseFold.txt). +# We will do full case folding, C + F + I (see CaseFolding.txt). # +sub CaseFolding_txt() +{ + if (not open IN, "CaseFolding.txt") { + die "$0: CaseFolding.txt: $!\n"; + } -if (open(my $CaseFold, "CaseFold.txt")) { - my @Fold; + my $Fold = Table->New(); my %Fold; - while (<$CaseFold>) { + while () { # Skip status 'S', simple case folding next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/; - my ($code, $status, $fold) = ($1, $2, $3); + my ($code, $status, $fold) = (hex($1), $2, $3); if ($status eq 'C') { # Common: one-to-one folding # No append() since several codes may fold into one. - push @Fold, [ $code, $code, $fold ]; + $Fold->RawAppendRange($code, $code, $fold); } else { # F: full, or I: dotted uppercase I -> dotless lowercase I - $Fold{hex($code)} = $fold; + $Fold{$code} = $fold; } } + close IN; - flush(\@Fold, "To/Fold.pl"); + $Fold->Write("To/Fold.pl"); # # Prepend the special foldings to the common foldings. # - my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n"; - if (open(my $Fold, ">To/Fold.pl")) { - header($Fold); - print $Fold < $b } keys %Fold) { - my $foldstr = - join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code}; - printf $Fold qq['%04X' => "$foldstr",\n], $code; - } - print $Fold < $b } keys %Fold) { + my $foldstr = + join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code}; + push @OUT, sprintf qq["%s" => "$foldstr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $code))); } -} else { - die "$0: CaseFold.txt: $!\n"; + push @OUT, ( + ");\n\n", + "return <<'END';\n", + $CommonFold, + "END\n", + ); + + WriteIfChanged("To/Fold.pl", @OUT); } -# That's all, folks! +## Do it.... +Build_Aliases(); +UnicodeData_Txt(); +PropList_txt(); + +Scripts_txt(); +Blocks_txt(); + +WriteAllMappings(); + +LineBreak_Txt(); +ArabicShaping_txt(); +EastAsianWidth_txt(); +HangulSyllableType_txt(); +Jamo_txt(); +SpecialCasing_txt(); +CaseFolding_txt(); + +exit(0); + +## TRAILING CODE IS USED BY MakePropTestScript() +__DATA__ +use strict; +use warnings; + +my $Tests = 0; +my $Fails = 0; + +sub Expect($$$) +{ + my $Expect = shift; + my $String = shift; + my $Regex = shift; + my $Line = (caller)[2]; + + $Tests++; + my $RegObj; + my $result = eval { + $RegObj = qr/$Regex/; + $String =~ $RegObj ? 1 : 0 + }; + + if (not defined $result) { + print "couldn't compile /$Regex/ on $0 line $Line: $@\n"; + $Fails++; + } elsif ($result ^ $Expect) { + print "bad result (expected $Expect) on $0 line $Line: $@\n"; + $Fails++; + } +} + +sub Error($) +{ + my $Regex = shift; + $Tests++; + if (eval { 'x' =~ qr/$Regex/; 1 }) { + $Fails++; + my $Line = (caller)[2]; + print "expected error for /$Regex/ on $0 line $Line: $@\n"; + } +} + +sub Finished() +{ + if ($Fails == 0) { + print "All $Tests tests passed.\n"; + exit(0); + } else { + print "$Tests tests, $Fails failed!\n"; + exit(-1); + } +}