#!/usr/bin/perl -w
-
-#
-# mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
-# from the Unicode database files (lib/unicore/*.txt).
-#
-
use strict;
-
-my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
+use Carp;
+##
+## mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
+## from the Unicode database files (lib/unicore/*.txt).
+##
mkdir("In", 0755);
mkdir("Is", 0755);
mkdir("To", 0755);
-sub extend {
- my ($table, $last) = @_;
-
- $table->[-1]->[1] = $last;
-}
-
-sub append {
- my ($table, $code, $name) = @_;
- if (@$table &&
- hex($table->[-1]->[1]) == hex($code) - 1 &&
- (!defined $name || $table->[-1]->[2] eq $name)) {
- extend($table, $code);
+##
+## Process any args.
+##
+my $Verbose = 0;
+
+while (@ARGV)
+{
+ my $arg = shift @ARGV;
+ if ($arg eq '-v') {
+ $Verbose = 1;
+ } elsif ($arg eq '-q') {
+ $Verbose = 0;
} else {
- push @$table, [$code, $code, $name];
+ die "usage: $0 [-v|-q]";
}
}
-sub append_range {
- my ($table, $code_ini, $code_fin, $name) = @_;
- append($table, $code_ini, $name);
- extend($table, $code_fin);
-}
-
-sub inverse {
- my ($table) = @_;
- my $inverse = [];
- my ($first, $last);
- if ($table->[0]->[0]) {
- $last = hex($table->[0]->[0]);
- push @$inverse, [ "0000",
- sprintf("%04X", $last - 1) ];
- }
- for my $i (0..$#$table-1) {
- $first = defined $table->[$i ]->[1] ?
- hex($table->[$i ]->[1]) : 0;
- $last = defined $table->[$i + 1]->[0] ?
- hex($table->[$i + 1]->[0]) : $first;
- push @$inverse, [ sprintf("%04X", $first + 1),
- sprintf("%04X", $last - 1) ]
- unless $first + 1 == $last;
- }
- return $inverse;
-}
-
-sub header {
- my $fh = shift;
+my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
- print $fh <<EOT;
+my $now = localtime;
+my $HEADER=<<"EOF";
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by $0 from e.g. Unicode.txt.
# Any changes made here will be lost!
-EOT
+# Built $now.
+
+EOF
+
+##
+## The main datastructure (a "Table") represents a set of code points that
+## are part of a particular quality (that are part of \pL, \p{InGreek},
+## etc.). They are kept as ranges of code points (starting and ending of
+## each range).
+##
+## For example, a range ASCII LETTERS would be represented as:
+## [ [ 0x41 => 0x5A, 'UPPER' ],
+## [ 0x61 => 0x7A, 'LOWER, ] ]
+##
+sub RANGE_START() { 0 } ## index into range element
+sub RANGE_END() { 1 } ## index into range element
+sub RANGE_NAME() { 2 } ## index into range element
+
+my %TableInfo;
+my %FuzzyNames;
+my %AliasInfo;
+
+##
+## Associates a property ("Greek", "Lu", "Assigned",...) with a Table.
+##
+## Called like:
+## New_Prop(In => 'Greek', $Table, AllowFuzzy => 1);
+##
+## Normally, these parameters are set when the Table is created (when the
+## Table->New constructor is called), but there are times when it needs to
+## be done after-the-fact...)
+##
+sub New_Prop($$$@)
+{
+ my $Type = shift; ## "Is" or "In";
+ my $Name = shift;
+ my $Table = shift;
+
+ ## remaining args are optional key/val
+ my %Args = @_;
+
+ my $AllowFuzzy = delete $Args{AllowFuzzy};
+
+ ## sanity check a few args
+ if (%Args or ($Type ne 'Is' and $Type ne 'In') or not ref $Table) {
+ confess "$0: bad args to New_Prop"
+ }
+
+ if (not $TableInfo{$Type}->{$Name})
+ {
+ $TableInfo{$Type}->{$Name} = $Table;
+ if ($AllowFuzzy) {
+ $FuzzyNames{$Type}->{$Name} = $Name;
+ }
+ }
}
-sub begin {
- my $fh = shift;
- print $fh <<EOT;
-return <<'END';
-EOT
+##
+## Creates a new Table object.
+##
+## Args are key/value pairs:
+## In => Name -- Name of "In" property to be associated with
+## Is => Name -- Name of "Is" property to be associated with
+## AllowFuzzy => Boolean -- True if name can be accessed "fuzzily"
+##
+## No args are required.
+##
+sub Table::New
+{
+ my $class = shift;
+ my %Args = @_;
+
+ my $Table = bless [], $class;
+
+ my $AllowFuzzy = delete $Args{AllowFuzzy};
+
+ for my $Type ('Is', 'In')
+ {
+ if (my $Name = delete $Args{$Type}) {
+ New_Prop($Type => $Name, $Table, AllowFuzzy => $AllowFuzzy);
+ }
+ }
+
+ ## shouldn't have any left over
+ if (%Args) {
+ confess "$0: bad args to Table->New"
+ }
+
+ return $Table;
}
-sub end {
- my $fh = shift;
-
- print $fh <<EOT;
-END
-EOT
+##
+## Returns true if the Table has no code points
+##
+sub Table::IsEmpty
+{
+ my $Table = shift; #self
+ return not @$Table;
}
-sub flush {
- my ($table, $file) = @_;
- print "$file\n";
- if (open(my $fh, ">$file")) {
- header($fh);
- begin($fh);
- for my $i (@$table) {
- print $fh $i->[0], "\t",
- $i->[1] ne $i->[0] ? $i->[1] : "", "\t",
- defined $i->[2] ? $i->[2] : "", "\n";
- }
- end($fh);
- close($fh);
- } else {
- die "$0: $file: $!\n";
- }
+##
+## Returns true if the Table has code points
+##
+sub Table::NotEmpty
+{
+ my $Table = shift; #self
+ return @$Table;
}
-#
-# The %In contains the mapping of the script/block name into a number.
-#
+##
+## Returns the maximum code point currently in the table.
+##
+sub Table::Max
+{
+ my $Table = shift; #self
+ confess "oops" if $Table->IsEmpty; ## must have code points to have a max
+ return $Table->[-1]->[RANGE_END];
+}
-my %In;
-my $InId = 0;
-my %InIn;
+##
+## Replaces the codepoints in the Table with those in the Table given
+## as an arg. (NOTE: this is not a "deep copy").
+##
+sub Table::Replace($$)
+{
+ my $Table = shift; #self
+ my $New = shift;
-my %InScript;
-my %InBlock;
+ @$Table = @$New;
+}
-#
-# Read in the Unicode.txt, the main Unicode database.
-#
+##
+## Given a new code point, make the last range of the Table extend to
+## include the new (and all intervening) code points.
+##
+sub Table::Extend
+{
+ my $Table = shift; #self
+ my $codepoint = shift;
-my %Cat;
-my %General;
-my @General;
-
-sub gencat {
- my ($Name, $GeneralH, $GeneralA, $Cat,
- $name, $cat, $code, $op) = @_;
-
- $op->($Name, $code, $name);
- $op->($GeneralA, $code, $cat);
-
- $op->($GeneralH->{$name} ||= [], $code, $name);
-
- $op->($Cat->{$cat} ||= [], $code);
- $op->($Cat->{substr($cat, 0, 1)}
- ||= [], $code);
- # 005F: SPACING UNDERSCORE
- $op->($Cat->{Word} ||= [], $code)
- if $cat =~ /^[LMN]/ || $code eq "005F";
- $op->($Cat->{Alnum} ||= [], $code)
- if $cat =~ /^[LMN]/;
- $op->($Cat->{Alpha} ||= [], $code)
- if $cat =~ /^[LM]/;
- # 0009: HORIZONTAL TABULATION
- # 000A: LINE FEED
- # 000B: VERTICAL TABULATION
- # 000C: FORM FEED
- # 000D: CARRIAGE RETURN
- # 0020: SPACE
- $op->($Cat->{Space} ||= [], $code)
- if $cat =~ /^Z/ ||
- $code =~ /^(0009|000A|000B|000C|000D)$/;
- $op->($Cat->{SpacePerl} ||= [], $code)
- if $cat =~ /^Z/ || $code =~ /^(?:0085|2028|2029)$/ ||
- $code =~ /^(0009|000A|000C|000D)$/;
- $op->($Cat->{Blank} ||= [], $code)
- if $code =~ /^(0020|0009)$/ ||
- $cat =~ /^Z[^lp]$/;
- $op->($Cat->{Digit} ||= [], $code) if $cat eq "Nd";
- $op->($Cat->{Upper} ||= [], $code) if $cat eq "Lu";
- $op->($Cat->{Lower} ||= [], $code) if $cat eq "Ll";
- $op->($Cat->{Title} ||= [], $code) if $cat eq "Lt";
- $op->($Cat->{ASCII} ||= [], $code) if $code le "007F";
- $op->($Cat->{Cntrl} ||= [], $code) if $cat =~ /^C/;
- $op->($Cat->{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
- $op->($Cat->{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
- $op->($Cat->{Punct} ||= [], $code) if $cat =~ /^P/;
- # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
- $op->($Cat->{XDigit} ||= [], $code)
- if $code =~ /^00(3[0-9]|[46][1-6])$/;
+ my $PrevMax = $Table->Max;
-}
+ confess "oops ($codepoint <= $PrevMax)" if $codepoint <= $PrevMax;
-if (open(my $Unicode, "Unicode.txt")) {
- my @Name;
- my @Bidi;
- my %Bidi;
- my @Comb;
- my @Deco;
- my %Deco;
- my %DC;
- my @Number;
- my @Mirrored;
- my %To;
+ $Table->[-1]->[RANGE_END] = $codepoint;
+}
+##
+## Given a code point range start and end (and optional name), blindly
+## append them to the list of ranges for the Table.
+##
+## NOTE: Code points must be added in strictly ascending numeric order.
+##
+sub Table::RawAppendRange
+{
+ my $Table = shift; #self
+ my $start = shift;
+ my $end = shift;
+ my $name = shift;
+ $name = "" if not defined $name; ## warning: $name can be "0"
+
+ push @$Table, [ $start, # RANGE_START
+ $end, # RANGE_END
+ $name ]; # RANGE_NAME
+}
- my $LastCodeInt = -1; # a numeric, not a hexadecimal string.
-
- # UnicodeData-3.1.0.html says
- # no characters in the file have the property, Cn, Not Assigned.
-
- sub check_no_characters { # in the scope of my $LastCodeInt;
- my $code = shift;
- my $diff_from_last = hex($code) - $LastCodeInt;
- my $code_ini = sprintf("%04X", $LastCodeInt + 1);
- $LastCodeInt = hex($code);
- if ($diff_from_last == 1) {
- return;
- } elsif ($diff_from_last == 2) {
- append($Cat{Cn} ||= [], $code_ini);
- append($Cat{C} ||= [], $code_ini);
- } else {
- my $code_fin = sprintf("%04X", hex($code) - 1);
- append_range($Cat{Cn} ||= [], $code_ini, $code_fin);
- append_range($Cat{C} ||= [], $code_ini, $code_fin);
- }
+##
+## Given a code point (and optional name), add it to the Table.
+##
+## NOTE: Code points must be added in strictly ascending numeric order.
+##
+sub Table::Append
+{
+ my $Table = shift; #self
+ my $codepoint = shift;
+ my $name = shift;
+ $name = "" if not defined $name; ## warning: $name can be "0"
+
+ ##
+ ## If we've already got a range working, and this code point is the next
+ ## one in line, and if the name is the same, just extend the current range.
+ ##
+ if ($Table->NotEmpty
+ and
+ $Table->Max == $codepoint - 1
+ and
+ $Table->[-1]->[RANGE_NAME] eq $name)
+ {
+ $Table->Extend($codepoint);
}
-
- while (<$Unicode>) {
- next unless /^[0-9A-Fa-f]+;/;
- s/\s+$//;
-
- my ($code, $name, $cat, $comb, $bidi, $deco,
- $decimal, $digit, $number,
- $mirrored, $unicode10, $comment,
- $upper, $lower, $title) = split(/\s*;\s*/);
-
- if ($name =~ /^<(.+), (First|Last)>$/) {
- if($2 eq 'First') {
- check_no_characters($code);
- } else {
- $LastCodeInt = hex($code);
- }
-
- $name = $1;
- gencat(\@Name, \%General, \@General, \%Cat,
- $name, $cat, $code,
- $2 eq 'First' ? \&append : \&extend);
- unless (defined $In{$name}) {
- $In{$name} = $InId++;
- $InIn{$name} = $General{$name};
- }
- } else {
- check_no_characters($code);
-
- gencat(\@Name, \%General, \@General, \%Cat,
- $name, $cat, $code, \&append);
-
- # No append() here since since several codes may map into one.
- push @{$To{Upper}}, [ $code, $code, $upper ] if $upper;
- push @{$To{Lower}}, [ $code, $code, $lower ] if $lower;
- push @{$To{Title}}, [ $code, $code, $title ] if $title;
-
- append($To{Digit} ||= [], $code, $decimal) if $decimal;
-
- append(\@Bidi, $code, $bidi);
- append($Bidi{$bidi} ||= [], $code);
-
- append(\@Comb, $code, $comb) if $comb;
-
- if ($deco) {
- append(\@Deco, $code, $deco);
- if ($deco =~/^<(\w+)>/) {
- append($Deco{Compat} ||= [], $code);
- append($DC{$1} ||= [], $code);
- } else {
- append($Deco{Canon} ||= [], $code);
- }
- }
-
- append(\@Number, $code, $number) if $number;
-
- append(\@Mirrored, $code) if $mirrored eq "Y";
- }
+ else
+ {
+ $Table->RawAppendRange($codepoint, $codepoint, $name);
}
+}
- check_no_characters(sprintf("%X", $LastUnicodeCodepoint + 1));
-
- flush(\@Name, "Name.pl");
+##
+## Given a code point range starting value and ending value (and name),
+## Add the range to teh Table.
+##
+## NOTE: Code points must be added in strictly ascending numeric order.
+##
+sub Table::AppendRange
+{
+ my $Table = shift; #self
+ my $start = shift;
+ my $end = shift;
+ my $name = shift;
+ $name = "" if not defined $name; ## warning: $name can be "0"
+
+ $Table->Append($start, $name);
+ $Table->Extend($end) if $end > $start;
+}
- foreach my $cat (sort keys %Cat) {
- flush($Cat{$cat}, "Is/$cat.pl");
+##
+## Return a new Table that represents all code points not in the Table.
+##
+sub Table::Invert
+{
+ my $Table = shift; #self
+
+ my $New = Table->New();
+ my $max = -1;
+ for my $range (@$Table)
+ {
+ my $start = $range->[RANGE_START];
+ my $end = $range->[RANGE_END];
+ if ($start-1 >= $max+1) {
+ $New->AppendRange($max+1, $start-1, "");
+ }
+ $max = $end;
}
-
- foreach my $to (sort keys %To) {
- flush($To{$to}, "To/$to.pl");
+ if ($max+1 < $LastUnicodeCodepoint) {
+ $New->AppendRange($max+1, $LastUnicodeCodepoint);
}
+ return $New;
+}
- flush(\@Bidi, "Bidirectional.pl");
- foreach my $bidi (sort keys %Bidi) {
- flush($Bidi{$bidi}, "Is/Bidi$bidi.pl");
+##
+## Merges any number of other tables with $self, returning the new table.
+## (existing tables are not modified)
+##
+## Can be called as either a constructor or a method.
+##
+sub Table::Merge
+{
+ shift(@_) if not ref $_[0]; ## if called as a constructor, lose the class
+ my @Tables = @_;
+
+ ## Accumulate all records from all tables
+ my @Records;
+ for my $Table (@Tables) {
+ push @Records, @$Table;
}
- flush(\@Comb, "CombiningClass.pl");
-
- flush(\@Deco, "Decomposition.pl");
- foreach my $deco (sort keys %Deco) {
- flush($Deco{$deco}, "Is/Deco$deco.pl");
+ ## sort by range start, with longer ranges coming first.
+ my ($first, @Rest) = sort {
+ ($a->[RANGE_START] <=> $b->[RANGE_START])
+ or
+ ($b->[RANGE_END] <=> $b->[RANGE_END])
+ } @Records;
+
+ my $New = Table->New();
+
+ ## Ensuring the first range is there makes the subsequent loop easier
+ $New->AppendRange($first->[RANGE_START],
+ $first->[RANGE_END]);
+
+ ## Fold in records so long as they add new information.
+ for my $set (@Rest)
+ {
+ my $start = $set->[RANGE_START];
+ my $end = $set->[RANGE_END];
+ if ($start > $New->Max) {
+ $New->AppendRange($start, $end);
+ } elsif ($end > $New->Max) {
+ $New->Extend($end);
+ }
}
- foreach my $dc (sort keys %DC) {
- flush($DC{$dc}, "Is/DC$dc.pl");
- }
-
- flush(\@Number, "Number.pl");
- flush(\@Mirrored, "Is/Mirrored.pl");
-} else {
- die "$0: Unicode.txt: $!\n";
+ return $New;
}
-# The general cateory can be written out already now.
+##
+## Given a filename, write a representation of the Table to a file.
+##
+sub Table::Write
+{
+ my $Table = shift; #self
+ my $filename = shift;
-flush(\@General, "Category.pl");
+ print "$filename\n" if $Verbose;
-#
-# Read in the LineBrk.txt.
-#
+ if (not open(OUT, ">$filename")) {
+ die "$0: can't write $filename: $!\n";
+ }
-if (open(my $LineBrk, "LineBrk.txt")) {
- my @Lbrk;
- my %Lbrk;
+ print OUT $HEADER;
+ print OUT "return <<'END';\n";
- while (<$LineBrk>) {
- next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
+ for my $set (@$Table)
+ {
+ my $start = $set->[RANGE_START];
+ my $end = $set->[RANGE_END];
+ my $name = $set->[RANGE_NAME];
- my ($first, $last, $lbrk) = ($1, $2, $3);
+ if ($start == $end) {
+ printf OUT "%04X\t\t%s\n", $start, $name;
+ } else {
+ printf OUT "%04X\t%04X\t%s\n", $start, $end, $name;
+ }
+ }
- append(\@Lbrk, $first, $lbrk);
- append($Lbrk{$lbrk} ||= [], $first);
- if (defined $last) {
- extend(\@Lbrk, $last);
- extend($Lbrk{$lbrk}, $last);
- }
+ print OUT "END\n";
+ close OUT;
+}
+
+###########################################################################
+###########################################################################
+###########################################################################
+
+
+##
+## Called like:
+## New_Alias(Is => 'All', SameAs => 'Any', AllowFuzzy => 1);
+##
+## The args must be in that order, although the AllowFuzzy pair may be omitted.
+##
+## This creates 'IsAll' as an alias for 'IsAny'
+##
+sub New_Alias($$$@)
+{
+ my $Type = shift; ## "Is" or "In"
+ my $Alias = shift;
+ my $SameAs = shift;
+ my $Name = shift;
+
+ ## remaining args are optional key/val
+ my %Args = @_;
+
+ my $AllowFuzzy = delete $Args{AllowFuzzy};
+
+ ## sanity check a few args
+ if (%Args or ($Type ne 'Is' and $Type ne 'In') or $SameAs ne 'SameAs') {
+ confess "$0: bad args to New_Alias"
}
- flush(\@Lbrk, "Lbrk.pl");
- foreach my $lbrk (sort keys %Lbrk) {
- flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl");
+ if (not $TableInfo{$Type}->{$Name}) {
+ confess "$0: don't have orignial $Type => $Name to make alias"
+ }
+ if ($TableInfo{$Alias}) {
+ confess "$0: already have original $Type => $Alias; can't make alias";
}
-} else {
- die "$0: LineBrk.txt: $!\n";
+ $AliasInfo{$Type}->{$Name} = $Alias;
+ if ($AllowFuzzy) {
+ $FuzzyNames{$Type}->{$Alias} = $Name;
+ }
+
}
-#
-# Read in the ArabShap.txt.
-#
+##
+## Turn something like
+## OLD-ITALIC
+## to
+## Old_Italic
+##
+sub CanonicalName($)
+{
+ my $name = lc shift;
+ $name =~ s/\W+/_/;
+ $name =~ s/(?<![a-z])(\w)/\u$1/g;
+ return $name;
+}
-if (open(my $ArabShap, "ArabShap.txt")) {
- my @ArabLink;
- my @ArabLinkGroup;
- while (<$ArabShap>) {
- next unless /^[0-9A-Fa-f]+;/;
- s/\s+$//;
+## All assigned code points
+my $Assigned = Table->New(Is => 'Assigned', AllowFuzzy => 1);
- my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/);
+my $Name = Table->New(); ## all characters, individually by name
+my $General = Table->New(); ## all characters, grouped by category
+my %General;
+my %Cat;
- append(\@ArabLink, $code, $link);
- append(\@ArabLinkGroup, $code, $linkgroup);
- }
+##
+## Process Unicode.txt (Categories, etc.)
+##
+sub Unicode_Txt()
+{
+ my $Bidi = Table->New();
+ my $Deco = Table->New();
+ my $Comb = Table->New();
+ my $Number = Table->New();
+ my $Mirrored = Table->New(Is => 'Mirrored', AllowFuzzy => 0);
- flush(\@ArabLink, "ArabLink.pl");
- flush(\@ArabLinkGroup, "ArabLnkGrp.pl");
-} else {
- die "$0: ArabShap.txt: $!\n";
-}
+ my %DC;
+ my %Bidi;
+ my %Deco;
+ $Deco{Canon} = Table->New(Is => 'Canon', AllowFuzzy => 0);
+ $Deco{Compat} = Table->New(Is => 'Compat', AllowFuzzy => 0);
+
+ ## Initialize Perl-generated categories
+ $Cat{Alnum} = Table->New(Is => 'Alnum', AllowFuzzy => 0);
+ $Cat{Alpha} = Table->New(Is => 'Alpha', AllowFuzzy => 0);
+ $Cat{ASCII} = Table->New(Is => 'ASCII', AllowFuzzy => 0);
+ $Cat{Blank} = Table->New(Is => 'Blank', AllowFuzzy => 0);
+ $Cat{Cntrl} = Table->New(Is => 'Cntrl', AllowFuzzy => 0);
+ $Cat{Digit} = Table->New(Is => 'Digit', AllowFuzzy => 0);
+ $Cat{Graph} = Table->New(Is => 'Graph', AllowFuzzy => 0);
+ $Cat{Lower} = Table->New(Is => 'Lower', AllowFuzzy => 0);
+ $Cat{Print} = Table->New(Is => 'Print', AllowFuzzy => 0);
+ $Cat{Punct} = Table->New(Is => 'Punct', AllowFuzzy => 0);
+ $Cat{SpacePerl} = Table->New(Is => 'SpacePerl', AllowFuzzy => 0);
+ $Cat{Space} = Table->New(Is => 'Space', AllowFuzzy => 0);
+ $Cat{Title} = Table->New(Is => 'Title', AllowFuzzy => 0);
+ $Cat{Upper} = Table->New(Is => 'Upper', AllowFuzzy => 0);
+ $Cat{Word} = Table->New(Is => 'Word' , AllowFuzzy => 0);
+ $Cat{XDigit} = Table->New(Is => 'XDigit', AllowFuzzy => 0);
+ ## Categories from Unicode.txt are auto-initialized in gencat()
-#
-# Read in the Jamo.txt.
-#
+ my %To;
+ $To{Upper} = Table->New();
+ $To{Lower} = Table->New();
+ $To{Title} = Table->New();
+ $To{Digit} = Table->New();
+
+ sub gencat($$$$)
+ {
+ my ($name, ## Name ("LATIN CAPITAL LETTER A")
+ $cat, ## Category ("Lu", "Zp", "Nd", etc.)
+ $code, ## Code point (as an integer)
+ $op) = @_;
+
+ my $MajorCat = substr($cat, 0, 1); ## L, M, Z, S, etc
+
+ $Assigned->$op($code);
+ $Name->$op($code, $name);
+ $General->$op($code, $cat);
+
+ ## add to the sub category (e.g. "Lu", "Nd", "Cf", ..)
+ $Cat{$cat} ||= Table->New(Is => $cat, AllowFuzzy => 0);
+ $Cat{$cat}->$op($code);
+
+ ## add to the major category (e.g. "L", "N", "C", ...)
+ $Cat{$MajorCat} ||= Table->New(Is => $MajorCat, AllowFuzzy => 0);
+ $Cat{$MajorCat}->$op($code);
+
+ ($General{$name} ||= Table->New)->$op($code, $name);
+
+ # 005F: SPACING UNDERSCORE
+ $Cat{Word}->$op($code) if $cat =~ /^[LMN]/ || $code == 0x005F;
+ $Cat{Alnum}->$op($code) if $cat =~ /^[LMN]/;
+ $Cat{Alpha}->$op($code) if $cat =~ /^[LM]/;
+
+
+
+ $Cat{Space}->$op($code) if $cat =~ /^Z/
+ || $code == 0x0009 # 0009: HORIZONTAL TAB
+ || $code == 0x000A # 000A: LINE FEED
+ || $code == 0x000B # 000B: VERTICAL TAB
+ || $code == 0x000C # 000C: FORM FEED
+ || $code == 0x000D; # 000D: CARRIAGE RETURN
+
+
+ $Cat{SpacePerl}->$op($code) if $cat =~ /^Z/
+ || $code == 0x0009 # 0009: HORIZONTAL TAB
+ || $code == 0x000A # 000A: LINE FEED
+ || $code == 0x000C # 000C: FORM FEED
+ || $code == 0x000D # 000D: CARRIAGE RETURN
+ || $code == 0x0085 # 0085: <NEXT LINE>
+ || $code == 0x2028 # 2028: LINE SEPARATOR
+ || $code == 0x2029;# 2029: PARAGRAPH SEP.
+
+ $Cat{Blank}->$op($code) if $cat =~ /^Z[^lp]$/
+ || $code == 0x0009 # 0009: HORIZONTAL TAB
+ || $code == 0x0020; # 0020: SPACE
+
+ $Cat{Digit}->$op($code) if $cat eq "Nd";
+ $Cat{Upper}->$op($code) if $cat eq "Lu";
+ $Cat{Lower}->$op($code) if $cat eq "Ll";
+ $Cat{Title}->$op($code) if $cat eq "Lt";
+ $Cat{ASCII}->$op($code) if $code <= 0x007F;
+ $Cat{Cntrl}->$op($code) if $cat =~ /^C/;
+ $Cat{Graph}->$op($code) if $cat =~ /^([LMNPS]|Co)/;
+ $Cat{Print}->$op($code) if $cat =~ /^([LMNPS]|Co|Zs)/;
+ $Cat{Punct}->$op($code) if $cat =~ /^P/;
+
+ $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9
+ || ($code >= 0x41 && $code <= 0x46) ## A..F
+ || ($code >= 0x61 && $code <= 0x66); ## a..f
+ }
-if (open(my $Jamo, "Jamo.txt")) {
- my @Short;
+ ## open ane read file.....
+ if (not open IN, "Unicode.txt") {
+ die "$0: Unicode.txt: $!\n";
+ }
- while (<$Jamo>) {
- next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/;
+ while (<IN>)
+ {
+ next unless /^[0-9A-Fa-f]+;/;
+ s/\s+$//;
+
+ my ($hexcode, ## code point in hex (e.g. "0041")
+ $name, ## character name (e.g. "LATIN CAPITAL LETTER A")
+ $cat, ## category (e.g. "Lu")
+ $comb, ## Canonical combining class (e.t. "230")
+ $bidi, ## directional category (e.g. "L")
+ $deco, ## decomposition mapping
+ $decimal, ## decimal digit value
+ $digit, ## digit value
+ $number, ## numeric value
+ $mirrored, ## mirrored
+ $unicode10, ## name in Unicode 1.0
+ $comment, ## comment field
+ $upper, ## uppercase mapping
+ $lower, ## lowercase mapping
+ $title, ## titlecase mapping
+ ) = split(/\s*;\s*/);
+
+ my $code = hex($hexcode);
+
+ ##
+ ## There are a few pairs of lines like:
+ ## AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ ## D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ ## that define ranges.
+ ##
+ if ($name =~ /^<(.+), (First|Last)>$/)
+ {
+ $name = $1;
+ gencat($name, $cat, $code, $2 eq 'First' ? 'Append' : 'Extend');
+ #New_Prop(In => $name, $General{$name}, AllowFuzzy => 1);
+ }
+ else
+ {
+ ## normal (single-character) lines
+ gencat($name, $cat, $code, 'Append');
+
+ # No Append() here since since several codes may map into one.
+ $To{Upper}->RawAppendRange($code, $code, $upper) if $upper;
+ $To{Lower}->RawAppendRange($code, $code, $lower) if $lower;
+ $To{Title}->RawAppendRange($code, $code, $title) if $title;
+ $To{Digit}->Append($code, $decimal) if length $decimal;
+
+ $Bidi->Append($code, $bidi);
+ $Comb->Append($code, $comb) if $comb;
+ $Number->Append($code, $number) if length $number;
+
+ $Mirrored->Append($code) if $mirrored eq "Y";
+
+ $Bidi{$bidi} ||= Table->New(Is => "Bidi$bidi", AllowFuzzy => 0);
+ $Bidi{$bidi}->Append($code);
+
+ if ($deco)
+ {
+ $Deco->Append($code, $deco);
+ if ($deco =~/^<(\w+)>/)
+ {
+ $Deco{Compat}->Append($code);
+
+ $DC{$1} ||= Table->New(Is => "DC$1", AllowFuzzy => 0);
+ $DC{$1}->Append($code);
+ }
+ else
+ {
+ $Deco{Canon}->Append($code);
+ }
+ }
+ }
+ }
+ close IN;
- my ($code, $short) = ($1, $2);
+ ##
+ ## Tidy up a few special cases....
+ ##
- append(\@Short, $code, $short);
- }
+ $Cat{Cn} = $Assigned->Invert; ## Cn is everything that doesn't exist
+ New_Prop(Is => 'Cn', $Cat{Cn}, AllowFuzzy => 0);
- flush(\@Short, "JamoShort.pl");
-} else {
- die "$0: Jamo.txt: $!\n";
-}
+ ## Unassigned is the same as 'Cn'
+ New_Alias(Is => 'Unassigned', SameAs => 'Cn', AllowFuzzy => 1);
-#
-# Read in the Scripts.txt.
-#
+ $Cat{C}->Replace($Cat{C}->Merge($Cat{Cn})); ## Now merge in Cn into C
-my @Scripts;
-if (open(my $Scripts, "Scripts.txt")) {
- while (<$Scripts>) {
- next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
+ # L& is Ll, Lu, and Lt.
+ New_Prop(Is => 'L&',
+ Table->Merge(@Cat{qw[Ll Lu Lt]}),
+ AllowFuzzy => 0);
- # Wait until all the scripts have been read since
- # they are not listed in numeric order.
- push @Scripts, [ hex($1), $1, $2, $3 ];
- }
-} else {
- die "$0: Scripts.txt: $!\n";
-}
+ ## Any and All are all code points.
+ my $Any = Table->New(Is => 'Any', AllowFuzzy => 1);
+ $Any->RawAppendRange(0, $LastUnicodeCodepoint);
-# Now append the scripts properties in their code point order.
+ New_Alias(Is => 'All', SameAs => 'Any', AllowFuzzy => 1);
-my %Script;
-my $Scripts = [];
-for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
- my ($code, $first, $last, $name) = @$script;
- append($Scripts, $first, $name);
- append($Script{$name} ||= [], $first, $name);
- if (defined $last) {
- extend($Scripts, $last);
- extend($Script{$name}, $last);
- }
- unless (defined $In{$name}) {
- $InScript{$InId} = $name;
- $In{$name} = $InId++;
- $InIn{$name} = $Script{$name};
+ ##
+ ## Now dump the files.
+ ##
+ $Name->Write("Name.pl");
+ $Bidi->Write("Bidirectional.pl");
+ $Comb->Write("CombiningClass.pl");
+ $Deco->Write("Decomposition.pl");
+ $Number->Write("Number.pl");
+ $General->Write("Category.pl");
+
+ for my $to (sort keys %To) {
+ $To{$to}->Write("To/$to.pl");
}
}
-# Scripts.pl can be written out already now.
+##
+## Process LineBrk.txt
+##
+sub LineBrk_Txt()
+{
+ if (not open IN, "LineBrk.txt") {
+ die "$0: LineBrk.txt: $!\n";
+ }
-flush(\@Scripts, "Scripts.pl");
+ my $Lbrk = Table->New();
+ my %Lbrk;
-# Common is everything not explicitly assigned to a Script
+ while (<IN>)
+ {
+ next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
-$In{Common} = $InId++;
-my $Common = inverse($Scripts);
-$InIn{Common} = $Common;
+ my ($first, $last, $lbrk) = (hex($1), hex($2||""), $3);
-#
-# Read in the Blocks.txt.
-#
+ $Lbrk->Append($first, $lbrk);
-my @Blocks;
-my %Blocks;
-
-if (open(my $Blocks, "Blocks.txt")) {
- while (<$Blocks>) {
- next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
-
- my ($first, $last, $name) = ($1, $2, $3);
- my $origname = $name;
-
- # If there's a naming conflict (the script names are
- # in uppercase), the name of the block has " Block"
- # appended to it.
- my $pat = $name;
- $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
- for my $i (values %InScript) {
- if ($i =~ /^$pat$/i) {
- $name .= " Block";
- last;
- }
- }
+ $Lbrk{$lbrk} ||= Table->New(Is => "Lbrk$lbrk", AllowFuzzy => 0);
+ $Lbrk{$lbrk}->Append($first);
- append(\@Blocks, $first, $name);
- append($Blocks{$name} ||= [], $first, $name);
- if (defined $last) {
- extend(\@Blocks, $last);
- extend($Blocks{$name}, $last);
- }
- unless (defined $In{$name}) {
- $InBlock{$InId} = $origname;
- $In{$name} = $InId++;
- $InIn{$name} = $Blocks{$name};
+ if ($last) {
+ $Lbrk->Extend($last);
+ $Lbrk{$lbrk}->Extend($last);
}
}
-} else {
- die "$0: Blocks.txt: $!\n";
-}
-
-# Blocks.pl can be written out already now.
+ close IN;
-flush(\@Blocks, "Blocks.pl");
+ $Lbrk->Write("Lbrk.pl");
+}
-#
-# Read in the PropList.txt. It contains extended properties not
-# listed in the Unicode.txt, such as 'Other_Alphabetic':
-# alphabetic but not of the general category L; many modifiers
-# belong to this extended property category: while they are not
-# alphabets, they are alphabetic in nature.
-#
+##
+## Process ArabShap.txt.
+##
+sub ArabShap_txt()
+{
+ if (not open IN, "ArabShap.txt") {
+ die "$0: ArabShap.txt: $!\n";
+ }
-my @Props;
+ my $ArabLink = Table->New();
+ my $ArabLinkGroup = Table->New();
-if (open(my $Props, "PropList.txt")) {
- while (<$Props>) {
- next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
+ while (<IN>)
+ {
+ next unless /^[0-9A-Fa-f]+;/;
+ s/\s+$//;
- # Wait until all the extended properties have been read since
- # they are not listed in numeric order.
- push @Props, [ hex($1), $1, $2, $3 ];
+ my ($hexcode, $name, $link, $linkgroup) = split(/\s*;\s*/);
+ my $code = hex($hexcode);
+ $ArabLink->Append($code, $link);
+ $ArabLinkGroup->Append($code, $linkgroup);
}
-} else {
- die "$0: PropList.txt: $!\n";
+ close IN;
+
+ $ArabLink->Write("ArabLink.pl");
+ $ArabLinkGroup->Write("ArabLnkGrp.pl");
}
-# Now append the extended properties in their code point order.
+##
+## Process Jamo.txt.
+##
+sub Jamo_txt()
+{
+ if (not open IN, "Jamo.txt") {
+ die "$0: Jamo.txt: $!\n";
+ }
+ my $Short = Table->New();
-my %Prop;
-my $Props = [];
+ while (<IN>)
+ {
+ next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/;
+ my ($code, $short) = (hex($1), $2);
-for my $prop (sort { $a->[0] <=> $b->[0] } @Props) {
- my ($code, $first, $last, $name) = @$prop;
- append($Props, $first, $name);
- append($Prop{$name} ||= [], $first, $name);
- if (defined $last) {
- extend($Props, $last);
- extend($Prop{$name}, $last);
- }
- unless (defined $In{$name}) {
- $In{$name} = $InId++;
- $InIn{$name} = $Prop{$name};
+ $Short->Append($code, $short);
}
+ close IN;
+ $Short->Write("JamoShort.pl");
}
-# Assigned is everything not Cn
-
-$In{Assigned} = $InId++;
-my $Assigned = inverse($Cat{Cn});
-$InIn{Assigned} = $Assigned;
-
-# Unassigned is everything not Assigned
-
-$In{Unassigned} = $InId++;
-my $Unassigned = $Cat{Cn};
-$InIn{Unassigned} = $Unassigned;
-
-# Unassigned is everything not Assigned
-sub merge_general_and_extended {
- my ($name, $general, $extended) = @_;
- my $merged;
-
- push @$merged,
- map { pop @{$_}; $_ }
- sort { $a->[2] <=> $b->[2] }
- map { [ $_->[0], $_->[1], hex($_->[0]) ] }
- ($general ?
- map { ref $_ ? @$_ : $_ }
- @Cat {ref $general ? @$general : $general } :
- (),
- $extended ?
- map { ref $_ ? @$_ : $_ }
- @Prop{ref $extended ? @$extended : $extended} :
- ());
-
- $In{$name} = $InId++;
- $InIn{$name} = $merged;
-
- return $merged;
-}
-
-# Alphabetic is L and Other_Alphabetic.
-
-my $Alphabetic =
- merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic');
+##
+## Process Scripts.txt.
+##
+sub Scripts_txt()
+{
+ my @ScriptInfo;
-# Lowercase is Ll and Other_Lowercase.
+ if (not open(IN, "Scripts.txt")) {
+ die "$0: Scripts.txt: $!\n";
+ }
+ while (<IN>) {
+ next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
-my $Lowercase =
- merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase');
+ # Wait until all the scripts have been read since
+ # they are not listed in numeric order.
+ push @ScriptInfo, [ hex($1), hex($2||""), $3 ];
+ }
+ close IN;
-# Uppercase is Lu and Other_Uppercase.
+ # Now append the scripts properties in their code point order.
-my $Uppercase =
- merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase');
+ my %Script;
+ my $Scripts = Table->New();
-# Math is Sm and Other_Math.
+ for my $script (sort { $a->[0] <=> $b->[0] } @ScriptInfo)
+ {
+ my ($first, $last, $name) = @$script;
+ $Scripts->Append($first, $name);
-my $Math =
- merge_general_and_extended('Math', 'Sm', 'Other_Math');
+ $Script{$name} ||= Table->New(Is => CanonicalName($name),
+ AllowFuzzy => 1);
+ $Script{$name}->Append($first, $name);
-# Lampersand is Ll, Lu, and Lt.
+ if ($last) {
+ $Scripts->Extend($last);
+ $Script{$name}->Extend($last);
+ }
+ }
-my $Lampersand =
- merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]);
+ $Scripts->Write("Scripts.pl");
-# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
+ ## Common is everything not explicitly assigned to a Script
+ ##
+ ## ***shouldn't this be intersected with \p{Assigned}? ******
+ ##
+ New_Prop(Is => 'Common', $Scripts->Invert, AllowFuzzy => 1);
+}
-my $ID_Start =
- merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]);
+##
+## Given a name like "Close Punctuation", return a regex (that when applied
+## with /i) matches any valid form of that name (e.g. "ClosePunctuation",
+## "Close-Punctuation", etc.)
+##
+## Accept any space, dash, or underbar where in the official name there is
+## space or a dash (or underbar, but there never is).
+##
+##
+sub NameToRegex($)
+{
+ my $Name = shift;
+ $Name =~ s/[- _]/(?:[-_]|\\s+)?/g;
+ return $Name;
+}
-# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
+##
+## Process Blocks.txt.
+##
+sub Blocks_txt()
+{
+ my $Blocks = Table->New();
+ my %Blocks;
-my $ID_Continue =
- merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl
- Mn Mc Nd Pc) ]);
+ if (not open IN, "Blocks.txt") {
+ die "$0: Blocks.txt: $!\n";
+ }
-#
-# Any is any.
-#
+ while (<IN>)
+ {
+ #next if not /Private Use$/;
+ next if not /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
-$In{Any} = $InId++;
-my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ];
-$InIn{Any} = $Any;
+ my ($first, $last, $name) = (hex($1), hex($2), $3);
-#
-# All is any, too.
-#
+ $Blocks->Append($first, $name);
-$In{All} = $InId++;
-$InIn{All} = $Any;
+ $Blocks{$name} ||= Table->New(In=>CanonicalName($name), AllowFuzzy=>1);
+ $Blocks{$name}->Append($first, $name);
-#
-# mapping() will be used to write out the In and Is virtual mappings.
-#
-
-sub mapping {
- my ($map, $name) = @_;
-
- if (open(my $fh, ">$name.pl")) {
- print "$name.pl\n";
- header($fh);
-
- # The %pat will hold a hash that maps the first two
- # lowercased letters of a class to a 'fuzzified' regular
- # expression that points to the real mapping.
-
- my %pat;
-
- # But first write out the offical name to real name
- # (the filename) mapping.
-
- print $fh <<EOT;
-%utf8::${name} =
-(
-EOT
- for my $i (sort { lc $a cmp lc $b } keys %$map) {
- my $pat = $i;
- # Here is the 'fuzzification': accept any space,
- # dash, or underbar where in the official name
- # there is space or a dash (or underbar, but
- # there never is).
- $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
- # The prefix length of 2 is enough spread,
- # and besides, we have 'Yi' as an In category.
- push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
- printf $fh "%-45s => '$map->{$i}',\n", "'$i'";
- }
- print $fh <<EOT;
-);
-EOT
-
- # Now write out the %pat mapping.
-
- print $fh <<EOT;
-%utf8::${name}Pat =
-(
-EOT
- foreach my $prefix (sort keys %pat) {
- print $fh "'$prefix' => {\n";
- foreach my $ipat (@{$pat{$prefix}}) {
- my ($i, $pat) = @$ipat;
- print $fh "\t'$pat' => '$map->{$i}',\n";
- }
- print $fh "},\n";
+ if ($last and $last != $first) {
+ $Blocks->Extend($last);
+ $Blocks{$name}->Extend($last);
}
- print $fh <<EOT;
-);
-EOT
-
- close($fh);
- } else {
- die "$0: $name.pl: $!\n";
}
+ close IN;
+
+ $Blocks->Write("Blocks.pl");
}
-#
-# Write out the virtual In mappings.
-#
+##
+## Read in the PropList.txt. It contains extended properties not
+## listed in the Unicode.txt, such as 'Other_Alphabetic':
+## alphabetic but not of the general category L; many modifiers
+## belong to this extended property category: while they are not
+## alphabets, they are alphabetic in nature.
+##
+sub PropList_txt()
+{
+ my @PropInfo;
+
+ if (not open IN, "PropList.txt") {
+ die "$0: PropList.txt: $!\n";
+ }
-mapping(\%In, "In");
+ while (<IN>)
+ {
+ next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
-#
-# Append the InScript and InBlock mappings.
-# These are needed only if Script= and Block= syntaxes are used.
-#
+ # Wait until all the extended properties have been read since
+ # they are not listed in numeric order.
+ push @PropInfo, [ hex($1), hex($2||""), $3 ];
+ }
+ close IN;
-if (open(my $In, ">>In.pl")) {
- print $In <<EOT;
+ # Now append the extended properties in their code point order.
+ my $Props = Table->New();
+ my %Prop;
-%utf8::InScript =
-(
-EOT
- for my $i (sort { $a <=> $b } keys %InScript) {
- printf $In "%4d => '$InScript{$i}',\n", $i;
- }
- print $In <<EOT;
-);
-EOT
+ for my $prop (sort { $a->[0] <=> $b->[0] } @PropInfo)
+ {
+ my ($first, $last, $name) = @$prop;
+ $Props->Append($first, $name);
- print $In <<EOT;
+ $Prop{$name} ||= Table->New(Is => $name, AllowFuzzy => 1);
+ $Prop{$name}->Append($first, $name);
-%utf8::InBlock =
-(
-EOT
- for my $i (sort { $a <=> $b } keys %InBlock) {
- printf $In "%4d => '$InBlock{$i}',\n", $i;
+ if ($last) {
+ $Props->Extend($last);
+ $Prop{$name}->Extend($last);
+ }
}
- print $In <<EOT;
-);
-EOT
-} else {
- die "$0: In.pl: $!\n";
-}
-
-#
-# Write out the real In mappings
-# (the In.pl written out just above has the virtual In mappings)
-#
-foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) {
- flush($InIn{$in}, "In/$In{$in}.pl");
+ # Alphabetic is L and Other_Alphabetic.
+ New_Prop(Is => 'Alphabetic',
+ Table->Merge($Cat{L}, $Prop{Other_Alphabetic}),
+ AllowFuzzy => 1);
+
+ # Lowercase is Ll and Other_Lowercase.
+ New_Prop(Is => 'Lowercase',
+ Table->Merge($Cat{Ll}, $Prop{Other_Lowercase}),
+ AllowFuzzy => 1);
+
+ # Uppercase is Lu and Other_Uppercase.
+ New_Prop(Is => 'Uppercase',
+ Table->Merge($Cat{Lu}, $Prop{Other_Uppercase}),
+ AllowFuzzy => 1);
+
+ # Math is Sm and Other_Math.
+ New_Prop(Is => 'Math',
+ Table->Merge($Cat{Sm}, $Prop{Other_Math}),
+ AllowFuzzy => 1);
+
+ # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
+ New_Prop(Is => 'ID_Start',
+ Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl]}),
+ AllowFuzzy => 1);
+
+ # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
+ New_Prop(Is => 'ID_Continue',
+ Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl Mn Mc Nd Pc ]}),
+ AllowFuzzy => 1);
}
-#
-# The mapping from General Category long forms to short forms is
-# currently hardwired here since no simple data file in the UCD
-# seems to do that. Unicode 3.2 will assumedly correct this.
-#
-
-my %Is = (
+sub Make_GC_Aliases()
+{
+ ##
+ ## The mapping from General Category long forms to short forms is
+ ## currently hardwired here since no simple data file in the UCD
+ ## seems to do that. Unicode 3.2 will assumedly correct this.
+ ##
+ my %Is = (
'Letter' => 'L',
'Uppercase_Letter' => 'Lu',
'Lowercase_Letter' => 'Ll',
'Surrogate' => 'Cs',
'Private Use' => 'Co',
'Unassigned' => 'Cn',
-);
-
-#
-# Write out the virtual Is mappings.
-#
-
-mapping(\%Is, "Is");
-
-#
-# Read in the special cases.
-#
+ );
-my %Case;
-
-if (open(my $SpecCase, "SpecCase.txt")) {
- while (<$SpecCase>) {
- next unless /^[0-9A-Fa-f]+;/;
- s/\#.*//;
- s/\s+$//;
-
- my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/);
-
- if ($condition) { # not implemented yet
- print "# SKIPPING $_\n";
- next;
- }
+ ## make the aliases....
+ while (my ($Alias, $Name) = each %Is) {
+ New_Alias(Is => $Alias, SameAs => $Name, AllowFuzzy => 1);
+ }
+}
- # Wait until all the special cases have been read since
- # they are not listed in numeric order.
- my $ix = hex($code);
- push @{$Case{Lower}}, [ $ix, $code, $lower ];
- push @{$Case{Title}}, [ $ix, $code, $title ];
- push @{$Case{Upper}}, [ $ix, $code, $upper ];
+##
+## Writes the info accumulated in
+##
+## %TableInfo;
+## %FuzzyNames;
+## %AliasInfo;
+##
+##
+sub WriteAllMappings()
+{
+ for my $Type ('In', 'Is')
+ {
+ my %Filenames;
+ my %NameToFile;
+
+ my %Exact; ## will become %utf8::Is or %utf8::In
+ my %Pat; ## will become %utf8::IsPat or %utf8::InPat
+
+ ##
+ ## First write all the files to the $Type/ directory
+ ##
+ while (my ($Name, $Table) = each %{$TableInfo{$Type}})
+ {
+ ## Need an 8.3 safe filename.
+ my $filename = $Name;
+ $filename =~ s/[_\W]+(\w*)/\u$1/g;
+ substr($filename, 8) = '' if length($filename) > 8;
+
+ ##
+ ## Make sure the filename doesn't conflict with something we
+ ## might have already written. If we have, say,
+ ## Greek_Extended1
+ ## Greek_Extended2
+ ## they become
+ ## Greek_Ex
+ ## Greek_E2
+ ##
+ while (my $num = $Filenames{lc $filename}++)
+ {
+ $num++; ## so filenames with numbers start with '2', which
+ ## just looks more natural.
+ substr($filename, -length($num)) = $num;
+ }
+
+ ##
+ ## Okay, write the file...
+ ##
+ $Exact{$Name} = $filename;
+ $Table->Write("$Type/$filename.pl");
+ }
+
+ ##
+ ## Build %Pat
+ ##
+ while (my ($Fuzzy, $Real) = each %{$FuzzyNames{$Type}})
+ {
+ my $File = $Exact{$Real};
+
+ if (not $File) {
+ die "$0: oops [$Real]";
+ }
+
+ ## The prefix length of 2 is enough spread,
+ ## and besides, we have 'Yi' as an In category.
+ my $Prefix = lc(substr($Fuzzy, 0, 2));
+ my $Regex = NameToRegex($Fuzzy);
+
+ if ($Pat{$Prefix}->{$Regex}) {
+ warn "WHOA, conflict with /$Regex/: $Pat{$Prefix}->{$Regex} vs $File\n";
+ }
+
+ $Pat{$Prefix}->{$Regex} = $File;
+ }
+
+ ##
+ ## Since the fuzzy method will provide for a way to match $Fuzzy,
+ ## there's no need for $Fuzzy to be in %Exact as well.
+ ## This can't be done in the loop above because there could be
+ ## multiple $Fuzzys pointing at the same $Real, and we don't want
+ ## the first to delete the exact mapping out from under the second.
+ ##
+ for my $Fuzzy (keys %{$FuzzyNames{$Type}})
+ {
+ delete $Exact{$Fuzzy};
+ }
+
+
+
+ ##
+ ## Now write In.pl / Is.pl
+ ##
+ if (not open OUT, ">$Type.pl") {
+ die "$0: $Type.pl: $!\n";
+ }
+ print OUT $HEADER;
+ print OUT "##\n";
+ print OUT "## Data in this file used by ../utf8_heavy.pl\n";
+ print OUT "##\n";
+ print OUT "\n";
+ print OUT "## Mapping from name to filename in ./$Type\n";
+ print OUT "%utf8::$Type = (\n";
+ for my $Name (sort keys %Exact)
+ {
+ my $File = $Exact{$Name};
+ printf OUT " %-41s => %s,\n", "'$Name'", "'$File'";
+ }
+ print OUT ");\n\n";
+
+ print OUT "## Mappings from regex to filename in ./$Type/\n";
+ print OUT "%utf8::${Type}Pat = (\n";
+ for my $Prefix (sort keys %Pat)
+ {
+ print OUT " '$Prefix' => {\n";
+ while (my ($Regex, $File) = each %{ $Pat{$Prefix} }) {
+ print OUT "\t'$Regex' => '$File',\n";
+ }
+ print OUT " },\n";
+ }
+ print OUT ");\n";
+
+ close(OUT);
}
-} else {
- die "$0: SpecCase.txt: $!\n";
}
-# Now write out the special cases properties in their code point order.
-# Prepend them to the To/{Upper,Lower,Title}.pl.
-
-for my $case (qw(Lower Title Upper)) {
- my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n";
- if (open(my $Case, ">To/$case.pl")) {
- header($Case);
- print $Case <<EOT;
-
-%utf8::ToSpec$case = (
-EOT
- for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
- my ($ix, $code, $to) = @$prop;
- my $tostr =
- join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
- printf $Case qq['%04X' => "$tostr",\n], $ix;
- }
- print $Case <<EOT;
-);
+sub SpecCase_txt()
+{
+ #
+ # Read in the special cases.
+ #
-EOT
- begin($Case);
- print $Case $NormalCase;
- end($Case);
- } else {
- die "$0: To/$case.txt: $!\n";
+ my %CaseInfo;
+
+ if (not open IN, "SpecCase.txt") {
+ die "$0: SpecCase.txt: $!\n";
+ }
+ while (<IN>) {
+ next unless /^[0-9A-Fa-f]+;/;
+ s/\#.*//;
+ s/\s+$//;
+
+ my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/);
+
+ if ($condition) { # not implemented yet
+ print "# SKIPPING $_\n" if $Verbose;
+ next;
+ }
+
+ # Wait until all the special cases have been read since
+ # they are not listed in numeric order.
+ my $ix = hex($code);
+ push @{$CaseInfo{Lower}}, [ $ix, $code, $lower ];
+ push @{$CaseInfo{Title}}, [ $ix, $code, $title ];
+ push @{$CaseInfo{Upper}}, [ $ix, $code, $upper ];
+ }
+ close IN;
+
+ # Now write out the special cases properties in their code point order.
+ # Prepend them to the To/{Upper,Lower,Title}.pl.
+
+ for my $case (qw(Lower Title Upper))
+ {
+ my $NormalCase = do "To/$case.pl" || die "$0: $@\n";
+ if (not open OUT, ">To/$case.pl") {
+ die "$0: To/$case.txt: $!";
+ }
+
+ print OUT $HEADER, "\n";
+ print OUT "%utf8::ToSpec$case =\n(\n";
+
+ for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) {
+ my ($ix, $code, $to) = @$prop;
+ my $tostr =
+ join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
+ printf OUT qq['%04X' => "$tostr",\n], $ix;
+ }
+ print OUT ");\n\n";
+ print OUT "return <<'END';\n";
+ print OUT $NormalCase;
+ print OUT "END\n";
+ close OUT;
}
}
#
# We will do full case folding, C + F + I (see CaseFold.txt).
#
+sub CaseFold_txt()
+{
+ if (not open IN, "CaseFold.txt") {
+ die "$0: To/Fold.pl: $!\n";
+ }
-if (open(my $CaseFold, "CaseFold.txt")) {
- my @Fold;
+ my $Fold = Table->New();
my %Fold;
- while (<$CaseFold>) {
+ while (<IN>) {
# Skip status 'S', simple case folding
next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
- my ($code, $status, $fold) = ($1, $2, $3);
+ my ($code, $status, $fold) = (hex($1), $2, $3);
if ($status eq 'C') { # Common: one-to-one folding
# No append() since several codes may fold into one.
- push @Fold, [ $code, $code, $fold ];
+ $Fold->RawAppendRange($code, $code, $fold);
} else { # F: full, or I: dotted uppercase I -> dotless lowercase I
- $Fold{hex($code)} = $fold;
+ $Fold{$code} = $fold;
}
}
+ close IN;
- flush(\@Fold, "To/Fold.pl");
+ $Fold->Write("To/Fold.pl");
#
# Prepend the special foldings to the common foldings.
#
my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
- if (open(my $Fold, ">To/Fold.pl")) {
- header($Fold);
- print $Fold <<EOT;
-
-%utf8::ToSpecFold = (
-EOT
- for my $code (sort { $a <=> $b } keys %Fold) {
- my $foldstr =
- join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
- printf $Fold qq['%04X' => "$foldstr",\n], $code;
- }
- print $Fold <<EOT;
-);
-
-EOT
- begin($Fold);
- print $Fold $CommonFold;
- end($Fold);
- } else {
- die "$0: To/Fold.pl: $!\n";
+ if (not open OUT, ">To/Fold.pl") {
+ die "$0: To/Fold.pl: $!\n";
+ }
+ print OUT $HEADER, "\n";
+ print OUT "%utf8::ToSpecFold =\n(\n";
+ for my $code (sort { $a <=> $b } keys %Fold) {
+ my $foldstr =
+ join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
+ printf OUT qq['%04X' => "$foldstr",\n], $code;
}
-} else {
- die "$0: CaseFold.txt: $!\n";
+ print OUT ");\n\n";
+ print OUT "return <<'END';\n";
+ print OUT $CommonFold;
+ print OUT "END\n";
+ close OUT;
}
+## Do it....
+
+Unicode_Txt();
+Make_GC_Aliases();
+PropList_txt();
+
+Scripts_txt();
+Blocks_txt();
+
+LineBrk_Txt();
+ArabShap_txt();
+Jamo_txt();
+SpecCase_txt();
+
+WriteAllMappings();
+
+CaseFold_txt();
+
# That's all, folks!
+__END__