The First, Last ranges in the Unicode data weren't

[p5sagit/p5-mst-13.2.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 5b2d786..5615aee 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -5,10 +5,10 @@
 # from the Unicode database files (lib/unicore/*.txt).
 #
 
-my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
-
 use strict;
 
+my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
+
 mkdir("In", 0755);
 mkdir("Is", 0755);
 mkdir("To", 0755);
@@ -103,6 +103,9 @@ my %In;
 my $InId = 0;
 my %InIn;
 
+my %InScript;
+my %InBlock;
+
 #
 # Read in the Unicode.txt, the main Unicode database.
 #
@@ -111,6 +114,55 @@ my %Cat;
 my %General;
 my @General;
 
+sub gencat {
+    my ($Name, $GeneralH, $GeneralA, $Cat,
+       $name, $cat, $code, $op) = @_;
+
+    $op->($Name,                     $code, $name);
+    $op->($GeneralA,                 $code, $cat);
+
+    $op->($GeneralH->{$name} ||= [], $code, $name);
+
+    $op->($Cat->{$cat}       ||= [], $code);
+    $op->($Cat->{substr($cat, 0, 1)}
+                           ||= [],  $code);
+    # 005F: SPACING UNDERSCORE
+    $op->($Cat->{Word}       ||= [], $code)
+       if $cat =~ /^[LMN]/ or $code eq "005F";
+    $op->($Cat->{Alnum}      ||= [], $code)
+       if $cat =~ /^[LMN]/;
+    $op->($Cat->{Alpha}      ||= [], $code)
+       if $cat =~ /^[LM]/;
+    # 0009: HORIZONTAL TABULATION
+    # 000A: LINE FEED
+    # 000B: VERTICAL TABULATION
+    # 000C: FORM FEED
+    # 000D: CARRIAGE RETURN
+    # 0020: SPACE
+    $op->($Cat->{Space}      ||= [], $code)
+       if $cat  =~ /^Z/ ||
+           $code =~ /^(0009|000A|000B|000C|000D)$/;
+    $op->($Cat->{SpacePerl}  ||= [], $code)
+       if $cat  =~ /^Z/ ||
+           $code =~ /^(0009|000A|000C|000D)$/;
+    $op->($Cat->{Blank}      ||= [], $code)
+       if $code =~ /^(0020|0009)$/ ||
+           $cat  =~ /^Z[^lp]$/;
+    $op->($Cat->{Digit}      ||= [], $code) if $cat eq "Nd";
+    $op->($Cat->{Upper}      ||= [], $code) if $cat eq "Lu";
+    $op->($Cat->{Lower}      ||= [], $code) if $cat eq "Ll";
+    $op->($Cat->{Title}      ||= [], $code) if $cat eq "Lt";
+    $op->($Cat->{ASCII}      ||= [], $code) if $code le "007F";
+    $op->($Cat->{Cntrl}      ||= [], $code) if $cat =~ /^C/;
+    $op->($Cat->{Graph}      ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
+    $op->($Cat->{Print}      ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
+    $op->($Cat->{Punct}      ||= [], $code) if $cat =~ /^P/;
+    # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
+    $op->($Cat->{XDigit}     ||= [], $code)
+       if $code =~ /^00(3[0-9]|[46][1-6])$/;
+
+}
+
 if (open(my $Unicode, "Unicode.txt")) {
     my @Name;
     my @Bidi;
@@ -123,9 +175,9 @@ if (open(my $Unicode, "Unicode.txt")) {
     my @Mirrored;
     my %To;
     while (<$Unicode>) {
-       next if /^\#/ || /^\s*$/;
-       next unless /^[0-9a-f]+\s*;/i;
+       next unless /^[0-9A-Fa-f]+;/;
        s/\s+$//;
+
        my ($code, $name, $cat, $comb, $bidi, $deco,
            $decimal, $digit, $number,
            $mirrored, $unicode10, $comment,
@@ -133,61 +185,18 @@ if (open(my $Unicode, "Unicode.txt")) {
 
        if ($name =~ /^<(.+), (First|Last)>$/) {
            $name = $1;
-           if ($2 eq 'First') {
-               append($General{$name} ||= [], $code, $name);
-           } else {
-               extend($General{$name}       , $code);
-           }
+           gencat(\@Name, \%General, \@General, \%Cat,
+                  $name, $cat, $code,
+                  $2 eq 'First' ? \&append : \&extend);
            unless (defined $In{$name}) {
                $In{$name}   = $InId++;
                $InIn{$name} = $General{$name};
            }
-           append($Cat{$cat}       ||= [], $code);
-           append($Cat{substr($cat, 0, 1)}
-                                   ||= [], $code);
        } else {
-           append(\@Name,                  $code, $name);
-
-           append(\@General,               $code, $cat);
-
-           append($Cat{$cat}       ||= [], $code);
-           append($Cat{substr($cat, 0, 1)}
-                                   ||= [], $code);
-           # 005F: SPACING UNDERSCORE
-           append($Cat{Word}       ||= [], $code)
-               if $cat =~ /^[LMN]/ or $code eq "005F";
-           append($Cat{Alnum}      ||= [], $code)
-               if $cat =~ /^[LMN]/;
-           append($Cat{Alpha}      ||= [], $code)
-               if $cat =~ /^[LM]/;
-           # 0009: HORIZONTAL TABULATION
-           # 000A: LINE FEED
-           # 000B: VERTICAL TABULATION
-           # 000C: FORM FEED
-           # 000D: CARRIAGE RETURN
-           # 0020: SPACE
-           append($Cat{Space}      ||= [], $code)
-               if $cat  =~ /^Z/ ||
-                   $code =~ /^(0009|000A|000B|000C|000D)$/;
-           append($Cat{SpacePerl}  ||= [], $code)
-               if $cat  =~ /^Z/ ||
-                   $code =~ /^(0009|000A|000C|000D)$/;
-           append($Cat{Blank}      ||= [], $code)
-               if $code =~ /^(0020|0009)$/ ||
-                   $cat  =~ /^Z[^lp]$/;
-           append($Cat{Digit}      ||= [], $code) if $cat eq "Nd";
-           append($Cat{Upper}      ||= [], $code) if $cat eq "Lu";
-           append($Cat{Lower}      ||= [], $code) if $cat eq "Ll";
-           append($Cat{Title}      ||= [], $code) if $cat eq "Lt";
-           append($Cat{ASCII}      ||= [], $code) if $code le "007F";
-           append($Cat{Cntrl}      ||= [], $code) if $cat =~ /^C/;
-           append($Cat{Graph}      ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
-           append($Cat{Print}      ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
-           append($Cat{Punct}      ||= [], $code) if $cat =~ /^P/;
-           # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
-           append($Cat{XDigit}     ||= [], $code)
-               if $code =~ /^00(3[0-9]|[46][1-6])$/;
-           
+
+           gencat(\@Name, \%General, \@General, \%Cat,
+                  $name, $cat, $code, \&append);
+
            append($To{Upper}       ||= [], $code, $upper)   if $upper;
            append($To{Lower}       ||= [], $code, $lower)   if $lower;
            append($To{Title}       ||= [], $code, $title)   if $title;
@@ -259,10 +268,7 @@ if (open(my $LineBrk, "LineBrk.txt")) {
     my %Lbrk;
 
     while (<$LineBrk>) {
-       next if /^\#/ || /^\s*$/;
-       s/\s+$//;
-       s/\s*\#.*//;
-       next unless /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s*;\s*(.+)$/i;
+        next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
 
        my ($first, $last, $lbrk) = ($1, $2, $3);
 
@@ -291,9 +297,9 @@ if (open(my $ArabShap, "ArabShap.txt")) {
     my @ArabLinkGroup;
 
     while (<$ArabShap>) {
-       next if /^\#/ || /^\s*$/;
-       next unless /^[0-9a-f]+\s*;/i;
+       next unless /^[0-9A-Fa-f]+;/;
        s/\s+$//;
+
        my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/);
 
        append(\@ArabLink,      $code, $link);
@@ -314,11 +320,9 @@ if (open(my $Jamo, "Jamo.txt")) {
     my @Short;
 
     while (<$Jamo>) {
-       next if /^\#/ || /^\s*$/;
-       next unless /^[0-9a-f]+\s*;/i;
-       s/\s*\#.*//;
-       s/\s+$//;
-       my ($code, $short) = split(/\s*;\s*/);
+       next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/;
+
+       my ($code, $short) = ($1, $2);
 
        append(\@Short, $code, $short);
     }
@@ -336,10 +340,7 @@ my @Scripts;
 
 if (open(my $Scripts, "Scripts.txt")) {
     while (<$Scripts>) {
-       next if /^\#/ || /^\s*$/;
-       s/\s*\#.*//;
-       s/\s+$//;
-       next unless /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s*;\s*(.+)$/i;
+       next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
 
        # Wait until all the scripts have been read since
        # they are not listed in numeric order.
@@ -363,8 +364,9 @@ for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
        extend($Script{$name}, $last);
     }
     unless (defined $In{$name}) {
-       $In{$name}   = $InId++;
-       $InIn{$name} = $Script{$name};
+       $InScript{$InId} = $name;
+       $In{$name}       = $InId++;
+       $InIn{$name}     = $Script{$name};
     }
 }
 
@@ -387,17 +389,22 @@ my %Blocks;
 
 if (open(my $Blocks, "Blocks.txt")) {
     while (<$Blocks>) {
-       next if /^\#/ || /^\s*$/;
-       s/\s*\#.*//;
-       s/\s+$//;
-       next unless /^([0-9a-f]+)\.\.([0-9a-f]+)\s*;\s*(.+)$/i;
-
+       next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
+       
        my ($first, $last, $name) = ($1, $2, $3);
+       my $origname = $name;
 
        # If there's a naming conflict (the script names are
        # in uppercase), the name of the block has " Block"
        # appended to it.
-       $name = "$name Block" if defined $In{"\U$name"};
+       my $pat = $name;
+       $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
+       for my $i (values %InScript) {
+           if ($i =~ /^$pat$/i) {
+               $name .= " Block";
+               last;
+           }
+       }
 
        append(\@Blocks,              $first, $name);
        append($Blocks{$name} ||= [], $first, $name);
@@ -406,8 +413,9 @@ if (open(my $Blocks, "Blocks.txt")) {
            extend($Blocks{$name}, $last);
        }
        unless (defined $In{$name}) {
-           $In{$name}   = $InId++;
-           $InIn{$name} = $Blocks{$name};
+           $InBlock{$InId} = $origname;
+           $In{$name}      = $InId++;
+           $InIn{$name}    = $Blocks{$name};
        }
     }
 } else {
@@ -430,10 +438,7 @@ my @Props;
 
 if (open(my $Props, "PropList.txt")) {
     while (<$Props>) {
-       next if /^\#/ || /^\s*$/;
-       s/\s*\#.*//;
-       s/\s+$//;
-       next unless /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s*;\s*(\w+)/i;
+       next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
 
        # Wait until all the extended properties have been read since
        # they are not listed in numeric order.
@@ -559,7 +564,7 @@ sub mapping {
 %utf8::${name} =
 (
 EOT
-        for my $i (sort keys %$map) {
+        for my $i (sort { lc $a cmp lc $b } keys %$map) {
            my $pat = $i;
            # Here is the 'fuzzification': accept any space,
            # dash, or underbar where in the official name
@@ -569,7 +574,7 @@ EOT
            # The prefix length of 2 is enough spread,
            # and besides, we have 'Yi' as an In category.
            push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
-           print $fh "'$i' => '$map->{$i}',\n";
+           printf $fh "%-45s => '$map->{$i}',\n", "'$i'";
        }
         print $fh <<EOT;
 );
@@ -605,6 +610,39 @@ EOT
 
 mapping(\%In, "In");
 
+#
+# Append the InScript and InBlock mappings.
+# These are needed only if Script= and Block= syntaxes are used.
+#
+
+if (open(my $In, ">>In.pl")) {
+    print $In <<EOT;
+
+%utf8::InScript =
+(
+EOT
+    for my $i (sort { $a <=> $b } keys %InScript) {
+       printf $In "%4d => '$InScript{$i}',\n", $i;
+    }
+    print $In <<EOT;
+);
+EOT
+
+    print $In <<EOT;
+
+%utf8::InBlock =
+(
+EOT
+    for my $i (sort { $a <=> $b } keys %InBlock) {
+       printf $In "%4d => '$InBlock{$i}',\n", $i;
+    }
+    print $In <<EOT;
+);
+EOT
+} else {
+    die "$0: In.pl: $!\n";
+}
+
 # Easy low-calorie cheat.
 use File::Copy;
 copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl");
@@ -621,59 +659,53 @@ foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) {
 #
 # The mapping from General Category long forms to short forms is
 # currently hardwired here since no simple data file in the UCD
-# seems to do that.
+# seems to do that.  Unicode 3.2 will assumedly correct this.
 #
 
 my %Is = (
        'Letter'                        =>      'L',
-       'Uppercase Letter'              =>      'Lu',
-       'Lowercase Letter'              =>      'Ll',
-       'Titlecase Letter'              =>      'Lt',
-       'Modifier Letter'               =>      'Lm',
-       'Other Letter'                  =>      'Lo',
+       'Uppercase_Letter'              =>      'Lu',
+       'Lowercase_Letter'              =>      'Ll',
+       'Titlecase_Letter'              =>      'Lt',
+       'Modifier_Letter'               =>      'Lm',
+       'Other_Letter'                  =>      'Lo',
 
        'Mark'                          =>      'M',
-       'Non-Spacing Mark'              =>      'Mn',
-       'Spacing Combining Mark'        =>      'Mc',
-       'Enclosing Mark'                =>      'Me',
+       'Non_Spacing_Mark'              =>      'Mn',
+       'Spacing_Mark'                  =>      'Mc',
+       'Enclosing_Mark'                =>      'Me',
 
        'Separator'                     =>      'Z',
-       'Space Separator'               =>      'Zs',
-       'Line Separator'                =>      'Zl',
-       'Paragraph Separator'           =>      'Zp',
+       'Space_Separator'               =>      'Zs',
+       'Line_Separator'                =>      'Zl',
+       'Paragraph_Separator'           =>      'Zp',
 
        'Number'                        =>      'N',
-       'Decimal Digit Number'          =>      'Nd',
-       'Letter Number'                 =>      'Nl',
-       'Other Number'                  =>      'No',
+       'Decimal_Number'                =>      'Nd',
+       'Letter_Number'                 =>      'Nl',
+       'Other_Number'                  =>      'No',
 
        'Punctuation'                   =>      'P',
-       'Connector Punctuation'         =>      'Pc',
-       'Dash Punctuation'              =>      'Pd',
-       'Open Punctuation'              =>      'Ps',
-       'Close Punctuation'             =>      'Pe',
-       'Initial Punctuation'           =>      'Pi',
-       'Final Punctuation'             =>      'Pf',
-       'Other Punctuation'             =>      'Po',
+       'Connector_Punctuation'         =>      'Pc',
+       'Dash_Punctuation'              =>      'Pd',
+       'Open_Punctuation'              =>      'Ps',
+       'Close_Punctuation'             =>      'Pe',
+       'Initial_Punctuation'           =>      'Pi',
+       'Final_Punctuation'             =>      'Pf',
+       'Other_Punctuation'             =>      'Po',
 
        'Symbol'                        =>      'S',
-       'Math Symbol'                   =>      'Sm',
-       'Currency Symbol'               =>      'Sc',
-       'Modifier Symbol'               =>      'Sk',
-       'Other Symbol'                  =>      'So',
+       'Math_Symbol'                   =>      'Sm',
+       'Currency_Symbol'               =>      'Sc',
+       'Modifier_Symbol'               =>      'Sk',
+       'Other_Symbol'                  =>      'So',
 
        'Other'                         =>      'C',
        'Control'                       =>      'Cc',
        'Format'                        =>      'Cf',
        'Surrogate'                     =>      'Cs',
        'Private Use'                   =>      'Co',
-       'Not Assigned'                  =>      'Cn',
-       # 'Other' aliases
-       'Other Control'                 =>      'Cc',
-       'Other Format'                  =>      'Cf',
-       'Other Surrogate'               =>      'Cs',
-       'Other Private Use'             =>      'Co',
-       'Other Not Assigned'            =>      'Cn',
+       'Unassigned'                    =>      'Cn',
 );
 
 #
@@ -682,5 +714,118 @@ my %Is = (
 
 mapping(\%Is, "Is");
 
+#
+# Read in the special cases.
+#
+
+my %Case;
+
+if (open(my $SpecCase, "SpecCase.txt")) {
+    while (<$SpecCase>) {
+       next unless /^[0-9A-Fa-f]+;/;
+       s/\#.*//;
+       s/\s+$//;
+
+       my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/);
+
+       if ($condition) { # not implemented yet
+           print "# SKIPPING $_\n";
+           next;
+       }
+
+       # Wait until all the special cases have been read since
+       # they are not listed in numeric order.
+       my $ix = hex($code);
+       push @{$Case{Lower}}, [ $ix, $code, $lower ];
+       push @{$Case{Title}}, [ $ix, $code, $title ];
+       push @{$Case{Upper}}, [ $ix, $code, $upper ];
+    }
+} else {
+    die "$0: SpecCase.txt: $!\n";
+}
+
+# Now write out the special cases properties in their code point order.
+# Prepend them to the To/{Upper,Lower,Title}.pl.
+
+for my $case (qw(Lower Title Upper)) {
+    my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n";
+    if (open(my $Case, ">To/$case.pl")) {
+       header($Case);
+       print $Case <<EOT;
+
+%utf8::ToSpec$case = (
+EOT
+        for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
+           my ($ix, $code, $to) = @$prop;
+           my $tostr =
+               join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
+           printf $Case qq['%04X' => "$tostr",\n], $ix;
+       }
+       print $Case <<EOT;
+);
+
+EOT
+       begin($Case);
+       print $Case $NormalCase;
+       end($Case);
+    } else {
+       die "$0: To/$case.txt: $!\n";
+    }
+}
+
+#
+# Read in the case foldings.
+#
+# We will do full case folding, C + F + I (see CaseFold.txt).
+#
+
+if (open(my $CaseFold, "CaseFold.txt")) {
+    my @Fold;
+    my %Fold;
+
+    while (<$CaseFold>) {
+       next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
+
+       my ($code, $status, $fold) = ($1, $2, $3);
+
+       if ($status eq 'C') { # Common: one-to-one folding
+           append(\@Fold, $code, $fold);
+       } else { # F: full, or I: dotted uppercase I -> dotless lowercase I
+           $Fold{hex($code)} = $fold;
+       }
+    }
+
+    flush(\@Fold, "To/Fold.pl");
+
+    #
+    # Prepend the special foldings to the common foldings.
+    #
+
+    my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
+    if (open(my $Fold, ">To/Fold.pl")) {
+       header($Fold);
+       print $Fold <<EOT;
+
+%utf8::ToSpecFold = (
+EOT
+        for my $code (sort { $a <=> $b } keys %Fold) {
+           my $foldstr =
+               join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
+           printf $Fold qq['%04X' => "$foldstr",\n], $code;
+       }
+       print $Fold <<EOT;
+);
+
+EOT
+       begin($Fold);
+       print $Fold $CommonFold;
+       end($Fold);
+    } else {
+       die "$0: To/Fold.pl: $!\n";
+    }
+} else {
+    die "$0: CaseFold.txt: $!\n";
+}
+
 # That's all, folks!