candidate for TR18 compliance
[p5sagit/p5-mst-13.2.git] / lib / unicore / mktables
index 083bc58..e4020ec 100644 (file)
@@ -2,15 +2,13 @@
 use strict;
 use Carp;
 
-die "$0: Please run me as ./mktables to avoid unnecessary differences\n"
-    unless $0 eq "./mktables";
-
 ##
 ## mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
 ## from the Unicode database files (lib/unicore/*.txt).
 ##
 
 mkdir("lib", 0755);
+mkdir("lib/gc_sc", 0755);
 mkdir("To",  0755);
 
 ##
@@ -109,6 +107,62 @@ sub CanonicalName($)
     return $name;
 }
 
+
+##
+## Store the alias definitions for later use.
+##
+my %PropertyAlias;
+my %PropValueAlias;
+
+my %PA_reverse;
+my %PVA_reverse;
+
+sub Build_Aliases()
+{
+    ##
+    ## Most of the work with aliases doesn't occur here,
+    ## but rather in utf8_heavy.pl, which uses utf8_pva.pl,
+    ## which contains just this function.  However, this one
+    ## 
+    ##   -- japhy (2004/04/13)
+
+    open PA, "< PropertyAliases.txt"
+       or confess "Can't open PropertyAliases.txt: $!";
+    while (<PA>) {
+       s/#.*//;
+       s/\s+$//;
+       next if /^$/;
+
+       my ($abbrev, $name) = split /\s*;\s*/;
+        next if $abbrev eq "n/a";
+       $PropertyAlias{$abbrev} = $name;
+        $PA_reverse{$name} = $abbrev;
+    }
+    close PA;
+
+    open PVA, "< PropValueAliases.txt"
+       or confess "Can't open PropValueAliases.txt: $!";
+    while (<PVA>) {
+       s/#.*//;
+       s/\s+$//;
+       next if /^$/;
+
+       my ($prop, @data) = split /\s*;\s*/;
+
+       if ($prop eq 'ccc') {
+           $PropValueAlias{$prop}{$data[1]} = [ @data[0,2] ];
+           $PVA_reverse{$prop}{$data[2]} = [ @data[0,1] ];
+       }
+       else {
+            next if $data[0] eq "n/a";
+           $PropValueAlias{$prop}{$data[0]} = $data[1];
+            $PVA_reverse{$prop}{$data[1]} = $data[0];
+       }
+    }
+    close PVA;
+}
+
+
 ##
 ## Associates a property ("Greek", "Lu", "Assigned",...) with a Table.
 ##
@@ -511,7 +565,7 @@ sub New_Alias($$$@)
         if ($TableInfo{$Type}->{$CName}) {
             confess "$0: Use canonical form '$CName' instead of '$Name' for alias.";
         } else {
-            confess "$0: don't have orignial $Type => $Name to make alias";
+            confess "$0: don't have original $Type => $Name to make alias\n";
         }
     }
     if ($TableInfo{$Alias}) {
@@ -544,19 +598,15 @@ sub UnicodeData_Txt()
     my $Deco     = Table->New();
     my $Comb     = Table->New();
     my $Number   = Table->New();
-    my $Mirrored = Table->New(Is    => 'Mirrored',
-                              Desc  => "Mirrored in bidirectional text",
-                              Fuzzy => 0);
+    my $Mirrored = Table->New();#Is    => 'Mirrored',
+                              #Desc  => "Mirrored in bidirectional text",
+                              #Fuzzy => 0);
 
     my %DC;
     my %Bidi;
-    my %Deco;
-    $Deco{Canon}   = Table->New(Is    => 'Canon',
-                                Desc  => 'Decomposes to multiple characters',
-                                Fuzzy => 0);
-    $Deco{Compat}  = Table->New(Is    => 'Compat',
-                                Desc  => 'Compatible with a more-basic character',
-                                Fuzzy => 0);
+    my %Number;
+    $DC{can} = Table->New();
+    $DC{com} = Table->New();
 
     ## Initialize Perl-generated categories
     ## (Categories from UnicodeData.txt are auto-initialized in gencat)
@@ -627,32 +677,35 @@ sub UnicodeData_Txt()
         ($General{$name} ||= Table->New)->$op($code, $name);
 
         # 005F: SPACING UNDERSCORE
-        $Cat{Word}->$op($code)  if $cat =~ /^[LMN]/ || $code == 0x005F;
+        $Cat{Word}->$op($code)  if $cat =~ /^[LMN]|Pc/;
         $Cat{Alnum}->$op($code) if $cat =~ /^[LM]|Nd/;
         $Cat{Alpha}->$op($code) if $cat =~ /^[LM]/;
 
-
-
-        $Cat{Space}->$op($code) if $cat  =~ /^Z/
+       my $isspace = 
+           ($cat =~ /Zs|Zl|Zp/ &&
+            $code != 0x200B) # 200B is ZWSP which is for line break control
+            # and therefore it is not part of "space" even while it is "Zs".
                                 || $code == 0x0009  # 0009: HORIZONTAL TAB
                                 || $code == 0x000A  # 000A: LINE FEED
                                 || $code == 0x000B  # 000B: VERTICAL TAB
                                 || $code == 0x000C  # 000C: FORM FEED
                                 || $code == 0x000D  # 000D: CARRIAGE RETURN
-                                || $code == 0x0085; # 0085: NEL
+                                || $code == 0x0085  # 0085: NEL
 
+           ;
 
-        $Cat{SpacePerl}->$op($code) if $cat =~ /^Z/
-                                    || $code == 0x0009 # 0009: HORIZONTAL TAB
-                                    || $code == 0x000A # 000A: LINE FEED
-                                    || $code == 0x000C # 000C: FORM FEED
-                                    || $code == 0x000D # 000D: CARRIAGE RETURN
-                                    || $code == 0x0085 # 0085: <NEXT LINE>
-                                    || $code == 0x2028 # 2028: LINE SEPARATOR
-                                    || $code == 0x2029;# 2029: PARAGRAPH SEP.
+        $Cat{Space}->$op($code) if $isspace;
 
-        $Cat{Blank}->$op($code) if $cat eq "Zs"
-                                || $code == 0x0009; # 0009: HORIZONTAL TAB
+        $Cat{SpacePerl}->$op($code) if $isspace
+                                      && $code != 0x000B; # Backward compat.
+
+        $Cat{Blank}->$op($code) if $isspace
+                                && !($code == 0x000A ||
+                                    $code == 0x000B ||
+                                    $code == 0x000C ||
+                                    $code == 0x000D ||
+                                    $code == 0x0085 ||
+                                    $cat =~ /^Z[lp]/);
 
         $Cat{Digit}->$op($code) if $cat eq "Nd";
         $Cat{Upper}->$op($code) if $cat eq "Lu";
@@ -660,9 +713,9 @@ sub UnicodeData_Txt()
         $Cat{Title}->$op($code) if $cat eq "Lt";
         $Cat{ASCII}->$op($code) if $code <= 0x007F;
         $Cat{Cntrl}->$op($code) if $cat =~ /^C/;
-        $Cat{Graph}->$op($code) if $cat !~ /Zs|Cc|Cs|Cn/;
-        $Cat{Print}->$op($code) if $cat =~ /^[LMNPS]/
-                                || $cat eq "Zs";
+       my $isgraph = !$isspace && $cat !~ /Cc|Cs|Cn/;
+        $Cat{Graph}->$op($code) if $isgraph;
+        $Cat{Print}->$op($code) if $isgraph || $isspace;
         $Cat{Punct}->$op($code) if $cat =~ /^P/;
 
         $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39)  ## 0..9
@@ -754,11 +807,17 @@ sub UnicodeData_Txt()
             $Comb->Append($code, $comb) if $comb;
             $Number->Append($code, $number) if length $number;
 
+           length($decimal) and ($Number{De} ||= Table->New())->Append($code)
+             or
+           length($digit)   and ($Number{Di} ||= Table->New())->Append($code)
+             or
+           length($number)  and ($Number{Nu} ||= Table->New())->Append($code);
+
             $Mirrored->Append($code) if $mirrored eq "Y";
 
-            $Bidi{$bidi} ||= Table->New(Is    => "Bidi$bidi",
-                                        Desc  => "Bi-directional category '$bidi'",
-                                        Fuzzy => 0);
+            $Bidi{$bidi} ||= Table->New();#Is    => "bt/$bidi",
+                                        #Desc  => "Bi-directional category '$bidi'",
+                                        #Fuzzy => 0);
             $Bidi{$bidi}->Append($code);
 
             if ($deco)
@@ -766,16 +825,15 @@ sub UnicodeData_Txt()
                 $Deco->Append($code, $deco);
                 if ($deco =~/^<(\w+)>/)
                 {
-                    $Deco{Compat}->Append($code);
+                   my $dshort = $PVA_reverse{dt}{ucfirst lc $1};
+                    $DC{com}->Append($code);
 
-                    $DC{$1} ||= Table->New(Is => "DC$1",
-                                           Desc  => "Compatible with '$1'",
-                                           Fuzzy => 0);
-                    $DC{$1}->Append($code);
+                    $DC{$dshort} ||= Table->New();
+                    $DC{$dshort}->Append($code);
                 }
                 else
                 {
-                    $Deco{Canon}->Append($code);
+                    $DC{can}->Append($code);
                 }
             }
         }
@@ -798,8 +856,9 @@ sub UnicodeData_Txt()
     $Cat{C}->Replace($Cat{C}->Merge($Cat{Cn}));  ## Now merge in Cn into C
 
 
-    # L& is Ll, Lu, and Lt.
-    New_Prop(Is => 'L&',
+    # LC is Ll, Lu, and Lt.
+    # (used to be L& or L_, but PropValueAliases.txt defines it as LC)
+    New_Prop(Is => 'LC',
              Table->Merge(@Cat{qw[Ll Lu Lt]}),
              Desc  => '[\p{Ll}\p{Lu}\p{Lt}]',
              Fuzzy => 0);
@@ -865,15 +924,54 @@ sub UnicodeData_Txt()
     ## Now dump the files.
     ##
     $Name->Write("Name.pl");
-    $Bidi->Write("Bidirectional.pl");
+
+    # $Bidi->Write("Bidirectional.pl");
+    mkdir("lib/bc", 0755);
+    for (keys %Bidi) {
+       $Bidi{$_}->Write(
+           "lib/bc/$_.pl",
+           "BidiClass category '$PropValueAlias{bc}{$_}'"
+       );
+    }
+
     $Comb->Write("CombiningClass.pl");
+    mkdir("lib/ccc", 0755);
+    for (keys %{ $PropValueAlias{ccc} }) {
+       my ($code, $name) = @{ $PropValueAlias{ccc}{$_} };
+       (my $c = Table->New())->Append($code);
+       $c->Write(
+           "lib/ccc/$_.pl",
+           "CombiningClass category '$name'"
+       );
+    }
+
     $Deco->Write("Decomposition.pl");
-    $Number->Write("Number.pl");
-    $General->Write("Category.pl");
+    mkdir("lib/dt", 0755);
+    for (keys %DC) {
+       $DC{$_}->Write(
+           "lib/dt/$_.pl",
+           "DecompositionType category '$PropValueAlias{dt}{$_}'"
+       );
+    }
+
+    # $Number->Write("Number.pl");
+    mkdir("lib/nt", 0755);
+    for (keys %Number) {
+       $Number{$_}->Write(
+           "lib/nt/$_.pl",
+           "NumericType category '$PropValueAlias{nt}{$_}'"
+       );
+    }
+
+    # $General->Write("Category.pl");
 
     for my $to (sort keys %To) {
         $To{$to}->Write("To/$to.pl");
     }
+
+    for (keys %{ $PropValueAlias{gc} }) {
+       New_Alias(Is => $PropValueAlias{gc}{$_}, SameAs => $_, Fuzzy => 1);
+    }
 }
 
 ##
@@ -896,9 +994,7 @@ sub LineBreak_Txt()
 
        $Lbrk->Append($first, $lbrk);
 
-        $Lbrk{$lbrk} ||= Table->New(Is    => "Lbrk$lbrk",
-                                    Desc  => "Linebreak category '$lbrk'",
-                                    Fuzzy => 0);
+        $Lbrk{$lbrk} ||= Table->New();
         $Lbrk{$lbrk}->Append($first);
 
        if ($last) {
@@ -908,7 +1004,16 @@ sub LineBreak_Txt()
     }
     close IN;
 
-    $Lbrk->Write("Lbrk.pl");
+    # $Lbrk->Write("Lbrk.pl");
+
+    mkdir("lib/lb", 0755);
+
+    for (keys %Lbrk) {
+       $Lbrk{$_}->Write(
+           "lib/lb/$_.pl",
+           "Linebreak category '$PropValueAlias{lb}{$_}'"
+       );
+    }
 }
 
 ##
@@ -923,6 +1028,8 @@ sub ArabicShaping_txt()
     my $ArabLink      = Table->New();
     my $ArabLinkGroup = Table->New();
 
+    my %JoinType;
+
     while (<IN>)
     {
        next unless /^[0-9A-Fa-f]+;/;
@@ -932,11 +1039,90 @@ sub ArabicShaping_txt()
         my $code = hex($hexcode);
        $ArabLink->Append($code, $link);
        $ArabLinkGroup->Append($code, $linkgroup);
+
+        $JoinType{$link} ||= Table->New(Is => "JoinType$link");
+        $JoinType{$link}->Append($code);
     }
     close IN;
 
-    $ArabLink->Write("ArabLink.pl");
-    $ArabLinkGroup->Write("ArabLnkGrp.pl");
+    # $ArabLink->Write("ArabLink.pl");
+    # $ArabLinkGroup->Write("ArabLnkGrp.pl");
+
+    mkdir("lib/jt", 0755);
+
+    for (keys %JoinType) {
+       $JoinType{$_}->Write(
+           "lib/jt/$_.pl",
+           "JoiningType category '$PropValueAlias{jt}{$_}'"
+       );
+    }
+}
+
+##
+## Process EastAsianWidth.txt.
+##
+sub EastAsianWidth_txt()
+{
+    if (not open IN, "EastAsianWidth.txt") {
+        die "$0: EastAsianWidth.txt: $!\n";
+    }
+
+    my %EAW;
+
+    while (<IN>)
+    {
+       next unless /^[0-9A-Fa-f]+;/;
+       s/#.*//;
+       s/\s+$//;
+
+       my ($hexcode, $pv) = split(/\s*;\s*/);
+        my $code = hex($hexcode);
+        $EAW{$pv} ||= Table->New(Is => "EastAsianWidth$pv");
+        $EAW{$pv}->Append($code);
+    }
+    close IN;
+
+    mkdir("lib/ea", 0755);
+
+    for (keys %EAW) {
+       $EAW{$_}->Write(
+           "lib/ea/$_.pl",
+           "EastAsianWidth category '$PropValueAlias{ea}{$_}'"
+       );
+    }
+}
+
+##
+## Process HangulSyllableType.txt.
+##
+sub HangulSyllableType_txt()
+{
+    if (not open IN, "HangulSyllableType.txt") {
+        die "$0: HangulSyllableType.txt: $!\n";
+    }
+
+    my %HST;
+
+    while (<IN>)
+    {
+        next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
+       my ($first, $last, $pv) = (hex($1), hex($2||""), $3);
+
+        $HST{$pv} ||= Table->New(Is => "HangulSyllableType$pv");
+        $HST{$pv}->Append($first);
+
+       if ($last) { $HST{$pv}->Extend($last) }
+    }
+    close IN;
+
+    mkdir("lib/hst", 0755);
+
+    for (keys %HST) {
+       $HST{$_}->Write(
+           "lib/hst/$_.pl",
+           "HangulSyllableType category '$PropValueAlias{hst}{$_}'"
+       );
+    }
 }
 
 ##
@@ -957,7 +1143,7 @@ sub Jamo_txt()
        $Short->Append($code, $short);
     }
     close IN;
-    $Short->Write("JamoShort.pl");
+    # $Short->Write("JamoShort.pl");
 }
 
 ##
@@ -1000,7 +1186,7 @@ sub Scripts_txt()
         }
     }
 
-    $Scripts->Write("Scripts.pl");
+    # $Scripts->Write("Scripts.pl");
 
     ## Common is everything not explicitly assigned to a Script
     ##
@@ -1061,7 +1247,7 @@ sub Blocks_txt()
     }
     close IN;
 
-    $Blocks->Write("Blocks.pl");
+    # $Blocks->Write("Blocks.pl");
 }
 
 ##
@@ -1109,6 +1295,14 @@ sub PropList_txt()
         }
     }
 
+    for (keys %Prop) {
+       (my $file = $PA_reverse{$_}) =~ tr/_//d;
+       $Prop{$_}->Write(
+           "lib/gc_sc/$file.pl",
+           "Binary property '$_'"
+       );
+    }
+
     # Alphabetic is L and Other_Alphabetic.
     New_Prop(Is    => 'Alphabetic',
              Table->Merge($Cat{L}, $Prop{Other_Alphabetic}),
@@ -1146,65 +1340,6 @@ sub PropList_txt()
              Fuzzy => 1);
 }
 
-sub Make_GC_Aliases()
-{
-    ##
-    ## The mapping from General Category long forms to short forms is
-    ## currently hardwired here since no simple data file in the UCD
-    ## seems to do that.  Unicode 3.2 will assumedly correct this.
-    ##
-    my %Is = (
-       'Letter'                        =>      'L',
-       'Uppercase_Letter'              =>      'Lu',
-       'Lowercase_Letter'              =>      'Ll',
-       'Titlecase_Letter'              =>      'Lt',
-       'Modifier_Letter'               =>      'Lm',
-       'Other_Letter'                  =>      'Lo',
-
-       'Mark'                          =>      'M',
-       'Non_Spacing_Mark'              =>      'Mn',
-       'Spacing_Mark'                  =>      'Mc',
-       'Enclosing_Mark'                =>      'Me',
-
-       'Separator'                     =>      'Z',
-       'Space_Separator'               =>      'Zs',
-       'Line_Separator'                =>      'Zl',
-       'Paragraph_Separator'           =>      'Zp',
-
-       'Number'                        =>      'N',
-       'Decimal_Number'                =>      'Nd',
-       'Letter_Number'                 =>      'Nl',
-       'Other_Number'                  =>      'No',
-
-       'Punctuation'                   =>      'P',
-       'Connector_Punctuation'         =>      'Pc',
-       'Dash_Punctuation'              =>      'Pd',
-       'Open_Punctuation'              =>      'Ps',
-       'Close_Punctuation'             =>      'Pe',
-       'Initial_Punctuation'           =>      'Pi',
-       'Final_Punctuation'             =>      'Pf',
-       'Other_Punctuation'             =>      'Po',
-
-       'Symbol'                        =>      'S',
-       'Math_Symbol'                   =>      'Sm',
-       'Currency_Symbol'               =>      'Sc',
-       'Modifier_Symbol'               =>      'Sk',
-       'Other_Symbol'                  =>      'So',
-
-       'Other'                         =>      'C',
-       'Control'                       =>      'Cc',
-       'Format'                        =>      'Cf',
-       'Surrogate'                     =>      'Cs',
-       'Private Use'                   =>      'Co',
-       'Unassigned'                    =>      'Cn',
-    );
-
-    ## make the aliases....
-    while (my ($Alias, $Name) = each %Is) {
-        New_Alias(Is => $Alias, SameAs => $Name, Fuzzy => 1);
-    }
-}
-
 
 ##
 ## These are used in:
@@ -1427,7 +1562,9 @@ sub WriteAllMappings()
             my $filename;
             {
                 ## 'Is' items lose 'Is' from the basename.
-                $filename = $Type eq 'Is' ? $Name : "$Type$Name";
+                $filename = $Type eq 'Is' ?
+                   ($PVA_reverse{sc}{$Name} || $Name) :
+                   "$Type$Name";
 
                 $filename =~ s/[^\w_]+/_/g; # "L&" -> "L_"
                 substr($filename, 8) = '' if length($filename) > 8;
@@ -1500,7 +1637,7 @@ sub WriteAllMappings()
             ##
             ## Okay, write the file...
             ##
-            $Table->Write("lib/$filename.pl", $Comment);
+            $Table->Write("lib/gc_sc/$filename.pl", $Comment);
 
             ## and register it
             $RawNameToFile{$Name} = $filename;
@@ -1564,10 +1701,11 @@ sub WriteAllMappings()
                    "##\n",
                    "## Data in this file used by ../utf8_heavy.pl\n",
                    "##\n\n",
-                   "## Mapping from name to filename in ./lib\n",
+                   "## Mapping from name to filename in ./lib/gc_sc\n",
                    "%utf8::Exact = (\n",
                   );
 
+       $Exact{InGreek} = 'InGreekA';  # this is evil kludge
         for my $Name (sort keys %Exact)
         {
             my $File = $Exact{$Name};
@@ -1655,16 +1793,18 @@ sub SpecialCasing_txt()
     {
         my $NormalCase = do "To/$case.pl" || die "$0: $@\n";
 
-        my @OUT = (
-                   $HEADER, "\n",
-                   "%utf8::ToSpec$case =\n(\n",
-                   );
+        my @OUT =
+           (
+            $HEADER, "\n",
+            "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
+            "%utf8::ToSpec$case =\n(\n",
+           );
 
         for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) {
             my ($ix, $code, $to) = @$prop;
             my $tostr =
               join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
-            push @OUT, sprintf qq['%04X' => "$tostr",\n], $ix;
+            push @OUT, sprintf qq["%s" => "$tostr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $ix)));
            # Remove any single-character mappings for
            # the same character since we are going for
            # the special casing rules.
@@ -1716,14 +1856,16 @@ sub CaseFolding_txt()
     #
     my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
 
-    my @OUT = (
-               $HEADER, "\n",
-               "%utf8::ToSpecFold =\n(\n",
-              );
+    my @OUT =
+       (
+        $HEADER, "\n",
+        "#  The ke UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
+        "%utf8::ToSpecFold =\n(\n",
+       );
     for my $code (sort { $a <=> $b } keys %Fold) {
         my $foldstr =
           join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
-        push @OUT, sprintf qq['%04X' => "$foldstr",\n], $code;
+        push @OUT, sprintf qq["%s" => "$foldstr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $code)));
     }
     push @OUT, (
                 ");\n\n",
@@ -1737,8 +1879,8 @@ sub CaseFolding_txt()
 
 ## Do it....
 
+Build_Aliases();
 UnicodeData_Txt();
-Make_GC_Aliases();
 PropList_txt();
 
 Scripts_txt();
@@ -1748,6 +1890,8 @@ WriteAllMappings();
 
 LineBreak_Txt();
 ArabicShaping_txt();
+EastAsianWidth_txt();
+HangulSyllableType_txt();
 Jamo_txt();
 SpecialCasing_txt();
 CaseFolding_txt();