perl5db on miniperl

[p5sagit/p5-mst-13.2.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 16fc09f..006b9ef 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -1,20 +1,39 @@
+## !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
+## Any files created or read by this program should be listed in 'mktables.lst'
+
 #!/usr/bin/perl -w
+require 5.008; # Needs pack "U". Probably safest to run on 5.8.x
 use strict;
 use Carp;
+use File::Spec;
 
 ##
 ## mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
 ## from the Unicode database files (lib/unicore/*.txt).
 ##
 
-mkdir("lib", 0755);
-mkdir("To",  0755);
+## "Fuzzy" means this section in Unicode TR18:
+##
+##    The recommended names for UCD properties and property values are in
+##    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
+##    [PropValue]. There are both abbreviated names and longer, more
+##    descriptive names. It is strongly recommended that both names be
+##    recognized, and that loose matching of property names be used,
+##    whereby the case distinctions, whitespace, hyphens, and underbar
+##    are ignored.
+
+## Base names already used in lib/gc_sc (for avoiding 8.3 conflicts)
+my %BaseNames;
 
 ##
 ## Process any args.
 ##
 my $Verbose        = 0;
 my $MakeTestScript = 0;
+my $AlwaysWrite    = 0;
+my $UseDir         = "";
+my $FileList       = "$0.lst";
+my $MakeList       = 0;
 
 while (@ARGV)
 {
@@ -23,11 +42,114 @@ while (@ARGV)
         $Verbose = 1;
     } elsif ($arg eq '-q') {
         $Verbose = 0;
+    } elsif ($arg eq '-w') {
+        $AlwaysWrite = 1;      # update the files even if they havent changed
+        $FileList = "";
+    } elsif ($arg eq '-check') {
+        my $this = shift @ARGV;
+        my $ok = shift @ARGV;
+        if ($this ne $ok) {
+            print "Skipping as check params are not the same.\n";
+            exit(0);
+        }
     } elsif ($arg eq '-maketest') {
         $MakeTestScript = 1;
+    } elsif ($arg eq '-makelist') {
+        $MakeList = 1;        
+    } elsif ($arg eq '-C' && defined ($UseDir = shift)) {
+       -d $UseDir or die "Unknown directory '$UseDir'";
+    } elsif ($arg eq '-L' && defined ($FileList = shift)) {
+        -e $FileList or die "Filelist '$FileList' doesn't appear to exist!";
     } else {
-        die "usage: $0 [-v|-q] [-maketest]";
+        die "usage: $0 [-v|-q|-w|-C dir|-L filelist] [-maketest] [-makelist]\n",
+            "  -v          : Verbose Mode\n",
+            "  -q          : Quiet Mode\n",
+            "  -w          : Write files regardless\n",
+            "  -maketest   : Make test script\n",
+            "  -makelist   : Rewrite the file list based on current setup\n",
+            "  -L filelist : Use this file list, (defaults to $0)\n",
+            "  -C dir      : Change to this directory before proceeding\n",
+            "  -check A B  : Executes only if A and B are the same\n";   
+    }
+}
+
+if ($FileList) {
+    print "Reading file list '$FileList'\n"
+        if $Verbose;
+    open my $fh,"<",$FileList or die "Failed to read '$FileList':$!";
+    my @input;
+    my @output;
+    for my $list ( \@input, \@output ) {
+        while (<$fh>) {
+            s/^ \s+ | \s+ $//xg;
+            next if /^ \s* (?: \# .* )? $/x;
+            last if /^ =+ $/x;
+            my ( $file ) = split /\t/, $_;
+            push @$list, $file;
+        }
+        my %dupe;
+        @$list = grep !$dupe{ $_ }++, @$list;
+    }
+    close $fh;
+    die "No input or output files in '$FileList'!"
+        if !@input or !@output;
+    if ( $MakeList ) {
+        foreach my $file (@output) {
+            unlink $file;
+        }
+    }            
+    if ( $Verbose ) {
+        print "Expecting ".scalar( @input )." input files. ",
+              "Checking ".scalar( @output )." output files.\n";
     }
+    # we set maxtime to be the youngest input file, including $0 itself.
+    my $maxtime = -M $0; # do this before the chdir!
+    if ($UseDir) {
+        chdir $UseDir or die "Failed to chdir to '$UseDir':$!";
+    }
+    foreach my $in (@input) {
+        my $time = -M $in;
+        die "Missing input file '$in'" unless defined $time;
+        $maxtime = $time if $maxtime < $time;
+    }
+
+    # now we check to see if any output files are older than maxtime, if
+    # they are we need to continue on, otherwise we can presumably bail.
+    my $ok = 1;
+    foreach my $out (@output) {
+        if ( ! -e $out ) {
+            print "'$out' is missing.\n"
+                if $Verbose;
+            $ok = 0;
+            last;
+        }
+        if ( -M $out > $maxtime ) {
+            print "'$out' is too old.\n"
+                if $Verbose;
+            $ok = 0;
+            last;
+        }
+    }
+    if ($ok) {
+        print "Files seem to be ok, not bothering to rebuild.\n";
+        exit(0);
+    }
+    print "Must rebuild tables.\n"
+        if $Verbose;
+} else {
+    if ($Verbose) {
+        print "Not checking filelist.\n";
+    }
+    if ($UseDir) {
+        chdir $UseDir or die "Failed to chdir to '$UseDir':$!";
+    }
+}
+
+foreach my $lib ('To', 'lib',
+                map {File::Spec->catdir("lib",$_)}
+                qw(gc_sc dt bc hst ea jt lb nt ccc)) {
+  next if -d $lib;
+  mkdir $lib, 0755 or die "mkdir '$lib': $!";
 }
 
 my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
@@ -39,16 +161,27 @@ my $HEADER=<<"EOF";
 
 EOF
 
+sub force_unlink {
+    my $filename = shift;
+    return unless -e $filename;
+    return if CORE::unlink($filename);
+    # We might need write permission
+    chmod 0777, $filename;
+    CORE::unlink($filename) or die "Couldn't unlink $filename: $!\n";
+}
 
 ##
 ## Given a filename and a reference to an array of lines,
 ## write the lines to the file only if the contents have not changed.
+## Filename can be given as an arrayref of directory names
 ##
 sub WriteIfChanged($\@)
 {
     my $file  = shift;
     my $lines = shift;
 
+    $file = File::Spec->catfile(@$file) if ref $file;
+
     my $TextToWrite = join '', @$lines;
     if (open IN, $file) {
         local($/) = undef;
@@ -56,9 +189,10 @@ sub WriteIfChanged($\@)
         close IN;
         if ($PreviousText eq $TextToWrite) {
             print "$file unchanged.\n" if $Verbose;
-            return;
+            return unless $AlwaysWrite;
         }
     }
+    force_unlink ($file);
     if (not open OUT, ">$file") {
         die "$0: can't open $file for output: $!\n";
     }
@@ -106,6 +240,88 @@ sub CanonicalName($)
     return $name;
 }
 
+
+##
+## Store the alias definitions for later use.
+##
+my %PropertyAlias;
+my %PropValueAlias;
+
+my %PA_reverse;
+my %PVA_reverse;
+
+sub Build_Aliases()
+{
+    ##
+    ## Most of the work with aliases doesn't occur here,
+    ## but rather in utf8_heavy.pl, which uses PVA.pl,
+
+    # Placate the warnings about used only once. (They are used again, but
+    # via a typeglob lookup)
+    %utf8::PropertyAlias = ();
+    %utf8::PA_reverse = ();
+    %utf8::PropValueAlias = ();
+    %utf8::PVA_reverse = ();
+    %utf8::PVA_abbr_map = ();
+
+    open PA, "< PropertyAliases.txt"
+       or confess "Can't open PropertyAliases.txt: $!";
+    while (<PA>) {
+       s/#.*//;
+       s/\s+$//;
+       next if /^$/;
+
+       my ($abbrev, $name) = split /\s*;\s*/;
+        next if $abbrev eq "n/a";
+       $PropertyAlias{$abbrev} = $name;
+        $PA_reverse{$name} = $abbrev;
+
+       # The %utf8::... versions use japhy's code originally from utf8_pva.pl
+       # However, it's moved here so that we build the tables at runtime.
+       tr/ _-//d for $abbrev, $name;
+       $utf8::PropertyAlias{lc $abbrev} = $name;
+       $utf8::PA_reverse{lc $name} = $abbrev;
+    }
+    close PA;
+
+    open PVA, "< PropValueAliases.txt"
+       or confess "Can't open PropValueAliases.txt: $!";
+    while (<PVA>) {
+       s/#.*//;
+       s/\s+$//;
+       next if /^$/;
+
+       my ($prop, @data) = split /\s*;\s*/;
+
+       if ($prop eq 'ccc') {
+           $PropValueAlias{$prop}{$data[1]} = [ @data[0,2] ];
+           $PVA_reverse{$prop}{$data[2]} = [ @data[0,1] ];
+       }
+       else {
+            next if $data[0] eq "n/a";
+           $PropValueAlias{$prop}{$data[0]} = $data[1];
+            $PVA_reverse{$prop}{$data[1]} = $data[0];
+       }
+
+       shift @data if $prop eq 'ccc';
+       next if $data[0] eq "n/a";
+
+       $data[1] =~ tr/ _-//d;
+       $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1];
+       $utf8::PVA_reverse{$prop}{lc $data[1]} = $data[0];
+
+       my $abbr_class = ($prop eq 'gc' or $prop eq 'sc') ? 'gc_sc' : $prop;
+       $utf8::PVA_abbr_map{$abbr_class}{lc $data[0]} = $data[0];
+    }
+    close PVA;
+
+    # backwards compatibility for L& -> LC
+    $utf8::PropValueAlias{gc}{'l&'} = $utf8::PropValueAlias{gc}{lc};
+    $utf8::PVA_abbr_map{gc_sc}{'l&'} = $utf8::PVA_abbr_map{gc_sc}{lc};
+
+}
+
+
 ##
 ## Associates a property ("Greek", "Lu", "Assigned",...) with a Table.
 ##
@@ -182,32 +398,15 @@ sub Table::New
     return $Table;
 }
 
-##
-## Returns true if the Table has no code points
-##
-sub Table::IsEmpty
-{
-    my $Table = shift; #self
-    return not @$Table;
-}
-
-##
-## Returns true if the Table has code points
-##
-sub Table::NotEmpty
-{
-    my $Table = shift; #self
-    return @$Table;
-}
 
 ##
 ## Returns the maximum code point currently in the table.
 ##
 sub Table::Max
 {
-    my $Table = shift; #self
-    confess "oops" if $Table->IsEmpty; ## must have code points to have a max
-    return $Table->[-1]->[RANGE_END];
+    my $last = $_[0]->[-1];      ## last code point
+    confess "oops" unless $last; ## must have code points to have a max
+    return $last->[RANGE_END];
 }
 
 ##
@@ -226,6 +425,8 @@ sub Table::Replace($$)
 ## Given a new code point, make the last range of the Table extend to
 ## include the new (and all intervening) code points.
 ##
+## Takes the time to make sure that the extension is valid.
+##
 sub Table::Extend
 {
     my $Table = shift; #self
@@ -235,7 +436,21 @@ sub Table::Extend
 
     confess "oops ($codepoint <= $PrevMax)" if $codepoint <= $PrevMax;
 
-    $Table->[-1]->[RANGE_END] = $codepoint;
+    $Table->ExtendNoCheck($codepoint);
+}
+
+
+##
+## Given a new code point, make the last range of the Table extend to
+## include the new (and all intervening) code points.
+##
+## Does NOT check that the extension is valid.  Assumes that the caller
+## has already made this check.
+##
+sub Table::ExtendNoCheck
+{
+    ## Optmized adding: Assumes $Table and $codepoint as parms
+    $_[0]->[-1]->[RANGE_END] = $_[1];
 }
 
 ##
@@ -273,13 +488,14 @@ sub Table::Append
     ## If we've already got a range working, and this code point is the next
     ## one in line, and if the name is the same, just extend the current range.
     ##
-    if ($Table->NotEmpty
+    my $last = $Table->[-1];
+    if ($last
         and
-        $Table->Max == $codepoint - 1
+        $last->[RANGE_END] == $codepoint - 1
         and
-        $Table->[-1]->[RANGE_NAME] eq $name)
+        $last->[RANGE_NAME] eq $name)
     {
-        $Table->Extend($codepoint);
+        $Table->ExtendNoCheck($codepoint);
     }
     else
     {
@@ -377,7 +593,7 @@ sub Table::Merge
         if ($start > $New->Max) {
             $New->AppendRange($start, $end);
         } elsif ($end > $New->Max) {
-            $New->Extend($end);
+            $New->ExtendNoCheck($end);
         }
     }
 
@@ -387,6 +603,7 @@ sub Table::Merge
 ##
 ## Given a filename, write a representation of the Table to a file.
 ## May have an optional comment as a 2nd arg.
+## Filename may actually be an arrayref of directories
 ##
 sub Table::Write
 {
@@ -453,7 +670,7 @@ sub Table::InvalidCode
 {
     my $Table = shift; #self
 
-    return 0x1234 if $Table->IsEmpty();
+    return 0x1234 if not @$Table;
 
     for my $set (@$Table)
     {
@@ -508,7 +725,7 @@ sub New_Alias($$$@)
         if ($TableInfo{$Type}->{$CName}) {
             confess "$0: Use canonical form '$CName' instead of '$Name' for alias.";
         } else {
-            confess "$0: don't have orignial $Type => $Name to make alias";
+            confess "$0: don't have original $Type => $Name to make alias\n";
         }
     }
     if ($TableInfo{$Alias}) {
@@ -532,47 +749,86 @@ my $General  = Table->New(); ## all characters, grouped by category
 my %General;
 my %Cat;
 
+## Simple Data::Dumper alike. Good enough for our needs. We can't use the real
+## thing as we have to run under miniperl
+sub simple_dumper {
+    my @lines;
+    my $item;
+    foreach $item (@_) {
+       if (ref $item) {
+           if (ref $item eq 'ARRAY') {
+               push @lines, "[\n", simple_dumper (@$item), "],\n";
+           } elsif (ref $item eq 'HASH') {
+               push @lines, "{\n", simple_dumper (%$item), "},\n";
+           } else {
+               die "Can't cope with $item";
+           }
+       } else {
+           if (defined $item) {
+               my $copy = $item;
+               $copy =~ s/([\'\\])/\\$1/gs;
+               push @lines, "'$copy',\n";
+           } else {
+               push @lines, "undef,\n";
+           }
+       }
+    }
+    @lines;
+}
+
 ##
 ## Process UnicodeData.txt (Categories, etc.)
 ##
-sub Unicode_Txt()
+sub UnicodeData_Txt()
 {
     my $Bidi     = Table->New();
     my $Deco     = Table->New();
     my $Comb     = Table->New();
     my $Number   = Table->New();
-    my $Mirrored = Table->New(Is    => 'Mirrored',
-                              Desc  => "Mirrored in bidirectional text",
-                              Fuzzy => 0);
+    my $Mirrored = Table->New();#Is    => 'Mirrored',
+                              #Desc  => "Mirrored in bidirectional text",
+                              #Fuzzy => 0);
 
     my %DC;
     my %Bidi;
-    my %Deco;
-    $Deco{Canon}   = Table->New(Is    => 'Canon',
-                                Desc  => 'Decomposes to multiple characters',
-                                Fuzzy => 0);
-    $Deco{Compat}  = Table->New(Is    => 'Compat',
-                                Desc  => 'Compatible with a more-basic character',
-                                Fuzzy => 0);
+    my %Number;
+    $DC{can} = Table->New();
+    $DC{com} = Table->New();
 
     ## Initialize Perl-generated categories
     ## (Categories from UnicodeData.txt are auto-initialized in gencat)
-    $Cat{Alnum}  = Table->New(Is => 'Alnum',  Desc => "[[:Alnum:]]",  Fuzzy => 0);
-    $Cat{Alpha}  = Table->New(Is => 'Alpha',  Desc => "[[:Alpha:]]",  Fuzzy => 0);
-    $Cat{ASCII}  = Table->New(Is => 'ASCII',  Desc => "[[:ASCII:]]",  Fuzzy => 0);
-    $Cat{Blank}  = Table->New(Is => 'Blank',  Desc => "[[:Blank:]]",  Fuzzy => 0);
-    $Cat{Cntrl}  = Table->New(Is => 'Cntrl',  Desc => "[[:Cntrl:]]",  Fuzzy => 0);
-    $Cat{Digit}  = Table->New(Is => 'Digit',  Desc => "[[:Digit:]]",  Fuzzy => 0);
-    $Cat{Graph}  = Table->New(Is => 'Graph',  Desc => "[[:Graph:]]",  Fuzzy => 0);
-    $Cat{Lower}  = Table->New(Is => 'Lower',  Desc => "[[:Lower:]]",  Fuzzy => 0);
-    $Cat{Print}  = Table->New(Is => 'Print',  Desc => "[[:Print:]]",  Fuzzy => 0);
-    $Cat{Punct}  = Table->New(Is => 'Punct',  Desc => "[[:Punct:]]",  Fuzzy => 0);
-    $Cat{Space}  = Table->New(Is => 'Space',  Desc => "[[:Space:]]",  Fuzzy => 0);
-    $Cat{Title}  = Table->New(Is => 'Title',  Desc => "[[:Title:]]",  Fuzzy => 0);
-    $Cat{Upper}  = Table->New(Is => 'Upper',  Desc => "[[:Upper:]]",  Fuzzy => 0);
-    $Cat{XDigit} = Table->New(Is => 'XDigit', Desc => "[[:XDigit:]]", Fuzzy => 0);
-    $Cat{Word}   = Table->New(Is => 'Word',   Desc => "[[:Word:]]",   Fuzzy => 0);
-    $Cat{SpacePerl} = Table->New(Is => 'SpacePerl', Desc => '\s', Fuzzy => 0);
+    $Cat{Alnum}  =
+       Table->New(Is => 'Alnum',  Desc => "[[:Alnum:]]",  Fuzzy => 0);
+    $Cat{Alpha}  =
+       Table->New(Is => 'Alpha',  Desc => "[[:Alpha:]]",  Fuzzy => 0);
+    $Cat{ASCII}  =
+       Table->New(Is => 'ASCII',  Desc => "[[:ASCII:]]",  Fuzzy => 0);
+    $Cat{Blank}  =
+       Table->New(Is => 'Blank',  Desc => "[[:Blank:]]",  Fuzzy => 0);
+    $Cat{Cntrl}  =
+       Table->New(Is => 'Cntrl',  Desc => "[[:Cntrl:]]",  Fuzzy => 0);
+    $Cat{Digit}  =
+       Table->New(Is => 'Digit',  Desc => "[[:Digit:]]",  Fuzzy => 0);
+    $Cat{Graph}  =
+       Table->New(Is => 'Graph',  Desc => "[[:Graph:]]",  Fuzzy => 0);
+    $Cat{Lower}  =
+       Table->New(Is => 'Lower',  Desc => "[[:Lower:]]",  Fuzzy => 0);
+    $Cat{Print}  =
+       Table->New(Is => 'Print',  Desc => "[[:Print:]]",  Fuzzy => 0);
+    $Cat{Punct}  =
+       Table->New(Is => 'Punct',  Desc => "[[:Punct:]]",  Fuzzy => 0);
+    $Cat{Space}  =
+       Table->New(Is => 'Space',  Desc => "[[:Space:]]",  Fuzzy => 0);
+    $Cat{Title}  =
+       Table->New(Is => 'Title',  Desc => "[[:Title:]]",  Fuzzy => 0);
+    $Cat{Upper}  =
+       Table->New(Is => 'Upper',  Desc => "[[:Upper:]]",  Fuzzy => 0);
+    $Cat{XDigit} =
+       Table->New(Is => 'XDigit', Desc => "[[:XDigit:]]", Fuzzy => 0);
+    $Cat{Word}   =
+       Table->New(Is => 'Word',   Desc => "[[:Word:]]",   Fuzzy => 0);
+    $Cat{SpacePerl} =
+       Table->New(Is => 'SpacePerl', Desc => '\s', Fuzzy => 0);
 
     my %To;
     $To{Upper} = Table->New();
@@ -608,32 +864,35 @@ sub Unicode_Txt()
         ($General{$name} ||= Table->New)->$op($code, $name);
 
         # 005F: SPACING UNDERSCORE
-        $Cat{Word}->$op($code)  if $cat =~ /^[LMN]/ || $code == 0x005F;
-        $Cat{Alnum}->$op($code) if $cat =~ /^[LMN]/;
+        $Cat{Word}->$op($code)  if $cat =~ /^[LMN]|Pc/;
+        $Cat{Alnum}->$op($code) if $cat =~ /^[LM]|Nd/;
         $Cat{Alpha}->$op($code) if $cat =~ /^[LM]/;
 
-
-
-        $Cat{Space}->$op($code) if $cat  =~ /^Z/
+       my $isspace = 
+           ($cat =~ /Zs|Zl|Zp/ &&
+            $code != 0x200B) # 200B is ZWSP which is for line break control
+            # and therefore it is not part of "space" even while it is "Zs".
                                 || $code == 0x0009  # 0009: HORIZONTAL TAB
                                 || $code == 0x000A  # 000A: LINE FEED
                                 || $code == 0x000B  # 000B: VERTICAL TAB
                                 || $code == 0x000C  # 000C: FORM FEED
-                                || $code == 0x000D; # 000D: CARRIAGE RETURN
+                                || $code == 0x000D  # 000D: CARRIAGE RETURN
+                                || $code == 0x0085  # 0085: NEL
 
+           ;
 
-        $Cat{SpacePerl}->$op($code) if $cat =~ /^Z/
-                                    || $code == 0x0009 # 0009: HORIZONTAL TAB
-                                    || $code == 0x000A # 000A: LINE FEED
-                                    || $code == 0x000C # 000C: FORM FEED
-                                    || $code == 0x000D # 000D: CARRIAGE RETURN
-                                    || $code == 0x0085 # 0085: <NEXT LINE>
-                                    || $code == 0x2028 # 2028: LINE SEPARATOR
-                                    || $code == 0x2029;# 2029: PARAGRAPH SEP.
+        $Cat{Space}->$op($code) if $isspace;
 
-        $Cat{Blank}->$op($code) if $cat  =~ /^Z[^lp]$/
-                                || $code == 0x0009  # 0009: HORIZONTAL TAB
-                                || $code == 0x0020; # 0020: SPACE
+        $Cat{SpacePerl}->$op($code) if $isspace
+                                      && $code != 0x000B; # Backward compat.
+
+        $Cat{Blank}->$op($code) if $isspace
+                                && !($code == 0x000A ||
+                                    $code == 0x000B ||
+                                    $code == 0x000C ||
+                                    $code == 0x000D ||
+                                    $code == 0x0085 ||
+                                    $cat =~ /^Z[lp]/);
 
         $Cat{Digit}->$op($code) if $cat eq "Nd";
         $Cat{Upper}->$op($code) if $cat eq "Lu";
@@ -641,8 +900,9 @@ sub Unicode_Txt()
         $Cat{Title}->$op($code) if $cat eq "Lt";
         $Cat{ASCII}->$op($code) if $code <= 0x007F;
         $Cat{Cntrl}->$op($code) if $cat =~ /^C/;
-        $Cat{Graph}->$op($code) if $cat =~ /^([LMNPS]|Co)/;
-        $Cat{Print}->$op($code) if $cat =~ /^([LMNPS]|Co|Zs)/;
+       my $isgraph = !$isspace && $cat !~ /Cc|Cs|Cn/;
+        $Cat{Graph}->$op($code) if $isgraph;
+        $Cat{Print}->$op($code) if $isgraph || $isspace;
         $Cat{Punct}->$op($code) if $cat =~ /^P/;
 
         $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39)  ## 0..9
@@ -690,6 +950,11 @@ sub Unicode_Txt()
             $title,     ## titlecase mapping
               ) = split(/\s*;\s*/);
 
+       # Note that in Unicode 3.2 there will be names like
+       # LINE FEED (LF), which probably means that \N{} needs
+       # to cope also with LINE FEED and LF.
+       $name = $unicode10 if $name eq '<control>' && $unicode10 ne '';
+
         my $code = hex($hexcode);
 
         if ($comb and $comb == 230) {
@@ -729,11 +994,17 @@ sub Unicode_Txt()
             $Comb->Append($code, $comb) if $comb;
             $Number->Append($code, $number) if length $number;
 
+           length($decimal) and ($Number{De} ||= Table->New())->Append($code)
+             or
+           length($digit)   and ($Number{Di} ||= Table->New())->Append($code)
+             or
+           length($number)  and ($Number{Nu} ||= Table->New())->Append($code);
+
             $Mirrored->Append($code) if $mirrored eq "Y";
 
-            $Bidi{$bidi} ||= Table->New(Is    => "Bidi$bidi",
-                                        Desc  => "Bi-directional category '$bidi'",
-                                        Fuzzy => 0);
+            $Bidi{$bidi} ||= Table->New();#Is    => "bt/$bidi",
+                                        #Desc  => "Bi-directional category '$bidi'",
+                                        #Fuzzy => 0);
             $Bidi{$bidi}->Append($code);
 
             if ($deco)
@@ -741,16 +1012,15 @@ sub Unicode_Txt()
                 $Deco->Append($code, $deco);
                 if ($deco =~/^<(\w+)>/)
                 {
-                    $Deco{Compat}->Append($code);
+                   my $dshort = $PVA_reverse{dt}{ucfirst lc $1};
+                    $DC{com}->Append($code);
 
-                    $DC{$1} ||= Table->New(Is => "DC$1",
-                                           Desc  => "Compatible with '$1'",
-                                           Fuzzy => 0);
-                    $DC{$1}->Append($code);
+                    $DC{$dshort} ||= Table->New();
+                    $DC{$dshort}->Append($code);
                 }
                 else
                 {
-                    $Deco{Canon}->Append($code);
+                    $DC{can}->Append($code);
                 }
             }
         }
@@ -773,8 +1043,9 @@ sub Unicode_Txt()
     $Cat{C}->Replace($Cat{C}->Merge($Cat{Cn}));  ## Now merge in Cn into C
 
 
-    # L& is Ll, Lu, and Lt.
-    New_Prop(Is => 'L&',
+    # LC is Ll, Lu, and Lt.
+    # (used to be L& or L_, but PropValueAliases.txt defines it as LC)
+    New_Prop(Is => 'LC',
              Table->Merge(@Cat{qw[Ll Lu Lt]}),
              Desc  => '[\p{Ll}\p{Lu}\p{Lt}]',
              Fuzzy => 0);
@@ -840,14 +1111,62 @@ sub Unicode_Txt()
     ## Now dump the files.
     ##
     $Name->Write("Name.pl");
-    $Bidi->Write("Bidirectional.pl");
+
+    {
+       my @PVA = $HEADER;
+       foreach my $name (qw (PropertyAlias PA_reverse PropValueAlias
+                             PVA_reverse PVA_abbr_map)) {
+           # Should I really jump through typeglob hoops just to avoid a
+           # symbolic reference? (%{"utf8::$name})
+           push @PVA, "\n", "\%utf8::$name = (\n",
+               simple_dumper (%{$utf8::{$name}}), ");\n";
+       }
+       push @PVA, "1;\n";
+       WriteIfChanged("PVA.pl", @PVA);
+    }
+
+    # $Bidi->Write("Bidirectional.pl");
+    for (keys %Bidi) {
+       $Bidi{$_}->Write(
+           ["lib","bc","$_.pl"],
+           "BidiClass category '$PropValueAlias{bc}{$_}'"
+       );
+    }
+
     $Comb->Write("CombiningClass.pl");
+    for (keys %{ $PropValueAlias{ccc} }) {
+       my ($code, $name) = @{ $PropValueAlias{ccc}{$_} };
+       (my $c = Table->New())->Append($code);
+       $c->Write(
+           ["lib","ccc","$_.pl"],
+           "CombiningClass category '$name'"
+       );
+    }
+
     $Deco->Write("Decomposition.pl");
-    $Number->Write("Number.pl");
-    $General->Write("Category.pl");
+    for (keys %DC) {
+       $DC{$_}->Write(
+           ["lib","dt","$_.pl"],
+           "DecompositionType category '$PropValueAlias{dt}{$_}'"
+       );
+    }
+
+    # $Number->Write("Number.pl");
+    for (keys %Number) {
+       $Number{$_}->Write(
+           ["lib","nt","$_.pl"],
+           "NumericType category '$PropValueAlias{nt}{$_}'"
+       );
+    }
+
+    # $General->Write("Category.pl");
 
     for my $to (sort keys %To) {
-        $To{$to}->Write("To/$to.pl");
+        $To{$to}->Write(["To","$to.pl"]);
+    }
+
+    for (keys %{ $PropValueAlias{gc} }) {
+       New_Alias(Is => $PropValueAlias{gc}{$_}, SameAs => $_, Fuzzy => 1);
     }
 }
 
@@ -871,9 +1190,7 @@ sub LineBreak_Txt()
 
        $Lbrk->Append($first, $lbrk);
 
-        $Lbrk{$lbrk} ||= Table->New(Is    => "Lbrk$lbrk",
-                                    Desc  => "Linebreak category '$lbrk'",
-                                    Fuzzy => 0);
+        $Lbrk{$lbrk} ||= Table->New();
         $Lbrk{$lbrk}->Append($first);
 
        if ($last) {
@@ -883,7 +1200,15 @@ sub LineBreak_Txt()
     }
     close IN;
 
-    $Lbrk->Write("Lbrk.pl");
+    # $Lbrk->Write("Lbrk.pl");
+
+
+    for (keys %Lbrk) {
+       $Lbrk{$_}->Write(
+           ["lib","lb","$_.pl"],
+           "Linebreak category '$PropValueAlias{lb}{$_}'"
+       );
+    }
 }
 
 ##
@@ -898,6 +1223,8 @@ sub ArabicShaping_txt()
     my $ArabLink      = Table->New();
     my $ArabLinkGroup = Table->New();
 
+    my %JoinType;
+
     while (<IN>)
     {
        next unless /^[0-9A-Fa-f]+;/;
@@ -907,11 +1234,90 @@ sub ArabicShaping_txt()
         my $code = hex($hexcode);
        $ArabLink->Append($code, $link);
        $ArabLinkGroup->Append($code, $linkgroup);
+
+        $JoinType{$link} ||= Table->New(Is => "JoinType$link");
+        $JoinType{$link}->Append($code);
     }
     close IN;
 
-    $ArabLink->Write("ArabLink.pl");
-    $ArabLinkGroup->Write("ArabLnkGrp.pl");
+    # $ArabLink->Write("ArabLink.pl");
+    # $ArabLinkGroup->Write("ArabLnkGrp.pl");
+
+
+    for (keys %JoinType) {
+       $JoinType{$_}->Write(
+           ["lib","jt","$_.pl"],
+           "JoiningType category '$PropValueAlias{jt}{$_}'"
+       );
+    }
+}
+
+##
+## Process EastAsianWidth.txt.
+##
+sub EastAsianWidth_txt()
+{
+    if (not open IN, "EastAsianWidth.txt") {
+        die "$0: EastAsianWidth.txt: $!\n";
+    }
+
+    my %EAW;
+
+    while (<IN>)
+    {
+       next unless /^[0-9A-Fa-f]+(\.\.[0-9A-Fa-f]+)?;/;
+       s/#.*//;
+       s/\s+$//;
+
+       my ($hexcodes, $pv) = split(/\s*;\s*/);
+        $EAW{$pv} ||= Table->New(Is => "EastAsianWidth$pv");
+      my ($start, $end) = split(/\.\./, $hexcodes);
+      if (defined $end) {
+        $EAW{$pv}->AppendRange(hex($start), hex($end));
+      } else {
+        $EAW{$pv}->Append(hex($start));
+      }
+    }
+    close IN;
+
+
+    for (keys %EAW) {
+       $EAW{$_}->Write(
+           ["lib","ea","$_.pl"],
+           "EastAsianWidth category '$PropValueAlias{ea}{$_}'"
+       );
+    }
+}
+
+##
+## Process HangulSyllableType.txt.
+##
+sub HangulSyllableType_txt()
+{
+    if (not open IN, "HangulSyllableType.txt") {
+        die "$0: HangulSyllableType.txt: $!\n";
+    }
+
+    my %HST;
+
+    while (<IN>)
+    {
+        next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
+       my ($first, $last, $pv) = (hex($1), hex($2||""), $3);
+
+        $HST{$pv} ||= Table->New(Is => "HangulSyllableType$pv");
+        $HST{$pv}->Append($first);
+
+       if ($last) { $HST{$pv}->Extend($last) }
+    }
+    close IN;
+
+    for (keys %HST) {
+       $HST{$_}->Write(
+           ["lib","hst","$_.pl"],
+           "HangulSyllableType category '$PropValueAlias{hst}{$_}'"
+       );
+    }
 }
 
 ##
@@ -932,7 +1338,7 @@ sub Jamo_txt()
        $Short->Append($code, $short);
     }
     close IN;
-    $Short->Write("JamoShort.pl");
+    # $Short->Write("JamoShort.pl");
 }
 
 ##
@@ -975,7 +1381,7 @@ sub Scripts_txt()
         }
     }
 
-    $Scripts->Write("Scripts.pl");
+    # $Scripts->Write("Scripts.pl");
 
     ## Common is everything not explicitly assigned to a Script
     ##
@@ -1036,7 +1442,7 @@ sub Blocks_txt()
     }
     close IN;
 
-    $Blocks->Write("Blocks.pl");
+    # $Blocks->Write("Blocks.pl");
 }
 
 ##
@@ -1084,6 +1490,16 @@ sub PropList_txt()
         }
     }
 
+    for (keys %Prop) {
+       (my $file = $PA_reverse{$_}) =~ tr/_//d;
+       # XXX I'm assuming that the names from %Prop don't suffer 8.3 clashes.
+       $BaseNames{lc $file}++;
+       $Prop{$_}->Write(
+           ["lib","gc_sc","$file.pl"],
+           "Binary property '$_'"
+       );
+    }
+
     # Alphabetic is L and Other_Alphabetic.
     New_Prop(Is    => 'Alphabetic',
              Table->Merge($Cat{L}, $Prop{Other_Alphabetic}),
@@ -1121,65 +1537,6 @@ sub PropList_txt()
              Fuzzy => 1);
 }
 
-sub Make_GC_Aliases()
-{
-    ##
-    ## The mapping from General Category long forms to short forms is
-    ## currently hardwired here since no simple data file in the UCD
-    ## seems to do that.  Unicode 3.2 will assumedly correct this.
-    ##
-    my %Is = (
-       'Letter'                        =>      'L',
-       'Uppercase_Letter'              =>      'Lu',
-       'Lowercase_Letter'              =>      'Ll',
-       'Titlecase_Letter'              =>      'Lt',
-       'Modifier_Letter'               =>      'Lm',
-       'Other_Letter'                  =>      'Lo',
-
-       'Mark'                          =>      'M',
-       'Non_Spacing_Mark'              =>      'Mn',
-       'Spacing_Mark'                  =>      'Mc',
-       'Enclosing_Mark'                =>      'Me',
-
-       'Separator'                     =>      'Z',
-       'Space_Separator'               =>      'Zs',
-       'Line_Separator'                =>      'Zl',
-       'Paragraph_Separator'           =>      'Zp',
-
-       'Number'                        =>      'N',
-       'Decimal_Number'                =>      'Nd',
-       'Letter_Number'                 =>      'Nl',
-       'Other_Number'                  =>      'No',
-
-       'Punctuation'                   =>      'P',
-       'Connector_Punctuation'         =>      'Pc',
-       'Dash_Punctuation'              =>      'Pd',
-       'Open_Punctuation'              =>      'Ps',
-       'Close_Punctuation'             =>      'Pe',
-       'Initial_Punctuation'           =>      'Pi',
-       'Final_Punctuation'             =>      'Pf',
-       'Other_Punctuation'             =>      'Po',
-
-       'Symbol'                        =>      'S',
-       'Math_Symbol'                   =>      'Sm',
-       'Currency_Symbol'               =>      'Sc',
-       'Modifier_Symbol'               =>      'Sk',
-       'Other_Symbol'                  =>      'So',
-
-       'Other'                         =>      'C',
-       'Control'                       =>      'Cc',
-       'Format'                        =>      'Cf',
-       'Surrogate'                     =>      'Cs',
-       'Private Use'                   =>      'Co',
-       'Unassigned'                    =>      'Cn',
-    );
-
-    ## make the aliases....
-    while (my ($Alias, $Name) = each %Is) {
-        New_Alias(Is => $Alias, SameAs => $Name, Fuzzy => 1);
-    }
-}
-
 
 ##
 ## These are used in:
@@ -1283,6 +1640,7 @@ sub RandomlyFuzzifyName($;$)
 sub MakePropTestScript()
 {
     ## this written directly -- it's huge.
+    force_unlink ("TestProp.pl");
     if (not open OUT, ">TestProp.pl") {
         die "$0: TestProp.pl: $!\n";
     }
@@ -1385,8 +1743,6 @@ sub WriteAllMappings()
 {
     my @MAP;
 
-    my %BaseNames;  ## Base names already used (for avoiding 8.3 conflicts)
-
     ## 'Is' *MUST* come first, so its names have precidence over 'In's
     for my $Type ('Is', 'In')
     {
@@ -1402,7 +1758,9 @@ sub WriteAllMappings()
             my $filename;
             {
                 ## 'Is' items lose 'Is' from the basename.
-                $filename = $Type eq 'Is' ? $Name : "$Type$Name";
+                $filename = $Type eq 'Is' ?
+                   ($PVA_reverse{sc}{$Name} || $Name) :
+                   "$Type$Name";
 
                 $filename =~ s/[^\w_]+/_/g; # "L&" -> "L_"
                 substr($filename, 8) = '' if length($filename) > 8;
@@ -1475,7 +1833,7 @@ sub WriteAllMappings()
             ##
             ## Okay, write the file...
             ##
-            $Table->Write("lib/$filename.pl", $Comment);
+            $Table->Write(["lib","gc_sc","$filename.pl"], $Comment);
 
             ## and register it
             $RawNameToFile{$Name} = $filename;
@@ -1539,10 +1897,11 @@ sub WriteAllMappings()
                    "##\n",
                    "## Data in this file used by ../utf8_heavy.pl\n",
                    "##\n\n",
-                   "## Mapping from name to filename in ./lib\n",
+                   "## Mapping from name to filename in ./lib/gc_sc\n",
                    "%utf8::Exact = (\n",
                   );
 
+       $Exact{InGreek} = 'InGreekA';  # this is evil kludge
         for my $Name (sort keys %Exact)
         {
             my $File = $Exact{$Name};
@@ -1614,9 +1973,12 @@ sub SpecialCasing_txt()
         # Wait until all the special cases have been read since
         # they are not listed in numeric order.
         my $ix = hex($code);
-        push @{$CaseInfo{Lower}}, [ $ix, $code, $lower ];
-        push @{$CaseInfo{Title}}, [ $ix, $code, $title ];
-        push @{$CaseInfo{Upper}}, [ $ix, $code, $upper ];
+        push @{$CaseInfo{Lower}}, [ $ix, $code, $lower ]
+           unless $code eq $lower;
+        push @{$CaseInfo{Title}}, [ $ix, $code, $title ]
+           unless $code eq $title;
+        push @{$CaseInfo{Upper}}, [ $ix, $code, $upper ]
+           unless $code eq $upper;
     }
     close IN;
 
@@ -1627,16 +1989,22 @@ sub SpecialCasing_txt()
     {
         my $NormalCase = do "To/$case.pl" || die "$0: $@\n";
 
-        my @OUT = (
-                   $HEADER, "\n",
-                   "%utf8::ToSpec$case =\n(\n",
-                   );
+        my @OUT =
+           (
+            $HEADER, "\n",
+            "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
+            "%utf8::ToSpec$case =\n(\n",
+           );
 
         for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) {
             my ($ix, $code, $to) = @$prop;
             my $tostr =
               join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
-            push @OUT, sprintf qq['%04X' => "$tostr",\n], $ix;
+            push @OUT, sprintf qq["%s" => "$tostr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $ix)));
+           # Remove any single-character mappings for
+           # the same character since we are going for
+           # the special casing rules.
+           $NormalCase =~ s/^$code\t\t\w+\n//m;
         }
         push @OUT, (
                     ");\n\n",
@@ -1644,7 +2012,7 @@ sub SpecialCasing_txt()
                     $NormalCase,
                     "END\n"
                     );
-        WriteIfChanged("To/$case.pl", @OUT);
+        WriteIfChanged(["To","$case.pl"], @OUT);
     }
 }
 
@@ -1684,14 +2052,16 @@ sub CaseFolding_txt()
     #
     my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
 
-    my @OUT = (
-               $HEADER, "\n",
-               "%utf8::ToSpecFold =\n(\n",
-              );
+    my @OUT =
+       (
+        $HEADER, "\n",
+        "#  The ke UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
+        "%utf8::ToSpecFold =\n(\n",
+       );
     for my $code (sort { $a <=> $b } keys %Fold) {
         my $foldstr =
           join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
-        push @OUT, sprintf qq['%04X' => "$foldstr",\n], $code;
+        push @OUT, sprintf qq["%s" => "$foldstr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $code)));
     }
     push @OUT, (
                 ");\n\n",
@@ -1700,13 +2070,13 @@ sub CaseFolding_txt()
                 "END\n",
                );
 
-    WriteIfChanged("To/Fold.pl", @OUT);
+    WriteIfChanged(["To","Fold.pl"], @OUT);
 }
 
 ## Do it....
 
-Unicode_Txt();
-Make_GC_Aliases();
+Build_Aliases();
+UnicodeData_Txt();
 PropList_txt();
 
 Scripts_txt();
@@ -1716,10 +2086,69 @@ WriteAllMappings();
 
 LineBreak_Txt();
 ArabicShaping_txt();
+EastAsianWidth_txt();
+HangulSyllableType_txt();
 Jamo_txt();
 SpecialCasing_txt();
 CaseFolding_txt();
 
+if ( $FileList and $MakeList ) {
+    
+    print "Updating '$FileList'\n"
+        if ($Verbose);
+        
+    open my $ofh,">",$FileList 
+        or die "Can't write to '$FileList':$!";
+    print $ofh <<"EOFHEADER";
+#
+# mktables.lst -- File list for mktables.
+#
+#   Autogenerated on @{[scalar localtime]}
+#
+# - First section is input files
+#   (mktables itself is automatically included)
+# - Section seperator is /^=+\$/
+# - Second section is a list of output files.
+# - Lines matching /^\\s*#/ are treated as comments
+#   which along with blank lines are ignored.
+#
+
+# Input files:
+
+EOFHEADER
+    my @input=("version",glob('*.txt'));
+    print $ofh "$_\n" for 
+        @input,
+        "\n=================================\n",
+        "# Output files:\n",
+        # special files
+        "Properties";
+        
+    
+    require File::Find;
+    my $count=0;
+    File::Find::find({
+        no_chdir=>1,
+        wanted=>sub {
+          if (/\.pl$/) {
+            s!^\./!!;
+            print $ofh "$_\n";
+            $count++;
+          }
+        },
+    },"."); 
+    
+    print $ofh "\n# ",scalar(@input)," input files\n",
+               "# ",scalar($count+1)," output files\n\n",
+               "# End list\n";  
+    close $ofh 
+        or warn "Failed to close $ofh: $!";
+    
+    print "Filelist has ",scalar(@input)," input files and ",
+          scalar($count+1)," output files\n"
+        if $Verbose;
+}
+print "All done\n" if $Verbose;
 exit(0);
 
 ## TRAILING CODE IS USED BY MakePropTestScript()