Regen Unicode tables to include a warning:

[p5sagit/p5-mst-13.2.git] / lib / unicode / mktables.PL
diff --git a/lib/unicode/mktables.PL b/lib/unicode/mktables.PL

index 0b3a8e0..a6e3a5e 100755 (executable)
--- a/lib/unicode/mktables.PL
+++ b/lib/unicode/mktables.PL
@@ -181,6 +181,11 @@ foreach $file (@todo) {
     else {
        open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
     }
+    print OUT <<EOH;
+# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+EOH
     print OUT <<"END";
 return <<'END';
 END
@@ -193,8 +198,13 @@ END
 
 exit if @ARGV and not grep { $_ eq Block } @ARGV;
 print "Block\n";
-open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n";
+open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n";
 open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
+print OUT <<EOH;
+# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+EOH
 print OUT <<"END";
 return <<'END';
 END
@@ -208,6 +218,11 @@ while (<UD>) {
        print OUT "$code        $last   $name\n";
        $name =~ s/\s+//g;
        open(BLOCK, ">In/$name.pl");
+       print BLOCK <<EOH;
+# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+EOH
        print BLOCK <<"END2";
 return <<'END';
 $code  $last
@@ -229,12 +244,12 @@ sub proplist {
     my $split;
 
     if ($table =~ /^Arab/) {
-       open(UD, "arabshp.txt") or warn "Can't open $table: $!";
+       open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
 
        $split = '($code, $name, $link, $linkgroup) = split(/; */);';
     }
     elsif ($table =~ /^Jamo/) {
-       open(UD, "jamo2.txt") or warn "Can't open $table: $!";
+       open(UD, "Jamo-2.txt") or warn "Can't open $table: $!";
 
        $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
     }
@@ -327,46 +342,101 @@ END
 open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
 
 while (<UNICODEDATA>) {
-    ($code, $name) = split /;/;
+    ($code, $name, $category, $decomposition) = (split /;/)[0,1,2,5];
     
     $code{$name} = $code;
     $name{$code} = $name;
+    $category{$code} = $category;
+
+    next unless $category =~ /^L/;
+
+    # The definition of "equivalence" is twofold.
+    if ($decomposition ne '') {
+       # (1) If there's an official Unicode decomposition
+        #     and the base is a Unicode letter.
+       $decomposition =~ s/^<\w+> //;
+       @decomposition = split(' ', $decomposition);
+       # Some Arabic ligatures like
+       # FC5E;ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM;Lo;...
+       # are problematic because their decomposition begins with
+       # a space (0020) -- which could be just skipped -- but then
+       # their base glyph is not a letter, for example
+       # the above decomposes as <isolated> 0020 064C 0651,
+       # but 064C is 064C;ARABIC DAMMATAN;Mn;...
+       # (the 0651 being ARABIC SHADDA;Mn)
+       ($basecode) = shift @decomposition;
+       push @base, [ $code, $basecode ];
+    } elsif ($name =~ /^(.+?) WITH /) {
+       # (2) If there's a "FOO WITH ..." Unicode name and FOO
+       #     happens to be valid Unicode letter.  This is
+       #     a debatable definition and all fault is by me (jhi).
+        #     For example this definition adds
+        #     LATIN SMALL LETTER O WITH STROKE
+        #     as a derivative of 
+        #     LATIN SMALL LETTER O
+        #     which some might rightfully contest, especially
+        #     the speakers of languages who have the former
+        #     phonetically as very distinct from the latter.
+       push @with, [ $code, $1 ];
+    }
+}
 
-    next unless $name =~ /^(.+? LETTER .+?) WITH .+( \w+ FORM)?$/;
-
-    push @base, [ $code, $1 ];
-    push @base, [ $code, $1.$2 ] if $2 ne '';
+foreach my $w (@with) {
+    ($code, $basename) = @$w;
+    next if not exists $code{$basename} or
+            not $category{$code{$basename}} =~ /^L/;
+    push @base, [ $code, $code{$basename} ];
 }
 
+@base = sort { $a->[0] cmp $b->[0] } @base;
+
 foreach my $b (@base) {
-    ($code, $base) = @$b;
-    next unless exists $code{$base};
-    push @{$unicode{$code{$base}}}, $code;
-#   print "$code: $name{$code} -> $base\n",
+    ($code, $basecode) = @$b;
+    $basename = $name{$basecode};
+    next if not defined $basename or
+            not exists $code{$basename} or
+            not $category{$code{$basename}} =~ /^L/;
+    push @{$unicode{$code{$basename}}}, $code;
+#   print "$code: $name{$code} -> $basename\n",
 }
 
 @unicode = sort keys %unicode;
 
-print "Eq/Unicode\n";
-if (open(EQ_UNICODE, ">Eq/Unicode")) {
+print "EqUnicode\n";
+if (open(OUT, ">Eq/Unicode.pl")) {
+    print OUT <<EOH;
+# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+return <<'END';
+EOH
     foreach my $c (@unicode) {
-       print EQ_UNICODE "$c @{$unicode{$c}}\n";
+       print OUT "$c @{$unicode{$c}}\n";
     }
-    close EQ_UNICODE;
+    print OUT "END\n";
+    close OUT;
 } else {
     die "$0: failed to open Eq/Unicode for writing: $!\n";
 }
 
-print "Eq/Latin1\n";
-if (open(EQ_LATIN1, ">Eq/Latin1")) {
+print "EqLatin1\n";
+if (open(OUT, ">Eq/Latin1.pl")) {
+    print OUT <<EOH;
+# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+return <<'END';
+EOH
     foreach my $c (@unicode) {
        last if hex($c) > 255;
-       my @c = grep { hex($_) <= 255 } @{$unicode{$c}};
+       my @c = grep { hex($_) < 256 } @{$unicode{$c}};
        next unless @c;
-       print EQ_LATIN1 "$c @c\n";
+       print OUT "$c @c\n";
     }
-    close EQ_LATIN1;
+    print OUT "END\n";
+    close OUT;
 } else {
     die "$0: failed to open Eq/Latin1 for writing: $!\n";
 }
-  
+
+# eof