Unicode data updated to be the latest beta of the Unicode 3.0.

[p5sagit/p5-mst-13.2.git] / lib / unicode / mktables.PL
diff --git a/lib/unicode/mktables.PL b/lib/unicode/mktables.PL

index 306f2a4..f54ea69 100755 (executable)
--- a/lib/unicode/mktables.PL
+++ b/lib/unicode/mktables.PL
@@ -1,25 +1,34 @@
 #!../../miniperl
 
+$UnicodeData = "UnicodeData-Latest.txt";
+
 # Note: we try to keep filenames unique within first 8 chars.  Using
 # subdirectories for the following helps.
 mkdir "In", 0777;
 mkdir "Is", 0777;
 mkdir "To", 0777;
+mkdir "Eq", 0777;
 
 @todo = (
 # typical
 
-    ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"',    ''],
-    ['IsAlpha', '$cat =~ /^L[ulo]/',   ''],
-    ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
-    ['IsDigit', '$cat =~ /^Nd$/',      ''],
-    ['IsUpper', '$cat =~ /^Lu$/',      ''],
-    ['IsLower', '$cat =~ /^Ll$/',      ''],
-    ['IsPrint', '$cat =~ /^[^C]/',     ''],
-    ['ToUpper', '$up',                 '$up'],
-    ['ToLower', '$down',               '$down'],
-    ['ToTitle', '$title',              '$title'],
-    ['ToDigit', '$dec ne ""',          '$dec'],
+    ['IsWord',  '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"',    ''],
+    ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/',       ''],
+    ['IsAlpha',  '$cat =~ /^L[ulo]/',  ''],
+    ['IsSpace',  '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/',        ''],
+    ['IsDigit',  '$cat =~ /^Nd$/',     ''],
+    ['IsUpper',  '$cat =~ /^Lu$/',     ''],
+    ['IsLower',  '$cat =~ /^Ll$/',     ''],
+    ['IsASCII',  'hex $code <= 127',   ''],
+    ['IsCntrl',  '$cat =~ /^C/',       ''],
+    ['IsGraph',  '$cat =~ /^[^C]/ and $code ne "0020"',        ''],
+    ['IsPrint',  '$cat =~ /^[^C]/',    ''],
+    ['IsPunct',  '$cat =~ /^P/',       ''],
+    ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/',  ''],
+    ['ToUpper',  '$up',                        '$up'],
+    ['ToLower',  '$down',              '$down'],
+    ['ToTitle',  '$title',             '$title'],
+    ['ToDigit',  '$dec ne ""',         '$dec'],
 
 # Name
 
@@ -143,6 +152,21 @@ mkdir "To", 0777;
 # Jamo
 
     ['JamoShort',      '1',            '$short'],
+
+# Syllables
+
+    ['IsSylV', '$syl eq "V"',          ''],
+    ['IsSylU', '$syl eq "U"',          ''],
+    ['IsSylI', '$syl eq "I"',          ''],
+    ['IsSylA', '$syl eq "A"',          ''],
+    ['IsSylE', '$syl eq "E"',          ''],
+    ['IsSylC', '$syl eq "C"',          ''],
+    ['IsSylO', '$syl eq "O"',          ''],
+    ['IsSylWV',        '$syl eq "V"',          ''],
+    ['IsSylWI',        '$syl eq "I"',          ''],
+    ['IsSylWA',        '$syl eq "A"',          ''],
+    ['IsSylWE',        '$syl eq "E"',          ''],
+    ['IsSylWC',        '$syl eq "C"',          ''],
 );
 
 # This is not written for speed...
@@ -214,8 +238,13 @@ sub proplist {
 
        $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
     }
+    elsif ($table =~ /^IsSyl/) {
+       open(UD, "syllables.txt") or warn "Can't open $table: $!";
+
+       $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
+    }
     else {
-       open(UD, "UnicodeData-Latest.txt") or warn "Can't open $table: $!";
+       open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
 
        $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
                $comment, $up, $down, $title) = split(/;/);';
@@ -292,3 +321,74 @@ END
     }
     $out;
 }
+
+# Create the equivalence mappings.
+
+open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
+
+while (<UNICODEDATA>) {
+    ($code, $name) = split /;/;
+    
+    $code{$name} = $code;
+    $name{$code} = $name;
+
+    next unless $name =~ /^(.+? LETTER .+?) WITH .+( \w+ FORM)?$/;
+
+    push @base, [ $code, $1 ];
+    push @base, [ $code, $1.$2 ] if $2 ne '';
+
+    # Before this "diacritics stripping" phase (and for Arabic, also
+    # "form stripping" phase) all ligatures could be decomposed into
+    # their constituent letters.
+    #
+    # For example the ligature
+    # ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ALEF ISOLATED FORM
+    # would go first through ligature decomposition producing the two letters
+    # ARABIC LETTER YEH WITH HAMZA ABOVE ISOLATED FORM
+    # ARABIC LETTER ALEF WITH HAMZA ABOVE ISOLATED FORM
+    # and those with diacritics stripping
+    # ARABIC LETTER YEH ISOLATED FORM
+    # ARABIC LETTER ALEF ISOLATED FORM
+    # and those with the Arabic form stripping
+    # ARABIC LETTER YEH
+    # ARABIC LETTER ALEF ISOLATED FORM
+    # ARABIC LETTER YEH
+    # ARABIC LETTER ALEF ISOLATED FORM
+    #
+    # Similarly for ligatures from other scripts.
+    # Effectively this would mean that ligatures turn into categories
+    # (Unicodese for character classes).
+}
+
+foreach my $b (@base) {
+    ($code, $base) = @$b;
+    next unless exists $code{$base};
+    push @{$unicode{$code{$base}}}, $code;
+#   print "$code: $name{$code} -> $base\n",
+}
+
+@unicode = sort keys %unicode;
+
+print "Eq/Unicode\n";
+if (open(EQ_UNICODE, ">Eq/Unicode")) {
+    foreach my $c (@unicode) {
+       print EQ_UNICODE "$c @{$unicode{$c}}\n";
+    }
+    close EQ_UNICODE;
+} else {
+    die "$0: failed to open Eq/Unicode for writing: $!\n";
+}
+
+print "Eq/Latin1\n";
+if (open(EQ_LATIN1, ">Eq/Latin1")) {
+    foreach my $c (@unicode) {
+       last if hex($c) > 255;
+       my @c = grep { hex($_) <= 255 } @{$unicode{$c}};
+       next unless @c;
+       print EQ_LATIN1 "$c @c\n";
+    }
+    close EQ_LATIN1;
+} else {
+    die "$0: failed to open Eq/Latin1 for writing: $!\n";
+}
+