#!../../miniperl
+$UnicodeData = "UnicodeData-Latest.txt";
+
# Note: we try to keep filenames unique within first 8 chars. Using
# subdirectories for the following helps.
mkdir "In", 0777;
mkdir "Is", 0777;
mkdir "To", 0777;
+mkdir "Eq", 0777;
@todo = (
# typical
- ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
- ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
- ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
- ['IsDigit', '$cat =~ /^Nd$/', ''],
- ['IsUpper', '$cat =~ /^Lu$/', ''],
- ['IsLower', '$cat =~ /^Ll$/', ''],
- ['IsPrint', '$cat =~ /^[^C]/', ''],
- ['ToUpper', '$up', '$up'],
- ['ToLower', '$down', '$down'],
- ['ToTitle', '$title', '$title'],
- ['ToDigit', '$dec ne ""', '$dec'],
+ ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
+ ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
+ ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
+ ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
+ ['IsDigit', '$cat =~ /^Nd$/', ''],
+ ['IsUpper', '$cat =~ /^Lu$/', ''],
+ ['IsLower', '$cat =~ /^Ll$/', ''],
+ ['IsASCII', 'hex $code <= 127', ''],
+ ['IsCntrl', '$cat =~ /^C/', ''],
+ ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
+ ['IsPrint', '$cat =~ /^[^C]/', ''],
+ ['IsPunct', '$cat =~ /^P/', ''],
+ ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
+ ['ToUpper', '$up', '$up'],
+ ['ToLower', '$down', '$down'],
+ ['ToTitle', '$title', '$title'],
+ ['ToDigit', '$dec ne ""', '$dec'],
# Name
# Jamo
['JamoShort', '1', '$short'],
+
+# Syllables
+
+ ['IsSylV', '$syl eq "V"', ''],
+ ['IsSylU', '$syl eq "U"', ''],
+ ['IsSylI', '$syl eq "I"', ''],
+ ['IsSylA', '$syl eq "A"', ''],
+ ['IsSylE', '$syl eq "E"', ''],
+ ['IsSylC', '$syl eq "C"', ''],
+ ['IsSylO', '$syl eq "O"', ''],
+ ['IsSylWV', '$syl eq "V"', ''],
+ ['IsSylWI', '$syl eq "I"', ''],
+ ['IsSylWA', '$syl eq "A"', ''],
+ ['IsSylWE', '$syl eq "E"', ''],
+ ['IsSylWC', '$syl eq "C"', ''],
);
# This is not written for speed...
$split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
}
+ elsif ($table =~ /^IsSyl/) {
+ open(UD, "syllables.txt") or warn "Can't open $table: $!";
+
+ $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
+ }
else {
- open(UD, "UnicodeData-Latest.txt") or warn "Can't open $table: $!";
+ open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
$split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
$comment, $up, $down, $title) = split(/;/);';
}
$out;
}
+
+# Create the equivalence mappings.
+
+open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
+
+while (<UNICODEDATA>) {
+ ($code, $name) = split /;/;
+
+ $code{$name} = $code;
+ $name{$code} = $name;
+
+ next unless $name =~ /^(.+? LETTER .+?) WITH .+( \w+ FORM)?$/;
+
+ push @base, [ $code, $1 ];
+ push @base, [ $code, $1.$2 ] if $2 ne '';
+
+ # Before this "diacritics stripping" phase (and for Arabic, also
+ # "form stripping" phase) all ligatures could be decomposed into
+ # their constituent letters.
+ #
+ # For example the ligature
+ # ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ALEF ISOLATED FORM
+ # would go first through ligature decomposition producing the two letters
+ # ARABIC LETTER YEH WITH HAMZA ABOVE ISOLATED FORM
+ # ARABIC LETTER ALEF WITH HAMZA ABOVE ISOLATED FORM
+ # and those with diacritics stripping
+ # ARABIC LETTER YEH ISOLATED FORM
+ # ARABIC LETTER ALEF ISOLATED FORM
+ # and those with the Arabic form stripping
+ # ARABIC LETTER YEH
+ # ARABIC LETTER ALEF ISOLATED FORM
+ # ARABIC LETTER YEH
+ # ARABIC LETTER ALEF ISOLATED FORM
+ #
+ # Similarly for ligatures from other scripts.
+ # Effectively this would mean that ligatures turn into categories
+ # (Unicodese for character classes).
+}
+
+foreach my $b (@base) {
+ ($code, $base) = @$b;
+ next unless exists $code{$base};
+ push @{$unicode{$code{$base}}}, $code;
+# print "$code: $name{$code} -> $base\n",
+}
+
+@unicode = sort keys %unicode;
+
+print "Eq/Unicode\n";
+if (open(EQ_UNICODE, ">Eq/Unicode")) {
+ foreach my $c (@unicode) {
+ print EQ_UNICODE "$c @{$unicode{$c}}\n";
+ }
+ close EQ_UNICODE;
+} else {
+ die "$0: failed to open Eq/Unicode for writing: $!\n";
+}
+
+print "Eq/Latin1\n";
+if (open(EQ_LATIN1, ">Eq/Latin1")) {
+ foreach my $c (@unicode) {
+ last if hex($c) > 255;
+ my @c = grep { hex($_) <= 255 } @{$unicode{$c}};
+ next unless @c;
+ print EQ_LATIN1 "$c @c\n";
+ }
+ close EQ_LATIN1;
+} else {
+ die "$0: failed to open Eq/Latin1 for writing: $!\n";
+}
+