else {
open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
}
+ print OUT <<EOH;
+# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+EOH
print OUT <<"END";
return <<'END';
END
exit if @ARGV and not grep { $_ eq Block } @ARGV;
print "Block\n";
-open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n";
+open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n";
open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
+print OUT <<EOH;
+# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+EOH
print OUT <<"END";
return <<'END';
END
print OUT "$code $last $name\n";
$name =~ s/\s+//g;
open(BLOCK, ">In/$name.pl");
+ print BLOCK <<EOH;
+# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+EOH
print BLOCK <<"END2";
return <<'END';
$code $last
my $split;
if ($table =~ /^Arab/) {
- open(UD, "arabshp.txt") or warn "Can't open $table: $!";
+ open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
$split = '($code, $name, $link, $linkgroup) = split(/; */);';
}
elsif ($table =~ /^Jamo/) {
- open(UD, "jamo2.txt") or warn "Can't open $table: $!";
+ open(UD, "Jamo-2.txt") or warn "Can't open $table: $!";
$split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
}
open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
while (<UNICODEDATA>) {
- ($code, $name) = split /;/;
+ ($code, $name, $category, $decomposition) = (split /;/)[0,1,2,5];
$code{$name} = $code;
$name{$code} = $name;
+ $category{$code} = $category;
+
+ next unless $category =~ /^L/;
+
+ # The definition of "equivalence" is twofold.
+ if ($decomposition ne '') {
+ # (1) If there's an official Unicode decomposition
+ # and the base is a Unicode letter.
+ $decomposition =~ s/^<\w+> //;
+ @decomposition = split(' ', $decomposition);
+ # Some Arabic ligatures like
+ # FC5E;ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM;Lo;...
+ # are problematic because their decomposition begins with
+ # a space (0020) -- which could be just skipped -- but then
+ # their base glyph is not a letter, for example
+ # the above decomposes as <isolated> 0020 064C 0651,
+ # but 064C is 064C;ARABIC DAMMATAN;Mn;...
+ # (the 0651 being ARABIC SHADDA;Mn)
+ ($basecode) = shift @decomposition;
+ push @base, [ $code, $basecode ];
+ } elsif ($name =~ /^(.+?) WITH /) {
+ # (2) If there's a "FOO WITH ..." Unicode name and FOO
+ # happens to be valid Unicode letter. This is
+ # a debatable definition and all fault is by me (jhi).
+ # For example this definition adds
+ # LATIN SMALL LETTER O WITH STROKE
+ # as a derivative of
+ # LATIN SMALL LETTER O
+ # which some might rightfully contest, especially
+ # the speakers of languages who have the former
+ # phonetically as very distinct from the latter.
+ push @with, [ $code, $1 ];
+ }
+}
- next unless $name =~ /^(.+? LETTER .+?) WITH .+( \w+ FORM)?$/;
-
- push @base, [ $code, $1 ];
- push @base, [ $code, $1.$2 ] if $2 ne '';
+foreach my $w (@with) {
+ ($code, $basename) = @$w;
+ next if not exists $code{$basename} or
+ not $category{$code{$basename}} =~ /^L/;
+ push @base, [ $code, $code{$basename} ];
}
+@base = sort { $a->[0] cmp $b->[0] } @base;
+
foreach my $b (@base) {
- ($code, $base) = @$b;
- next unless exists $code{$base};
- push @{$unicode{$code{$base}}}, $code;
-# print "$code: $name{$code} -> $base\n",
+ ($code, $basecode) = @$b;
+ $basename = $name{$basecode};
+ next if not defined $basename or
+ not exists $code{$basename} or
+ not $category{$code{$basename}} =~ /^L/;
+ push @{$unicode{$code{$basename}}}, $code;
+# print "$code: $name{$code} -> $basename\n",
}
@unicode = sort keys %unicode;
-print "Eq/Unicode\n";
-if (open(EQ_UNICODE, ">Eq/Unicode")) {
+print "EqUnicode\n";
+if (open(OUT, ">Eq/Unicode.pl")) {
+ print OUT <<EOH;
+# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+return <<'END';
+EOH
foreach my $c (@unicode) {
- print EQ_UNICODE "$c @{$unicode{$c}}\n";
+ print OUT "$c @{$unicode{$c}}\n";
}
- close EQ_UNICODE;
+ print OUT "END\n";
+ close OUT;
} else {
die "$0: failed to open Eq/Unicode for writing: $!\n";
}
-print "Eq/Latin1\n";
-if (open(EQ_LATIN1, ">Eq/Latin1")) {
+print "EqLatin1\n";
+if (open(OUT, ">Eq/Latin1.pl")) {
+ print OUT <<EOH;
+# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+return <<'END';
+EOH
foreach my $c (@unicode) {
last if hex($c) > 255;
- my @c = grep { hex($_) <= 255 } @{$unicode{$c}};
+ my @c = grep { hex($_) < 256 } @{$unicode{$c}};
next unless @c;
- print EQ_LATIN1 "$c @c\n";
+ print OUT "$c @c\n";
}
- close EQ_LATIN1;
+ print OUT "END\n";
+ close OUT;
} else {
die "$0: failed to open Eq/Latin1 for writing: $!\n";
}
-
+
+# eof