#!../../miniperl
-# Create the equivalence mappings.
+use bytes;
+
+$UnicodeData = "Unicode.301";
+$SyllableData = "syllables.txt";
+$PropData = "PropList.txt";
-$UnicodeData = "UnicodeData-Latest.txt";
# Note: we try to keep filenames unique within first 8 chars. Using
# subdirectories for the following helps.
-mkdir "In", 0777;
-mkdir "Is", 0777;
-mkdir "To", 0777;
-mkdir "Eq", 0777;
+mkdir "In", 0755;
+mkdir "Is", 0755;
+mkdir "To", 0755;
@todo = (
# typical
- ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
- ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
- ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
- ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
+ # 005F: SPACING UNDERSCROE
+ ['IsWord', '$cat =~ /^[LMN]/ or $code eq "005F"', ''],
+ ['IsAlnum', '$cat =~ /^[LMN]/', ''],
+ ['IsAlpha', '$cat =~ /^[LM]/', ''],
+ # 0009: HORIZONTAL TABULATION
+ # 000A: LINE FEED
+ # 000B: VERTICAL TABULATION
+ # 000C: FORM FEED
+ # 000D: CARRIAGE RETURN
+ # 0020: SPACE
+ ['IsSpace', '$cat =~ /^Z/ ||
+ $code =~ /^(0009|000A|000B|000C|000D)$/', ''],
+ ['IsSpacePerl',
+ '$cat =~ /^Z/ ||
+ $code =~ /^(0009|000A|000C|000D)$/', ''],
+ ['IsBlank', '$cat =~ /^Z[^lp]$/ || $code eq "0009"', ''],
['IsDigit', '$cat =~ /^Nd$/', ''],
- ['IsUpper', '$cat =~ /^Lu$/', ''],
+ ['IsUpper', '$cat =~ /^L[ut]$/', ''],
['IsLower', '$cat =~ /^Ll$/', ''],
- ['IsASCII', 'hex $code <= 127', ''],
+ ['IsASCII', '$code le "007f"', ''],
['IsCntrl', '$cat =~ /^C/', ''],
- ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
- ['IsPrint', '$cat =~ /^[^C]/', ''],
+ ['IsGraph', '$cat =~ /^([LMNPS]|Co)/', ''],
+ ['IsPrint', '$cat =~ /^([LMNPS]|Co|Zs)/', ''],
['IsPunct', '$cat =~ /^P/', ''],
+ # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
['ToUpper', '$up', '$up'],
['ToLower', '$down', '$down'],
['IsM', '$cat =~ /^M/', ''], # Mark
['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
+ ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing
['IsN', '$cat =~ /^N/', ''], # Number
['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
['IsNo', '$cat eq "No"', ''], # Number, Other
+ ['IsNl', '$cat eq "Nl"', ''], # Number, Letter
- ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
+ ['IsZ', '$cat =~ /^Z/', ''], # Separator
['IsZs', '$cat eq "Zs"', ''], # Separator, Space
['IsZl', '$cat eq "Zl"', ''], # Separator, Line
['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
['IsCo', '$cat eq "Co"', ''], # Other, Private Use
['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
+ ['IsCf', '$cat eq "Cf"', ''], # Other, Format
+ ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate
+ ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned
# Informative
['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
+ ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector
+ ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote
+ ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote
['IsS', '$cat =~ /^S/', ''], # Symbol
['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
+ ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier
['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
['IsSo', '$cat eq "So"', ''], # Symbol, Other
# and punctuation specific to
# those scripts
+ ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding
+ ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override
+ ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic
+ ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding
+ ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override
+ ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format
+ ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark
+ ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral
+
# Weak types:
['IsBidiEN','$bid eq "EN"', ''], # European Number
['IsDCfont', '$decomp =~ /^<font>/', ''],
['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
['IsDCinitial', '$decomp =~ /^<initial>/', ''],
- ['IsDCinital', '$decomp =~ /^<medial>/', ''],
+ ['IsDCmedial', '$decomp =~ /^<medial>/', ''],
['IsDCfinal', '$decomp =~ /^<final>/', ''],
['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
['IsDCcircle', '$decomp =~ /^<circle>/', ''],
['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
['IsDCsmall', '$decomp =~ /^<small>/', ''],
['IsDCsquare', '$decomp =~ /^<square>/', ''],
+ ['IsDCfraction', '$decomp =~ /^<fraction>/', ''],
['IsDCcompat', '$decomp =~ /^<compat>/', ''],
# Number
- ['Number', '$num', '$num'],
+ ['Number', '$num ne ""', '$num'],
# Mirrored
# Jamo
['JamoShort', '1', '$short'],
+
+# Syllables
+
+ syllable_defs(),
+
+# Line break properties - Normative
+
+ ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
+ ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
+ ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
+ ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
+ ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
+ ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
+ ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
+ ['IsLbrkSP','$brk eq "SP"', ''], # Space
+ ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
+
+# Line break properties - Informative
+ ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
+ ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
+ ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
+ ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
+ ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
+ ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
+ ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
+ ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
+ ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
+ ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
+ ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
+ ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
+ ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
+ ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
+ ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
+ ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
+ ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
+ ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
+ ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
+ ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
);
# This is not written for speed...
else {
open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
}
+ print OUT <<EOH;
+# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+EOH
print OUT <<"END";
return <<'END';
END
exit if @ARGV and not grep { $_ eq Block } @ARGV;
print "Block\n";
-open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n";
-open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
+open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
+open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n";
+print OUT <<EOH;
+# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+EOH
print OUT <<"END";
return <<'END';
END
print OUT "$code $last $name\n";
$name =~ s/\s+//g;
open(BLOCK, ">In/$name.pl");
+ print BLOCK <<EOH;
+# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
+# This file is built by $0 from e.g. $UnicodeData.
+# Any changes made here will be lost!
+EOH
print BLOCK <<"END2";
return <<'END';
$code $last
my $out;
my $split;
+ return listFromPropFile($wanted) if $val eq $PropData;
+
if ($table =~ /^Arab/) {
- open(UD, "arabshp.txt") or warn "Can't open $table: $!";
+ open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
$split = '($code, $name, $link, $linkgroup) = split(/; */);';
}
elsif ($table =~ /^Jamo/) {
- open(UD, "jamo2.txt") or warn "Can't open $table: $!";
+ open(UD, "Jamo.txt") or warn "Can't open $table: $!";
$split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
}
+ elsif ($table =~ /^IsSyl/) {
+ open(UD, $SyllableData) or warn "Can't open $table: $!";
+
+ $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
+ }
+ elsif ($table =~ /^IsLbrk/) {
+ open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
+
+ $split = '($code, $brk, $name) = split(/;/);';
+ }
else {
open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
eval <<"END";
while (<UD>) {
next if /^#/;
- next if /^\s/;
- chop;
+ next if /^\\s/;
+ s/\\s+\$//;
$split
if ($wanted) {
push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
eval <<"END";
while (<UD>) {
next if /^#/;
- next if /^\s*\$/;
+ next if /^\\s*\$/;
chop;
$split
if ($wanted) {
$out;
}
-open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
-
-while (<UNICODEDATA>) {
- ($code, $name) = split /;/;
-
- $code{$name} = $code;
- $name{$code} = $name;
-
- if ($name =~ /^((?:LATIN|GREEK|CYRILLIC|HEBREW|BENGALI) .+? LETTER .+?) WITH /) {
- push @base, [ $code, $1 ];
- } elsif ($name =~ /^(ARABIC LETTER \w+?) WITH .+ (\w+ FORM)$/) {
- push @base, [ $code, "$1 $2" ];
- } elsif ($name =~ /^(ARABIC LETTER \w+?) WITH /) {
- push @base, [ $code, $1 ];
-# Is the concept of turning ligatures into character classes sound?
- } elsif ($name =~ /^(ARABIC) LIGATURE (.+?) (WITH .+ )+(\w+ FORM)$/) {
- my $script = $1;
- my $base = $2;
- my $with = $3;
- my $form = $4;
- push @base, [ $code, "$script LETTER $base" ];
- push @base, [ $code, "$script LETTER $base $form" ];
- my @with = split(/\bWITH\s+/, $with);
- shift @with;
- @with = grep { ! /^ (?:ABOVE|BELOW)/ } @with;
- foreach my $base (@with) {
- push @base, [ $code, "$script LETTER $base" ];
- push @base, [ $code, "$script LETTER $base $form" ];
- }
- } elsif ($name =~ /^((?:ARMENIAN|CYRILLIC) .+) LIGATURE (\w+) (\w+)$/) {
- push @base, [ $code, "$1 LETTER $2" ];
- push @base, [ $code, "$1 LETTER $3" ];
-# Latin ligatures (ae, oe, ij, ff, fi, fl, ffi, ffl, long st, st) ignored.
-# Hebrew Yiddish ligatures (double vav, vav yod, double yod, yod yod patah,
-# alef lamed) ignored.
- } else {
- next;
- }
-
-}
-
-foreach my $b (@base) {
- ($code, $base) = @$b;
- next unless exists $code{$base};
- push @{$unicode{$code{$base}}}, $code;
- print "$code: $name{$code} -> $base\n",
-}
-
-@unicode = sort keys %unicode;
+sub listFromPropFile {
+ my ($wanted) = @_;
+ my $out;
-if (open(EQ_UNICODE, ">Eq/Unicode")) {
- foreach my $c (@unicode) {
- print EQ_UNICODE "$c @{$unicode{$c}}\n";
+ open (UD, $PropData) or die "Can't open $PropData: $!\n";
+ local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42?
+
+ <UD>;
+ while (<UD>) {
+ chomp;
+ if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) {
+ s/\(\d+ chars\)//g;
+ s/^\s+//mg;
+ s/\s+$//mg;
+ s/\.\./\t/g;
+ $out = lc $_;
+ last;
+ }
}
- close EQ_UNICODE;
-} else {
- die "$0: failed to open Eq/Unicode for writing: $!\n";
+ close (UD);
+ "$out\n";
}
-if (open(EQ_LATIN1, ">Eq/Latin1")) {
- foreach my $c (@unicode) {
- last if hex($c) > 255;
- my @c = grep { hex($_) <= 255 } @{$unicode{$c}};
- next unless @c;
- print EQ_LATIN1 "$c @c\n";
+sub syllable_defs {
+ my @defs;
+ my %seen;
+
+ open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
+ while (<SD>) {
+ next if /^\s*(#|$)/;
+ s/\s+$//;
+ ($code, $name, $syl) = split /; */;
+ next unless $syl;
+ push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, ''])
+ unless $seen{$syl}++;
}
- close EQ_LATIN1;
-} else {
- die "$0: failed to open Eq/Latin1 for writing: $!\n";
+ close (SD);
+ return (@defs);
}
-
+
+# eof