#!../../miniperl
-$UnicodeData = "Unicode.300";
+use bytes;
+
+$UnicodeData = "Unicode.301";
+$SyllableData = "syllables.txt";
+$PropData = "PropList.txt";
+
# Note: we try to keep filenames unique within first 8 chars. Using
# subdirectories for the following helps.
-mkdir "In", 0777;
-mkdir "Is", 0777;
-mkdir "To", 0777;
-mkdir "Eq", 0777;
+mkdir "In", 0755;
+mkdir "Is", 0755;
+mkdir "To", 0755;
@todo = (
# typical
- ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
- ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
- ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
- ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
+ ['IsWord', '$cat =~ /^L[ulot]|^Nd/ or $code eq "005F"', ''],
+ ['IsAlnum', '$cat =~ /^L[ulot]|^Nd/', ''],
+ ['IsAlpha', '$cat =~ /^L[ulot]/', ''],
+ ['IsSpace', 'White space', $PropData],
['IsDigit', '$cat =~ /^Nd$/', ''],
- ['IsUpper', '$cat =~ /^Lu$/', ''],
+ ['IsUpper', '$cat =~ /^L[ut]$/', ''],
['IsLower', '$cat =~ /^Ll$/', ''],
['IsASCII', 'hex $code <= 127', ''],
['IsCntrl', '$cat =~ /^C/', ''],
- ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
+ ['IsGraph', '$cat =~ /^[^C]/ and ($cat !~ /^Z/ and $code ne "0020" or chr(hex $code) !~ /^\s/)', ''],
['IsPrint', '$cat =~ /^[^C]/', ''],
- ['IsPunct', '$cat =~ /^P/', ''],
+ ['IsPunct', 'Punctuation', $PropData],
['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
['ToUpper', '$up', '$up'],
['ToLower', '$down', '$down'],
['IsM', '$cat =~ /^M/', ''], # Mark
['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
+ ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing
['IsN', '$cat =~ /^N/', ''], # Number
['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
['IsNo', '$cat eq "No"', ''], # Number, Other
+ ['IsNl', '$cat eq "Nl"', ''], # Number, Letter
- ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
+ ['IsZ', '$cat =~ /^Z/', ''], # Separator
['IsZs', '$cat eq "Zs"', ''], # Separator, Space
['IsZl', '$cat eq "Zl"', ''], # Separator, Line
['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
['IsCo', '$cat eq "Co"', ''], # Other, Private Use
['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
+ ['IsCf', '$cat eq "Cf"', ''], # Other, Format
+ ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate
+ ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned
# Informative
['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
+ ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector
+ ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote
+ ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote
['IsS', '$cat =~ /^S/', ''], # Symbol
['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
+ ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier
['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
['IsSo', '$cat eq "So"', ''], # Symbol, Other
# and punctuation specific to
# those scripts
+ ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding
+ ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override
+ ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic
+ ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding
+ ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override
+ ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format
+ ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark
+ ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral
+
# Weak types:
['IsBidiEN','$bid eq "EN"', ''], # European Number
['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
['IsDCsmall', '$decomp =~ /^<small>/', ''],
['IsDCsquare', '$decomp =~ /^<square>/', ''],
+ ['IsDCfraction', '$decomp =~ /^<fraction>/', ''],
['IsDCcompat', '$decomp =~ /^<compat>/', ''],
# Number
- ['Number', '$num', '$num'],
+ ['Number', '$num ne ""', '$num'],
# Mirrored
# Syllables
- ['IsSylV', '$syl eq "V"', ''],
- ['IsSylU', '$syl eq "U"', ''],
- ['IsSylI', '$syl eq "I"', ''],
- ['IsSylA', '$syl eq "A"', ''],
- ['IsSylE', '$syl eq "E"', ''],
- ['IsSylC', '$syl eq "C"', ''],
- ['IsSylO', '$syl eq "O"', ''],
- ['IsSylWV', '$syl eq "V"', ''],
- ['IsSylWI', '$syl eq "I"', ''],
- ['IsSylWA', '$syl eq "A"', ''],
- ['IsSylWE', '$syl eq "E"', ''],
- ['IsSylWC', '$syl eq "C"', ''],
+ syllable_defs(),
+
+# Line break properties - Normative
+
+ ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
+ ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
+ ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
+ ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
+ ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
+ ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
+ ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
+ ['IsLbrkSP','$brk eq "SP"', ''], # Space
+ ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
+
+# Line break properties - Informative
+ ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
+ ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
+ ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
+ ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
+ ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
+ ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
+ ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
+ ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
+ ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
+ ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
+ ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
+ ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
+ ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
+ ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
+ ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
+ ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
+ ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
+ ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
+ ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
+ ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
);
# This is not written for speed...
exit if @ARGV and not grep { $_ eq Block } @ARGV;
print "Block\n";
-open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n";
-open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
+open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
+open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n";
print OUT <<EOH;
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by $0 from e.g. $UnicodeData.
my $out;
my $split;
+ return listFromPropFile($wanted) if $val eq $PropData;
+
if ($table =~ /^Arab/) {
open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
$split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
}
elsif ($table =~ /^IsSyl/) {
- open(UD, "syllables.txt") or warn "Can't open $table: $!";
+ open(UD, $SyllableData) or warn "Can't open $table: $!";
$split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
}
+ elsif ($table =~ /^IsLbrk/) {
+ open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
+
+ $split = '($code, $brk, $name) = split(/;/);';
+ }
else {
open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
eval <<"END";
while (<UD>) {
next if /^#/;
- next if /^\s/;
- chop;
+ next if /^\\s/;
+ s/\\s+\$//;
$split
if ($wanted) {
push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
eval <<"END";
while (<UD>) {
next if /^#/;
- next if /^\s*\$/;
+ next if /^\\s*\$/;
chop;
$split
if ($wanted) {
$out;
}
-# Create the equivalence mappings.
-
-open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
-
-while (<UNICODEDATA>) {
- ($code, $name, $category, $decomposition) = (split /;/)[0,1,2,5];
-
- $code{$name} = $code;
- $name{$code} = $name;
- $category{$code} = $category;
-
- next unless $category =~ /^L/;
-
- # The definition of "equivalence" is twofold.
- if ($decomposition ne '') {
- # (1) If there's an official Unicode decomposition
- # and the base is a Unicode letter.
- $decomposition =~ s/^<\w+> //;
- @decomposition = split(' ', $decomposition);
- # Some Arabic ligatures like
- # FC5E;ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM;Lo;...
- # are problematic because their decomposition begins with
- # a space (0020) -- which could be just skipped -- but then
- # their base glyph is not a letter, for example
- # the above decomposes as <isolated> 0020 064C 0651,
- # but 064C is 064C;ARABIC DAMMATAN;Mn;...
- # (the 0651 being ARABIC SHADDA;Mn)
- ($basecode) = shift @decomposition;
- push @base, [ $code, $basecode ];
- } elsif ($name =~ /^(.+?) WITH /) {
- # (2) If there's a "FOO WITH ..." Unicode name and FOO
- # happens to be valid Unicode letter. This is
- # a debatable definition and all fault is by me (jhi).
- # For example this definition adds
- # LATIN SMALL LETTER O WITH STROKE
- # as a derivative of
- # LATIN SMALL LETTER O
- # which some might rightfully contest, especially
- # the speakers of languages who have the former
- # phonetically as very distinct from the latter.
- push @with, [ $code, $1 ];
- }
-}
-
-foreach my $w (@with) {
- ($code, $basename) = @$w;
- next if not exists $code{$basename} or
- not $category{$code{$basename}} =~ /^L/;
- push @base, [ $code, $code{$basename} ];
-}
-
-@base = sort { $a->[0] cmp $b->[0] } @base;
-
-foreach my $b (@base) {
- ($code, $basecode) = @$b;
- $basename = $name{$basecode};
- next if not defined $basename or
- not exists $code{$basename} or
- not $category{$code{$basename}} =~ /^L/;
- push @{$unicode{$code{$basename}}}, $code;
-# print "$code: $name{$code} -> $basename\n",
-}
-
-@unicode = sort keys %unicode;
+sub listFromPropFile {
+ my ($wanted) = @_;
+ my $out;
-print "EqUnicode\n";
-if (open(OUT, ">Eq/Unicode.pl")) {
- print OUT <<EOH;
-# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
-# This file is built by $0 from e.g. $UnicodeData.
-# Any changes made here will be lost!
-return <<'END';
-EOH
- foreach my $c (@unicode) {
- print OUT "$c @{$unicode{$c}}\n";
+ open (UD, $PropData) or die "Can't open $PropData: $!\n";
+ local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42?
+
+ <UD>;
+ while (<UD>) {
+ chomp;
+ if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) {
+ s/\(\d+ chars\)//g;
+ s/^\s+//mg;
+ s/\s+$//mg;
+ s/\.\./\t/g;
+ $out = lc $_;
+ last;
+ }
}
- print OUT "END\n";
- close OUT;
-} else {
- die "$0: failed to open Eq/Unicode.pl for writing: $!\n";
+ close (UD);
+ "$out\n";
}
-print "EqLatin1\n";
-if (open(OUT, ">Eq/Latin1.pl")) {
- print OUT <<EOH;
-# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
-# This file is built by $0 from e.g. $UnicodeData.
-# Any changes made here will be lost!
-return <<'END';
-EOH
- foreach my $c (@unicode) {
- last if hex($c) > 255;
- my @c = grep { hex($_) < 256 } @{$unicode{$c}};
- next unless @c;
- print OUT "$c @c\n";
+sub syllable_defs {
+ my @defs;
+ my %seen;
+
+ open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
+ while (<SD>) {
+ next if /^\s*(#|$)/;
+ s/\s+$//;
+ ($code, $name, $syl) = split /; */;
+ next unless $syl;
+ push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, ''])
+ unless $seen{$syl}++;
}
- print OUT "END\n";
- close OUT;
-} else {
- die "$0: failed to open Eq/Latin1.pl for writing: $!\n";
+ close (SD);
+ return (@defs);
}
# eof