lib/unicode/Is/ASCII.pl Unicode character database
lib/unicode/Is/Alnum.pl Unicode character database
lib/unicode/Is/Alpha.pl Unicode character database
+lib/unicode/Is/BidiAL.pl Unicode character database
lib/unicode/Is/BidiAN.pl Unicode character database
lib/unicode/Is/BidiB.pl Unicode character database
+lib/unicode/Is/BidiBN.pl Unicode character database
lib/unicode/Is/BidiCS.pl Unicode character database
lib/unicode/Is/BidiEN.pl Unicode character database
lib/unicode/Is/BidiES.pl Unicode character database
lib/unicode/Is/BidiET.pl Unicode character database
lib/unicode/Is/BidiL.pl Unicode character database
+lib/unicode/Is/BidiLRE.pl Unicode character database
+lib/unicode/Is/BidiLRO.pl Unicode character database
+lib/unicode/Is/BidiNSM.pl Unicode character database
lib/unicode/Is/BidiON.pl Unicode character database
+lib/unicode/Is/BidiPDF.pl Unicode character database
lib/unicode/Is/BidiR.pl Unicode character database
+lib/unicode/Is/BidiRLE.pl Unicode character database
+lib/unicode/Is/BidiRLO.pl Unicode character database
lib/unicode/Is/BidiS.pl Unicode character database
lib/unicode/Is/BidiWS.pl Unicode character database
lib/unicode/Is/C.pl Unicode character database
lib/unicode/Is/Cc.pl Unicode character database
+lib/unicode/Is/Cf.pl Unicode character database
lib/unicode/Is/Cn.pl Unicode character database
lib/unicode/Is/Cntrl.pl Unicode character database
lib/unicode/Is/Co.pl Unicode character database
+lib/unicode/Is/Cs.pl Unicode character database
lib/unicode/Is/DCcircle.pl Unicode character database
lib/unicode/Is/DCcompat.pl Unicode character database
lib/unicode/Is/DCfinal.pl Unicode character database
lib/unicode/Is/DCfont.pl Unicode character database
+lib/unicode/Is/DCfraction.pl Unicode character database
lib/unicode/Is/DCinital.pl Unicode character database
lib/unicode/Is/DCinitial.pl Unicode character database
lib/unicode/Is/DCisolated.pl Unicode character database
lib/unicode/Is/Lu.pl Unicode character database
lib/unicode/Is/M.pl Unicode character database
lib/unicode/Is/Mc.pl Unicode character database
+lib/unicode/Is/Me.pl Unicode character database
lib/unicode/Is/Mirrored.pl Unicode character database
lib/unicode/Is/Mn.pl Unicode character database
lib/unicode/Is/N.pl Unicode character database
lib/unicode/Is/Nd.pl Unicode character database
+lib/unicode/Is/Nl.pl Unicode character database
lib/unicode/Is/No.pl Unicode character database
lib/unicode/Is/P.pl Unicode character database
+lib/unicode/Is/Pc.pl Unicode character database
lib/unicode/Is/Pd.pl Unicode character database
lib/unicode/Is/Pe.pl Unicode character database
+lib/unicode/Is/Pf.pl Unicode character database
+lib/unicode/Is/Pi.pl Unicode character database
lib/unicode/Is/Po.pl Unicode character database
lib/unicode/Is/Print.pl Unicode character database
lib/unicode/Is/Ps.pl Unicode character database
lib/unicode/Is/Punct.pl Unicode character database
lib/unicode/Is/S.pl Unicode character database
lib/unicode/Is/Sc.pl Unicode character database
+lib/unicode/Is/Sk.pl Unicode character database
lib/unicode/Is/Sm.pl Unicode character database
lib/unicode/Is/So.pl Unicode character database
lib/unicode/Is/Space.pl Unicode character database
lib/unicode/Is/SylA.pl Unicode character database
+lib/unicode/Is/SylAA.pl Unicode character database
+lib/unicode/Is/SylAAI.pl Unicode character database
+lib/unicode/Is/SylAI.pl Unicode character database
lib/unicode/Is/SylC.pl Unicode character database
lib/unicode/Is/SylE.pl Unicode character database
+lib/unicode/Is/SylEE.pl Unicode character database
lib/unicode/Is/SylI.pl Unicode character database
+lib/unicode/Is/SylII.pl Unicode character database
+lib/unicode/Is/SylN.pl Unicode character database
lib/unicode/Is/SylO.pl Unicode character database
+lib/unicode/Is/SylOO.pl Unicode character database
lib/unicode/Is/SylU.pl Unicode character database
lib/unicode/Is/SylV.pl Unicode character database
lib/unicode/Is/SylWA.pl Unicode character database
+lib/unicode/Is/SylWAA.pl Unicode character database
lib/unicode/Is/SylWC.pl Unicode character database
lib/unicode/Is/SylWE.pl Unicode character database
+lib/unicode/Is/SylWEE.pl Unicode character database
lib/unicode/Is/SylWI.pl Unicode character database
+lib/unicode/Is/SylWII.pl Unicode character database
+lib/unicode/Is/SylWO.pl Unicode character database
+lib/unicode/Is/SylWOO.pl Unicode character database
+lib/unicode/Is/SylWU.pl Unicode character database
lib/unicode/Is/SylWV.pl Unicode character database
lib/unicode/Is/Syllable.pl Unicode character database
lib/unicode/Is/Upper.pl Unicode character database
#!../../miniperl
+use bytes;
+
$UnicodeData = "Unicode.300";
+$SyllableData = "syllables.txt";
+$PropData = "Props.txt";
+
# Note: we try to keep filenames unique within first 8 chars. Using
# subdirectories for the following helps.
['IsWord', '$cat =~ /^L[ulot]|^Nd/ or $code eq "005F"', ''],
['IsAlnum', '$cat =~ /^L[ulot]|^Nd/', ''],
['IsAlpha', '$cat =~ /^L[ulot]/', ''],
- # XXX broken: recursive definition (/\s/ will look up IsSpace in future)
- ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
+ ['IsSpace', 'White space', $PropData],
['IsDigit', '$cat =~ /^Nd$/', ''],
- ['IsUpper', '$cat =~ /^Lu$/', ''],
+ ['IsUpper', '$cat =~ /^L[ut]$/', ''],
['IsLower', '$cat =~ /^Ll$/', ''],
['IsASCII', 'hex $code <= 127', ''],
['IsCntrl', '$cat =~ /^C/', ''],
- ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
+ ['IsGraph', '$cat =~ /^[^C]/ and ($cat !~ /^Z/ and $code ne "0020" or chr(hex $code) !~ /^\s/)', ''],
['IsPrint', '$cat =~ /^[^C]/', ''],
- ['IsPunct', '$cat =~ /^P/', ''],
+ ['IsPunct', 'Punctuation', $PropData],
['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
['ToUpper', '$up', '$up'],
['ToLower', '$down', '$down'],
['IsM', '$cat =~ /^M/', ''], # Mark
['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
+ ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing
['IsN', '$cat =~ /^N/', ''], # Number
['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
['IsNo', '$cat eq "No"', ''], # Number, Other
+ ['IsNl', '$cat eq "Nl"', ''], # Number, Letter
- ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
+ ['IsZ', '$cat =~ /^Z/', ''], # Separator
['IsZs', '$cat eq "Zs"', ''], # Separator, Space
['IsZl', '$cat eq "Zl"', ''], # Separator, Line
['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
['IsCo', '$cat eq "Co"', ''], # Other, Private Use
['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
+ ['IsCf', '$cat eq "Cf"', ''], # Other, Format
+ ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate
+ ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned
# Informative
['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
+ ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector
+ ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote
+ ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote
['IsS', '$cat =~ /^S/', ''], # Symbol
['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
+ ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier
['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
['IsSo', '$cat eq "So"', ''], # Symbol, Other
# and punctuation specific to
# those scripts
+ ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding
+ ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override
+ ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic
+ ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding
+ ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override
+ ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format
+ ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark
+ ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral
+
# Weak types:
['IsBidiEN','$bid eq "EN"', ''], # European Number
['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
['IsDCsmall', '$decomp =~ /^<small>/', ''],
['IsDCsquare', '$decomp =~ /^<square>/', ''],
+ ['IsDCfraction', '$decomp =~ /^<fraction>/', ''],
['IsDCcompat', '$decomp =~ /^<compat>/', ''],
# Number
# Syllables
- ['IsSylV', '$syl eq "V"', ''],
- ['IsSylU', '$syl eq "U"', ''],
- ['IsSylI', '$syl eq "I"', ''],
- ['IsSylA', '$syl eq "A"', ''],
- ['IsSylE', '$syl eq "E"', ''],
- ['IsSylC', '$syl eq "C"', ''],
- ['IsSylO', '$syl eq "O"', ''],
- ['IsSylWV', '$syl eq "V"', ''],
- ['IsSylWI', '$syl eq "I"', ''],
- ['IsSylWA', '$syl eq "A"', ''],
- ['IsSylWE', '$syl eq "E"', ''],
- ['IsSylWC', '$syl eq "C"', ''],
-
+ syllable_defs(),
+
# Line break properties - Normative
['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
exit if @ARGV and not grep { $_ eq Block } @ARGV;
print "Block\n";
-open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n";
-open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
+open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
+open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n";
print OUT <<EOH;
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by $0 from e.g. $UnicodeData.
my $out;
my $split;
+ return listFromPropFile($wanted) if $val eq $PropData;
+
if ($table =~ /^Arab/) {
open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
$split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
}
elsif ($table =~ /^IsSyl/) {
- open(UD, "syllables.txt") or warn "Can't open $table: $!";
+ open(UD, $SyllableData) or warn "Can't open $table: $!";
$split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
}
eval <<"END";
while (<UD>) {
next if /^#/;
- next if /^\s/;
- chop;
+ next if /^\\s/;
+ s/\\s+\$//;
$split
if ($wanted) {
push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
eval <<"END";
while (<UD>) {
next if /^#/;
- next if /^\s*\$/;
+ next if /^\\s*\$/;
chop;
$split
if ($wanted) {
$out;
}
+sub listFromPropFile { # Return the text of the $PropData record whose parenthesised name equals $wanted.
+ my ($wanted) = @_; # name compared (with eq) against the text captured from "(...)" in each record
+ my $out; # stays undef when no record matches
+
+ open (UD, $PropData) or die "Can't open $PropData: $!\n";
+ local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42? -- record separator: newline, 43 asterisks, blank line, then this label
+
+ <UD>; # read and discard everything up to the first separator
+ while (<UD>) {
+ chomp; # removes the trailing separator string, since $/ is set to it
+ if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) { # delete the first "0x... (name)" and compare the captured name
+ s/\(\d+ chars\)//g; # drop every "(N chars)" annotation
+ s/^\s+//mg; # trim leading whitespace on each line of the record
+ s/\s+$//mg; # trim trailing whitespace on each line of the record
+ s/\.\./\t/g; # every ".." becomes a tab (presumably the range separator -- confirm against $PropData)
+ $out = lc $_; # the result is lower-cased
+ last; # only the first matching record is used
+ }
+ }
+ close (UD);
+ "$out\n"; # implicit return value; just "\n" when nothing matched
+}
+
+sub syllable_defs { # Return one [name, test, ''] entry per distinct third field found in $SyllableData.
+ my @defs; # entries in order of first appearance
+ my %seen; # third-field values already turned into an entry
+
+ open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
+ while (<SD>) {
+ next if /^\s*(#|$)/; # skip blank lines and lines whose first non-space character starts a comment
+ s/\s+$//; # strip trailing whitespace, including the newline
+ ($code, $name, $syl) = split /; */; # NOTE(review): not declared with my in this sub, so these are not local to it
+ next unless $syl; # skip lines whose third field is missing or false
+ push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, '']) # test is the source text: $syl eq "<value>"
+ unless $seen{$syl}++;
+ }
+ close (SD);
+ return (@defs); # a list of array references
+}
+
# eof