X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2Funicore%2Fmktables;h=e4020ecf3d24d2623d3056e86355588a663d8ea1;hb=12ac2576dfc10fd43d91903e7602870c10b4f00f;hp=654301eeb429debe647c0622342d6756d85f40cf;hpb=5cb851a619abc5c8836307a3be2292a0ed632588;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 654301e..e4020ec 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -8,6 +8,7 @@ use Carp; ## mkdir("lib", 0755); +mkdir("lib/gc_sc", 0755); mkdir("To", 0755); ## @@ -106,6 +107,62 @@ sub CanonicalName($) return $name; } + +## +## Store the alias definitions for later use. +## +my %PropertyAlias; +my %PropValueAlias; + +my %PA_reverse; +my %PVA_reverse; + +sub Build_Aliases() +{ + ## + ## Most of the work with aliases doesn't occur here, + ## but rather in utf8_heavy.pl, which uses utf8_pva.pl, + ## which contains just this function. However, this one + ## + ## -- japhy (2004/04/13) + + open PA, "< PropertyAliases.txt" + or confess "Can't open PropertyAliases.txt: $!"; + while () { + s/#.*//; + s/\s+$//; + next if /^$/; + + my ($abbrev, $name) = split /\s*;\s*/; + next if $abbrev eq "n/a"; + $PropertyAlias{$abbrev} = $name; + $PA_reverse{$name} = $abbrev; + } + close PA; + + open PVA, "< PropValueAliases.txt" + or confess "Can't open PropValueAliases.txt: $!"; + while () { + s/#.*//; + s/\s+$//; + next if /^$/; + + my ($prop, @data) = split /\s*;\s*/; + + if ($prop eq 'ccc') { + $PropValueAlias{$prop}{$data[1]} = [ @data[0,2] ]; + $PVA_reverse{$prop}{$data[2]} = [ @data[0,1] ]; + } + else { + next if $data[0] eq "n/a"; + $PropValueAlias{$prop}{$data[0]} = $data[1]; + $PVA_reverse{$prop}{$data[1]} = $data[0]; + } + } + close PVA; +} + + ## ## Associates a property ("Greek", "Lu", "Assigned",...) with a Table. ## @@ -508,7 +565,7 @@ sub New_Alias($$$@) if ($TableInfo{$Type}->{$CName}) { confess "$0: Use canonical form '$CName' instead of '$Name' for alias."; } else { - confess "$0: don't have orignial $Type => $Name to make alias"; + confess "$0: don't have original $Type => $Name to make alias\n"; } } if ($TableInfo{$Alias}) { @@ -541,19 +598,15 @@ sub UnicodeData_Txt() my $Deco = Table->New(); my $Comb = Table->New(); my $Number = Table->New(); - my $Mirrored = Table->New(Is => 'Mirrored', - Desc => "Mirrored in bidirectional text", - Fuzzy => 0); + my $Mirrored = Table->New();#Is => 'Mirrored', + #Desc => "Mirrored in bidirectional text", + #Fuzzy => 0); my %DC; my %Bidi; - my %Deco; - $Deco{Canon} = Table->New(Is => 'Canon', - Desc => 'Decomposes to multiple characters', - Fuzzy => 0); - $Deco{Compat} = Table->New(Is => 'Compat', - Desc => 'Compatible with a more-basic character', - Fuzzy => 0); + my %Number; + $DC{can} = Table->New(); + $DC{com} = Table->New(); ## Initialize Perl-generated categories ## (Categories from UnicodeData.txt are auto-initialized in gencat) @@ -624,32 +677,35 @@ sub UnicodeData_Txt() ($General{$name} ||= Table->New)->$op($code, $name); # 005F: SPACING UNDERSCORE - $Cat{Word}->$op($code) if $cat =~ /^[LMN]/ || $code == 0x005F; - $Cat{Alnum}->$op($code) if $cat =~ /^[LMN]/; + $Cat{Word}->$op($code) if $cat =~ /^[LMN]|Pc/; + $Cat{Alnum}->$op($code) if $cat =~ /^[LM]|Nd/; $Cat{Alpha}->$op($code) if $cat =~ /^[LM]/; - - - $Cat{Space}->$op($code) if $cat =~ /^Z/ + my $isspace = + ($cat =~ /Zs|Zl|Zp/ && + $code != 0x200B) # 200B is ZWSP which is for line break control + # and therefore it is not part of "space" even while it is "Zs". || $code == 0x0009 # 0009: HORIZONTAL TAB || $code == 0x000A # 000A: LINE FEED || $code == 0x000B # 000B: VERTICAL TAB || $code == 0x000C # 000C: FORM FEED - || $code == 0x000D; # 000D: CARRIAGE RETURN + || $code == 0x000D # 000D: CARRIAGE RETURN + || $code == 0x0085 # 0085: NEL + ; - $Cat{SpacePerl}->$op($code) if $cat =~ /^Z/ - || $code == 0x0009 # 0009: HORIZONTAL TAB - || $code == 0x000A # 000A: LINE FEED - || $code == 0x000C # 000C: FORM FEED - || $code == 0x000D # 000D: CARRIAGE RETURN - || $code == 0x0085 # 0085: - || $code == 0x2028 # 2028: LINE SEPARATOR - || $code == 0x2029;# 2029: PARAGRAPH SEP. + $Cat{Space}->$op($code) if $isspace; - $Cat{Blank}->$op($code) if $cat =~ /^Z[^lp]$/ - || $code == 0x0009 # 0009: HORIZONTAL TAB - || $code == 0x0020; # 0020: SPACE + $Cat{SpacePerl}->$op($code) if $isspace + && $code != 0x000B; # Backward compat. + + $Cat{Blank}->$op($code) if $isspace + && !($code == 0x000A || + $code == 0x000B || + $code == 0x000C || + $code == 0x000D || + $code == 0x0085 || + $cat =~ /^Z[lp]/); $Cat{Digit}->$op($code) if $cat eq "Nd"; $Cat{Upper}->$op($code) if $cat eq "Lu"; @@ -657,8 +713,9 @@ sub UnicodeData_Txt() $Cat{Title}->$op($code) if $cat eq "Lt"; $Cat{ASCII}->$op($code) if $code <= 0x007F; $Cat{Cntrl}->$op($code) if $cat =~ /^C/; - $Cat{Graph}->$op($code) if $cat =~ /^([LMNPS]|Co)/; - $Cat{Print}->$op($code) if $cat =~ /^([LMNPS]|Co|Zs)/; + my $isgraph = !$isspace && $cat !~ /Cc|Cs|Cn/; + $Cat{Graph}->$op($code) if $isgraph; + $Cat{Print}->$op($code) if $isgraph || $isspace; $Cat{Punct}->$op($code) if $cat =~ /^P/; $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9 @@ -750,11 +807,17 @@ sub UnicodeData_Txt() $Comb->Append($code, $comb) if $comb; $Number->Append($code, $number) if length $number; + length($decimal) and ($Number{De} ||= Table->New())->Append($code) + or + length($digit) and ($Number{Di} ||= Table->New())->Append($code) + or + length($number) and ($Number{Nu} ||= Table->New())->Append($code); + $Mirrored->Append($code) if $mirrored eq "Y"; - $Bidi{$bidi} ||= Table->New(Is => "Bidi$bidi", - Desc => "Bi-directional category '$bidi'", - Fuzzy => 0); + $Bidi{$bidi} ||= Table->New();#Is => "bt/$bidi", + #Desc => "Bi-directional category '$bidi'", + #Fuzzy => 0); $Bidi{$bidi}->Append($code); if ($deco) @@ -762,16 +825,15 @@ sub UnicodeData_Txt() $Deco->Append($code, $deco); if ($deco =~/^<(\w+)>/) { - $Deco{Compat}->Append($code); + my $dshort = $PVA_reverse{dt}{ucfirst lc $1}; + $DC{com}->Append($code); - $DC{$1} ||= Table->New(Is => "DC$1", - Desc => "Compatible with '$1'", - Fuzzy => 0); - $DC{$1}->Append($code); + $DC{$dshort} ||= Table->New(); + $DC{$dshort}->Append($code); } else { - $Deco{Canon}->Append($code); + $DC{can}->Append($code); } } } @@ -794,8 +856,9 @@ sub UnicodeData_Txt() $Cat{C}->Replace($Cat{C}->Merge($Cat{Cn})); ## Now merge in Cn into C - # L& is Ll, Lu, and Lt. - New_Prop(Is => 'L&', + # LC is Ll, Lu, and Lt. + # (used to be L& or L_, but PropValueAliases.txt defines it as LC) + New_Prop(Is => 'LC', Table->Merge(@Cat{qw[Ll Lu Lt]}), Desc => '[\p{Ll}\p{Lu}\p{Lt}]', Fuzzy => 0); @@ -861,15 +924,54 @@ sub UnicodeData_Txt() ## Now dump the files. ## $Name->Write("Name.pl"); - $Bidi->Write("Bidirectional.pl"); + + # $Bidi->Write("Bidirectional.pl"); + mkdir("lib/bc", 0755); + for (keys %Bidi) { + $Bidi{$_}->Write( + "lib/bc/$_.pl", + "BidiClass category '$PropValueAlias{bc}{$_}'" + ); + } + $Comb->Write("CombiningClass.pl"); + mkdir("lib/ccc", 0755); + for (keys %{ $PropValueAlias{ccc} }) { + my ($code, $name) = @{ $PropValueAlias{ccc}{$_} }; + (my $c = Table->New())->Append($code); + $c->Write( + "lib/ccc/$_.pl", + "CombiningClass category '$name'" + ); + } + $Deco->Write("Decomposition.pl"); - $Number->Write("Number.pl"); - $General->Write("Category.pl"); + mkdir("lib/dt", 0755); + for (keys %DC) { + $DC{$_}->Write( + "lib/dt/$_.pl", + "DecompositionType category '$PropValueAlias{dt}{$_}'" + ); + } + + # $Number->Write("Number.pl"); + mkdir("lib/nt", 0755); + for (keys %Number) { + $Number{$_}->Write( + "lib/nt/$_.pl", + "NumericType category '$PropValueAlias{nt}{$_}'" + ); + } + + # $General->Write("Category.pl"); for my $to (sort keys %To) { $To{$to}->Write("To/$to.pl"); } + + for (keys %{ $PropValueAlias{gc} }) { + New_Alias(Is => $PropValueAlias{gc}{$_}, SameAs => $_, Fuzzy => 1); + } } ## @@ -892,9 +994,7 @@ sub LineBreak_Txt() $Lbrk->Append($first, $lbrk); - $Lbrk{$lbrk} ||= Table->New(Is => "Lbrk$lbrk", - Desc => "Linebreak category '$lbrk'", - Fuzzy => 0); + $Lbrk{$lbrk} ||= Table->New(); $Lbrk{$lbrk}->Append($first); if ($last) { @@ -904,7 +1004,16 @@ sub LineBreak_Txt() } close IN; - $Lbrk->Write("Lbrk.pl"); + # $Lbrk->Write("Lbrk.pl"); + + mkdir("lib/lb", 0755); + + for (keys %Lbrk) { + $Lbrk{$_}->Write( + "lib/lb/$_.pl", + "Linebreak category '$PropValueAlias{lb}{$_}'" + ); + } } ## @@ -919,6 +1028,8 @@ sub ArabicShaping_txt() my $ArabLink = Table->New(); my $ArabLinkGroup = Table->New(); + my %JoinType; + while () { next unless /^[0-9A-Fa-f]+;/; @@ -928,11 +1039,90 @@ sub ArabicShaping_txt() my $code = hex($hexcode); $ArabLink->Append($code, $link); $ArabLinkGroup->Append($code, $linkgroup); + + $JoinType{$link} ||= Table->New(Is => "JoinType$link"); + $JoinType{$link}->Append($code); } close IN; - $ArabLink->Write("ArabLink.pl"); - $ArabLinkGroup->Write("ArabLnkGrp.pl"); + # $ArabLink->Write("ArabLink.pl"); + # $ArabLinkGroup->Write("ArabLnkGrp.pl"); + + mkdir("lib/jt", 0755); + + for (keys %JoinType) { + $JoinType{$_}->Write( + "lib/jt/$_.pl", + "JoiningType category '$PropValueAlias{jt}{$_}'" + ); + } +} + +## +## Process EastAsianWidth.txt. +## +sub EastAsianWidth_txt() +{ + if (not open IN, "EastAsianWidth.txt") { + die "$0: EastAsianWidth.txt: $!\n"; + } + + my %EAW; + + while () + { + next unless /^[0-9A-Fa-f]+;/; + s/#.*//; + s/\s+$//; + + my ($hexcode, $pv) = split(/\s*;\s*/); + my $code = hex($hexcode); + $EAW{$pv} ||= Table->New(Is => "EastAsianWidth$pv"); + $EAW{$pv}->Append($code); + } + close IN; + + mkdir("lib/ea", 0755); + + for (keys %EAW) { + $EAW{$_}->Write( + "lib/ea/$_.pl", + "EastAsianWidth category '$PropValueAlias{ea}{$_}'" + ); + } +} + +## +## Process HangulSyllableType.txt. +## +sub HangulSyllableType_txt() +{ + if (not open IN, "HangulSyllableType.txt") { + die "$0: HangulSyllableType.txt: $!\n"; + } + + my %HST; + + while () + { + next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; + my ($first, $last, $pv) = (hex($1), hex($2||""), $3); + + $HST{$pv} ||= Table->New(Is => "HangulSyllableType$pv"); + $HST{$pv}->Append($first); + + if ($last) { $HST{$pv}->Extend($last) } + } + close IN; + + mkdir("lib/hst", 0755); + + for (keys %HST) { + $HST{$_}->Write( + "lib/hst/$_.pl", + "HangulSyllableType category '$PropValueAlias{hst}{$_}'" + ); + } } ## @@ -953,7 +1143,7 @@ sub Jamo_txt() $Short->Append($code, $short); } close IN; - $Short->Write("JamoShort.pl"); + # $Short->Write("JamoShort.pl"); } ## @@ -996,7 +1186,7 @@ sub Scripts_txt() } } - $Scripts->Write("Scripts.pl"); + # $Scripts->Write("Scripts.pl"); ## Common is everything not explicitly assigned to a Script ## @@ -1057,7 +1247,7 @@ sub Blocks_txt() } close IN; - $Blocks->Write("Blocks.pl"); + # $Blocks->Write("Blocks.pl"); } ## @@ -1105,6 +1295,14 @@ sub PropList_txt() } } + for (keys %Prop) { + (my $file = $PA_reverse{$_}) =~ tr/_//d; + $Prop{$_}->Write( + "lib/gc_sc/$file.pl", + "Binary property '$_'" + ); + } + # Alphabetic is L and Other_Alphabetic. New_Prop(Is => 'Alphabetic', Table->Merge($Cat{L}, $Prop{Other_Alphabetic}), @@ -1142,65 +1340,6 @@ sub PropList_txt() Fuzzy => 1); } -sub Make_GC_Aliases() -{ - ## - ## The mapping from General Category long forms to short forms is - ## currently hardwired here since no simple data file in the UCD - ## seems to do that. Unicode 3.2 will assumedly correct this. - ## - my %Is = ( - 'Letter' => 'L', - 'Uppercase_Letter' => 'Lu', - 'Lowercase_Letter' => 'Ll', - 'Titlecase_Letter' => 'Lt', - 'Modifier_Letter' => 'Lm', - 'Other_Letter' => 'Lo', - - 'Mark' => 'M', - 'Non_Spacing_Mark' => 'Mn', - 'Spacing_Mark' => 'Mc', - 'Enclosing_Mark' => 'Me', - - 'Separator' => 'Z', - 'Space_Separator' => 'Zs', - 'Line_Separator' => 'Zl', - 'Paragraph_Separator' => 'Zp', - - 'Number' => 'N', - 'Decimal_Number' => 'Nd', - 'Letter_Number' => 'Nl', - 'Other_Number' => 'No', - - 'Punctuation' => 'P', - 'Connector_Punctuation' => 'Pc', - 'Dash_Punctuation' => 'Pd', - 'Open_Punctuation' => 'Ps', - 'Close_Punctuation' => 'Pe', - 'Initial_Punctuation' => 'Pi', - 'Final_Punctuation' => 'Pf', - 'Other_Punctuation' => 'Po', - - 'Symbol' => 'S', - 'Math_Symbol' => 'Sm', - 'Currency_Symbol' => 'Sc', - 'Modifier_Symbol' => 'Sk', - 'Other_Symbol' => 'So', - - 'Other' => 'C', - 'Control' => 'Cc', - 'Format' => 'Cf', - 'Surrogate' => 'Cs', - 'Private Use' => 'Co', - 'Unassigned' => 'Cn', - ); - - ## make the aliases.... - while (my ($Alias, $Name) = each %Is) { - New_Alias(Is => $Alias, SameAs => $Name, Fuzzy => 1); - } -} - ## ## These are used in: @@ -1423,7 +1562,9 @@ sub WriteAllMappings() my $filename; { ## 'Is' items lose 'Is' from the basename. - $filename = $Type eq 'Is' ? $Name : "$Type$Name"; + $filename = $Type eq 'Is' ? + ($PVA_reverse{sc}{$Name} || $Name) : + "$Type$Name"; $filename =~ s/[^\w_]+/_/g; # "L&" -> "L_" substr($filename, 8) = '' if length($filename) > 8; @@ -1496,7 +1637,7 @@ sub WriteAllMappings() ## ## Okay, write the file... ## - $Table->Write("lib/$filename.pl", $Comment); + $Table->Write("lib/gc_sc/$filename.pl", $Comment); ## and register it $RawNameToFile{$Name} = $filename; @@ -1560,10 +1701,11 @@ sub WriteAllMappings() "##\n", "## Data in this file used by ../utf8_heavy.pl\n", "##\n\n", - "## Mapping from name to filename in ./lib\n", + "## Mapping from name to filename in ./lib/gc_sc\n", "%utf8::Exact = (\n", ); + $Exact{InGreek} = 'InGreekA'; # this is evil kludge for my $Name (sort keys %Exact) { my $File = $Exact{$Name}; @@ -1651,16 +1793,18 @@ sub SpecialCasing_txt() { my $NormalCase = do "To/$case.pl" || die "$0: $@\n"; - my @OUT = ( - $HEADER, "\n", - "%utf8::ToSpec$case =\n(\n", - ); + my @OUT = + ( + $HEADER, "\n", + "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n", + "%utf8::ToSpec$case =\n(\n", + ); for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) { my ($ix, $code, $to) = @$prop; my $tostr = join "", map { sprintf "\\x{%s}", $_ } split ' ', $to; - push @OUT, sprintf qq['%04X' => "$tostr",\n], $ix; + push @OUT, sprintf qq["%s" => "$tostr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $ix))); # Remove any single-character mappings for # the same character since we are going for # the special casing rules. @@ -1712,14 +1856,16 @@ sub CaseFolding_txt() # my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n"; - my @OUT = ( - $HEADER, "\n", - "%utf8::ToSpecFold =\n(\n", - ); + my @OUT = + ( + $HEADER, "\n", + "# The ke UTF-8 _bytes_, the value UTF-8 (speed hack)\n", + "%utf8::ToSpecFold =\n(\n", + ); for my $code (sort { $a <=> $b } keys %Fold) { my $foldstr = join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code}; - push @OUT, sprintf qq['%04X' => "$foldstr",\n], $code; + push @OUT, sprintf qq["%s" => "$foldstr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $code))); } push @OUT, ( ");\n\n", @@ -1733,8 +1879,8 @@ sub CaseFolding_txt() ## Do it.... +Build_Aliases(); UnicodeData_Txt(); -Make_GC_Aliases(); PropList_txt(); Scripts_txt(); @@ -1744,6 +1890,8 @@ WriteAllMappings(); LineBreak_Txt(); ArabicShaping_txt(); +EastAsianWidth_txt(); +HangulSyllableType_txt(); Jamo_txt(); SpecialCasing_txt(); CaseFolding_txt();