#!/usr/bin/perl -w # # mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl) # from the Unicode database files (lib/unicore/*.txt). # use strict; my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1. mkdir("In", 0755); mkdir("Is", 0755); mkdir("To", 0755); sub extend { my ($table, $last) = @_; $table->[-1]->[1] = $last; } sub append { my ($table, $code, $name) = @_; if (@$table && hex($table->[-1]->[1]) == hex($code) - 1 && (!defined $name || $table->[-1]->[2] eq $name)) { extend($table, $code); } else { push @$table, [$code, $code, $name]; } } sub append_range { my ($table, $code_ini, $code_fin, $name) = @_; append($table, $code_ini, $name); extend($table, $code_fin); } sub inverse { my ($table) = @_; my $inverse = []; my ($first, $last); if ($table->[0]->[0]) { $last = hex($table->[0]->[0]); push @$inverse, [ "0000", sprintf("%04X", $last - 1) ]; } for my $i (0..$#$table-1) { $first = defined $table->[$i ]->[1] ? hex($table->[$i ]->[1]) : 0; $last = defined $table->[$i + 1]->[0] ? hex($table->[$i + 1]->[0]) : $first; push @$inverse, [ sprintf("%04X", $first + 1), sprintf("%04X", $last - 1) ] unless $first + 1 == $last; } return $inverse; } sub header { my $fh = shift; print $fh <$file")) { header($fh); begin($fh); for my $i (@$table) { print $fh $i->[0], "\t", $i->[1] ne $i->[0] ? $i->[1] : "", "\t", defined $i->[2] ? $i->[2] : "", "\n"; } end($fh); close($fh); } else { die "$0: $file: $!\n"; } } # # The %In contains the mapping of the script/block name into a number. # my %In; my $InId = 0; my %InIn; my %InScript; my %InBlock; # # Read in the Unicode.txt, the main Unicode database. # my %Cat; my %General; my @General; sub gencat { my ($Name, $GeneralH, $GeneralA, $Cat, $name, $cat, $code, $op) = @_; $op->($Name, $code, $name); $op->($GeneralA, $code, $cat); $op->($GeneralH->{$name} ||= [], $code, $name); $op->($Cat->{$cat} ||= [], $code); $op->($Cat->{substr($cat, 0, 1)} ||= [], $code); # 005F: SPACING UNDERSCORE $op->($Cat->{Word} ||= [], $code) if $cat =~ /^[LMN]/ || $code eq "005F"; $op->($Cat->{Alnum} ||= [], $code) if $cat =~ /^[LMN]/; $op->($Cat->{Alpha} ||= [], $code) if $cat =~ /^[LM]/; # 0009: HORIZONTAL TABULATION # 000A: LINE FEED # 000B: VERTICAL TABULATION # 000C: FORM FEED # 000D: CARRIAGE RETURN # 0020: SPACE $op->($Cat->{Space} ||= [], $code) if $cat =~ /^Z/ || $code =~ /^(0009|000A|000B|000C|000D)$/; $op->($Cat->{SpacePerl} ||= [], $code) if $cat =~ /^Z/ || $code =~ /^(?:0085|2028|2029)$/ || $code =~ /^(0009|000A|000C|000D)$/; $op->($Cat->{Blank} ||= [], $code) if $code =~ /^(0020|0009)$/ || $cat =~ /^Z[^lp]$/; $op->($Cat->{Digit} ||= [], $code) if $cat eq "Nd"; $op->($Cat->{Upper} ||= [], $code) if $cat eq "Lu"; $op->($Cat->{Lower} ||= [], $code) if $cat eq "Ll"; $op->($Cat->{Title} ||= [], $code) if $cat eq "Lt"; $op->($Cat->{ASCII} ||= [], $code) if $code le "007F"; $op->($Cat->{Cntrl} ||= [], $code) if $cat =~ /^C/; $op->($Cat->{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/; $op->($Cat->{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/; $op->($Cat->{Punct} ||= [], $code) if $cat =~ /^P/; # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f $op->($Cat->{XDigit} ||= [], $code) if $code =~ /^00(3[0-9]|[46][1-6])$/; } if (open(my $Unicode, "Unicode.txt")) { my @Name; my @Bidi; my %Bidi; my @Comb; my @Deco; my %Deco; my %DC; my @Number; my @Mirrored; my %To; my $LastCodeInt = -1; # a numeric, not a hexadecimal string. # UnicodeData-3.1.0.html says # no characters in the file have the property, Cn, Not Assigned. sub check_no_characters { # in the scope of my $LastCodeInt; my $code = shift; my $diff_from_last = hex($code) - $LastCodeInt; my $code_ini = sprintf("%04X", $LastCodeInt + 1); $LastCodeInt = hex($code); if ($diff_from_last == 1) { return; } elsif ($diff_from_last == 2) { append($Cat{Cn} ||= [], $code_ini); append($Cat{C} ||= [], $code_ini); } else { my $code_fin = sprintf("%04X", hex($code) - 1); append_range($Cat{Cn} ||= [], $code_ini, $code_fin); append_range($Cat{C} ||= [], $code_ini, $code_fin); } } while (<$Unicode>) { next unless /^[0-9A-Fa-f]+;/; s/\s+$//; my ($code, $name, $cat, $comb, $bidi, $deco, $decimal, $digit, $number, $mirrored, $unicode10, $comment, $upper, $lower, $title) = split(/\s*;\s*/); if ($name =~ /^<(.+), (First|Last)>$/) { if($2 eq 'First') { check_no_characters($code); } else { $LastCodeInt = hex($code); } $name = $1; gencat(\@Name, \%General, \@General, \%Cat, $name, $cat, $code, $2 eq 'First' ? \&append : \&extend); unless (defined $In{$name}) { $In{$name} = $InId++; $InIn{$name} = $General{$name}; } } else { check_no_characters($code); gencat(\@Name, \%General, \@General, \%Cat, $name, $cat, $code, \&append); append($To{Upper} ||= [], $code, $upper) if $upper; append($To{Lower} ||= [], $code, $lower) if $lower; append($To{Title} ||= [], $code, $title) if $title; append($To{Digit} ||= [], $code, $decimal) if $decimal; append(\@Bidi, $code, $bidi); append($Bidi{$bidi} ||= [], $code); append(\@Comb, $code, $comb) if $comb; if ($deco) { append(\@Deco, $code, $deco); if ($deco =~/^<(\w+)>/) { append($Deco{Compat} ||= [], $code); append($DC{$1} ||= [], $code); } else { append($Deco{Canon} ||= [], $code); } } append(\@Number, $code, $number) if $number; append(\@Mirrored, $code) if $mirrored eq "Y"; } } check_no_characters(sprintf("%X", $LastUnicodeCodepoint + 1)); flush(\@Name, "Name.pl"); foreach my $cat (sort keys %Cat) { flush($Cat{$cat}, "Is/$cat.pl"); } foreach my $to (sort keys %To) { flush($To{$to}, "To/$to.pl"); } flush(\@Bidi, "Bidirectional.pl"); foreach my $bidi (sort keys %Bidi) { flush($Bidi{$bidi}, "Is/Bidi$bidi.pl"); } flush(\@Comb, "CombiningClass.pl"); flush(\@Deco, "Decomposition.pl"); foreach my $deco (sort keys %Deco) { flush($Deco{$deco}, "Is/Deco$deco.pl"); } foreach my $dc (sort keys %DC) { flush($DC{$dc}, "Is/DC$dc.pl"); } flush(\@Number, "Number.pl"); flush(\@Mirrored, "Is/Mirrored.pl"); } else { die "$0: Unicode.txt: $!\n"; } # The general cateory can be written out already now. flush(\@General, "Category.pl"); # # Read in the LineBrk.txt. # if (open(my $LineBrk, "LineBrk.txt")) { my @Lbrk; my %Lbrk; while (<$LineBrk>) { next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; my ($first, $last, $lbrk) = ($1, $2, $3); append(\@Lbrk, $first, $lbrk); append($Lbrk{$lbrk} ||= [], $first); if (defined $last) { extend(\@Lbrk, $last); extend($Lbrk{$lbrk}, $last); } } flush(\@Lbrk, "Lbrk.pl"); foreach my $lbrk (sort keys %Lbrk) { flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl"); } } else { die "$0: LineBrk.txt: $!\n"; } # # Read in the ArabShap.txt. # if (open(my $ArabShap, "ArabShap.txt")) { my @ArabLink; my @ArabLinkGroup; while (<$ArabShap>) { next unless /^[0-9A-Fa-f]+;/; s/\s+$//; my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/); append(\@ArabLink, $code, $link); append(\@ArabLinkGroup, $code, $linkgroup); } flush(\@ArabLink, "ArabLink.pl"); flush(\@ArabLinkGroup, "ArabLnkGrp.pl"); } else { die "$0: ArabShap.txt: $!\n"; } # # Read in the Jamo.txt. # if (open(my $Jamo, "Jamo.txt")) { my @Short; while (<$Jamo>) { next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/; my ($code, $short) = ($1, $2); append(\@Short, $code, $short); } flush(\@Short, "JamoShort.pl"); } else { die "$0: Jamo.txt: $!\n"; } # # Read in the Scripts.txt. # my @Scripts; if (open(my $Scripts, "Scripts.txt")) { while (<$Scripts>) { next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; # Wait until all the scripts have been read since # they are not listed in numeric order. push @Scripts, [ hex($1), $1, $2, $3 ]; } } else { die "$0: Scripts.txt: $!\n"; } # Now append the scripts properties in their code point order. my %Script; my $Scripts = []; for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) { my ($code, $first, $last, $name) = @$script; append($Scripts, $first, $name); append($Script{$name} ||= [], $first, $name); if (defined $last) { extend($Scripts, $last); extend($Script{$name}, $last); } unless (defined $In{$name}) { $InScript{$InId} = $name; $In{$name} = $InId++; $InIn{$name} = $Script{$name}; } } # Scripts.pl can be written out already now. flush(\@Scripts, "Scripts.pl"); # Common is everything not explicitly assigned to a Script $In{Common} = $InId++; my $Common = inverse($Scripts); $InIn{Common} = $Common; # # Read in the Blocks.txt. # my @Blocks; my %Blocks; if (open(my $Blocks, "Blocks.txt")) { while (<$Blocks>) { next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; my ($first, $last, $name) = ($1, $2, $3); my $origname = $name; # If there's a naming conflict (the script names are # in uppercase), the name of the block has " Block" # appended to it. my $pat = $name; $pat =~ s/([- _])/(?:[-_]|\\s+)?/g; for my $i (values %InScript) { if ($i =~ /^$pat$/i) { $name .= " Block"; last; } } append(\@Blocks, $first, $name); append($Blocks{$name} ||= [], $first, $name); if (defined $last) { extend(\@Blocks, $last); extend($Blocks{$name}, $last); } unless (defined $In{$name}) { $InBlock{$InId} = $origname; $In{$name} = $InId++; $InIn{$name} = $Blocks{$name}; } } } else { die "$0: Blocks.txt: $!\n"; } # Blocks.pl can be written out already now. flush(\@Blocks, "Blocks.pl"); # # Read in the PropList.txt. It contains extended properties not # listed in the Unicode.txt, such as 'Other_Alphabetic': # alphabetic but not of the general category L; many modifiers # belong to this extended property category: while they are not # alphabets, they are alphabetic in nature. # my @Props; if (open(my $Props, "PropList.txt")) { while (<$Props>) { next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; # Wait until all the extended properties have been read since # they are not listed in numeric order. push @Props, [ hex($1), $1, $2, $3 ]; } } else { die "$0: PropList.txt: $!\n"; } # Now append the extended properties in their code point order. my %Prop; my $Props = []; for my $prop (sort { $a->[0] <=> $b->[0] } @Props) { my ($code, $first, $last, $name) = @$prop; append($Props, $first, $name); append($Prop{$name} ||= [], $first, $name); if (defined $last) { extend($Props, $last); extend($Prop{$name}, $last); } unless (defined $In{$name}) { $In{$name} = $InId++; $InIn{$name} = $Prop{$name}; } } # Assigned is everything not Cn $In{Assigned} = $InId++; my $Assigned = inverse($Cat{Cn}); $InIn{Assigned} = $Assigned; # Unassigned is everything not Assigned $In{Unassigned} = $InId++; my $Unassigned = $Cat{Cn}; $InIn{Unassigned} = $Unassigned; # Unassigned is everything not Assigned sub merge_general_and_extended { my ($name, $general, $extended) = @_; my $merged; push @$merged, map { pop @{$_}; $_ } sort { $a->[2] <=> $b->[2] } map { [ $_->[0], $_->[1], hex($_->[0]) ] } ($general ? map { ref $_ ? @$_ : $_ } @Cat {ref $general ? @$general : $general } : (), $extended ? map { ref $_ ? @$_ : $_ } @Prop{ref $extended ? @$extended : $extended} : ()); $In{$name} = $InId++; $InIn{$name} = $merged; return $merged; } # Alphabetic is L and Other_Alphabetic. my $Alphabetic = merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic'); # Lowercase is Ll and Other_Lowercase. my $Lowercase = merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase'); # Uppercase is Lu and Other_Uppercase. my $Uppercase = merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase'); # Math is Sm and Other_Math. my $Math = merge_general_and_extended('Math', 'Sm', 'Other_Math'); # Lampersand is Ll, Lu, and Lt. my $Lampersand = merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]); # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl. my $ID_Start = merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]); # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc. my $ID_Continue = merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl Mn Mc Nd Pc) ]); # # Any is any. # $In{Any} = $InId++; my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ]; $InIn{Any} = $Any; # # All is any, too. # $In{All} = $InId++; $InIn{All} = $Any; # # mapping() will be used to write out the In and Is virtual mappings. # sub mapping { my ($map, $name) = @_; if (open(my $fh, ">$name.pl")) { print "$name.pl\n"; header($fh); # The %pat will hold a hash that maps the first two # lowercased letters of a class to a 'fuzzified' regular # expression that points to the real mapping. my %pat; # But first write out the offical name to real name # (the filename) mapping. print $fh < '$map->{$i}',\n", "'$i'"; } print $fh < {\n"; foreach my $ipat (@{$pat{$prefix}}) { my ($i, $pat) = @$ipat; print $fh "\t'$pat' => '$map->{$i}',\n"; } print $fh "},\n"; } print $fh <>In.pl")) { print $In < $b } keys %InScript) { printf $In "%4d => '$InScript{$i}',\n", $i; } print $In < $b } keys %InBlock) { printf $In "%4d => '$InBlock{$i}',\n", $i; } print $In < $In{$b} } keys %In) { flush($InIn{$in}, "In/$In{$in}.pl"); } # # The mapping from General Category long forms to short forms is # currently hardwired here since no simple data file in the UCD # seems to do that. Unicode 3.2 will assumedly correct this. # my %Is = ( 'Letter' => 'L', 'Uppercase_Letter' => 'Lu', 'Lowercase_Letter' => 'Ll', 'Titlecase_Letter' => 'Lt', 'Modifier_Letter' => 'Lm', 'Other_Letter' => 'Lo', 'Mark' => 'M', 'Non_Spacing_Mark' => 'Mn', 'Spacing_Mark' => 'Mc', 'Enclosing_Mark' => 'Me', 'Separator' => 'Z', 'Space_Separator' => 'Zs', 'Line_Separator' => 'Zl', 'Paragraph_Separator' => 'Zp', 'Number' => 'N', 'Decimal_Number' => 'Nd', 'Letter_Number' => 'Nl', 'Other_Number' => 'No', 'Punctuation' => 'P', 'Connector_Punctuation' => 'Pc', 'Dash_Punctuation' => 'Pd', 'Open_Punctuation' => 'Ps', 'Close_Punctuation' => 'Pe', 'Initial_Punctuation' => 'Pi', 'Final_Punctuation' => 'Pf', 'Other_Punctuation' => 'Po', 'Symbol' => 'S', 'Math_Symbol' => 'Sm', 'Currency_Symbol' => 'Sc', 'Modifier_Symbol' => 'Sk', 'Other_Symbol' => 'So', 'Other' => 'C', 'Control' => 'Cc', 'Format' => 'Cf', 'Surrogate' => 'Cs', 'Private Use' => 'Co', 'Unassigned' => 'Cn', ); # # Write out the virtual Is mappings. # mapping(\%Is, "Is"); # # Read in the special cases. # my %Case; if (open(my $SpecCase, "SpecCase.txt")) { while (<$SpecCase>) { next unless /^[0-9A-Fa-f]+;/; s/\#.*//; s/\s+$//; my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/); if ($condition) { # not implemented yet print "# SKIPPING $_\n"; next; } # Wait until all the special cases have been read since # they are not listed in numeric order. my $ix = hex($code); push @{$Case{Lower}}, [ $ix, $code, $lower ]; push @{$Case{Title}}, [ $ix, $code, $title ]; push @{$Case{Upper}}, [ $ix, $code, $upper ]; } } else { die "$0: SpecCase.txt: $!\n"; } # Now write out the special cases properties in their code point order. # Prepend them to the To/{Upper,Lower,Title}.pl. for my $case (qw(Lower Title Upper)) { my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n"; if (open(my $Case, ">To/$case.pl")) { header($Case); print $Case <[0] <=> $b->[0] } @{$Case{$case}}) { my ($ix, $code, $to) = @$prop; my $tostr = join "", map { sprintf "\\x{%s}", $_ } split ' ', $to; printf $Case qq['%04X' => "$tostr",\n], $ix; } print $Case <) { # Skip status 'S', simple case folding next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/; my ($code, $status, $fold) = ($1, $2, $3); if ($status eq 'C') { # Common: one-to-one folding # No append() since several codes may fold into one. push @Fold, [ $code, $code, $fold ]; } else { # F: full, or I: dotted uppercase I -> dotless lowercase I $Fold{hex($code)} = $fold; } } flush(\@Fold, "To/Fold.pl"); # # Prepend the special foldings to the common foldings. # my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n"; if (open(my $Fold, ">To/Fold.pl")) { header($Fold); print $Fold < $b } keys %Fold) { my $foldstr = join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code}; printf $Fold qq['%04X' => "$foldstr",\n], $code; } print $Fold <