#!/usr/bin/perl -w # # mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl) # from the Unicode database files (lib/unicore/*.txt). # use strict; my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1. mkdir("In", 0755); mkdir("Is", 0755); mkdir("To", 0755); sub extend { my ($table, $last) = @_; $table->[-1]->[1] = $last; } sub append { my ($table, $code, $name) = @_; if (@$table && hex($table->[-1]->[1]) == hex($code) - 1 && (!defined $name || $table->[-1]->[2] eq $name)) { extend($table, $code); } else { push @$table, [$code, $code, $name]; } } sub inverse { my ($table) = @_; my $inverse = []; my ($first, $last); if ($table->[0]->[0]) { $last = hex($table->[0]->[0]); push @$inverse, [ "0000", sprintf("%04X", $last - 1) ]; } for my $i (0..$#$table-1) { $first = defined $table->[$i ]->[1] ? hex($table->[$i ]->[1]) : 0; $last = defined $table->[$i + 1]->[0] ? hex($table->[$i + 1]->[0]) : $first; push @$inverse, [ sprintf("%04X", $first + 1), sprintf("%04X", $last - 1) ] unless $first + 1 == $last; } return $inverse; } sub header { my $fh = shift; print $fh <$file")) { header($fh); begin($fh); for my $i (@$table) { print $fh $i->[0], "\t", $i->[1] ne $i->[0] ? $i->[1] : "", "\t", defined $i->[2] ? $i->[2] : "", "\n"; } end($fh); close($fh); } else { die "$0: $file: $!\n"; } } # # The %In contains the mapping of the script/block name into a number. # my %In; my $InId = 0; my %InIn; # # Read in the Unicode.txt, the main Unicode database. # my %Cat; my %General; my @General; if (open(my $Unicode, "Unicode.txt")) { my @Name; my @Bidi; my %Bidi; my @Comb; my @Deco; my %Deco; my %DC; my @Number; my @Mirrored; my %To; while (<$Unicode>) { next unless /^[0-9A-Fa-f]+;/; s/\s+$//; my ($code, $name, $cat, $comb, $bidi, $deco, $decimal, $digit, $number, $mirrored, $unicode10, $comment, $upper, $lower, $title) = split(/\s*;\s*/); if ($name =~ /^<(.+), (First|Last)>$/) { $name = $1; if ($2 eq 'First') { append($General{$name} ||= [], $code, $name); } else { extend($General{$name} , $code); } unless (defined $In{$name}) { $In{$name} = $InId++; $InIn{$name} = $General{$name}; } append($Cat{$cat} ||= [], $code); append($Cat{substr($cat, 0, 1)} ||= [], $code); } else { append(\@Name, $code, $name); append(\@General, $code, $cat); append($Cat{$cat} ||= [], $code); append($Cat{substr($cat, 0, 1)} ||= [], $code); # 005F: SPACING UNDERSCORE append($Cat{Word} ||= [], $code) if $cat =~ /^[LMN]/ or $code eq "005F"; append($Cat{Alnum} ||= [], $code) if $cat =~ /^[LMN]/; append($Cat{Alpha} ||= [], $code) if $cat =~ /^[LM]/; # 0009: HORIZONTAL TABULATION # 000A: LINE FEED # 000B: VERTICAL TABULATION # 000C: FORM FEED # 000D: CARRIAGE RETURN # 0020: SPACE append($Cat{Space} ||= [], $code) if $cat =~ /^Z/ || $code =~ /^(0009|000A|000B|000C|000D)$/; append($Cat{SpacePerl} ||= [], $code) if $cat =~ /^Z/ || $code =~ /^(0009|000A|000C|000D)$/; append($Cat{Blank} ||= [], $code) if $code =~ /^(0020|0009)$/ || $cat =~ /^Z[^lp]$/; append($Cat{Digit} ||= [], $code) if $cat eq "Nd"; append($Cat{Upper} ||= [], $code) if $cat eq "Lu"; append($Cat{Lower} ||= [], $code) if $cat eq "Ll"; append($Cat{Title} ||= [], $code) if $cat eq "Lt"; append($Cat{ASCII} ||= [], $code) if $code le "007F"; append($Cat{Cntrl} ||= [], $code) if $cat =~ /^C/; append($Cat{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/; append($Cat{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/; append($Cat{Punct} ||= [], $code) if $cat =~ /^P/; # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f append($Cat{XDigit} ||= [], $code) if $code =~ /^00(3[0-9]|[46][1-6])$/; append($To{Upper} ||= [], $code, $upper) if $upper; append($To{Lower} ||= [], $code, $lower) if $lower; append($To{Title} ||= [], $code, $title) if $title; append($To{Digit} ||= [], $code, $decimal) if $decimal; append(\@Bidi, $code, $bidi); append($Bidi{$bidi} ||= [], $code); append(\@Comb, $code, $comb) if $comb; if ($deco) { append(\@Deco, $code, $deco); if ($deco =~/^<(\w+)>/) { append($Deco{Compat} ||= [], $code); append($DC{$1} ||= [], $code); } else { append($Deco{Canon} ||= [], $code); } } append(\@Number, $code, $number) if $number; append(\@Mirrored, $code) if $mirrored eq "Y"; } } flush(\@Name, "Name.pl"); foreach my $cat (sort keys %Cat) { flush($Cat{$cat}, "Is/$cat.pl"); } foreach my $to (sort keys %To) { flush($To{$to}, "To/$to.pl"); } flush(\@Bidi, "Bidirectional.pl"); foreach my $bidi (sort keys %Bidi) { flush($Bidi{$bidi}, "Is/Bidi$bidi.pl"); } flush(\@Comb, "CombiningClass.pl"); flush(\@Deco, "Decomposition.pl"); foreach my $deco (sort keys %Deco) { flush($Deco{$deco}, "Is/Deco$deco.pl"); } foreach my $dc (sort keys %DC) { flush($DC{$dc}, "Is/DC$dc.pl"); } flush(\@Number, "Number.pl"); flush(\@Mirrored, "Is/Mirrored.pl"); } else { die "$0: Unicode.txt: $!\n"; } # The general cateory can be written out already now. flush(\@General, "Category.pl"); # # Read in the LineBrk.txt. # if (open(my $LineBrk, "LineBrk.txt")) { my @Lbrk; my %Lbrk; while (<$LineBrk>) { next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; my ($first, $last, $lbrk) = ($1, $2, $3); append(\@Lbrk, $first, $lbrk); append($Lbrk{$lbrk} ||= [], $first); if (defined $last) { extend(\@Lbrk, $last); extend($Lbrk{$lbrk}, $last); } } flush(\@Lbrk, "Lbrk.pl"); foreach my $lbrk (sort keys %Lbrk) { flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl"); } } else { die "$0: LineBrk.txt: $!\n"; } # # Read in the ArabShap.txt. # if (open(my $ArabShap, "ArabShap.txt")) { my @ArabLink; my @ArabLinkGroup; while (<$ArabShap>) { next unless /^[0-9A-Fa-f]+;/; s/\s+$//; my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/); append(\@ArabLink, $code, $link); append(\@ArabLinkGroup, $code, $linkgroup); } flush(\@ArabLink, "ArabLink.pl"); flush(\@ArabLinkGroup, "ArabLnkGrp.pl"); } else { die "$0: ArabShap.txt: $!\n"; } # # Read in the Jamo.txt. # if (open(my $Jamo, "Jamo.txt")) { my @Short; while (<$Jamo>) { next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/; my ($code, $short) = ($1, $2); append(\@Short, $code, $short); } flush(\@Short, "JamoShort.pl"); } else { die "$0: Jamo.txt: $!\n"; } # # Read in the Scripts.txt. # my @Scripts; if (open(my $Scripts, "Scripts.txt")) { while (<$Scripts>) { next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; # Wait until all the scripts have been read since # they are not listed in numeric order. push @Scripts, [ hex($1), $1, $2, $3 ]; } } else { die "$0: Scripts.txt: $!\n"; } # Now append the scripts properties in their code point order. my %Script; my $Scripts = []; for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) { my ($code, $first, $last, $name) = @$script; append($Scripts, $first, $name); append($Script{$name} ||= [], $first, $name); if (defined $last) { extend($Scripts, $last); extend($Script{$name}, $last); } unless (defined $In{$name}) { $In{$name} = $InId++; $InIn{$name} = $Script{$name}; } } # Scripts.pl can be written out already now. flush(\@Scripts, "Scripts.pl"); # Common is everything not explicitly assigned to a Script $In{Common} = $InId++; my $Common = inverse($Scripts); $InIn{Common} = $Common; # # Read in the Blocks.txt. # my @Blocks; my %Blocks; if (open(my $Blocks, "Blocks.txt")) { while (<$Blocks>) { next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; my ($first, $last, $name) = ($1, $2, $3); # If there's a naming conflict (the script names are # in uppercase), the name of the block has " Block" # appended to it. $name = "$name Block" if defined $In{"\U$name"}; append(\@Blocks, $first, $name); append($Blocks{$name} ||= [], $first, $name); if (defined $last) { extend(\@Blocks, $last); extend($Blocks{$name}, $last); } unless (defined $In{$name}) { $In{$name} = $InId++; $InIn{$name} = $Blocks{$name}; } } } else { die "$0: Blocks.txt: $!\n"; } # Blocks.pl can be written out already now. flush(\@Blocks, "Blocks.pl"); # # Read in the PropList.txt. It contains extended properties not # listed in the Unicode.txt, such as 'Other_Alphabetic': # alphabetic but not of the general category L; many modifiers # belong to this extended property category: while they are not # alphabets, they are alphabetic in nature. # my @Props; if (open(my $Props, "PropList.txt")) { while (<$Props>) { next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; # Wait until all the extended properties have been read since # they are not listed in numeric order. push @Props, [ hex($1), $1, $2, $3 ]; } } else { die "$0: PropList.txt: $!\n"; } # Now append the extended properties in their code point order. my %Prop; my $Props = []; for my $prop (sort { $a->[0] <=> $b->[0] } @Props) { my ($code, $first, $last, $name) = @$prop; append($Props, $first, $name); append($Prop{$name} ||= [], $first, $name); if (defined $last) { extend($Props, $last); extend($Prop{$name}, $last); } unless (defined $In{$name}) { $In{$name} = $InId++; $InIn{$name} = $Prop{$name}; } } # Assigned is everything not Cn aka Noncharacter_Code_Point $In{Assigned} = $InId++; my $Assigned = inverse($Prop{Noncharacter_Code_Point}); $InIn{Assigned} = $Assigned; sub merge_general_and_extended { my ($name, $general, $extended) = @_; my $merged; push @$merged, map { pop @{$_}; $_ } sort { $a->[2] <=> $b->[2] } map { [ $_->[0], $_->[1], hex($_->[0]) ] } ($general ? map { ref $_ ? @$_ : $_ } @Cat {ref $general ? @$general : $general } : (), $extended ? map { ref $_ ? @$_ : $_ } @Prop{ref $extended ? @$extended : $extended} : ()); $In{$name} = $InId++; $InIn{$name} = $merged; return $merged; } # Alphabetic is L and Other_Alphabetic. my $Alphabetic = merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic'); # Lowercase is Ll and Other_Lowercase. my $Lowercase = merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase'); # Uppercase is Lu and Other_Uppercase. my $Uppercase = merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase'); # Math is Sm and Other_Math. my $Math = merge_general_and_extended('Math', 'Sm', 'Other_Math'); # Lampersand is Ll, Lu, and Lt. my $Lampersand = merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]); # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl. my $ID_Start = merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]); # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc. my $ID_Continue = merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl Mn Mc Nd Pc) ]); # # Any is any. # $In{Any} = $InId++; my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ]; $InIn{Any} = $Any; # # mapping() will be used to write out the In and Is virtual mappings. # sub mapping { my ($map, $name) = @_; if (open(my $fh, ">$name.pl")) { print "$name.pl\n"; header($fh); # The %pat will hold a hash that maps the first two # lowercased letters of a class to a 'fuzzified' regular # expression that points to the real mapping. my %pat; # But first write out the offical name to real name # (the filename) mapping. print $fh < '$map->{$i}',\n"; } print $fh < {\n"; foreach my $ipat (@{$pat{$prefix}}) { my ($i, $pat) = @$ipat; print $fh "\t'$pat' => '$map->{$i}',\n"; } print $fh "},\n"; } print $fh < $In{$b} } keys %In) { flush($InIn{$in}, "In/$In{$in}.pl"); } # # The mapping from General Category long forms to short forms is # currently hardwired here since no simple data file in the UCD # seems to do that. # my %Is = ( 'Letter' => 'L', 'Uppercase Letter' => 'Lu', 'Lowercase Letter' => 'Ll', 'Titlecase Letter' => 'Lt', 'Modifier Letter' => 'Lm', 'Other Letter' => 'Lo', 'Mark' => 'M', 'Non-Spacing Mark' => 'Mn', 'Spacing Combining Mark' => 'Mc', 'Enclosing Mark' => 'Me', 'Separator' => 'Z', 'Space Separator' => 'Zs', 'Line Separator' => 'Zl', 'Paragraph Separator' => 'Zp', 'Number' => 'N', 'Decimal Digit Number' => 'Nd', 'Letter Number' => 'Nl', 'Other Number' => 'No', 'Punctuation' => 'P', 'Connector Punctuation' => 'Pc', 'Dash Punctuation' => 'Pd', 'Open Punctuation' => 'Ps', 'Close Punctuation' => 'Pe', 'Initial Punctuation' => 'Pi', 'Final Punctuation' => 'Pf', 'Other Punctuation' => 'Po', 'Symbol' => 'S', 'Math Symbol' => 'Sm', 'Currency Symbol' => 'Sc', 'Modifier Symbol' => 'Sk', 'Other Symbol' => 'So', 'Other' => 'C', 'Control' => 'Cc', 'Format' => 'Cf', 'Surrogate' => 'Cs', 'Private Use' => 'Co', 'Not Assigned' => 'Cn', # 'Other' aliases 'Other Control' => 'Cc', 'Other Format' => 'Cf', 'Other Surrogate' => 'Cs', 'Other Private Use' => 'Co', 'Other Not Assigned' => 'Cn', ); # # Write out the virtual Is mappings. # mapping(\%Is, "Is"); # # Read in the special cases. # my %Case; if (open(my $SpecCase, "SpecCase.txt")) { while (<$SpecCase>) { next unless /^[0-9A-Fa-f]+;/; s/\#.*//; s/\s+$//; my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/); if ($condition) { # not implemented yet print "# SKIPPING $_\n"; next; } # Wait until all the special cases have been read since # they are not listed in numeric order. my $ix = hex($code); push @{$Case{Lower}}, [ $ix, $code, $lower ]; push @{$Case{Title}}, [ $ix, $code, $title ]; push @{$Case{Upper}}, [ $ix, $code, $upper ]; } } else { die "$0: SpecCase.txt: $!\n"; } # Now write out the special cases properties in their code point order. # The To/Spec{Lower,Title,Upper}.pl are unused for now since the swash # routines do not do returning multiple characters. for my $case (qw(Lower Title Upper)) { my @case; for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) { my ($ix, $code, $to) = @$prop; append(\@case, $code, $to); } flush(\@case, "To/Spec$case.pl"); } # That's all, folks!