+## !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
+## Any files created or read by this program should be listed in 'mktables.lst'
+
#!/usr/bin/perl -w
+require 5.008; # Needs pack "U". Probably safest to run on 5.8.x
use strict;
use Carp;
-
-die "$0: Please run me as ./mktables to avoid unnecessary differences\n"
- unless $0 eq "./mktables";
+use File::Spec;
##
## mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
## from the Unicode database files (lib/unicore/*.txt).
##
-mkdir("lib", 0755);
-mkdir("To", 0755);
+## "Fuzzy" means this section in Unicode TR18:
+##
+## The recommended names for UCD properties and property values are in
+## PropertyAliases.txt [Prop] and PropertyValueAliases.txt
+## [PropValue]. There are both abbreviated names and longer, more
+## descriptive names. It is strongly recommended that both names be
+## recognized, and that loose matching of property names be used,
+## whereby the case distinctions, whitespace, hyphens, and underbar
+## are ignored.
+
+## Base names already used in lib/gc_sc (for avoiding 8.3 conflicts)
+my %BaseNames;
##
## Process any args.
##
my $Verbose = 0;
my $MakeTestScript = 0;
+my $AlwaysWrite = 0;
+my $UseDir = "";
+my $FileList = "$0.lst";
+my $MakeList = 0;
while (@ARGV)
{
$Verbose = 1;
} elsif ($arg eq '-q') {
$Verbose = 0;
+ } elsif ($arg eq '-w') {
+ $AlwaysWrite = 1; # update the files even if they havent changed
+ $FileList = "";
+ } elsif ($arg eq '-check') {
+ my $this = shift @ARGV;
+ my $ok = shift @ARGV;
+ if ($this ne $ok) {
+ print "Skipping as check params are not the same.\n";
+ exit(0);
+ }
} elsif ($arg eq '-maketest') {
$MakeTestScript = 1;
+ } elsif ($arg eq '-makelist') {
+ $MakeList = 1;
+ } elsif ($arg eq '-C' && defined ($UseDir = shift)) {
+ -d $UseDir or die "Unknown directory '$UseDir'";
+ } elsif ($arg eq '-L' && defined ($FileList = shift)) {
+ -e $FileList or die "Filelist '$FileList' doesn't appear to exist!";
} else {
- die "usage: $0 [-v|-q] [-maketest]";
+ die "usage: $0 [-v|-q|-w|-C dir|-L filelist] [-maketest] [-makelist]\n",
+ " -v : Verbose Mode\n",
+ " -q : Quiet Mode\n",
+ " -w : Write files regardless\n",
+ " -maketest : Make test script\n",
+ " -makelist : Rewrite the file list based on current setup\n",
+ " -L filelist : Use this file list, (defaults to $0)\n",
+ " -C dir : Change to this directory before proceeding\n",
+ " -check A B : Executes only if A and B are the same\n";
+ }
+}
+
+if ($FileList) {
+ print "Reading file list '$FileList'\n"
+ if $Verbose;
+ open my $fh,"<",$FileList or die "Failed to read '$FileList':$!";
+ my @input;
+ my @output;
+ for my $list ( \@input, \@output ) {
+ while (<$fh>) {
+ s/^ \s+ | \s+ $//xg;
+ next if /^ \s* (?: \# .* )? $/x;
+ last if /^ =+ $/x;
+ my ( $file ) = split /\t/, $_;
+ push @$list, $file;
+ }
+ my %dupe;
+ @$list = grep !$dupe{ $_ }++, @$list;
+ }
+ close $fh;
+ die "No input or output files in '$FileList'!"
+ if !@input or !@output;
+ if ( $MakeList ) {
+ foreach my $file (@output) {
+ unlink $file;
+ }
+ }
+ if ( $Verbose ) {
+ print "Expecting ".scalar( @input )." input files. ",
+ "Checking ".scalar( @output )." output files.\n";
+ }
+ # we set maxtime to be the youngest input file, including $0 itself.
+ my $maxtime = -M $0; # do this before the chdir!
+ if ($UseDir) {
+ chdir $UseDir or die "Failed to chdir to '$UseDir':$!";
+ }
+ foreach my $in (@input) {
+ my $time = -M $in;
+ die "Missing input file '$in'" unless defined $time;
+ $maxtime = $time if $maxtime < $time;
+ }
+
+ # now we check to see if any output files are older than maxtime, if
+ # they are we need to continue on, otherwise we can presumably bail.
+ my $ok = 1;
+ foreach my $out (@output) {
+ if ( ! -e $out ) {
+ print "'$out' is missing.\n"
+ if $Verbose;
+ $ok = 0;
+ last;
+ }
+ if ( -M $out > $maxtime ) {
+ print "'$out' is too old.\n"
+ if $Verbose;
+ $ok = 0;
+ last;
+ }
+ }
+ if ($ok) {
+ print "Files seem to be ok, not bothering to rebuild.\n";
+ exit(0);
+ }
+ print "Must rebuild tables.\n"
+ if $Verbose;
+} else {
+ if ($Verbose) {
+ print "Not checking filelist.\n";
+ }
+ if ($UseDir) {
+ chdir $UseDir or die "Failed to chdir to '$UseDir':$!";
}
}
+foreach my $lib ('To', 'lib',
+ map {File::Spec->catdir("lib",$_)}
+ qw(gc_sc dt bc hst ea jt lb nt ccc)) {
+ next if -d $lib;
+ mkdir $lib, 0755 or die "mkdir '$lib': $!";
+}
+
my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
my $HEADER=<<"EOF";
EOF
+sub force_unlink {
+ my $filename = shift;
+ return unless -e $filename;
+ return if CORE::unlink($filename);
+ # We might need write permission
+ chmod 0777, $filename;
+ CORE::unlink($filename) or die "Couldn't unlink $filename: $!\n";
+}
##
## Given a filename and a reference to an array of lines,
## write the lines to the file only if the contents have not changed.
+## Filename can be given as an arrayref of directory names
##
sub WriteIfChanged($\@)
{
my $file = shift;
my $lines = shift;
+ $file = File::Spec->catfile(@$file) if ref $file;
+
my $TextToWrite = join '', @$lines;
if (open IN, $file) {
local($/) = undef;
close IN;
if ($PreviousText eq $TextToWrite) {
print "$file unchanged.\n" if $Verbose;
- return;
+ return unless $AlwaysWrite;
}
}
+ force_unlink ($file);
if (not open OUT, ">$file") {
die "$0: can't open $file for output: $!\n";
}
return $name;
}
+
+##
+## Store the alias definitions for later use.
+##
+my %PropertyAlias;
+my %PropValueAlias;
+
+my %PA_reverse;
+my %PVA_reverse;
+
+sub Build_Aliases()
+{
+ ##
+ ## Most of the work with aliases doesn't occur here,
+ ## but rather in utf8_heavy.pl, which uses PVA.pl,
+
+ # Placate the warnings about used only once. (They are used again, but
+ # via a typeglob lookup)
+ %utf8::PropertyAlias = ();
+ %utf8::PA_reverse = ();
+ %utf8::PropValueAlias = ();
+ %utf8::PVA_reverse = ();
+ %utf8::PVA_abbr_map = ();
+
+ open PA, "< PropertyAliases.txt"
+ or confess "Can't open PropertyAliases.txt: $!";
+ while (<PA>) {
+ s/#.*//;
+ s/\s+$//;
+ next if /^$/;
+
+ my ($abbrev, $name) = split /\s*;\s*/;
+ next if $abbrev eq "n/a";
+ $PropertyAlias{$abbrev} = $name;
+ $PA_reverse{$name} = $abbrev;
+
+ # The %utf8::... versions use japhy's code originally from utf8_pva.pl
+ # However, it's moved here so that we build the tables at runtime.
+ tr/ _-//d for $abbrev, $name;
+ $utf8::PropertyAlias{lc $abbrev} = $name;
+ $utf8::PA_reverse{lc $name} = $abbrev;
+ }
+ close PA;
+
+ open PVA, "< PropValueAliases.txt"
+ or confess "Can't open PropValueAliases.txt: $!";
+ while (<PVA>) {
+ s/#.*//;
+ s/\s+$//;
+ next if /^$/;
+
+ my ($prop, @data) = split /\s*;\s*/;
+
+ if ($prop eq 'ccc') {
+ $PropValueAlias{$prop}{$data[1]} = [ @data[0,2] ];
+ $PVA_reverse{$prop}{$data[2]} = [ @data[0,1] ];
+ }
+ else {
+ next if $data[0] eq "n/a";
+ $PropValueAlias{$prop}{$data[0]} = $data[1];
+ $PVA_reverse{$prop}{$data[1]} = $data[0];
+ }
+
+ shift @data if $prop eq 'ccc';
+ next if $data[0] eq "n/a";
+
+ $data[1] =~ tr/ _-//d;
+ $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1];
+ $utf8::PVA_reverse{$prop}{lc $data[1]} = $data[0];
+
+ my $abbr_class = ($prop eq 'gc' or $prop eq 'sc') ? 'gc_sc' : $prop;
+ $utf8::PVA_abbr_map{$abbr_class}{lc $data[0]} = $data[0];
+ }
+ close PVA;
+
+ # backwards compatibility for L& -> LC
+ $utf8::PropValueAlias{gc}{'l&'} = $utf8::PropValueAlias{gc}{lc};
+ $utf8::PVA_abbr_map{gc_sc}{'l&'} = $utf8::PVA_abbr_map{gc_sc}{lc};
+
+}
+
+
##
## Associates a property ("Greek", "Lu", "Assigned",...) with a Table.
##
return $Table;
}
-##
-## Returns true if the Table has no code points
-##
-sub Table::IsEmpty
-{
- my $Table = shift; #self
- return not @$Table;
-}
-
-##
-## Returns true if the Table has code points
-##
-sub Table::NotEmpty
-{
- my $Table = shift; #self
- return @$Table;
-}
##
## Returns the maximum code point currently in the table.
##
sub Table::Max
{
- my $Table = shift; #self
- confess "oops" if $Table->IsEmpty; ## must have code points to have a max
- return $Table->[-1]->[RANGE_END];
+ my $last = $_[0]->[-1]; ## last code point
+ confess "oops" unless $last; ## must have code points to have a max
+ return $last->[RANGE_END];
}
##
## Given a new code point, make the last range of the Table extend to
## include the new (and all intervening) code points.
##
+## Takes the time to make sure that the extension is valid.
+##
sub Table::Extend
{
my $Table = shift; #self
confess "oops ($codepoint <= $PrevMax)" if $codepoint <= $PrevMax;
- $Table->[-1]->[RANGE_END] = $codepoint;
+ $Table->ExtendNoCheck($codepoint);
+}
+
+
+##
+## Given a new code point, make the last range of the Table extend to
+## include the new (and all intervening) code points.
+##
+## Does NOT check that the extension is valid. Assumes that the caller
+## has already made this check.
+##
+sub Table::ExtendNoCheck
+{
+ ## Optmized adding: Assumes $Table and $codepoint as parms
+ $_[0]->[-1]->[RANGE_END] = $_[1];
}
##
## If we've already got a range working, and this code point is the next
## one in line, and if the name is the same, just extend the current range.
##
- if ($Table->NotEmpty
+ my $last = $Table->[-1];
+ if ($last
and
- $Table->Max == $codepoint - 1
+ $last->[RANGE_END] == $codepoint - 1
and
- $Table->[-1]->[RANGE_NAME] eq $name)
+ $last->[RANGE_NAME] eq $name)
{
- $Table->Extend($codepoint);
+ $Table->ExtendNoCheck($codepoint);
}
else
{
if ($start > $New->Max) {
$New->AppendRange($start, $end);
} elsif ($end > $New->Max) {
- $New->Extend($end);
+ $New->ExtendNoCheck($end);
}
}
##
## Given a filename, write a representation of the Table to a file.
## May have an optional comment as a 2nd arg.
+## Filename may actually be an arrayref of directories
##
sub Table::Write
{
{
my $Table = shift; #self
- return 0x1234 if $Table->IsEmpty();
+ return 0x1234 if not @$Table;
for my $set (@$Table)
{
if ($TableInfo{$Type}->{$CName}) {
confess "$0: Use canonical form '$CName' instead of '$Name' for alias.";
} else {
- confess "$0: don't have orignial $Type => $Name to make alias";
+ confess "$0: don't have original $Type => $Name to make alias\n";
}
}
if ($TableInfo{$Alias}) {
my %General;
my %Cat;
+## Simple Data::Dumper alike. Good enough for our needs. We can't use the real
+## thing as we have to run under miniperl
+sub simple_dumper {
+ my @lines;
+ my $item;
+ foreach $item (@_) {
+ if (ref $item) {
+ if (ref $item eq 'ARRAY') {
+ push @lines, "[\n", simple_dumper (@$item), "],\n";
+ } elsif (ref $item eq 'HASH') {
+ push @lines, "{\n", simple_dumper (%$item), "},\n";
+ } else {
+ die "Can't cope with $item";
+ }
+ } else {
+ if (defined $item) {
+ my $copy = $item;
+ $copy =~ s/([\'\\])/\\$1/gs;
+ push @lines, "'$copy',\n";
+ } else {
+ push @lines, "undef,\n";
+ }
+ }
+ }
+ @lines;
+}
+
##
## Process UnicodeData.txt (Categories, etc.)
##
my $Deco = Table->New();
my $Comb = Table->New();
my $Number = Table->New();
- my $Mirrored = Table->New(Is => 'Mirrored',
- Desc => "Mirrored in bidirectional text",
- Fuzzy => 0);
+ my $Mirrored = Table->New();#Is => 'Mirrored',
+ #Desc => "Mirrored in bidirectional text",
+ #Fuzzy => 0);
my %DC;
my %Bidi;
- my %Deco;
- $Deco{Canon} = Table->New(Is => 'Canon',
- Desc => 'Decomposes to multiple characters',
- Fuzzy => 0);
- $Deco{Compat} = Table->New(Is => 'Compat',
- Desc => 'Compatible with a more-basic character',
- Fuzzy => 0);
+ my %Number;
+ $DC{can} = Table->New();
+ $DC{com} = Table->New();
## Initialize Perl-generated categories
## (Categories from UnicodeData.txt are auto-initialized in gencat)
($General{$name} ||= Table->New)->$op($code, $name);
# 005F: SPACING UNDERSCORE
- $Cat{Word}->$op($code) if $cat =~ /^[LMN]/ || $code == 0x005F;
+ $Cat{Word}->$op($code) if $cat =~ /^[LMN]|Pc/;
$Cat{Alnum}->$op($code) if $cat =~ /^[LM]|Nd/;
$Cat{Alpha}->$op($code) if $cat =~ /^[LM]/;
-
-
- $Cat{Space}->$op($code) if $cat =~ /^Z/
+ my $isspace =
+ ($cat =~ /Zs|Zl|Zp/ &&
+ $code != 0x200B) # 200B is ZWSP which is for line break control
+ # and therefore it is not part of "space" even while it is "Zs".
|| $code == 0x0009 # 0009: HORIZONTAL TAB
|| $code == 0x000A # 000A: LINE FEED
|| $code == 0x000B # 000B: VERTICAL TAB
|| $code == 0x000C # 000C: FORM FEED
|| $code == 0x000D # 000D: CARRIAGE RETURN
- || $code == 0x0085; # 0085: NEL
+ || $code == 0x0085 # 0085: NEL
+ ;
- $Cat{SpacePerl}->$op($code) if $cat =~ /^Z/
- || $code == 0x0009 # 0009: HORIZONTAL TAB
- || $code == 0x000A # 000A: LINE FEED
- || $code == 0x000C # 000C: FORM FEED
- || $code == 0x000D # 000D: CARRIAGE RETURN
- || $code == 0x0085 # 0085: <NEXT LINE>
- || $code == 0x2028 # 2028: LINE SEPARATOR
- || $code == 0x2029;# 2029: PARAGRAPH SEP.
+ $Cat{Space}->$op($code) if $isspace;
- $Cat{Blank}->$op($code) if $cat eq "Zs"
- || $code == 0x0009; # 0009: HORIZONTAL TAB
+ $Cat{SpacePerl}->$op($code) if $isspace
+ && $code != 0x000B; # Backward compat.
+
+ $Cat{Blank}->$op($code) if $isspace
+ && !($code == 0x000A ||
+ $code == 0x000B ||
+ $code == 0x000C ||
+ $code == 0x000D ||
+ $code == 0x0085 ||
+ $cat =~ /^Z[lp]/);
$Cat{Digit}->$op($code) if $cat eq "Nd";
$Cat{Upper}->$op($code) if $cat eq "Lu";
$Cat{Title}->$op($code) if $cat eq "Lt";
$Cat{ASCII}->$op($code) if $code <= 0x007F;
$Cat{Cntrl}->$op($code) if $cat =~ /^C/;
- $Cat{Graph}->$op($code) if $cat !~ /Zs|Cc|Cs|Cn/;
- $Cat{Print}->$op($code) if $cat =~ /^[LMNPS]/
- || $cat eq "Zs";
+ my $isgraph = !$isspace && $cat !~ /Cc|Cs|Cn/;
+ $Cat{Graph}->$op($code) if $isgraph;
+ $Cat{Print}->$op($code) if $isgraph || $isspace;
$Cat{Punct}->$op($code) if $cat =~ /^P/;
$Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9
$Comb->Append($code, $comb) if $comb;
$Number->Append($code, $number) if length $number;
+ length($decimal) and ($Number{De} ||= Table->New())->Append($code)
+ or
+ length($digit) and ($Number{Di} ||= Table->New())->Append($code)
+ or
+ length($number) and ($Number{Nu} ||= Table->New())->Append($code);
+
$Mirrored->Append($code) if $mirrored eq "Y";
- $Bidi{$bidi} ||= Table->New(Is => "Bidi$bidi",
- Desc => "Bi-directional category '$bidi'",
- Fuzzy => 0);
+ $Bidi{$bidi} ||= Table->New();#Is => "bt/$bidi",
+ #Desc => "Bi-directional category '$bidi'",
+ #Fuzzy => 0);
$Bidi{$bidi}->Append($code);
if ($deco)
$Deco->Append($code, $deco);
if ($deco =~/^<(\w+)>/)
{
- $Deco{Compat}->Append($code);
+ my $dshort = $PVA_reverse{dt}{ucfirst lc $1};
+ $DC{com}->Append($code);
- $DC{$1} ||= Table->New(Is => "DC$1",
- Desc => "Compatible with '$1'",
- Fuzzy => 0);
- $DC{$1}->Append($code);
+ $DC{$dshort} ||= Table->New();
+ $DC{$dshort}->Append($code);
}
else
{
- $Deco{Canon}->Append($code);
+ $DC{can}->Append($code);
}
}
}
$Cat{C}->Replace($Cat{C}->Merge($Cat{Cn})); ## Now merge in Cn into C
- # L& is Ll, Lu, and Lt.
- New_Prop(Is => 'L&',
+ # LC is Ll, Lu, and Lt.
+ # (used to be L& or L_, but PropValueAliases.txt defines it as LC)
+ New_Prop(Is => 'LC',
Table->Merge(@Cat{qw[Ll Lu Lt]}),
Desc => '[\p{Ll}\p{Lu}\p{Lt}]',
Fuzzy => 0);
## Now dump the files.
##
$Name->Write("Name.pl");
- $Bidi->Write("Bidirectional.pl");
+
+ {
+ my @PVA = $HEADER;
+ foreach my $name (qw (PropertyAlias PA_reverse PropValueAlias
+ PVA_reverse PVA_abbr_map)) {
+ # Should I really jump through typeglob hoops just to avoid a
+ # symbolic reference? (%{"utf8::$name})
+ push @PVA, "\n", "\%utf8::$name = (\n",
+ simple_dumper (%{$utf8::{$name}}), ");\n";
+ }
+ push @PVA, "1;\n";
+ WriteIfChanged("PVA.pl", @PVA);
+ }
+
+ # $Bidi->Write("Bidirectional.pl");
+ for (keys %Bidi) {
+ $Bidi{$_}->Write(
+ ["lib","bc","$_.pl"],
+ "BidiClass category '$PropValueAlias{bc}{$_}'"
+ );
+ }
+
$Comb->Write("CombiningClass.pl");
+ for (keys %{ $PropValueAlias{ccc} }) {
+ my ($code, $name) = @{ $PropValueAlias{ccc}{$_} };
+ (my $c = Table->New())->Append($code);
+ $c->Write(
+ ["lib","ccc","$_.pl"],
+ "CombiningClass category '$name'"
+ );
+ }
+
$Deco->Write("Decomposition.pl");
- $Number->Write("Number.pl");
- $General->Write("Category.pl");
+ for (keys %DC) {
+ $DC{$_}->Write(
+ ["lib","dt","$_.pl"],
+ "DecompositionType category '$PropValueAlias{dt}{$_}'"
+ );
+ }
+
+ # $Number->Write("Number.pl");
+ for (keys %Number) {
+ $Number{$_}->Write(
+ ["lib","nt","$_.pl"],
+ "NumericType category '$PropValueAlias{nt}{$_}'"
+ );
+ }
+
+ # $General->Write("Category.pl");
for my $to (sort keys %To) {
- $To{$to}->Write("To/$to.pl");
+ $To{$to}->Write(["To","$to.pl"]);
+ }
+
+ for (keys %{ $PropValueAlias{gc} }) {
+ New_Alias(Is => $PropValueAlias{gc}{$_}, SameAs => $_, Fuzzy => 1);
}
}
$Lbrk->Append($first, $lbrk);
- $Lbrk{$lbrk} ||= Table->New(Is => "Lbrk$lbrk",
- Desc => "Linebreak category '$lbrk'",
- Fuzzy => 0);
+ $Lbrk{$lbrk} ||= Table->New();
$Lbrk{$lbrk}->Append($first);
if ($last) {
}
close IN;
- $Lbrk->Write("Lbrk.pl");
+ # $Lbrk->Write("Lbrk.pl");
+
+
+ for (keys %Lbrk) {
+ $Lbrk{$_}->Write(
+ ["lib","lb","$_.pl"],
+ "Linebreak category '$PropValueAlias{lb}{$_}'"
+ );
+ }
}
##
my $ArabLink = Table->New();
my $ArabLinkGroup = Table->New();
+ my %JoinType;
+
while (<IN>)
{
next unless /^[0-9A-Fa-f]+;/;
my $code = hex($hexcode);
$ArabLink->Append($code, $link);
$ArabLinkGroup->Append($code, $linkgroup);
+
+ $JoinType{$link} ||= Table->New(Is => "JoinType$link");
+ $JoinType{$link}->Append($code);
+ }
+ close IN;
+
+ # $ArabLink->Write("ArabLink.pl");
+ # $ArabLinkGroup->Write("ArabLnkGrp.pl");
+
+
+ for (keys %JoinType) {
+ $JoinType{$_}->Write(
+ ["lib","jt","$_.pl"],
+ "JoiningType category '$PropValueAlias{jt}{$_}'"
+ );
+ }
+}
+
+##
+## Process EastAsianWidth.txt.
+##
+sub EastAsianWidth_txt()
+{
+ if (not open IN, "EastAsianWidth.txt") {
+ die "$0: EastAsianWidth.txt: $!\n";
+ }
+
+ my %EAW;
+
+ while (<IN>)
+ {
+ next unless /^[0-9A-Fa-f]+(\.\.[0-9A-Fa-f]+)?;/;
+ s/#.*//;
+ s/\s+$//;
+
+ my ($hexcodes, $pv) = split(/\s*;\s*/);
+ $EAW{$pv} ||= Table->New(Is => "EastAsianWidth$pv");
+ my ($start, $end) = split(/\.\./, $hexcodes);
+ if (defined $end) {
+ $EAW{$pv}->AppendRange(hex($start), hex($end));
+ } else {
+ $EAW{$pv}->Append(hex($start));
+ }
+ }
+ close IN;
+
+
+ for (keys %EAW) {
+ $EAW{$_}->Write(
+ ["lib","ea","$_.pl"],
+ "EastAsianWidth category '$PropValueAlias{ea}{$_}'"
+ );
+ }
+}
+
+##
+## Process HangulSyllableType.txt.
+##
+sub HangulSyllableType_txt()
+{
+ if (not open IN, "HangulSyllableType.txt") {
+ die "$0: HangulSyllableType.txt: $!\n";
+ }
+
+ my %HST;
+
+ while (<IN>)
+ {
+ next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
+ my ($first, $last, $pv) = (hex($1), hex($2||""), $3);
+
+ $HST{$pv} ||= Table->New(Is => "HangulSyllableType$pv");
+ $HST{$pv}->Append($first);
+
+ if ($last) { $HST{$pv}->Extend($last) }
}
close IN;
- $ArabLink->Write("ArabLink.pl");
- $ArabLinkGroup->Write("ArabLnkGrp.pl");
+ for (keys %HST) {
+ $HST{$_}->Write(
+ ["lib","hst","$_.pl"],
+ "HangulSyllableType category '$PropValueAlias{hst}{$_}'"
+ );
+ }
}
##
$Short->Append($code, $short);
}
close IN;
- $Short->Write("JamoShort.pl");
+ # $Short->Write("JamoShort.pl");
}
##
}
}
- $Scripts->Write("Scripts.pl");
+ # $Scripts->Write("Scripts.pl");
## Common is everything not explicitly assigned to a Script
##
}
close IN;
- $Blocks->Write("Blocks.pl");
+ # $Blocks->Write("Blocks.pl");
}
##
}
}
+ for (keys %Prop) {
+ (my $file = $PA_reverse{$_}) =~ tr/_//d;
+ # XXX I'm assuming that the names from %Prop don't suffer 8.3 clashes.
+ $BaseNames{lc $file}++;
+ $Prop{$_}->Write(
+ ["lib","gc_sc","$file.pl"],
+ "Binary property '$_'"
+ );
+ }
+
# Alphabetic is L and Other_Alphabetic.
New_Prop(Is => 'Alphabetic',
Table->Merge($Cat{L}, $Prop{Other_Alphabetic}),
Fuzzy => 1);
}
-sub Make_GC_Aliases()
-{
- ##
- ## The mapping from General Category long forms to short forms is
- ## currently hardwired here since no simple data file in the UCD
- ## seems to do that. Unicode 3.2 will assumedly correct this.
- ##
- my %Is = (
- 'Letter' => 'L',
- 'Uppercase_Letter' => 'Lu',
- 'Lowercase_Letter' => 'Ll',
- 'Titlecase_Letter' => 'Lt',
- 'Modifier_Letter' => 'Lm',
- 'Other_Letter' => 'Lo',
-
- 'Mark' => 'M',
- 'Non_Spacing_Mark' => 'Mn',
- 'Spacing_Mark' => 'Mc',
- 'Enclosing_Mark' => 'Me',
-
- 'Separator' => 'Z',
- 'Space_Separator' => 'Zs',
- 'Line_Separator' => 'Zl',
- 'Paragraph_Separator' => 'Zp',
-
- 'Number' => 'N',
- 'Decimal_Number' => 'Nd',
- 'Letter_Number' => 'Nl',
- 'Other_Number' => 'No',
-
- 'Punctuation' => 'P',
- 'Connector_Punctuation' => 'Pc',
- 'Dash_Punctuation' => 'Pd',
- 'Open_Punctuation' => 'Ps',
- 'Close_Punctuation' => 'Pe',
- 'Initial_Punctuation' => 'Pi',
- 'Final_Punctuation' => 'Pf',
- 'Other_Punctuation' => 'Po',
-
- 'Symbol' => 'S',
- 'Math_Symbol' => 'Sm',
- 'Currency_Symbol' => 'Sc',
- 'Modifier_Symbol' => 'Sk',
- 'Other_Symbol' => 'So',
-
- 'Other' => 'C',
- 'Control' => 'Cc',
- 'Format' => 'Cf',
- 'Surrogate' => 'Cs',
- 'Private Use' => 'Co',
- 'Unassigned' => 'Cn',
- );
-
- ## make the aliases....
- while (my ($Alias, $Name) = each %Is) {
- New_Alias(Is => $Alias, SameAs => $Name, Fuzzy => 1);
- }
-}
-
##
## These are used in:
sub MakePropTestScript()
{
## this written directly -- it's huge.
+ force_unlink ("TestProp.pl");
if (not open OUT, ">TestProp.pl") {
die "$0: TestProp.pl: $!\n";
}
{
my @MAP;
- my %BaseNames; ## Base names already used (for avoiding 8.3 conflicts)
-
## 'Is' *MUST* come first, so its names have precidence over 'In's
for my $Type ('Is', 'In')
{
my $filename;
{
## 'Is' items lose 'Is' from the basename.
- $filename = $Type eq 'Is' ? $Name : "$Type$Name";
+ $filename = $Type eq 'Is' ?
+ ($PVA_reverse{sc}{$Name} || $Name) :
+ "$Type$Name";
$filename =~ s/[^\w_]+/_/g; # "L&" -> "L_"
substr($filename, 8) = '' if length($filename) > 8;
##
## Okay, write the file...
##
- $Table->Write("lib/$filename.pl", $Comment);
+ $Table->Write(["lib","gc_sc","$filename.pl"], $Comment);
## and register it
$RawNameToFile{$Name} = $filename;
"##\n",
"## Data in this file used by ../utf8_heavy.pl\n",
"##\n\n",
- "## Mapping from name to filename in ./lib\n",
+ "## Mapping from name to filename in ./lib/gc_sc\n",
"%utf8::Exact = (\n",
);
+ $Exact{InGreek} = 'InGreekA'; # this is evil kludge
for my $Name (sort keys %Exact)
{
my $File = $Exact{$Name};
{
my $NormalCase = do "To/$case.pl" || die "$0: $@\n";
- my @OUT = (
- $HEADER, "\n",
- "%utf8::ToSpec$case =\n(\n",
- );
+ my @OUT =
+ (
+ $HEADER, "\n",
+ "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
+ "%utf8::ToSpec$case =\n(\n",
+ );
for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) {
my ($ix, $code, $to) = @$prop;
my $tostr =
join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
- push @OUT, sprintf qq['%04X' => "$tostr",\n], $ix;
+ push @OUT, sprintf qq["%s" => "$tostr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $ix)));
# Remove any single-character mappings for
# the same character since we are going for
# the special casing rules.
$NormalCase,
"END\n"
);
- WriteIfChanged("To/$case.pl", @OUT);
+ WriteIfChanged(["To","$case.pl"], @OUT);
}
}
#
my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
- my @OUT = (
- $HEADER, "\n",
- "%utf8::ToSpecFold =\n(\n",
- );
+ my @OUT =
+ (
+ $HEADER, "\n",
+ "# The ke UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
+ "%utf8::ToSpecFold =\n(\n",
+ );
for my $code (sort { $a <=> $b } keys %Fold) {
my $foldstr =
join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
- push @OUT, sprintf qq['%04X' => "$foldstr",\n], $code;
+ push @OUT, sprintf qq["%s" => "$foldstr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $code)));
}
push @OUT, (
");\n\n",
"END\n",
);
- WriteIfChanged("To/Fold.pl", @OUT);
+ WriteIfChanged(["To","Fold.pl"], @OUT);
}
## Do it....
+Build_Aliases();
UnicodeData_Txt();
-Make_GC_Aliases();
PropList_txt();
Scripts_txt();
LineBreak_Txt();
ArabicShaping_txt();
+EastAsianWidth_txt();
+HangulSyllableType_txt();
Jamo_txt();
SpecialCasing_txt();
CaseFolding_txt();
+if ( $FileList and $MakeList ) {
+
+ print "Updating '$FileList'\n"
+ if ($Verbose);
+
+ open my $ofh,">",$FileList
+ or die "Can't write to '$FileList':$!";
+ print $ofh <<"EOFHEADER";
+#
+# mktables.lst -- File list for mktables.
+#
+# Autogenerated on @{[scalar localtime]}
+#
+# - First section is input files
+# (mktables itself is automatically included)
+# - Section seperator is /^=+\$/
+# - Second section is a list of output files.
+# - Lines matching /^\\s*#/ are treated as comments
+# which along with blank lines are ignored.
+#
+
+# Input files:
+
+EOFHEADER
+ my @input=("version",glob('*.txt'));
+ print $ofh "$_\n" for
+ @input,
+ "\n=================================\n",
+ "# Output files:\n",
+ # special files
+ "Properties";
+
+
+ require File::Find;
+ my $count=0;
+ File::Find::find({
+ no_chdir=>1,
+ wanted=>sub {
+ if (/\.pl$/) {
+ s!^\./!!;
+ print $ofh "$_\n";
+ $count++;
+ }
+ },
+ },".");
+
+ print $ofh "\n# ",scalar(@input)," input files\n",
+ "# ",scalar($count+1)," output files\n\n",
+ "# End list\n";
+ close $ofh
+ or warn "Failed to close $ofh: $!";
+
+ print "Filelist has ",scalar(@input)," input files and ",
+ scalar($count+1)," output files\n"
+ if $Verbose;
+}
+print "All done\n" if $Verbose;
exit(0);
## TRAILING CODE IS USED BY MakePropTestScript()