use strict;
use Getopt::Std;
my @orig_ARGV = @ARGV;
+our $VERSION = '0.30';
# These may get re-ordered.
# RAW is a do_now as inserted by &enter
eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
my %opt;
-getopts('qOo:f:n:',\%opt);
+# I think these are:
+# -Q to disable the duplicate codepoint test
+# -S make mapping errors fatal
+# -q to remove comments written to output files
+# -O to enable the (brute force) substring optimiser
+# -o <output> to specify the output file name (else it's the first arg)
+# -f <inlist> to give a file with a list of input files (else use the args)
+# -n <name> to name the encoding (else use the basename of the input file.
+getopts('SQqOo:f:n:',\%opt);
+
+# This really should go first, else the die here causes empty (non-erroneous)
+# output files to be written.
+my @encfiles;
+if (exists $opt{'f'}) {
+ # -F is followed by name of file containing list of filenames
+ my $flist = $opt{'f'};
+ open(FLIST,$flist) || die "Cannot open $flist:$!";
+ chomp(@encfiles = <FLIST>);
+ close(FLIST);
+} else {
+ @encfiles = @ARGV;
+}
+
my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
chmod(0666,$cname) if -f $cname && !-w $cname;
open(C,">$cname") || die "Cannot open $cname:$!";
-
my $dname = $cname;
-$dname =~ s/(\.[^\.]*)?$/_def.h/;
+my $hname = $cname;
my ($doC,$doEnc,$doUcm,$doPet);
if ($cname =~ /\.(c|xs)$/)
{
$doC = 1;
+ $dname =~ s/(\.[^\.]*)?$/_def.h/;
chmod(0666,$dname) if -f $cname && !-w $dname;
open(D,">$dname") || die "Cannot open $dname:$!";
- my $hname = $cname;
$hname =~ s/(\.[^\.]*)?$/.h/;
chmod(0666,$hname) if -f $cname && !-w $hname;
open(H,">$hname") || die "Cannot open $hname:$!";
$doPet = 1;
}
-my @encfiles;
-if (exists $opt{'f'})
- {
- # -F is followed by name of file containing list of filenames
- my $flist = $opt{'f'};
- open(FLIST,$flist) || die "Cannot open $flist:$!";
- chomp(@encfiles = <FLIST>);
- close(FLIST);
- }
-else
- {
- @encfiles = @ARGV;
- }
-
my %encoding;
my %strings;
my $saved = 0;
print C "#include \"$dname\"\n";
print C "}\n";
}
- close(D);
- close(H);
+ # Close in void context is bad, m'kay
+ close(D) or warn "Error closing '$dname': $!";
+ close(H) or warn "Error closing '$hname': $!";
+
+ my $perc_saved = $strings/($strings + $saved) * 100;
+ my $perc_subsaved = $strings/($strings + $subsave) * 100;
printf STDERR "%d bytes in string tables\n",$strings;
- printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",$saved,100*$saved/$strings if $saved;
- printf STDERR "%d bytes (%.3g%%) saved using substrings\n",$subsave,100*$subsave/$strings if $subsave;
+ printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
+ $saved, $perc_saved if $saved;
+ printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
+ $subsave, $perc_subsaved if $subsave;
}
elsif ($doEnc)
{
}
}
-close(C);
-
+# writing half meg files and then not checking to see if you just filled the
+# disk is bad, m'kay
+close(C) or die "Error closing '$cname': $!";
+# End of the main program.
sub compile_ucm
{
$encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
}
+
+
sub compile_enc
{
my ($fh,$name) = @_;
}
chomp($type);
return if $type eq 'E';
+ # Do the hash lookup once, rather than once per function call. 4% speedup.
+ my $type_func = $encode_types{$type};
my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
warn "$type encoded $name\n";
my $rep = '';
my $max_el = 0; # Anything must be longer than 0
{
my $v = hex($def);
- $rep = &{$encode_types{$type}}($v & 0xFF, ($v >> 8) & 0xffe);
+ $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
}
- my %seen;
- while ($pages--)
+ my $errors;
+ my $seen;
+ # use -Q to silence the seen test. Makefile.PL uses this by default.
+ $seen = {} unless $opt{Q};
+ do
{
my $line = <$fh>;
chomp($line);
my $page = hex($line);
my $ch = 0;
- for (0..15)
+ my $i = 16;
+ do
{
+ # So why is it 1% faster to leave the my here?
my $line = <$fh>;
- die "Line should be exactly 65 characters long including newline"
- unless length ($line) == 65;
+ $line =~ s/\r\n$/\n/;
+ die "$.:${line}Line should be exactly 65 characters long including
+ newline (".length($line).")" unless length ($line) == 65;
# Split line into groups of 4 hex digits, convert groups to ints
- for my $val (map {hex $_} $line =~ /(....)/g)
+ # This takes 65.35
+ # map {hex $_} $line =~ /(....)/g
+ # This takes 63.75 (2.5% less time)
+ # unpack "n*", pack "H*", $line
+ # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
+ # Doing it as while ($line =~ /(....)/g) took 74.63
+ foreach my $val (unpack "n*", pack "H*", $line)
{
next if $val == 0xFFFD;
- my $ech = &{$encode_types{$type}}($ch,$page);
+ my $ech = &$type_func($ch,$page);
if ($val || (!$ch && !$page))
{
my $el = length($ech);
$max_el = $el if $el > $max_el;
$min_el = $el if $el < $min_el;
my $uch = encode_U($val);
- if (exists $seen{$uch})
- {
- warn sprintf("U%04X is %02X%02X and %02X%02X\n",
- $val,$page,$ch,@{$seen{$uch}});
- }
- else
- {
- $seen{$uch} = [$page,$ch];
- }
+ if ($seen) {
+ # We're doing the test.
+ # We don't need to read this quickly, so storing it as a scalar,
+ # rather than 3 (anon array, plus the 2 scalars it holds) saves
+ # RAM and may make us faster on low RAM systems. [see __END__]
+ if (exists $seen->{$uch})
+ {
+ warn sprintf("U%04X is %02X%02X and %04X\n",
+ $val,$page,$ch,$seen->{$uch});
+ $errors++;
+ }
+ else
+ {
+ $seen->{$uch} = $page << 8 | $ch;
+ }
+ }
# Passing 2 extra args each time is 3.6% slower!
- # Even with having to add $fallback ||= 0 in &process
- enter($e2u,$ech,$uch);
- enter($u2e,$uch,$ech);
+ # Even with having to add $fallback ||= 0 later
+ enter_fb0($e2u,$ech,$uch);
+ enter_fb0($u2e,$uch,$ech);
}
else
{
}
$ch++;
}
- }
- }
+ } while --$i;
+ } while --$pages;
die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
if $min_el > $max_el;
+ die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
$encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
}
$next ||= $current;
# Making sure it is defined seems to be faster than {no warnings;} in
# &process, or passing it in as 0 explicity.
- $fallback ||= 0;
+ # XXX $fallback ||= 0;
# Start at the beginning and work forwards through the string to zero.
# effectively we are removing 1 character from the front each time
}
}
+# This is purely for optimistation. It's just &enter hard coded for $fallback
+# of 0, using only a 3 entry array ref to save memory for every entry.
+sub enter_fb0 {
+ my ($current,$inbytes,$outbytes,$next) = @_;
+ $next ||= $current;
+
+ my $pos = -length $inbytes;
+ while (1) {
+ my $byte = substr $inbytes, $pos, 1;
+ my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
+ unless (++$pos) {
+ $do_now->[RAW_OUT_BYTES] = $outbytes;
+ $do_now->[RAW_NEXT] = $next;
+ return;
+ }
+ $current = $do_now->[RAW_NEXT];
+ }
+}
sub outstring
if ($opt{'O'}) {
foreach my $o (keys %strings)
{
- my $i = index($o,$s);
- if ($i >= 0)
- {
- $sym = $strings{$o};
- $sym .= sprintf("+0x%02x",$i) if ($i);
- $subsave += length($s);
- $strings{$s} = $sym;
- return $sym;
- }
- }
+ next unless (my $i = index($o,$s)) >= 0;
+ $sym = $strings{$o};
+ $sym .= sprintf("+0x%02x",$i) if ($i);
+ $subsave += length($s);
+ return $strings{$s} = $sym;
+ }
}
$strings{$s} = $sym = $name;
$strings += length($s);
my ($name,$a) = @_;
$name =~ s/\W+/_/g;
$a->{Cname} = $name;
- my @raw = sort keys %{$a->{Raw}};
+ my $raw = $a->{Raw};
my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
my @ent;
- foreach my $key (@raw) {
+ $agg_max_in = 0;
+ foreach my $key (sort keys %$raw) {
# RAW_NEXT => 0,
# RAW_IN_LEN => 1,
# RAW_OUT_BYTES => 2,
# RAW_FALLBACK => 3,
- my ($next, $in_len, $out_bytes, $fallback) = @{$a->{Raw}{$key}};
+ my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
# Now we are converting from raw to aggregate, switch from 1 byte strings
# to numbers
my $b = ord $key;
+ $fallback ||= 0;
if ($l &&
# If this == fails, we're going to reset $agg_max_in below anyway.
$b == ++$agg_max_in &&
}
}
# encengine.c rules say that last entry must be for 255
- if (ord $raw[-1] < 255) {
- push @ent, [1+ord $raw[-1], 255,undef,$a,0,0];
+ if ($agg_max_in < 255) {
+ push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
}
$a->{'Entries'} = \@ent;
}
sub output_ucm_page
{
- my ($cmap,$a,$t,$pre) = @_;
- # warn sprintf("Page %x\n",$pre);
- foreach my $b (sort keys %$t)
- {
- die "Changed - fix me for new structure";
- my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}};
- die "oops $s $e" unless $s eq $e;
- my $u = ord($s);
- if ($n != $a && $n != $t)
- {
- output_ucm_page($cmap,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF);
- }
- elsif (length($out))
- {
- if ($pre)
- {
- $u = $pre|($u &0x3f);
- }
- my $s = sprintf "<U%04X> ",$u;
- foreach my $c (split(//,$out))
- {
- $s .= sprintf "\\x%02X",ord($c);
+ my ($cmap,$a,$t,$pre) = @_;
+ # warn sprintf("Page %x\n",$pre);
+ my $raw = $t->{Raw};
+ foreach my $key (sort keys %$raw) {
+ # RAW_NEXT => 0,
+ # RAW_IN_LEN => 1,
+ # RAW_OUT_BYTES => 2,
+ # RAW_FALLBACK => 3,
+ my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
+ my $u = ord $key;
+ $fallback ||= 0;
+
+ if ($next != $a && $next != $t) {
+ output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
+ } elsif (length $out_bytes) {
+ if ($pre) {
+ $u = $pre|($u &0x3f);
}
- $s .= sprintf " |%d # %s\n",($fb ? 1 : 0),$uname[$u];
- push(@$cmap,$s);
- }
- else
- {
- warn join(',',@{$t->{$b}},$a,$t);
+ my $s = sprintf "<U%04X> ",$u;
+ #foreach my $c (split(//,$out_bytes)) {
+ # $s .= sprintf "\\x%02X",ord($c);
+ #}
+ # 9.5% faster changing that loop to this:
+ $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
+ $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
+ push(@$cmap,$s);
+ } else {
+ warn join(',',$u, @{$raw->{$key}},$a,$t);
}
}
}
print $fh "END CHARMAP\n";
}
+
+__END__
+With %seen holding array refs:
+
+ 865.66 real 28.80 user 8.79 sys
+ 7904 maximum resident set size
+ 1356 average shared memory size
+ 18566 average unshared data size
+ 229 average unshared stack size
+ 46080 page reclaims
+ 33373 page faults
+
+With %seen holding simple scalars:
+
+ 342.16 real 27.11 user 3.54 sys
+ 8388 maximum resident set size
+ 1394 average shared memory size
+ 14969 average unshared data size
+ 236 average unshared stack size
+ 28159 page reclaims
+ 9839 page faults
+
+Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
+how %seen is storing things its seen. So it is pathalogically bad on a 16M
+RAM machine, but it's going to help even on modern machines.
+Swapping is bad, m'kay :-)