#!../../perl -w
-BEGIN { @INC = '../../lib' };
+BEGIN {
+ unshift @INC, qw(../../lib ../../../lib);
+ $ENV{PATH} .= ';../..;../../..' if $^O eq 'MSWin32';
+}
use strict;
use Getopt::Std;
my @orig_ARGV = @ARGV;
-my $perforce = '$Id$';
+# These may get re-ordered.
+# RAW is a do_now as inserted by &enter
+# AGG is an aggreagated do_now, as built up by &process
+use constant {
+ RAW_NEXT => 0,
+ RAW_IN_LEN => 1,
+ RAW_OUT_BYTES => 2,
+ RAW_FALLBACK => 3,
+
+ AGG_MIN_IN => 0,
+ AGG_MAX_IN => 1,
+ AGG_OUT_BYTES => 2,
+ AGG_NEXT => 3,
+ AGG_IN_LEN => 4,
+ AGG_OUT_LEN => 5,
+ AGG_FALLBACK => 6,
+};
+# (See the algorithm in encengine.c - we're building structures for it)
+
+# There are two sorts of structures.
+# "do_now" (an array, two variants of what needs storing) is whatever we need
+# to do now we've read an input byte.
+# It's housed in a "do_next" (which is how we got to it), and in turn points
+# to a "do_next" which contains all the "do_now"s for the next input byte.
+
+# There will be a "do_next" which is the start state.
+# For a single byte encoding it's the only "do_next" - each "do_now" points
+# back to it, and each "do_now" will cause bytes. There is no state.
+
+# For a multi-byte encoding where all characters in the input are the same
+# length, then there will be a tree of "do_now"->"do_next"->"do_now"
+# branching out from the start state, one step for each input byte.
+# The leaf "do_now"s will all be at the same distance from the start state,
+# only the leaf "do_now"s cause output bytes, and they in turn point back to
+# the start state.
+
+# For an encoding where there are varaible length input byte sequences, you
+# will encounter a leaf "do_now" sooner for the shorter input sequences, but
+# as before the leaves will point back to the start state.
+
+# The system will cope with escape encodings (imagine them as a mostly
+# self-contained tree for each escape state, and cross links between trees
+# at the state-switching characters) but so far no input format defines these.
+
+# The system will also cope with having output "leaves" in the middle of
+# the bifurcating branches, not just at the extremities, but again no
+# input format does this yet.
+
+# There are two variants of the "do_now" structure. The first, smaller variant
+# is generated by &enter as the input file is read. There is one structure
+# for each input byte. Say we are mapping a single byte encoding to a
+# single byte encoding, with "ABCD" going "abcd". There will be
+# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
+
+# &process then walks the tree, building aggregate "do_now" structres for
+# adjacent bytes where possible. The aggregate is for a contiguous range of
+# bytes which each produce the same length of output, each move to the
+# same next state, and each have the same fallback flag.
+# So our 4 RAW "do_now"s above become replaced by a single structure
+# containing:
+# ["A", "D", "abcd", 1, ...]
+# ie, for an input byte $_ in "A".."D", output 1 byte, found as
+# substr ("abcd", (ord $_ - ord "A") * 1, 1)
+# which maps very nicely into pointer arithmetic in C for encengine.c
sub encode_U
{
# UTF-8 encode long hand - only covers part of perl's range
- my $uv = shift;
- if ($uv < 0x80)
- {
- return chr($uv)
- }
- if ($uv < 0x800)
- {
- return chr(($uv >> 6) | 0xC0).
- chr(($uv & 0x3F) | 0x80);
- }
- return chr(($uv >> 12) | 0xE0).
- chr((($uv >> 6) & 0x3F) | 0x80).
- chr(($uv & 0x3F) | 0x80);
+ ## my $uv = shift;
+ # chr() works in native space so convert value from table
+ # into that space before using chr().
+ my $ch = chr(utf8::unicode_to_native($_[0]));
+ # Now get core perl to encode that the way it likes.
+ utf8::encode($ch);
+ return $ch;
}
sub encode_S
{
# encode single byte
- my ($ch,$page) = @_;
- return chr($ch);
+ ## my ($ch,$page) = @_; return chr($ch);
+ return chr $_[0];
}
sub encode_D
{
# encode double byte MS byte first
- my ($ch,$page) = @_;
- return chr($page).chr($ch);
+ ## my ($ch,$page) = @_; return chr($page).chr($ch);
+ return chr ($_[1]) . chr $_[0];
}
sub encode_M
{
# encode Multi-byte - single for 0..255 otherwise double
- my ($ch,$page) = @_;
- return &encode_D if $page;
- return &encode_S;
+ ## my ($ch,$page) = @_;
+ ## return &encode_D if $page;
+ ## return &encode_S;
+ return chr ($_[1]) . chr $_[0] if $_[1];
+ return chr $_[0];
}
+my %encode_types = (U => \&encode_U,
+ S => \&encode_S,
+ D => \&encode_D,
+ M => \&encode_M,
+ );
+
# Win32 does not expand globs on command line
eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
my %opt;
-getopts('qo:f:n:',\%opt);
+getopts('qOo:f:n:',\%opt);
my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
chmod(0666,$cname) if -f $cname && !-w $cname;
open(C,">$cname") || die "Cannot open $cname:$!";
my $dname = $cname;
-$dname =~ s/(\.[^\.]*)?$/.def/;
+$dname =~ s/(\.[^\.]*)?$/_def.h/;
my ($doC,$doEnc,$doUcm,$doPet);
/*
!!!!!!! DO NOT EDIT THIS FILE !!!!!!!
This file was autogenerated by:
- $^X $0 $cname @orig_ARGV
- (Repository $perforce)
+ $^X $0 @orig_ARGV
*/
END
}
print C "#define U8 U8\n";
}
print C "#include \"encode.h\"\n";
+
}
elsif ($cname =~ /\.enc$/)
{
my %encoding;
my %strings;
+my $saved = 0;
+my $subsave = 0;
+my $strings = 0;
sub cmp_name
{
foreach my $enc (sort cmp_name @encfiles)
{
my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
- $name = delete $opt{'n'} if exists $opt{'n'};
+ $name = $opt{'n'} if exists $opt{'n'};
if (open(E,$enc))
{
if ($sfx eq 'enc')
if ($doC)
{
+ print STDERR "Writing compiled form\n";
foreach my $name (sort cmp_name keys %encoding)
{
my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
my $sym = "${enc}_encoding";
$sym =~ s/\W+/_/g;
print H "extern encode_t $sym;\n";
- print D " Encode_Define(aTHX_ &$sym);\n";
+ print D " Encode_XSEncoding(aTHX_ &$sym);\n";
}
if ($cname =~ /(\w+)\.xs$/)
{
my $mod = $1;
+ print C <<'END';
+
+static void
+Encode_XSEncoding(pTHX_ encode_t *enc)
+{
+ dSP;
+ HV *stash = gv_stashpv("Encode::XS", TRUE);
+ SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
+ int i = 0;
+ PUSHMARK(sp);
+ XPUSHs(sv);
+ while (enc->name[i])
+ {
+ const char *name = enc->name[i++];
+ XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
+ }
+ PUTBACK;
+ call_pv("Encode::define_encoding",G_DISCARD);
+ SvREFCNT_dec(sv);
+}
+
+END
+
print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
print C "BOOT:\n{\n";
print C "#include \"$dname\"\n";
}
close(D);
close(H);
+
+ my $perc_saved = $strings/($strings + $saved) * 100;
+ my $perc_subsaved = $strings/($strings + $subsave) * 100;
+ printf STDERR "%d bytes in string tables\n",$strings;
+ printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
+ $saved, $perc_saved if $saved;
+ printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
+ $subsave, $perc_subsaved if $subsave;
}
elsif ($doEnc)
{
close(C);
+
sub compile_ucm
{
my ($fh,$name) = @_;
{
s/#.*$//;
last if /^\s*CHARMAP\s*$/i;
- if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i)
+ if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
{
$attr{$1} = $2;
}
}
else
{
- # $name = lc($cs);
+ $name = $cs unless exists $opt{'n'};
}
my $erep;
my $urep;
push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
$erep = join('',map(chr(hex($_)),@byte));
}
- print "Scanning $name ($cs)\n";
+ print "Reading $name ($cs)\n";
my $nfb = 0;
my $hfb = 0;
while (<$fh>)
}
chomp($type);
return if $type eq 'E';
+ # Do the hash lookup once, rather than once per function call. 4% speedup.
+ my $type_func = $encode_types{$type};
my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
warn "$type encoded $name\n";
my $rep = '';
- my $min_el;
- my $max_el;
+ # Save a defined test by setting these to defined values.
+ my $min_el = ~0; # A very big integer
+ my $max_el = 0; # Anything must be longer than 0
{
my $v = hex($def);
- no strict 'refs';
- $rep = &{"encode_$type"}($v & 0xFF, ($v >> 8) & 0xffe);
+ $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
}
my %seen;
- while ($pages--)
+ do
{
my $line = <$fh>;
chomp($line);
my $page = hex($line);
my $ch = 0;
- for (my $i = 0; $i < 16; $i++)
+ my $i = 16;
+ do
{
+ # So why is it 1% faster to leave the my here?
my $line = <$fh>;
- for (my $j = 0; $j < 16; $j++)
+ die "Line should be exactly 65 characters long including newline"
+ unless length ($line) == 65;
+ # Split line into groups of 4 hex digits, convert groups to ints
+ # This takes 65.35
+ # map {hex $_} $line =~ /(....)/g
+ # This takes 63.75 (2.5% less time)
+ # unpack "n*", pack "H*", $line
+ # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
+ # Doing it as while ($line =~ /(....)/g) took 74.63
+ foreach my $val (unpack "n*", pack "H*", $line)
{
- no strict 'refs';
- my $ech = &{"encode_$type"}($ch,$page);
- my $val = hex(substr($line,0,4,''));
next if $val == 0xFFFD;
+ my $ech = &$type_func($ch,$page);
if ($val || (!$ch && !$page))
{
my $el = length($ech);
- $max_el = $el if (!defined($max_el) || $el > $max_el);
- $min_el = $el if (!defined($min_el) || $el < $min_el);
+ $max_el = $el if $el > $max_el;
+ $min_el = $el if $el < $min_el;
my $uch = encode_U($val);
+ # We don't need to read this quickly, so storing it as a scalar,
+ # rather than 3 (anon array, plus the 2 scalars it holds) saves
+ # RAM and may make us faster on low RAM systems. [see __END__]
if (exists $seen{$uch})
{
- warn sprintf("U%04X is %02X%02X and %02X%02X\n",
- $val,$page,$ch,@{$seen{$uch}});
+ warn sprintf("U%04X is %02X%02X and %04X\n",
+ $val,$page,$ch,$seen{$uch});
}
else
{
- $seen{$uch} = [$page,$ch];
+ $seen{$uch} = $page << 8 | $ch;
}
- enter($e2u,$ech,$uch,$e2u,0);
- enter($u2e,$uch,$ech,$u2e,0);
+ # Passing 2 extra args each time is 3.6% slower!
+ # Even with having to add $fallback ||= 0 later
+ enter_fb0($e2u,$ech,$uch);
+ enter_fb0($u2e,$uch,$ech);
}
else
{
}
$ch++;
}
- }
- }
+ } while --$i;
+ } while --$pages;
+ die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
+ if $min_el > $max_el;
$encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
}
-sub enter
-{
- my ($a,$s,$d,$t,$fb) = @_;
- $t = $a if @_ < 4;
- my $b = substr($s,0,1);
- my $e = $a->{$b};
- unless ($e)
- { # 0 1 2 3 4 5
- $e = [$b,$b,'',{},length($s),0,$fb];
- $a->{$b} = $e;
- }
- if (length($s) > 1)
- {
- enter($e->[3],substr($s,1),$d,$t,$fb);
+# my ($a,$s,$d,$t,$fb) = @_;
+sub enter {
+ my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
+ # state we shift to after this (multibyte) input character defaults to same
+ # as current state.
+ $next ||= $current;
+ # Making sure it is defined seems to be faster than {no warnings;} in
+ # &process, or passing it in as 0 explicity.
+ # XXX $fallback ||= 0;
+
+ # Start at the beginning and work forwards through the string to zero.
+ # effectively we are removing 1 character from the front each time
+ # but we don't actually edit the string. [this alone seems to be 14% speedup]
+ # Hence -$pos is the length of the remaining string.
+ my $pos = -length $inbytes;
+ while (1) {
+ my $byte = substr $inbytes, $pos, 1;
+ # RAW_NEXT => 0,
+ # RAW_IN_LEN => 1,
+ # RAW_OUT_BYTES => 2,
+ # RAW_FALLBACK => 3,
+ # to unicode an array would seem to be better, because the pages are dense.
+ # from unicode can be very sparse, favouring a hash.
+ # hash using the bytes (all length 1) as keys rather than ord value,
+ # as it's easier to sort these in &process.
+
+ # It's faster to always add $fallback even if it's undef, rather than
+ # choosing between 3 and 4 element array. (hence why we set it defined
+ # above)
+ my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
+ # When $pos was -1 we were at the last input character.
+ unless (++$pos) {
+ $do_now->[RAW_OUT_BYTES] = $outbytes;
+ $do_now->[RAW_NEXT] = $next;
+ return;
+ }
+ # Tail recursion. The intermdiate state may not have a name yet.
+ $current = $do_now->[RAW_NEXT];
}
- else
- {
- $e->[2] = $d;
- $e->[3] = $t;
- $e->[5] = length($d);
+}
+
+# This is purely for optimistation. It's just &enter hard coded for $fallback
+# of 0, using only a 3 entry array ref to save memory for every entry.
+sub enter_fb0 {
+ my ($current,$inbytes,$outbytes,$next) = @_;
+ $next ||= $current;
+
+ my $pos = -length $inbytes;
+ while (1) {
+ my $byte = substr $inbytes, $pos, 1;
+ my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
+ unless (++$pos) {
+ $do_now->[RAW_OUT_BYTES] = $outbytes;
+ $do_now->[RAW_NEXT] = $next;
+ return;
+ }
+ $current = $do_now->[RAW_NEXT];
}
}
+
sub outstring
{
my ($fh,$name,$s) = @_;
my $sym = $strings{$s};
- unless ($sym)
+ if ($sym)
{
- foreach my $o (keys %strings)
- {
- my $i = index($o,$s);
- if ($i >= 0)
- {
- $sym = $strings{$o};
- $sym .= sprintf("+0x%02x",$i) if ($i);
- return $sym;
- }
- }
+ $saved += length($s);
+ }
+ else
+ {
+ if ($opt{'O'}) {
+ foreach my $o (keys %strings)
+ {
+ next unless (my $i = index($o,$s)) >= 0;
+ $sym = $strings{$o};
+ $sym .= sprintf("+0x%02x",$i) if ($i);
+ $subsave += length($s);
+ return $strings{$s} = $sym;
+ }
+ }
$strings{$s} = $sym = $name;
- printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s);
- # Do in chunks of 16 chars to constrain line length
- # Assumes ANSI C adjacent string litteral concatenation
- while (length($s))
- {
- my $c = substr($s,0,16,'');
- print $fh '"',join('',map(sprintf('\x%02x',ord($_)),split(//,$c))),'"';
- print $fh "\n" if length($s);
- }
- printf $fh ";\n";
+ $strings += length($s);
+ my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
+ # Maybe we should assert that these are all <256.
+ $definition .= join(',',unpack "C*",$s);
+ # We have a single long line. Split it at convenient commas.
+ $definition =~ s/(.{74,77},)/$1\n/g;
+ print $fh "$definition };\n\n";
}
return $sym;
}
sub process
{
- my ($name,$a) = @_;
- $name =~ s/\W+/_/g;
- $a->{Cname} = $name;
- my @keys = grep(ref($a->{$_}),sort keys %$a);
- my $l;
- my @ent;
- foreach my $b (@keys)
- {
- my ($s,$f,$out,$t,$end) = @{$a->{$b}};
- if (defined($l) &&
- ord($b) == ord($a->{$l}[1])+1 &&
- $a->{$l}[3] == $a->{$b}[3] &&
- $a->{$l}[4] == $a->{$b}[4] &&
- $a->{$l}[5] == $a->{$b}[5] &&
- $a->{$l}[6] == $a->{$b}[6]
- # && length($a->{$l}[2]) < 16
- )
- {
- my $i = ord($b)-ord($a->{$l}[0]);
- $a->{$l}[1] = $b;
- $a->{$l}[2] .= $a->{$b}[2];
- }
- else
- {
- $l = $b;
- push(@ent,$b);
- }
- if (exists $t->{Cname})
- {
- $t->{'Forward'} = 1 if $t != $a;
+ my ($name,$a) = @_;
+ $name =~ s/\W+/_/g;
+ $a->{Cname} = $name;
+ my $raw = $a->{Raw};
+ my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
+ my @ent;
+ $agg_max_in = 0;
+ foreach my $key (sort keys %$raw) {
+ # RAW_NEXT => 0,
+ # RAW_IN_LEN => 1,
+ # RAW_OUT_BYTES => 2,
+ # RAW_FALLBACK => 3,
+ my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
+ # Now we are converting from raw to aggregate, switch from 1 byte strings
+ # to numbers
+ my $b = ord $key;
+ $fallback ||= 0;
+ if ($l &&
+ # If this == fails, we're going to reset $agg_max_in below anyway.
+ $b == ++$agg_max_in &&
+ # References in numeric context give the pointer as an int.
+ $agg_next == $next &&
+ $agg_in_len == $in_len &&
+ $agg_out_len == length $out_bytes &&
+ $agg_fallback == $fallback
+ # && length($l->[AGG_OUT_BYTES]) < 16
+ ) {
+ # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
+ # we can aggregate this byte onto the end.
+ $l->[AGG_MAX_IN] = $b;
+ $l->[AGG_OUT_BYTES] .= $out_bytes;
+ } else {
+ # AGG_MIN_IN => 0,
+ # AGG_MAX_IN => 1,
+ # AGG_OUT_BYTES => 2,
+ # AGG_NEXT => 3,
+ # AGG_IN_LEN => 4,
+ # AGG_OUT_LEN => 5,
+ # AGG_FALLBACK => 6,
+ # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
+ # (only gains .6% on euc-jp -- is it worth it?)
+ push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
+ $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
+ $agg_fallback = $fallback];
}
- else
- {
- process(sprintf("%s_%02x",$name,ord($s)),$t);
+ if (exists $next->{Cname}) {
+ $next->{'Forward'} = 1 if $next != $a;
+ } else {
+ process(sprintf("%s_%02x",$name,$b),$next);
}
}
- if (ord($keys[-1]) < 255)
- {
- my $t = chr(ord($keys[-1])+1);
- $a->{$t} = [$t,chr(255),undef,$a,0,0];
- push(@ent,$t);
+ # encengine.c rules say that last entry must be for 255
+ if ($agg_max_in < 255) {
+ push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
}
- $a->{'Entries'} = \@ent;
+ $a->{'Entries'} = \@ent;
}
sub outtable
# String tables
foreach my $b (@{$a->{'Entries'}})
{
- next unless $a->{$b}[5];
- my $s = ord($a->{$b}[0]);
- my $e = ord($a->{$b}[1]);
- outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$a->{$b}[2]);
+ next unless $b->[AGG_OUT_LEN];
+ my $s = $b->[AGG_MIN_IN];
+ my $e = $b->[AGG_MAX_IN];
+ outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
}
if ($a->{'Forward'})
{
$a->{'Done'} = 1;
foreach my $b (@{$a->{'Entries'}})
{
- my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}};
+ my ($s,$e,$out,$t,$end,$l) = @$b;
outtable($fh,$t) unless $t->{'Done'};
}
print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
foreach my $b (@{$a->{'Entries'}})
{
- my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
- my $sc = ord($s);
- my $ec = ord($e);
+ my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
$end |= 0x80 if $fb;
print $fh "{";
if ($l)
sub output_enc
{
my ($fh,$name,$a) = @_;
+ die "Changed - fix me for new structure";
foreach my $b (sort keys %$a)
{
my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
my $s = shift;
}
-sub output_ucm_page
+my @uname;
+sub char_names
{
- my ($fh,$a,$t,$pre) = @_;
- # warn sprintf("Page %x\n",$pre);
- foreach my $b (sort keys %$t)
+ my $s = do "unicore/Name.pl";
+ die "char_names: unicore/Name.pl: $!\n" unless defined $s;
+ pos($s) = 0;
+ while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
{
- my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}};
- die "oops $s $e" unless $s eq $e;
- my $u = ord($s);
- if ($n != $a && $n != $t)
+ my $name = $3;
+ my $s = hex($1);
+ last if $s >= 0x10000;
+ my $e = length($2) ? hex($2) : $s;
+ for (my $i = $s; $i <= $e; $i++)
{
- output_ucm_page($fh,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF);
+ $uname[$i] = $name;
+# print sprintf("U%04X $name\n",$i);
}
- elsif (length($out))
- {
- if ($pre)
- {
- $u = $pre|($u &0x3f);
- }
- printf $fh "<U%04X> ",$u;
- foreach my $c (split(//,$out))
- {
- printf $fh "\\x%02X",ord($c);
+ }
+}
+
+sub output_ucm_page
+{
+ my ($cmap,$a,$t,$pre) = @_;
+ # warn sprintf("Page %x\n",$pre);
+ my $raw = $t->{Raw};
+ foreach my $key (sort keys %$raw) {
+ # RAW_NEXT => 0,
+ # RAW_IN_LEN => 1,
+ # RAW_OUT_BYTES => 2,
+ # RAW_FALLBACK => 3,
+ my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
+ my $u = ord $key;
+ $fallback ||= 0;
+
+ if ($next != $a && $next != $t) {
+ output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
+ } elsif (length $out_bytes) {
+ if ($pre) {
+ $u = $pre|($u &0x3f);
}
- printf $fh " |%d\n",($fb ? 1 : 0);
- }
- else
- {
- warn join(',',@{$t->{$b}},$a,$t);
+ my $s = sprintf "<U%04X> ",$u;
+ #foreach my $c (split(//,$out_bytes)) {
+ # $s .= sprintf "\\x%02X",ord($c);
+ #}
+ # 9.5% faster changing that lloop to this:
+ $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
+ $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
+ push(@$cmap,$s);
+ } else {
+ warn join(',',$u, @{$raw->{$key}},$a,$t);
}
}
}
sub output_ucm
{
- my ($fh,$name,$a,$rep,$min_el,$max_el) = @_;
- print $fh "# Written $perforce\n# $0 @orig_ARGV\n" unless $opt{'q'};
+ my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
+ print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
print $fh "<code_set_name> \"$name\"\n";
+ char_names();
if (defined $min_el)
{
print $fh "<mb_cur_min> $min_el\n";
}
print $fh "\n";
}
+ my @cmap;
+ output_ucm_page(\@cmap,$h,$h,0);
print $fh "#\nCHARMAP\n";
- output_ucm_page($fh,$a,$a,0);
+ foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
+ {
+ print $fh $line;
+ }
print $fh "END CHARMAP\n";
}
+
+__END__
+With %seen holding array refs:
+
+ 865.66 real 28.80 user 8.79 sys
+ 7904 maximum resident set size
+ 1356 average shared memory size
+ 18566 average unshared data size
+ 229 average unshared stack size
+ 46080 page reclaims
+ 33373 page faults
+
+With %seen holding simple scalars:
+
+ 342.16 real 27.11 user 3.54 sys
+ 8388 maximum resident set size
+ 1394 average shared memory size
+ 14969 average unshared data size
+ 236 average unshared stack size
+ 28159 page reclaims
+ 9839 page faults
+
+Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
+how %seen is storing things its seen. So it is pathalogically bad on a 16M
+RAM machine, but it's going to help even on modern machines.
+Swapping is bad, m'kay :-)