X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2Fcompile;h=a76676b288ce9351a6ccfb7236e8b78bd5428d2c;hb=5129552cc421a69f6981a03ac0ecc86b5722d1e6;hp=26442a00a7e46bef8d27c340af7f7e28997ff87a;hpb=d70b7d477c148def9a2f86b1e58a9a6eab8eb345;p=p5sagit%2Fp5-mst-13.2.git

diff --git a/ext/Encode/compile b/ext/Encode/compile
index 26442a0..a76676b 100755
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -6,14 +6,80 @@ BEGIN {
 use strict;
 use Getopt::Std;
 my @orig_ARGV = @ARGV;
+our $VERSION = '0.30';
+
+# These may get re-ordered.
+# RAW is a do_now as inserted by &enter
+# AGG is an aggregated do_now, as built up by &process
+use constant {
+  RAW_NEXT => 0,
+  RAW_IN_LEN => 1,
+  RAW_OUT_BYTES => 2,
+  RAW_FALLBACK => 3,
+
+  AGG_MIN_IN => 0,
+  AGG_MAX_IN => 1,
+  AGG_OUT_BYTES => 2,
+  AGG_NEXT => 3,
+  AGG_IN_LEN => 4,
+  AGG_OUT_LEN => 5,
+  AGG_FALLBACK => 6,
+};
+# (See the algorithm in encengine.c - we're building structures for it)
+
+# There are two sorts of structures.
+# "do_now" (an array, two variants of what needs storing) is whatever we need
+# to do now we've read an input byte.
+# It's housed in a "do_next" (which is how we got to it), and in turn points
+# to a "do_next" which contains all the "do_now"s for the next input byte.
+
+# There will be a "do_next" which is the start state.
+# For a single byte encoding it's the only "do_next" - each "do_now" points
+# back to it, and each "do_now" will cause bytes. There is no state.
+
+# For a multi-byte encoding where all characters in the input are the same
+# length, then there will be a tree of "do_now"->"do_next"->"do_now"
+# branching out from the start state, one step for each input byte.
+# The leaf "do_now"s will all be at the same distance from the start state,
+# only the leaf "do_now"s cause output bytes, and they in turn point back to
+# the start state.
+
+# For an encoding where there are variable length input byte sequences, you
+# will encounter a leaf "do_now" sooner for the shorter input sequences, but
+# as before the leaves will point back to the start state.
+
+# The system will cope with escape encodings (imagine them as a mostly
+# self-contained tree for each escape state, and cross links between trees
+# at the state-switching characters) but so far no input format defines these.
+
+# The system will also cope with having output "leaves" in the middle of
+# the bifurcating branches, not just at the extremities, but again no
+# input format does this yet.
+
+# There are two variants of the "do_now" structure. The first, smaller variant
+# is generated by &enter as the input file is read. There is one structure
+# for each input byte. Say we are mapping a single byte encoding to a
+# single byte encoding, with "ABCD" going "abcd". There will be
+# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
+
+# &process then walks the tree, building aggregate "do_now" structures for
+# adjacent bytes where possible. The aggregate is for a contiguous range of
+# bytes which each produce the same length of output, each move to the
+# same next state, and each have the same fallback flag.
+# So our 4 RAW "do_now"s above become replaced by a single structure
+# containing:
+# ["A", "D", "abcd", 1, ...]
+# ie, for an input byte $_ in "A".."D", output 1 byte, found as
+# substr ("abcd", (ord $_ - ord "A") * 1, 1)
+# which maps very nicely into pointer arithmetic in C for encengine.c
 
 sub encode_U
 {
  # UTF-8 encode long hand - only covers part of perl's range
- my $uv = shift;
+ ## my $uv = shift;
  # chr() works in native space so convert value from table
  # into that space before using chr().
- my $ch = chr(utf8::unicode_to_native($uv));
+ my $ch = chr(utf8::unicode_to_native($_[0]));
  # Now get core perl to encode that the way it likes.
  utf8::encode($ch);
  return $ch;
@@ -22,46 +88,75 @@ sub encode_U
 sub encode_S
 {
  # encode single byte
- my ($ch,$page) = @_;
- return chr($ch);
+ ## my ($ch,$page) = @_; return chr($ch);
+ return chr $_[0];
 }
 
 sub encode_D
 {
  # encode double byte MS byte first
- my ($ch,$page) = @_;
- return chr($page).chr($ch);
+ ## my ($ch,$page) = @_; return chr($page).chr($ch);
+ return chr ($_[1]) . chr $_[0];
 }
 
 sub encode_M
 {
  # encode Multi-byte - single for 0..255 otherwise double
- my ($ch,$page) = @_;
- return &encode_D if $page;
- return &encode_S;
+ ## my ($ch,$page) = @_;
+ ## return &encode_D if $page;
+ ## return &encode_S;
+ return chr ($_[1]) . chr $_[0] if $_[1];
+ return chr $_[0];
 }
 
+my %encode_types = (U => \&encode_U,
+                    S => \&encode_S,
+                    D => \&encode_D,
+                    M => \&encode_M,
+                   );
+
 # Win32 does not expand globs on command line
 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 
 my %opt;
-getopts('qo:f:n:',\%opt);
+# I think these are:
+# -Q to disable the duplicate codepoint test
+# -S make mapping errors fatal
+# -q to remove comments written to output files
+# -O to enable the (brute force) substring optimiser
+# -o to specify the output file name (else it's the first arg)
+# -f to give a file with a list of input files (else use the args)
+# -n to name the encoding (else use the basename of the input file).
getopts('SQqOo:f:n:',\%opt);
+
+# This really should go first, else the die here causes empty (non-erroneous)
+# output files to be written.
+my @encfiles;
+if (exists $opt{'f'}) {
+    # -F is followed by name of file containing list of filenames
+    my $flist = $opt{'f'};
+    open(FLIST,$flist) || die "Cannot open $flist:$!";
+    chomp(@encfiles = <FLIST>);
+    close(FLIST);
+} else {
+    @encfiles = @ARGV;
+}
+
 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 chmod(0666,$cname) if -f $cname && !-w $cname;
 open(C,">$cname") || die "Cannot open $cname:$!";
-
 my $dname = $cname;
-$dname =~ s/(\.[^\.]*)?$/.def/;
+my $hname = $cname;
 
 my ($doC,$doEnc,$doUcm,$doPet);
 
 if ($cname =~ /\.(c|xs)$/)
   {
    $doC = 1;
+   $dname =~ s/(\.[^\.]*)?$/_def.h/;
    chmod(0666,$dname) if -f $cname && !-w $dname;
    open(D,">$dname") || die "Cannot open $dname:$!";
-   my $hname = $cname;
    $hname =~ s/(\.[^\.]*)?$/.h/;
    chmod(0666,$hname) if -f $cname && !-w $hname;
    open(H,">$hname") || die "Cannot open $hname:$!";
@@ -72,7 +167,7 @@ if ($cname =~ /\.(c|xs)$/)
 
 /* !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
This file was autogenerated by: - $^X $0 $cname @orig_ARGV + $^X $0 @orig_ARGV */ END } @@ -100,20 +195,6 @@ elsif ($cname =~ /\.pet$/) $doPet = 1; } -my @encfiles; -if (exists $opt{'f'}) - { - # -F is followed by name of file containing list of filenames - my $flist = $opt{'f'}; - open(FLIST,$flist) || die "Cannot open $flist:$!"; - chomp(@encfiles = ); - close(FLIST); - } -else - { - @encfiles = @ARGV; - } - my %encoding; my %strings; my $saved = 0; @@ -189,7 +270,7 @@ if ($doC) my $mod = $1; print C <<'END'; -void +static void Encode_XSEncoding(pTHX_ encode_t *enc) { dSP; @@ -215,11 +296,17 @@ END print C "#include \"$dname\"\n"; print C "}\n"; } - close(D); - close(H); + # Close in void context is bad, m'kay + close(D) or warn "Error closing '$dname': $!"; + close(H) or warn "Error closing '$hname': $!"; + + my $perc_saved = $strings/($strings + $saved) * 100; + my $perc_subsaved = $strings/($strings + $subsave) * 100; printf STDERR "%d bytes in string tables\n",$strings; - printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",$saved,100*$saved/$strings if $saved; - printf STDERR "%d bytes (%.3g%%) saved using substrings\n",$subsave,100*$subsave/$strings if $subsave; + printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n", + $saved, $perc_saved if $saved; + printf STDERR "%d bytes (%.3g%%) saved using substrings\n", + $subsave, $perc_subsaved if $subsave; } elsif ($doEnc) { @@ -238,9 +325,11 @@ elsif ($doUcm) } } -close(C); - +# writing half meg files and then not checking to see if you just filled the +# disk is bad, m'kay +close(C) or die "Error closing '$cname': $!"; +# End of the main program. sub compile_ucm { @@ -253,7 +342,7 @@ sub compile_ucm { s/#.*$//; last if /^\s*CHARMAP\s*$/i; - if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) + if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr { $attr{$1} = $2; } @@ -329,6 +418,8 @@ sub compile_ucm $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el]; } + + sub compile_enc { my ($fh,$name) = @_; @@ -342,49 +433,73 @@ sub compile_enc } chomp($type); return if $type eq 'E'; + # Do the hash lookup once, rather than once per function call. 4% speedup. + my $type_func = $encode_types{$type}; my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); warn "$type encoded $name\n"; my $rep = ''; - my $min_el; - my $max_el; + # Save a defined test by setting these to defined values. + my $min_el = ~0; # A very big integer + my $max_el = 0; # Anything must be longer than 0 { my $v = hex($def); - no strict 'refs'; - $rep = &{"encode_$type"}($v & 0xFF, ($v >> 8) & 0xffe); + $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe); } - my %seen; - while ($pages--) + my $errors; + my $seen; + # use -Q to silence the seen test. Makefile.PL uses this by default. + $seen = {} unless $opt{Q}; + do { my $line = <$fh>; chomp($line); my $page = hex($line); my $ch = 0; - for (my $i = 0; $i < 16; $i++) + my $i = 16; + do { + # So why is it 1% faster to leave the my here? my $line = <$fh>; - for (my $j = 0; $j < 16; $j++) + $line =~ s/\r\n$/\n/; + die "$.:${line}Line should be exactly 65 characters long including + newline (".length($line).")" unless length ($line) == 65; + # Split line into groups of 4 hex digits, convert groups to ints + # This takes 65.35 + # map {hex $_} $line =~ /(....)/g + # This takes 63.75 (2.5% less time) + # unpack "n*", pack "H*", $line + # There's an implicit loop in map. Loops are bad, m'kay. 
Ops are bad, m'kay + # Doing it as while ($line =~ /(....)/g) took 74.63 + foreach my $val (unpack "n*", pack "H*", $line) { - no strict 'refs'; - my $ech = &{"encode_$type"}($ch,$page); - my $val = hex(substr($line,0,4,'')); next if $val == 0xFFFD; + my $ech = &$type_func($ch,$page); if ($val || (!$ch && !$page)) { my $el = length($ech); - $max_el = $el if (!defined($max_el) || $el > $max_el); - $min_el = $el if (!defined($min_el) || $el < $min_el); + $max_el = $el if $el > $max_el; + $min_el = $el if $el < $min_el; my $uch = encode_U($val); - if (exists $seen{$uch}) - { - warn sprintf("U%04X is %02X%02X and %02X%02X\n", - $val,$page,$ch,@{$seen{$uch}}); - } - else - { - $seen{$uch} = [$page,$ch]; - } - enter($e2u,$ech,$uch,$e2u,0); - enter($u2e,$uch,$ech,$u2e,0); + if ($seen) { + # We're doing the test. + # We don't need to read this quickly, so storing it as a scalar, + # rather than 3 (anon array, plus the 2 scalars it holds) saves + # RAM and may make us faster on low RAM systems. [see __END__] + if (exists $seen->{$uch}) + { + warn sprintf("U%04X is %02X%02X and %04X\n", + $val,$page,$ch,$seen->{$uch}); + $errors++; + } + else + { + $seen->{$uch} = $page << 8 | $ch; + } + } + # Passing 2 extra args each time is 3.6% slower! + # Even with having to add $fallback ||= 0 later + enter_fb0($e2u,$ech,$uch); + enter_fb0($u2e,$uch,$ech); } else { @@ -393,34 +508,73 @@ sub compile_enc } $ch++; } - } - } + } while --$i; + } while --$pages; + die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines" + if $min_el > $max_el; + die "$errors mapping conflicts\n" if ($errors && $opt{'S'}); $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el]; } -sub enter -{ - my ($a,$s,$d,$t,$fb) = @_; - $t = $a if @_ < 4; - my $b = substr($s,0,1); - my $e = $a->{$b}; - unless ($e) - { # 0 1 2 3 4 5 - $e = [$b,$b,'',{},length($s),0,$fb]; - $a->{$b} = $e; - } - if (length($s) > 1) - { - enter($e->[3],substr($s,1),$d,$t,$fb); - } - else - { - $e->[2] = $d; - $e->[3] = $t; - $e->[5] = length($d); +# my ($a,$s,$d,$t,$fb) = @_; +sub enter { + my ($current,$inbytes,$outbytes,$next,$fallback) = @_; + # state we shift to after this (multibyte) input character defaults to same + # as current state. + $next ||= $current; + # Making sure it is defined seems to be faster than {no warnings;} in + # &process, or passing it in as 0 explicity. + # XXX $fallback ||= 0; + + # Start at the beginning and work forwards through the string to zero. + # effectively we are removing 1 character from the front each time + # but we don't actually edit the string. [this alone seems to be 14% speedup] + # Hence -$pos is the length of the remaining string. + my $pos = -length $inbytes; + while (1) { + my $byte = substr $inbytes, $pos, 1; + # RAW_NEXT => 0, + # RAW_IN_LEN => 1, + # RAW_OUT_BYTES => 2, + # RAW_FALLBACK => 3, + # to unicode an array would seem to be better, because the pages are dense. + # from unicode can be very sparse, favouring a hash. + # hash using the bytes (all length 1) as keys rather than ord value, + # as it's easier to sort these in &process. + + # It's faster to always add $fallback even if it's undef, rather than + # choosing between 3 and 4 element array. (hence why we set it defined + # above) + my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback]; + # When $pos was -1 we were at the last input character. + unless (++$pos) { + $do_now->[RAW_OUT_BYTES] = $outbytes; + $do_now->[RAW_NEXT] = $next; + return; + } + # Tail recursion. The intermdiate state may not have a name yet. 
+ $current = $do_now->[RAW_NEXT]; } } +# This is purely for optimistation. It's just &enter hard coded for $fallback +# of 0, using only a 3 entry array ref to save memory for every entry. +sub enter_fb0 { + my ($current,$inbytes,$outbytes,$next) = @_; + $next ||= $current; + + my $pos = -length $inbytes; + while (1) { + my $byte = substr $inbytes, $pos, 1; + my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'']; + unless (++$pos) { + $do_now->[RAW_OUT_BYTES] = $outbytes; + $do_now->[RAW_NEXT] = $next; + return; + } + $current = $do_now->[RAW_NEXT]; + } +} sub outstring @@ -433,78 +587,86 @@ sub outstring } else { - foreach my $o (keys %strings) - { - my $i = index($o,$s); - if ($i >= 0) - { - $sym = $strings{$o}; - $sym .= sprintf("+0x%02x",$i) if ($i); - $subsave += length($s); - return $sym; - } - } + if ($opt{'O'}) { + foreach my $o (keys %strings) + { + next unless (my $i = index($o,$s)) >= 0; + $sym = $strings{$o}; + $sym .= sprintf("+0x%02x",$i) if ($i); + $subsave += length($s); + return $strings{$s} = $sym; + } + } $strings{$s} = $sym = $name; $strings += length($s); - printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s); - # Do in chunks of 16 chars to constrain line length - # Assumes ANSI C adjacent string litteral concatenation - while (length($s)) - { - my $c = substr($s,0,16,''); - print $fh '"',join('',map(sprintf('\x%02x',ord($_)),split(//,$c))),'"'; - print $fh "\n" if length($s); - } - printf $fh ";\n"; + my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s); + # Maybe we should assert that these are all <256. + $definition .= join(',',unpack "C*",$s); + # We have a single long line. Split it at convenient commas. + $definition =~ s/(.{74,77},)/$1\n/g; + print $fh "$definition };\n\n"; } return $sym; } sub process { - my ($name,$a) = @_; - $name =~ s/\W+/_/g; - $a->{Cname} = $name; - my @keys = grep(ref($a->{$_}),sort keys %$a); - my $l; - my @ent; - foreach my $b (@keys) - { - my ($s,$f,$out,$t,$end) = @{$a->{$b}}; - if (defined($l) && - ord($b) == ord($a->{$l}[1])+1 && - $a->{$l}[3] == $a->{$b}[3] && - $a->{$l}[4] == $a->{$b}[4] && - $a->{$l}[5] == $a->{$b}[5] && - $a->{$l}[6] == $a->{$b}[6] - # && length($a->{$l}[2]) < 16 - ) - { - my $i = ord($b)-ord($a->{$l}[0]); - $a->{$l}[1] = $b; - $a->{$l}[2] .= $a->{$b}[2]; - } - else - { - $l = $b; - push(@ent,$b); - } - if (exists $t->{Cname}) - { - $t->{'Forward'} = 1 if $t != $a; + my ($name,$a) = @_; + $name =~ s/\W+/_/g; + $a->{Cname} = $name; + my $raw = $a->{Raw}; + my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback); + my @ent; + $agg_max_in = 0; + foreach my $key (sort keys %$raw) { + # RAW_NEXT => 0, + # RAW_IN_LEN => 1, + # RAW_OUT_BYTES => 2, + # RAW_FALLBACK => 3, + my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; + # Now we are converting from raw to aggregate, switch from 1 byte strings + # to numbers + my $b = ord $key; + $fallback ||= 0; + if ($l && + # If this == fails, we're going to reset $agg_max_in below anyway. + $b == ++$agg_max_in && + # References in numeric context give the pointer as an int. + $agg_next == $next && + $agg_in_len == $in_len && + $agg_out_len == length $out_bytes && + $agg_fallback == $fallback + # && length($l->[AGG_OUT_BYTES]) < 16 + ) { + # my $i = ord($b)-ord($l->[AGG_MIN_IN]); + # we can aggregate this byte onto the end. 
+ $l->[AGG_MAX_IN] = $b; + $l->[AGG_OUT_BYTES] .= $out_bytes; + } else { + # AGG_MIN_IN => 0, + # AGG_MAX_IN => 1, + # AGG_OUT_BYTES => 2, + # AGG_NEXT => 3, + # AGG_IN_LEN => 4, + # AGG_OUT_LEN => 5, + # AGG_FALLBACK => 6, + # Reset the last thing we saw, plus set 5 lexicals to save some derefs. + # (only gains .6% on euc-jp -- is it worth it?) + push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next, + $agg_in_len = $in_len, $agg_out_len = length $out_bytes, + $agg_fallback = $fallback]; } - else - { - process(sprintf("%s_%02x",$name,ord($s)),$t); + if (exists $next->{Cname}) { + $next->{'Forward'} = 1 if $next != $a; + } else { + process(sprintf("%s_%02x",$name,$b),$next); } } - if (ord($keys[-1]) < 255) - { - my $t = chr(ord($keys[-1])+1); - $a->{$t} = [$t,chr(255),undef,$a,0,0]; - push(@ent,$t); + # encengine.c rules say that last entry must be for 255 + if ($agg_max_in < 255) { + push @ent, [1+$agg_max_in, 255,undef,$a,0,0]; } - $a->{'Entries'} = \@ent; + $a->{'Entries'} = \@ent; } sub outtable @@ -514,10 +676,10 @@ sub outtable # String tables foreach my $b (@{$a->{'Entries'}}) { - next unless $a->{$b}[5]; - my $s = ord($a->{$b}[0]); - my $e = ord($a->{$b}[1]); - outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$a->{$b}[2]); + next unless $b->[AGG_OUT_LEN]; + my $s = $b->[AGG_MIN_IN]; + my $e = $b->[AGG_MAX_IN]; + outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]); } if ($a->{'Forward'}) { @@ -526,15 +688,13 @@ sub outtable $a->{'Done'} = 1; foreach my $b (@{$a->{'Entries'}}) { - my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}}; + my ($s,$e,$out,$t,$end,$l) = @$b; outtable($fh,$t) unless $t->{'Done'}; } print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n"; foreach my $b (@{$a->{'Entries'}}) { - my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; - my $sc = ord($s); - my $ec = ord($e); + my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b; $end |= 0x80 if $fb; print $fh "{"; if ($l) @@ -562,6 +722,7 @@ sub output sub output_enc { my ($fh,$name,$a) = @_; + die "Changed - fix me for new structure"; foreach my $b (sort keys %$a) { my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; @@ -595,34 +756,34 @@ sub char_names sub output_ucm_page { - my ($cmap,$a,$t,$pre) = @_; - # warn sprintf("Page %x\n",$pre); - foreach my $b (sort keys %$t) - { - my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}}; - die "oops $s $e" unless $s eq $e; - my $u = ord($s); - if ($n != $a && $n != $t) - { - output_ucm_page($cmap,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF); - } - elsif (length($out)) - { - if ($pre) - { - $u = $pre|($u &0x3f); - } - my $s = sprintf " ",$u; - foreach my $c (split(//,$out)) - { - $s .= sprintf "\\x%02X",ord($c); + my ($cmap,$a,$t,$pre) = @_; + # warn sprintf("Page %x\n",$pre); + my $raw = $t->{Raw}; + foreach my $key (sort keys %$raw) { + # RAW_NEXT => 0, + # RAW_IN_LEN => 1, + # RAW_OUT_BYTES => 2, + # RAW_FALLBACK => 3, + my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; + my $u = ord $key; + $fallback ||= 0; + + if ($next != $a && $next != $t) { + output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF); + } elsif (length $out_bytes) { + if ($pre) { + $u = $pre|($u &0x3f); } - $s .= sprintf " |%d # %s\n",($fb ? 
1 : 0),$uname[$u];
-        push(@$cmap,$s);
-       }
-     else
-       {
-        warn join(',',@{$t->{$b}},$a,$t);
+           my $s = sprintf "<U%04X> ",$u;
+           #foreach my $c (split(//,$out_bytes)) {
+           #    $s .= sprintf "\\x%02X",ord($c);
+           #}
+           # 9.5% faster changing that loop to this:
+           $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
+           $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
+           push(@$cmap,$s);
+       } else {
+           warn join(',',$u, @{$raw->{$key}},$a,$t);
        }
     }
 }
@@ -660,3 +821,29 @@ sub output_ucm
 
     print $fh "END CHARMAP\n";
 }
+
+__END__
+With %seen holding array refs:
+
+      865.66 real        28.80 user         8.79 sys
+       7904  maximum resident set size
+       1356  average shared memory size
+      18566  average unshared data size
+        229  average unshared stack size
+      46080  page reclaims
+      33373  page faults
+
+With %seen holding simple scalars:
+
+      342.16 real        27.11 user         3.54 sys
+       8388  maximum resident set size
+       1394  average shared memory size
+      14969  average unshared data size
+        236  average unshared stack size
+      28159  page reclaims
+       9839  page faults
+
+Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
+how %seen is storing things it's seen. So it is pathologically bad on a 16M
+RAM machine, but it's going to help even on modern machines.
+Swapping is bad, m'kay :-)
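
For readers who want to poke at the data layout the header comment describes, here is a small
standalone Perl sketch - illustrative only, not part of the patch above. It builds one aggregate
entry in the AGG_* slot order for the "ABCD" -> "abcd" example and performs the substr lookup
that encengine.c does with pointer arithmetic. The names @agg and agg_lookup are invented for
this example.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # One aggregate "do_now": input bytes "A".."D" each emit one output byte,
    # taken from the packed string "abcd" by offset arithmetic.
    # Slots follow AGG_MIN_IN, AGG_MAX_IN, AGG_OUT_BYTES, AGG_NEXT,
    # AGG_IN_LEN, AGG_OUT_LEN, AGG_FALLBACK.
    my @agg = ('A', 'D', 'abcd', undef, 1, 1, 0);

    sub agg_lookup {
        my $byte = shift;
        # Outside the contiguous range this entry covers.
        return undef if $byte lt $agg[0] || $byte gt $agg[1];
        # offset = (byte - min_in) * out_len, then take out_len bytes.
        return substr($agg[2], (ord($byte) - ord($agg[0])) * $agg[5], $agg[5]);
    }

    print agg_lookup('C'), "\n";    # prints "c"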
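
In the same spirit, the pack/unpack idiom that the timing comments in compile_enc refer to -
turning one 64-hex-digit row into sixteen integers in a single pass instead of looping with
substr and hex - can be tried on its own. Again a sketch with made-up input, not part of the
patch:

    #!/usr/bin/perl
    use strict;
    use warnings;

    # A row of 16 four-hex-digit groups, here just 0x0041 .. 0x0050.
    my $line = join '', map { sprintf '%04X', $_ } 0x41 .. 0x50;

    # pack "H*" turns the hex text into raw bytes; unpack "n*" reads those bytes
    # back as unsigned 16-bit big-endian integers - one pass, no explicit loop.
    my @vals = unpack 'n*', pack 'H*', $line;

    printf "%d values, first=%04X, last=%04X\n", scalar @vals, $vals[0], $vals[-1];
    # prints: 16 values, first=0041, last=0050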