ext/Encode/compile

   1 #!../../perl -w
   2 BEGIN {
   3     unshift @INC, qw(../../lib ../../../lib);
   4     $ENV{PATH} .= ';../..;../../..' if $^O eq 'MSWin32';
   5 }
   6 use strict;
   7 use Getopt::Std;
   8 my @orig_ARGV = @ARGV;
   9
  10 # These may get re-ordered.
  11 # RAW is a do_now as inserted by &enter
  12 # AGG is an aggreagated do_now, as built up by &process
  13 use constant {
  14   RAW_NEXT => 0,
  15   RAW_IN_LEN => 1,
  16   RAW_OUT_BYTES => 2,
  17   RAW_FALLBACK => 3,
  18
  19   AGG_MIN_IN => 0,
  20   AGG_MAX_IN => 1,
  21   AGG_OUT_BYTES => 2,
  22   AGG_NEXT => 3,
  23   AGG_IN_LEN => 4,
  24   AGG_OUT_LEN => 5,
  25   AGG_FALLBACK => 6,
  26 };
  27 # (See the algorithm in encengine.c - we're building structures for it)
  28
  29 # There are two sorts of structures.
  30 # "do_now" (an array, two variants of what needs storing) is whatever we need
  31 # to do now we've read an input byte.
  32 # It's housed in a "do_next" (which is how we got to it), and in turn points
  33 # to a "do_next" which contains all the "do_now"s for the next input byte.
  34
  35 # There will be a "do_next" which is the start state.
  36 # For a single byte encoding it's the only "do_next" - each "do_now" points
  37 # back to it, and each "do_now" will cause bytes. There is no state.
  38
  39 # For a multi-byte encoding where all characters in the input are the same
  40 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  41 # branching out from the start state, one step for each input byte.
  42 # The leaf "do_now"s will all be at the same distance from the start state,
  43 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  44 # the start state.
  45
  46 # For an encoding where there are varaible length input byte sequences, you
  47 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  48 # as before the leaves will point back to the start state.
  49
  50 # The system will cope with escape encodings (imagine them as a mostly
  51 # self-contained tree for each escape state, and cross links between trees
  52 # at the state-switching characters) but so far no input format defines these.
  53
  54 # The system will also cope with having output "leaves" in the middle of
  55 # the bifurcating branches, not just at the extremities, but again no
  56 # input format does this yet.
  57
  58 # There are two variants of the "do_now" structure. The first, smaller variant
  59 # is generated by &enter as the input file is read. There is one structure
  60 # for each input byte. Say we are mapping a single byte encoding to a
  61 # single byte encoding, with  "ABCD" going "abcd". There will be
  62 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  63
  64 # &process then walks the tree, building aggregate "do_now" structres for
  65 # adjacent bytes where possible. The aggregate is for a contiguous range of
  66 # bytes which each produce the same length of output, each move to the
  67 # same next state, and each have the same fallback flag.
  68 # So our 4 RAW "do_now"s above become replaced by a single structure
  69 # containing:
  70 # ["A", "D", "abcd", 1, ...]
  71 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  72 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  73 # which maps very nicely into pointer arithmetic in C for encengine.c
  74
  75 sub encode_U
  76 {
  77  # UTF-8 encode long hand - only covers part of perl's range
  78  ## my $uv = shift;
  79  # chr() works in native space so convert value from table
  80  # into that space before using chr().
  81  my $ch = chr(utf8::unicode_to_native($_[0]));
  82  # Now get core perl to encode that the way it likes.
  83  utf8::encode($ch);
  84  return $ch;
  85 }
  86
  87 sub encode_S
  88 {
  89  # encode single byte
  90  ## my ($ch,$page) = @_; return chr($ch);
  91  return chr $_[0];
  92 }
  93
  94 sub encode_D
  95 {
  96  # encode double byte MS byte first
  97  ## my ($ch,$page) = @_; return chr($page).chr($ch);
  98  return chr ($_[1]) . chr $_[0];
  99 }
 100
 101 sub encode_M
 102 {
 103  # encode Multi-byte - single for 0..255 otherwise double
 104  ## my ($ch,$page) = @_;
 105  ## return &encode_D if $page;
 106  ## return &encode_S;
 107  return chr ($_[1]) . chr $_[0] if $_[1];
 108  return chr $_[0];
 109 }
 110
 111 my %encode_types = (U => \&encode_U,
 112                     S => \&encode_S,
 113                     D => \&encode_D,
 114                     M => \&encode_M,
 115                    );
 116
 117 # Win32 does not expand globs on command line
 118 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 119
 120 my %opt;
 121 # I think these are:
 122 # -Q to disable the duplicate codepoint test
 123 # -q to remove comments written to output files
 124 # -O to enable the (brute force) substring optimiser
 125 # -o <output> to specify the output file name (else it's the first arg)
 126 # -f <inlist> to give a file with a list of input files (else use the args)
 127 # -n <name> to name the encoding (else use the basename of the input file.
 128 getopts('QqOo:f:n:',\%opt);
 129
 130 # This really should go first, else the die here causes empty (non-erroneous)
 131 # output files to be written.
 132 my @encfiles;
 133 if (exists $opt{'f'}) {
 134     # -F is followed by name of file containing list of filenames
 135     my $flist = $opt{'f'};
 136     open(FLIST,$flist) || die "Cannot open $flist:$!";
 137     chomp(@encfiles = <FLIST>);
 138     close(FLIST);
 139 } else {
 140     @encfiles = @ARGV;
 141 }
 142
 143 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 144 chmod(0666,$cname) if -f $cname && !-w $cname;
 145 open(C,">$cname") || die "Cannot open $cname:$!";
 146
 147 my $dname = $cname;
 148 my $hname = $cname;
 149
 150 my ($doC,$doEnc,$doUcm,$doPet);
 151
 152 if ($cname =~ /\.(c|xs)$/)
 153  {
 154   $doC = 1;
 155   $dname =~ s/(\.[^\.]*)?$/_def.h/;
 156   chmod(0666,$dname) if -f $cname && !-w $dname;
 157   open(D,">$dname") || die "Cannot open $dname:$!";
 158   $hname =~ s/(\.[^\.]*)?$/.h/;
 159   chmod(0666,$hname) if -f $cname && !-w $hname;
 160   open(H,">$hname") || die "Cannot open $hname:$!";
 161
 162   foreach my $fh (\*C,\*D,\*H)
 163   {
 164    print $fh <<"END" unless $opt{'q'};
 165 /*
 166  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 167  This file was autogenerated by:
 168  $^X $0 @orig_ARGV
 169 */
 170 END
 171   }
 172
 173   if ($cname =~ /(\w+)\.xs$/)
 174    {
 175     print C "#include <EXTERN.h>\n";
 176     print C "#include <perl.h>\n";
 177     print C "#include <XSUB.h>\n";
 178     print C "#define U8 U8\n";
 179    }
 180   print C "#include \"encode.h\"\n";
 181
 182  }
 183 elsif ($cname =~ /\.enc$/)
 184  {
 185   $doEnc = 1;
 186  }
 187 elsif ($cname =~ /\.ucm$/)
 188  {
 189   $doUcm = 1;
 190  }
 191 elsif ($cname =~ /\.pet$/)
 192  {
 193   $doPet = 1;
 194  }
 195
 196 my %encoding;
 197 my %strings;
 198 my $saved = 0;
 199 my $subsave = 0;
 200 my $strings = 0;
 201
 202 sub cmp_name
 203 {
 204  if ($a =~ /^.*-(\d+)/)
 205   {
 206    my $an = $1;
 207    if ($b =~ /^.*-(\d+)/)
 208     {
 209      my $r = $an <=> $1;
 210      return $r if $r;
 211     }
 212   }
 213  return $a cmp $b;
 214 }
 215
 216
 217 foreach my $enc (sort cmp_name @encfiles)
 218  {
 219   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 220   $name = $opt{'n'} if exists $opt{'n'};
 221   if (open(E,$enc))
 222    {
 223     if ($sfx eq 'enc')
 224      {
 225       compile_enc(\*E,lc($name));
 226      }
 227     else
 228      {
 229       compile_ucm(\*E,lc($name));
 230      }
 231    }
 232   else
 233    {
 234     warn "Cannot open $enc for $name:$!";
 235    }
 236  }
 237
 238 if ($doC)
 239  {
 240   print STDERR "Writing compiled form\n";
 241   foreach my $name (sort cmp_name keys %encoding)
 242    {
 243     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 244     output(\*C,$name.'_utf8',$e2u);
 245     output(\*C,'utf8_'.$name,$u2e);
 246     push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 247    }
 248   foreach my $enc (sort cmp_name keys %encoding)
 249    {
 250     my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 251     my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 252     my $sym = "${enc}_encoding";
 253     $sym =~ s/\W+/_/g;
 254     print C "encode_t $sym = \n";
 255     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 256    }
 257
 258   foreach my $enc (sort cmp_name keys %encoding)
 259    {
 260     my $sym = "${enc}_encoding";
 261     $sym =~ s/\W+/_/g;
 262     print H "extern encode_t $sym;\n";
 263     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 264    }
 265
 266   if ($cname =~ /(\w+)\.xs$/)
 267    {
 268     my $mod = $1;
 269     print C <<'END';
 270
 271 static void
 272 Encode_XSEncoding(pTHX_ encode_t *enc)
 273 {
 274  dSP;
 275  HV *stash = gv_stashpv("Encode::XS", TRUE);
 276  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 277  int i = 0;
 278  PUSHMARK(sp);
 279  XPUSHs(sv);
 280  while (enc->name[i])
 281   {
 282    const char *name = enc->name[i++];
 283    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 284   }
 285  PUTBACK;
 286  call_pv("Encode::define_encoding",G_DISCARD);
 287  SvREFCNT_dec(sv);
 288 }
 289
 290 END
 291
 292     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 293     print C "BOOT:\n{\n";
 294     print C "#include \"$dname\"\n";
 295     print C "}\n";
 296    }
 297   # Close in void context is bad, m'kay
 298   close(D) or warn "Error closing '$dname': $!";
 299   close(H) or warn "Error closing '$hname': $!";
 300
 301   my $perc_saved    = $strings/($strings + $saved) * 100;
 302   my $perc_subsaved = $strings/($strings + $subsave) * 100;
 303   printf STDERR "%d bytes in string tables\n",$strings;
 304   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 305     $saved, $perc_saved              if $saved;
 306   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 307     $subsave, $perc_subsaved         if $subsave;
 308  }
 309 elsif ($doEnc)
 310  {
 311   foreach my $name (sort cmp_name keys %encoding)
 312    {
 313     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 314     output_enc(\*C,$name,$e2u);
 315    }
 316  }
 317 elsif ($doUcm)
 318  {
 319   foreach my $name (sort cmp_name keys %encoding)
 320    {
 321     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 322     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 323    }
 324  }
 325
 326 # writing half meg files and then not checking to see if you just filled the
 327 # disk is bad, m'kay
 328 close(C) or die "Error closing '$cname': $!";
 329
 330 # End of the main program.
 331
 332 sub compile_ucm
 333 {
 334  my ($fh,$name) = @_;
 335  my $e2u = {};
 336  my $u2e = {};
 337  my $cs;
 338  my %attr;
 339  while (<$fh>)
 340   {
 341    s/#.*$//;
 342    last if /^\s*CHARMAP\s*$/i;
 343    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 344     {
 345      $attr{$1} = $2;
 346     }
 347   }
 348  if (!defined($cs =  $attr{'code_set_name'}))
 349   {
 350    warn "No <code_set_name> in $name\n";
 351   }
 352  else
 353   {
 354    $name = $cs unless exists $opt{'n'};
 355   }
 356  my $erep;
 357  my $urep;
 358  my $max_el;
 359  my $min_el;
 360  if (exists $attr{'subchar'})
 361   {
 362    my @byte;
 363    $attr{'subchar'} =~ /^\s*/cg;
 364    push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 365    $erep = join('',map(chr(hex($_)),@byte));
 366   }
 367  print "Reading $name ($cs)\n";
 368  my $nfb = 0;
 369  my $hfb = 0;
 370  while (<$fh>)
 371   {
 372    s/#.*$//;
 373    last if /^\s*END\s+CHARMAP\s*$/i;
 374    next if /^\s*$/;
 375    my ($u,@byte);
 376    my $fb = '';
 377    $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
 378    push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
 379    $fb = $1 if /\G\s*(\|[0-3])/gc;
 380    # warn "$_: $u @byte | $fb\n";
 381    die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
 382    if (defined($u))
 383     {
 384      my $uch = encode_U(hex($u));
 385      my $ech = join('',map(chr(hex($_)),@byte));
 386      my $el  = length($ech);
 387      $max_el = $el if (!defined($max_el) || $el > $max_el);
 388      $min_el = $el if (!defined($min_el) || $el < $min_el);
 389      if (length($fb))
 390       {
 391        $fb = substr($fb,1);
 392        $hfb++;
 393       }
 394      else
 395       {
 396        $nfb++;
 397        $fb = '0';
 398       }
 399      # $fb is fallback flag
 400      # 0 - round trip safe
 401      # 1 - fallback for unicode -> enc
 402      # 2 - skip sub-char mapping
 403      # 3 - fallback enc -> unicode
 404      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 405      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 406     }
 407    else
 408     {
 409      warn $_;
 410     }
 411   }
 412  if ($nfb && $hfb)
 413   {
 414    die "$nfb entries without fallback, $hfb entries with\n";
 415   }
 416  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 417 }
 418
 419 sub compile_enc
 420 {
 421  my ($fh,$name) = @_;
 422  my $e2u = {};
 423  my $u2e = {};
 424
 425  my $type;
 426  while ($type = <$fh>)
 427   {
 428    last if $type !~ /^\s*#/;
 429   }
 430  chomp($type);
 431  return if $type eq 'E';
 432  # Do the hash lookup once, rather than once per function call. 4% speedup.
 433  my $type_func = $encode_types{$type};
 434  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 435  warn "$type encoded $name\n";
 436  my $rep = '';
 437  # Save a defined test by setting these to defined values.
 438  my $min_el = ~0; # A very big integer
 439  my $max_el = 0;  # Anything must be longer than 0
 440  {
 441   my $v = hex($def);
 442   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 443  }
 444  my $seen;
 445  # use -Q to silence the seen test. Makefile.PL uses this by default.
 446  $seen = {} unless $opt{Q};
 447  do
 448   {
 449    my $line = <$fh>;
 450    chomp($line);
 451    my $page = hex($line);
 452    my $ch = 0;
 453    my $i = 16;
 454    do
 455     {
 456      # So why is it 1% faster to leave the my here?
 457      my $line = <$fh>;
 458      die "Line should be exactly 65 characters long including newline"
 459        unless length ($line) == 65;
 460      # Split line into groups of 4 hex digits, convert groups to ints
 461      # This takes 65.35
 462      # map {hex $_} $line =~ /(....)/g
 463      # This takes 63.75 (2.5% less time)
 464      # unpack "n*", pack "H*", $line
 465      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 466      # Doing it as while ($line =~ /(....)/g) took 74.63
 467      foreach my $val (unpack "n*", pack "H*", $line)
 468       {
 469        next if $val == 0xFFFD;
 470        my $ech = &$type_func($ch,$page);
 471        if ($val || (!$ch && !$page))
 472         {
 473          my $el  = length($ech);
 474          $max_el = $el if $el > $max_el;
 475          $min_el = $el if $el < $min_el;
 476          my $uch = encode_U($val);
 477          if ($seen) {
 478            # We're doing the test.
 479            # We don't need to read this quickly, so storing it as a scalar,
 480            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 481            # RAM and may make us faster on low RAM systems. [see __END__]
 482            if (exists $seen->{$uch})
 483              {
 484                warn sprintf("U%04X is %02X%02X and %04X\n",
 485                             $val,$page,$ch,$seen->{$uch});
 486              }
 487            else
 488              {
 489                $seen->{$uch} = $page << 8 | $ch;
 490              }
 491          }
 492          # Passing 2 extra args each time is 3.6% slower!
 493          # Even with having to add $fallback ||= 0 later
 494          enter_fb0($e2u,$ech,$uch);
 495          enter_fb0($u2e,$uch,$ech);
 496         }
 497        else
 498         {
 499          # No character at this position
 500          # enter($e2u,$ech,undef,$e2u);
 501         }
 502        $ch++;
 503       }
 504     } while --$i;
 505   } while --$pages;
 506  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 507    if $min_el > $max_el;
 508  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 509 }
 510
 511 # my ($a,$s,$d,$t,$fb) = @_;
 512 sub enter {
 513   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 514   # state we shift to after this (multibyte) input character defaults to same
 515   # as current state.
 516   $next ||= $current;
 517   # Making sure it is defined seems to be faster than {no warnings;} in
 518   # &process, or passing it in as 0 explicity.
 519   # XXX $fallback ||= 0;
 520
 521   # Start at the beginning and work forwards through the string to zero.
 522   # effectively we are removing 1 character from the front each time
 523   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 524   # Hence -$pos is the length of the remaining string.
 525   my $pos = -length $inbytes;
 526   while (1) {
 527     my $byte = substr $inbytes, $pos, 1;
 528     #  RAW_NEXT => 0,
 529     #  RAW_IN_LEN => 1,
 530     #  RAW_OUT_BYTES => 2,
 531     #  RAW_FALLBACK => 3,
 532     # to unicode an array would seem to be better, because the pages are dense.
 533     # from unicode can be very sparse, favouring a hash.
 534     # hash using the bytes (all length 1) as keys rather than ord value,
 535     # as it's easier to sort these in &process.
 536
 537     # It's faster to always add $fallback even if it's undef, rather than
 538     # choosing between 3 and 4 element array. (hence why we set it defined
 539     # above)
 540     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 541     # When $pos was -1 we were at the last input character.
 542     unless (++$pos) {
 543       $do_now->[RAW_OUT_BYTES] = $outbytes;
 544       $do_now->[RAW_NEXT] = $next;
 545       return;
 546     }
 547     # Tail recursion. The intermdiate state may not have a name yet.
 548     $current = $do_now->[RAW_NEXT];
 549   }
 550 }
 551
 552 # This is purely for optimistation. It's just &enter hard coded for $fallback
 553 # of 0, using only a 3 entry array ref to save memory for every entry.
 554 sub enter_fb0 {
 555   my ($current,$inbytes,$outbytes,$next) = @_;
 556   $next ||= $current;
 557
 558   my $pos = -length $inbytes;
 559   while (1) {
 560     my $byte = substr $inbytes, $pos, 1;
 561     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 562     unless (++$pos) {
 563       $do_now->[RAW_OUT_BYTES] = $outbytes;
 564       $do_now->[RAW_NEXT] = $next;
 565       return;
 566     }
 567     $current = $do_now->[RAW_NEXT];
 568   }
 569 }
 570
 571
 572 sub outstring
 573 {
 574  my ($fh,$name,$s) = @_;
 575  my $sym = $strings{$s};
 576  if ($sym)
 577   {
 578    $saved += length($s);
 579   }
 580  else
 581   {
 582    if ($opt{'O'}) {
 583        foreach my $o (keys %strings)
 584         {
 585          next unless (my $i = index($o,$s)) >= 0;
 586          $sym = $strings{$o};
 587          $sym .= sprintf("+0x%02x",$i) if ($i);
 588          $subsave += length($s);
 589          return $strings{$s} = $sym;
 590        }
 591    }
 592    $strings{$s} = $sym = $name;
 593    $strings += length($s);
 594    my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
 595    # Maybe we should assert that these are all <256.
 596    $definition .= join(',',unpack "C*",$s);
 597    # We have a single long line. Split it at convenient commas.
 598    $definition =~ s/(.{74,77},)/$1\n/g;
 599    print $fh "$definition };\n\n";
 600   }
 601  return $sym;
 602 }
 603
 604 sub process
 605 {
 606   my ($name,$a) = @_;
 607   $name =~ s/\W+/_/g;
 608   $a->{Cname} = $name;
 609   my $raw = $a->{Raw};
 610   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 611   my @ent;
 612   $agg_max_in = 0;
 613   foreach my $key (sort keys %$raw) {
 614     #  RAW_NEXT => 0,
 615     #  RAW_IN_LEN => 1,
 616     #  RAW_OUT_BYTES => 2,
 617     #  RAW_FALLBACK => 3,
 618     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 619     # Now we are converting from raw to aggregate, switch from 1 byte strings
 620     # to numbers
 621     my $b = ord $key;
 622     $fallback ||= 0;
 623     if ($l &&
 624         # If this == fails, we're going to reset $agg_max_in below anyway.
 625         $b == ++$agg_max_in &&
 626         # References in numeric context give the pointer as an int.
 627         $agg_next == $next &&
 628         $agg_in_len == $in_len &&
 629         $agg_out_len == length $out_bytes &&
 630         $agg_fallback == $fallback
 631         # && length($l->[AGG_OUT_BYTES]) < 16
 632        ) {
 633       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 634       # we can aggregate this byte onto the end.
 635       $l->[AGG_MAX_IN] = $b;
 636       $l->[AGG_OUT_BYTES] .= $out_bytes;
 637     } else {
 638       # AGG_MIN_IN => 0,
 639       # AGG_MAX_IN => 1,
 640       # AGG_OUT_BYTES => 2,
 641       # AGG_NEXT => 3,
 642       # AGG_IN_LEN => 4,
 643       # AGG_OUT_LEN => 5,
 644       # AGG_FALLBACK => 6,
 645       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 646       # (only gains .6% on euc-jp  -- is it worth it?)
 647       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 648                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 649                        $agg_fallback = $fallback];
 650     }
 651     if (exists $next->{Cname}) {
 652       $next->{'Forward'} = 1 if $next != $a;
 653     } else {
 654       process(sprintf("%s_%02x",$name,$b),$next);
 655     }
 656   }
 657   # encengine.c rules say that last entry must be for 255
 658   if ($agg_max_in < 255) {
 659     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 660   }
 661   $a->{'Entries'} = \@ent;
 662 }
 663
 664 sub outtable
 665 {
 666  my ($fh,$a) = @_;
 667  my $name = $a->{'Cname'};
 668  # String tables
 669  foreach my $b (@{$a->{'Entries'}})
 670   {
 671    next unless $b->[AGG_OUT_LEN];
 672    my $s = $b->[AGG_MIN_IN];
 673    my $e = $b->[AGG_MAX_IN];
 674    outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
 675   }
 676  if ($a->{'Forward'})
 677   {
 678    print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 679   }
 680  $a->{'Done'} = 1;
 681  foreach my $b (@{$a->{'Entries'}})
 682   {
 683    my ($s,$e,$out,$t,$end,$l) = @$b;
 684    outtable($fh,$t) unless $t->{'Done'};
 685   }
 686  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 687  foreach my $b (@{$a->{'Entries'}})
 688   {
 689    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 690    $end |= 0x80 if $fb;
 691    print  $fh "{";
 692    if ($l)
 693     {
 694      printf $fh outstring($fh,'',$out);
 695     }
 696    else
 697     {
 698      print  $fh "0";
 699     }
 700    print  $fh ",",$t->{Cname};
 701    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 702   }
 703  print $fh "};\n";
 704 }
 705
 706 sub output
 707 {
 708  my ($fh,$name,$a) = @_;
 709  process($name,$a);
 710  # Sub-tables
 711  outtable($fh,$a);
 712 }
 713
 714 sub output_enc
 715 {
 716  my ($fh,$name,$a) = @_;
 717  die "Changed - fix me for new structure";
 718  foreach my $b (sort keys %$a)
 719   {
 720    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 721   }
 722 }
 723
 724 sub decode_U
 725 {
 726  my $s = shift;
 727 }
 728
 729 my @uname;
 730 sub char_names
 731 {
 732  my $s = do "unicore/Name.pl";
 733  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 734  pos($s) = 0;
 735  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 736   {
 737    my $name = $3;
 738    my $s = hex($1);
 739    last if $s >= 0x10000;
 740    my $e = length($2) ? hex($2) : $s;
 741    for (my $i = $s; $i <= $e; $i++)
 742     {
 743      $uname[$i] = $name;
 744 #    print sprintf("U%04X $name\n",$i);
 745     }
 746   }
 747 }
 748
 749 sub output_ucm_page
 750 {
 751   my ($cmap,$a,$t,$pre) = @_;
 752   # warn sprintf("Page %x\n",$pre);
 753   my $raw = $t->{Raw};
 754   foreach my $key (sort keys %$raw) {
 755     #  RAW_NEXT => 0,
 756     #  RAW_IN_LEN => 1,
 757     #  RAW_OUT_BYTES => 2,
 758     #  RAW_FALLBACK => 3,
 759     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 760     my $u = ord $key;
 761     $fallback ||= 0;
 762
 763     if ($next != $a && $next != $t) {
 764       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 765     } elsif (length $out_bytes) {
 766       if ($pre) {
 767         $u = $pre|($u &0x3f);
 768       }
 769       my $s = sprintf "<U%04X> ",$u;
 770       #foreach my $c (split(//,$out_bytes)) {
 771       #  $s .= sprintf "\\x%02X",ord($c);
 772       #}
 773       # 9.5% faster changing that loop to this:
 774       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 775       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 776       push(@$cmap,$s);
 777     } else {
 778       warn join(',',$u, @{$raw->{$key}},$a,$t);
 779     }
 780   }
 781 }
 782
 783 sub output_ucm
 784 {
 785  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 786  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 787  print $fh "<code_set_name> \"$name\"\n";
 788  char_names();
 789  if (defined $min_el)
 790   {
 791    print $fh "<mb_cur_min> $min_el\n";
 792   }
 793  if (defined $max_el)
 794   {
 795    print $fh "<mb_cur_max> $max_el\n";
 796   }
 797  if (defined $rep)
 798   {
 799    print $fh "<subchar> ";
 800    foreach my $c (split(//,$rep))
 801     {
 802      printf $fh "\\x%02X",ord($c);
 803     }
 804    print $fh "\n";
 805   }
 806  my @cmap;
 807  output_ucm_page(\@cmap,$h,$h,0);
 808  print $fh "#\nCHARMAP\n";
 809  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 810   {
 811    print $fh $line;
 812   }
 813  print $fh "END CHARMAP\n";
 814 }
 815
 816
 817 __END__
 818 With %seen holding array refs:
 819
 820       865.66 real        28.80 user         8.79 sys
 821       7904  maximum resident set size
 822       1356  average shared memory size
 823      18566  average unshared data size
 824        229  average unshared stack size
 825      46080  page reclaims
 826      33373  page faults
 827
 828 With %seen holding simple scalars:
 829
 830       342.16 real        27.11 user         3.54 sys
 831       8388  maximum resident set size
 832       1394  average shared memory size
 833      14969  average unshared data size
 834        236  average unshared stack size
 835      28159  page reclaims
 836       9839  page faults
 837
 838 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
 839 how %seen is storing things its seen. So it is pathalogically bad on a 16M
 840 RAM machine, but it's going to help even on modern machines.
 841 Swapping is bad, m'kay :-)