ext/Encode/compile

   1 #!../../perl -w
   2 BEGIN {
   3     unshift @INC, qw(../../lib ../../../lib);
   4     $ENV{PATH} .= ';../..;../../..' if $^O eq 'MSWin32';
   5 }
   6 use strict;
   7 use Getopt::Std;
   8 my @orig_ARGV = @ARGV;
   9
  10 # These may get re-ordered.
  11 # RAW is a do_now as inserted by &enter
  12 # AGG is an aggreagated do_now, as built up by &process
  13 use constant {
  14   RAW_NEXT => 0,
  15   RAW_IN_LEN => 1,
  16   RAW_OUT_BYTES => 2,
  17   RAW_FALLBACK => 3,
  18
  19   AGG_MIN_IN => 0,
  20   AGG_MAX_IN => 1,
  21   AGG_OUT_BYTES => 2,
  22   AGG_NEXT => 3,
  23   AGG_IN_LEN => 4,
  24   AGG_OUT_LEN => 5,
  25   AGG_FALLBACK => 6,
  26 };
  27 # (See the algorithm in encengine.c - we're building structures for it)
  28
  29 # There are two sorts of structures.
  30 # "do_now" (an array, two variants of what needs storing) is whatever we need
  31 # to do now we've read an input byte.
  32 # It's housed in a "do_next" (which is how we got to it), and in turn points
  33 # to a "do_next" which contains all the "do_now"s for the next input byte.
  34
  35 # There will be a "do_next" which is the start state.
  36 # For a single byte encoding it's the only "do_next" - each "do_now" points
  37 # back to it, and each "do_now" will cause bytes. There is no state.
  38
  39 # For a multi-byte encoding where all characters in the input are the same
  40 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  41 # branching out from the start state, one step for each input byte.
  42 # The leaf "do_now"s will all be at the same distance from the start state,
  43 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  44 # the start state.
  45
  46 # For an encoding where there are varaible length input byte sequences, you
  47 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  48 # as before the leaves will point back to the start state.
  49
  50 # The system will cope with escape encodings (imagine them as a mostly
  51 # self-contained tree for each escape state, and cross links between trees
  52 # at the state-switching characters) but so far no input format defines these.
  53
  54 # The system will also cope with having output "leaves" in the middle of
  55 # the bifurcating branches, not just at the extremities, but again no
  56 # input format does this yet.
  57
  58 # There are two variants of the "do_now" structure. The first, smaller variant
  59 # is generated by &enter as the input file is read. There is one structure
  60 # for each input byte. Say we are mapping a single byte encoding to a
  61 # single byte encoding, with  "ABCD" going "abcd". There will be
  62 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  63
  64 # &process then walks the tree, building aggregate "do_now" structres for
  65 # adjacent bytes where possible. The aggregate is for a contiguous range of
  66 # bytes which each produce the same length of output, each move to the
  67 # same next state, and each have the same fallback flag.
  68 # So our 4 RAW "do_now"s above become replaced by a single structure
  69 # containing:
  70 # ["A", "D", "abcd", 1, ...]
  71 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  72 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  73 # which maps very nicely into pointer arithmetic in C for encengine.c
  74
  75 sub encode_U
  76 {
  77  # UTF-8 encode long hand - only covers part of perl's range
  78  ## my $uv = shift;
  79  # chr() works in native space so convert value from table
  80  # into that space before using chr().
  81  my $ch = chr(utf8::unicode_to_native($_[0]));
  82  # Now get core perl to encode that the way it likes.
  83  utf8::encode($ch);
  84  return $ch;
  85 }
  86
  87 sub encode_S
  88 {
  89  # encode single byte
  90  ## my ($ch,$page) = @_; return chr($ch);
  91  return chr $_[0];
  92 }
  93
  94 sub encode_D
  95 {
  96  # encode double byte MS byte first
  97  ## my ($ch,$page) = @_; return chr($page).chr($ch);
  98  return chr ($_[1]) . chr $_[0];
  99 }
 100
 101 sub encode_M
 102 {
 103  # encode Multi-byte - single for 0..255 otherwise double
 104  ## my ($ch,$page) = @_;
 105  ## return &encode_D if $page;
 106  ## return &encode_S;
 107  return chr ($_[1]) . chr $_[0] if $_[1];
 108  return chr $_[0];
 109 }
 110
 111 my %encode_types = (U => \&encode_U,
 112                     S => \&encode_S,
 113                     D => \&encode_D,
 114                     M => \&encode_M,
 115                    );
 116
 117 # Win32 does not expand globs on command line
 118 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 119
 120 my %opt;
 121 getopts('qOo:f:n:',\%opt);
 122 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 123 chmod(0666,$cname) if -f $cname && !-w $cname;
 124 open(C,">$cname") || die "Cannot open $cname:$!";
 125
 126
 127 my $dname = $cname;
 128 $dname =~ s/(\.[^\.]*)?$/_def.h/;
 129
 130 my ($doC,$doEnc,$doUcm,$doPet);
 131
 132 if ($cname =~ /\.(c|xs)$/)
 133  {
 134   $doC = 1;
 135   chmod(0666,$dname) if -f $cname && !-w $dname;
 136   open(D,">$dname") || die "Cannot open $dname:$!";
 137   my $hname = $cname;
 138   $hname =~ s/(\.[^\.]*)?$/.h/;
 139   chmod(0666,$hname) if -f $cname && !-w $hname;
 140   open(H,">$hname") || die "Cannot open $hname:$!";
 141
 142   foreach my $fh (\*C,\*D,\*H)
 143   {
 144    print $fh <<"END" unless $opt{'q'};
 145 /*
 146  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 147  This file was autogenerated by:
 148  $^X $0 @orig_ARGV
 149 */
 150 END
 151   }
 152
 153   if ($cname =~ /(\w+)\.xs$/)
 154    {
 155     print C "#include <EXTERN.h>\n";
 156     print C "#include <perl.h>\n";
 157     print C "#include <XSUB.h>\n";
 158     print C "#define U8 U8\n";
 159    }
 160   print C "#include \"encode.h\"\n";
 161
 162  }
 163 elsif ($cname =~ /\.enc$/)
 164  {
 165   $doEnc = 1;
 166  }
 167 elsif ($cname =~ /\.ucm$/)
 168  {
 169   $doUcm = 1;
 170  }
 171 elsif ($cname =~ /\.pet$/)
 172  {
 173   $doPet = 1;
 174  }
 175
 176 my @encfiles;
 177 if (exists $opt{'f'})
 178  {
 179   # -F is followed by name of file containing list of filenames
 180   my $flist = $opt{'f'};
 181   open(FLIST,$flist) || die "Cannot open $flist:$!";
 182   chomp(@encfiles = <FLIST>);
 183   close(FLIST);
 184  }
 185 else
 186  {
 187   @encfiles = @ARGV;
 188  }
 189
 190 my %encoding;
 191 my %strings;
 192 my $saved = 0;
 193 my $subsave = 0;
 194 my $strings = 0;
 195
 196 sub cmp_name
 197 {
 198  if ($a =~ /^.*-(\d+)/)
 199   {
 200    my $an = $1;
 201    if ($b =~ /^.*-(\d+)/)
 202     {
 203      my $r = $an <=> $1;
 204      return $r if $r;
 205     }
 206   }
 207  return $a cmp $b;
 208 }
 209
 210
 211 foreach my $enc (sort cmp_name @encfiles)
 212  {
 213   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 214   $name = $opt{'n'} if exists $opt{'n'};
 215   if (open(E,$enc))
 216    {
 217     if ($sfx eq 'enc')
 218      {
 219       compile_enc(\*E,lc($name));
 220      }
 221     else
 222      {
 223       compile_ucm(\*E,lc($name));
 224      }
 225    }
 226   else
 227    {
 228     warn "Cannot open $enc for $name:$!";
 229    }
 230  }
 231
 232 if ($doC)
 233  {
 234   print STDERR "Writing compiled form\n";
 235   foreach my $name (sort cmp_name keys %encoding)
 236    {
 237     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 238     output(\*C,$name.'_utf8',$e2u);
 239     output(\*C,'utf8_'.$name,$u2e);
 240     push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 241    }
 242   foreach my $enc (sort cmp_name keys %encoding)
 243    {
 244     my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 245     my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 246     my $sym = "${enc}_encoding";
 247     $sym =~ s/\W+/_/g;
 248     print C "encode_t $sym = \n";
 249     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 250    }
 251
 252   foreach my $enc (sort cmp_name keys %encoding)
 253    {
 254     my $sym = "${enc}_encoding";
 255     $sym =~ s/\W+/_/g;
 256     print H "extern encode_t $sym;\n";
 257     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 258    }
 259
 260   if ($cname =~ /(\w+)\.xs$/)
 261    {
 262     my $mod = $1;
 263     print C <<'END';
 264
 265 static void
 266 Encode_XSEncoding(pTHX_ encode_t *enc)
 267 {
 268  dSP;
 269  HV *stash = gv_stashpv("Encode::XS", TRUE);
 270  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 271  int i = 0;
 272  PUSHMARK(sp);
 273  XPUSHs(sv);
 274  while (enc->name[i])
 275   {
 276    const char *name = enc->name[i++];
 277    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 278   }
 279  PUTBACK;
 280  call_pv("Encode::define_encoding",G_DISCARD);
 281  SvREFCNT_dec(sv);
 282 }
 283
 284 END
 285
 286     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 287     print C "BOOT:\n{\n";
 288     print C "#include \"$dname\"\n";
 289     print C "}\n";
 290    }
 291   close(D);
 292   close(H);
 293   printf STDERR "%d bytes in string tables\n",$strings;
 294   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",$saved,100*$saved/$strings if $saved;
 295   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",$subsave,100*$subsave/$strings if $subsave;
 296  }
 297 elsif ($doEnc)
 298  {
 299   foreach my $name (sort cmp_name keys %encoding)
 300    {
 301     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 302     output_enc(\*C,$name,$e2u);
 303    }
 304  }
 305 elsif ($doUcm)
 306  {
 307   foreach my $name (sort cmp_name keys %encoding)
 308    {
 309     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 310     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 311    }
 312  }
 313
 314 close(C);
 315
 316
 317
 318 sub compile_ucm
 319 {
 320  my ($fh,$name) = @_;
 321  my $e2u = {};
 322  my $u2e = {};
 323  my $cs;
 324  my %attr;
 325  while (<$fh>)
 326   {
 327    s/#.*$//;
 328    last if /^\s*CHARMAP\s*$/i;
 329    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 330     {
 331      $attr{$1} = $2;
 332     }
 333   }
 334  if (!defined($cs =  $attr{'code_set_name'}))
 335   {
 336    warn "No <code_set_name> in $name\n";
 337   }
 338  else
 339   {
 340    $name = $cs unless exists $opt{'n'};
 341   }
 342  my $erep;
 343  my $urep;
 344  my $max_el;
 345  my $min_el;
 346  if (exists $attr{'subchar'})
 347   {
 348    my @byte;
 349    $attr{'subchar'} =~ /^\s*/cg;
 350    push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 351    $erep = join('',map(chr(hex($_)),@byte));
 352   }
 353  print "Reading $name ($cs)\n";
 354  my $nfb = 0;
 355  my $hfb = 0;
 356  while (<$fh>)
 357   {
 358    s/#.*$//;
 359    last if /^\s*END\s+CHARMAP\s*$/i;
 360    next if /^\s*$/;
 361    my ($u,@byte);
 362    my $fb = '';
 363    $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
 364    push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
 365    $fb = $1 if /\G\s*(\|[0-3])/gc;
 366    # warn "$_: $u @byte | $fb\n";
 367    die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
 368    if (defined($u))
 369     {
 370      my $uch = encode_U(hex($u));
 371      my $ech = join('',map(chr(hex($_)),@byte));
 372      my $el  = length($ech);
 373      $max_el = $el if (!defined($max_el) || $el > $max_el);
 374      $min_el = $el if (!defined($min_el) || $el < $min_el);
 375      if (length($fb))
 376       {
 377        $fb = substr($fb,1);
 378        $hfb++;
 379       }
 380      else
 381       {
 382        $nfb++;
 383        $fb = '0';
 384       }
 385      # $fb is fallback flag
 386      # 0 - round trip safe
 387      # 1 - fallback for unicode -> enc
 388      # 2 - skip sub-char mapping
 389      # 3 - fallback enc -> unicode
 390      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 391      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 392     }
 393    else
 394     {
 395      warn $_;
 396     }
 397   }
 398  if ($nfb && $hfb)
 399   {
 400    die "$nfb entries without fallback, $hfb entries with\n";
 401   }
 402  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 403 }
 404
 405 sub compile_enc
 406 {
 407  my ($fh,$name) = @_;
 408  my $e2u = {};
 409  my $u2e = {};
 410
 411  my $type;
 412  while ($type = <$fh>)
 413   {
 414    last if $type !~ /^\s*#/;
 415   }
 416  chomp($type);
 417  return if $type eq 'E';
 418  # Do the hash lookup once, rather than once per function call. 4% speedup.
 419  my $type_func = $encode_types{$type};
 420  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 421  warn "$type encoded $name\n";
 422  my $rep = '';
 423  # Save a defined test by setting these to defined values.
 424  my $min_el = ~0; # A very big integer
 425  my $max_el = 0;  # Anything must be longer than 0
 426  {
 427   my $v = hex($def);
 428   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 429  }
 430  my %seen;
 431  do
 432   {
 433    my $line = <$fh>;
 434    chomp($line);
 435    my $page = hex($line);
 436    my $ch = 0;
 437    my $i = 16;
 438    do
 439     {
 440      # So why is it 1% faster to leave the my here?
 441      my $line = <$fh>;
 442      die "Line should be exactly 65 characters long including newline"
 443        unless length ($line) == 65;
 444      # Split line into groups of 4 hex digits, convert groups to ints
 445      # This takes 65.35
 446      # map {hex $_} $line =~ /(....)/g
 447      # This takes 63.75 (2.5% less time)
 448      # unpack "n*", pack "H*", $line
 449      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 450      # Doing it as while ($line =~ /(....)/g) took 74.63
 451      foreach my $val (unpack "n*", pack "H*", $line)
 452       {
 453        next if $val == 0xFFFD;
 454        my $ech = &$type_func($ch,$page);
 455        if ($val || (!$ch && !$page))
 456         {
 457          my $el  = length($ech);
 458          $max_el = $el if $el > $max_el;
 459          $min_el = $el if $el < $min_el;
 460          my $uch = encode_U($val);
 461          # We don't need to read this quickly, so storing it as a scalar,
 462          # rather than 3 (anon array, plus the 2 scalars it holds) saves
 463          # RAM and may make us faster on low RAM systems. [see __END__]
 464          if (exists $seen{$uch})
 465           {
 466            warn sprintf("U%04X is %02X%02X and %04X\n",
 467                         $val,$page,$ch,$seen{$uch});
 468           }
 469          else
 470           {
 471            $seen{$uch} = $page << 16 | $ch;
 472           }
 473          # Passing 2 extra args each time is 3.6% slower!
 474          # Even with having to add $fallback ||= 0 later
 475          enter_fb0($e2u,$ech,$uch);
 476          enter_fb0($u2e,$uch,$ech);
 477         }
 478        else
 479         {
 480          # No character at this position
 481          # enter($e2u,$ech,undef,$e2u);
 482         }
 483        $ch++;
 484       }
 485     } while --$i;
 486   } while --$pages;
 487  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 488    if $min_el > $max_el;
 489  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 490 }
 491
 492 # my ($a,$s,$d,$t,$fb) = @_;
 493 sub enter {
 494   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 495   # state we shift to after this (multibyte) input character defaults to same
 496   # as current state.
 497   $next ||= $current;
 498   # Making sure it is defined seems to be faster than {no warnings;} in
 499   # &process, or passing it in as 0 explicity.
 500   # XXX $fallback ||= 0;
 501
 502   # Start at the beginning and work forwards through the string to zero.
 503   # effectively we are removing 1 character from the front each time
 504   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 505   # Hence -$pos is the length of the remaining string.
 506   my $pos = -length $inbytes;
 507   while (1) {
 508     my $byte = substr $inbytes, $pos, 1;
 509     #  RAW_NEXT => 0,
 510     #  RAW_IN_LEN => 1,
 511     #  RAW_OUT_BYTES => 2,
 512     #  RAW_FALLBACK => 3,
 513     # to unicode an array would seem to be better, because the pages are dense.
 514     # from unicode can be very sparse, favouring a hash.
 515     # hash using the bytes (all length 1) as keys rather than ord value,
 516     # as it's easier to sort these in &process.
 517
 518     # It's faster to always add $fallback even if it's undef, rather than
 519     # choosing between 3 and 4 element array. (hence why we set it defined
 520     # above)
 521     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 522     # When $pos was -1 we were at the last input character.
 523     unless (++$pos) {
 524       $do_now->[RAW_OUT_BYTES] = $outbytes;
 525       $do_now->[RAW_NEXT] = $next;
 526       return;
 527     }
 528     # Tail recursion. The intermdiate state may not have a name yet.
 529     $current = $do_now->[RAW_NEXT];
 530   }
 531 }
 532
 533 # This is purely for optimistation. It's just &enter hard coded for $fallback
 534 # of 0, using only a 3 entry array ref to save memory for every entry.
 535 sub enter_fb0 {
 536   my ($current,$inbytes,$outbytes,$next) = @_;
 537   $next ||= $current;
 538
 539   my $pos = -length $inbytes;
 540   while (1) {
 541     my $byte = substr $inbytes, $pos, 1;
 542     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 543     unless (++$pos) {
 544       $do_now->[RAW_OUT_BYTES] = $outbytes;
 545       $do_now->[RAW_NEXT] = $next;
 546       return;
 547     }
 548     $current = $do_now->[RAW_NEXT];
 549   }
 550 }
 551
 552
 553 sub outstring
 554 {
 555  my ($fh,$name,$s) = @_;
 556  my $sym = $strings{$s};
 557  if ($sym)
 558   {
 559    $saved += length($s);
 560   }
 561  else
 562   {
 563    if ($opt{'O'}) {
 564        foreach my $o (keys %strings)
 565         {
 566          next unless (my $i = index($o,$s)) >= 0;
 567          $sym = $strings{$o};
 568          $sym .= sprintf("+0x%02x",$i) if ($i);
 569          $subsave += length($s);
 570          return $strings{$s} = $sym;
 571        }
 572    }
 573    $strings{$s} = $sym = $name;
 574    $strings += length($s);
 575    my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
 576    # Maybe we should assert that these are all <256.
 577    $definition .= join(',',unpack "C*",$s);
 578    # We have a single long line. Split it at convenient commas.
 579    $definition =~ s/(.{74,77},)/$1\n/g;
 580    print $fh "$definition };\n\n";
 581   }
 582  return $sym;
 583 }
 584
 585 sub process
 586 {
 587   my ($name,$a) = @_;
 588   $name =~ s/\W+/_/g;
 589   $a->{Cname} = $name;
 590   my $raw = $a->{Raw};
 591   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 592   my @ent;
 593   $agg_max_in = 0;
 594   foreach my $key (sort keys %$raw) {
 595     #  RAW_NEXT => 0,
 596     #  RAW_IN_LEN => 1,
 597     #  RAW_OUT_BYTES => 2,
 598     #  RAW_FALLBACK => 3,
 599     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 600     # Now we are converting from raw to aggregate, switch from 1 byte strings
 601     # to numbers
 602     my $b = ord $key;
 603     $fallback ||= 0;
 604     if ($l &&
 605         # If this == fails, we're going to reset $agg_max_in below anyway.
 606         $b == ++$agg_max_in &&
 607         # References in numeric context give the pointer as an int.
 608         $agg_next == $next &&
 609         $agg_in_len == $in_len &&
 610         $agg_out_len == length $out_bytes &&
 611         $agg_fallback == $fallback
 612         # && length($l->[AGG_OUT_BYTES]) < 16
 613        ) {
 614       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 615       # we can aggregate this byte onto the end.
 616       $l->[AGG_MAX_IN] = $b;
 617       $l->[AGG_OUT_BYTES] .= $out_bytes;
 618     } else {
 619       # AGG_MIN_IN => 0,
 620       # AGG_MAX_IN => 1,
 621       # AGG_OUT_BYTES => 2,
 622       # AGG_NEXT => 3,
 623       # AGG_IN_LEN => 4,
 624       # AGG_OUT_LEN => 5,
 625       # AGG_FALLBACK => 6,
 626       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 627       # (only gains .6% on euc-jp  -- is it worth it?)
 628       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 629                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 630                        $agg_fallback = $fallback];
 631     }
 632     if (exists $next->{Cname}) {
 633       $next->{'Forward'} = 1 if $next != $a;
 634     } else {
 635       process(sprintf("%s_%02x",$name,$b),$next);
 636     }
 637   }
 638   # encengine.c rules say that last entry must be for 255
 639   if ($agg_max_in < 255) {
 640     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 641   }
 642   $a->{'Entries'} = \@ent;
 643 }
 644
 645 sub outtable
 646 {
 647  my ($fh,$a) = @_;
 648  my $name = $a->{'Cname'};
 649  # String tables
 650  foreach my $b (@{$a->{'Entries'}})
 651   {
 652    next unless $b->[AGG_OUT_LEN];
 653    my $s = $b->[AGG_MIN_IN];
 654    my $e = $b->[AGG_MAX_IN];
 655    outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
 656   }
 657  if ($a->{'Forward'})
 658   {
 659    print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 660   }
 661  $a->{'Done'} = 1;
 662  foreach my $b (@{$a->{'Entries'}})
 663   {
 664    my ($s,$e,$out,$t,$end,$l) = @$b;
 665    outtable($fh,$t) unless $t->{'Done'};
 666   }
 667  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 668  foreach my $b (@{$a->{'Entries'}})
 669   {
 670    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 671    $end |= 0x80 if $fb;
 672    print  $fh "{";
 673    if ($l)
 674     {
 675      printf $fh outstring($fh,'',$out);
 676     }
 677    else
 678     {
 679      print  $fh "0";
 680     }
 681    print  $fh ",",$t->{Cname};
 682    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 683   }
 684  print $fh "};\n";
 685 }
 686
 687 sub output
 688 {
 689  my ($fh,$name,$a) = @_;
 690  process($name,$a);
 691  # Sub-tables
 692  outtable($fh,$a);
 693 }
 694
 695 sub output_enc
 696 {
 697  my ($fh,$name,$a) = @_;
 698  die "Changed - fix me for new structure";
 699  foreach my $b (sort keys %$a)
 700   {
 701    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 702   }
 703 }
 704
 705 sub decode_U
 706 {
 707  my $s = shift;
 708 }
 709
 710 my @uname;
 711 sub char_names
 712 {
 713  my $s = do "unicore/Name.pl";
 714  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 715  pos($s) = 0;
 716  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 717   {
 718    my $name = $3;
 719    my $s = hex($1);
 720    last if $s >= 0x10000;
 721    my $e = length($2) ? hex($2) : $s;
 722    for (my $i = $s; $i <= $e; $i++)
 723     {
 724      $uname[$i] = $name;
 725 #    print sprintf("U%04X $name\n",$i);
 726     }
 727   }
 728 }
 729
 730 sub output_ucm_page
 731 {
 732  my ($cmap,$a,$t,$pre) = @_;
 733  # warn sprintf("Page %x\n",$pre);
 734  foreach my $b (sort keys %$t)
 735   {
 736     die "Changed - fix me for new structure";
 737    my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}};
 738    die "oops $s $e" unless $s eq $e;
 739    my $u = ord($s);
 740    if ($n != $a && $n != $t)
 741     {
 742      output_ucm_page($cmap,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF);
 743     }
 744    elsif (length($out))
 745     {
 746      if ($pre)
 747       {
 748        $u = $pre|($u &0x3f);
 749       }
 750      my $s = sprintf "<U%04X> ",$u;
 751      foreach my $c (split(//,$out))
 752       {
 753        $s .= sprintf "\\x%02X",ord($c);
 754       }
 755      $s .= sprintf " |%d # %s\n",($fb ? 1 : 0),$uname[$u];
 756      push(@$cmap,$s);
 757     }
 758    else
 759     {
 760      warn join(',',@{$t->{$b}},$a,$t);
 761     }
 762   }
 763 }
 764
 765 sub output_ucm
 766 {
 767  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 768  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 769  print $fh "<code_set_name> \"$name\"\n";
 770  char_names();
 771  if (defined $min_el)
 772   {
 773    print $fh "<mb_cur_min> $min_el\n";
 774   }
 775  if (defined $max_el)
 776   {
 777    print $fh "<mb_cur_max> $max_el\n";
 778   }
 779  if (defined $rep)
 780   {
 781    print $fh "<subchar> ";
 782    foreach my $c (split(//,$rep))
 783     {
 784      printf $fh "\\x%02X",ord($c);
 785     }
 786    print $fh "\n";
 787   }
 788  my @cmap;
 789  output_ucm_page(\@cmap,$h,$h,0);
 790  print $fh "#\nCHARMAP\n";
 791  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 792   {
 793    print $fh $line;
 794   }
 795  print $fh "END CHARMAP\n";
 796 }
 797
 798
 799 __END__
 800 With %seen holding array refs:
 801
 802       865.66 real        28.80 user         8.79 sys
 803       7904  maximum resident set size
 804       1356  average shared memory size
 805      18566  average unshared data size
 806        229  average unshared stack size
 807      46080  page reclaims
 808      33373  page faults
 809
 810 With %seen holding simple scalars:
 811
 812       342.16 real        27.11 user         3.54 sys
 813       8388  maximum resident set size
 814       1394  average shared memory size
 815      14969  average unshared data size
 816        236  average unshared stack size
 817      28159  page reclaims
 818       9839  page faults
 819
 820 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
 821 how %seen is storing things its seen. So it is pathalogically bad on a 16M
 822 RAM machine, but it's going to help even on modern machines.
 823 Swapping is bad, m'kay :-)