ext/Encode/compile

   1 #!../../perl -w
   2 BEGIN {
   3     unshift @INC, qw(../../lib ../../../lib);
   4     $ENV{PATH} .= ';../..;../../..' if $^O eq 'MSWin32';
   5 }
   6 use strict;
   7 use Getopt::Std;
   8 my @orig_ARGV = @ARGV;
   9
  10 # These may get re-ordered.
  11 # RAW is a do_now as inserted by &enter
  12 # AGG is an aggreagated do_now, as built up by &process
  13 use constant {
  14   RAW_NEXT => 0,
  15   RAW_IN_LEN => 1,
  16   RAW_OUT_BYTES => 2,
  17   RAW_FALLBACK => 3,
  18
  19   AGG_MIN_IN => 0,
  20   AGG_MAX_IN => 1,
  21   AGG_OUT_BYTES => 2,
  22   AGG_NEXT => 3,
  23   AGG_IN_LEN => 4,
  24   AGG_OUT_LEN => 5,
  25   AGG_FALLBACK => 6,
  26 };
  27 # (See the algorithm in encengine.c - we're building structures for it)
  28
  29 # There are two sorts of structures.
  30 # "do_now" (an array, two variants of what needs storing) is whatever we need
  31 # to do now we've read an input byte.
  32 # It's housed in a "do_next" (which is how we got to it), and in turn points
  33 # to a "do_next" which contains all the "do_now"s for the next input byte.
  34
  35 # There will be a "do_next" which is the start state.
  36 # For a single byte encoding it's the only "do_next" - each "do_now" points
  37 # back to it, and each "do_now" will cause bytes. There is no state.
  38
  39 # For a multi-byte encoding where all characters in the input are the same
  40 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  41 # branching out from the start state, one step for each input byte.
  42 # The leaf "do_now"s will all be at the same distance from the start state,
  43 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  44 # the start state.
  45
  46 # For an encoding where there are varaible length input byte sequences, you
  47 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  48 # as before the leaves will point back to the start state.
  49
  50 # The system will cope with escape encodings (imagine them as a mostly
  51 # self-contained tree for each escape state, and cross links between trees
  52 # at the state-switching characters) but so far no input format defines these.
  53
  54 # The system will also cope with having output "leaves" in the middle of
  55 # the bifurcating branches, not just at the extremities, but again no
  56 # input format does this yet.
  57
  58 # There are two variants of the "do_now" structure. The first, smaller variant
  59 # is generated by &enter as the input file is read. There is one structure
  60 # for each input byte. Say we are mapping a single byte encoding to a
  61 # single byte encoding, with  "ABCD" going "abcd". There will be
  62 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  63
  64 # &process then walks the tree, building aggregate "do_now" structres for
  65 # adjacent bytes where possible. The aggregate is for a contiguous range of
  66 # bytes which each produce the same length of output, each move to the
  67 # same next state, and each have the same fallback flag.
  68 # So our 4 RAW "do_now"s above become replaced by a single structure
  69 # containing:
  70 # ["A", "D", "abcd", 1, ...]
  71 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  72 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  73 # which maps very nicely into pointer arithmetic in C for encengine.c
  74
  75 sub encode_U
  76 {
  77  # UTF-8 encode long hand - only covers part of perl's range
  78  ## my $uv = shift;
  79  # chr() works in native space so convert value from table
  80  # into that space before using chr().
  81  my $ch = chr(utf8::unicode_to_native($_[0]));
  82  # Now get core perl to encode that the way it likes.
  83  utf8::encode($ch);
  84  return $ch;
  85 }
  86
  87 sub encode_S
  88 {
  89  # encode single byte
  90  ## my ($ch,$page) = @_; return chr($ch);
  91  return chr $_[0];
  92 }
  93
  94 sub encode_D
  95 {
  96  # encode double byte MS byte first
  97  ## my ($ch,$page) = @_; return chr($page).chr($ch);
  98  return chr ($_[1]) . chr $_[0];
  99 }
 100
 101 sub encode_M
 102 {
 103  # encode Multi-byte - single for 0..255 otherwise double
 104  ## my ($ch,$page) = @_;
 105  ## return &encode_D if $page;
 106  ## return &encode_S;
 107  return chr ($_[1]) . chr $_[0] if $_[1];
 108  return chr $_[0];
 109 }
 110
 111 my %encode_types = (U => \&encode_U,
 112                     S => \&encode_S,
 113                     D => \&encode_D,
 114                     M => \&encode_M,
 115                    );
 116
 117 # Win32 does not expand globs on command line
 118 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 119
 120 my %opt;
 121 getopts('qOo:f:n:',\%opt);
 122 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 123 chmod(0666,$cname) if -f $cname && !-w $cname;
 124 open(C,">$cname") || die "Cannot open $cname:$!";
 125
 126
 127 my $dname = $cname;
 128 $dname =~ s/(\.[^\.]*)?$/_def.h/;
 129
 130 my ($doC,$doEnc,$doUcm,$doPet);
 131
 132 if ($cname =~ /\.(c|xs)$/)
 133  {
 134   $doC = 1;
 135   chmod(0666,$dname) if -f $cname && !-w $dname;
 136   open(D,">$dname") || die "Cannot open $dname:$!";
 137   my $hname = $cname;
 138   $hname =~ s/(\.[^\.]*)?$/.h/;
 139   chmod(0666,$hname) if -f $cname && !-w $hname;
 140   open(H,">$hname") || die "Cannot open $hname:$!";
 141
 142   foreach my $fh (\*C,\*D,\*H)
 143   {
 144    print $fh <<"END" unless $opt{'q'};
 145 /*
 146  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 147  This file was autogenerated by:
 148  $^X $0 @orig_ARGV
 149 */
 150 END
 151   }
 152
 153   if ($cname =~ /(\w+)\.xs$/)
 154    {
 155     print C "#include <EXTERN.h>\n";
 156     print C "#include <perl.h>\n";
 157     print C "#include <XSUB.h>\n";
 158     print C "#define U8 U8\n";
 159    }
 160   print C "#include \"encode.h\"\n";
 161
 162  }
 163 elsif ($cname =~ /\.enc$/)
 164  {
 165   $doEnc = 1;
 166  }
 167 elsif ($cname =~ /\.ucm$/)
 168  {
 169   $doUcm = 1;
 170  }
 171 elsif ($cname =~ /\.pet$/)
 172  {
 173   $doPet = 1;
 174  }
 175
 176 my @encfiles;
 177 if (exists $opt{'f'})
 178  {
 179   # -F is followed by name of file containing list of filenames
 180   my $flist = $opt{'f'};
 181   open(FLIST,$flist) || die "Cannot open $flist:$!";
 182   chomp(@encfiles = <FLIST>);
 183   close(FLIST);
 184  }
 185 else
 186  {
 187   @encfiles = @ARGV;
 188  }
 189
 190 my %encoding;
 191 my %strings;
 192 my $saved = 0;
 193 my $subsave = 0;
 194 my $strings = 0;
 195
 196 sub cmp_name
 197 {
 198  if ($a =~ /^.*-(\d+)/)
 199   {
 200    my $an = $1;
 201    if ($b =~ /^.*-(\d+)/)
 202     {
 203      my $r = $an <=> $1;
 204      return $r if $r;
 205     }
 206   }
 207  return $a cmp $b;
 208 }
 209
 210
 211 foreach my $enc (sort cmp_name @encfiles)
 212  {
 213   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 214   $name = $opt{'n'} if exists $opt{'n'};
 215   if (open(E,$enc))
 216    {
 217     if ($sfx eq 'enc')
 218      {
 219       compile_enc(\*E,lc($name));
 220      }
 221     else
 222      {
 223       compile_ucm(\*E,lc($name));
 224      }
 225    }
 226   else
 227    {
 228     warn "Cannot open $enc for $name:$!";
 229    }
 230  }
 231
 232 if ($doC)
 233  {
 234   print STDERR "Writing compiled form\n";
 235   foreach my $name (sort cmp_name keys %encoding)
 236    {
 237     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 238     output(\*C,$name.'_utf8',$e2u);
 239     output(\*C,'utf8_'.$name,$u2e);
 240     push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 241    }
 242   foreach my $enc (sort cmp_name keys %encoding)
 243    {
 244     my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 245     my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 246     my $sym = "${enc}_encoding";
 247     $sym =~ s/\W+/_/g;
 248     print C "encode_t $sym = \n";
 249     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 250    }
 251
 252   foreach my $enc (sort cmp_name keys %encoding)
 253    {
 254     my $sym = "${enc}_encoding";
 255     $sym =~ s/\W+/_/g;
 256     print H "extern encode_t $sym;\n";
 257     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 258    }
 259
 260   if ($cname =~ /(\w+)\.xs$/)
 261    {
 262     my $mod = $1;
 263     print C <<'END';
 264
 265 static void
 266 Encode_XSEncoding(pTHX_ encode_t *enc)
 267 {
 268  dSP;
 269  HV *stash = gv_stashpv("Encode::XS", TRUE);
 270  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 271  int i = 0;
 272  PUSHMARK(sp);
 273  XPUSHs(sv);
 274  while (enc->name[i])
 275   {
 276    const char *name = enc->name[i++];
 277    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 278   }
 279  PUTBACK;
 280  call_pv("Encode::define_encoding",G_DISCARD);
 281  SvREFCNT_dec(sv);
 282 }
 283
 284 END
 285
 286     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 287     print C "BOOT:\n{\n";
 288     print C "#include \"$dname\"\n";
 289     print C "}\n";
 290    }
 291   close(D);
 292   close(H);
 293
 294   my $perc_saved    = $strings/($strings + $saved) * 100;
 295   my $perc_subsaved = $strings/($strings + $subsave) * 100;
 296   printf STDERR "%d bytes in string tables\n",$strings;
 297   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 298     $saved, $perc_saved              if $saved;
 299   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 300     $subsave, $perc_subsaved         if $subsave;
 301  }
 302 elsif ($doEnc)
 303  {
 304   foreach my $name (sort cmp_name keys %encoding)
 305    {
 306     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 307     output_enc(\*C,$name,$e2u);
 308    }
 309  }
 310 elsif ($doUcm)
 311  {
 312   foreach my $name (sort cmp_name keys %encoding)
 313    {
 314     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 315     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 316    }
 317  }
 318
 319 close(C);
 320
 321
 322
 323 sub compile_ucm
 324 {
 325  my ($fh,$name) = @_;
 326  my $e2u = {};
 327  my $u2e = {};
 328  my $cs;
 329  my %attr;
 330  while (<$fh>)
 331   {
 332    s/#.*$//;
 333    last if /^\s*CHARMAP\s*$/i;
 334    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 335     {
 336      $attr{$1} = $2;
 337     }
 338   }
 339  if (!defined($cs =  $attr{'code_set_name'}))
 340   {
 341    warn "No <code_set_name> in $name\n";
 342   }
 343  else
 344   {
 345    $name = $cs unless exists $opt{'n'};
 346   }
 347  my $erep;
 348  my $urep;
 349  my $max_el;
 350  my $min_el;
 351  if (exists $attr{'subchar'})
 352   {
 353    my @byte;
 354    $attr{'subchar'} =~ /^\s*/cg;
 355    push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 356    $erep = join('',map(chr(hex($_)),@byte));
 357   }
 358  print "Reading $name ($cs)\n";
 359  my $nfb = 0;
 360  my $hfb = 0;
 361  while (<$fh>)
 362   {
 363    s/#.*$//;
 364    last if /^\s*END\s+CHARMAP\s*$/i;
 365    next if /^\s*$/;
 366    my ($u,@byte);
 367    my $fb = '';
 368    $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
 369    push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
 370    $fb = $1 if /\G\s*(\|[0-3])/gc;
 371    # warn "$_: $u @byte | $fb\n";
 372    die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
 373    if (defined($u))
 374     {
 375      my $uch = encode_U(hex($u));
 376      my $ech = join('',map(chr(hex($_)),@byte));
 377      my $el  = length($ech);
 378      $max_el = $el if (!defined($max_el) || $el > $max_el);
 379      $min_el = $el if (!defined($min_el) || $el < $min_el);
 380      if (length($fb))
 381       {
 382        $fb = substr($fb,1);
 383        $hfb++;
 384       }
 385      else
 386       {
 387        $nfb++;
 388        $fb = '0';
 389       }
 390      # $fb is fallback flag
 391      # 0 - round trip safe
 392      # 1 - fallback for unicode -> enc
 393      # 2 - skip sub-char mapping
 394      # 3 - fallback enc -> unicode
 395      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 396      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 397     }
 398    else
 399     {
 400      warn $_;
 401     }
 402   }
 403  if ($nfb && $hfb)
 404   {
 405    die "$nfb entries without fallback, $hfb entries with\n";
 406   }
 407  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 408 }
 409
 410 sub compile_enc
 411 {
 412  my ($fh,$name) = @_;
 413  my $e2u = {};
 414  my $u2e = {};
 415
 416  my $type;
 417  while ($type = <$fh>)
 418   {
 419    last if $type !~ /^\s*#/;
 420   }
 421  chomp($type);
 422  return if $type eq 'E';
 423  # Do the hash lookup once, rather than once per function call. 4% speedup.
 424  my $type_func = $encode_types{$type};
 425  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 426  warn "$type encoded $name\n";
 427  my $rep = '';
 428  # Save a defined test by setting these to defined values.
 429  my $min_el = ~0; # A very big integer
 430  my $max_el = 0;  # Anything must be longer than 0
 431  {
 432   my $v = hex($def);
 433   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 434  }
 435  my %seen;
 436  do
 437   {
 438    my $line = <$fh>;
 439    chomp($line);
 440    my $page = hex($line);
 441    my $ch = 0;
 442    my $i = 16;
 443    do
 444     {
 445      # So why is it 1% faster to leave the my here?
 446      my $line = <$fh>;
 447      die "Line should be exactly 65 characters long including newline"
 448        unless length ($line) == 65;
 449      # Split line into groups of 4 hex digits, convert groups to ints
 450      # This takes 65.35
 451      # map {hex $_} $line =~ /(....)/g
 452      # This takes 63.75 (2.5% less time)
 453      # unpack "n*", pack "H*", $line
 454      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 455      # Doing it as while ($line =~ /(....)/g) took 74.63
 456      foreach my $val (unpack "n*", pack "H*", $line)
 457       {
 458        next if $val == 0xFFFD;
 459        my $ech = &$type_func($ch,$page);
 460        if ($val || (!$ch && !$page))
 461         {
 462          my $el  = length($ech);
 463          $max_el = $el if $el > $max_el;
 464          $min_el = $el if $el < $min_el;
 465          my $uch = encode_U($val);
 466          # We don't need to read this quickly, so storing it as a scalar,
 467          # rather than 3 (anon array, plus the 2 scalars it holds) saves
 468          # RAM and may make us faster on low RAM systems. [see __END__]
 469          if (exists $seen{$uch})
 470           {
 471            warn sprintf("U%04X is %02X%02X and %04X\n",
 472                         $val,$page,$ch,$seen{$uch});
 473           }
 474          else
 475           {
 476            $seen{$uch} = $page << 8 | $ch;
 477           }
 478          # Passing 2 extra args each time is 3.6% slower!
 479          # Even with having to add $fallback ||= 0 later
 480          enter_fb0($e2u,$ech,$uch);
 481          enter_fb0($u2e,$uch,$ech);
 482         }
 483        else
 484         {
 485          # No character at this position
 486          # enter($e2u,$ech,undef,$e2u);
 487         }
 488        $ch++;
 489       }
 490     } while --$i;
 491   } while --$pages;
 492  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 493    if $min_el > $max_el;
 494  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 495 }
 496
 497 # my ($a,$s,$d,$t,$fb) = @_;
 498 sub enter {
 499   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 500   # state we shift to after this (multibyte) input character defaults to same
 501   # as current state.
 502   $next ||= $current;
 503   # Making sure it is defined seems to be faster than {no warnings;} in
 504   # &process, or passing it in as 0 explicity.
 505   # XXX $fallback ||= 0;
 506
 507   # Start at the beginning and work forwards through the string to zero.
 508   # effectively we are removing 1 character from the front each time
 509   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 510   # Hence -$pos is the length of the remaining string.
 511   my $pos = -length $inbytes;
 512   while (1) {
 513     my $byte = substr $inbytes, $pos, 1;
 514     #  RAW_NEXT => 0,
 515     #  RAW_IN_LEN => 1,
 516     #  RAW_OUT_BYTES => 2,
 517     #  RAW_FALLBACK => 3,
 518     # to unicode an array would seem to be better, because the pages are dense.
 519     # from unicode can be very sparse, favouring a hash.
 520     # hash using the bytes (all length 1) as keys rather than ord value,
 521     # as it's easier to sort these in &process.
 522
 523     # It's faster to always add $fallback even if it's undef, rather than
 524     # choosing between 3 and 4 element array. (hence why we set it defined
 525     # above)
 526     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 527     # When $pos was -1 we were at the last input character.
 528     unless (++$pos) {
 529       $do_now->[RAW_OUT_BYTES] = $outbytes;
 530       $do_now->[RAW_NEXT] = $next;
 531       return;
 532     }
 533     # Tail recursion. The intermdiate state may not have a name yet.
 534     $current = $do_now->[RAW_NEXT];
 535   }
 536 }
 537
 538 # This is purely for optimistation. It's just &enter hard coded for $fallback
 539 # of 0, using only a 3 entry array ref to save memory for every entry.
 540 sub enter_fb0 {
 541   my ($current,$inbytes,$outbytes,$next) = @_;
 542   $next ||= $current;
 543
 544   my $pos = -length $inbytes;
 545   while (1) {
 546     my $byte = substr $inbytes, $pos, 1;
 547     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 548     unless (++$pos) {
 549       $do_now->[RAW_OUT_BYTES] = $outbytes;
 550       $do_now->[RAW_NEXT] = $next;
 551       return;
 552     }
 553     $current = $do_now->[RAW_NEXT];
 554   }
 555 }
 556
 557
 558 sub outstring
 559 {
 560  my ($fh,$name,$s) = @_;
 561  my $sym = $strings{$s};
 562  if ($sym)
 563   {
 564    $saved += length($s);
 565   }
 566  else
 567   {
 568    if ($opt{'O'}) {
 569        foreach my $o (keys %strings)
 570         {
 571          next unless (my $i = index($o,$s)) >= 0;
 572          $sym = $strings{$o};
 573          $sym .= sprintf("+0x%02x",$i) if ($i);
 574          $subsave += length($s);
 575          return $strings{$s} = $sym;
 576        }
 577    }
 578    $strings{$s} = $sym = $name;
 579    $strings += length($s);
 580    my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
 581    # Maybe we should assert that these are all <256.
 582    $definition .= join(',',unpack "C*",$s);
 583    # We have a single long line. Split it at convenient commas.
 584    $definition =~ s/(.{74,77},)/$1\n/g;
 585    print $fh "$definition };\n\n";
 586   }
 587  return $sym;
 588 }
 589
 590 sub process
 591 {
 592   my ($name,$a) = @_;
 593   $name =~ s/\W+/_/g;
 594   $a->{Cname} = $name;
 595   my $raw = $a->{Raw};
 596   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 597   my @ent;
 598   $agg_max_in = 0;
 599   foreach my $key (sort keys %$raw) {
 600     #  RAW_NEXT => 0,
 601     #  RAW_IN_LEN => 1,
 602     #  RAW_OUT_BYTES => 2,
 603     #  RAW_FALLBACK => 3,
 604     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 605     # Now we are converting from raw to aggregate, switch from 1 byte strings
 606     # to numbers
 607     my $b = ord $key;
 608     $fallback ||= 0;
 609     if ($l &&
 610         # If this == fails, we're going to reset $agg_max_in below anyway.
 611         $b == ++$agg_max_in &&
 612         # References in numeric context give the pointer as an int.
 613         $agg_next == $next &&
 614         $agg_in_len == $in_len &&
 615         $agg_out_len == length $out_bytes &&
 616         $agg_fallback == $fallback
 617         # && length($l->[AGG_OUT_BYTES]) < 16
 618        ) {
 619       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 620       # we can aggregate this byte onto the end.
 621       $l->[AGG_MAX_IN] = $b;
 622       $l->[AGG_OUT_BYTES] .= $out_bytes;
 623     } else {
 624       # AGG_MIN_IN => 0,
 625       # AGG_MAX_IN => 1,
 626       # AGG_OUT_BYTES => 2,
 627       # AGG_NEXT => 3,
 628       # AGG_IN_LEN => 4,
 629       # AGG_OUT_LEN => 5,
 630       # AGG_FALLBACK => 6,
 631       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 632       # (only gains .6% on euc-jp  -- is it worth it?)
 633       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 634                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 635                        $agg_fallback = $fallback];
 636     }
 637     if (exists $next->{Cname}) {
 638       $next->{'Forward'} = 1 if $next != $a;
 639     } else {
 640       process(sprintf("%s_%02x",$name,$b),$next);
 641     }
 642   }
 643   # encengine.c rules say that last entry must be for 255
 644   if ($agg_max_in < 255) {
 645     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 646   }
 647   $a->{'Entries'} = \@ent;
 648 }
 649
 650 sub outtable
 651 {
 652  my ($fh,$a) = @_;
 653  my $name = $a->{'Cname'};
 654  # String tables
 655  foreach my $b (@{$a->{'Entries'}})
 656   {
 657    next unless $b->[AGG_OUT_LEN];
 658    my $s = $b->[AGG_MIN_IN];
 659    my $e = $b->[AGG_MAX_IN];
 660    outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
 661   }
 662  if ($a->{'Forward'})
 663   {
 664    print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 665   }
 666  $a->{'Done'} = 1;
 667  foreach my $b (@{$a->{'Entries'}})
 668   {
 669    my ($s,$e,$out,$t,$end,$l) = @$b;
 670    outtable($fh,$t) unless $t->{'Done'};
 671   }
 672  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 673  foreach my $b (@{$a->{'Entries'}})
 674   {
 675    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 676    $end |= 0x80 if $fb;
 677    print  $fh "{";
 678    if ($l)
 679     {
 680      printf $fh outstring($fh,'',$out);
 681     }
 682    else
 683     {
 684      print  $fh "0";
 685     }
 686    print  $fh ",",$t->{Cname};
 687    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 688   }
 689  print $fh "};\n";
 690 }
 691
 692 sub output
 693 {
 694  my ($fh,$name,$a) = @_;
 695  process($name,$a);
 696  # Sub-tables
 697  outtable($fh,$a);
 698 }
 699
 700 sub output_enc
 701 {
 702  my ($fh,$name,$a) = @_;
 703  die "Changed - fix me for new structure";
 704  foreach my $b (sort keys %$a)
 705   {
 706    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 707   }
 708 }
 709
 710 sub decode_U
 711 {
 712  my $s = shift;
 713 }
 714
 715 my @uname;
 716 sub char_names
 717 {
 718  my $s = do "unicore/Name.pl";
 719  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 720  pos($s) = 0;
 721  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 722   {
 723    my $name = $3;
 724    my $s = hex($1);
 725    last if $s >= 0x10000;
 726    my $e = length($2) ? hex($2) : $s;
 727    for (my $i = $s; $i <= $e; $i++)
 728     {
 729      $uname[$i] = $name;
 730 #    print sprintf("U%04X $name\n",$i);
 731     }
 732   }
 733 }
 734
 735 sub output_ucm_page
 736 {
 737   my ($cmap,$a,$t,$pre) = @_;
 738   # warn sprintf("Page %x\n",$pre);
 739   my $raw = $t->{Raw};
 740   foreach my $key (sort keys %$raw) {
 741     #  RAW_NEXT => 0,
 742     #  RAW_IN_LEN => 1,
 743     #  RAW_OUT_BYTES => 2,
 744     #  RAW_FALLBACK => 3,
 745     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 746     my $u = ord $key;
 747     $fallback ||= 0;
 748
 749     if ($next != $a && $next != $t) {
 750       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 751     } elsif (length $out_bytes) {
 752       if ($pre) {
 753         $u = $pre|($u &0x3f);
 754       }
 755       my $s = sprintf "<U%04X> ",$u;
 756       #foreach my $c (split(//,$out_bytes)) {
 757       #  $s .= sprintf "\\x%02X",ord($c);
 758       #}
 759       # 9.5% faster changing that lloop to this:
 760       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 761       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 762       push(@$cmap,$s);
 763     } else {
 764       warn join(',',$u, @{$raw->{$key}},$a,$t);
 765     }
 766   }
 767 }
 768
 769 sub output_ucm
 770 {
 771  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 772  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 773  print $fh "<code_set_name> \"$name\"\n";
 774  char_names();
 775  if (defined $min_el)
 776   {
 777    print $fh "<mb_cur_min> $min_el\n";
 778   }
 779  if (defined $max_el)
 780   {
 781    print $fh "<mb_cur_max> $max_el\n";
 782   }
 783  if (defined $rep)
 784   {
 785    print $fh "<subchar> ";
 786    foreach my $c (split(//,$rep))
 787     {
 788      printf $fh "\\x%02X",ord($c);
 789     }
 790    print $fh "\n";
 791   }
 792  my @cmap;
 793  output_ucm_page(\@cmap,$h,$h,0);
 794  print $fh "#\nCHARMAP\n";
 795  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 796   {
 797    print $fh $line;
 798   }
 799  print $fh "END CHARMAP\n";
 800 }
 801
 802
 803 __END__
 804 With %seen holding array refs:
 805
 806       865.66 real        28.80 user         8.79 sys
 807       7904  maximum resident set size
 808       1356  average shared memory size
 809      18566  average unshared data size
 810        229  average unshared stack size
 811      46080  page reclaims
 812      33373  page faults
 813
 814 With %seen holding simple scalars:
 815
 816       342.16 real        27.11 user         3.54 sys
 817       8388  maximum resident set size
 818       1394  average shared memory size
 819      14969  average unshared data size
 820        236  average unshared stack size
 821      28159  page reclaims
 822       9839  page faults
 823
 824 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
 825 how %seen is storing things its seen. So it is pathalogically bad on a 16M
 826 RAM machine, but it's going to help even on modern machines.
 827 Swapping is bad, m'kay :-)