ext/Encode/bin/enc2xs

   1 #!./perl
   2 BEGIN {
   3     # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
   4     # with $ENV{PERL_CORE} set
   5     # In case we need it in future...
   6     require Config; import Config;
   7 }
   8 use strict;
   9 use Getopt::Std;
  10 my @orig_ARGV = @ARGV;
  11 our $VERSION  = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  12
  13 # These may get re-ordered.
  14 # RAW is a do_now as inserted by &enter
  15 # AGG is an aggreagated do_now, as built up by &process
  16
  17 use constant {
  18   RAW_NEXT => 0,
  19   RAW_IN_LEN => 1,
  20   RAW_OUT_BYTES => 2,
  21   RAW_FALLBACK => 3,
  22
  23   AGG_MIN_IN => 0,
  24   AGG_MAX_IN => 1,
  25   AGG_OUT_BYTES => 2,
  26   AGG_NEXT => 3,
  27   AGG_IN_LEN => 4,
  28   AGG_OUT_LEN => 5,
  29   AGG_FALLBACK => 6,
  30 };
  31
  32 # (See the algorithm in encengine.c - we're building structures for it)
  33
  34 # There are two sorts of structures.
  35 # "do_now" (an array, two variants of what needs storing) is whatever we need
  36 # to do now we've read an input byte.
  37 # It's housed in a "do_next" (which is how we got to it), and in turn points
  38 # to a "do_next" which contains all the "do_now"s for the next input byte.
  39
  40 # There will be a "do_next" which is the start state.
  41 # For a single byte encoding it's the only "do_next" - each "do_now" points
  42 # back to it, and each "do_now" will cause bytes. There is no state.
  43
  44 # For a multi-byte encoding where all characters in the input are the same
  45 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  46 # branching out from the start state, one step for each input byte.
  47 # The leaf "do_now"s will all be at the same distance from the start state,
  48 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  49 # the start state.
  50
  51 # For an encoding where there are varaible length input byte sequences, you
  52 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  53 # as before the leaves will point back to the start state.
  54
  55 # The system will cope with escape encodings (imagine them as a mostly
  56 # self-contained tree for each escape state, and cross links between trees
  57 # at the state-switching characters) but so far no input format defines these.
  58
  59 # The system will also cope with having output "leaves" in the middle of
  60 # the bifurcating branches, not just at the extremities, but again no
  61 # input format does this yet.
  62
  63 # There are two variants of the "do_now" structure. The first, smaller variant
  64 # is generated by &enter as the input file is read. There is one structure
  65 # for each input byte. Say we are mapping a single byte encoding to a
  66 # single byte encoding, with  "ABCD" going "abcd". There will be
  67 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  68
  69 # &process then walks the tree, building aggregate "do_now" structres for
  70 # adjacent bytes where possible. The aggregate is for a contiguous range of
  71 # bytes which each produce the same length of output, each move to the
  72 # same next state, and each have the same fallback flag.
  73 # So our 4 RAW "do_now"s above become replaced by a single structure
  74 # containing:
  75 # ["A", "D", "abcd", 1, ...]
  76 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  77 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  78 # which maps very nicely into pointer arithmetic in C for encengine.c
  79
  80 sub encode_U
  81 {
  82  # UTF-8 encode long hand - only covers part of perl's range
  83  ## my $uv = shift;
  84  # chr() works in native space so convert value from table
  85  # into that space before using chr().
  86  my $ch = chr(utf8::unicode_to_native($_[0]));
  87  # Now get core perl to encode that the way it likes.
  88  utf8::encode($ch);
  89  return $ch;
  90 }
  91
  92 sub encode_S
  93 {
  94  # encode single byte
  95  ## my ($ch,$page) = @_; return chr($ch);
  96  return chr $_[0];
  97 }
  98
  99 sub encode_D
 100 {
 101  # encode double byte MS byte first
 102  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 103  return chr ($_[1]) . chr $_[0];
 104 }
 105
 106 sub encode_M
 107 {
 108  # encode Multi-byte - single for 0..255 otherwise double
 109  ## my ($ch,$page) = @_;
 110  ## return &encode_D if $page;
 111  ## return &encode_S;
 112  return chr ($_[1]) . chr $_[0] if $_[1];
 113  return chr $_[0];
 114 }
 115
 116 my %encode_types = (U => \&encode_U,
 117                     S => \&encode_S,
 118                     D => \&encode_D,
 119                     M => \&encode_M,
 120                    );
 121
 122 # Win32 does not expand globs on command line
 123 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 124
 125 my %opt;
 126 # I think these are:
 127 # -Q to disable the duplicate codepoint test
 128 # -S make mapping errors fatal
 129 # -q to remove comments written to output files
 130 # -O to enable the (brute force) substring optimiser
 131 # -o <output> to specify the output file name (else it's the first arg)
 132 # -f <inlist> to give a file with a list of input files (else use the args)
 133 # -n <name> to name the encoding (else use the basename of the input file.
 134 getopts('CM:SQqOo:f:n:',\%opt);
 135
 136 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 137 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
 138
 139 # This really should go first, else the die here causes empty (non-erroneous)
 140 # output files to be written.
 141 my @encfiles;
 142 if (exists $opt{'f'}) {
 143     # -F is followed by name of file containing list of filenames
 144     my $flist = $opt{'f'};
 145     open(FLIST,$flist) || die "Cannot open $flist:$!";
 146     chomp(@encfiles = <FLIST>);
 147     close(FLIST);
 148 } else {
 149     @encfiles = @ARGV;
 150 }
 151
 152 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 153 chmod(0666,$cname) if -f $cname && !-w $cname;
 154 open(C,">$cname") || die "Cannot open $cname:$!";
 155
 156 my $dname = $cname;
 157 my $hname = $cname;
 158
 159 my ($doC,$doEnc,$doUcm,$doPet);
 160
 161 if ($cname =~ /\.(c|xs)$/)
 162  {
 163   $doC = 1;
 164   $dname =~ s/(\.[^\.]*)?$/.exh/;
 165   chmod(0666,$dname) if -f $cname && !-w $dname;
 166   open(D,">$dname") || die "Cannot open $dname:$!";
 167   $hname =~ s/(\.[^\.]*)?$/.h/;
 168   chmod(0666,$hname) if -f $cname && !-w $hname;
 169   open(H,">$hname") || die "Cannot open $hname:$!";
 170
 171   foreach my $fh (\*C,\*D,\*H)
 172   {
 173    print $fh <<"END" unless $opt{'q'};
 174 /*
 175  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 176  This file was autogenerated by:
 177  $^X $0 @orig_ARGV
 178 */
 179 END
 180   }
 181
 182   if ($cname =~ /(\w+)\.xs$/)
 183    {
 184     print C "#include <EXTERN.h>\n";
 185     print C "#include <perl.h>\n";
 186     print C "#include <XSUB.h>\n";
 187     print C "#define U8 U8\n";
 188    }
 189   print C "#include \"encode.h\"\n";
 190
 191  }
 192 elsif ($cname =~ /\.enc$/)
 193  {
 194   $doEnc = 1;
 195  }
 196 elsif ($cname =~ /\.ucm$/)
 197  {
 198   $doUcm = 1;
 199  }
 200 elsif ($cname =~ /\.pet$/)
 201  {
 202   $doPet = 1;
 203  }
 204
 205 my %encoding;
 206 my %strings;
 207 my $saved = 0;
 208 my $subsave = 0;
 209 my $strings = 0;
 210
 211 sub cmp_name
 212 {
 213  if ($a =~ /^.*-(\d+)/)
 214   {
 215    my $an = $1;
 216    if ($b =~ /^.*-(\d+)/)
 217     {
 218      my $r = $an <=> $1;
 219      return $r if $r;
 220     }
 221   }
 222  return $a cmp $b;
 223 }
 224
 225
 226 foreach my $enc (sort cmp_name @encfiles)
 227  {
 228   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 229   $name = $opt{'n'} if exists $opt{'n'};
 230   if (open(E,$enc))
 231    {
 232     if ($sfx eq 'enc')
 233      {
 234       compile_enc(\*E,lc($name));
 235      }
 236     else
 237      {
 238       compile_ucm(\*E,lc($name));
 239      }
 240    }
 241   else
 242    {
 243     warn "Cannot open $enc for $name:$!";
 244    }
 245  }
 246
 247 if ($doC)
 248  {
 249   print STDERR "Writing compiled form\n";
 250   foreach my $name (sort cmp_name keys %encoding)
 251    {
 252     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 253     output(\*C,$name.'_utf8',$e2u);
 254     output(\*C,'utf8_'.$name,$u2e);
 255     push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 256    }
 257   foreach my $enc (sort cmp_name keys %encoding)
 258    {
 259     my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 260     my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 261     my $sym = "${enc}_encoding";
 262     $sym =~ s/\W+/_/g;
 263     print C "encode_t $sym = \n";
 264     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 265    }
 266
 267   foreach my $enc (sort cmp_name keys %encoding)
 268    {
 269     my $sym = "${enc}_encoding";
 270     $sym =~ s/\W+/_/g;
 271     print H "extern encode_t $sym;\n";
 272     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 273    }
 274
 275   if ($cname =~ /(\w+)\.xs$/)
 276    {
 277     my $mod = $1;
 278     print C <<'END';
 279
 280 static void
 281 Encode_XSEncoding(pTHX_ encode_t *enc)
 282 {
 283  dSP;
 284  HV *stash = gv_stashpv("Encode::XS", TRUE);
 285  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 286  int i = 0;
 287  PUSHMARK(sp);
 288  XPUSHs(sv);
 289  while (enc->name[i])
 290   {
 291    const char *name = enc->name[i++];
 292    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 293   }
 294  PUTBACK;
 295  call_pv("Encode::define_encoding",G_DISCARD);
 296  SvREFCNT_dec(sv);
 297 }
 298
 299 END
 300
 301     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 302     print C "BOOT:\n{\n";
 303     print C "#include \"$dname\"\n";
 304     print C "}\n";
 305    }
 306   # Close in void context is bad, m'kay
 307   close(D) or warn "Error closing '$dname': $!";
 308   close(H) or warn "Error closing '$hname': $!";
 309
 310   my $perc_saved    = $strings/($strings + $saved) * 100;
 311   my $perc_subsaved = $strings/($strings + $subsave) * 100;
 312   printf STDERR "%d bytes in string tables\n",$strings;
 313   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 314     $saved, $perc_saved              if $saved;
 315   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 316     $subsave, $perc_subsaved         if $subsave;
 317  }
 318 elsif ($doEnc)
 319  {
 320   foreach my $name (sort cmp_name keys %encoding)
 321    {
 322     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 323     output_enc(\*C,$name,$e2u);
 324    }
 325  }
 326 elsif ($doUcm)
 327  {
 328   foreach my $name (sort cmp_name keys %encoding)
 329    {
 330     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 331     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 332    }
 333  }
 334
 335 # writing half meg files and then not checking to see if you just filled the
 336 # disk is bad, m'kay
 337 close(C) or die "Error closing '$cname': $!";
 338
 339 # End of the main program.
 340
 341 sub compile_ucm
 342 {
 343  my ($fh,$name) = @_;
 344  my $e2u = {};
 345  my $u2e = {};
 346  my $cs;
 347  my %attr;
 348  while (<$fh>)
 349   {
 350    s/#.*$//;
 351    last if /^\s*CHARMAP\s*$/i;
 352    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 353     {
 354      $attr{$1} = $2;
 355     }
 356   }
 357  if (!defined($cs =  $attr{'code_set_name'}))
 358   {
 359    warn "No <code_set_name> in $name\n";
 360   }
 361  else
 362   {
 363    $name = $cs unless exists $opt{'n'};
 364   }
 365  my $erep;
 366  my $urep;
 367  my $max_el;
 368  my $min_el;
 369  if (exists $attr{'subchar'})
 370   {
 371    my @byte;
 372    $attr{'subchar'} =~ /^\s*/cg;
 373    push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 374    $erep = join('',map(chr(hex($_)),@byte));
 375   }
 376  print "Reading $name ($cs)\n";
 377  my $nfb = 0;
 378  my $hfb = 0;
 379  while (<$fh>)
 380   {
 381    s/#.*$//;
 382    last if /^\s*END\s+CHARMAP\s*$/i;
 383    next if /^\s*$/;
 384    my (@uni, @byte) = ();
 385    my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
 386        or die "Bad line: $_";
 387    while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
 388        push @uni, map { substr($_, 1) } split(/\+/, $1);
 389    }
 390    while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
 391        push @byte, $1;
 392    }
 393    if (@uni)
 394     {
 395      my $uch =  join('', map { encode_U(hex($_)) } @uni );
 396      my $ech = join('',map(chr(hex($_)),@byte));
 397      my $el  = length($ech);
 398      $max_el = $el if (!defined($max_el) || $el > $max_el);
 399      $min_el = $el if (!defined($min_el) || $el < $min_el);
 400      if (length($fb))
 401       {
 402        $fb = substr($fb,1);
 403        $hfb++;
 404       }
 405      else
 406       {
 407        $nfb++;
 408        $fb = '0';
 409       }
 410      # $fb is fallback flag
 411      # 0 - round trip safe
 412      # 1 - fallback for unicode -> enc
 413      # 2 - skip sub-char mapping
 414      # 3 - fallback enc -> unicode
 415      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 416      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 417     }
 418    else
 419     {
 420      warn $_;
 421     }
 422   }
 423  if ($nfb && $hfb)
 424   {
 425    die "$nfb entries without fallback, $hfb entries with\n";
 426   }
 427  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 428 }
 429
 430
 431
 432 sub compile_enc
 433 {
 434  my ($fh,$name) = @_;
 435  my $e2u = {};
 436  my $u2e = {};
 437
 438  my $type;
 439  while ($type = <$fh>)
 440   {
 441    last if $type !~ /^\s*#/;
 442   }
 443  chomp($type);
 444  return if $type eq 'E';
 445  # Do the hash lookup once, rather than once per function call. 4% speedup.
 446  my $type_func = $encode_types{$type};
 447  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 448  warn "$type encoded $name\n";
 449  my $rep = '';
 450  # Save a defined test by setting these to defined values.
 451  my $min_el = ~0; # A very big integer
 452  my $max_el = 0;  # Anything must be longer than 0
 453  {
 454   my $v = hex($def);
 455   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 456  }
 457  my $errors;
 458  my $seen;
 459  # use -Q to silence the seen test. Makefile.PL uses this by default.
 460  $seen = {} unless $opt{Q};
 461  do
 462   {
 463    my $line = <$fh>;
 464    chomp($line);
 465    my $page = hex($line);
 466    my $ch = 0;
 467    my $i = 16;
 468    do
 469     {
 470      # So why is it 1% faster to leave the my here?
 471      my $line = <$fh>;
 472      $line =~ s/\r\n$/\n/;
 473      die "$.:${line}Line should be exactly 65 characters long including
 474      newline (".length($line).")" unless length ($line) == 65;
 475      # Split line into groups of 4 hex digits, convert groups to ints
 476      # This takes 65.35
 477      # map {hex $_} $line =~ /(....)/g
 478      # This takes 63.75 (2.5% less time)
 479      # unpack "n*", pack "H*", $line
 480      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 481      # Doing it as while ($line =~ /(....)/g) took 74.63
 482      foreach my $val (unpack "n*", pack "H*", $line)
 483       {
 484        next if $val == 0xFFFD;
 485        my $ech = &$type_func($ch,$page);
 486        if ($val || (!$ch && !$page))
 487         {
 488          my $el  = length($ech);
 489          $max_el = $el if $el > $max_el;
 490          $min_el = $el if $el < $min_el;
 491          my $uch = encode_U($val);
 492          if ($seen) {
 493            # We're doing the test.
 494            # We don't need to read this quickly, so storing it as a scalar,
 495            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 496            # RAM and may make us faster on low RAM systems. [see __END__]
 497            if (exists $seen->{$uch})
 498              {
 499                warn sprintf("U%04X is %02X%02X and %04X\n",
 500                             $val,$page,$ch,$seen->{$uch});
 501                $errors++;
 502              }
 503            else
 504              {
 505                $seen->{$uch} = $page << 8 | $ch;
 506              }
 507          }
 508          # Passing 2 extra args each time is 3.6% slower!
 509          # Even with having to add $fallback ||= 0 later
 510          enter_fb0($e2u,$ech,$uch);
 511          enter_fb0($u2e,$uch,$ech);
 512         }
 513        else
 514         {
 515          # No character at this position
 516          # enter($e2u,$ech,undef,$e2u);
 517         }
 518        $ch++;
 519       }
 520     } while --$i;
 521   } while --$pages;
 522  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 523    if $min_el > $max_el;
 524  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 525  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 526 }
 527
 528 # my ($a,$s,$d,$t,$fb) = @_;
 529 sub enter {
 530   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 531   # state we shift to after this (multibyte) input character defaults to same
 532   # as current state.
 533   $next ||= $current;
 534   # Making sure it is defined seems to be faster than {no warnings;} in
 535   # &process, or passing it in as 0 explicity.
 536   # XXX $fallback ||= 0;
 537
 538   # Start at the beginning and work forwards through the string to zero.
 539   # effectively we are removing 1 character from the front each time
 540   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 541   # Hence -$pos is the length of the remaining string.
 542   my $pos = -length $inbytes;
 543   while (1) {
 544     my $byte = substr $inbytes, $pos, 1;
 545     #  RAW_NEXT => 0,
 546     #  RAW_IN_LEN => 1,
 547     #  RAW_OUT_BYTES => 2,
 548     #  RAW_FALLBACK => 3,
 549     # to unicode an array would seem to be better, because the pages are dense.
 550     # from unicode can be very sparse, favouring a hash.
 551     # hash using the bytes (all length 1) as keys rather than ord value,
 552     # as it's easier to sort these in &process.
 553
 554     # It's faster to always add $fallback even if it's undef, rather than
 555     # choosing between 3 and 4 element array. (hence why we set it defined
 556     # above)
 557     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 558     # When $pos was -1 we were at the last input character.
 559     unless (++$pos) {
 560       $do_now->[RAW_OUT_BYTES] = $outbytes;
 561       $do_now->[RAW_NEXT] = $next;
 562       return;
 563     }
 564     # Tail recursion. The intermdiate state may not have a name yet.
 565     $current = $do_now->[RAW_NEXT];
 566   }
 567 }
 568
 569 # This is purely for optimistation. It's just &enter hard coded for $fallback
 570 # of 0, using only a 3 entry array ref to save memory for every entry.
 571 sub enter_fb0 {
 572   my ($current,$inbytes,$outbytes,$next) = @_;
 573   $next ||= $current;
 574
 575   my $pos = -length $inbytes;
 576   while (1) {
 577     my $byte = substr $inbytes, $pos, 1;
 578     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 579     unless (++$pos) {
 580       $do_now->[RAW_OUT_BYTES] = $outbytes;
 581       $do_now->[RAW_NEXT] = $next;
 582       return;
 583     }
 584     $current = $do_now->[RAW_NEXT];
 585   }
 586 }
 587
 588
 589 sub outstring
 590 {
 591  my ($fh,$name,$s) = @_;
 592  my $sym = $strings{$s};
 593  if ($sym)
 594   {
 595    $saved += length($s);
 596   }
 597  else
 598   {
 599    if ($opt{'O'}) {
 600        foreach my $o (keys %strings)
 601         {
 602          next unless (my $i = index($o,$s)) >= 0;
 603          $sym = $strings{$o};
 604          # gcc things that 0x0e+0x10 (anything with e+) starts to look like
 605          # a hexadecimal floating point constant. Silly gcc. Only p
 606          # introduces a floating point constant. Put the space in to stop it
 607          # getting confused.
 608          $sym .= sprintf(" +0x%02x",$i) if ($i);
 609          $subsave += length($s);
 610          return $strings{$s} = $sym;
 611        }
 612    }
 613    $strings{$s} = $sym = $name;
 614    $strings += length($s);
 615    my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
 616    # Maybe we should assert that these are all <256.
 617    $definition .= join(',',unpack "C*",$s);
 618    # We have a single long line. Split it at convenient commas.
 619    $definition =~ s/(.{74,77},)/$1\n/g;
 620    print $fh "$definition };\n\n";
 621   }
 622  return $sym;
 623 }
 624
 625 sub process
 626 {
 627   my ($name,$a) = @_;
 628   $name =~ s/\W+/_/g;
 629   $a->{Cname} = $name;
 630   my $raw = $a->{Raw};
 631   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 632   my @ent;
 633   $agg_max_in = 0;
 634   foreach my $key (sort keys %$raw) {
 635     #  RAW_NEXT => 0,
 636     #  RAW_IN_LEN => 1,
 637     #  RAW_OUT_BYTES => 2,
 638     #  RAW_FALLBACK => 3,
 639     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 640     # Now we are converting from raw to aggregate, switch from 1 byte strings
 641     # to numbers
 642     my $b = ord $key;
 643     $fallback ||= 0;
 644     if ($l &&
 645         # If this == fails, we're going to reset $agg_max_in below anyway.
 646         $b == ++$agg_max_in &&
 647         # References in numeric context give the pointer as an int.
 648         $agg_next == $next &&
 649         $agg_in_len == $in_len &&
 650         $agg_out_len == length $out_bytes &&
 651         $agg_fallback == $fallback
 652         # && length($l->[AGG_OUT_BYTES]) < 16
 653        ) {
 654       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 655       # we can aggregate this byte onto the end.
 656       $l->[AGG_MAX_IN] = $b;
 657       $l->[AGG_OUT_BYTES] .= $out_bytes;
 658     } else {
 659       # AGG_MIN_IN => 0,
 660       # AGG_MAX_IN => 1,
 661       # AGG_OUT_BYTES => 2,
 662       # AGG_NEXT => 3,
 663       # AGG_IN_LEN => 4,
 664       # AGG_OUT_LEN => 5,
 665       # AGG_FALLBACK => 6,
 666       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 667       # (only gains .6% on euc-jp  -- is it worth it?)
 668       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 669                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 670                        $agg_fallback = $fallback];
 671     }
 672     if (exists $next->{Cname}) {
 673       $next->{'Forward'} = 1 if $next != $a;
 674     } else {
 675       process(sprintf("%s_%02x",$name,$b),$next);
 676     }
 677   }
 678   # encengine.c rules say that last entry must be for 255
 679   if ($agg_max_in < 255) {
 680     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 681   }
 682   $a->{'Entries'} = \@ent;
 683 }
 684
 685 sub outtable
 686 {
 687  my ($fh,$a) = @_;
 688  my $name = $a->{'Cname'};
 689  # String tables
 690  foreach my $b (@{$a->{'Entries'}})
 691   {
 692    next unless $b->[AGG_OUT_LEN];
 693    my $s = $b->[AGG_MIN_IN];
 694    my $e = $b->[AGG_MAX_IN];
 695    outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
 696   }
 697  if ($a->{'Forward'})
 698   {
 699    print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 700   }
 701  $a->{'Done'} = 1;
 702  foreach my $b (@{$a->{'Entries'}})
 703   {
 704    my ($s,$e,$out,$t,$end,$l) = @$b;
 705    outtable($fh,$t) unless $t->{'Done'};
 706   }
 707  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 708  foreach my $b (@{$a->{'Entries'}})
 709   {
 710    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 711    $end |= 0x80 if $fb;
 712    print  $fh "{";
 713    if ($l)
 714     {
 715      printf $fh outstring($fh,'',$out);
 716     }
 717    else
 718     {
 719      print  $fh "0";
 720     }
 721    print  $fh ",",$t->{Cname};
 722    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 723   }
 724  print $fh "};\n";
 725 }
 726
 727 sub output
 728 {
 729  my ($fh,$name,$a) = @_;
 730  process($name,$a);
 731  # Sub-tables
 732  outtable($fh,$a);
 733 }
 734
 735 sub output_enc
 736 {
 737  my ($fh,$name,$a) = @_;
 738  die "Changed - fix me for new structure";
 739  foreach my $b (sort keys %$a)
 740   {
 741    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 742   }
 743 }
 744
 745 sub decode_U
 746 {
 747  my $s = shift;
 748 }
 749
 750 my @uname;
 751 sub char_names
 752 {
 753  my $s = do "unicore/Name.pl";
 754  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 755  pos($s) = 0;
 756  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 757   {
 758    my $name = $3;
 759    my $s = hex($1);
 760    last if $s >= 0x10000;
 761    my $e = length($2) ? hex($2) : $s;
 762    for (my $i = $s; $i <= $e; $i++)
 763     {
 764      $uname[$i] = $name;
 765 #    print sprintf("U%04X $name\n",$i);
 766     }
 767   }
 768 }
 769
 770 sub output_ucm_page
 771 {
 772   my ($cmap,$a,$t,$pre) = @_;
 773   # warn sprintf("Page %x\n",$pre);
 774   my $raw = $t->{Raw};
 775   foreach my $key (sort keys %$raw) {
 776     #  RAW_NEXT => 0,
 777     #  RAW_IN_LEN => 1,
 778     #  RAW_OUT_BYTES => 2,
 779     #  RAW_FALLBACK => 3,
 780     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 781     my $u = ord $key;
 782     $fallback ||= 0;
 783
 784     if ($next != $a && $next != $t) {
 785       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 786     } elsif (length $out_bytes) {
 787       if ($pre) {
 788         $u = $pre|($u &0x3f);
 789       }
 790       my $s = sprintf "<U%04X> ",$u;
 791       #foreach my $c (split(//,$out_bytes)) {
 792       #  $s .= sprintf "\\x%02X",ord($c);
 793       #}
 794       # 9.5% faster changing that loop to this:
 795       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 796       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 797       push(@$cmap,$s);
 798     } else {
 799       warn join(',',$u, @{$raw->{$key}},$a,$t);
 800     }
 801   }
 802 }
 803
 804 sub output_ucm
 805 {
 806  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 807  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 808  print $fh "<code_set_name> \"$name\"\n";
 809  char_names();
 810  if (defined $min_el)
 811   {
 812    print $fh "<mb_cur_min> $min_el\n";
 813   }
 814  if (defined $max_el)
 815   {
 816    print $fh "<mb_cur_max> $max_el\n";
 817   }
 818  if (defined $rep)
 819   {
 820    print $fh "<subchar> ";
 821    foreach my $c (split(//,$rep))
 822     {
 823      printf $fh "\\x%02X",ord($c);
 824     }
 825    print $fh "\n";
 826   }
 827  my @cmap;
 828  output_ucm_page(\@cmap,$h,$h,0);
 829  print $fh "#\nCHARMAP\n";
 830  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 831   {
 832    print $fh $line;
 833   }
 834  print $fh "END CHARMAP\n";
 835 }
 836
 837 use vars qw(
 838     $_Enc2xs
 839     $_Version
 840     $_Inc
 841     $_Name
 842     $_TableFiles
 843     $_Now
 844 );
 845
 846 sub make_makefile_pl
 847 {
 848     eval { require Encode; };
 849     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 850     # our used for variable expanstion
 851     $_Enc2xs = $0;
 852     $_Version = $VERSION;
 853     $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
 854     $_Name = shift;
 855     $_TableFiles = join(",", map {qq('$_')} @_);
 856     $_Now = scalar localtime();
 857     eval { require File::Spec; };
 858     warn "Generating Makefile.PL\n";
 859     _print_expand(File::Spec->catfile($_Inc,"Makefile_PL.e2x"),"Makefile.PL");
 860     warn "Generating $_Name.pm\n";
 861     _print_expand(File::Spec->catfile($_Inc,"_PM.e2x"),        "$_Name.pm");
 862     warn "Generating t/$_Name.t\n";
 863     _print_expand(File::Spec->catfile($_Inc,"_T.e2x"),         "t/$_Name.t");
 864     warn "Generating README\n";
 865     _print_expand(File::Spec->catfile($_Inc,"README.e2x"),     "README");
 866     warn "Generating t/$_Name.t\n";
 867     _print_expand(File::Spec->catfile($_Inc,"Changes.e2x"),    "Changes");
 868     exit;
 869 }
 870
 871 use vars qw(
 872             $_ModLines
 873             $_LocalVer
 874             );
 875
 876 sub make_configlocal_pm
 877 {
 878     eval { require Encode; };
 879     $@ and die "Unable to require Encode: $@\n";
 880     eval { require File::Spec; };
 881     # our used for variable expanstion
 882     my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
 883     my %LocalMod = ();
 884     for my $d (@INC){
 885         my $inc = File::Spec->catfile($d, "Encode");
 886         -d $inc or next;
 887         opendir my $dh, $inc or die "$inc:$!";
 888         warn "Checking $inc...\n";
 889         for my $f (grep /\.pm$/o, readdir($dh)){
 890             -f File::Spec->catfile($inc, "$f") or next;
 891             $INC{"Encode/$f"} and next;
 892             warn "require Encode/$f;\n";
 893             eval { require "Encode/$f"; };
 894             $@ and die "Can't require Encode/$f: $@\n";
 895             for my $enc (Encode->encodings()){
 896                 $in_core{$enc} and next;
 897                 $Encode::Config::ExtModule{$enc} and next;
 898                 my $mod = "Encode/$f";
 899                 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
 900                 warn "$enc => $mod\n";
 901                 $LocalMod{$enc} = $mod;
 902             }
 903         }
 904     }
 905     $_ModLines = "";
 906     for my $enc (sort keys %LocalMod){
 907         $_ModLines .=
 908             qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
 909     }
 910     $_LocalVer = _mkversion();
 911     $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
 912     warn "Writing Encode::ConfigLocal\n";
 913     _print_expand(File::Spec->catfile($_Inc,"ConfigLocal_PM.e2x"),
 914                   File::Spec->catfile($_Inc,"ConfigLocal.pm"));
 915     exit;
 916 }
 917
 918 sub _mkversion{
 919     my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
 920     $yyyy += 1900, $mo +=1;
 921     return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
 922 }
 923
 924 sub _print_expand{
 925     eval { require File::Basename; };
 926     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
 927     File::Basename->import();
 928     my ($src, $dst) = @_;
 929     open my $in, $src or die "$src : $!";
 930     if ((my $d = dirname($dst)) ne '.'){
 931         -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
 932     }
 933     open my $out, ">$dst" or die "$!";
 934     my $asis = 0;
 935     while (<$in>){
 936         if (/^#### END_OF_HEADER/){
 937             $asis = 1; next;
 938         }
 939         s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
 940         print $out $_;
 941     }
 942 }
 943 __END__
 944
 945 =head1 NAME
 946
 947 enc2xs -- Perl Encode Module Generator
 948
 949 =head1 SYNOPSIS
 950
 951   enc2xs -[options]
 952   enc2xs -M ModName mapfiles...
 953   enc2xs -C
 954
 955 =head1 DESCRIPTION
 956
 957 F<enc2xs> builds a Perl extension for use by Encode from either
 958 Unicode Character Mapping files (.ucm) or Tcl Encoding Files
 959 (.enc)  Besides internally used during the build process of Encode
 960 module, you can use F<enc2xs> to add your own encoding to perl.  No
 961 knowledge on XS is necessary.
 962
 963 =head1 Quick Guide
 964
 965 If what you want to know as little about Perl possible but needs to
 966 add a new encoding, just read this chapter and forget the rest.
 967
 968 =over 4
 969
 970 =item 0.
 971
 972 Have a .ucm file ready.  You can get it from somewhere or you can
 973 write your own from scratch or you can grab one from Encode
 974 distribution and customize.  For UCM format, see the next Chapter.
 975 In the example below, I'll call my theoretical encoding myascii,
 976 defined inI<my.ucm>.  C<$> is a shell prompt.
 977
 978   $ ls -F
 979   my.ucm
 980
 981 =item 1.
 982
 983 Issue a command as follows;
 984
 985   $ enc2xs -M My my.ucm
 986   generating Makefile.PL
 987   generating My.pm
 988   generating README
 989   generating Changes
 990
 991 Now take a look at your current directory.  It should look like this.
 992
 993   $ ls -F
 994   Makefile.PL   My.pm         my.ucm        t/
 995
 996 The following files are created.
 997
 998   Makefle.PL - MakeMaker script
 999   My.pm      - Encode Submodule
1000   t/My.t     - test file
1001
1002 =item 1.1.
1003
1004 If you want *.ucm installed together with the modules, do as follows;
1005
1006   $ mkdir Encode
1007   $ mv *.ucm Encode
1008   $ enc2xs -M My Encode/*ucm
1009
1010 =item 2.
1011
1012 Edit the files generated.  You don't have to if you have no time AND no
1013 intention to give it to someone else.  But it is a good idea to edit
1014 pod and add more tests.
1015
1016 =item 3.
1017
1018 Now issue a command all Perl Mongers love;
1019
1020   $ perl5.7.3 Makefile.PL
1021   Writing Makefile for Encode::My
1022
1023 =item 4.
1024
1025 Now all you have to do is make.
1026
1027   $ make
1028   cp My.pm blib/lib/Encode/My.pm
1029   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1030     -o encode_t.c -f encode_t.fnm
1031   Reading myascii (myascii)
1032   Writing compiled form
1033   128 bytes in string tables
1034   384 bytes (25%) saved spotting duplicates
1035   1 bytes (99.2%) saved using substrings
1036   ....
1037   chmod 644 blib/arch/auto/Encode/My/My.bs
1038   $
1039
1040 The time it takes varies how fast your machine is and how large your
1041 encoding is.  Unless you are working on something big like euc-tw, it
1042 won't take too long.
1043
1044 =item 5.
1045
1046 You can "make install" already but you should test first.
1047
1048   $ make test
1049   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1050     -e 'use Test::Harness  qw(&runtests $verbose); \
1051     $verbose=0; runtests @ARGV;' t/*.t
1052   t/My....ok
1053   All tests successful.
1054   Files=1, Tests=2,  0 wallclock secs
1055    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1056
1057 =item 6.
1058
1059 If you are content with the test result, just "make install"
1060
1061 =item 7.
1062
1063 If you want to add your encoding to Encode demand-loading list
1064 (so you don't have to "use Encode::YourEncoding"), run
1065
1066   enc2xs -C
1067
1068 to update Encode::ConfigLocal, a module that controls local settings.
1069 After that, "use Encode;" is enough to load your encodings on demand.
1070
1071 =back
1072
1073 =head1 The Unicode Character Map
1074
1075 Encode uses The Unicode Character Map (UCM) for source character
1076 mappings.  This format is used by ICU package of IBM and adopted by
1077 Nick Ing-Simmons.  Since UCM is more flexible than Tcl's Encoding Map
1078 and far more user-friendly,  This is the recommended formet for
1079 Encode now.
1080
1081 UCM file looks like this.
1082
1083   #
1084   # Comments
1085   #
1086   <code_set_name> "US-ascii" # Required
1087   <code_set_alias> "ascii"   # Optional
1088   <mb_cur_min> 1             # Required; usually 1
1089   <mb_cur_max> 1             # Max. # of bytes/char
1090   <subchar> \x3F             # Substitution char
1091   #
1092   CHARMAP
1093   <U0000> \x00 |0 # <control>
1094   <U0001> \x01 |0 # <control>
1095   <U0002> \x02 |0 # <control>
1096   ....
1097   <U007C> \x7C |0 # VERTICAL LINE
1098   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1099   <U007E> \x7E |0 # TILDE
1100   <U007F> \x7F |0 # <control>
1101   END CHARMAP
1102
1103 =over 4
1104
1105 =item *
1106
1107 Anything that follows C<#> is treated as comments.
1108
1109 =item *
1110
1111 The header section continues until CHARMAP. This section Has a form of
1112 I<E<lt>keywordE<gt> value>, one at a line.  For a value, strings must
1113 be quoted. Barewords are treated as numbers.  I<\xXX> represents a
1114 byte.
1115
1116 Most of the keywords are self-explanatory. I<subchar> means
1117 substitution character, not subcharacter.  When you decode a Unicode
1118 sequence to this encoding but no matching character is found, the byte
1119 sequence defined here will be used.  For most cases, the value here is
1120 \x3F, in ASCII this is a question mark.
1121
1122 =item *
1123
1124 CHARMAP starts the character map section.  Each line has a form as
1125 follows;
1126
1127   <UXXXX> \xXX.. |0 # comment
1128     ^     ^      ^
1129     |     |      +- Fallback flag
1130     |     +-------- Encoded byte sequence
1131     +-------------- Unicode Character ID in hex
1132
1133 The format is roughly the same as a header section except for fallback
1134 flag.  It is | followed by 0..3.   And their meaning as follows
1135
1136 =over 2
1137
1138 =item |0
1139
1140 Round trip safe.   A character decoded to Unicode encodes back to the
1141 same byte sequence. most character belong to this.
1142
1143 =item |1
1144
1145 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1146 character for encode map only
1147
1148 =item |2
1149
1150 Skip sub-char mapping should there be no code point.
1151
1152 =item |3
1153
1154 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1155 character for decode map only
1156
1157 =back
1158
1159 =item *
1160
1161 And finally, END OF CHARMAP ends the section.
1162
1163 =back
1164
1165 Needless to say, if you are manually creating a UCM file, you should
1166 copy ascii.ucm or existing encoding which is close to yours than write
1167 your own from scratch.
1168
1169 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1170 is, unless your environment is on EBCDIC.
1171
1172 B<CAVEAT>: not all features in UCM are implemented.  For example,
1173 icu:state is not used.  Because of that, you need to write a perl
1174 module if you want to support algorithmical encodings, notablly
1175 ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1176 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1177
1178 =head1 Bookmarks
1179
1180 ICU Home Page
1181 L<http://oss.software.ibm.com/icu/>
1182
1183 ICU Character Mapping Tables
1184 L<http://oss.software.ibm.com/icu/charset/>
1185
1186 ICU:Conversion Data
1187 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1188
1189 =head1 SEE ALSO
1190
1191 L<Encode>,
1192 L<perlmod>,
1193 L<perlpod>
1194
1195 =cut
1196
1197 # -Q to disable the duplicate codepoint test
1198 # -S make mapping errors fatal
1199 # -q to remove comments written to output files
1200 # -O to enable the (brute force) substring optimiser
1201 # -o <output> to specify the output file name (else it's the first arg)
1202 # -f <inlist> to give a file with a list of input files (else use the args)
1203 # -n <name> to name the encoding (else use the basename of the input file.
1204
1205 With %seen holding array refs:
1206
1207       865.66 real        28.80 user         8.79 sys
1208       7904  maximum resident set size
1209       1356  average shared memory size
1210      18566  average unshared data size
1211        229  average unshared stack size
1212      46080  page reclaims
1213      33373  page faults
1214
1215 With %seen holding simple scalars:
1216
1217       342.16 real        27.11 user         3.54 sys
1218       8388  maximum resident set size
1219       1394  average shared memory size
1220      14969  average unshared data size
1221        236  average unshared stack size
1222      28159  page reclaims
1223       9839  page faults
1224
1225 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1226 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1227 RAM machine, but it's going to help even on modern machines.
1228 Swapping is bad, m'kay :-)