ext/Encode/bin/enc2xs

   1 #!./perl
   2 BEGIN {
   3     # fiddle with @INC iff I am a part of perl dist
   4     if ($^X =~ m/\bminiperl$/o){
   5        warn "Fixing \@INC for perl core.\n";
   6        unshift @INC, qw(../../lib ../../../lib ../../../../lib);
   7        $ENV{PATH} .= ';../..;../../..;../../../..' if $^O eq 'MSWin32';
   8     }
   9 }
  10 use strict;
  11 use Getopt::Std;
  12 my @orig_ARGV = @ARGV;
  13 our $VERSION  = do { my @r = (q$Revision: 1.10 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  14
  15
  16 # These may get re-ordered.
  17 # RAW is a do_now as inserted by &enter
  18 # AGG is an aggreagated do_now, as built up by &process
  19 use constant {
  20   RAW_NEXT => 0,
  21   RAW_IN_LEN => 1,
  22   RAW_OUT_BYTES => 2,
  23   RAW_FALLBACK => 3,
  24
  25   AGG_MIN_IN => 0,
  26   AGG_MAX_IN => 1,
  27   AGG_OUT_BYTES => 2,
  28   AGG_NEXT => 3,
  29   AGG_IN_LEN => 4,
  30   AGG_OUT_LEN => 5,
  31   AGG_FALLBACK => 6,
  32 };
  33 # (See the algorithm in encengine.c - we're building structures for it)
  34
  35 # There are two sorts of structures.
  36 # "do_now" (an array, two variants of what needs storing) is whatever we need
  37 # to do now we've read an input byte.
  38 # It's housed in a "do_next" (which is how we got to it), and in turn points
  39 # to a "do_next" which contains all the "do_now"s for the next input byte.
  40
  41 # There will be a "do_next" which is the start state.
  42 # For a single byte encoding it's the only "do_next" - each "do_now" points
  43 # back to it, and each "do_now" will cause bytes. There is no state.
  44
  45 # For a multi-byte encoding where all characters in the input are the same
  46 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  47 # branching out from the start state, one step for each input byte.
  48 # The leaf "do_now"s will all be at the same distance from the start state,
  49 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  50 # the start state.
  51
  52 # For an encoding where there are varaible length input byte sequences, you
  53 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  54 # as before the leaves will point back to the start state.
  55
  56 # The system will cope with escape encodings (imagine them as a mostly
  57 # self-contained tree for each escape state, and cross links between trees
  58 # at the state-switching characters) but so far no input format defines these.
  59
  60 # The system will also cope with having output "leaves" in the middle of
  61 # the bifurcating branches, not just at the extremities, but again no
  62 # input format does this yet.
  63
  64 # There are two variants of the "do_now" structure. The first, smaller variant
  65 # is generated by &enter as the input file is read. There is one structure
  66 # for each input byte. Say we are mapping a single byte encoding to a
  67 # single byte encoding, with  "ABCD" going "abcd". There will be
  68 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  69
  70 # &process then walks the tree, building aggregate "do_now" structres for
  71 # adjacent bytes where possible. The aggregate is for a contiguous range of
  72 # bytes which each produce the same length of output, each move to the
  73 # same next state, and each have the same fallback flag.
  74 # So our 4 RAW "do_now"s above become replaced by a single structure
  75 # containing:
  76 # ["A", "D", "abcd", 1, ...]
  77 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  78 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  79 # which maps very nicely into pointer arithmetic in C for encengine.c
  80
  81 sub encode_U
  82 {
  83  # UTF-8 encode long hand - only covers part of perl's range
  84  ## my $uv = shift;
  85  # chr() works in native space so convert value from table
  86  # into that space before using chr().
  87  my $ch = chr(utf8::unicode_to_native($_[0]));
  88  # Now get core perl to encode that the way it likes.
  89  utf8::encode($ch);
  90  return $ch;
  91 }
  92
  93 sub encode_S
  94 {
  95  # encode single byte
  96  ## my ($ch,$page) = @_; return chr($ch);
  97  return chr $_[0];
  98 }
  99
 100 sub encode_D
 101 {
 102  # encode double byte MS byte first
 103  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 104  return chr ($_[1]) . chr $_[0];
 105 }
 106
 107 sub encode_M
 108 {
 109  # encode Multi-byte - single for 0..255 otherwise double
 110  ## my ($ch,$page) = @_;
 111  ## return &encode_D if $page;
 112  ## return &encode_S;
 113  return chr ($_[1]) . chr $_[0] if $_[1];
 114  return chr $_[0];
 115 }
 116
 117 my %encode_types = (U => \&encode_U,
 118                     S => \&encode_S,
 119                     D => \&encode_D,
 120                     M => \&encode_M,
 121                    );
 122
 123 # Win32 does not expand globs on command line
 124 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 125
 126 my %opt;
 127 # I think these are:
 128 # -Q to disable the duplicate codepoint test
 129 # -S make mapping errors fatal
 130 # -q to remove comments written to output files
 131 # -O to enable the (brute force) substring optimiser
 132 # -o <output> to specify the output file name (else it's the first arg)
 133 # -f <inlist> to give a file with a list of input files (else use the args)
 134 # -n <name> to name the encoding (else use the basename of the input file.
 135 getopts('M:SQqOo:f:n:',\%opt);
 136
 137 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 138
 139 # This really should go first, else the die here causes empty (non-erroneous)
 140 # output files to be written.
 141 my @encfiles;
 142 if (exists $opt{'f'}) {
 143     # -F is followed by name of file containing list of filenames
 144     my $flist = $opt{'f'};
 145     open(FLIST,$flist) || die "Cannot open $flist:$!";
 146     chomp(@encfiles = <FLIST>);
 147     close(FLIST);
 148 } else {
 149     @encfiles = @ARGV;
 150 }
 151
 152 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 153 chmod(0666,$cname) if -f $cname && !-w $cname;
 154 open(C,">$cname") || die "Cannot open $cname:$!";
 155
 156 my $dname = $cname;
 157 my $hname = $cname;
 158
 159 my ($doC,$doEnc,$doUcm,$doPet);
 160
 161 if ($cname =~ /\.(c|xs)$/)
 162  {
 163   $doC = 1;
 164   $dname =~ s/(\.[^\.]*)?$/.exh/;
 165   chmod(0666,$dname) if -f $cname && !-w $dname;
 166   open(D,">$dname") || die "Cannot open $dname:$!";
 167   $hname =~ s/(\.[^\.]*)?$/.h/;
 168   chmod(0666,$hname) if -f $cname && !-w $hname;
 169   open(H,">$hname") || die "Cannot open $hname:$!";
 170
 171   foreach my $fh (\*C,\*D,\*H)
 172   {
 173    print $fh <<"END" unless $opt{'q'};
 174 /*
 175  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 176  This file was autogenerated by:
 177  $^X $0 @orig_ARGV
 178 */
 179 END
 180   }
 181
 182   if ($cname =~ /(\w+)\.xs$/)
 183    {
 184     print C "#include <EXTERN.h>\n";
 185     print C "#include <perl.h>\n";
 186     print C "#include <XSUB.h>\n";
 187     print C "#define U8 U8\n";
 188    }
 189   print C "#include \"encode.h\"\n";
 190
 191  }
 192 elsif ($cname =~ /\.enc$/)
 193  {
 194   $doEnc = 1;
 195  }
 196 elsif ($cname =~ /\.ucm$/)
 197  {
 198   $doUcm = 1;
 199  }
 200 elsif ($cname =~ /\.pet$/)
 201  {
 202   $doPet = 1;
 203  }
 204
 205 my %encoding;
 206 my %strings;
 207 my $saved = 0;
 208 my $subsave = 0;
 209 my $strings = 0;
 210
 211 sub cmp_name
 212 {
 213  if ($a =~ /^.*-(\d+)/)
 214   {
 215    my $an = $1;
 216    if ($b =~ /^.*-(\d+)/)
 217     {
 218      my $r = $an <=> $1;
 219      return $r if $r;
 220     }
 221   }
 222  return $a cmp $b;
 223 }
 224
 225
 226 foreach my $enc (sort cmp_name @encfiles)
 227  {
 228   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 229   $name = $opt{'n'} if exists $opt{'n'};
 230   if (open(E,$enc))
 231    {
 232     if ($sfx eq 'enc')
 233      {
 234       compile_enc(\*E,lc($name));
 235      }
 236     else
 237      {
 238       compile_ucm(\*E,lc($name));
 239      }
 240    }
 241   else
 242    {
 243     warn "Cannot open $enc for $name:$!";
 244    }
 245  }
 246
 247 if ($doC)
 248  {
 249   print STDERR "Writing compiled form\n";
 250   foreach my $name (sort cmp_name keys %encoding)
 251    {
 252     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 253     output(\*C,$name.'_utf8',$e2u);
 254     output(\*C,'utf8_'.$name,$u2e);
 255     push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 256    }
 257   foreach my $enc (sort cmp_name keys %encoding)
 258    {
 259     my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 260     my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 261     my $sym = "${enc}_encoding";
 262     $sym =~ s/\W+/_/g;
 263     print C "encode_t $sym = \n";
 264     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 265    }
 266
 267   foreach my $enc (sort cmp_name keys %encoding)
 268    {
 269     my $sym = "${enc}_encoding";
 270     $sym =~ s/\W+/_/g;
 271     print H "extern encode_t $sym;\n";
 272     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 273    }
 274
 275   if ($cname =~ /(\w+)\.xs$/)
 276    {
 277     my $mod = $1;
 278     print C <<'END';
 279
 280 static void
 281 Encode_XSEncoding(pTHX_ encode_t *enc)
 282 {
 283  dSP;
 284  HV *stash = gv_stashpv("Encode::XS", TRUE);
 285  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 286  int i = 0;
 287  PUSHMARK(sp);
 288  XPUSHs(sv);
 289  while (enc->name[i])
 290   {
 291    const char *name = enc->name[i++];
 292    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 293   }
 294  PUTBACK;
 295  call_pv("Encode::define_encoding",G_DISCARD);
 296  SvREFCNT_dec(sv);
 297 }
 298
 299 END
 300
 301     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 302     print C "BOOT:\n{\n";
 303     print C "#include \"$dname\"\n";
 304     print C "}\n";
 305    }
 306   # Close in void context is bad, m'kay
 307   close(D) or warn "Error closing '$dname': $!";
 308   close(H) or warn "Error closing '$hname': $!";
 309
 310   my $perc_saved    = $strings/($strings + $saved) * 100;
 311   my $perc_subsaved = $strings/($strings + $subsave) * 100;
 312   printf STDERR "%d bytes in string tables\n",$strings;
 313   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 314     $saved, $perc_saved              if $saved;
 315   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 316     $subsave, $perc_subsaved         if $subsave;
 317  }
 318 elsif ($doEnc)
 319  {
 320   foreach my $name (sort cmp_name keys %encoding)
 321    {
 322     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 323     output_enc(\*C,$name,$e2u);
 324    }
 325  }
 326 elsif ($doUcm)
 327  {
 328   foreach my $name (sort cmp_name keys %encoding)
 329    {
 330     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 331     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 332    }
 333  }
 334
 335 # writing half meg files and then not checking to see if you just filled the
 336 # disk is bad, m'kay
 337 close(C) or die "Error closing '$cname': $!";
 338
 339 # End of the main program.
 340
 341 sub compile_ucm
 342 {
 343  my ($fh,$name) = @_;
 344  my $e2u = {};
 345  my $u2e = {};
 346  my $cs;
 347  my %attr;
 348  while (<$fh>)
 349   {
 350    s/#.*$//;
 351    last if /^\s*CHARMAP\s*$/i;
 352    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 353     {
 354      $attr{$1} = $2;
 355     }
 356   }
 357  if (!defined($cs =  $attr{'code_set_name'}))
 358   {
 359    warn "No <code_set_name> in $name\n";
 360   }
 361  else
 362   {
 363    $name = $cs unless exists $opt{'n'};
 364   }
 365  my $erep;
 366  my $urep;
 367  my $max_el;
 368  my $min_el;
 369  if (exists $attr{'subchar'})
 370   {
 371    my @byte;
 372    $attr{'subchar'} =~ /^\s*/cg;
 373    push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 374    $erep = join('',map(chr(hex($_)),@byte));
 375   }
 376  print "Reading $name ($cs)\n";
 377  my $nfb = 0;
 378  my $hfb = 0;
 379  while (<$fh>)
 380   {
 381    s/#.*$//;
 382    last if /^\s*END\s+CHARMAP\s*$/i;
 383    next if /^\s*$/;
 384    my ($u,@byte);
 385    my $fb = '';
 386    $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
 387    push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
 388    $fb = $1 if /\G\s*(\|[0-3])/gc;
 389    # warn "$_: $u @byte | $fb\n";
 390    die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
 391    if (defined($u))
 392     {
 393      my $uch = encode_U(hex($u));
 394      my $ech = join('',map(chr(hex($_)),@byte));
 395      my $el  = length($ech);
 396      $max_el = $el if (!defined($max_el) || $el > $max_el);
 397      $min_el = $el if (!defined($min_el) || $el < $min_el);
 398      if (length($fb))
 399       {
 400        $fb = substr($fb,1);
 401        $hfb++;
 402       }
 403      else
 404       {
 405        $nfb++;
 406        $fb = '0';
 407       }
 408      # $fb is fallback flag
 409      # 0 - round trip safe
 410      # 1 - fallback for unicode -> enc
 411      # 2 - skip sub-char mapping
 412      # 3 - fallback enc -> unicode
 413      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 414      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 415     }
 416    else
 417     {
 418      warn $_;
 419     }
 420   }
 421  if ($nfb && $hfb)
 422   {
 423    die "$nfb entries without fallback, $hfb entries with\n";
 424   }
 425  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 426 }
 427
 428
 429
 430 sub compile_enc
 431 {
 432  my ($fh,$name) = @_;
 433  my $e2u = {};
 434  my $u2e = {};
 435
 436  my $type;
 437  while ($type = <$fh>)
 438   {
 439    last if $type !~ /^\s*#/;
 440   }
 441  chomp($type);
 442  return if $type eq 'E';
 443  # Do the hash lookup once, rather than once per function call. 4% speedup.
 444  my $type_func = $encode_types{$type};
 445  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 446  warn "$type encoded $name\n";
 447  my $rep = '';
 448  # Save a defined test by setting these to defined values.
 449  my $min_el = ~0; # A very big integer
 450  my $max_el = 0;  # Anything must be longer than 0
 451  {
 452   my $v = hex($def);
 453   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 454  }
 455  my $errors;
 456  my $seen;
 457  # use -Q to silence the seen test. Makefile.PL uses this by default.
 458  $seen = {} unless $opt{Q};
 459  do
 460   {
 461    my $line = <$fh>;
 462    chomp($line);
 463    my $page = hex($line);
 464    my $ch = 0;
 465    my $i = 16;
 466    do
 467     {
 468      # So why is it 1% faster to leave the my here?
 469      my $line = <$fh>;
 470      $line =~ s/\r\n$/\n/;
 471      die "$.:${line}Line should be exactly 65 characters long including
 472      newline (".length($line).")" unless length ($line) == 65;
 473      # Split line into groups of 4 hex digits, convert groups to ints
 474      # This takes 65.35
 475      # map {hex $_} $line =~ /(....)/g
 476      # This takes 63.75 (2.5% less time)
 477      # unpack "n*", pack "H*", $line
 478      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 479      # Doing it as while ($line =~ /(....)/g) took 74.63
 480      foreach my $val (unpack "n*", pack "H*", $line)
 481       {
 482        next if $val == 0xFFFD;
 483        my $ech = &$type_func($ch,$page);
 484        if ($val || (!$ch && !$page))
 485         {
 486          my $el  = length($ech);
 487          $max_el = $el if $el > $max_el;
 488          $min_el = $el if $el < $min_el;
 489          my $uch = encode_U($val);
 490          if ($seen) {
 491            # We're doing the test.
 492            # We don't need to read this quickly, so storing it as a scalar,
 493            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 494            # RAM and may make us faster on low RAM systems. [see __END__]
 495            if (exists $seen->{$uch})
 496              {
 497                warn sprintf("U%04X is %02X%02X and %04X\n",
 498                             $val,$page,$ch,$seen->{$uch});
 499                $errors++;
 500              }
 501            else
 502              {
 503                $seen->{$uch} = $page << 8 | $ch;
 504              }
 505          }
 506          # Passing 2 extra args each time is 3.6% slower!
 507          # Even with having to add $fallback ||= 0 later
 508          enter_fb0($e2u,$ech,$uch);
 509          enter_fb0($u2e,$uch,$ech);
 510         }
 511        else
 512         {
 513          # No character at this position
 514          # enter($e2u,$ech,undef,$e2u);
 515         }
 516        $ch++;
 517       }
 518     } while --$i;
 519   } while --$pages;
 520  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 521    if $min_el > $max_el;
 522  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 523  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 524 }
 525
 526 # my ($a,$s,$d,$t,$fb) = @_;
 527 sub enter {
 528   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 529   # state we shift to after this (multibyte) input character defaults to same
 530   # as current state.
 531   $next ||= $current;
 532   # Making sure it is defined seems to be faster than {no warnings;} in
 533   # &process, or passing it in as 0 explicity.
 534   # XXX $fallback ||= 0;
 535
 536   # Start at the beginning and work forwards through the string to zero.
 537   # effectively we are removing 1 character from the front each time
 538   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 539   # Hence -$pos is the length of the remaining string.
 540   my $pos = -length $inbytes;
 541   while (1) {
 542     my $byte = substr $inbytes, $pos, 1;
 543     #  RAW_NEXT => 0,
 544     #  RAW_IN_LEN => 1,
 545     #  RAW_OUT_BYTES => 2,
 546     #  RAW_FALLBACK => 3,
 547     # to unicode an array would seem to be better, because the pages are dense.
 548     # from unicode can be very sparse, favouring a hash.
 549     # hash using the bytes (all length 1) as keys rather than ord value,
 550     # as it's easier to sort these in &process.
 551
 552     # It's faster to always add $fallback even if it's undef, rather than
 553     # choosing between 3 and 4 element array. (hence why we set it defined
 554     # above)
 555     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 556     # When $pos was -1 we were at the last input character.
 557     unless (++$pos) {
 558       $do_now->[RAW_OUT_BYTES] = $outbytes;
 559       $do_now->[RAW_NEXT] = $next;
 560       return;
 561     }
 562     # Tail recursion. The intermdiate state may not have a name yet.
 563     $current = $do_now->[RAW_NEXT];
 564   }
 565 }
 566
 567 # This is purely for optimistation. It's just &enter hard coded for $fallback
 568 # of 0, using only a 3 entry array ref to save memory for every entry.
 569 sub enter_fb0 {
 570   my ($current,$inbytes,$outbytes,$next) = @_;
 571   $next ||= $current;
 572
 573   my $pos = -length $inbytes;
 574   while (1) {
 575     my $byte = substr $inbytes, $pos, 1;
 576     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 577     unless (++$pos) {
 578       $do_now->[RAW_OUT_BYTES] = $outbytes;
 579       $do_now->[RAW_NEXT] = $next;
 580       return;
 581     }
 582     $current = $do_now->[RAW_NEXT];
 583   }
 584 }
 585
 586
 587 sub outstring
 588 {
 589  my ($fh,$name,$s) = @_;
 590  my $sym = $strings{$s};
 591  if ($sym)
 592   {
 593    $saved += length($s);
 594   }
 595  else
 596   {
 597    if ($opt{'O'}) {
 598        foreach my $o (keys %strings)
 599         {
 600          next unless (my $i = index($o,$s)) >= 0;
 601          $sym = $strings{$o};
 602          # gcc things that 0x0e+0x10 (anything with e+) starts to look like
 603          # a hexadecimal floating point constant. Silly gcc. Only p
 604          # introduces a floating point constant. Put the space in to stop it
 605          # getting confused.
 606          $sym .= sprintf(" +0x%02x",$i) if ($i);
 607          $subsave += length($s);
 608          return $strings{$s} = $sym;
 609        }
 610    }
 611    $strings{$s} = $sym = $name;
 612    $strings += length($s);
 613    my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
 614    # Maybe we should assert that these are all <256.
 615    $definition .= join(',',unpack "C*",$s);
 616    # We have a single long line. Split it at convenient commas.
 617    $definition =~ s/(.{74,77},)/$1\n/g;
 618    print $fh "$definition };\n\n";
 619   }
 620  return $sym;
 621 }
 622
 623 sub process
 624 {
 625   my ($name,$a) = @_;
 626   $name =~ s/\W+/_/g;
 627   $a->{Cname} = $name;
 628   my $raw = $a->{Raw};
 629   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 630   my @ent;
 631   $agg_max_in = 0;
 632   foreach my $key (sort keys %$raw) {
 633     #  RAW_NEXT => 0,
 634     #  RAW_IN_LEN => 1,
 635     #  RAW_OUT_BYTES => 2,
 636     #  RAW_FALLBACK => 3,
 637     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 638     # Now we are converting from raw to aggregate, switch from 1 byte strings
 639     # to numbers
 640     my $b = ord $key;
 641     $fallback ||= 0;
 642     if ($l &&
 643         # If this == fails, we're going to reset $agg_max_in below anyway.
 644         $b == ++$agg_max_in &&
 645         # References in numeric context give the pointer as an int.
 646         $agg_next == $next &&
 647         $agg_in_len == $in_len &&
 648         $agg_out_len == length $out_bytes &&
 649         $agg_fallback == $fallback
 650         # && length($l->[AGG_OUT_BYTES]) < 16
 651        ) {
 652       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 653       # we can aggregate this byte onto the end.
 654       $l->[AGG_MAX_IN] = $b;
 655       $l->[AGG_OUT_BYTES] .= $out_bytes;
 656     } else {
 657       # AGG_MIN_IN => 0,
 658       # AGG_MAX_IN => 1,
 659       # AGG_OUT_BYTES => 2,
 660       # AGG_NEXT => 3,
 661       # AGG_IN_LEN => 4,
 662       # AGG_OUT_LEN => 5,
 663       # AGG_FALLBACK => 6,
 664       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 665       # (only gains .6% on euc-jp  -- is it worth it?)
 666       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 667                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 668                        $agg_fallback = $fallback];
 669     }
 670     if (exists $next->{Cname}) {
 671       $next->{'Forward'} = 1 if $next != $a;
 672     } else {
 673       process(sprintf("%s_%02x",$name,$b),$next);
 674     }
 675   }
 676   # encengine.c rules say that last entry must be for 255
 677   if ($agg_max_in < 255) {
 678     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 679   }
 680   $a->{'Entries'} = \@ent;
 681 }
 682
 683 sub outtable
 684 {
 685  my ($fh,$a) = @_;
 686  my $name = $a->{'Cname'};
 687  # String tables
 688  foreach my $b (@{$a->{'Entries'}})
 689   {
 690    next unless $b->[AGG_OUT_LEN];
 691    my $s = $b->[AGG_MIN_IN];
 692    my $e = $b->[AGG_MAX_IN];
 693    outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
 694   }
 695  if ($a->{'Forward'})
 696   {
 697    print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 698   }
 699  $a->{'Done'} = 1;
 700  foreach my $b (@{$a->{'Entries'}})
 701   {
 702    my ($s,$e,$out,$t,$end,$l) = @$b;
 703    outtable($fh,$t) unless $t->{'Done'};
 704   }
 705  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 706  foreach my $b (@{$a->{'Entries'}})
 707   {
 708    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 709    $end |= 0x80 if $fb;
 710    print  $fh "{";
 711    if ($l)
 712     {
 713      printf $fh outstring($fh,'',$out);
 714     }
 715    else
 716     {
 717      print  $fh "0";
 718     }
 719    print  $fh ",",$t->{Cname};
 720    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 721   }
 722  print $fh "};\n";
 723 }
 724
 725 sub output
 726 {
 727  my ($fh,$name,$a) = @_;
 728  process($name,$a);
 729  # Sub-tables
 730  outtable($fh,$a);
 731 }
 732
 733 sub output_enc
 734 {
 735  my ($fh,$name,$a) = @_;
 736  die "Changed - fix me for new structure";
 737  foreach my $b (sort keys %$a)
 738   {
 739    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 740   }
 741 }
 742
 743 sub decode_U
 744 {
 745  my $s = shift;
 746 }
 747
 748 my @uname;
 749 sub char_names
 750 {
 751  my $s = do "unicore/Name.pl";
 752  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 753  pos($s) = 0;
 754  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 755   {
 756    my $name = $3;
 757    my $s = hex($1);
 758    last if $s >= 0x10000;
 759    my $e = length($2) ? hex($2) : $s;
 760    for (my $i = $s; $i <= $e; $i++)
 761     {
 762      $uname[$i] = $name;
 763 #    print sprintf("U%04X $name\n",$i);
 764     }
 765   }
 766 }
 767
 768 sub output_ucm_page
 769 {
 770   my ($cmap,$a,$t,$pre) = @_;
 771   # warn sprintf("Page %x\n",$pre);
 772   my $raw = $t->{Raw};
 773   foreach my $key (sort keys %$raw) {
 774     #  RAW_NEXT => 0,
 775     #  RAW_IN_LEN => 1,
 776     #  RAW_OUT_BYTES => 2,
 777     #  RAW_FALLBACK => 3,
 778     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 779     my $u = ord $key;
 780     $fallback ||= 0;
 781
 782     if ($next != $a && $next != $t) {
 783       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 784     } elsif (length $out_bytes) {
 785       if ($pre) {
 786         $u = $pre|($u &0x3f);
 787       }
 788       my $s = sprintf "<U%04X> ",$u;
 789       #foreach my $c (split(//,$out_bytes)) {
 790       #  $s .= sprintf "\\x%02X",ord($c);
 791       #}
 792       # 9.5% faster changing that loop to this:
 793       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 794       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 795       push(@$cmap,$s);
 796     } else {
 797       warn join(',',$u, @{$raw->{$key}},$a,$t);
 798     }
 799   }
 800 }
 801
 802 sub output_ucm
 803 {
 804  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 805  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 806  print $fh "<code_set_name> \"$name\"\n";
 807  char_names();
 808  if (defined $min_el)
 809   {
 810    print $fh "<mb_cur_min> $min_el\n";
 811   }
 812  if (defined $max_el)
 813   {
 814    print $fh "<mb_cur_max> $max_el\n";
 815   }
 816  if (defined $rep)
 817   {
 818    print $fh "<subchar> ";
 819    foreach my $c (split(//,$rep))
 820     {
 821      printf $fh "\\x%02X",ord($c);
 822     }
 823    print $fh "\n";
 824   }
 825  my @cmap;
 826  output_ucm_page(\@cmap,$h,$h,0);
 827  print $fh "#\nCHARMAP\n";
 828  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 829   {
 830    print $fh $line;
 831   }
 832  print $fh "END CHARMAP\n";
 833 }
 834
 835 use vars qw(
 836     $_Enc2xs
 837     $_Version
 838     $_Inc
 839     $_Name
 840     $_TableFiles
 841     $_Now
 842 );
 843
 844 sub make_makefile_pl
 845 {
 846     eval { require Encode; };
 847     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 848     # our used for variable expanstion
 849     $_Enc2xs = $0;
 850     $_Version = $VERSION;
 851     $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
 852     $_Name = shift;
 853     $_TableFiles = join(",", map {qq('$_')} @_);
 854     $_Now = scalar localtime();
 855     warn "Generating Makefile.PL\n";
 856     _print_expand("$_Inc/Makefile_PL.e2x", "Makefile.PL");
 857     warn "Generating $_Name.pm\n";
 858     _print_expand("$_Inc/_PM.e2x",         "$_Name.pm");
 859     warn "Generating t/$_Name.t\n";
 860     _print_expand("$_Inc/_T.e2x",          "t/$_Name.t");
 861     warn "Generating README\n";
 862     _print_expand("$_Inc/README.e2x",      "README");
 863     warn "Generating t/$_Name.t\n";
 864     _print_expand("$_Inc/Changes.e2x",     "Changes");
 865     exit;
 866 }
 867
 868 sub _print_expand{
 869     eval { require File::Basename; };
 870     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
 871     File::Basename->import();
 872     my ($src, $dst) = @_;
 873     open my $in, $src or die "$src : $!";
 874     if ((my $d = dirname($dst)) ne '.'){
 875         -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
 876     }
 877     open my $out, ">$dst" or die "$!";
 878     my $asis = 0;
 879     while (<$in>){
 880         if (/^#### END_OF_HEADER/){
 881             $asis = 1; next;
 882         }
 883         s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
 884         print $out $_;
 885     }
 886 }
 887 __END__
 888
 889 =head1 NAME
 890
 891 enc2xs -- Perl Encode Module Generator
 892
 893 =head1 SYNOPSIS
 894
 895   enc2xs -M ModName mapfiles...
 896   enc2xs -[options]
 897
 898 =head1 DESCRIPTION
 899
 900 F<enc2xs> builds a Perl extension for use by Encode from either
 901 Unicode Character Mapping files (.ucm) or Tcl Encoding Files
 902 (.enc)  Besides internally used during the build process of Encode
 903 module, you can use F<enc2xs> to add your own encoding to perl.  No
 904 knowledge on XS is necessary.
 905
 906 =head1 Quick Guide
 907
 908 If what you want to know as little about Perl possible but needs to
 909 add a new encoding, just read this chapter and forget the rest.
 910
 911 =over 4
 912
 913 =item 0.
 914
 915 Have a .ucm file ready.  You can get it from somewhere or you can
 916 write your own from scratch or you can grab one from Encode
 917 distribution and customize.  For UCM format, see the next Chapter.
 918 In the example below, I'll call my theoretical encoding myascii,
 919 defined inI<my.ucm>.  C<$> is a shell prompt.
 920
 921   $ ls -F
 922   my.ucm
 923
 924 =item 1.
 925
 926 Issue a command as follows;
 927
 928   $ enc2xs -M My my.ucm
 929   generating Makefile.PL
 930   generating My.pm
 931   generating README
 932   generating Changes
 933
 934 Now take a look at your current directory.  It should look like this.
 935
 936   $ ls -F
 937   Makefile.PL   My.pm         my.ucm        t/
 938
 939 The following files are created.
 940
 941   Makefle.PL - MakeMaker script
 942   My.pm      - Encode Submodule
 943   t/My.t     - test file
 944
 945 =item 1.1.
 946
 947 If you want *.ucm installed together with the modules, do as follows;
 948
 949   $ mkdir Encode
 950   $ mv *.ucm Encode
 951   $ enc2xs -M My Encode/*ucm
 952
 953 =item 2.
 954
 955 Edit the files generated.  You don't have to if you have no time AND no
 956 intention to give it to someone else.  But it is a good idea to edit
 957 pod and add more tests.
 958
 959 =item 3.
 960
 961 Now issue a command all Perl Mongers love;
 962
 963   $ perl5.7.3 Makefile.PL
 964   Writing Makefile for Encode::My
 965
 966 =item 4.
 967
 968 Now all you have to do is make.
 969
 970   $ make
 971   cp My.pm blib/lib/Encode/My.pm
 972   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
 973     -o encode_t.c -f encode_t.fnm
 974   Reading myascii (myascii)
 975   Writing compiled form
 976   128 bytes in string tables
 977   384 bytes (25%) saved spotting duplicates
 978   1 bytes (99.2%) saved using substrings
 979   ....
 980   chmod 644 blib/arch/auto/Encode/My/My.bs
 981   $
 982
 983 The time it takes varies how fast your machine is and how large your
 984 encoding is.  Unless you are working on something big like euc-tw, it
 985 won't take too long.
 986
 987 =item 5.
 988
 989 You can "make install" already but you should test first.
 990
 991   $ make test
 992   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
 993     -e 'use Test::Harness  qw(&runtests $verbose); \
 994     $verbose=0; runtests @ARGV;' t/*.t
 995   t/My....ok
 996   All tests successful.
 997   Files=1, Tests=2,  0 wallclock secs
 998    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
 999
1000 =item 6.
1001
1002 If you are content with the test result, just "make install"
1003
1004 =back
1005
1006 =head1 The Unicode Character Map
1007
1008 Encode uses The Unicode Character Map (UCM) for source character
1009 mappings.  This format is used by ICU package of IBM and adopted by
1010 Nick Ing-Simmons.  Since UCM is more flexible than Tcl's Encoding Map
1011 and far more user-friendly,  This is the recommended formet for
1012 Encode now.
1013
1014 UCM file looks like this.
1015
1016   #
1017   # Comments
1018   #
1019   <code_set_name> "US-ascii" # Required
1020   <code_set_alias> "ascii"   # Optional
1021   <mb_cur_min> 1             # Required; usually 1
1022   <mb_cur_max> 1             # Max. # of bytes/char
1023   <subchar> \x3F             # Substitution char
1024   #
1025   CHARMAP
1026   <U0000> \x00 |0 # <control>
1027   <U0001> \x01 |0 # <control>
1028   <U0002> \x02 |0 # <control>
1029   ....
1030   <U007C> \x7C |0 # VERTICAL LINE
1031   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1032   <U007E> \x7E |0 # TILDE
1033   <U007F> \x7F |0 # <control>
1034   END CHARMAP
1035
1036 =over 4
1037
1038 =item *
1039
1040 Anything that follows C<#> is treated as comments.
1041
1042 =item *
1043
1044 The header section continues until CHARMAP. This section Has a form of
1045 I<E<lt>keywordE<gt> value>, one at a line.  For a value, strings must
1046 be quoted. Barewords are treated as numbers.  I<\xXX> represents a
1047 byte.
1048
1049 Most of the keywords are self-explanatory. I<subchar> means
1050 substitution character, not subcharacter.  When you decode a Unicode
1051 sequence to this encoding but no matching character is found, the byte
1052 sequence defined here will be used.  For most cases, the value here is
1053 \x3F, in ASCII this is a question mark.
1054
1055 =item *
1056
1057 CHARMAP starts the character map section.  Each line has a form as
1058 follows;
1059
1060   <UXXXX> \xXX.. |0 # comment
1061     ^     ^      ^
1062     |     |      +- Fallback flag
1063     |     +-------- Encoded byte sequence
1064     +-------------- Unicode Character ID in hex
1065
1066 The format is roughly the same as a header section except for fallback
1067 flag.  It is | followed by 0..3.   And their meaning as follows
1068
1069 =over 2
1070
1071 =item |0
1072
1073 Round trip safe.   A character decoded to Unicode encodes back to the
1074 same byte sequence. most character belong to this.
1075
1076 =item |1
1077
1078 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1079 character for encode map only
1080
1081 =item |2
1082
1083 Skip sub-char mapping should there be no code point.
1084
1085 =item |3
1086
1087 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1088 character for decode map only
1089
1090 =back
1091
1092 =item *
1093
1094 And finally, END OF CHARMAP ends the section.
1095
1096 =back
1097
1098 Needless to say, if you are manually creating a UCM file, you should
1099 copy ascii.ucm or existing encoding which is close to yours than write
1100 your own from scratch.
1101
1102 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1103 is, unless your environment is on EBCDIC.
1104
1105 B<CAVEAT>: not all features in UCM are implemented.  For example,
1106 icu:state is not used.  Because of that, you need to write a perl
1107 module if you want to support algorithmical encodings, notablly
1108 ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1109 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1110
1111 =head1 Bookmarks
1112
1113 ICU Home Page
1114 L<http://oss.software.ibm.com/icu/>
1115
1116 ICU Character Mapping Tables
1117 L<http://oss.software.ibm.com/icu/charset/>
1118
1119 ICU:Conversion Data
1120 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1121
1122 =head1 SEE ALSO
1123
1124 L<Encode>,
1125 L<perlmod>,
1126 L<perlpod>
1127
1128 =cut
1129
1130 # -Q to disable the duplicate codepoint test
1131 # -S make mapping errors fatal
1132 # -q to remove comments written to output files
1133 # -O to enable the (brute force) substring optimiser
1134 # -o <output> to specify the output file name (else it's the first arg)
1135 # -f <inlist> to give a file with a list of input files (else use the args)
1136 # -n <name> to name the encoding (else use the basename of the input file.
1137
1138 With %seen holding array refs:
1139
1140       865.66 real        28.80 user         8.79 sys
1141       7904  maximum resident set size
1142       1356  average shared memory size
1143      18566  average unshared data size
1144        229  average unshared stack size
1145      46080  page reclaims
1146      33373  page faults
1147
1148 With %seen holding simple scalars:
1149
1150       342.16 real        27.11 user         3.54 sys
1151       8388  maximum resident set size
1152       1394  average shared memory size
1153      14969  average unshared data size
1154        236  average unshared stack size
1155      28159  page reclaims
1156       9839  page faults
1157
1158 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1159 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1160 RAM machine, but it's going to help even on modern machines.
1161 Swapping is bad, m'kay :-)