ext/Encode/bin/enc2xs

   1 #!../../../perl -w
   2 BEGIN {
   3     unshift @INC, qw(../../lib ../../../lib ../../../../lib);
   4     $ENV{PATH} .= ';../..;../../..;../../../..' if $^O eq 'MSWin32';
   5 }
   6 use strict;
   7 use Getopt::Std;
   8 my @orig_ARGV = @ARGV;
   9 our $VERSION  = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  10
  11
  12 # These may get re-ordered.
  13 # RAW is a do_now as inserted by &enter
  14 # AGG is an aggreagated do_now, as built up by &process
  15 use constant {
  16   RAW_NEXT => 0,
  17   RAW_IN_LEN => 1,
  18   RAW_OUT_BYTES => 2,
  19   RAW_FALLBACK => 3,
  20
  21   AGG_MIN_IN => 0,
  22   AGG_MAX_IN => 1,
  23   AGG_OUT_BYTES => 2,
  24   AGG_NEXT => 3,
  25   AGG_IN_LEN => 4,
  26   AGG_OUT_LEN => 5,
  27   AGG_FALLBACK => 6,
  28 };
  29 # (See the algorithm in encengine.c - we're building structures for it)
  30
  31 # There are two sorts of structures.
  32 # "do_now" (an array, two variants of what needs storing) is whatever we need
  33 # to do now we've read an input byte.
  34 # It's housed in a "do_next" (which is how we got to it), and in turn points
  35 # to a "do_next" which contains all the "do_now"s for the next input byte.
  36
  37 # There will be a "do_next" which is the start state.
  38 # For a single byte encoding it's the only "do_next" - each "do_now" points
  39 # back to it, and each "do_now" will cause bytes. There is no state.
  40
  41 # For a multi-byte encoding where all characters in the input are the same
  42 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  43 # branching out from the start state, one step for each input byte.
  44 # The leaf "do_now"s will all be at the same distance from the start state,
  45 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  46 # the start state.
  47
  48 # For an encoding where there are varaible length input byte sequences, you
  49 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  50 # as before the leaves will point back to the start state.
  51
  52 # The system will cope with escape encodings (imagine them as a mostly
  53 # self-contained tree for each escape state, and cross links between trees
  54 # at the state-switching characters) but so far no input format defines these.
  55
  56 # The system will also cope with having output "leaves" in the middle of
  57 # the bifurcating branches, not just at the extremities, but again no
  58 # input format does this yet.
  59
  60 # There are two variants of the "do_now" structure. The first, smaller variant
  61 # is generated by &enter as the input file is read. There is one structure
  62 # for each input byte. Say we are mapping a single byte encoding to a
  63 # single byte encoding, with  "ABCD" going "abcd". There will be
  64 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  65
  66 # &process then walks the tree, building aggregate "do_now" structres for
  67 # adjacent bytes where possible. The aggregate is for a contiguous range of
  68 # bytes which each produce the same length of output, each move to the
  69 # same next state, and each have the same fallback flag.
  70 # So our 4 RAW "do_now"s above become replaced by a single structure
  71 # containing:
  72 # ["A", "D", "abcd", 1, ...]
  73 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  74 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  75 # which maps very nicely into pointer arithmetic in C for encengine.c
  76
  77 sub encode_U
  78 {
  79  # UTF-8 encode long hand - only covers part of perl's range
  80  ## my $uv = shift;
  81  # chr() works in native space so convert value from table
  82  # into that space before using chr().
  83  my $ch = chr(utf8::unicode_to_native($_[0]));
  84  # Now get core perl to encode that the way it likes.
  85  utf8::encode($ch);
  86  return $ch;
  87 }
  88
  89 sub encode_S
  90 {
  91  # encode single byte
  92  ## my ($ch,$page) = @_; return chr($ch);
  93  return chr $_[0];
  94 }
  95
  96 sub encode_D
  97 {
  98  # encode double byte MS byte first
  99  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 100  return chr ($_[1]) . chr $_[0];
 101 }
 102
 103 sub encode_M
 104 {
 105  # encode Multi-byte - single for 0..255 otherwise double
 106  ## my ($ch,$page) = @_;
 107  ## return &encode_D if $page;
 108  ## return &encode_S;
 109  return chr ($_[1]) . chr $_[0] if $_[1];
 110  return chr $_[0];
 111 }
 112
 113 my %encode_types = (U => \&encode_U,
 114                     S => \&encode_S,
 115                     D => \&encode_D,
 116                     M => \&encode_M,
 117                    );
 118
 119 # Win32 does not expand globs on command line
 120 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 121
 122 my %opt;
 123 # I think these are:
 124 # -Q to disable the duplicate codepoint test
 125 # -S make mapping errors fatal
 126 # -q to remove comments written to output files
 127 # -O to enable the (brute force) substring optimiser
 128 # -o <output> to specify the output file name (else it's the first arg)
 129 # -f <inlist> to give a file with a list of input files (else use the args)
 130 # -n <name> to name the encoding (else use the basename of the input file.
 131 getopts('M:SQqOo:f:n:',\%opt);
 132
 133 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 134
 135 # This really should go first, else the die here causes empty (non-erroneous)
 136 # output files to be written.
 137 my @encfiles;
 138 if (exists $opt{'f'}) {
 139     # -F is followed by name of file containing list of filenames
 140     my $flist = $opt{'f'};
 141     open(FLIST,$flist) || die "Cannot open $flist:$!";
 142     chomp(@encfiles = <FLIST>);
 143     close(FLIST);
 144 } else {
 145     @encfiles = @ARGV;
 146 }
 147
 148 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 149 chmod(0666,$cname) if -f $cname && !-w $cname;
 150 open(C,">$cname") || die "Cannot open $cname:$!";
 151
 152 my $dname = $cname;
 153 my $hname = $cname;
 154
 155 my ($doC,$doEnc,$doUcm,$doPet);
 156
 157 if ($cname =~ /\.(c|xs)$/)
 158  {
 159   $doC = 1;
 160   $dname =~ s/(\.[^\.]*)?$/_def.h/;
 161   chmod(0666,$dname) if -f $cname && !-w $dname;
 162   open(D,">$dname") || die "Cannot open $dname:$!";
 163   $hname =~ s/(\.[^\.]*)?$/.h/;
 164   chmod(0666,$hname) if -f $cname && !-w $hname;
 165   open(H,">$hname") || die "Cannot open $hname:$!";
 166
 167   foreach my $fh (\*C,\*D,\*H)
 168   {
 169    print $fh <<"END" unless $opt{'q'};
 170 /*
 171  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 172  This file was autogenerated by:
 173  $^X $0 @orig_ARGV
 174 */
 175 END
 176   }
 177
 178   if ($cname =~ /(\w+)\.xs$/)
 179    {
 180     print C "#include <EXTERN.h>\n";
 181     print C "#include <perl.h>\n";
 182     print C "#include <XSUB.h>\n";
 183     print C "#define U8 U8\n";
 184    }
 185   print C "#include \"encode.h\"\n";
 186
 187  }
 188 elsif ($cname =~ /\.enc$/)
 189  {
 190   $doEnc = 1;
 191  }
 192 elsif ($cname =~ /\.ucm$/)
 193  {
 194   $doUcm = 1;
 195  }
 196 elsif ($cname =~ /\.pet$/)
 197  {
 198   $doPet = 1;
 199  }
 200
 201 my %encoding;
 202 my %strings;
 203 my $saved = 0;
 204 my $subsave = 0;
 205 my $strings = 0;
 206
 207 sub cmp_name
 208 {
 209  if ($a =~ /^.*-(\d+)/)
 210   {
 211    my $an = $1;
 212    if ($b =~ /^.*-(\d+)/)
 213     {
 214      my $r = $an <=> $1;
 215      return $r if $r;
 216     }
 217   }
 218  return $a cmp $b;
 219 }
 220
 221
 222 foreach my $enc (sort cmp_name @encfiles)
 223  {
 224   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 225   $name = $opt{'n'} if exists $opt{'n'};
 226   if (open(E,$enc))
 227    {
 228     if ($sfx eq 'enc')
 229      {
 230       compile_enc(\*E,lc($name));
 231      }
 232     else
 233      {
 234       compile_ucm(\*E,lc($name));
 235      }
 236    }
 237   else
 238    {
 239     warn "Cannot open $enc for $name:$!";
 240    }
 241  }
 242
 243 if ($doC)
 244  {
 245   print STDERR "Writing compiled form\n";
 246   foreach my $name (sort cmp_name keys %encoding)
 247    {
 248     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 249     output(\*C,$name.'_utf8',$e2u);
 250     output(\*C,'utf8_'.$name,$u2e);
 251     push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 252    }
 253   foreach my $enc (sort cmp_name keys %encoding)
 254    {
 255     my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 256     my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 257     my $sym = "${enc}_encoding";
 258     $sym =~ s/\W+/_/g;
 259     print C "encode_t $sym = \n";
 260     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 261    }
 262
 263   foreach my $enc (sort cmp_name keys %encoding)
 264    {
 265     my $sym = "${enc}_encoding";
 266     $sym =~ s/\W+/_/g;
 267     print H "extern encode_t $sym;\n";
 268     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 269    }
 270
 271   if ($cname =~ /(\w+)\.xs$/)
 272    {
 273     my $mod = $1;
 274     print C <<'END';
 275
 276 static void
 277 Encode_XSEncoding(pTHX_ encode_t *enc)
 278 {
 279  dSP;
 280  HV *stash = gv_stashpv("Encode::XS", TRUE);
 281  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 282  int i = 0;
 283  PUSHMARK(sp);
 284  XPUSHs(sv);
 285  while (enc->name[i])
 286   {
 287    const char *name = enc->name[i++];
 288    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 289   }
 290  PUTBACK;
 291  call_pv("Encode::define_encoding",G_DISCARD);
 292  SvREFCNT_dec(sv);
 293 }
 294
 295 END
 296
 297     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 298     print C "BOOT:\n{\n";
 299     print C "#include \"$dname\"\n";
 300     print C "}\n";
 301    }
 302   # Close in void context is bad, m'kay
 303   close(D) or warn "Error closing '$dname': $!";
 304   close(H) or warn "Error closing '$hname': $!";
 305
 306   my $perc_saved    = $strings/($strings + $saved) * 100;
 307   my $perc_subsaved = $strings/($strings + $subsave) * 100;
 308   printf STDERR "%d bytes in string tables\n",$strings;
 309   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 310     $saved, $perc_saved              if $saved;
 311   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 312     $subsave, $perc_subsaved         if $subsave;
 313  }
 314 elsif ($doEnc)
 315  {
 316   foreach my $name (sort cmp_name keys %encoding)
 317    {
 318     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 319     output_enc(\*C,$name,$e2u);
 320    }
 321  }
 322 elsif ($doUcm)
 323  {
 324   foreach my $name (sort cmp_name keys %encoding)
 325    {
 326     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 327     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 328    }
 329  }
 330
 331 # writing half meg files and then not checking to see if you just filled the
 332 # disk is bad, m'kay
 333 close(C) or die "Error closing '$cname': $!";
 334
 335 # End of the main program.
 336
 337 sub compile_ucm
 338 {
 339  my ($fh,$name) = @_;
 340  my $e2u = {};
 341  my $u2e = {};
 342  my $cs;
 343  my %attr;
 344  while (<$fh>)
 345   {
 346    s/#.*$//;
 347    last if /^\s*CHARMAP\s*$/i;
 348    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 349     {
 350      $attr{$1} = $2;
 351     }
 352   }
 353  if (!defined($cs =  $attr{'code_set_name'}))
 354   {
 355    warn "No <code_set_name> in $name\n";
 356   }
 357  else
 358   {
 359    $name = $cs unless exists $opt{'n'};
 360   }
 361  my $erep;
 362  my $urep;
 363  my $max_el;
 364  my $min_el;
 365  if (exists $attr{'subchar'})
 366   {
 367    my @byte;
 368    $attr{'subchar'} =~ /^\s*/cg;
 369    push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 370    $erep = join('',map(chr(hex($_)),@byte));
 371   }
 372  print "Reading $name ($cs)\n";
 373  my $nfb = 0;
 374  my $hfb = 0;
 375  while (<$fh>)
 376   {
 377    s/#.*$//;
 378    last if /^\s*END\s+CHARMAP\s*$/i;
 379    next if /^\s*$/;
 380    my ($u,@byte);
 381    my $fb = '';
 382    $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
 383    push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
 384    $fb = $1 if /\G\s*(\|[0-3])/gc;
 385    # warn "$_: $u @byte | $fb\n";
 386    die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
 387    if (defined($u))
 388     {
 389      my $uch = encode_U(hex($u));
 390      my $ech = join('',map(chr(hex($_)),@byte));
 391      my $el  = length($ech);
 392      $max_el = $el if (!defined($max_el) || $el > $max_el);
 393      $min_el = $el if (!defined($min_el) || $el < $min_el);
 394      if (length($fb))
 395       {
 396        $fb = substr($fb,1);
 397        $hfb++;
 398       }
 399      else
 400       {
 401        $nfb++;
 402        $fb = '0';
 403       }
 404      # $fb is fallback flag
 405      # 0 - round trip safe
 406      # 1 - fallback for unicode -> enc
 407      # 2 - skip sub-char mapping
 408      # 3 - fallback enc -> unicode
 409      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 410      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 411     }
 412    else
 413     {
 414      warn $_;
 415     }
 416   }
 417  if ($nfb && $hfb)
 418   {
 419    die "$nfb entries without fallback, $hfb entries with\n";
 420   }
 421  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 422 }
 423
 424
 425
 426 sub compile_enc
 427 {
 428  my ($fh,$name) = @_;
 429  my $e2u = {};
 430  my $u2e = {};
 431
 432  my $type;
 433  while ($type = <$fh>)
 434   {
 435    last if $type !~ /^\s*#/;
 436   }
 437  chomp($type);
 438  return if $type eq 'E';
 439  # Do the hash lookup once, rather than once per function call. 4% speedup.
 440  my $type_func = $encode_types{$type};
 441  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 442  warn "$type encoded $name\n";
 443  my $rep = '';
 444  # Save a defined test by setting these to defined values.
 445  my $min_el = ~0; # A very big integer
 446  my $max_el = 0;  # Anything must be longer than 0
 447  {
 448   my $v = hex($def);
 449   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 450  }
 451  my $errors;
 452  my $seen;
 453  # use -Q to silence the seen test. Makefile.PL uses this by default.
 454  $seen = {} unless $opt{Q};
 455  do
 456   {
 457    my $line = <$fh>;
 458    chomp($line);
 459    my $page = hex($line);
 460    my $ch = 0;
 461    my $i = 16;
 462    do
 463     {
 464      # So why is it 1% faster to leave the my here?
 465      my $line = <$fh>;
 466      $line =~ s/\r\n$/\n/;
 467      die "$.:${line}Line should be exactly 65 characters long including
 468      newline (".length($line).")" unless length ($line) == 65;
 469      # Split line into groups of 4 hex digits, convert groups to ints
 470      # This takes 65.35
 471      # map {hex $_} $line =~ /(....)/g
 472      # This takes 63.75 (2.5% less time)
 473      # unpack "n*", pack "H*", $line
 474      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 475      # Doing it as while ($line =~ /(....)/g) took 74.63
 476      foreach my $val (unpack "n*", pack "H*", $line)
 477       {
 478        next if $val == 0xFFFD;
 479        my $ech = &$type_func($ch,$page);
 480        if ($val || (!$ch && !$page))
 481         {
 482          my $el  = length($ech);
 483          $max_el = $el if $el > $max_el;
 484          $min_el = $el if $el < $min_el;
 485          my $uch = encode_U($val);
 486          if ($seen) {
 487            # We're doing the test.
 488            # We don't need to read this quickly, so storing it as a scalar,
 489            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 490            # RAM and may make us faster on low RAM systems. [see __END__]
 491            if (exists $seen->{$uch})
 492              {
 493                warn sprintf("U%04X is %02X%02X and %04X\n",
 494                             $val,$page,$ch,$seen->{$uch});
 495                $errors++;
 496              }
 497            else
 498              {
 499                $seen->{$uch} = $page << 8 | $ch;
 500              }
 501          }
 502          # Passing 2 extra args each time is 3.6% slower!
 503          # Even with having to add $fallback ||= 0 later
 504          enter_fb0($e2u,$ech,$uch);
 505          enter_fb0($u2e,$uch,$ech);
 506         }
 507        else
 508         {
 509          # No character at this position
 510          # enter($e2u,$ech,undef,$e2u);
 511         }
 512        $ch++;
 513       }
 514     } while --$i;
 515   } while --$pages;
 516  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 517    if $min_el > $max_el;
 518  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 519  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 520 }
 521
 522 # my ($a,$s,$d,$t,$fb) = @_;
 523 sub enter {
 524   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 525   # state we shift to after this (multibyte) input character defaults to same
 526   # as current state.
 527   $next ||= $current;
 528   # Making sure it is defined seems to be faster than {no warnings;} in
 529   # &process, or passing it in as 0 explicity.
 530   # XXX $fallback ||= 0;
 531
 532   # Start at the beginning and work forwards through the string to zero.
 533   # effectively we are removing 1 character from the front each time
 534   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 535   # Hence -$pos is the length of the remaining string.
 536   my $pos = -length $inbytes;
 537   while (1) {
 538     my $byte = substr $inbytes, $pos, 1;
 539     #  RAW_NEXT => 0,
 540     #  RAW_IN_LEN => 1,
 541     #  RAW_OUT_BYTES => 2,
 542     #  RAW_FALLBACK => 3,
 543     # to unicode an array would seem to be better, because the pages are dense.
 544     # from unicode can be very sparse, favouring a hash.
 545     # hash using the bytes (all length 1) as keys rather than ord value,
 546     # as it's easier to sort these in &process.
 547
 548     # It's faster to always add $fallback even if it's undef, rather than
 549     # choosing between 3 and 4 element array. (hence why we set it defined
 550     # above)
 551     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 552     # When $pos was -1 we were at the last input character.
 553     unless (++$pos) {
 554       $do_now->[RAW_OUT_BYTES] = $outbytes;
 555       $do_now->[RAW_NEXT] = $next;
 556       return;
 557     }
 558     # Tail recursion. The intermdiate state may not have a name yet.
 559     $current = $do_now->[RAW_NEXT];
 560   }
 561 }
 562
 563 # This is purely for optimistation. It's just &enter hard coded for $fallback
 564 # of 0, using only a 3 entry array ref to save memory for every entry.
 565 sub enter_fb0 {
 566   my ($current,$inbytes,$outbytes,$next) = @_;
 567   $next ||= $current;
 568
 569   my $pos = -length $inbytes;
 570   while (1) {
 571     my $byte = substr $inbytes, $pos, 1;
 572     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 573     unless (++$pos) {
 574       $do_now->[RAW_OUT_BYTES] = $outbytes;
 575       $do_now->[RAW_NEXT] = $next;
 576       return;
 577     }
 578     $current = $do_now->[RAW_NEXT];
 579   }
 580 }
 581
 582
 583 sub outstring
 584 {
 585  my ($fh,$name,$s) = @_;
 586  my $sym = $strings{$s};
 587  if ($sym)
 588   {
 589    $saved += length($s);
 590   }
 591  else
 592   {
 593    if ($opt{'O'}) {
 594        foreach my $o (keys %strings)
 595         {
 596          next unless (my $i = index($o,$s)) >= 0;
 597          $sym = $strings{$o};
 598          # gcc things that 0x0e+0x10 (anything with e+) starts to look like
 599          # a hexadecimal floating point constant. Silly gcc. Only p
 600          # introduces a floating point constant. Put the space in to stop it
 601          # getting confused.
 602          $sym .= sprintf(" +0x%02x",$i) if ($i);
 603          $subsave += length($s);
 604          return $strings{$s} = $sym;
 605        }
 606    }
 607    $strings{$s} = $sym = $name;
 608    $strings += length($s);
 609    my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
 610    # Maybe we should assert that these are all <256.
 611    $definition .= join(',',unpack "C*",$s);
 612    # We have a single long line. Split it at convenient commas.
 613    $definition =~ s/(.{74,77},)/$1\n/g;
 614    print $fh "$definition };\n\n";
 615   }
 616  return $sym;
 617 }
 618
 619 sub process
 620 {
 621   my ($name,$a) = @_;
 622   $name =~ s/\W+/_/g;
 623   $a->{Cname} = $name;
 624   my $raw = $a->{Raw};
 625   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 626   my @ent;
 627   $agg_max_in = 0;
 628   foreach my $key (sort keys %$raw) {
 629     #  RAW_NEXT => 0,
 630     #  RAW_IN_LEN => 1,
 631     #  RAW_OUT_BYTES => 2,
 632     #  RAW_FALLBACK => 3,
 633     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 634     # Now we are converting from raw to aggregate, switch from 1 byte strings
 635     # to numbers
 636     my $b = ord $key;
 637     $fallback ||= 0;
 638     if ($l &&
 639         # If this == fails, we're going to reset $agg_max_in below anyway.
 640         $b == ++$agg_max_in &&
 641         # References in numeric context give the pointer as an int.
 642         $agg_next == $next &&
 643         $agg_in_len == $in_len &&
 644         $agg_out_len == length $out_bytes &&
 645         $agg_fallback == $fallback
 646         # && length($l->[AGG_OUT_BYTES]) < 16
 647        ) {
 648       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 649       # we can aggregate this byte onto the end.
 650       $l->[AGG_MAX_IN] = $b;
 651       $l->[AGG_OUT_BYTES] .= $out_bytes;
 652     } else {
 653       # AGG_MIN_IN => 0,
 654       # AGG_MAX_IN => 1,
 655       # AGG_OUT_BYTES => 2,
 656       # AGG_NEXT => 3,
 657       # AGG_IN_LEN => 4,
 658       # AGG_OUT_LEN => 5,
 659       # AGG_FALLBACK => 6,
 660       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 661       # (only gains .6% on euc-jp  -- is it worth it?)
 662       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 663                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 664                        $agg_fallback = $fallback];
 665     }
 666     if (exists $next->{Cname}) {
 667       $next->{'Forward'} = 1 if $next != $a;
 668     } else {
 669       process(sprintf("%s_%02x",$name,$b),$next);
 670     }
 671   }
 672   # encengine.c rules say that last entry must be for 255
 673   if ($agg_max_in < 255) {
 674     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 675   }
 676   $a->{'Entries'} = \@ent;
 677 }
 678
 679 sub outtable
 680 {
 681  my ($fh,$a) = @_;
 682  my $name = $a->{'Cname'};
 683  # String tables
 684  foreach my $b (@{$a->{'Entries'}})
 685   {
 686    next unless $b->[AGG_OUT_LEN];
 687    my $s = $b->[AGG_MIN_IN];
 688    my $e = $b->[AGG_MAX_IN];
 689    outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
 690   }
 691  if ($a->{'Forward'})
 692   {
 693    print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 694   }
 695  $a->{'Done'} = 1;
 696  foreach my $b (@{$a->{'Entries'}})
 697   {
 698    my ($s,$e,$out,$t,$end,$l) = @$b;
 699    outtable($fh,$t) unless $t->{'Done'};
 700   }
 701  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 702  foreach my $b (@{$a->{'Entries'}})
 703   {
 704    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 705    $end |= 0x80 if $fb;
 706    print  $fh "{";
 707    if ($l)
 708     {
 709      printf $fh outstring($fh,'',$out);
 710     }
 711    else
 712     {
 713      print  $fh "0";
 714     }
 715    print  $fh ",",$t->{Cname};
 716    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 717   }
 718  print $fh "};\n";
 719 }
 720
 721 sub output
 722 {
 723  my ($fh,$name,$a) = @_;
 724  process($name,$a);
 725  # Sub-tables
 726  outtable($fh,$a);
 727 }
 728
 729 sub output_enc
 730 {
 731  my ($fh,$name,$a) = @_;
 732  die "Changed - fix me for new structure";
 733  foreach my $b (sort keys %$a)
 734   {
 735    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 736   }
 737 }
 738
 739 sub decode_U
 740 {
 741  my $s = shift;
 742 }
 743
 744 my @uname;
 745 sub char_names
 746 {
 747  my $s = do "unicore/Name.pl";
 748  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 749  pos($s) = 0;
 750  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 751   {
 752    my $name = $3;
 753    my $s = hex($1);
 754    last if $s >= 0x10000;
 755    my $e = length($2) ? hex($2) : $s;
 756    for (my $i = $s; $i <= $e; $i++)
 757     {
 758      $uname[$i] = $name;
 759 #    print sprintf("U%04X $name\n",$i);
 760     }
 761   }
 762 }
 763
 764 sub output_ucm_page
 765 {
 766   my ($cmap,$a,$t,$pre) = @_;
 767   # warn sprintf("Page %x\n",$pre);
 768   my $raw = $t->{Raw};
 769   foreach my $key (sort keys %$raw) {
 770     #  RAW_NEXT => 0,
 771     #  RAW_IN_LEN => 1,
 772     #  RAW_OUT_BYTES => 2,
 773     #  RAW_FALLBACK => 3,
 774     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 775     my $u = ord $key;
 776     $fallback ||= 0;
 777
 778     if ($next != $a && $next != $t) {
 779       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 780     } elsif (length $out_bytes) {
 781       if ($pre) {
 782         $u = $pre|($u &0x3f);
 783       }
 784       my $s = sprintf "<U%04X> ",$u;
 785       #foreach my $c (split(//,$out_bytes)) {
 786       #  $s .= sprintf "\\x%02X",ord($c);
 787       #}
 788       # 9.5% faster changing that loop to this:
 789       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 790       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 791       push(@$cmap,$s);
 792     } else {
 793       warn join(',',$u, @{$raw->{$key}},$a,$t);
 794     }
 795   }
 796 }
 797
 798 sub output_ucm
 799 {
 800  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 801  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 802  print $fh "<code_set_name> \"$name\"\n";
 803  char_names();
 804  if (defined $min_el)
 805   {
 806    print $fh "<mb_cur_min> $min_el\n";
 807   }
 808  if (defined $max_el)
 809   {
 810    print $fh "<mb_cur_max> $max_el\n";
 811   }
 812  if (defined $rep)
 813   {
 814    print $fh "<subchar> ";
 815    foreach my $c (split(//,$rep))
 816     {
 817      printf $fh "\\x%02X",ord($c);
 818     }
 819    print $fh "\n";
 820   }
 821  my @cmap;
 822  output_ucm_page(\@cmap,$h,$h,0);
 823  print $fh "#\nCHARMAP\n";
 824  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 825   {
 826    print $fh $line;
 827   }
 828  print $fh "END CHARMAP\n";
 829 }
 830
 831 sub make_makefile_pl
 832 {
 833     eval { require Encode; };
 834     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 835     eval { require File::Basename; };
 836     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
 837     File::Basename->import();
 838     my $inc = dirname($INC{"Encode/Internal.pm"});
 839     my $name = shift;
 840     my $table_files = join(",", map {qq('$_')} @_);
 841     my $now = scalar localtime();
 842     open my $fh, ">Makefile.PL" or die "$!";
 843     print $fh <<"END_OF_HEADER";
 844 #
 845 # This file is auto-generated by:
 846 # $0
 847 # $now
 848 #
 849 use 5.7.2;
 850 use strict;
 851 use ExtUtils::MakeMaker;
 852
 853 # Please edit the following to the taste!
 854 my \$name = '$name';
 855 my \%tables = (
 856               encode_t   => [ $table_files ],
 857              );
 858
 859 # And leave the rest!
 860 my \$enc2xs = '$0';
 861 WriteMakefile(
 862               INC               => "-I$inc",
 863 END_OF_HEADER
 864
 865     print $fh <<'END_OF_MAKEFILE_PL';
 866               NAME              => 'Encode::'.$name,
 867               VERSION_FROM      => "$name.pm",
 868               OBJECT            => '$(O_FILES)',
 869               'dist'            => {
 870                   COMPRESS      => 'gzip -9f',
 871                   SUFFIX        => 'gz',
 872                   DIST_DEFAULT => 'all tardist',
 873               },
 874               MAN3PODS  => {},
 875               # OS 390 winges about line numbers > 64K ???
 876               XSOPT => '-nolinenumbers',
 877               );
 878
 879 package MY;
 880
 881 sub post_initialize
 882 {
 883     my ($self) = @_;
 884     my %o;
 885     my $x = $self->{'OBJ_EXT'};
 886     # Add the table O_FILES
 887     foreach my $e (keys %tables)
 888     {
 889         $o{$e.$x} = 1;
 890     }
 891     $o{"$name$x"} = 1;
 892     $self->{'O_FILES'} = [sort keys %o];
 893     my @files = ("$name.xs");
 894     $self->{'C'} = ["$name.c"];
 895     # $self->{'H'} = [$self->catfile($self->updir,'encode.h')];
 896     my %xs;
 897     foreach my $table (keys %tables) {
 898         push (@{$self->{'C'}},"$table.c");
 899         # Do NOT add $table.h etc. to H_FILES unless we own up as to how they
 900         # get built.
 901         foreach my $ext (qw($(OBJ_EXT) .c .h _def.h .fnm)) {
 902             push (@files,$table.$ext);
 903         }
 904     }
 905     $self->{'XS'} = { "$name.xs" => "$name.c" };
 906     $self->{'clean'}{'FILES'} .= join(' ',@files);
 907     open(XS,">$name.xs") || die "Cannot open $name.xs:$!";
 908     print XS <<'END';
 909 #include <EXTERN.h>
 910 #include <perl.h>
 911 #include <XSUB.h>
 912 #define U8 U8
 913 #include "encode.h"
 914 END
 915     foreach my $table (keys %tables) {
 916         print XS qq[#include "${table}.h"\n];
 917     }
 918     print XS <<"END";
 919
 920 static void
 921 Encode_XSEncoding(pTHX_ encode_t *enc)
 922 {
 923  dSP;
 924  HV *stash = gv_stashpv("Encode::XS", TRUE);
 925  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 926  int i = 0;
 927  PUSHMARK(sp);
 928  XPUSHs(sv);
 929  while (enc->name[i])
 930   {
 931    const char *name = enc->name[i++];
 932    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 933   }
 934  PUTBACK;
 935  call_pv("Encode::define_encoding",G_DISCARD);
 936  SvREFCNT_dec(sv);
 937 }
 938
 939 MODULE = Encode::$name  PACKAGE = Encode::$name
 940 PROTOTYPES: DISABLE
 941 BOOT:
 942 {
 943 END
 944     foreach my $table (keys %tables) {
 945         print XS qq[#include "${table}_def.h"\n];
 946     }
 947     print XS "}\n";
 948     close(XS);
 949     return "# Built $name.xs\n\n";
 950 }
 951
 952 sub postamble
 953 {
 954     my $self = shift;
 955     my $dir  = "."; # $self->catdir('Encode');
 956     my $str  = "# $name\$(OBJ_EXT) depends on .h and _def.h files not .c files - but all written by enc2xs\n";
 957     $str    .= "$name.c : $name.xs ";
 958     foreach my $table (keys %tables)
 959     {
 960         $str .= " $table.c";
 961     }
 962     $str .= "\n\n";
 963     $str .= "$name\$(OBJ_EXT) : $name.c\n\n";
 964
 965     foreach my $table (keys %tables)
 966     {
 967         my $numlines = 1;
 968         my $lengthsofar = length($str);
 969         my $continuator = '';
 970         $str .= "$table.c : Makefile.PL";
 971         foreach my $file (@{$tables{$table}})
 972         {
 973             $str .= $continuator.' '.$self->catfile($dir,$file);
 974             if ( length($str)-$lengthsofar > 128*$numlines )
 975             {
 976                 $continuator .= " \\\n\t";
 977                 $numlines++;
 978             } else {
 979                 $continuator = '';
 980             }
 981         }
 982         $str .= $^O eq 'VMS' # In VMS quote to preserve case
 983             ? qq{\n\t\$(PERL) $enc2xs -"Q" -"O" -o \$\@ -f $table.fnm\n\n}
 984             : qq{\n\t\$(PERL) $enc2xs -Q -O -o \$\@ -f $table.fnm\n\n};
 985         open (FILELIST, ">$table.fnm")
 986             || die "Could not open $table.fnm: $!";
 987         foreach my $file (@{$tables{$table}})
 988         {
 989             print FILELIST $self->catfile($dir,$file) . "\n";
 990         }
 991         close(FILELIST);
 992     }
 993     return $str;
 994 }
 995 END_OF_MAKEFILE_PL
 996     close $fh;
 997     (my $pm =<<"END_OF_PM") =~ s/^# //gm;
 998 # package Encode::$name;
 999 # our \$VERSION = "0.01";
1000 #
1001 # use Encode;
1002 # use XSLoader;
1003 # XSLoader::load('Encode::$name', \$VERSION);
1004 #
1005 # 1;
1006 # __END__
1007 #
1008 # =head1 NAME
1009 #
1010 # Encode::$name - New Encoding
1011 #
1012 # =head1 SYNOPSIS
1013 #
1014 # You got to fill this in!
1015 #
1016 # =head1 SEE ALSO
1017 #
1018 # L<Encode>
1019 #
1020 # =cut
1021 END_OF_PM
1022     open $fh, ">$name.pm" or die "$name.pm:$!";
1023     print $fh $pm;
1024     close $fh;
1025     -d 't' or mkdir 't', 0755 or die "mkdir t:$!";
1026     open $fh, ">t/$name.t" or die "t/$name.t:$!";
1027 print $fh <<"END_OF_TEST";
1028 use strict;
1029 # Adjust the number here!
1030 use Test::More tests => 2;
1031
1032 use_ok('Encode');
1033 use_ok('Encode::$name');
1034 # Add more test here!
1035 END_OF_TEST
1036     close $fh;
1037     exit;
1038 }
1039
1040 __END__
1041
1042 =head1 NAME
1043
1044 enc2xs -- Perl Encode Module Generator
1045
1046 =head1 SYNOPSIS
1047
1048   enc2xs -M ModName mapfiles...
1049   enc2xs -[options]
1050
1051 =head1 DESCRIPTION
1052
1053 F<enc2xs> builds a Perl extension for use by Encode from either
1054 Unicode Character Mapping files (.ucm) or Tcl Encoding Files
1055 (.enc)  Besides internally used during the build process of Encode
1056 module, you can use F<enc2xs> to add your own encoding to perl.  No
1057 knowledge on XS is necessary.
1058
1059 =head1 Quick Guide
1060
1061 If what you want to know as little about Perl possible but needs to
1062 add a new encoding, just read this chapter and forget the rest.
1063
1064 =over 4
1065
1066 =item 0.
1067
1068 Have a .ucm file ready.  You can get it from somewhere or you can
1069 write your own from scratch or you can grab one from Encode
1070 distribution and customize.  For UCM format, see the next Chapter.
1071 In the example below, I'll call my theoretical encoding myascii,
1072 defined inI<my.ucm>.  C<$> is a shell prompt.
1073
1074   $ ls -F
1075   my.ucm
1076
1077 =item 1.
1078
1079 Issue a command as follows;
1080
1081   $ enc2xs -M My my.ucm
1082
1083 Now take a look at your current directory.  It should look like this.
1084
1085   $ ls -F
1086   Makefile.PL   My.pm         my.ucm        t/
1087
1088 The following files are created.
1089
1090   Makefle.PL - MakeMaker script
1091   My.pm      - Encode Submodule
1092   t/My.t     - test file
1093
1094 =item 2.
1095
1096 Edit the files generated.  You don't have to if you have no time AND no
1097 intention to give it to someone else.  But it is a good idea to edit
1098 pod and add more tests.
1099
1100 =item 3.
1101
1102 Now issue a command all Perl Mongers love;
1103
1104   $ perl5.7.3 Makefile.PL
1105   Writing Makefile for Encode::My
1106
1107 =item 4.
1108
1109 Now all you have to do is make.
1110
1111   $ make
1112   cp My.pm blib/lib/Encode/My.pm
1113   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1114     -o encode_t.c -f encode_t.fnm
1115   Reading myascii (myascii)
1116   Writing compiled form
1117   128 bytes in string tables
1118   384 bytes (25%) saved spotting duplicates
1119   1 bytes (99.2%) saved using substrings
1120   ....
1121   chmod 644 blib/arch/auto/Encode/My/My.bs
1122   $
1123
1124 The time it takes varies how fast your machine is and how large your
1125 encoding is.  Unless you are working on something big like euc-tw, it
1126 won't take too long.
1127
1128 =item 5.
1129
1130 You can "make install" already but you should test first.
1131
1132   $ make test
1133   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1134     -e 'use Test::Harness  qw(&runtests $verbose); \
1135     $verbose=0; runtests @ARGV;' t/*.t
1136   t/My....ok
1137   All tests successful.
1138   Files=1, Tests=2,  0 wallclock secs
1139    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1140
1141 =item 6.
1142
1143 If you are content with the test result, just "make install"
1144
1145 =back
1146
1147 =head1 The Unicode Character Map
1148
1149 Encode uses The Unicode Character Map (UCM) for source character
1150 mappings.  This format is used by ICU package of IBM and adopted by
1151 Nick Ing-Simmons.  Since UCM is more flexible than Tcl's Encoding Map
1152 and far more user-friendly,  This is the recommended formet for
1153 Encode now.
1154
1155 UCM file looks like this.
1156
1157   #
1158   # Comments
1159   #
1160   <code_set_name> "US-ascii" # Required
1161   <code_set_alias> "ascii"   # Optional
1162   <mb_cur_min> 1             # Required; usually 1
1163   <mb_cur_max> 1             # Max. # of bytes/char
1164   <subchar> \x3F             # Substitution char
1165   #
1166   CHARMAP
1167   <U0000> \x00 |0 # <control>
1168   <U0001> \x01 |0 # <control>
1169   <U0002> \x02 |0 # <control>
1170   ....
1171   <U007C> \x7C |0 # VERTICAL LINE
1172   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1173   <U007E> \x7E |0 # TILDE
1174   <U007F> \x7F |0 # <control>
1175   END CHARMAP
1176
1177 =over 4
1178
1179 =item *
1180
1181 Anything that follows C<#> is treated as comments.
1182
1183 =item *
1184
1185 The header section continues until CHARMAP. This section Has a form of
1186 I<E<lt>keywordE<gt> value>, one at a line.  For a value, strings must
1187 be quoted. Barewords are treated as numbers.  I<\xXX> represents a
1188 byte.
1189
1190 Most of the keywords are self-explanatory. I<subchar> means
1191 substitution character, not subcharacter.  When you decode a Unicode
1192 sequence to this encoding but no matching character is found, the byte
1193 sequence defined here will be used.  For most cases, the value here is
1194 \x3F, in ASCII this is a question mark.
1195
1196 =item *
1197
1198 CHARMAP starts the character map section.  Each line has a form as
1199 follows;
1200
1201   <UXXXX> \xXX.. |0 # comment
1202     ^     ^      ^
1203     |     |      +- Fallback flag
1204     |     +-------- Encoded byte sequence
1205     +-------------- Unicode Character ID in hex
1206
1207 The format is roughly the same as a header section except for fallback
1208 flag.  It is | followed by 0..3.   And their meaning as follows
1209
1210 =over 2
1211
1212 =item |0
1213
1214 Round trip safe.   A character decoded to Unicode encodes back to the
1215 same byte sequence. most character belong to this.
1216
1217 =item |1
1218
1219 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1220 character for encode map only
1221
1222 =item |2
1223
1224 Skip sub-char mapping should there be no code point.
1225
1226 =item |3
1227
1228 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1229 character for decode map only
1230
1231 =back
1232
1233 =item *
1234
1235 And finally, END OF CHARMAP ends the section.
1236
1237 =back
1238
1239 Needless to say, if you are manually creating a UCM file, you should
1240 copy ascii.ucm or existing encoding which is close to yours than write
1241 your own from scratch.
1242
1243 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1244 is, unless your environment is on EBCDIC.
1245
1246 B<CAVEAT>: not all features in UCM are implemented.  For example,
1247 icu:state is not used.  Because of that, you need to write a perl
1248 module if you want to support algorithmical encodings, notablly
1249 ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1250 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1251
1252 =head1 Bookmarks
1253
1254 ICU Home Page
1255 L<http://oss.software.ibm.com/icu/>
1256
1257 ICU Character Mapping Tables
1258 L<http://oss.software.ibm.com/icu/charset/>
1259
1260 ICU:Conversion Data
1261 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1262
1263 =head1 SEE ALSO
1264
1265 L<Encode>,
1266 L<perlmod>,
1267 L<perlpod>
1268
1269 =cut
1270
1271 # -Q to disable the duplicate codepoint test
1272 # -S make mapping errors fatal
1273 # -q to remove comments written to output files
1274 # -O to enable the (brute force) substring optimiser
1275 # -o <output> to specify the output file name (else it's the first arg)
1276 # -f <inlist> to give a file with a list of input files (else use the args)
1277 # -n <name> to name the encoding (else use the basename of the input file.
1278
1279 With %seen holding array refs:
1280
1281       865.66 real        28.80 user         8.79 sys
1282       7904  maximum resident set size
1283       1356  average shared memory size
1284      18566  average unshared data size
1285        229  average unshared stack size
1286      46080  page reclaims
1287      33373  page faults
1288
1289 With %seen holding simple scalars:
1290
1291       342.16 real        27.11 user         3.54 sys
1292       8388  maximum resident set size
1293       1394  average shared memory size
1294      14969  average unshared data size
1295        236  average unshared stack size
1296      28159  page reclaims
1297       9839  page faults
1298
1299 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1300 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1301 RAM machine, but it's going to help even on modern machines.
1302 Swapping is bad, m'kay :-)