ext/Encode/bin/enc2xs

   1 #!../../../perl -w
   2 BEGIN {
   3     # fiddle with @INC iff I am a part of perl dist
   4     if ($^X =~ m/\bminiperl$/o){
   5        warn "Fixing \@INC for perl core.\n";
   6        unshift @INC, qw(../../lib ../../../lib ../../../../lib);
   7        $ENV{PATH} .= ';../..;../../..;../../../..' if $^O eq 'MSWin32';
   8     }
   9 }
  10 use strict;
  11 use Getopt::Std;
  12 my @orig_ARGV = @ARGV;
  13 our $VERSION  = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  14
  15
  16 # These may get re-ordered.
  17 # RAW is a do_now as inserted by &enter
  18 # AGG is an aggreagated do_now, as built up by &process
  19 use constant {
  20   RAW_NEXT => 0,
  21   RAW_IN_LEN => 1,
  22   RAW_OUT_BYTES => 2,
  23   RAW_FALLBACK => 3,
  24
  25   AGG_MIN_IN => 0,
  26   AGG_MAX_IN => 1,
  27   AGG_OUT_BYTES => 2,
  28   AGG_NEXT => 3,
  29   AGG_IN_LEN => 4,
  30   AGG_OUT_LEN => 5,
  31   AGG_FALLBACK => 6,
  32 };
  33 # (See the algorithm in encengine.c - we're building structures for it)
  34
  35 # There are two sorts of structures.
  36 # "do_now" (an array, two variants of what needs storing) is whatever we need
  37 # to do now we've read an input byte.
  38 # It's housed in a "do_next" (which is how we got to it), and in turn points
  39 # to a "do_next" which contains all the "do_now"s for the next input byte.
  40
  41 # There will be a "do_next" which is the start state.
  42 # For a single byte encoding it's the only "do_next" - each "do_now" points
  43 # back to it, and each "do_now" will cause bytes. There is no state.
  44
  45 # For a multi-byte encoding where all characters in the input are the same
  46 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  47 # branching out from the start state, one step for each input byte.
  48 # The leaf "do_now"s will all be at the same distance from the start state,
  49 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  50 # the start state.
  51
  52 # For an encoding where there are varaible length input byte sequences, you
  53 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  54 # as before the leaves will point back to the start state.
  55
  56 # The system will cope with escape encodings (imagine them as a mostly
  57 # self-contained tree for each escape state, and cross links between trees
  58 # at the state-switching characters) but so far no input format defines these.
  59
  60 # The system will also cope with having output "leaves" in the middle of
  61 # the bifurcating branches, not just at the extremities, but again no
  62 # input format does this yet.
  63
  64 # There are two variants of the "do_now" structure. The first, smaller variant
  65 # is generated by &enter as the input file is read. There is one structure
  66 # for each input byte. Say we are mapping a single byte encoding to a
  67 # single byte encoding, with  "ABCD" going "abcd". There will be
  68 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  69
  70 # &process then walks the tree, building aggregate "do_now" structres for
  71 # adjacent bytes where possible. The aggregate is for a contiguous range of
  72 # bytes which each produce the same length of output, each move to the
  73 # same next state, and each have the same fallback flag.
  74 # So our 4 RAW "do_now"s above become replaced by a single structure
  75 # containing:
  76 # ["A", "D", "abcd", 1, ...]
  77 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  78 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  79 # which maps very nicely into pointer arithmetic in C for encengine.c
  80
  81 sub encode_U
  82 {
  83  # UTF-8 encode long hand - only covers part of perl's range
  84  ## my $uv = shift;
  85  # chr() works in native space so convert value from table
  86  # into that space before using chr().
  87  my $ch = chr(utf8::unicode_to_native($_[0]));
  88  # Now get core perl to encode that the way it likes.
  89  utf8::encode($ch);
  90  return $ch;
  91 }
  92
  93 sub encode_S
  94 {
  95  # encode single byte
  96  ## my ($ch,$page) = @_; return chr($ch);
  97  return chr $_[0];
  98 }
  99
 100 sub encode_D
 101 {
 102  # encode double byte MS byte first
 103  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 104  return chr ($_[1]) . chr $_[0];
 105 }
 106
 107 sub encode_M
 108 {
 109  # encode Multi-byte - single for 0..255 otherwise double
 110  ## my ($ch,$page) = @_;
 111  ## return &encode_D if $page;
 112  ## return &encode_S;
 113  return chr ($_[1]) . chr $_[0] if $_[1];
 114  return chr $_[0];
 115 }
 116
 117 my %encode_types = (U => \&encode_U,
 118                     S => \&encode_S,
 119                     D => \&encode_D,
 120                     M => \&encode_M,
 121                    );
 122
 123 # Win32 does not expand globs on command line
 124 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 125
 126 my %opt;
 127 # I think these are:
 128 # -Q to disable the duplicate codepoint test
 129 # -S make mapping errors fatal
 130 # -q to remove comments written to output files
 131 # -O to enable the (brute force) substring optimiser
 132 # -o <output> to specify the output file name (else it's the first arg)
 133 # -f <inlist> to give a file with a list of input files (else use the args)
 134 # -n <name> to name the encoding (else use the basename of the input file.
 135 getopts('M:SQqOo:f:n:',\%opt);
 136
 137 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 138
 139 # This really should go first, else the die here causes empty (non-erroneous)
 140 # output files to be written.
 141 my @encfiles;
 142 if (exists $opt{'f'}) {
 143     # -F is followed by name of file containing list of filenames
 144     my $flist = $opt{'f'};
 145     open(FLIST,$flist) || die "Cannot open $flist:$!";
 146     chomp(@encfiles = <FLIST>);
 147     close(FLIST);
 148 } else {
 149     @encfiles = @ARGV;
 150 }
 151
 152 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 153 chmod(0666,$cname) if -f $cname && !-w $cname;
 154 open(C,">$cname") || die "Cannot open $cname:$!";
 155
 156 my $dname = $cname;
 157 my $hname = $cname;
 158
 159 my ($doC,$doEnc,$doUcm,$doPet);
 160
 161 if ($cname =~ /\.(c|xs)$/)
 162  {
 163   $doC = 1;
 164   $dname =~ s/(\.[^\.]*)?$/_def.h/;
 165   chmod(0666,$dname) if -f $cname && !-w $dname;
 166   open(D,">$dname") || die "Cannot open $dname:$!";
 167   $hname =~ s/(\.[^\.]*)?$/.h/;
 168   chmod(0666,$hname) if -f $cname && !-w $hname;
 169   open(H,">$hname") || die "Cannot open $hname:$!";
 170
 171   foreach my $fh (\*C,\*D,\*H)
 172   {
 173    print $fh <<"END" unless $opt{'q'};
 174 /*
 175  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 176  This file was autogenerated by:
 177  $^X $0 @orig_ARGV
 178 */
 179 END
 180   }
 181
 182   if ($cname =~ /(\w+)\.xs$/)
 183    {
 184     print C "#include <EXTERN.h>\n";
 185     print C "#include <perl.h>\n";
 186     print C "#include <XSUB.h>\n";
 187     print C "#define U8 U8\n";
 188    }
 189   print C "#include \"encode.h\"\n";
 190
 191  }
 192 elsif ($cname =~ /\.enc$/)
 193  {
 194   $doEnc = 1;
 195  }
 196 elsif ($cname =~ /\.ucm$/)
 197  {
 198   $doUcm = 1;
 199  }
 200 elsif ($cname =~ /\.pet$/)
 201  {
 202   $doPet = 1;
 203  }
 204
 205 my %encoding;
 206 my %strings;
 207 my $saved = 0;
 208 my $subsave = 0;
 209 my $strings = 0;
 210
 211 sub cmp_name
 212 {
 213  if ($a =~ /^.*-(\d+)/)
 214   {
 215    my $an = $1;
 216    if ($b =~ /^.*-(\d+)/)
 217     {
 218      my $r = $an <=> $1;
 219      return $r if $r;
 220     }
 221   }
 222  return $a cmp $b;
 223 }
 224
 225
 226 foreach my $enc (sort cmp_name @encfiles)
 227  {
 228   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 229   $name = $opt{'n'} if exists $opt{'n'};
 230   if (open(E,$enc))
 231    {
 232     if ($sfx eq 'enc')
 233      {
 234       compile_enc(\*E,lc($name));
 235      }
 236     else
 237      {
 238       compile_ucm(\*E,lc($name));
 239      }
 240    }
 241   else
 242    {
 243     warn "Cannot open $enc for $name:$!";
 244    }
 245  }
 246
 247 if ($doC)
 248  {
 249   print STDERR "Writing compiled form\n";
 250   foreach my $name (sort cmp_name keys %encoding)
 251    {
 252     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 253     output(\*C,$name.'_utf8',$e2u);
 254     output(\*C,'utf8_'.$name,$u2e);
 255     push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 256    }
 257   foreach my $enc (sort cmp_name keys %encoding)
 258    {
 259     my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 260     my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 261     my $sym = "${enc}_encoding";
 262     $sym =~ s/\W+/_/g;
 263     print C "encode_t $sym = \n";
 264     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 265    }
 266
 267   foreach my $enc (sort cmp_name keys %encoding)
 268    {
 269     my $sym = "${enc}_encoding";
 270     $sym =~ s/\W+/_/g;
 271     print H "extern encode_t $sym;\n";
 272     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 273    }
 274
 275   if ($cname =~ /(\w+)\.xs$/)
 276    {
 277     my $mod = $1;
 278     print C <<'END';
 279
 280 static void
 281 Encode_XSEncoding(pTHX_ encode_t *enc)
 282 {
 283  dSP;
 284  HV *stash = gv_stashpv("Encode::XS", TRUE);
 285  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 286  int i = 0;
 287  PUSHMARK(sp);
 288  XPUSHs(sv);
 289  while (enc->name[i])
 290   {
 291    const char *name = enc->name[i++];
 292    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 293   }
 294  PUTBACK;
 295  call_pv("Encode::define_encoding",G_DISCARD);
 296  SvREFCNT_dec(sv);
 297 }
 298
 299 END
 300
 301     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 302     print C "BOOT:\n{\n";
 303     print C "#include \"$dname\"\n";
 304     print C "}\n";
 305    }
 306   # Close in void context is bad, m'kay
 307   close(D) or warn "Error closing '$dname': $!";
 308   close(H) or warn "Error closing '$hname': $!";
 309
 310   my $perc_saved    = $strings/($strings + $saved) * 100;
 311   my $perc_subsaved = $strings/($strings + $subsave) * 100;
 312   printf STDERR "%d bytes in string tables\n",$strings;
 313   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 314     $saved, $perc_saved              if $saved;
 315   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 316     $subsave, $perc_subsaved         if $subsave;
 317  }
 318 elsif ($doEnc)
 319  {
 320   foreach my $name (sort cmp_name keys %encoding)
 321    {
 322     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 323     output_enc(\*C,$name,$e2u);
 324    }
 325  }
 326 elsif ($doUcm)
 327  {
 328   foreach my $name (sort cmp_name keys %encoding)
 329    {
 330     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 331     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 332    }
 333  }
 334
 335 # writing half meg files and then not checking to see if you just filled the
 336 # disk is bad, m'kay
 337 close(C) or die "Error closing '$cname': $!";
 338
 339 # End of the main program.
 340
 341 sub compile_ucm
 342 {
 343  my ($fh,$name) = @_;
 344  my $e2u = {};
 345  my $u2e = {};
 346  my $cs;
 347  my %attr;
 348  while (<$fh>)
 349   {
 350    s/#.*$//;
 351    last if /^\s*CHARMAP\s*$/i;
 352    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 353     {
 354      $attr{$1} = $2;
 355     }
 356   }
 357  if (!defined($cs =  $attr{'code_set_name'}))
 358   {
 359    warn "No <code_set_name> in $name\n";
 360   }
 361  else
 362   {
 363    $name = $cs unless exists $opt{'n'};
 364   }
 365  my $erep;
 366  my $urep;
 367  my $max_el;
 368  my $min_el;
 369  if (exists $attr{'subchar'})
 370   {
 371    my @byte;
 372    $attr{'subchar'} =~ /^\s*/cg;
 373    push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 374    $erep = join('',map(chr(hex($_)),@byte));
 375   }
 376  print "Reading $name ($cs)\n";
 377  my $nfb = 0;
 378  my $hfb = 0;
 379  while (<$fh>)
 380   {
 381    s/#.*$//;
 382    last if /^\s*END\s+CHARMAP\s*$/i;
 383    next if /^\s*$/;
 384    my ($u,@byte);
 385    my $fb = '';
 386    $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
 387    push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
 388    $fb = $1 if /\G\s*(\|[0-3])/gc;
 389    # warn "$_: $u @byte | $fb\n";
 390    die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
 391    if (defined($u))
 392     {
 393      my $uch = encode_U(hex($u));
 394      my $ech = join('',map(chr(hex($_)),@byte));
 395      my $el  = length($ech);
 396      $max_el = $el if (!defined($max_el) || $el > $max_el);
 397      $min_el = $el if (!defined($min_el) || $el < $min_el);
 398      if (length($fb))
 399       {
 400        $fb = substr($fb,1);
 401        $hfb++;
 402       }
 403      else
 404       {
 405        $nfb++;
 406        $fb = '0';
 407       }
 408      # $fb is fallback flag
 409      # 0 - round trip safe
 410      # 1 - fallback for unicode -> enc
 411      # 2 - skip sub-char mapping
 412      # 3 - fallback enc -> unicode
 413      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 414      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 415     }
 416    else
 417     {
 418      warn $_;
 419     }
 420   }
 421  if ($nfb && $hfb)
 422   {
 423    die "$nfb entries without fallback, $hfb entries with\n";
 424   }
 425  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 426 }
 427
 428
 429
 430 sub compile_enc
 431 {
 432  my ($fh,$name) = @_;
 433  my $e2u = {};
 434  my $u2e = {};
 435
 436  my $type;
 437  while ($type = <$fh>)
 438   {
 439    last if $type !~ /^\s*#/;
 440   }
 441  chomp($type);
 442  return if $type eq 'E';
 443  # Do the hash lookup once, rather than once per function call. 4% speedup.
 444  my $type_func = $encode_types{$type};
 445  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 446  warn "$type encoded $name\n";
 447  my $rep = '';
 448  # Save a defined test by setting these to defined values.
 449  my $min_el = ~0; # A very big integer
 450  my $max_el = 0;  # Anything must be longer than 0
 451  {
 452   my $v = hex($def);
 453   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 454  }
 455  my $errors;
 456  my $seen;
 457  # use -Q to silence the seen test. Makefile.PL uses this by default.
 458  $seen = {} unless $opt{Q};
 459  do
 460   {
 461    my $line = <$fh>;
 462    chomp($line);
 463    my $page = hex($line);
 464    my $ch = 0;
 465    my $i = 16;
 466    do
 467     {
 468      # So why is it 1% faster to leave the my here?
 469      my $line = <$fh>;
 470      $line =~ s/\r\n$/\n/;
 471      die "$.:${line}Line should be exactly 65 characters long including
 472      newline (".length($line).")" unless length ($line) == 65;
 473      # Split line into groups of 4 hex digits, convert groups to ints
 474      # This takes 65.35
 475      # map {hex $_} $line =~ /(....)/g
 476      # This takes 63.75 (2.5% less time)
 477      # unpack "n*", pack "H*", $line
 478      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 479      # Doing it as while ($line =~ /(....)/g) took 74.63
 480      foreach my $val (unpack "n*", pack "H*", $line)
 481       {
 482        next if $val == 0xFFFD;
 483        my $ech = &$type_func($ch,$page);
 484        if ($val || (!$ch && !$page))
 485         {
 486          my $el  = length($ech);
 487          $max_el = $el if $el > $max_el;
 488          $min_el = $el if $el < $min_el;
 489          my $uch = encode_U($val);
 490          if ($seen) {
 491            # We're doing the test.
 492            # We don't need to read this quickly, so storing it as a scalar,
 493            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 494            # RAM and may make us faster on low RAM systems. [see __END__]
 495            if (exists $seen->{$uch})
 496              {
 497                warn sprintf("U%04X is %02X%02X and %04X\n",
 498                             $val,$page,$ch,$seen->{$uch});
 499                $errors++;
 500              }
 501            else
 502              {
 503                $seen->{$uch} = $page << 8 | $ch;
 504              }
 505          }
 506          # Passing 2 extra args each time is 3.6% slower!
 507          # Even with having to add $fallback ||= 0 later
 508          enter_fb0($e2u,$ech,$uch);
 509          enter_fb0($u2e,$uch,$ech);
 510         }
 511        else
 512         {
 513          # No character at this position
 514          # enter($e2u,$ech,undef,$e2u);
 515         }
 516        $ch++;
 517       }
 518     } while --$i;
 519   } while --$pages;
 520  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 521    if $min_el > $max_el;
 522  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 523  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 524 }
 525
 526 # my ($a,$s,$d,$t,$fb) = @_;
 527 sub enter {
 528   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 529   # state we shift to after this (multibyte) input character defaults to same
 530   # as current state.
 531   $next ||= $current;
 532   # Making sure it is defined seems to be faster than {no warnings;} in
 533   # &process, or passing it in as 0 explicity.
 534   # XXX $fallback ||= 0;
 535
 536   # Start at the beginning and work forwards through the string to zero.
 537   # effectively we are removing 1 character from the front each time
 538   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 539   # Hence -$pos is the length of the remaining string.
 540   my $pos = -length $inbytes;
 541   while (1) {
 542     my $byte = substr $inbytes, $pos, 1;
 543     #  RAW_NEXT => 0,
 544     #  RAW_IN_LEN => 1,
 545     #  RAW_OUT_BYTES => 2,
 546     #  RAW_FALLBACK => 3,
 547     # to unicode an array would seem to be better, because the pages are dense.
 548     # from unicode can be very sparse, favouring a hash.
 549     # hash using the bytes (all length 1) as keys rather than ord value,
 550     # as it's easier to sort these in &process.
 551
 552     # It's faster to always add $fallback even if it's undef, rather than
 553     # choosing between 3 and 4 element array. (hence why we set it defined
 554     # above)
 555     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 556     # When $pos was -1 we were at the last input character.
 557     unless (++$pos) {
 558       $do_now->[RAW_OUT_BYTES] = $outbytes;
 559       $do_now->[RAW_NEXT] = $next;
 560       return;
 561     }
 562     # Tail recursion. The intermdiate state may not have a name yet.
 563     $current = $do_now->[RAW_NEXT];
 564   }
 565 }
 566
 567 # This is purely for optimistation. It's just &enter hard coded for $fallback
 568 # of 0, using only a 3 entry array ref to save memory for every entry.
 569 sub enter_fb0 {
 570   my ($current,$inbytes,$outbytes,$next) = @_;
 571   $next ||= $current;
 572
 573   my $pos = -length $inbytes;
 574   while (1) {
 575     my $byte = substr $inbytes, $pos, 1;
 576     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 577     unless (++$pos) {
 578       $do_now->[RAW_OUT_BYTES] = $outbytes;
 579       $do_now->[RAW_NEXT] = $next;
 580       return;
 581     }
 582     $current = $do_now->[RAW_NEXT];
 583   }
 584 }
 585
 586
 587 sub outstring
 588 {
 589  my ($fh,$name,$s) = @_;
 590  my $sym = $strings{$s};
 591  if ($sym)
 592   {
 593    $saved += length($s);
 594   }
 595  else
 596   {
 597    if ($opt{'O'}) {
 598        foreach my $o (keys %strings)
 599         {
 600          next unless (my $i = index($o,$s)) >= 0;
 601          $sym = $strings{$o};
 602          # gcc things that 0x0e+0x10 (anything with e+) starts to look like
 603          # a hexadecimal floating point constant. Silly gcc. Only p
 604          # introduces a floating point constant. Put the space in to stop it
 605          # getting confused.
 606          $sym .= sprintf(" +0x%02x",$i) if ($i);
 607          $subsave += length($s);
 608          return $strings{$s} = $sym;
 609        }
 610    }
 611    $strings{$s} = $sym = $name;
 612    $strings += length($s);
 613    my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
 614    # Maybe we should assert that these are all <256.
 615    $definition .= join(',',unpack "C*",$s);
 616    # We have a single long line. Split it at convenient commas.
 617    $definition =~ s/(.{74,77},)/$1\n/g;
 618    print $fh "$definition };\n\n";
 619   }
 620  return $sym;
 621 }
 622
 623 sub process
 624 {
 625   my ($name,$a) = @_;
 626   $name =~ s/\W+/_/g;
 627   $a->{Cname} = $name;
 628   my $raw = $a->{Raw};
 629   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 630   my @ent;
 631   $agg_max_in = 0;
 632   foreach my $key (sort keys %$raw) {
 633     #  RAW_NEXT => 0,
 634     #  RAW_IN_LEN => 1,
 635     #  RAW_OUT_BYTES => 2,
 636     #  RAW_FALLBACK => 3,
 637     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 638     # Now we are converting from raw to aggregate, switch from 1 byte strings
 639     # to numbers
 640     my $b = ord $key;
 641     $fallback ||= 0;
 642     if ($l &&
 643         # If this == fails, we're going to reset $agg_max_in below anyway.
 644         $b == ++$agg_max_in &&
 645         # References in numeric context give the pointer as an int.
 646         $agg_next == $next &&
 647         $agg_in_len == $in_len &&
 648         $agg_out_len == length $out_bytes &&
 649         $agg_fallback == $fallback
 650         # && length($l->[AGG_OUT_BYTES]) < 16
 651        ) {
 652       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 653       # we can aggregate this byte onto the end.
 654       $l->[AGG_MAX_IN] = $b;
 655       $l->[AGG_OUT_BYTES] .= $out_bytes;
 656     } else {
 657       # AGG_MIN_IN => 0,
 658       # AGG_MAX_IN => 1,
 659       # AGG_OUT_BYTES => 2,
 660       # AGG_NEXT => 3,
 661       # AGG_IN_LEN => 4,
 662       # AGG_OUT_LEN => 5,
 663       # AGG_FALLBACK => 6,
 664       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 665       # (only gains .6% on euc-jp  -- is it worth it?)
 666       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 667                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 668                        $agg_fallback = $fallback];
 669     }
 670     if (exists $next->{Cname}) {
 671       $next->{'Forward'} = 1 if $next != $a;
 672     } else {
 673       process(sprintf("%s_%02x",$name,$b),$next);
 674     }
 675   }
 676   # encengine.c rules say that last entry must be for 255
 677   if ($agg_max_in < 255) {
 678     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 679   }
 680   $a->{'Entries'} = \@ent;
 681 }
 682
 683 sub outtable
 684 {
 685  my ($fh,$a) = @_;
 686  my $name = $a->{'Cname'};
 687  # String tables
 688  foreach my $b (@{$a->{'Entries'}})
 689   {
 690    next unless $b->[AGG_OUT_LEN];
 691    my $s = $b->[AGG_MIN_IN];
 692    my $e = $b->[AGG_MAX_IN];
 693    outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
 694   }
 695  if ($a->{'Forward'})
 696   {
 697    print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 698   }
 699  $a->{'Done'} = 1;
 700  foreach my $b (@{$a->{'Entries'}})
 701   {
 702    my ($s,$e,$out,$t,$end,$l) = @$b;
 703    outtable($fh,$t) unless $t->{'Done'};
 704   }
 705  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 706  foreach my $b (@{$a->{'Entries'}})
 707   {
 708    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 709    $end |= 0x80 if $fb;
 710    print  $fh "{";
 711    if ($l)
 712     {
 713      printf $fh outstring($fh,'',$out);
 714     }
 715    else
 716     {
 717      print  $fh "0";
 718     }
 719    print  $fh ",",$t->{Cname};
 720    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 721   }
 722  print $fh "};\n";
 723 }
 724
 725 sub output
 726 {
 727  my ($fh,$name,$a) = @_;
 728  process($name,$a);
 729  # Sub-tables
 730  outtable($fh,$a);
 731 }
 732
 733 sub output_enc
 734 {
 735  my ($fh,$name,$a) = @_;
 736  die "Changed - fix me for new structure";
 737  foreach my $b (sort keys %$a)
 738   {
 739    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 740   }
 741 }
 742
 743 sub decode_U
 744 {
 745  my $s = shift;
 746 }
 747
 748 my @uname;
 749 sub char_names
 750 {
 751  my $s = do "unicore/Name.pl";
 752  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 753  pos($s) = 0;
 754  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 755   {
 756    my $name = $3;
 757    my $s = hex($1);
 758    last if $s >= 0x10000;
 759    my $e = length($2) ? hex($2) : $s;
 760    for (my $i = $s; $i <= $e; $i++)
 761     {
 762      $uname[$i] = $name;
 763 #    print sprintf("U%04X $name\n",$i);
 764     }
 765   }
 766 }
 767
 768 sub output_ucm_page
 769 {
 770   my ($cmap,$a,$t,$pre) = @_;
 771   # warn sprintf("Page %x\n",$pre);
 772   my $raw = $t->{Raw};
 773   foreach my $key (sort keys %$raw) {
 774     #  RAW_NEXT => 0,
 775     #  RAW_IN_LEN => 1,
 776     #  RAW_OUT_BYTES => 2,
 777     #  RAW_FALLBACK => 3,
 778     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 779     my $u = ord $key;
 780     $fallback ||= 0;
 781
 782     if ($next != $a && $next != $t) {
 783       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 784     } elsif (length $out_bytes) {
 785       if ($pre) {
 786         $u = $pre|($u &0x3f);
 787       }
 788       my $s = sprintf "<U%04X> ",$u;
 789       #foreach my $c (split(//,$out_bytes)) {
 790       #  $s .= sprintf "\\x%02X",ord($c);
 791       #}
 792       # 9.5% faster changing that loop to this:
 793       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 794       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 795       push(@$cmap,$s);
 796     } else {
 797       warn join(',',$u, @{$raw->{$key}},$a,$t);
 798     }
 799   }
 800 }
 801
 802 sub output_ucm
 803 {
 804  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 805  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 806  print $fh "<code_set_name> \"$name\"\n";
 807  char_names();
 808  if (defined $min_el)
 809   {
 810    print $fh "<mb_cur_min> $min_el\n";
 811   }
 812  if (defined $max_el)
 813   {
 814    print $fh "<mb_cur_max> $max_el\n";
 815   }
 816  if (defined $rep)
 817   {
 818    print $fh "<subchar> ";
 819    foreach my $c (split(//,$rep))
 820     {
 821      printf $fh "\\x%02X",ord($c);
 822     }
 823    print $fh "\n";
 824   }
 825  my @cmap;
 826  output_ucm_page(\@cmap,$h,$h,0);
 827  print $fh "#\nCHARMAP\n";
 828  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 829   {
 830    print $fh $line;
 831   }
 832  print $fh "END CHARMAP\n";
 833 }
 834
 835 sub make_makefile_pl
 836 {
 837     eval { require Encode; };
 838     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 839     eval { require File::Basename; };
 840     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
 841     File::Basename->import();
 842     my $inc = dirname($INC{"Encode/Internal.pm"});
 843     my $name = shift;
 844     my $table_files = join(",", map {qq('$_')} @_);
 845     my $now = scalar localtime();
 846     open my $fh, ">Makefile.PL" or die "$!";
 847     print $fh <<"END_OF_HEADER";
 848 #
 849 # This file is auto-generated by:
 850 # $0
 851 # $now
 852 #
 853 use 5.7.2;
 854 use strict;
 855 use ExtUtils::MakeMaker;
 856
 857 # Please edit the following to the taste!
 858 my \$name = '$name';
 859 my \%tables = (
 860               encode_t   => [ $table_files ],
 861              );
 862
 863 # And leave the rest!
 864 my \$enc2xs = '$0';
 865 WriteMakefile(
 866               INC               => "-I$inc",
 867 END_OF_HEADER
 868
 869     print $fh <<'END_OF_MAKEFILE_PL';
 870               NAME              => 'Encode::'.$name,
 871               VERSION_FROM      => "$name.pm",
 872               OBJECT            => '$(O_FILES)',
 873               'dist'            => {
 874                   COMPRESS      => 'gzip -9f',
 875                   SUFFIX        => 'gz',
 876                   DIST_DEFAULT => 'all tardist',
 877               },
 878               MAN3PODS  => {},
 879               # OS 390 winges about line numbers > 64K ???
 880               XSOPT => '-nolinenumbers',
 881               );
 882
 883 package MY;
 884
 885 sub post_initialize
 886 {
 887     my ($self) = @_;
 888     my %o;
 889     my $x = $self->{'OBJ_EXT'};
 890     # Add the table O_FILES
 891     foreach my $e (keys %tables)
 892     {
 893         $o{$e.$x} = 1;
 894     }
 895     $o{"$name$x"} = 1;
 896     $self->{'O_FILES'} = [sort keys %o];
 897     my @files = ("$name.xs");
 898     $self->{'C'} = ["$name.c"];
 899     # $self->{'H'} = [$self->catfile($self->updir,'encode.h')];
 900     my %xs;
 901     foreach my $table (keys %tables) {
 902         push (@{$self->{'C'}},"$table.c");
 903         # Do NOT add $table.h etc. to H_FILES unless we own up as to how they
 904         # get built.
 905         foreach my $ext (qw($(OBJ_EXT) .c .h _def.h .fnm)) {
 906             push (@files,$table.$ext);
 907         }
 908     }
 909     $self->{'XS'} = { "$name.xs" => "$name.c" };
 910     $self->{'clean'}{'FILES'} .= join(' ',@files);
 911     open(XS,">$name.xs") || die "Cannot open $name.xs:$!";
 912     print XS <<'END';
 913 #include <EXTERN.h>
 914 #include <perl.h>
 915 #include <XSUB.h>
 916 #define U8 U8
 917 #include "encode.h"
 918 END
 919     foreach my $table (keys %tables) {
 920         print XS qq[#include "${table}.h"\n];
 921     }
 922     print XS <<"END";
 923
 924 static void
 925 Encode_XSEncoding(pTHX_ encode_t *enc)
 926 {
 927  dSP;
 928  HV *stash = gv_stashpv("Encode::XS", TRUE);
 929  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 930  int i = 0;
 931  PUSHMARK(sp);
 932  XPUSHs(sv);
 933  while (enc->name[i])
 934   {
 935    const char *name = enc->name[i++];
 936    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 937   }
 938  PUTBACK;
 939  call_pv("Encode::define_encoding",G_DISCARD);
 940  SvREFCNT_dec(sv);
 941 }
 942
 943 MODULE = Encode::$name  PACKAGE = Encode::$name
 944 PROTOTYPES: DISABLE
 945 BOOT:
 946 {
 947 END
 948     foreach my $table (keys %tables) {
 949         print XS qq[#include "${table}_def.h"\n];
 950     }
 951     print XS "}\n";
 952     close(XS);
 953     return "# Built $name.xs\n\n";
 954 }
 955
 956 sub postamble
 957 {
 958     my $self = shift;
 959     my $dir  = "."; # $self->catdir('Encode');
 960     my $str  = "# $name\$(OBJ_EXT) depends on .h and _def.h files not .c files - but all written by enc2xs\n";
 961     $str    .= "$name.c : $name.xs ";
 962     foreach my $table (keys %tables)
 963     {
 964         $str .= " $table.c";
 965     }
 966     $str .= "\n\n";
 967     $str .= "$name\$(OBJ_EXT) : $name.c\n\n";
 968
 969     foreach my $table (keys %tables)
 970     {
 971         my $numlines = 1;
 972         my $lengthsofar = length($str);
 973         my $continuator = '';
 974         $str .= "$table.c : Makefile.PL";
 975         foreach my $file (@{$tables{$table}})
 976         {
 977             $str .= $continuator.' '.$self->catfile($dir,$file);
 978             if ( length($str)-$lengthsofar > 128*$numlines )
 979             {
 980                 $continuator .= " \\\n\t";
 981                 $numlines++;
 982             } else {
 983                 $continuator = '';
 984             }
 985         }
 986         $str .= $^O eq 'VMS' # In VMS quote to preserve case
 987             ? qq{\n\t\$(PERL) $enc2xs -"Q" -"O" -o \$\@ -f $table.fnm\n\n}
 988             : qq{\n\t\$(PERL) $enc2xs -Q -O -o \$\@ -f $table.fnm\n\n};
 989         open (FILELIST, ">$table.fnm")
 990             || die "Could not open $table.fnm: $!";
 991         foreach my $file (@{$tables{$table}})
 992         {
 993             print FILELIST $self->catfile($dir,$file) . "\n";
 994         }
 995         close(FILELIST);
 996     }
 997     return $str;
 998 }
 999 END_OF_MAKEFILE_PL
1000     close $fh;
1001     (my $pm =<<"END_OF_PM") =~ s/^# //gm;
1002 # package Encode::$name;
1003 # our \$VERSION = "0.01";
1004 #
1005 # use Encode;
1006 # use XSLoader;
1007 # XSLoader::load('Encode::$name', \$VERSION);
1008 #
1009 # 1;
1010 # __END__
1011 #
1012 # =head1 NAME
1013 #
1014 # Encode::$name - New Encoding
1015 #
1016 # =head1 SYNOPSIS
1017 #
1018 # You got to fill this in!
1019 #
1020 # =head1 SEE ALSO
1021 #
1022 # L<Encode>
1023 #
1024 # =cut
1025 END_OF_PM
1026     open $fh, ">$name.pm" or die "$name.pm:$!";
1027     print $fh $pm;
1028     close $fh;
1029     -d 't' or mkdir 't', 0755 or die "mkdir t:$!";
1030     open $fh, ">t/$name.t" or die "t/$name.t:$!";
1031 print $fh <<"END_OF_TEST";
1032 use strict;
1033 # Adjust the number here!
1034 use Test::More tests => 2;
1035
1036 use_ok('Encode');
1037 use_ok('Encode::$name');
1038 # Add more test here!
1039 END_OF_TEST
1040     close $fh;
1041     exit;
1042 }
1043
1044 __END__
1045
1046 =head1 NAME
1047
1048 enc2xs -- Perl Encode Module Generator
1049
1050 =head1 SYNOPSIS
1051
1052   enc2xs -M ModName mapfiles...
1053   enc2xs -[options]
1054
1055 =head1 DESCRIPTION
1056
1057 F<enc2xs> builds a Perl extension for use by Encode from either
1058 Unicode Character Mapping files (.ucm) or Tcl Encoding Files
1059 (.enc)  Besides internally used during the build process of Encode
1060 module, you can use F<enc2xs> to add your own encoding to perl.  No
1061 knowledge on XS is necessary.
1062
1063 =head1 Quick Guide
1064
1065 If what you want to know as little about Perl possible but needs to
1066 add a new encoding, just read this chapter and forget the rest.
1067
1068 =over 4
1069
1070 =item 0.
1071
1072 Have a .ucm file ready.  You can get it from somewhere or you can
1073 write your own from scratch or you can grab one from Encode
1074 distribution and customize.  For UCM format, see the next Chapter.
1075 In the example below, I'll call my theoretical encoding myascii,
1076 defined inI<my.ucm>.  C<$> is a shell prompt.
1077
1078   $ ls -F
1079   my.ucm
1080
1081 =item 1.
1082
1083 Issue a command as follows;
1084
1085   $ enc2xs -M My my.ucm
1086
1087 Now take a look at your current directory.  It should look like this.
1088
1089   $ ls -F
1090   Makefile.PL   My.pm         my.ucm        t/
1091
1092 The following files are created.
1093
1094   Makefle.PL - MakeMaker script
1095   My.pm      - Encode Submodule
1096   t/My.t     - test file
1097
1098 =item 1.1.
1099
1100 If you want *.ucm installed together with the modules, do as follows;
1101
1102   $ mkdir Encode
1103   $ mv *.ucm Encode
1104   $ enc2xs -M My Encode/*ucm
1105
1106 =item 2.
1107
1108 Edit the files generated.  You don't have to if you have no time AND no
1109 intention to give it to someone else.  But it is a good idea to edit
1110 pod and add more tests.
1111
1112 =item 3.
1113
1114 Now issue a command all Perl Mongers love;
1115
1116   $ perl5.7.3 Makefile.PL
1117   Writing Makefile for Encode::My
1118
1119 =item 4.
1120
1121 Now all you have to do is make.
1122
1123   $ make
1124   cp My.pm blib/lib/Encode/My.pm
1125   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1126     -o encode_t.c -f encode_t.fnm
1127   Reading myascii (myascii)
1128   Writing compiled form
1129   128 bytes in string tables
1130   384 bytes (25%) saved spotting duplicates
1131   1 bytes (99.2%) saved using substrings
1132   ....
1133   chmod 644 blib/arch/auto/Encode/My/My.bs
1134   $
1135
1136 The time it takes varies how fast your machine is and how large your
1137 encoding is.  Unless you are working on something big like euc-tw, it
1138 won't take too long.
1139
1140 =item 5.
1141
1142 You can "make install" already but you should test first.
1143
1144   $ make test
1145   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1146     -e 'use Test::Harness  qw(&runtests $verbose); \
1147     $verbose=0; runtests @ARGV;' t/*.t
1148   t/My....ok
1149   All tests successful.
1150   Files=1, Tests=2,  0 wallclock secs
1151    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1152
1153 =item 6.
1154
1155 If you are content with the test result, just "make install"
1156
1157 =back
1158
1159 =head1 The Unicode Character Map
1160
1161 Encode uses The Unicode Character Map (UCM) for source character
1162 mappings.  This format is used by ICU package of IBM and adopted by
1163 Nick Ing-Simmons.  Since UCM is more flexible than Tcl's Encoding Map
1164 and far more user-friendly,  This is the recommended formet for
1165 Encode now.
1166
1167 UCM file looks like this.
1168
1169   #
1170   # Comments
1171   #
1172   <code_set_name> "US-ascii" # Required
1173   <code_set_alias> "ascii"   # Optional
1174   <mb_cur_min> 1             # Required; usually 1
1175   <mb_cur_max> 1             # Max. # of bytes/char
1176   <subchar> \x3F             # Substitution char
1177   #
1178   CHARMAP
1179   <U0000> \x00 |0 # <control>
1180   <U0001> \x01 |0 # <control>
1181   <U0002> \x02 |0 # <control>
1182   ....
1183   <U007C> \x7C |0 # VERTICAL LINE
1184   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1185   <U007E> \x7E |0 # TILDE
1186   <U007F> \x7F |0 # <control>
1187   END CHARMAP
1188
1189 =over 4
1190
1191 =item *
1192
1193 Anything that follows C<#> is treated as comments.
1194
1195 =item *
1196
1197 The header section continues until CHARMAP. This section Has a form of
1198 I<E<lt>keywordE<gt> value>, one at a line.  For a value, strings must
1199 be quoted. Barewords are treated as numbers.  I<\xXX> represents a
1200 byte.
1201
1202 Most of the keywords are self-explanatory. I<subchar> means
1203 substitution character, not subcharacter.  When you decode a Unicode
1204 sequence to this encoding but no matching character is found, the byte
1205 sequence defined here will be used.  For most cases, the value here is
1206 \x3F, in ASCII this is a question mark.
1207
1208 =item *
1209
1210 CHARMAP starts the character map section.  Each line has a form as
1211 follows;
1212
1213   <UXXXX> \xXX.. |0 # comment
1214     ^     ^      ^
1215     |     |      +- Fallback flag
1216     |     +-------- Encoded byte sequence
1217     +-------------- Unicode Character ID in hex
1218
1219 The format is roughly the same as a header section except for fallback
1220 flag.  It is | followed by 0..3.   And their meaning as follows
1221
1222 =over 2
1223
1224 =item |0
1225
1226 Round trip safe.   A character decoded to Unicode encodes back to the
1227 same byte sequence. most character belong to this.
1228
1229 =item |1
1230
1231 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1232 character for encode map only
1233
1234 =item |2
1235
1236 Skip sub-char mapping should there be no code point.
1237
1238 =item |3
1239
1240 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1241 character for decode map only
1242
1243 =back
1244
1245 =item *
1246
1247 And finally, END OF CHARMAP ends the section.
1248
1249 =back
1250
1251 Needless to say, if you are manually creating a UCM file, you should
1252 copy ascii.ucm or existing encoding which is close to yours than write
1253 your own from scratch.
1254
1255 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1256 is, unless your environment is on EBCDIC.
1257
1258 B<CAVEAT>: not all features in UCM are implemented.  For example,
1259 icu:state is not used.  Because of that, you need to write a perl
1260 module if you want to support algorithmical encodings, notablly
1261 ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1262 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1263
1264 =head1 Bookmarks
1265
1266 ICU Home Page
1267 L<http://oss.software.ibm.com/icu/>
1268
1269 ICU Character Mapping Tables
1270 L<http://oss.software.ibm.com/icu/charset/>
1271
1272 ICU:Conversion Data
1273 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1274
1275 =head1 SEE ALSO
1276
1277 L<Encode>,
1278 L<perlmod>,
1279 L<perlpod>
1280
1281 =cut
1282
1283 # -Q to disable the duplicate codepoint test
1284 # -S make mapping errors fatal
1285 # -q to remove comments written to output files
1286 # -O to enable the (brute force) substring optimiser
1287 # -o <output> to specify the output file name (else it's the first arg)
1288 # -f <inlist> to give a file with a list of input files (else use the args)
1289 # -n <name> to name the encoding (else use the basename of the input file.
1290
1291 With %seen holding array refs:
1292
1293       865.66 real        28.80 user         8.79 sys
1294       7904  maximum resident set size
1295       1356  average shared memory size
1296      18566  average unshared data size
1297        229  average unshared stack size
1298      46080  page reclaims
1299      33373  page faults
1300
1301 With %seen holding simple scalars:
1302
1303       342.16 real        27.11 user         3.54 sys
1304       8388  maximum resident set size
1305       1394  average shared memory size
1306      14969  average unshared data size
1307        236  average unshared stack size
1308      28159  page reclaims
1309       9839  page faults
1310
1311 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1312 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1313 RAM machine, but it's going to help even on modern machines.
1314 Swapping is bad, m'kay :-)