ext/Encode/bin/enc2xs

   1 #!./perl
   2 BEGIN {
   3     # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
   4     # with $ENV{PERL_CORE} set
   5     # In case we need it in future...
   6     require Config; import Config;
   7 }
   8 use strict;
   9 use Getopt::Std;
  10 my @orig_ARGV = @ARGV;
  11 our $VERSION  = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  12
  13 # These may get re-ordered.
  14 # RAW is a do_now as inserted by &enter
  15 # AGG is an aggreagated do_now, as built up by &process
  16
  17 use constant {
  18   RAW_NEXT => 0,
  19   RAW_IN_LEN => 1,
  20   RAW_OUT_BYTES => 2,
  21   RAW_FALLBACK => 3,
  22
  23   AGG_MIN_IN => 0,
  24   AGG_MAX_IN => 1,
  25   AGG_OUT_BYTES => 2,
  26   AGG_NEXT => 3,
  27   AGG_IN_LEN => 4,
  28   AGG_OUT_LEN => 5,
  29   AGG_FALLBACK => 6,
  30 };
  31
  32 # (See the algorithm in encengine.c - we're building structures for it)
  33
  34 # There are two sorts of structures.
  35 # "do_now" (an array, two variants of what needs storing) is whatever we need
  36 # to do now we've read an input byte.
  37 # It's housed in a "do_next" (which is how we got to it), and in turn points
  38 # to a "do_next" which contains all the "do_now"s for the next input byte.
  39
  40 # There will be a "do_next" which is the start state.
  41 # For a single byte encoding it's the only "do_next" - each "do_now" points
  42 # back to it, and each "do_now" will cause bytes. There is no state.
  43
  44 # For a multi-byte encoding where all characters in the input are the same
  45 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  46 # branching out from the start state, one step for each input byte.
  47 # The leaf "do_now"s will all be at the same distance from the start state,
  48 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  49 # the start state.
  50
  51 # For an encoding where there are varaible length input byte sequences, you
  52 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  53 # as before the leaves will point back to the start state.
  54
  55 # The system will cope with escape encodings (imagine them as a mostly
  56 # self-contained tree for each escape state, and cross links between trees
  57 # at the state-switching characters) but so far no input format defines these.
  58
  59 # The system will also cope with having output "leaves" in the middle of
  60 # the bifurcating branches, not just at the extremities, but again no
  61 # input format does this yet.
  62
  63 # There are two variants of the "do_now" structure. The first, smaller variant
  64 # is generated by &enter as the input file is read. There is one structure
  65 # for each input byte. Say we are mapping a single byte encoding to a
  66 # single byte encoding, with  "ABCD" going "abcd". There will be
  67 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  68
  69 # &process then walks the tree, building aggregate "do_now" structres for
  70 # adjacent bytes where possible. The aggregate is for a contiguous range of
  71 # bytes which each produce the same length of output, each move to the
  72 # same next state, and each have the same fallback flag.
  73 # So our 4 RAW "do_now"s above become replaced by a single structure
  74 # containing:
  75 # ["A", "D", "abcd", 1, ...]
  76 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  77 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  78 # which maps very nicely into pointer arithmetic in C for encengine.c
  79
  80 sub encode_U
  81 {
  82  # UTF-8 encode long hand - only covers part of perl's range
  83  ## my $uv = shift;
  84  # chr() works in native space so convert value from table
  85  # into that space before using chr().
  86  my $ch = chr(utf8::unicode_to_native($_[0]));
  87  # Now get core perl to encode that the way it likes.
  88  utf8::encode($ch);
  89  return $ch;
  90 }
  91
  92 sub encode_S
  93 {
  94  # encode single byte
  95  ## my ($ch,$page) = @_; return chr($ch);
  96  return chr $_[0];
  97 }
  98
  99 sub encode_D
 100 {
 101  # encode double byte MS byte first
 102  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 103  return chr ($_[1]) . chr $_[0];
 104 }
 105
 106 sub encode_M
 107 {
 108  # encode Multi-byte - single for 0..255 otherwise double
 109  ## my ($ch,$page) = @_;
 110  ## return &encode_D if $page;
 111  ## return &encode_S;
 112  return chr ($_[1]) . chr $_[0] if $_[1];
 113  return chr $_[0];
 114 }
 115
 116 my %encode_types = (U => \&encode_U,
 117                     S => \&encode_S,
 118                     D => \&encode_D,
 119                     M => \&encode_M,
 120                    );
 121
 122 # Win32 does not expand globs on command line
 123 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 124
 125 my %opt;
 126 # I think these are:
 127 # -Q to disable the duplicate codepoint test
 128 # -S make mapping errors fatal
 129 # -q to remove comments written to output files
 130 # -O to enable the (brute force) substring optimiser
 131 # -o <output> to specify the output file name (else it's the first arg)
 132 # -f <inlist> to give a file with a list of input files (else use the args)
 133 # -n <name> to name the encoding (else use the basename of the input file.
 134 getopts('CM:SQqOo:f:n:',\%opt);
 135
 136 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 137 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
 138
 139 # This really should go first, else the die here causes empty (non-erroneous)
 140 # output files to be written.
 141 my @encfiles;
 142 if (exists $opt{'f'}) {
 143     # -F is followed by name of file containing list of filenames
 144     my $flist = $opt{'f'};
 145     open(FLIST,$flist) || die "Cannot open $flist:$!";
 146     chomp(@encfiles = <FLIST>);
 147     close(FLIST);
 148 } else {
 149     @encfiles = @ARGV;
 150 }
 151
 152 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 153 chmod(0666,$cname) if -f $cname && !-w $cname;
 154 open(C,">$cname") || die "Cannot open $cname:$!";
 155
 156 my $dname = $cname;
 157 my $hname = $cname;
 158
 159 my ($doC,$doEnc,$doUcm,$doPet);
 160
 161 if ($cname =~ /\.(c|xs)$/)
 162  {
 163   $doC = 1;
 164   $dname =~ s/(\.[^\.]*)?$/.exh/;
 165   chmod(0666,$dname) if -f $cname && !-w $dname;
 166   open(D,">$dname") || die "Cannot open $dname:$!";
 167   $hname =~ s/(\.[^\.]*)?$/.h/;
 168   chmod(0666,$hname) if -f $cname && !-w $hname;
 169   open(H,">$hname") || die "Cannot open $hname:$!";
 170
 171   foreach my $fh (\*C,\*D,\*H)
 172   {
 173    print $fh <<"END" unless $opt{'q'};
 174 /*
 175  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 176  This file was autogenerated by:
 177  $^X $0 @orig_ARGV
 178 */
 179 END
 180   }
 181
 182   if ($cname =~ /(\w+)\.xs$/)
 183    {
 184     print C "#include <EXTERN.h>\n";
 185     print C "#include <perl.h>\n";
 186     print C "#include <XSUB.h>\n";
 187     print C "#define U8 U8\n";
 188    }
 189   print C "#include \"encode.h\"\n";
 190
 191  }
 192 elsif ($cname =~ /\.enc$/)
 193  {
 194   $doEnc = 1;
 195  }
 196 elsif ($cname =~ /\.ucm$/)
 197  {
 198   $doUcm = 1;
 199  }
 200 elsif ($cname =~ /\.pet$/)
 201  {
 202   $doPet = 1;
 203  }
 204
 205 my %encoding;
 206 my %strings;
 207 my $saved = 0;
 208 my $subsave = 0;
 209 my $strings = 0;
 210
 211 sub cmp_name
 212 {
 213  if ($a =~ /^.*-(\d+)/)
 214   {
 215    my $an = $1;
 216    if ($b =~ /^.*-(\d+)/)
 217     {
 218      my $r = $an <=> $1;
 219      return $r if $r;
 220     }
 221   }
 222  return $a cmp $b;
 223 }
 224
 225
 226 foreach my $enc (sort cmp_name @encfiles)
 227  {
 228   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 229   $name = $opt{'n'} if exists $opt{'n'};
 230   if (open(E,$enc))
 231    {
 232     if ($sfx eq 'enc')
 233      {
 234       compile_enc(\*E,lc($name));
 235      }
 236     else
 237      {
 238       compile_ucm(\*E,lc($name));
 239      }
 240    }
 241   else
 242    {
 243     warn "Cannot open $enc for $name:$!";
 244    }
 245  }
 246
 247 if ($doC)
 248  {
 249   print STDERR "Writing compiled form\n";
 250   foreach my $name (sort cmp_name keys %encoding)
 251    {
 252     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 253     output(\*C,$name.'_utf8',$e2u);
 254     output(\*C,'utf8_'.$name,$u2e);
 255     # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 256    }
 257   foreach my $enc (sort cmp_name keys %encoding)
 258    {
 259     # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 260     my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
 261     #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 262     my $replen = 0;
 263     $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
 264     my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
 265     my $sym = "${enc}_encoding";
 266     $sym =~ s/\W+/_/g;
 267     print C "encode_t $sym = \n";
 268     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 269    }
 270
 271   foreach my $enc (sort cmp_name keys %encoding)
 272    {
 273     my $sym = "${enc}_encoding";
 274     $sym =~ s/\W+/_/g;
 275     print H "extern encode_t $sym;\n";
 276     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 277    }
 278
 279   if ($cname =~ /(\w+)\.xs$/)
 280    {
 281     my $mod = $1;
 282     print C <<'END';
 283
 284 static void
 285 Encode_XSEncoding(pTHX_ encode_t *enc)
 286 {
 287  dSP;
 288  HV *stash = gv_stashpv("Encode::XS", TRUE);
 289  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 290  int i = 0;
 291  PUSHMARK(sp);
 292  XPUSHs(sv);
 293  while (enc->name[i])
 294   {
 295    const char *name = enc->name[i++];
 296    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 297   }
 298  PUTBACK;
 299  call_pv("Encode::define_encoding",G_DISCARD);
 300  SvREFCNT_dec(sv);
 301 }
 302
 303 END
 304
 305     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 306     print C "BOOT:\n{\n";
 307     print C "#include \"$dname\"\n";
 308     print C "}\n";
 309    }
 310   # Close in void context is bad, m'kay
 311   close(D) or warn "Error closing '$dname': $!";
 312   close(H) or warn "Error closing '$hname': $!";
 313
 314   my $perc_saved    = $strings/($strings + $saved) * 100;
 315   my $perc_subsaved = $strings/($strings + $subsave) * 100;
 316   printf STDERR "%d bytes in string tables\n",$strings;
 317   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 318     $saved, $perc_saved              if $saved;
 319   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 320     $subsave, $perc_subsaved         if $subsave;
 321  }
 322 elsif ($doEnc)
 323  {
 324   foreach my $name (sort cmp_name keys %encoding)
 325    {
 326     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 327     output_enc(\*C,$name,$e2u);
 328    }
 329  }
 330 elsif ($doUcm)
 331  {
 332   foreach my $name (sort cmp_name keys %encoding)
 333    {
 334     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 335     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 336    }
 337  }
 338
 339 # writing half meg files and then not checking to see if you just filled the
 340 # disk is bad, m'kay
 341 close(C) or die "Error closing '$cname': $!";
 342
 343 # End of the main program.
 344
 345 sub compile_ucm
 346 {
 347  my ($fh,$name) = @_;
 348  my $e2u = {};
 349  my $u2e = {};
 350  my $cs;
 351  my %attr;
 352  while (<$fh>)
 353   {
 354    s/#.*$//;
 355    last if /^\s*CHARMAP\s*$/i;
 356    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 357     {
 358      $attr{$1} = $2;
 359     }
 360   }
 361  if (!defined($cs =  $attr{'code_set_name'}))
 362   {
 363    warn "No <code_set_name> in $name\n";
 364   }
 365  else
 366   {
 367    $name = $cs unless exists $opt{'n'};
 368   }
 369  my $erep;
 370  my $urep;
 371  my $max_el;
 372  my $min_el;
 373  if (exists $attr{'subchar'})
 374   {
 375    #my @byte;
 376    #$attr{'subchar'} =~ /^\s*/cg;
 377    #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 378    #$erep = join('',map(chr(hex($_)),@byte));
 379    $erep = $attr{'subchar'};
 380    $erep =~ s/^\s+//; $erep =~ s/\s+$//;
 381   }
 382  print "Reading $name ($cs)\n";
 383  my $nfb = 0;
 384  my $hfb = 0;
 385  while (<$fh>)
 386   {
 387    s/#.*$//;
 388    last if /^\s*END\s+CHARMAP\s*$/i;
 389    next if /^\s*$/;
 390    my (@uni, @byte) = ();
 391    my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
 392        or die "Bad line: $_";
 393    while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
 394        push @uni, map { substr($_, 1) } split(/\+/, $1);
 395    }
 396    while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
 397        push @byte, $1;
 398    }
 399    if (@uni)
 400     {
 401      my $uch =  join('', map { encode_U(hex($_)) } @uni );
 402      my $ech = join('',map(chr(hex($_)),@byte));
 403      my $el  = length($ech);
 404      $max_el = $el if (!defined($max_el) || $el > $max_el);
 405      $min_el = $el if (!defined($min_el) || $el < $min_el);
 406      if (length($fb))
 407       {
 408        $fb = substr($fb,1);
 409        $hfb++;
 410       }
 411      else
 412       {
 413        $nfb++;
 414        $fb = '0';
 415       }
 416      # $fb is fallback flag
 417      # 0 - round trip safe
 418      # 1 - fallback for unicode -> enc
 419      # 2 - skip sub-char mapping
 420      # 3 - fallback enc -> unicode
 421      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 422      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 423     }
 424    else
 425     {
 426      warn $_;
 427     }
 428   }
 429  if ($nfb && $hfb)
 430   {
 431    die "$nfb entries without fallback, $hfb entries with\n";
 432   }
 433  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 434 }
 435
 436
 437
 438 sub compile_enc
 439 {
 440  my ($fh,$name) = @_;
 441  my $e2u = {};
 442  my $u2e = {};
 443
 444  my $type;
 445  while ($type = <$fh>)
 446   {
 447    last if $type !~ /^\s*#/;
 448   }
 449  chomp($type);
 450  return if $type eq 'E';
 451  # Do the hash lookup once, rather than once per function call. 4% speedup.
 452  my $type_func = $encode_types{$type};
 453  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 454  warn "$type encoded $name\n";
 455  my $rep = '';
 456  # Save a defined test by setting these to defined values.
 457  my $min_el = ~0; # A very big integer
 458  my $max_el = 0;  # Anything must be longer than 0
 459  {
 460   my $v = hex($def);
 461   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 462  }
 463  my $errors;
 464  my $seen;
 465  # use -Q to silence the seen test. Makefile.PL uses this by default.
 466  $seen = {} unless $opt{Q};
 467  do
 468   {
 469    my $line = <$fh>;
 470    chomp($line);
 471    my $page = hex($line);
 472    my $ch = 0;
 473    my $i = 16;
 474    do
 475     {
 476      # So why is it 1% faster to leave the my here?
 477      my $line = <$fh>;
 478      $line =~ s/\r\n$/\n/;
 479      die "$.:${line}Line should be exactly 65 characters long including
 480      newline (".length($line).")" unless length ($line) == 65;
 481      # Split line into groups of 4 hex digits, convert groups to ints
 482      # This takes 65.35
 483      # map {hex $_} $line =~ /(....)/g
 484      # This takes 63.75 (2.5% less time)
 485      # unpack "n*", pack "H*", $line
 486      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 487      # Doing it as while ($line =~ /(....)/g) took 74.63
 488      foreach my $val (unpack "n*", pack "H*", $line)
 489       {
 490        next if $val == 0xFFFD;
 491        my $ech = &$type_func($ch,$page);
 492        if ($val || (!$ch && !$page))
 493         {
 494          my $el  = length($ech);
 495          $max_el = $el if $el > $max_el;
 496          $min_el = $el if $el < $min_el;
 497          my $uch = encode_U($val);
 498          if ($seen) {
 499            # We're doing the test.
 500            # We don't need to read this quickly, so storing it as a scalar,
 501            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 502            # RAM and may make us faster on low RAM systems. [see __END__]
 503            if (exists $seen->{$uch})
 504              {
 505                warn sprintf("U%04X is %02X%02X and %04X\n",
 506                             $val,$page,$ch,$seen->{$uch});
 507                $errors++;
 508              }
 509            else
 510              {
 511                $seen->{$uch} = $page << 8 | $ch;
 512              }
 513          }
 514          # Passing 2 extra args each time is 3.6% slower!
 515          # Even with having to add $fallback ||= 0 later
 516          enter_fb0($e2u,$ech,$uch);
 517          enter_fb0($u2e,$uch,$ech);
 518         }
 519        else
 520         {
 521          # No character at this position
 522          # enter($e2u,$ech,undef,$e2u);
 523         }
 524        $ch++;
 525       }
 526     } while --$i;
 527   } while --$pages;
 528  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 529    if $min_el > $max_el;
 530  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 531  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 532 }
 533
 534 # my ($a,$s,$d,$t,$fb) = @_;
 535 sub enter {
 536   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 537   # state we shift to after this (multibyte) input character defaults to same
 538   # as current state.
 539   $next ||= $current;
 540   # Making sure it is defined seems to be faster than {no warnings;} in
 541   # &process, or passing it in as 0 explicity.
 542   # XXX $fallback ||= 0;
 543
 544   # Start at the beginning and work forwards through the string to zero.
 545   # effectively we are removing 1 character from the front each time
 546   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 547   # Hence -$pos is the length of the remaining string.
 548   my $pos = -length $inbytes;
 549   while (1) {
 550     my $byte = substr $inbytes, $pos, 1;
 551     #  RAW_NEXT => 0,
 552     #  RAW_IN_LEN => 1,
 553     #  RAW_OUT_BYTES => 2,
 554     #  RAW_FALLBACK => 3,
 555     # to unicode an array would seem to be better, because the pages are dense.
 556     # from unicode can be very sparse, favouring a hash.
 557     # hash using the bytes (all length 1) as keys rather than ord value,
 558     # as it's easier to sort these in &process.
 559
 560     # It's faster to always add $fallback even if it's undef, rather than
 561     # choosing between 3 and 4 element array. (hence why we set it defined
 562     # above)
 563     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 564     # When $pos was -1 we were at the last input character.
 565     unless (++$pos) {
 566       $do_now->[RAW_OUT_BYTES] = $outbytes;
 567       $do_now->[RAW_NEXT] = $next;
 568       return;
 569     }
 570     # Tail recursion. The intermdiate state may not have a name yet.
 571     $current = $do_now->[RAW_NEXT];
 572   }
 573 }
 574
 575 # This is purely for optimistation. It's just &enter hard coded for $fallback
 576 # of 0, using only a 3 entry array ref to save memory for every entry.
 577 sub enter_fb0 {
 578   my ($current,$inbytes,$outbytes,$next) = @_;
 579   $next ||= $current;
 580
 581   my $pos = -length $inbytes;
 582   while (1) {
 583     my $byte = substr $inbytes, $pos, 1;
 584     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 585     unless (++$pos) {
 586       $do_now->[RAW_OUT_BYTES] = $outbytes;
 587       $do_now->[RAW_NEXT] = $next;
 588       return;
 589     }
 590     $current = $do_now->[RAW_NEXT];
 591   }
 592 }
 593
 594
 595 sub outstring
 596 {
 597  my ($fh,$name,$s) = @_;
 598  my $sym = $strings{$s};
 599  if ($sym)
 600   {
 601    $saved += length($s);
 602   }
 603  else
 604   {
 605    if ($opt{'O'}) {
 606        foreach my $o (keys %strings)
 607         {
 608          next unless (my $i = index($o,$s)) >= 0;
 609          $sym = $strings{$o};
 610          # gcc things that 0x0e+0x10 (anything with e+) starts to look like
 611          # a hexadecimal floating point constant. Silly gcc. Only p
 612          # introduces a floating point constant. Put the space in to stop it
 613          # getting confused.
 614          $sym .= sprintf(" +0x%02x",$i) if ($i);
 615          $subsave += length($s);
 616          return $strings{$s} = $sym;
 617        }
 618    }
 619    $strings{$s} = $sym = $name;
 620    $strings += length($s);
 621    my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
 622    # Maybe we should assert that these are all <256.
 623    $definition .= join(',',unpack "C*",$s);
 624    # We have a single long line. Split it at convenient commas.
 625    $definition =~ s/(.{74,77},)/$1\n/g;
 626    print $fh "$definition };\n\n";
 627   }
 628  return $sym;
 629 }
 630
 631 sub process
 632 {
 633   my ($name,$a) = @_;
 634   $name =~ s/\W+/_/g;
 635   $a->{Cname} = $name;
 636   my $raw = $a->{Raw};
 637   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 638   my @ent;
 639   $agg_max_in = 0;
 640   foreach my $key (sort keys %$raw) {
 641     #  RAW_NEXT => 0,
 642     #  RAW_IN_LEN => 1,
 643     #  RAW_OUT_BYTES => 2,
 644     #  RAW_FALLBACK => 3,
 645     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 646     # Now we are converting from raw to aggregate, switch from 1 byte strings
 647     # to numbers
 648     my $b = ord $key;
 649     $fallback ||= 0;
 650     if ($l &&
 651         # If this == fails, we're going to reset $agg_max_in below anyway.
 652         $b == ++$agg_max_in &&
 653         # References in numeric context give the pointer as an int.
 654         $agg_next == $next &&
 655         $agg_in_len == $in_len &&
 656         $agg_out_len == length $out_bytes &&
 657         $agg_fallback == $fallback
 658         # && length($l->[AGG_OUT_BYTES]) < 16
 659        ) {
 660       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 661       # we can aggregate this byte onto the end.
 662       $l->[AGG_MAX_IN] = $b;
 663       $l->[AGG_OUT_BYTES] .= $out_bytes;
 664     } else {
 665       # AGG_MIN_IN => 0,
 666       # AGG_MAX_IN => 1,
 667       # AGG_OUT_BYTES => 2,
 668       # AGG_NEXT => 3,
 669       # AGG_IN_LEN => 4,
 670       # AGG_OUT_LEN => 5,
 671       # AGG_FALLBACK => 6,
 672       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 673       # (only gains .6% on euc-jp  -- is it worth it?)
 674       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 675                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 676                        $agg_fallback = $fallback];
 677     }
 678     if (exists $next->{Cname}) {
 679       $next->{'Forward'} = 1 if $next != $a;
 680     } else {
 681       process(sprintf("%s_%02x",$name,$b),$next);
 682     }
 683   }
 684   # encengine.c rules say that last entry must be for 255
 685   if ($agg_max_in < 255) {
 686     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 687   }
 688   $a->{'Entries'} = \@ent;
 689 }
 690
 691 sub outtable
 692 {
 693  my ($fh,$a) = @_;
 694  my $name = $a->{'Cname'};
 695  # String tables
 696  foreach my $b (@{$a->{'Entries'}})
 697   {
 698    next unless $b->[AGG_OUT_LEN];
 699    my $s = $b->[AGG_MIN_IN];
 700    my $e = $b->[AGG_MAX_IN];
 701    outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
 702   }
 703  if ($a->{'Forward'})
 704   {
 705    print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 706   }
 707  $a->{'Done'} = 1;
 708  foreach my $b (@{$a->{'Entries'}})
 709   {
 710    my ($s,$e,$out,$t,$end,$l) = @$b;
 711    outtable($fh,$t) unless $t->{'Done'};
 712   }
 713  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 714  foreach my $b (@{$a->{'Entries'}})
 715   {
 716    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 717    $end |= 0x80 if $fb;
 718    print  $fh "{";
 719    if ($l)
 720     {
 721      printf $fh outstring($fh,'',$out);
 722     }
 723    else
 724     {
 725      print  $fh "0";
 726     }
 727    print  $fh ",",$t->{Cname};
 728    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 729   }
 730  print $fh "};\n";
 731 }
 732
 733 sub output
 734 {
 735  my ($fh,$name,$a) = @_;
 736  process($name,$a);
 737  # Sub-tables
 738  outtable($fh,$a);
 739 }
 740
 741 sub output_enc
 742 {
 743  my ($fh,$name,$a) = @_;
 744  die "Changed - fix me for new structure";
 745  foreach my $b (sort keys %$a)
 746   {
 747    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 748   }
 749 }
 750
 751 sub decode_U
 752 {
 753  my $s = shift;
 754 }
 755
 756 my @uname;
 757 sub char_names
 758 {
 759  my $s = do "unicore/Name.pl";
 760  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 761  pos($s) = 0;
 762  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 763   {
 764    my $name = $3;
 765    my $s = hex($1);
 766    last if $s >= 0x10000;
 767    my $e = length($2) ? hex($2) : $s;
 768    for (my $i = $s; $i <= $e; $i++)
 769     {
 770      $uname[$i] = $name;
 771 #    print sprintf("U%04X $name\n",$i);
 772     }
 773   }
 774 }
 775
 776 sub output_ucm_page
 777 {
 778   my ($cmap,$a,$t,$pre) = @_;
 779   # warn sprintf("Page %x\n",$pre);
 780   my $raw = $t->{Raw};
 781   foreach my $key (sort keys %$raw) {
 782     #  RAW_NEXT => 0,
 783     #  RAW_IN_LEN => 1,
 784     #  RAW_OUT_BYTES => 2,
 785     #  RAW_FALLBACK => 3,
 786     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 787     my $u = ord $key;
 788     $fallback ||= 0;
 789
 790     if ($next != $a && $next != $t) {
 791       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 792     } elsif (length $out_bytes) {
 793       if ($pre) {
 794         $u = $pre|($u &0x3f);
 795       }
 796       my $s = sprintf "<U%04X> ",$u;
 797       #foreach my $c (split(//,$out_bytes)) {
 798       #  $s .= sprintf "\\x%02X",ord($c);
 799       #}
 800       # 9.5% faster changing that loop to this:
 801       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 802       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 803       push(@$cmap,$s);
 804     } else {
 805       warn join(',',$u, @{$raw->{$key}},$a,$t);
 806     }
 807   }
 808 }
 809
 810 sub output_ucm
 811 {
 812  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 813  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 814  print $fh "<code_set_name> \"$name\"\n";
 815  char_names();
 816  if (defined $min_el)
 817   {
 818    print $fh "<mb_cur_min> $min_el\n";
 819   }
 820  if (defined $max_el)
 821   {
 822    print $fh "<mb_cur_max> $max_el\n";
 823   }
 824  if (defined $rep)
 825   {
 826    print $fh "<subchar> ";
 827    foreach my $c (split(//,$rep))
 828     {
 829      printf $fh "\\x%02X",ord($c);
 830     }
 831    print $fh "\n";
 832   }
 833  my @cmap;
 834  output_ucm_page(\@cmap,$h,$h,0);
 835  print $fh "#\nCHARMAP\n";
 836  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 837   {
 838    print $fh $line;
 839   }
 840  print $fh "END CHARMAP\n";
 841 }
 842
 843 use vars qw(
 844     $_Enc2xs
 845     $_Version
 846     $_Inc
 847     $_E2X
 848     $_Name
 849     $_TableFiles
 850     $_Now
 851 );
 852
 853 sub find_e2x{
 854     eval { require File::Find };
 855     my (@inc, %e2x_dir);
 856     for my $inc (@INC){
 857         push @inc, $inc unless $inc eq '.'; #skip current dir
 858     }
 859     File::Find::find(
 860              sub {
 861                  my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
 862                      $atime,$mtime,$ctime,$blksize,$blocks)
 863                      = lstat($_) or return;
 864                  -f _ or return;
 865                  if (/^.*\.e2x$/o){
 866                      $e2x_dir{$File::Find::dir} ||= $mtime;
 867                  }
 868                  return;
 869              }, @inc);
 870     warn join("\n", keys %e2x_dir), "\n";
 871     for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
 872         $_E2X = $d;
 873         # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
 874         return $_E2X;
 875     }
 876 }
 877
 878 sub make_makefile_pl
 879 {
 880     eval { require Encode; };
 881     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 882     # our used for variable expanstion
 883     $_Enc2xs = $0;
 884     $_Version = $VERSION;
 885     $_E2X = find_e2x();
 886     $_Name = shift;
 887     $_TableFiles = join(",", map {qq('$_')} @_);
 888     $_Now = scalar localtime();
 889
 890     eval { require File::Spec; };
 891     warn "Generating Makefile.PL\n";
 892     _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
 893     warn "Generating $_Name.pm\n";
 894     _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
 895     warn "Generating t/$_Name.t\n";
 896     _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
 897     warn "Generating README\n";
 898     _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
 899     warn "Generating t/$_Name.t\n";
 900     _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
 901     exit;
 902 }
 903
 904 use vars qw(
 905             $_ModLines
 906             $_LocalVer
 907             );
 908
 909 sub make_configlocal_pm
 910 {
 911     eval { require Encode; };
 912     $@ and die "Unable to require Encode: $@\n";
 913     eval { require File::Spec; };
 914     # our used for variable expanstion
 915     my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
 916     my %LocalMod = ();
 917     for my $d (@INC){
 918         my $inc = File::Spec->catfile($d, "Encode");
 919         -d $inc or next;
 920         opendir my $dh, $inc or die "$inc:$!";
 921         warn "Checking $inc...\n";
 922         for my $f (grep /\.pm$/o, readdir($dh)){
 923             -f File::Spec->catfile($inc, "$f") or next;
 924             $INC{"Encode/$f"} and next;
 925             warn "require Encode/$f;\n";
 926             eval { require "Encode/$f"; };
 927             $@ and die "Can't require Encode/$f: $@\n";
 928             for my $enc (Encode->encodings()){
 929                 $in_core{$enc} and next;
 930                 $Encode::Config::ExtModule{$enc} and next;
 931                 my $mod = "Encode/$f";
 932                 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
 933                 $LocalMod{$enc} ||= $mod;
 934             }
 935         }
 936     }
 937     $_ModLines = "";
 938     for my $enc (sort keys %LocalMod){
 939         $_ModLines .=
 940             qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
 941     }
 942     warn $_ModLines;
 943     $_LocalVer = _mkversion();
 944     $_E2X = find_e2x();
 945     $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
 946     warn "Writing ", File::Spec->catfile($_Inc,"ConfigLocal.pm"), "\n";
 947     _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
 948                   File::Spec->catfile($_Inc,"ConfigLocal.pm"));
 949     exit;
 950 }
 951
 952 sub _mkversion{
 953     my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
 954     $yyyy += 1900, $mo +=1;
 955     return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
 956 }
 957
 958 sub _print_expand{
 959     eval { require File::Basename; };
 960     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
 961     File::Basename->import();
 962     my ($src, $dst) = @_;
 963     open my $in, $src or die "$src : $!";
 964     if ((my $d = dirname($dst)) ne '.'){
 965         -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
 966     }
 967     open my $out, ">$dst" or die "$!";
 968     my $asis = 0;
 969     while (<$in>){
 970         if (/^#### END_OF_HEADER/){
 971             $asis = 1; next;
 972         }
 973         s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
 974         print $out $_;
 975     }
 976 }
 977 __END__
 978
 979 =head1 NAME
 980
 981 enc2xs -- Perl Encode Module Generator
 982
 983 =head1 SYNOPSIS
 984
 985   enc2xs -[options]
 986   enc2xs -M ModName mapfiles...
 987   enc2xs -C
 988
 989 =head1 DESCRIPTION
 990
 991 F<enc2xs> builds a Perl extension for use by Encode from either
 992 Unicode Character Mapping files (.ucm) or Tcl Encoding Files
 993 (.enc)  Besides internally used during the build process of Encode
 994 module, you can use F<enc2xs> to add your own encoding to perl.  No
 995 knowledge on XS is necessary.
 996
 997 =head1 Quick Guide
 998
 999 If what you want to know as little about Perl possible but needs to
1000 add a new encoding, just read this chapter and forget the rest.
1001
1002 =over 4
1003
1004 =item 0.
1005
1006 Have a .ucm file ready.  You can get it from somewhere or you can
1007 write your own from scratch or you can grab one from Encode
1008 distribution and customize.  For UCM format, see the next Chapter.
1009 In the example below, I'll call my theoretical encoding myascii,
1010 defined inI<my.ucm>.  C<$> is a shell prompt.
1011
1012   $ ls -F
1013   my.ucm
1014
1015 =item 1.
1016
1017 Issue a command as follows;
1018
1019   $ enc2xs -M My my.ucm
1020   generating Makefile.PL
1021   generating My.pm
1022   generating README
1023   generating Changes
1024
1025 Now take a look at your current directory.  It should look like this.
1026
1027   $ ls -F
1028   Makefile.PL   My.pm         my.ucm        t/
1029
1030 The following files are created.
1031
1032   Makefle.PL - MakeMaker script
1033   My.pm      - Encode Submodule
1034   t/My.t     - test file
1035
1036 =item 1.1.
1037
1038 If you want *.ucm installed together with the modules, do as follows;
1039
1040   $ mkdir Encode
1041   $ mv *.ucm Encode
1042   $ enc2xs -M My Encode/*ucm
1043
1044 =item 2.
1045
1046 Edit the files generated.  You don't have to if you have no time AND no
1047 intention to give it to someone else.  But it is a good idea to edit
1048 pod and add more tests.
1049
1050 =item 3.
1051
1052 Now issue a command all Perl Mongers love;
1053
1054   $ perl5.7.3 Makefile.PL
1055   Writing Makefile for Encode::My
1056
1057 =item 4.
1058
1059 Now all you have to do is make.
1060
1061   $ make
1062   cp My.pm blib/lib/Encode/My.pm
1063   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1064     -o encode_t.c -f encode_t.fnm
1065   Reading myascii (myascii)
1066   Writing compiled form
1067   128 bytes in string tables
1068   384 bytes (25%) saved spotting duplicates
1069   1 bytes (99.2%) saved using substrings
1070   ....
1071   chmod 644 blib/arch/auto/Encode/My/My.bs
1072   $
1073
1074 The time it takes varies how fast your machine is and how large your
1075 encoding is.  Unless you are working on something big like euc-tw, it
1076 won't take too long.
1077
1078 =item 5.
1079
1080 You can "make install" already but you should test first.
1081
1082   $ make test
1083   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1084     -e 'use Test::Harness  qw(&runtests $verbose); \
1085     $verbose=0; runtests @ARGV;' t/*.t
1086   t/My....ok
1087   All tests successful.
1088   Files=1, Tests=2,  0 wallclock secs
1089    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1090
1091 =item 6.
1092
1093 If you are content with the test result, just "make install"
1094
1095 =item 7.
1096
1097 If you want to add your encoding to Encode demand-loading list
1098 (so you don't have to "use Encode::YourEncoding"), run
1099
1100   enc2xs -C
1101
1102 to update Encode::ConfigLocal, a module that controls local settings.
1103 After that, "use Encode;" is enough to load your encodings on demand.
1104
1105 =back
1106
1107 =head1 The Unicode Character Map
1108
1109 Encode uses The Unicode Character Map (UCM) for source character
1110 mappings.  This format is used by ICU package of IBM and adopted by
1111 Nick Ing-Simmons.  Since UCM is more flexible than Tcl's Encoding Map
1112 and far more user-friendly,  This is the recommended formet for
1113 Encode now.
1114
1115 UCM file looks like this.
1116
1117   #
1118   # Comments
1119   #
1120   <code_set_name> "US-ascii" # Required
1121   <code_set_alias> "ascii"   # Optional
1122   <mb_cur_min> 1             # Required; usually 1
1123   <mb_cur_max> 1             # Max. # of bytes/char
1124   <subchar> \x3F             # Substitution char
1125   #
1126   CHARMAP
1127   <U0000> \x00 |0 # <control>
1128   <U0001> \x01 |0 # <control>
1129   <U0002> \x02 |0 # <control>
1130   ....
1131   <U007C> \x7C |0 # VERTICAL LINE
1132   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1133   <U007E> \x7E |0 # TILDE
1134   <U007F> \x7F |0 # <control>
1135   END CHARMAP
1136
1137 =over 4
1138
1139 =item *
1140
1141 Anything that follows C<#> is treated as comments.
1142
1143 =item *
1144
1145 The header section continues until CHARMAP. This section Has a form of
1146 I<E<lt>keywordE<gt> value>, one at a line.  For a value, strings must
1147 be quoted. Barewords are treated as numbers.  I<\xXX> represents a
1148 byte.
1149
1150 Most of the keywords are self-explanatory. I<subchar> means
1151 substitution character, not subcharacter.  When you decode a Unicode
1152 sequence to this encoding but no matching character is found, the byte
1153 sequence defined here will be used.  For most cases, the value here is
1154 \x3F, in ASCII this is a question mark.
1155
1156 =item *
1157
1158 CHARMAP starts the character map section.  Each line has a form as
1159 follows;
1160
1161   <UXXXX> \xXX.. |0 # comment
1162     ^     ^      ^
1163     |     |      +- Fallback flag
1164     |     +-------- Encoded byte sequence
1165     +-------------- Unicode Character ID in hex
1166
1167 The format is roughly the same as a header section except for fallback
1168 flag.  It is | followed by 0..3.   And their meaning as follows
1169
1170 =over 2
1171
1172 =item |0
1173
1174 Round trip safe.   A character decoded to Unicode encodes back to the
1175 same byte sequence. most character belong to this.
1176
1177 =item |1
1178
1179 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1180 character for encode map only
1181
1182 =item |2
1183
1184 Skip sub-char mapping should there be no code point.
1185
1186 =item |3
1187
1188 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1189 character for decode map only
1190
1191 =back
1192
1193 =item *
1194
1195 And finally, END OF CHARMAP ends the section.
1196
1197 =back
1198
1199 Needless to say, if you are manually creating a UCM file, you should
1200 copy ascii.ucm or existing encoding which is close to yours than write
1201 your own from scratch.
1202
1203 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1204 is, unless your environment is on EBCDIC.
1205
1206 B<CAVEAT>: not all features in UCM are implemented.  For example,
1207 icu:state is not used.  Because of that, you need to write a perl
1208 module if you want to support algorithmical encodings, notablly
1209 ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1210 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1211
1212 =head1 Bookmarks
1213
1214 ICU Home Page
1215 L<http://oss.software.ibm.com/icu/>
1216
1217 ICU Character Mapping Tables
1218 L<http://oss.software.ibm.com/icu/charset/>
1219
1220 ICU:Conversion Data
1221 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1222
1223 =head1 SEE ALSO
1224
1225 L<Encode>,
1226 L<perlmod>,
1227 L<perlpod>
1228
1229 =cut
1230
1231 # -Q to disable the duplicate codepoint test
1232 # -S make mapping errors fatal
1233 # -q to remove comments written to output files
1234 # -O to enable the (brute force) substring optimiser
1235 # -o <output> to specify the output file name (else it's the first arg)
1236 # -f <inlist> to give a file with a list of input files (else use the args)
1237 # -n <name> to name the encoding (else use the basename of the input file.
1238
1239 With %seen holding array refs:
1240
1241       865.66 real        28.80 user         8.79 sys
1242       7904  maximum resident set size
1243       1356  average shared memory size
1244      18566  average unshared data size
1245        229  average unshared stack size
1246      46080  page reclaims
1247      33373  page faults
1248
1249 With %seen holding simple scalars:
1250
1251       342.16 real        27.11 user         3.54 sys
1252       8388  maximum resident set size
1253       1394  average shared memory size
1254      14969  average unshared data size
1255        236  average unshared stack size
1256      28159  page reclaims
1257       9839  page faults
1258
1259 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1260 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1261 RAM machine, but it's going to help even on modern machines.
1262 Swapping is bad, m'kay :-)