ext/Encode/bin/enc2xs

   1 #!./perl
   2 BEGIN {
   3     # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
   4     # with $ENV{PERL_CORE} set
   5     # In case we need it in future...
   6     require Config; import Config;
   7 }
   8 use strict;
   9 use warnings;
  10 use Getopt::Std;
  11 use Config;
  12 my @orig_ARGV = @ARGV;
  13 our $VERSION  = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  14
  15 # These may get re-ordered.
  16 # RAW is a do_now as inserted by &enter
  17 # AGG is an aggreagated do_now, as built up by &process
  18
  19 use constant {
  20   RAW_NEXT => 0,
  21   RAW_IN_LEN => 1,
  22   RAW_OUT_BYTES => 2,
  23   RAW_FALLBACK => 3,
  24
  25   AGG_MIN_IN => 0,
  26   AGG_MAX_IN => 1,
  27   AGG_OUT_BYTES => 2,
  28   AGG_NEXT => 3,
  29   AGG_IN_LEN => 4,
  30   AGG_OUT_LEN => 5,
  31   AGG_FALLBACK => 6,
  32 };
  33
  34 # (See the algorithm in encengine.c - we're building structures for it)
  35
  36 # There are two sorts of structures.
  37 # "do_now" (an array, two variants of what needs storing) is whatever we need
  38 # to do now we've read an input byte.
  39 # It's housed in a "do_next" (which is how we got to it), and in turn points
  40 # to a "do_next" which contains all the "do_now"s for the next input byte.
  41
  42 # There will be a "do_next" which is the start state.
  43 # For a single byte encoding it's the only "do_next" - each "do_now" points
  44 # back to it, and each "do_now" will cause bytes. There is no state.
  45
  46 # For a multi-byte encoding where all characters in the input are the same
  47 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  48 # branching out from the start state, one step for each input byte.
  49 # The leaf "do_now"s will all be at the same distance from the start state,
  50 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  51 # the start state.
  52
  53 # For an encoding where there are varaible length input byte sequences, you
  54 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  55 # as before the leaves will point back to the start state.
  56
  57 # The system will cope with escape encodings (imagine them as a mostly
  58 # self-contained tree for each escape state, and cross links between trees
  59 # at the state-switching characters) but so far no input format defines these.
  60
  61 # The system will also cope with having output "leaves" in the middle of
  62 # the bifurcating branches, not just at the extremities, but again no
  63 # input format does this yet.
  64
  65 # There are two variants of the "do_now" structure. The first, smaller variant
  66 # is generated by &enter as the input file is read. There is one structure
  67 # for each input byte. Say we are mapping a single byte encoding to a
  68 # single byte encoding, with  "ABCD" going "abcd". There will be
  69 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  70
  71 # &process then walks the tree, building aggregate "do_now" structres for
  72 # adjacent bytes where possible. The aggregate is for a contiguous range of
  73 # bytes which each produce the same length of output, each move to the
  74 # same next state, and each have the same fallback flag.
  75 # So our 4 RAW "do_now"s above become replaced by a single structure
  76 # containing:
  77 # ["A", "D", "abcd", 1, ...]
  78 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  79 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  80 # which maps very nicely into pointer arithmetic in C for encengine.c
  81
  82 sub encode_U
  83 {
  84  # UTF-8 encode long hand - only covers part of perl's range
  85  ## my $uv = shift;
  86  # chr() works in native space so convert value from table
  87  # into that space before using chr().
  88  my $ch = chr(utf8::unicode_to_native($_[0]));
  89  # Now get core perl to encode that the way it likes.
  90  utf8::encode($ch);
  91  return $ch;
  92 }
  93
  94 sub encode_S
  95 {
  96  # encode single byte
  97  ## my ($ch,$page) = @_; return chr($ch);
  98  return chr $_[0];
  99 }
 100
 101 sub encode_D
 102 {
 103  # encode double byte MS byte first
 104  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 105  return chr ($_[1]) . chr $_[0];
 106 }
 107
 108 sub encode_M
 109 {
 110  # encode Multi-byte - single for 0..255 otherwise double
 111  ## my ($ch,$page) = @_;
 112  ## return &encode_D if $page;
 113  ## return &encode_S;
 114  return chr ($_[1]) . chr $_[0] if $_[1];
 115  return chr $_[0];
 116 }
 117
 118 my %encode_types = (U => \&encode_U,
 119                     S => \&encode_S,
 120                     D => \&encode_D,
 121                     M => \&encode_M,
 122                    );
 123
 124 # Win32 does not expand globs on command line
 125 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 126
 127 my %opt;
 128 # I think these are:
 129 # -Q to disable the duplicate codepoint test
 130 # -S make mapping errors fatal
 131 # -q to remove comments written to output files
 132 # -O to enable the (brute force) substring optimiser
 133 # -o <output> to specify the output file name (else it's the first arg)
 134 # -f <inlist> to give a file with a list of input files (else use the args)
 135 # -n <name> to name the encoding (else use the basename of the input file.
 136 getopts('CM:SQqOo:f:n:',\%opt);
 137
 138 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 139 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
 140
 141 # This really should go first, else the die here causes empty (non-erroneous)
 142 # output files to be written.
 143 my @encfiles;
 144 if (exists $opt{'f'}) {
 145     # -F is followed by name of file containing list of filenames
 146     my $flist = $opt{'f'};
 147     open(FLIST,$flist) || die "Cannot open $flist:$!";
 148     chomp(@encfiles = <FLIST>);
 149     close(FLIST);
 150 } else {
 151     @encfiles = @ARGV;
 152 }
 153
 154 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 155 chmod(0666,$cname) if -f $cname && !-w $cname;
 156 open(C,">$cname") || die "Cannot open $cname:$!";
 157
 158 my $dname = $cname;
 159 my $hname = $cname;
 160
 161 my ($doC,$doEnc,$doUcm,$doPet);
 162
 163 if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
 164  {
 165   $doC = 1;
 166   $dname =~ s/(\.[^\.]*)?$/.exh/;
 167   chmod(0666,$dname) if -f $cname && !-w $dname;
 168   open(D,">$dname") || die "Cannot open $dname:$!";
 169   $hname =~ s/(\.[^\.]*)?$/.h/;
 170   chmod(0666,$hname) if -f $cname && !-w $hname;
 171   open(H,">$hname") || die "Cannot open $hname:$!";
 172
 173   foreach my $fh (\*C,\*D,\*H)
 174   {
 175    print $fh <<"END" unless $opt{'q'};
 176 /*
 177  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 178  This file was autogenerated by:
 179  $^X $0 @orig_ARGV
 180  enc2xs VERSION $VERSION
 181 */
 182 END
 183   }
 184
 185   if ($cname =~ /(\w+)\.xs$/)
 186    {
 187     print C "#include <EXTERN.h>\n";
 188     print C "#include <perl.h>\n";
 189     print C "#include <XSUB.h>\n";
 190     print C "#define U8 U8\n";
 191    }
 192   print C "#include \"encode.h\"\n\n";
 193
 194  }
 195 elsif ($cname =~ /\.enc$/)
 196  {
 197   $doEnc = 1;
 198  }
 199 elsif ($cname =~ /\.ucm$/)
 200  {
 201   $doUcm = 1;
 202  }
 203 elsif ($cname =~ /\.pet$/)
 204  {
 205   $doPet = 1;
 206  }
 207
 208 my %encoding;
 209 my %strings;
 210 my $string_acc;
 211 my %strings_in_acc;
 212
 213 my $saved = 0;
 214 my $subsave = 0;
 215 my $strings = 0;
 216
 217 sub cmp_name
 218 {
 219  if ($a =~ /^.*-(\d+)/)
 220   {
 221    my $an = $1;
 222    if ($b =~ /^.*-(\d+)/)
 223     {
 224      my $r = $an <=> $1;
 225      return $r if $r;
 226     }
 227   }
 228  return $a cmp $b;
 229 }
 230
 231
 232 foreach my $enc (sort cmp_name @encfiles)
 233  {
 234   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 235   $name = $opt{'n'} if exists $opt{'n'};
 236   if (open(E,$enc))
 237    {
 238     if ($sfx eq 'enc')
 239      {
 240       compile_enc(\*E,lc($name));
 241      }
 242     else
 243      {
 244       compile_ucm(\*E,lc($name));
 245      }
 246    }
 247   else
 248    {
 249     warn "Cannot open $enc for $name:$!";
 250    }
 251  }
 252
 253 if ($doC)
 254  {
 255   print STDERR "Writing compiled form\n";
 256   foreach my $name (sort cmp_name keys %encoding)
 257    {
 258     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 259     process($name.'_utf8',$e2u);
 260     addstrings(\*C,$e2u);
 261
 262     process('utf8_'.$name,$u2e);
 263     addstrings(\*C,$u2e);
 264    }
 265   outbigstring(\*C,"enctable");
 266   foreach my $name (sort cmp_name keys %encoding)
 267    {
 268     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 269     outtable(\*C,$e2u, "enctable");
 270     outtable(\*C,$u2e, "enctable");
 271
 272     # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 273    }
 274   foreach my $enc (sort cmp_name keys %encoding)
 275    {
 276     # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 277     my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
 278     #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 279     my $replen = 0;
 280     $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
 281     my $sym = "${enc}_encoding";
 282     $sym =~ s/\W+/_/g;
 283     my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
 284         $min_el,$max_el);
 285     print C "static const U8 ${sym}_rep_character[] = \"$rep\";\n";
 286     print C "static const char ${sym}_enc_name[] = \"$enc\";\n\n";
 287     print C "const encode_t $sym = \n";
 288     # This is to make null encoding work -- dankogai
 289     for (my $i = (scalar @info) - 1;  $i >= 0; --$i){
 290     $info[$i] ||= 1;
 291     }
 292     # end of null tweak -- dankogai
 293     print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
 294    }
 295
 296   foreach my $enc (sort cmp_name keys %encoding)
 297    {
 298     my $sym = "${enc}_encoding";
 299     $sym =~ s/\W+/_/g;
 300     print H "extern encode_t $sym;\n";
 301     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 302    }
 303
 304   if ($cname =~ /(\w+)\.xs$/)
 305    {
 306     my $mod = $1;
 307     print C <<'END';
 308
 309 static void
 310 Encode_XSEncoding(pTHX_ encode_t *enc)
 311 {
 312  dSP;
 313  HV *stash = gv_stashpv("Encode::XS", TRUE);
 314  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 315  int i = 0;
 316  PUSHMARK(sp);
 317  XPUSHs(sv);
 318  while (enc->name[i])
 319   {
 320    const char *name = enc->name[i++];
 321    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 322   }
 323  PUTBACK;
 324  call_pv("Encode::define_encoding",G_DISCARD);
 325  SvREFCNT_dec(sv);
 326 }
 327
 328 END
 329
 330     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 331     print C "BOOT:\n{\n";
 332     print C "#include \"$dname\"\n";
 333     print C "}\n";
 334    }
 335   # Close in void context is bad, m'kay
 336   close(D) or warn "Error closing '$dname': $!";
 337   close(H) or warn "Error closing '$hname': $!";
 338
 339   my $perc_saved    = $saved/($strings + $saved) * 100;
 340   my $perc_subsaved = $subsave/($strings + $subsave) * 100;
 341   printf STDERR "%d bytes in string tables\n",$strings;
 342   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 343     $saved, $perc_saved              if $saved;
 344   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 345     $subsave, $perc_subsaved         if $subsave;
 346  }
 347 elsif ($doEnc)
 348  {
 349   foreach my $name (sort cmp_name keys %encoding)
 350    {
 351     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 352     output_enc(\*C,$name,$e2u);
 353    }
 354  }
 355 elsif ($doUcm)
 356  {
 357   foreach my $name (sort cmp_name keys %encoding)
 358    {
 359     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 360     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 361    }
 362  }
 363
 364 # writing half meg files and then not checking to see if you just filled the
 365 # disk is bad, m'kay
 366 close(C) or die "Error closing '$cname': $!";
 367
 368 # End of the main program.
 369
 370 sub compile_ucm
 371 {
 372  my ($fh,$name) = @_;
 373  my $e2u = {};
 374  my $u2e = {};
 375  my $cs;
 376  my %attr;
 377  while (<$fh>)
 378   {
 379    s/#.*$//;
 380    last if /^\s*CHARMAP\s*$/i;
 381    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 382     {
 383      $attr{$1} = $2;
 384     }
 385   }
 386  if (!defined($cs =  $attr{'code_set_name'}))
 387   {
 388    warn "No <code_set_name> in $name\n";
 389   }
 390  else
 391   {
 392    $name = $cs unless exists $opt{'n'};
 393   }
 394  my $erep;
 395  my $urep;
 396  my $max_el;
 397  my $min_el;
 398  if (exists $attr{'subchar'})
 399   {
 400    #my @byte;
 401    #$attr{'subchar'} =~ /^\s*/cg;
 402    #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 403    #$erep = join('',map(chr(hex($_)),@byte));
 404    $erep = $attr{'subchar'};
 405    $erep =~ s/^\s+//; $erep =~ s/\s+$//;
 406   }
 407  print "Reading $name ($cs)\n";
 408  my $nfb = 0;
 409  my $hfb = 0;
 410  while (<$fh>)
 411   {
 412    s/#.*$//;
 413    last if /^\s*END\s+CHARMAP\s*$/i;
 414    next if /^\s*$/;
 415    my (@uni, @byte) = ();
 416    my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
 417        or die "Bad line: $_";
 418    while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
 419        push @uni, map { substr($_, 1) } split(/\+/, $1);
 420    }
 421    while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
 422        push @byte, $1;
 423    }
 424    if (@uni)
 425     {
 426      my $uch =  join('', map { encode_U(hex($_)) } @uni );
 427      my $ech = join('',map(chr(hex($_)),@byte));
 428      my $el  = length($ech);
 429      $max_el = $el if (!defined($max_el) || $el > $max_el);
 430      $min_el = $el if (!defined($min_el) || $el < $min_el);
 431      if (length($fb))
 432       {
 433        $fb = substr($fb,1);
 434        $hfb++;
 435       }
 436      else
 437       {
 438        $nfb++;
 439        $fb = '0';
 440       }
 441      # $fb is fallback flag
 442      # 0 - round trip safe
 443      # 1 - fallback for unicode -> enc
 444      # 2 - skip sub-char mapping
 445      # 3 - fallback enc -> unicode
 446      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 447      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 448     }
 449    else
 450     {
 451      warn $_;
 452     }
 453   }
 454  if ($nfb && $hfb)
 455   {
 456    die "$nfb entries without fallback, $hfb entries with\n";
 457   }
 458  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 459 }
 460
 461
 462
 463 sub compile_enc
 464 {
 465  my ($fh,$name) = @_;
 466  my $e2u = {};
 467  my $u2e = {};
 468
 469  my $type;
 470  while ($type = <$fh>)
 471   {
 472    last if $type !~ /^\s*#/;
 473   }
 474  chomp($type);
 475  return if $type eq 'E';
 476  # Do the hash lookup once, rather than once per function call. 4% speedup.
 477  my $type_func = $encode_types{$type};
 478  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 479  warn "$type encoded $name\n";
 480  my $rep = '';
 481  # Save a defined test by setting these to defined values.
 482  my $min_el = ~0; # A very big integer
 483  my $max_el = 0;  # Anything must be longer than 0
 484  {
 485   my $v = hex($def);
 486   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 487  }
 488  my $errors;
 489  my $seen;
 490  # use -Q to silence the seen test. Makefile.PL uses this by default.
 491  $seen = {} unless $opt{Q};
 492  do
 493   {
 494    my $line = <$fh>;
 495    chomp($line);
 496    my $page = hex($line);
 497    my $ch = 0;
 498    my $i = 16;
 499    do
 500     {
 501      # So why is it 1% faster to leave the my here?
 502      my $line = <$fh>;
 503      $line =~ s/\r\n$/\n/;
 504      die "$.:${line}Line should be exactly 65 characters long including
 505      newline (".length($line).")" unless length ($line) == 65;
 506      # Split line into groups of 4 hex digits, convert groups to ints
 507      # This takes 65.35
 508      # map {hex $_} $line =~ /(....)/g
 509      # This takes 63.75 (2.5% less time)
 510      # unpack "n*", pack "H*", $line
 511      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 512      # Doing it as while ($line =~ /(....)/g) took 74.63
 513      foreach my $val (unpack "n*", pack "H*", $line)
 514       {
 515        next if $val == 0xFFFD;
 516        my $ech = &$type_func($ch,$page);
 517        if ($val || (!$ch && !$page))
 518         {
 519          my $el  = length($ech);
 520          $max_el = $el if $el > $max_el;
 521          $min_el = $el if $el < $min_el;
 522          my $uch = encode_U($val);
 523          if ($seen) {
 524            # We're doing the test.
 525            # We don't need to read this quickly, so storing it as a scalar,
 526            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 527            # RAM and may make us faster on low RAM systems. [see __END__]
 528            if (exists $seen->{$uch})
 529              {
 530                warn sprintf("U%04X is %02X%02X and %04X\n",
 531                             $val,$page,$ch,$seen->{$uch});
 532                $errors++;
 533              }
 534            else
 535              {
 536                $seen->{$uch} = $page << 8 | $ch;
 537              }
 538          }
 539          # Passing 2 extra args each time is 3.6% slower!
 540          # Even with having to add $fallback ||= 0 later
 541          enter_fb0($e2u,$ech,$uch);
 542          enter_fb0($u2e,$uch,$ech);
 543         }
 544        else
 545         {
 546          # No character at this position
 547          # enter($e2u,$ech,undef,$e2u);
 548         }
 549        $ch++;
 550       }
 551     } while --$i;
 552   } while --$pages;
 553  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 554    if $min_el > $max_el;
 555  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 556  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 557 }
 558
 559 # my ($a,$s,$d,$t,$fb) = @_;
 560 sub enter {
 561   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 562   # state we shift to after this (multibyte) input character defaults to same
 563   # as current state.
 564   $next ||= $current;
 565   # Making sure it is defined seems to be faster than {no warnings;} in
 566   # &process, or passing it in as 0 explicity.
 567   # XXX $fallback ||= 0;
 568
 569   # Start at the beginning and work forwards through the string to zero.
 570   # effectively we are removing 1 character from the front each time
 571   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 572   # Hence -$pos is the length of the remaining string.
 573   my $pos = -length $inbytes;
 574   while (1) {
 575     my $byte = substr $inbytes, $pos, 1;
 576     #  RAW_NEXT => 0,
 577     #  RAW_IN_LEN => 1,
 578     #  RAW_OUT_BYTES => 2,
 579     #  RAW_FALLBACK => 3,
 580     # to unicode an array would seem to be better, because the pages are dense.
 581     # from unicode can be very sparse, favouring a hash.
 582     # hash using the bytes (all length 1) as keys rather than ord value,
 583     # as it's easier to sort these in &process.
 584
 585     # It's faster to always add $fallback even if it's undef, rather than
 586     # choosing between 3 and 4 element array. (hence why we set it defined
 587     # above)
 588     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 589     # When $pos was -1 we were at the last input character.
 590     unless (++$pos) {
 591       $do_now->[RAW_OUT_BYTES] = $outbytes;
 592       $do_now->[RAW_NEXT] = $next;
 593       return;
 594     }
 595     # Tail recursion. The intermdiate state may not have a name yet.
 596     $current = $do_now->[RAW_NEXT];
 597   }
 598 }
 599
 600 # This is purely for optimistation. It's just &enter hard coded for $fallback
 601 # of 0, using only a 3 entry array ref to save memory for every entry.
 602 sub enter_fb0 {
 603   my ($current,$inbytes,$outbytes,$next) = @_;
 604   $next ||= $current;
 605
 606   my $pos = -length $inbytes;
 607   while (1) {
 608     my $byte = substr $inbytes, $pos, 1;
 609     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 610     unless (++$pos) {
 611       $do_now->[RAW_OUT_BYTES] = $outbytes;
 612       $do_now->[RAW_NEXT] = $next;
 613       return;
 614     }
 615     $current = $do_now->[RAW_NEXT];
 616   }
 617 }
 618
 619 sub process
 620 {
 621   my ($name,$a) = @_;
 622   $name =~ s/\W+/_/g;
 623   $a->{Cname} = $name;
 624   my $raw = $a->{Raw};
 625   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 626   my @ent;
 627   $agg_max_in = 0;
 628   foreach my $key (sort keys %$raw) {
 629     #  RAW_NEXT => 0,
 630     #  RAW_IN_LEN => 1,
 631     #  RAW_OUT_BYTES => 2,
 632     #  RAW_FALLBACK => 3,
 633     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 634     # Now we are converting from raw to aggregate, switch from 1 byte strings
 635     # to numbers
 636     my $b = ord $key;
 637     $fallback ||= 0;
 638     if ($l &&
 639         # If this == fails, we're going to reset $agg_max_in below anyway.
 640         $b == ++$agg_max_in &&
 641         # References in numeric context give the pointer as an int.
 642         $agg_next == $next &&
 643         $agg_in_len == $in_len &&
 644         $agg_out_len == length $out_bytes &&
 645         $agg_fallback == $fallback
 646         # && length($l->[AGG_OUT_BYTES]) < 16
 647        ) {
 648       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 649       # we can aggregate this byte onto the end.
 650       $l->[AGG_MAX_IN] = $b;
 651       $l->[AGG_OUT_BYTES] .= $out_bytes;
 652     } else {
 653       # AGG_MIN_IN => 0,
 654       # AGG_MAX_IN => 1,
 655       # AGG_OUT_BYTES => 2,
 656       # AGG_NEXT => 3,
 657       # AGG_IN_LEN => 4,
 658       # AGG_OUT_LEN => 5,
 659       # AGG_FALLBACK => 6,
 660       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 661       # (only gains .6% on euc-jp  -- is it worth it?)
 662       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 663                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 664                        $agg_fallback = $fallback];
 665     }
 666     if (exists $next->{Cname}) {
 667       $next->{'Forward'} = 1 if $next != $a;
 668     } else {
 669       process(sprintf("%s_%02x",$name,$b),$next);
 670     }
 671   }
 672   # encengine.c rules say that last entry must be for 255
 673   if ($agg_max_in < 255) {
 674     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 675   }
 676   $a->{'Entries'} = \@ent;
 677 }
 678
 679
 680 sub addstrings
 681 {
 682  my ($fh,$a) = @_;
 683  my $name = $a->{'Cname'};
 684  # String tables
 685  foreach my $b (@{$a->{'Entries'}})
 686   {
 687    next unless $b->[AGG_OUT_LEN];
 688    $strings{$b->[AGG_OUT_BYTES]} = undef;
 689   }
 690  if ($a->{'Forward'})
 691   {
 692    my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 693    my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';
 694    my $const = $cpp ? '' : 'const';
 695    print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 696   }
 697  $a->{'DoneStrings'} = 1;
 698  foreach my $b (@{$a->{'Entries'}})
 699   {
 700    my ($s,$e,$out,$t,$end,$l) = @$b;
 701    addstrings($fh,$t) unless $t->{'DoneStrings'};
 702   }
 703 }
 704
 705 sub outbigstring
 706 {
 707   my ($fh,$name) = @_;
 708
 709   $string_acc = '';
 710
 711   # Make the big string in the string accumulator. Longest first, on the hope
 712   # that this makes it more likely that we find the short strings later on.
 713   # Not sure if it helps sorting strings of the same length lexcically.
 714   foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
 715     my $index = index $string_acc, $s;
 716     if ($index >= 0) {
 717       $saved += length($s);
 718       $strings_in_acc{$s} = $index;
 719     } else {
 720     OPTIMISER: {
 721     if ($opt{'O'}) {
 722       my $sublength = length $s;
 723       while (--$sublength > 0) {
 724         # progressively lop characters off the end, to see if the start of
 725         # the new string overlaps the end of the accumulator.
 726         if (substr ($string_acc, -$sublength)
 727         eq substr ($s, 0, $sublength)) {
 728           $subsave += $sublength;
 729           $strings_in_acc{$s} = length ($string_acc) - $sublength;
 730           # append the last bit on the end.
 731           $string_acc .= substr ($s, $sublength);
 732           last OPTIMISER;
 733         }
 734         # or if the end of the new string overlaps the start of the
 735         # accumulator
 736         next unless substr ($string_acc, 0, $sublength)
 737           eq substr ($s, -$sublength);
 738         # well, the last $sublength characters of the accumulator match.
 739         # so as we're prepending to the accumulator, need to shift all our
 740         # existing offsets forwards
 741         $_ += $sublength foreach values %strings_in_acc;
 742         $subsave += $sublength;
 743         $strings_in_acc{$s} = 0;
 744         # append the first bit on the start.
 745         $string_acc = substr ($s, 0, -$sublength) . $string_acc;
 746         last OPTIMISER;
 747       }
 748     }
 749     # Optimiser (if it ran) found nothing, so just going have to tack the
 750     # whole thing on the end.
 751     $strings_in_acc{$s} = length $string_acc;
 752     $string_acc .= $s;
 753       };
 754     }
 755   }
 756
 757   $strings = length $string_acc;
 758   my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 759   my $var = $cpp ? '' : 'static';
 760   my $definition = "\n$var const U8 $name\[$strings] = { " .
 761     join(',',unpack "C*",$string_acc);
 762   # We have a single long line. Split it at convenient commas.
 763   print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
 764   print $fh substr ($definition, pos $definition), " };\n";
 765 }
 766
 767 sub findstring {
 768   my ($name,$s) = @_;
 769   my $offset = $strings_in_acc{$s};
 770   die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
 771     unless defined $offset;
 772   "$name + $offset";
 773 }
 774
 775 sub outtable
 776 {
 777  my ($fh,$a,$bigname) = @_;
 778  my $name = $a->{'Cname'};
 779  $a->{'Done'} = 1;
 780  foreach my $b (@{$a->{'Entries'}})
 781   {
 782    my ($s,$e,$out,$t,$end,$l) = @$b;
 783    outtable($fh,$t,$bigname) unless $t->{'Done'};
 784   }
 785  my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 786  my $var = $cpp ? '' : 'static';
 787  my $const = $cpp ? '' : 'const';
 788  print $fh "\n$var $const encpage_t $name\[",
 789    scalar(@{$a->{'Entries'}}), "] = {\n";
 790  foreach my $b (@{$a->{'Entries'}})
 791   {
 792    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 793    # $end |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
 794    print  $fh "{";
 795    if ($l)
 796     {
 797      printf $fh findstring($bigname,$out);
 798     }
 799    else
 800     {
 801      print  $fh "0";
 802     }
 803    print  $fh ",",$t->{Cname};
 804    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 805   }
 806  print $fh "};\n";
 807 }
 808
 809 sub output_enc
 810 {
 811  my ($fh,$name,$a) = @_;
 812  die "Changed - fix me for new structure";
 813  foreach my $b (sort keys %$a)
 814   {
 815    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 816   }
 817 }
 818
 819 sub decode_U
 820 {
 821  my $s = shift;
 822 }
 823
 824 my @uname;
 825 sub char_names
 826 {
 827  my $s = do "unicore/Name.pl";
 828  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 829  pos($s) = 0;
 830  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 831   {
 832    my $name = $3;
 833    my $s = hex($1);
 834    last if $s >= 0x10000;
 835    my $e = length($2) ? hex($2) : $s;
 836    for (my $i = $s; $i <= $e; $i++)
 837     {
 838      $uname[$i] = $name;
 839 #    print sprintf("U%04X $name\n",$i);
 840     }
 841   }
 842 }
 843
 844 sub output_ucm_page
 845 {
 846   my ($cmap,$a,$t,$pre) = @_;
 847   # warn sprintf("Page %x\n",$pre);
 848   my $raw = $t->{Raw};
 849   foreach my $key (sort keys %$raw) {
 850     #  RAW_NEXT => 0,
 851     #  RAW_IN_LEN => 1,
 852     #  RAW_OUT_BYTES => 2,
 853     #  RAW_FALLBACK => 3,
 854     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 855     my $u = ord $key;
 856     $fallback ||= 0;
 857
 858     if ($next != $a && $next != $t) {
 859       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 860     } elsif (length $out_bytes) {
 861       if ($pre) {
 862         $u = $pre|($u &0x3f);
 863       }
 864       my $s = sprintf "<U%04X> ",$u;
 865       #foreach my $c (split(//,$out_bytes)) {
 866       #  $s .= sprintf "\\x%02X",ord($c);
 867       #}
 868       # 9.5% faster changing that loop to this:
 869       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 870       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 871       push(@$cmap,$s);
 872     } else {
 873       warn join(',',$u, @{$raw->{$key}},$a,$t);
 874     }
 875   }
 876 }
 877
 878 sub output_ucm
 879 {
 880  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 881  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 882  print $fh "<code_set_name> \"$name\"\n";
 883  char_names();
 884  if (defined $min_el)
 885   {
 886    print $fh "<mb_cur_min> $min_el\n";
 887   }
 888  if (defined $max_el)
 889   {
 890    print $fh "<mb_cur_max> $max_el\n";
 891   }
 892  if (defined $rep)
 893   {
 894    print $fh "<subchar> ";
 895    foreach my $c (split(//,$rep))
 896     {
 897      printf $fh "\\x%02X",ord($c);
 898     }
 899    print $fh "\n";
 900   }
 901  my @cmap;
 902  output_ucm_page(\@cmap,$h,$h,0);
 903  print $fh "#\nCHARMAP\n";
 904  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 905   {
 906    print $fh $line;
 907   }
 908  print $fh "END CHARMAP\n";
 909 }
 910
 911 use vars qw(
 912     $_Enc2xs
 913     $_Version
 914     $_Inc
 915     $_E2X
 916     $_Name
 917     $_TableFiles
 918     $_Now
 919 );
 920
 921 sub find_e2x{
 922     eval { require File::Find; };
 923     my (@inc, %e2x_dir);
 924     for my $inc (@INC){
 925     push @inc, $inc unless $inc eq '.'; #skip current dir
 926     }
 927     File::Find::find(
 928          sub {
 929          my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
 930              $atime,$mtime,$ctime,$blksize,$blocks)
 931              = lstat($_) or return;
 932          -f _ or return;
 933          if (/^.*\.e2x$/o){
 934              no warnings 'once';
 935              $e2x_dir{$File::Find::dir} ||= $mtime;
 936          }
 937          return;
 938          }, @inc);
 939     warn join("\n", keys %e2x_dir), "\n";
 940     for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
 941     $_E2X = $d;
 942     # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
 943     return $_E2X;
 944     }
 945 }
 946
 947 sub make_makefile_pl
 948 {
 949     eval { require Encode; };
 950     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 951     # our used for variable expanstion
 952     $_Enc2xs = $0;
 953     $_Version = $VERSION;
 954     $_E2X = find_e2x();
 955     $_Name = shift;
 956     $_TableFiles = join(",", map {qq('$_')} @_);
 957     $_Now = scalar localtime();
 958
 959     eval { require File::Spec; };
 960     _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
 961     _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
 962     _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
 963     _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
 964     _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
 965     exit;
 966 }
 967
 968 use vars qw(
 969         $_ModLines
 970         $_LocalVer
 971         );
 972
 973 sub make_configlocal_pm {
 974     eval { require Encode; };
 975     $@ and die "Unable to require Encode: $@\n";
 976     eval { require File::Spec; };
 977
 978     # our used for variable expanstion
 979     my %in_core = map { $_ => 1 } (
 980         'ascii',      'iso-8859-1', 'utf8',
 981         'ascii-ctrl', 'null',       'utf-8-strict'
 982     );
 983     my %LocalMod = ();
 984     # check @enc;
 985     use File::Find ();
 986     my $wanted = sub{
 987         -f $_ or return;
 988         $File::Find::name =~ /\A\./        and return;
 989         $File::Find::name =~ /\.pm\z/      or  return;
 990         $File::Find::name =~ m/\bEncode\b/ or  return;
 991         my $mod = $File::Find::name;
 992         $mod =~ s/.*\bEncode\b/Encode/o;
 993         $mod =~ s/\.pm\z//o;
 994         $mod =~ s,/,::,og;
 995         warn qq{ require $mod;\n};
 996         eval qq{ require $mod; };
 997         $@ and die "Can't require $mod: $@\n";
 998         for my $enc ( Encode->encodings() ) {
 999             no warnings;
1000             $in_core{$enc}                   and next;
1001             $Encode::Config::ExtModule{$enc} and next;
1002             $LocalMod{$enc} ||= $mod;
1003         }
1004     };
1005     File::Find::find({wanted => $wanted}, @INC);
1006     $_ModLines = "";
1007     for my $enc ( sort keys %LocalMod ) {
1008         $_ModLines .=
1009           qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
1010     }
1011     warn $_ModLines;
1012     $_LocalVer = _mkversion();
1013     $_E2X      = find_e2x();
1014     $_Inc      = $INC{"Encode.pm"};
1015     $_Inc =~ s/\.pm$//o;
1016     _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1017         File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
1018     exit;
1019 }
1020
1021 sub _mkversion{
1022     # v-string is now depreciated; use time() instead;
1023     #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1024     #$yyyy += 1900, $mo +=1;
1025     #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1026     return time();
1027 }
1028
1029 sub _print_expand{
1030     eval { require File::Basename; };
1031     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
1032     File::Basename->import();
1033     my ($src, $dst, $clobber) = @_;
1034     if (!$clobber and -e $dst){
1035     warn "$dst exists. skipping\n";
1036     return;
1037     }
1038     warn "Generating $dst...\n";
1039     open my $in, $src or die "$src : $!";
1040     if ((my $d = dirname($dst)) ne '.'){
1041     -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
1042     }
1043     open my $out, ">$dst" or die "$!";
1044     my $asis = 0;
1045     while (<$in>){
1046     if (/^#### END_OF_HEADER/){
1047         $asis = 1; next;
1048     }
1049     s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1050     print $out $_;
1051     }
1052 }
1053 __END__
1054
1055 =head1 NAME
1056
1057 enc2xs -- Perl Encode Module Generator
1058
1059 =head1 SYNOPSIS
1060
1061   enc2xs -[options]
1062   enc2xs -M ModName mapfiles...
1063   enc2xs -C
1064
1065 =head1 DESCRIPTION
1066
1067 F<enc2xs> builds a Perl extension for use by Encode from either
1068 Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1069 Besides being used internally during the build process of the Encode
1070 module, you can use F<enc2xs> to add your own encoding to perl.
1071 No knowledge of XS is necessary.
1072
1073 =head1 Quick Guide
1074
1075 If you want to know as little about Perl as possible but need to
1076 add a new encoding, just read this chapter and forget the rest.
1077
1078 =over 4
1079
1080 =item 0.
1081
1082 Have a .ucm file ready.  You can get it from somewhere or you can write
1083 your own from scratch or you can grab one from the Encode distribution
1084 and customize it.  For the UCM format, see the next Chapter.  In the
1085 example below, I'll call my theoretical encoding myascii, defined
1086 in I<my.ucm>.  C<$> is a shell prompt.
1087
1088   $ ls -F
1089   my.ucm
1090
1091 =item 1.
1092
1093 Issue a command as follows;
1094
1095   $ enc2xs -M My my.ucm
1096   generating Makefile.PL
1097   generating My.pm
1098   generating README
1099   generating Changes
1100
1101 Now take a look at your current directory.  It should look like this.
1102
1103   $ ls -F
1104   Makefile.PL   My.pm         my.ucm        t/
1105
1106 The following files were created.
1107
1108   Makefile.PL - MakeMaker script
1109   My.pm       - Encode submodule
1110   t/My.t      - test file
1111
1112 =over 4
1113
1114 =item 1.1.
1115
1116 If you want *.ucm installed together with the modules, do as follows;
1117
1118   $ mkdir Encode
1119   $ mv *.ucm Encode
1120   $ enc2xs -M My Encode/*ucm
1121
1122 =back
1123
1124 =item 2.
1125
1126 Edit the files generated.  You don't have to if you have no time AND no
1127 intention to give it to someone else.  But it is a good idea to edit
1128 the pod and to add more tests.
1129
1130 =item 3.
1131
1132 Now issue a command all Perl Mongers love:
1133
1134   $ perl Makefile.PL
1135   Writing Makefile for Encode::My
1136
1137 =item 4.
1138
1139 Now all you have to do is make.
1140
1141   $ make
1142   cp My.pm blib/lib/Encode/My.pm
1143   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1144     -o encode_t.c -f encode_t.fnm
1145   Reading myascii (myascii)
1146   Writing compiled form
1147   128 bytes in string tables
1148   384 bytes (75%) saved spotting duplicates
1149   1 bytes (0.775%) saved using substrings
1150   ....
1151   chmod 644 blib/arch/auto/Encode/My/My.bs
1152   $
1153
1154 The time it takes varies depending on how fast your machine is and
1155 how large your encoding is.  Unless you are working on something big
1156 like euc-tw, it won't take too long.
1157
1158 =item 5.
1159
1160 You can "make install" already but you should test first.
1161
1162   $ make test
1163   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1164     -e 'use Test::Harness  qw(&runtests $verbose); \
1165     $verbose=0; runtests @ARGV;' t/*.t
1166   t/My....ok
1167   All tests successful.
1168   Files=1, Tests=2,  0 wallclock secs
1169    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1170
1171 =item 6.
1172
1173 If you are content with the test result, just "make install"
1174
1175 =item 7.
1176
1177 If you want to add your encoding to Encode's demand-loading list
1178 (so you don't have to "use Encode::YourEncoding"), run
1179
1180   enc2xs -C
1181
1182 to update Encode::ConfigLocal, a module that controls local settings.
1183 After that, "use Encode;" is enough to load your encodings on demand.
1184
1185 =back
1186
1187 =head1 The Unicode Character Map
1188
1189 Encode uses the Unicode Character Map (UCM) format for source character
1190 mappings.  This format is used by IBM's ICU package and was adopted
1191 by Nick Ing-Simmons for use with the Encode module.  Since UCM is
1192 more flexible than Tcl's Encoding Map and far more user-friendly,
1193 this is the recommended formet for Encode now.
1194
1195 A UCM file looks like this.
1196
1197   #
1198   # Comments
1199   #
1200   <code_set_name> "US-ascii" # Required
1201   <code_set_alias> "ascii"   # Optional
1202   <mb_cur_min> 1             # Required; usually 1
1203   <mb_cur_max> 1             # Max. # of bytes/char
1204   <subchar> \x3F             # Substitution char
1205   #
1206   CHARMAP
1207   <U0000> \x00 |0 # <control>
1208   <U0001> \x01 |0 # <control>
1209   <U0002> \x02 |0 # <control>
1210   ....
1211   <U007C> \x7C |0 # VERTICAL LINE
1212   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1213   <U007E> \x7E |0 # TILDE
1214   <U007F> \x7F |0 # <control>
1215   END CHARMAP
1216
1217 =over 4
1218
1219 =item *
1220
1221 Anything that follows C<#> is treated as a comment.
1222
1223 =item *
1224
1225 The header section continues until a line containing the word
1226 CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1227 pair per line.  Strings used as values must be quoted. Barewords are
1228 treated as numbers.  I<\xXX> represents a byte.
1229
1230 Most of the keywords are self-explanatory. I<subchar> means
1231 substitution character, not subcharacter.  When you decode a Unicode
1232 sequence to this encoding but no matching character is found, the byte
1233 sequence defined here will be used.  For most cases, the value here is
1234 \x3F; in ASCII, this is a question mark.
1235
1236 =item *
1237
1238 CHARMAP starts the character map section.  Each line has a form as
1239 follows:
1240
1241   <UXXXX> \xXX.. |0 # comment
1242     ^     ^      ^
1243     |     |      +- Fallback flag
1244     |     +-------- Encoded byte sequence
1245     +-------------- Unicode Character ID in hex
1246
1247 The format is roughly the same as a header section except for the
1248 fallback flag: | followed by 0..3.   The meaning of the possible
1249 values is as follows:
1250
1251 =over 4
1252
1253 =item |0
1254
1255 Round trip safe.  A character decoded to Unicode encodes back to the
1256 same byte sequence.  Most characters have this flag.
1257
1258 =item |1
1259
1260 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1261 character for the encode map only.
1262
1263 =item |2
1264
1265 Skip sub-char mapping should there be no code point.
1266
1267 =item |3
1268
1269 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1270 character for the decode map only.
1271
1272 =back
1273
1274 =item *
1275
1276 And finally, END OF CHARMAP ends the section.
1277
1278 =back
1279
1280 When you are manually creating a UCM file, you should copy ascii.ucm
1281 or an existing encoding which is close to yours, rather than write
1282 your own from scratch.
1283
1284 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1285 is, unless your environment is EBCDIC.
1286
1287 B<CAVEAT>: not all features in UCM are implemented.  For example,
1288 icu:state is not used.  Because of that, you need to write a perl
1289 module if you want to support algorithmical encodings, notably
1290 the ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1291 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1292
1293 =head2 Coping with duplicate mappings
1294
1295 When you create a map, you SHOULD make your mappings round-trip safe.
1296 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1297 $data> stands for all characters that are marked as C<|0>.  Here is
1298 how to make sure:
1299
1300 =over 4
1301
1302 =item *
1303
1304 Sort your map in Unicode order.
1305
1306 =item *
1307
1308 When you have a duplicate entry, mark either one with '|1' or '|3'.
1309
1310 =item *
1311
1312 And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1313
1314 =back
1315
1316 Here is an example from big5-eten.
1317
1318   <U2550> \xF9\xF9 |0
1319   <U2550> \xA2\xA4 |3
1320
1321 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1322 this;
1323
1324   E to U               U to E
1325   --------------------------------------
1326   \xF9\xF9 => U2550    U2550 => \xF9\xF9
1327   \xA2\xA4 => U2550
1328
1329 So it is round-trip safe for \xF9\xF9.  But if the line above is upside
1330 down, here is what happens.
1331
1332   E to U               U to E
1333   --------------------------------------
1334   \xA2\xA4 => U2550    U2550 => \xF9\xF9
1335   (\xF9\xF9 => U2550 is now overwritten!)
1336
1337 The Encode package comes with F<ucmlint>, a crude but sufficient
1338 utility to check the integrity of a UCM file.  Check under the
1339 Encode/bin directory for this.
1340
1341 When in doubt, you can use F<ucmsort>, yet another utility under
1342 Encode/bin directory.
1343
1344 =head1 Bookmarks
1345
1346 =over 4
1347
1348 =item *
1349
1350 ICU Home Page
1351 L<http://oss.software.ibm.com/icu/>
1352
1353 =item *
1354
1355 ICU Character Mapping Tables
1356 L<http://oss.software.ibm.com/icu/charset/>
1357
1358 =item *
1359
1360 ICU:Conversion Data
1361 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1362
1363 =back
1364
1365 =head1 SEE ALSO
1366
1367 L<Encode>,
1368 L<perlmod>,
1369 L<perlpod>
1370
1371 =cut
1372
1373 # -Q to disable the duplicate codepoint test
1374 # -S make mapping errors fatal
1375 # -q to remove comments written to output files
1376 # -O to enable the (brute force) substring optimiser
1377 # -o <output> to specify the output file name (else it's the first arg)
1378 # -f <inlist> to give a file with a list of input files (else use the args)
1379 # -n <name> to name the encoding (else use the basename of the input file.
1380
1381 With %seen holding array refs:
1382
1383       865.66 real        28.80 user         8.79 sys
1384       7904  maximum resident set size
1385       1356  average shared memory size
1386      18566  average unshared data size
1387        229  average unshared stack size
1388      46080  page reclaims
1389      33373  page faults
1390
1391 With %seen holding simple scalars:
1392
1393       342.16 real        27.11 user         3.54 sys
1394       8388  maximum resident set size
1395       1394  average shared memory size
1396      14969  average unshared data size
1397        236  average unshared stack size
1398      28159  page reclaims
1399       9839  page faults
1400
1401 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1402 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1403 RAM machine, but it's going to help even on modern machines.
1404 Swapping is bad, m'kay :-)