ext/Encode/bin/enc2xs

   1 #!./perl
   2 BEGIN {
   3     # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
   4     # with $ENV{PERL_CORE} set
   5     # In case we need it in future...
   6     require Config; import Config;
   7 }
   8 use strict;
   9 use warnings;
  10 use Getopt::Std;
  11 my @orig_ARGV = @ARGV;
  12 our $VERSION  = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  13
  14 # These may get re-ordered.
  15 # RAW is a do_now as inserted by &enter
  16 # AGG is an aggreagated do_now, as built up by &process
  17
  18 use constant {
  19   RAW_NEXT => 0,
  20   RAW_IN_LEN => 1,
  21   RAW_OUT_BYTES => 2,
  22   RAW_FALLBACK => 3,
  23
  24   AGG_MIN_IN => 0,
  25   AGG_MAX_IN => 1,
  26   AGG_OUT_BYTES => 2,
  27   AGG_NEXT => 3,
  28   AGG_IN_LEN => 4,
  29   AGG_OUT_LEN => 5,
  30   AGG_FALLBACK => 6,
  31 };
  32
  33 # (See the algorithm in encengine.c - we're building structures for it)
  34
  35 # There are two sorts of structures.
  36 # "do_now" (an array, two variants of what needs storing) is whatever we need
  37 # to do now we've read an input byte.
  38 # It's housed in a "do_next" (which is how we got to it), and in turn points
  39 # to a "do_next" which contains all the "do_now"s for the next input byte.
  40
  41 # There will be a "do_next" which is the start state.
  42 # For a single byte encoding it's the only "do_next" - each "do_now" points
  43 # back to it, and each "do_now" will cause bytes. There is no state.
  44
  45 # For a multi-byte encoding where all characters in the input are the same
  46 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  47 # branching out from the start state, one step for each input byte.
  48 # The leaf "do_now"s will all be at the same distance from the start state,
  49 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  50 # the start state.
  51
  52 # For an encoding where there are varaible length input byte sequences, you
  53 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  54 # as before the leaves will point back to the start state.
  55
  56 # The system will cope with escape encodings (imagine them as a mostly
  57 # self-contained tree for each escape state, and cross links between trees
  58 # at the state-switching characters) but so far no input format defines these.
  59
  60 # The system will also cope with having output "leaves" in the middle of
  61 # the bifurcating branches, not just at the extremities, but again no
  62 # input format does this yet.
  63
  64 # There are two variants of the "do_now" structure. The first, smaller variant
  65 # is generated by &enter as the input file is read. There is one structure
  66 # for each input byte. Say we are mapping a single byte encoding to a
  67 # single byte encoding, with  "ABCD" going "abcd". There will be
  68 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  69
  70 # &process then walks the tree, building aggregate "do_now" structres for
  71 # adjacent bytes where possible. The aggregate is for a contiguous range of
  72 # bytes which each produce the same length of output, each move to the
  73 # same next state, and each have the same fallback flag.
  74 # So our 4 RAW "do_now"s above become replaced by a single structure
  75 # containing:
  76 # ["A", "D", "abcd", 1, ...]
  77 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  78 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  79 # which maps very nicely into pointer arithmetic in C for encengine.c
  80
  81 sub encode_U
  82 {
  83  # UTF-8 encode long hand - only covers part of perl's range
  84  ## my $uv = shift;
  85  # chr() works in native space so convert value from table
  86  # into that space before using chr().
  87  my $ch = chr(utf8::unicode_to_native($_[0]));
  88  # Now get core perl to encode that the way it likes.
  89  utf8::encode($ch);
  90  return $ch;
  91 }
  92
  93 sub encode_S
  94 {
  95  # encode single byte
  96  ## my ($ch,$page) = @_; return chr($ch);
  97  return chr $_[0];
  98 }
  99
 100 sub encode_D
 101 {
 102  # encode double byte MS byte first
 103  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 104  return chr ($_[1]) . chr $_[0];
 105 }
 106
 107 sub encode_M
 108 {
 109  # encode Multi-byte - single for 0..255 otherwise double
 110  ## my ($ch,$page) = @_;
 111  ## return &encode_D if $page;
 112  ## return &encode_S;
 113  return chr ($_[1]) . chr $_[0] if $_[1];
 114  return chr $_[0];
 115 }
 116
 117 my %encode_types = (U => \&encode_U,
 118                     S => \&encode_S,
 119                     D => \&encode_D,
 120                     M => \&encode_M,
 121                    );
 122
 123 # Win32 does not expand globs on command line
 124 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 125
 126 my %opt;
 127 # I think these are:
 128 # -Q to disable the duplicate codepoint test
 129 # -S make mapping errors fatal
 130 # -q to remove comments written to output files
 131 # -O to enable the (brute force) substring optimiser
 132 # -o <output> to specify the output file name (else it's the first arg)
 133 # -f <inlist> to give a file with a list of input files (else use the args)
 134 # -n <name> to name the encoding (else use the basename of the input file.
 135 getopts('CM:SQqOo:f:n:',\%opt);
 136
 137 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 138 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
 139
 140 # This really should go first, else the die here causes empty (non-erroneous)
 141 # output files to be written.
 142 my @encfiles;
 143 if (exists $opt{'f'}) {
 144     # -F is followed by name of file containing list of filenames
 145     my $flist = $opt{'f'};
 146     open(FLIST,$flist) || die "Cannot open $flist:$!";
 147     chomp(@encfiles = <FLIST>);
 148     close(FLIST);
 149 } else {
 150     @encfiles = @ARGV;
 151 }
 152
 153 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 154 chmod(0666,$cname) if -f $cname && !-w $cname;
 155 open(C,">$cname") || die "Cannot open $cname:$!";
 156
 157 my $dname = $cname;
 158 my $hname = $cname;
 159
 160 my ($doC,$doEnc,$doUcm,$doPet);
 161
 162 if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
 163  {
 164   $doC = 1;
 165   $dname =~ s/(\.[^\.]*)?$/.exh/;
 166   chmod(0666,$dname) if -f $cname && !-w $dname;
 167   open(D,">$dname") || die "Cannot open $dname:$!";
 168   $hname =~ s/(\.[^\.]*)?$/.h/;
 169   chmod(0666,$hname) if -f $cname && !-w $hname;
 170   open(H,">$hname") || die "Cannot open $hname:$!";
 171
 172   foreach my $fh (\*C,\*D,\*H)
 173   {
 174    print $fh <<"END" unless $opt{'q'};
 175 /*
 176  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 177  This file was autogenerated by:
 178  $^X $0 @orig_ARGV
 179 */
 180 END
 181   }
 182
 183   if ($cname =~ /(\w+)\.xs$/)
 184    {
 185     print C "#include <EXTERN.h>\n";
 186     print C "#include <perl.h>\n";
 187     print C "#include <XSUB.h>\n";
 188     print C "#define U8 U8\n";
 189    }
 190   print C "#include \"encode.h\"\n\n";
 191
 192  }
 193 elsif ($cname =~ /\.enc$/)
 194  {
 195   $doEnc = 1;
 196  }
 197 elsif ($cname =~ /\.ucm$/)
 198  {
 199   $doUcm = 1;
 200  }
 201 elsif ($cname =~ /\.pet$/)
 202  {
 203   $doPet = 1;
 204  }
 205
 206 my %encoding;
 207 my %strings;
 208 my $string_acc;
 209 my %strings_in_acc;
 210
 211 my $saved = 0;
 212 my $subsave = 0;
 213 my $strings = 0;
 214
 215 sub cmp_name
 216 {
 217  if ($a =~ /^.*-(\d+)/)
 218   {
 219    my $an = $1;
 220    if ($b =~ /^.*-(\d+)/)
 221     {
 222      my $r = $an <=> $1;
 223      return $r if $r;
 224     }
 225   }
 226  return $a cmp $b;
 227 }
 228
 229
 230 foreach my $enc (sort cmp_name @encfiles)
 231  {
 232   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 233   $name = $opt{'n'} if exists $opt{'n'};
 234   if (open(E,$enc))
 235    {
 236     if ($sfx eq 'enc')
 237      {
 238       compile_enc(\*E,lc($name));
 239      }
 240     else
 241      {
 242       compile_ucm(\*E,lc($name));
 243      }
 244    }
 245   else
 246    {
 247     warn "Cannot open $enc for $name:$!";
 248    }
 249  }
 250
 251 if ($doC)
 252  {
 253   print STDERR "Writing compiled form\n";
 254   foreach my $name (sort cmp_name keys %encoding)
 255    {
 256     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 257     process($name.'_utf8',$e2u);
 258     addstrings(\*C,$e2u);
 259
 260     process('utf8_'.$name,$u2e);
 261     addstrings(\*C,$u2e);
 262    }
 263   outbigstring(\*C,"enctable");
 264   foreach my $name (sort cmp_name keys %encoding)
 265    {
 266     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 267     outtable(\*C,$e2u, "enctable");
 268     outtable(\*C,$u2e, "enctable");
 269
 270     # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 271    }
 272   foreach my $enc (sort cmp_name keys %encoding)
 273    {
 274     # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 275     my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
 276     #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 277     my $replen = 0;
 278     $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
 279     my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
 280     my $sym = "${enc}_encoding";
 281     $sym =~ s/\W+/_/g;
 282     print C "encode_t $sym = \n";
 283     # This is to make null encoding work -- dankogai
 284     for (my $i = (scalar @info) - 1;  $i >= 0; --$i){
 285         $info[$i] ||= 1;
 286     }
 287     # end of null tweak -- dankogai
 288     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 289    }
 290
 291   foreach my $enc (sort cmp_name keys %encoding)
 292    {
 293     my $sym = "${enc}_encoding";
 294     $sym =~ s/\W+/_/g;
 295     print H "extern encode_t $sym;\n";
 296     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 297    }
 298
 299   if ($cname =~ /(\w+)\.xs$/)
 300    {
 301     my $mod = $1;
 302     print C <<'END';
 303
 304 static void
 305 Encode_XSEncoding(pTHX_ encode_t *enc)
 306 {
 307  dSP;
 308  HV *stash = gv_stashpv("Encode::XS", TRUE);
 309  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 310  int i = 0;
 311  PUSHMARK(sp);
 312  XPUSHs(sv);
 313  while (enc->name[i])
 314   {
 315    const char *name = enc->name[i++];
 316    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 317   }
 318  PUTBACK;
 319  call_pv("Encode::define_encoding",G_DISCARD);
 320  SvREFCNT_dec(sv);
 321 }
 322
 323 END
 324
 325     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 326     print C "BOOT:\n{\n";
 327     print C "#include \"$dname\"\n";
 328     print C "}\n";
 329    }
 330   # Close in void context is bad, m'kay
 331   close(D) or warn "Error closing '$dname': $!";
 332   close(H) or warn "Error closing '$hname': $!";
 333
 334   my $perc_saved    = $strings/($strings + $saved) * 100;
 335   my $perc_subsaved = $strings/($strings + $subsave) * 100;
 336   printf STDERR "%d bytes in string tables\n",$strings;
 337   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 338     $saved, $perc_saved              if $saved;
 339   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 340     $subsave, $perc_subsaved         if $subsave;
 341  }
 342 elsif ($doEnc)
 343  {
 344   foreach my $name (sort cmp_name keys %encoding)
 345    {
 346     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 347     output_enc(\*C,$name,$e2u);
 348    }
 349  }
 350 elsif ($doUcm)
 351  {
 352   foreach my $name (sort cmp_name keys %encoding)
 353    {
 354     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 355     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 356    }
 357  }
 358
 359 # writing half meg files and then not checking to see if you just filled the
 360 # disk is bad, m'kay
 361 close(C) or die "Error closing '$cname': $!";
 362
 363 # End of the main program.
 364
 365 sub compile_ucm
 366 {
 367  my ($fh,$name) = @_;
 368  my $e2u = {};
 369  my $u2e = {};
 370  my $cs;
 371  my %attr;
 372  while (<$fh>)
 373   {
 374    s/#.*$//;
 375    last if /^\s*CHARMAP\s*$/i;
 376    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 377     {
 378      $attr{$1} = $2;
 379     }
 380   }
 381  if (!defined($cs =  $attr{'code_set_name'}))
 382   {
 383    warn "No <code_set_name> in $name\n";
 384   }
 385  else
 386   {
 387    $name = $cs unless exists $opt{'n'};
 388   }
 389  my $erep;
 390  my $urep;
 391  my $max_el;
 392  my $min_el;
 393  if (exists $attr{'subchar'})
 394   {
 395    #my @byte;
 396    #$attr{'subchar'} =~ /^\s*/cg;
 397    #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 398    #$erep = join('',map(chr(hex($_)),@byte));
 399    $erep = $attr{'subchar'};
 400    $erep =~ s/^\s+//; $erep =~ s/\s+$//;
 401   }
 402  print "Reading $name ($cs)\n";
 403  my $nfb = 0;
 404  my $hfb = 0;
 405  while (<$fh>)
 406   {
 407    s/#.*$//;
 408    last if /^\s*END\s+CHARMAP\s*$/i;
 409    next if /^\s*$/;
 410    my (@uni, @byte) = ();
 411    my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
 412        or die "Bad line: $_";
 413    while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
 414        push @uni, map { substr($_, 1) } split(/\+/, $1);
 415    }
 416    while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
 417        push @byte, $1;
 418    }
 419    if (@uni)
 420     {
 421      my $uch =  join('', map { encode_U(hex($_)) } @uni );
 422      my $ech = join('',map(chr(hex($_)),@byte));
 423      my $el  = length($ech);
 424      $max_el = $el if (!defined($max_el) || $el > $max_el);
 425      $min_el = $el if (!defined($min_el) || $el < $min_el);
 426      if (length($fb))
 427       {
 428        $fb = substr($fb,1);
 429        $hfb++;
 430       }
 431      else
 432       {
 433        $nfb++;
 434        $fb = '0';
 435       }
 436      # $fb is fallback flag
 437      # 0 - round trip safe
 438      # 1 - fallback for unicode -> enc
 439      # 2 - skip sub-char mapping
 440      # 3 - fallback enc -> unicode
 441      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 442      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 443     }
 444    else
 445     {
 446      warn $_;
 447     }
 448   }
 449  if ($nfb && $hfb)
 450   {
 451    die "$nfb entries without fallback, $hfb entries with\n";
 452   }
 453  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 454 }
 455
 456
 457
 458 sub compile_enc
 459 {
 460  my ($fh,$name) = @_;
 461  my $e2u = {};
 462  my $u2e = {};
 463
 464  my $type;
 465  while ($type = <$fh>)
 466   {
 467    last if $type !~ /^\s*#/;
 468   }
 469  chomp($type);
 470  return if $type eq 'E';
 471  # Do the hash lookup once, rather than once per function call. 4% speedup.
 472  my $type_func = $encode_types{$type};
 473  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 474  warn "$type encoded $name\n";
 475  my $rep = '';
 476  # Save a defined test by setting these to defined values.
 477  my $min_el = ~0; # A very big integer
 478  my $max_el = 0;  # Anything must be longer than 0
 479  {
 480   my $v = hex($def);
 481   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 482  }
 483  my $errors;
 484  my $seen;
 485  # use -Q to silence the seen test. Makefile.PL uses this by default.
 486  $seen = {} unless $opt{Q};
 487  do
 488   {
 489    my $line = <$fh>;
 490    chomp($line);
 491    my $page = hex($line);
 492    my $ch = 0;
 493    my $i = 16;
 494    do
 495     {
 496      # So why is it 1% faster to leave the my here?
 497      my $line = <$fh>;
 498      $line =~ s/\r\n$/\n/;
 499      die "$.:${line}Line should be exactly 65 characters long including
 500      newline (".length($line).")" unless length ($line) == 65;
 501      # Split line into groups of 4 hex digits, convert groups to ints
 502      # This takes 65.35
 503      # map {hex $_} $line =~ /(....)/g
 504      # This takes 63.75 (2.5% less time)
 505      # unpack "n*", pack "H*", $line
 506      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 507      # Doing it as while ($line =~ /(....)/g) took 74.63
 508      foreach my $val (unpack "n*", pack "H*", $line)
 509       {
 510        next if $val == 0xFFFD;
 511        my $ech = &$type_func($ch,$page);
 512        if ($val || (!$ch && !$page))
 513         {
 514          my $el  = length($ech);
 515          $max_el = $el if $el > $max_el;
 516          $min_el = $el if $el < $min_el;
 517          my $uch = encode_U($val);
 518          if ($seen) {
 519            # We're doing the test.
 520            # We don't need to read this quickly, so storing it as a scalar,
 521            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 522            # RAM and may make us faster on low RAM systems. [see __END__]
 523            if (exists $seen->{$uch})
 524              {
 525                warn sprintf("U%04X is %02X%02X and %04X\n",
 526                             $val,$page,$ch,$seen->{$uch});
 527                $errors++;
 528              }
 529            else
 530              {
 531                $seen->{$uch} = $page << 8 | $ch;
 532              }
 533          }
 534          # Passing 2 extra args each time is 3.6% slower!
 535          # Even with having to add $fallback ||= 0 later
 536          enter_fb0($e2u,$ech,$uch);
 537          enter_fb0($u2e,$uch,$ech);
 538         }
 539        else
 540         {
 541          # No character at this position
 542          # enter($e2u,$ech,undef,$e2u);
 543         }
 544        $ch++;
 545       }
 546     } while --$i;
 547   } while --$pages;
 548  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 549    if $min_el > $max_el;
 550  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 551  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 552 }
 553
 554 # my ($a,$s,$d,$t,$fb) = @_;
 555 sub enter {
 556   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 557   # state we shift to after this (multibyte) input character defaults to same
 558   # as current state.
 559   $next ||= $current;
 560   # Making sure it is defined seems to be faster than {no warnings;} in
 561   # &process, or passing it in as 0 explicity.
 562   # XXX $fallback ||= 0;
 563
 564   # Start at the beginning and work forwards through the string to zero.
 565   # effectively we are removing 1 character from the front each time
 566   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 567   # Hence -$pos is the length of the remaining string.
 568   my $pos = -length $inbytes;
 569   while (1) {
 570     my $byte = substr $inbytes, $pos, 1;
 571     #  RAW_NEXT => 0,
 572     #  RAW_IN_LEN => 1,
 573     #  RAW_OUT_BYTES => 2,
 574     #  RAW_FALLBACK => 3,
 575     # to unicode an array would seem to be better, because the pages are dense.
 576     # from unicode can be very sparse, favouring a hash.
 577     # hash using the bytes (all length 1) as keys rather than ord value,
 578     # as it's easier to sort these in &process.
 579
 580     # It's faster to always add $fallback even if it's undef, rather than
 581     # choosing between 3 and 4 element array. (hence why we set it defined
 582     # above)
 583     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 584     # When $pos was -1 we were at the last input character.
 585     unless (++$pos) {
 586       $do_now->[RAW_OUT_BYTES] = $outbytes;
 587       $do_now->[RAW_NEXT] = $next;
 588       return;
 589     }
 590     # Tail recursion. The intermdiate state may not have a name yet.
 591     $current = $do_now->[RAW_NEXT];
 592   }
 593 }
 594
 595 # This is purely for optimistation. It's just &enter hard coded for $fallback
 596 # of 0, using only a 3 entry array ref to save memory for every entry.
 597 sub enter_fb0 {
 598   my ($current,$inbytes,$outbytes,$next) = @_;
 599   $next ||= $current;
 600
 601   my $pos = -length $inbytes;
 602   while (1) {
 603     my $byte = substr $inbytes, $pos, 1;
 604     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 605     unless (++$pos) {
 606       $do_now->[RAW_OUT_BYTES] = $outbytes;
 607       $do_now->[RAW_NEXT] = $next;
 608       return;
 609     }
 610     $current = $do_now->[RAW_NEXT];
 611   }
 612 }
 613
 614 sub process
 615 {
 616   my ($name,$a) = @_;
 617   $name =~ s/\W+/_/g;
 618   $a->{Cname} = $name;
 619   my $raw = $a->{Raw};
 620   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 621   my @ent;
 622   $agg_max_in = 0;
 623   foreach my $key (sort keys %$raw) {
 624     #  RAW_NEXT => 0,
 625     #  RAW_IN_LEN => 1,
 626     #  RAW_OUT_BYTES => 2,
 627     #  RAW_FALLBACK => 3,
 628     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 629     # Now we are converting from raw to aggregate, switch from 1 byte strings
 630     # to numbers
 631     my $b = ord $key;
 632     $fallback ||= 0;
 633     if ($l &&
 634         # If this == fails, we're going to reset $agg_max_in below anyway.
 635         $b == ++$agg_max_in &&
 636         # References in numeric context give the pointer as an int.
 637         $agg_next == $next &&
 638         $agg_in_len == $in_len &&
 639         $agg_out_len == length $out_bytes &&
 640         $agg_fallback == $fallback
 641         # && length($l->[AGG_OUT_BYTES]) < 16
 642        ) {
 643       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 644       # we can aggregate this byte onto the end.
 645       $l->[AGG_MAX_IN] = $b;
 646       $l->[AGG_OUT_BYTES] .= $out_bytes;
 647     } else {
 648       # AGG_MIN_IN => 0,
 649       # AGG_MAX_IN => 1,
 650       # AGG_OUT_BYTES => 2,
 651       # AGG_NEXT => 3,
 652       # AGG_IN_LEN => 4,
 653       # AGG_OUT_LEN => 5,
 654       # AGG_FALLBACK => 6,
 655       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 656       # (only gains .6% on euc-jp  -- is it worth it?)
 657       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 658                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 659                        $agg_fallback = $fallback];
 660     }
 661     if (exists $next->{Cname}) {
 662       $next->{'Forward'} = 1 if $next != $a;
 663     } else {
 664       process(sprintf("%s_%02x",$name,$b),$next);
 665     }
 666   }
 667   # encengine.c rules say that last entry must be for 255
 668   if ($agg_max_in < 255) {
 669     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 670   }
 671   $a->{'Entries'} = \@ent;
 672 }
 673
 674
 675 sub addstrings
 676 {
 677  my ($fh,$a) = @_;
 678  my $name = $a->{'Cname'};
 679  # String tables
 680  foreach my $b (@{$a->{'Entries'}})
 681   {
 682    next unless $b->[AGG_OUT_LEN];
 683    $strings{$b->[AGG_OUT_BYTES]} = undef;
 684   }
 685  if ($a->{'Forward'})
 686   {
 687    my $var = $^O eq 'MacOS' ? 'extern' : 'static';
 688    print $fh "$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 689   }
 690  $a->{'DoneStrings'} = 1;
 691  foreach my $b (@{$a->{'Entries'}})
 692   {
 693    my ($s,$e,$out,$t,$end,$l) = @$b;
 694    addstrings($fh,$t) unless $t->{'DoneStrings'};
 695   }
 696 }
 697
 698 sub outbigstring
 699 {
 700   my ($fh,$name) = @_;
 701
 702   $string_acc = '';
 703
 704   # Make the big string in the string accumulator. Longest first, on the hope
 705   # that this makes it more likely that we find the short strings later on.
 706   # Not sure if it helps sorting strings of the same length lexcically.
 707   foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
 708     my $index = index $string_acc, $s;
 709     if ($index >= 0) {
 710       $saved += length($s);
 711       $strings_in_acc{$s} = $index;
 712     } else {
 713     OPTIMISER: {
 714         if ($opt{'O'}) {
 715           my $sublength = length $s;
 716           while (--$sublength > 0) {
 717             # progressively lop characters off the end, to see if the start of
 718             # the new string overlaps the end of the accumulator.
 719             if (substr ($string_acc, -$sublength)
 720                 eq substr ($s, 0, $sublength)) {
 721               $subsave += $sublength;
 722               $strings_in_acc{$s} = length ($string_acc) - $sublength;
 723               # append the last bit on the end.
 724               $string_acc .= substr ($s, $sublength);
 725               last OPTIMISER;
 726             }
 727             # or if the end of the new string overlaps the start of the
 728             # accumulator
 729             next unless substr ($string_acc, 0, $sublength)
 730               eq substr ($s, -$sublength);
 731             # well, the last $sublength characters of the accumulator match.
 732             # so as we're prepending to the accumulator, need to shift all our
 733             # existing offsets forwards
 734             $_ += $sublength foreach values %strings_in_acc;
 735             $subsave += $sublength;
 736             $strings_in_acc{$s} = 0;
 737             # append the first bit on the start.
 738             $string_acc = substr ($s, 0, -$sublength) . $string_acc;
 739             last OPTIMISER;
 740           }
 741         }
 742         # Optimiser (if it ran) found nothing, so just going have to tack the
 743         # whole thing on the end.
 744         $strings_in_acc{$s} = length $string_acc;
 745         $string_acc .= $s;
 746       };
 747     }
 748   }
 749
 750   $strings = length $string_acc;
 751   my $definition = "\nstatic const U8 $name\[$strings] = { " .
 752     join(',',unpack "C*",$string_acc);
 753   # We have a single long line. Split it at convenient commas.
 754   print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
 755   print $fh substr ($definition, pos $definition), " };\n";
 756 }
 757
 758 sub findstring {
 759   my ($name,$s) = @_;
 760   my $offset = $strings_in_acc{$s};
 761   die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
 762     unless defined $offset;
 763   "$name + $offset";
 764 }
 765
 766 sub outtable
 767 {
 768  my ($fh,$a,$bigname) = @_;
 769  my $name = $a->{'Cname'};
 770  $a->{'Done'} = 1;
 771  foreach my $b (@{$a->{'Entries'}})
 772   {
 773    my ($s,$e,$out,$t,$end,$l) = @$b;
 774    outtable($fh,$t,$bigname) unless $t->{'Done'};
 775   }
 776  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 777  foreach my $b (@{$a->{'Entries'}})
 778   {
 779    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 780    # $end |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
 781    print  $fh "{";
 782    if ($l)
 783     {
 784      printf $fh findstring($bigname,$out);
 785     }
 786    else
 787     {
 788      print  $fh "0";
 789     }
 790    print  $fh ",",$t->{Cname};
 791    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 792   }
 793  print $fh "};\n";
 794 }
 795
 796 sub output_enc
 797 {
 798  my ($fh,$name,$a) = @_;
 799  die "Changed - fix me for new structure";
 800  foreach my $b (sort keys %$a)
 801   {
 802    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 803   }
 804 }
 805
 806 sub decode_U
 807 {
 808  my $s = shift;
 809 }
 810
 811 my @uname;
 812 sub char_names
 813 {
 814  my $s = do "unicore/Name.pl";
 815  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 816  pos($s) = 0;
 817  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 818   {
 819    my $name = $3;
 820    my $s = hex($1);
 821    last if $s >= 0x10000;
 822    my $e = length($2) ? hex($2) : $s;
 823    for (my $i = $s; $i <= $e; $i++)
 824     {
 825      $uname[$i] = $name;
 826 #    print sprintf("U%04X $name\n",$i);
 827     }
 828   }
 829 }
 830
 831 sub output_ucm_page
 832 {
 833   my ($cmap,$a,$t,$pre) = @_;
 834   # warn sprintf("Page %x\n",$pre);
 835   my $raw = $t->{Raw};
 836   foreach my $key (sort keys %$raw) {
 837     #  RAW_NEXT => 0,
 838     #  RAW_IN_LEN => 1,
 839     #  RAW_OUT_BYTES => 2,
 840     #  RAW_FALLBACK => 3,
 841     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 842     my $u = ord $key;
 843     $fallback ||= 0;
 844
 845     if ($next != $a && $next != $t) {
 846       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 847     } elsif (length $out_bytes) {
 848       if ($pre) {
 849         $u = $pre|($u &0x3f);
 850       }
 851       my $s = sprintf "<U%04X> ",$u;
 852       #foreach my $c (split(//,$out_bytes)) {
 853       #  $s .= sprintf "\\x%02X",ord($c);
 854       #}
 855       # 9.5% faster changing that loop to this:
 856       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 857       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 858       push(@$cmap,$s);
 859     } else {
 860       warn join(',',$u, @{$raw->{$key}},$a,$t);
 861     }
 862   }
 863 }
 864
 865 sub output_ucm
 866 {
 867  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 868  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 869  print $fh "<code_set_name> \"$name\"\n";
 870  char_names();
 871  if (defined $min_el)
 872   {
 873    print $fh "<mb_cur_min> $min_el\n";
 874   }
 875  if (defined $max_el)
 876   {
 877    print $fh "<mb_cur_max> $max_el\n";
 878   }
 879  if (defined $rep)
 880   {
 881    print $fh "<subchar> ";
 882    foreach my $c (split(//,$rep))
 883     {
 884      printf $fh "\\x%02X",ord($c);
 885     }
 886    print $fh "\n";
 887   }
 888  my @cmap;
 889  output_ucm_page(\@cmap,$h,$h,0);
 890  print $fh "#\nCHARMAP\n";
 891  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 892   {
 893    print $fh $line;
 894   }
 895  print $fh "END CHARMAP\n";
 896 }
 897
 898 use vars qw(
 899     $_Enc2xs
 900     $_Version
 901     $_Inc
 902     $_E2X
 903     $_Name
 904     $_TableFiles
 905     $_Now
 906 );
 907
 908 sub find_e2x{
 909     eval { require File::Find; };
 910     my (@inc, %e2x_dir);
 911     for my $inc (@INC){
 912         push @inc, $inc unless $inc eq '.'; #skip current dir
 913     }
 914     File::Find::find(
 915              sub {
 916                  my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
 917                      $atime,$mtime,$ctime,$blksize,$blocks)
 918                      = lstat($_) or return;
 919                  -f _ or return;
 920                  if (/^.*\.e2x$/o){
 921                      no warnings 'once';
 922                      $e2x_dir{$File::Find::dir} ||= $mtime;
 923                  }
 924                  return;
 925              }, @inc);
 926     warn join("\n", keys %e2x_dir), "\n";
 927     for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
 928         $_E2X = $d;
 929         # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
 930         return $_E2X;
 931     }
 932 }
 933
 934 sub make_makefile_pl
 935 {
 936     eval { require Encode; };
 937     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 938     # our used for variable expanstion
 939     $_Enc2xs = $0;
 940     $_Version = $VERSION;
 941     $_E2X = find_e2x();
 942     $_Name = shift;
 943     $_TableFiles = join(",", map {qq('$_')} @_);
 944     $_Now = scalar localtime();
 945
 946     eval { require File::Spec; };
 947     _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
 948     _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
 949     _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
 950     _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
 951     _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
 952     exit;
 953 }
 954
 955 use vars qw(
 956             $_ModLines
 957             $_LocalVer
 958             );
 959
 960 sub make_configlocal_pm
 961 {
 962     eval { require Encode; };
 963     $@ and die "Unable to require Encode: $@\n";
 964     eval { require File::Spec; };
 965     # our used for variable expanstion
 966     my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
 967     my %LocalMod = ();
 968     for my $d (@INC){
 969         my $inc = File::Spec->catfile($d, "Encode");
 970         -d $inc or next;
 971         opendir my $dh, $inc or die "$inc:$!";
 972         warn "Checking $inc...\n";
 973         for my $f (grep /\.pm$/o, readdir($dh)){
 974             -f File::Spec->catfile($inc, "$f") or next;
 975             $INC{"Encode/$f"} and next;
 976             warn "require Encode/$f;\n";
 977             eval { require "Encode/$f"; };
 978             $@ and die "Can't require Encode/$f: $@\n";
 979             for my $enc (Encode->encodings()){
 980                 no warnings 'once';
 981                 $in_core{$enc} and next;
 982                 $Encode::Config::ExtModule{$enc} and next;
 983                 my $mod = "Encode/$f";
 984                 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
 985                 $LocalMod{$enc} ||= $mod;
 986             }
 987         }
 988     }
 989     $_ModLines = "";
 990     for my $enc (sort keys %LocalMod){
 991         $_ModLines .=
 992             qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
 993     }
 994     warn $_ModLines;
 995     $_LocalVer = _mkversion();
 996     $_E2X = find_e2x();
 997     $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
 998     _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
 999                   File::Spec->catfile($_Inc,"ConfigLocal.pm"),
1000                   1);
1001     exit;
1002 }
1003
1004 sub _mkversion{
1005     my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1006     $yyyy += 1900, $mo +=1;
1007     return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1008 }
1009
1010 sub _print_expand{
1011     eval { require File::Basename; };
1012     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
1013     File::Basename->import();
1014     my ($src, $dst, $clobber) = @_;
1015     if (!$clobber and -e $dst){
1016         warn "$dst exists. skipping\n";
1017         return;
1018     }
1019     warn "Generating $dst...\n";
1020     open my $in, $src or die "$src : $!";
1021     if ((my $d = dirname($dst)) ne '.'){
1022         -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
1023     }
1024     open my $out, ">$dst" or die "$!";
1025     my $asis = 0;
1026     while (<$in>){
1027         if (/^#### END_OF_HEADER/){
1028             $asis = 1; next;
1029         }
1030         s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1031         print $out $_;
1032     }
1033 }
1034 __END__
1035
1036 =head1 NAME
1037
1038 enc2xs -- Perl Encode Module Generator
1039
1040 =head1 SYNOPSIS
1041
1042   enc2xs -[options]
1043   enc2xs -M ModName mapfiles...
1044   enc2xs -C
1045
1046 =head1 DESCRIPTION
1047
1048 F<enc2xs> builds a Perl extension for use by Encode from either
1049 Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1050 Besides being used internally during the build process of the Encode
1051 module, you can use F<enc2xs> to add your own encoding to perl.
1052 No knowledge of XS is necessary.
1053
1054 =head1 Quick Guide
1055
1056 If you want to know as little about Perl as possible but need to
1057 add a new encoding, just read this chapter and forget the rest.
1058
1059 =over 4
1060
1061 =item 0.
1062
1063 Have a .ucm file ready.  You can get it from somewhere or you can write
1064 your own from scratch or you can grab one from the Encode distribution
1065 and customize it.  For the UCM format, see the next Chapter.  In the
1066 example below, I'll call my theoretical encoding myascii, defined
1067 in I<my.ucm>.  C<$> is a shell prompt.
1068
1069   $ ls -F
1070   my.ucm
1071
1072 =item 1.
1073
1074 Issue a command as follows;
1075
1076   $ enc2xs -M My my.ucm
1077   generating Makefile.PL
1078   generating My.pm
1079   generating README
1080   generating Changes
1081
1082 Now take a look at your current directory.  It should look like this.
1083
1084   $ ls -F
1085   Makefile.PL   My.pm         my.ucm        t/
1086
1087 The following files were created.
1088
1089   Makefile.PL - MakeMaker script
1090   My.pm       - Encode submodule
1091   t/My.t      - test file
1092
1093 =over 4
1094
1095 =item 1.1.
1096
1097 If you want *.ucm installed together with the modules, do as follows;
1098
1099   $ mkdir Encode
1100   $ mv *.ucm Encode
1101   $ enc2xs -M My Encode/*ucm
1102
1103 =back
1104
1105 =item 2.
1106
1107 Edit the files generated.  You don't have to if you have no time AND no
1108 intention to give it to someone else.  But it is a good idea to edit
1109 the pod and to add more tests.
1110
1111 =item 3.
1112
1113 Now issue a command all Perl Mongers love:
1114
1115   $ perl Makefile.PL
1116   Writing Makefile for Encode::My
1117
1118 =item 4.
1119
1120 Now all you have to do is make.
1121
1122   $ make
1123   cp My.pm blib/lib/Encode/My.pm
1124   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1125     -o encode_t.c -f encode_t.fnm
1126   Reading myascii (myascii)
1127   Writing compiled form
1128   128 bytes in string tables
1129   384 bytes (25%) saved spotting duplicates
1130   1 bytes (99.2%) saved using substrings
1131   ....
1132   chmod 644 blib/arch/auto/Encode/My/My.bs
1133   $
1134
1135 The time it takes varies depending on how fast your machine is and
1136 how large your encoding is.  Unless you are working on something big
1137 like euc-tw, it won't take too long.
1138
1139 =item 5.
1140
1141 You can "make install" already but you should test first.
1142
1143   $ make test
1144   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1145     -e 'use Test::Harness  qw(&runtests $verbose); \
1146     $verbose=0; runtests @ARGV;' t/*.t
1147   t/My....ok
1148   All tests successful.
1149   Files=1, Tests=2,  0 wallclock secs
1150    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1151
1152 =item 6.
1153
1154 If you are content with the test result, just "make install"
1155
1156 =item 7.
1157
1158 If you want to add your encoding to Encode's demand-loading list
1159 (so you don't have to "use Encode::YourEncoding"), run
1160
1161   enc2xs -C
1162
1163 to update Encode::ConfigLocal, a module that controls local settings.
1164 After that, "use Encode;" is enough to load your encodings on demand.
1165
1166 =back
1167
1168 =head1 The Unicode Character Map
1169
1170 Encode uses the Unicode Character Map (UCM) format for source character
1171 mappings.  This format is used by IBM's ICU package and was adopted
1172 by Nick Ing-Simmons for use with the Encode module.  Since UCM is
1173 more flexible than Tcl's Encoding Map and far more user-friendly,
1174 this is the recommended formet for Encode now.
1175
1176 A UCM file looks like this.
1177
1178   #
1179   # Comments
1180   #
1181   <code_set_name> "US-ascii" # Required
1182   <code_set_alias> "ascii"   # Optional
1183   <mb_cur_min> 1             # Required; usually 1
1184   <mb_cur_max> 1             # Max. # of bytes/char
1185   <subchar> \x3F             # Substitution char
1186   #
1187   CHARMAP
1188   <U0000> \x00 |0 # <control>
1189   <U0001> \x01 |0 # <control>
1190   <U0002> \x02 |0 # <control>
1191   ....
1192   <U007C> \x7C |0 # VERTICAL LINE
1193   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1194   <U007E> \x7E |0 # TILDE
1195   <U007F> \x7F |0 # <control>
1196   END CHARMAP
1197
1198 =over 4
1199
1200 =item *
1201
1202 Anything that follows C<#> is treated as a comment.
1203
1204 =item *
1205
1206 The header section continues until a line containing the word
1207 CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1208 pair per line.  Strings used as values must be quoted. Barewords are
1209 treated as numbers.  I<\xXX> represents a byte.
1210
1211 Most of the keywords are self-explanatory. I<subchar> means
1212 substitution character, not subcharacter.  When you decode a Unicode
1213 sequence to this encoding but no matching character is found, the byte
1214 sequence defined here will be used.  For most cases, the value here is
1215 \x3F; in ASCII, this is a question mark.
1216
1217 =item *
1218
1219 CHARMAP starts the character map section.  Each line has a form as
1220 follows:
1221
1222   <UXXXX> \xXX.. |0 # comment
1223     ^     ^      ^
1224     |     |      +- Fallback flag
1225     |     +-------- Encoded byte sequence
1226     +-------------- Unicode Character ID in hex
1227
1228 The format is roughly the same as a header section except for the
1229 fallback flag: | followed by 0..3.   The meaning of the possible
1230 values is as follows:
1231
1232 =over 4
1233
1234 =item |0
1235
1236 Round trip safe.  A character decoded to Unicode encodes back to the
1237 same byte sequence.  Most characters have this flag.
1238
1239 =item |1
1240
1241 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1242 character for the encode map only.
1243
1244 =item |2
1245
1246 Skip sub-char mapping should there be no code point.
1247
1248 =item |3
1249
1250 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1251 character for the decode map only.
1252
1253 =back
1254
1255 =item *
1256
1257 And finally, END OF CHARMAP ends the section.
1258
1259 =back
1260
1261 When you are manually creating a UCM file, you should copy ascii.ucm
1262 or an existing encoding which is close to yours, rather than write
1263 your own from scratch.
1264
1265 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1266 is, unless your environment is EBCDIC.
1267
1268 B<CAVEAT>: not all features in UCM are implemented.  For example,
1269 icu:state is not used.  Because of that, you need to write a perl
1270 module if you want to support algorithmical encodings, notably
1271 the ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1272 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1273
1274 =head2 Coping with duplicate mappings
1275
1276 When you create a map, you SHOULD make your mappings round-trip safe.
1277 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1278 $data> stands for all characters that are marked as C<|0>.  Here is
1279 how to make sure:
1280
1281 =over 4
1282
1283 =item *
1284
1285 Sort your map in Unicode order.
1286
1287 =item *
1288
1289 When you have a duplicate entry, mark either one with '|1' or '|3'.
1290
1291 =item *
1292
1293 And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1294
1295 =back
1296
1297 Here is an example from big5-eten.
1298
1299   <U2550> \xF9\xF9 |0
1300   <U2550> \xA2\xA4 |3
1301
1302 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1303 this;
1304
1305   E to U               U to E
1306   --------------------------------------
1307   \xF9\xF9 => U2550    U2550 => \xF9\xF9
1308   \xA2\xA4 => U2550
1309
1310 So it is round-trip safe for \xF9\xF9.  But if the line above is upside
1311 down, here is what happens.
1312
1313   E to U               U to E
1314   --------------------------------------
1315   \xA2\xA4 => U2550    U2550 => \xF9\xF9
1316   (\xF9\xF9 => U2550 is now overwritten!)
1317
1318 The Encode package comes with F<ucmlint>, a crude but sufficient
1319 utility to check the integrity of a UCM file.  Check under the
1320 Encode/bin directory for this.
1321
1322
1323 =head1 Bookmarks
1324
1325 =over 4
1326
1327 =item *
1328
1329 ICU Home Page
1330 L<http://oss.software.ibm.com/icu/>
1331
1332 =item *
1333
1334 ICU Character Mapping Tables
1335 L<http://oss.software.ibm.com/icu/charset/>
1336
1337 =item *
1338
1339 ICU:Conversion Data
1340 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1341
1342 =back
1343
1344 =head1 SEE ALSO
1345
1346 L<Encode>,
1347 L<perlmod>,
1348 L<perlpod>
1349
1350 =cut
1351
1352 # -Q to disable the duplicate codepoint test
1353 # -S make mapping errors fatal
1354 # -q to remove comments written to output files
1355 # -O to enable the (brute force) substring optimiser
1356 # -o <output> to specify the output file name (else it's the first arg)
1357 # -f <inlist> to give a file with a list of input files (else use the args)
1358 # -n <name> to name the encoding (else use the basename of the input file.
1359
1360 With %seen holding array refs:
1361
1362       865.66 real        28.80 user         8.79 sys
1363       7904  maximum resident set size
1364       1356  average shared memory size
1365      18566  average unshared data size
1366        229  average unshared stack size
1367      46080  page reclaims
1368      33373  page faults
1369
1370 With %seen holding simple scalars:
1371
1372       342.16 real        27.11 user         3.54 sys
1373       8388  maximum resident set size
1374       1394  average shared memory size
1375      14969  average unshared data size
1376        236  average unshared stack size
1377      28159  page reclaims
1378       9839  page faults
1379
1380 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1381 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1382 RAM machine, but it's going to help even on modern machines.
1383 Swapping is bad, m'kay :-)